From d0a4234276aa2d9d9a9e0365efe74c03e82d62c1 Mon Sep 17 00:00:00 2001 From: Saleel Date: Tue, 26 Mar 2024 01:20:12 +0530 Subject: [PATCH 1/2] chore: remove libs/regex_to_circom in favour of zk-regex --- libs/regex_to_circom/README.md | 40 -- libs/regex_to_circom/gen.py | 191 --------- libs/regex_to_circom/lexical.js | 555 --------------------------- libs/regex_to_circom/regex_to_dfa.js | 293 -------------- 4 files changed, 1079 deletions(-) delete mode 100644 libs/regex_to_circom/README.md delete mode 100644 libs/regex_to_circom/gen.py delete mode 100644 libs/regex_to_circom/lexical.js delete mode 100644 libs/regex_to_circom/regex_to_dfa.js diff --git a/libs/regex_to_circom/README.md b/libs/regex_to_circom/README.md deleted file mode 100644 index 1733aacb8..000000000 --- a/libs/regex_to_circom/README.md +++ /dev/null @@ -1,40 +0,0 @@ -# ZK Regex - -This code generates a circom regex file with Python and JS, but doesn't support all regex syntax. You have to edit the test_regex function in regex_to_dfa.js to change what is generated. - -Note that there is a buggy JS version of this code with tests and a command line tool at https://github.com/zk-email-verify/zk-regex/, which also now supports some additional character classes. Once it reaches parity, we expect to update this repo to use that library instead of gen.py. - -Edit the regex on the top of lexical.js to change which regex is generated, then run `python3 gen.py`. - -## Circom Instructions - -First, generate a regex. Go to our [min_dfa fork](zkregex.com/min_dfa) of cyberzhg's toolbox and insert your regex on the top line. We've forked [min-dfa into a UI here](zkregex.com/min_dfa) to create a UI that converts existing regexes with [] support, as well as escapes \_, and the character classes a-z, A-Z, and 0-9. It also shows the DFA states very clearly so you can choose accept states easily. This should make converting regexes into DFA form way cleaner. - -In the function `test_regex()` in `regex_to_dfa.js`, modify either `let raw_regex = ` (that supports some regex strings like `[A-Za-z0-9]` [but no other character ranges]) or modify `let regex = regexToMinDFASpec()` (that does not support generic brackets or character ranges, only the limited syntax in https://zkregex.com/min_dfa) in `regex_to_circom/regex_to_dfa.js`. The top line of min_dfa tool corresponds to the "raw_regex", and the second line corresponds to the expanded "regex". - -Then run `npx tsx regex_to_dfa.js` to make sure that it compiles and `tsx` is installed, and then remove all `console.log` statements except for the last line, and finally run `python3 gen.py`. - -This will output a circom body. Wrap it the same way for instance circuits/regexes/from_regex.circom is written. Note that states in the zkregex [min_dfa visualizer](zkregex.com/min_dfa) are now 0 indexed (previous to Apr 2023 you had to subtract 1 from the indexes that showed up to match the circom, now it is the same). - -Note that if your regex uses `^` at the start to mean sentinel starting character, you have to edit the resulting regex.circom file to manually change `94` (ascii code of ^) to `128` (manually inserted sentinel character meaning start, you'll see it defined as the 0th character of the string). - -We will soon have a website [WIP](https://frontend-zk-regex.vercel.app/) that automatically does this. If you'd like to make this process simpler, cleaner, and less hacky, we'd recommend making a PR here or to the zk-regex library (which is a bit out of date regex-string wise and match group-wise). - -## Notes - -states[i+1][j] means that there was a character at msg[i] which led to the transition to state j. - -This means that reveal for index i should be looking at state of index i+1. - -Note that ^ has to be manually replaced with \x80 in the circom regex. - -## Halo2 - -You can use the compiled halo2_regex_lookup.txt file as input to the https://github.com/zk-email-verify/halo2-regex/ library, which will generate a regex circuit in halo2 instead. That circuit is more efficient than this one for large inputs for use for fast clientside proofs that require privacy. - -## Some regexes - -There are more in the regex section of the top level zk-email-verify README. Here are some examples however, for instance for from/subject/to order-free extraction: - -raw regex: ((\\n|\x80|^)(((from):([A-Za-z0-9 _."@-]+<)?[a-zA-Z0-9_.-]+@[a-zA-Z0-9_.]+>)?|(subject:[a-zA-Z 0-9]+)?|((to):([A-Za-z0-9 _."@-]+<)?[a-zA-Z0-9_.-]+@[a-zA-Z0-9_.]+>)?)(\\r))+ -min-dfa version: (((\n|^)(((from):([A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|0|1|2|3|4|5|6|7|8|9| |_|.|"|@|-]+<)?[a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9|_|.|-]+@[a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9|_|.]+>)?|(subject:[a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z| |0|1|2|3|4|5|6|7|8|9]+)?|((to):([A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|0|1|2|3|4|5|6|7|8|9| |_|.|"|@|-]+<)?[a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9|_|.|-]+@[a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9|_|.]+>)?)(\r))+) diff --git a/libs/regex_to_circom/gen.py b/libs/regex_to_circom/gen.py deleted file mode 100644 index b4507045f..000000000 --- a/libs/regex_to_circom/gen.py +++ /dev/null @@ -1,191 +0,0 @@ -import subprocess -import json -import string - -# Clear file -OUTPUT_HALO2 = True - -graph_json = json.loads(subprocess.check_output(['npx', 'tsx', 'regex_to_dfa.js'])) -N = len(graph_json) - -# Outgoing nodes -graph = [{} for i in range(N)] -# Incoming Nodes -rev_graph = [[] for i in range(N)] -accept_nodes = set() - -for i in range(N): - for k in graph_json[i]['edges']: - # assert len(k) == 1 - # assert ord(k) < 128 - v = graph_json[i]['edges'][k] - graph[i][k] = v - rev_graph[v].append((k, i)) - # Iterates over value in set for halo2 lookup, append to file - - if graph_json[i]['type'] == 'accept': - accept_nodes.add(i) - -accept_nodes = list(accept_nodes) -assert len(accept_nodes) == 1 - -if (OUTPUT_HALO2): - with open('halo2_regex_lookup.txt', 'w') as f: - for a in accept_nodes: - print(str(a) + " ", file=f, end='') - print("", file=f) - for i in range(N): - for k in graph_json[i]['edges']: - v = graph_json[i]['edges'][k] - for val in json.loads(k): - with open('halo2_regex_lookup.txt', 'a') as f: - print(i, v, ord(val), file=f) - -eq_i = 0 -lt_i = 0 -and_i = 0 -multi_or_i = 0 - -lines = [] -lines.append("for (var i = 0; i < num_bytes; i++) {") - -assert 0 not in accept_nodes - -for i in range(1, N): - outputs = [] - for k, prev_i in rev_graph[i]: - vals = json.loads(k) - eq_outputs = [] - - uppercase = set(string.ascii_uppercase) - lowercase = set(string.ascii_lowercase) - digits = set(string.digits) - vals = set(vals) - - if uppercase <= vals: - vals -= uppercase - lines.append(f"\tlt[{lt_i}][i] = LessThan(8);") - lines.append(f"\tlt[{lt_i}][i].in[0] <== 64;") - lines.append(f"\tlt[{lt_i}][i].in[1] <== in[i];") - - lines.append(f"\tlt[{lt_i+1}][i] = LessThan(8);") - lines.append(f"\tlt[{lt_i+1}][i].in[0] <== in[i];") - lines.append(f"\tlt[{lt_i+1}][i].in[1] <== 91;") - - lines.append(f"\tand[{and_i}][i] = AND();") - lines.append(f"\tand[{and_i}][i].a <== lt[{lt_i}][i].out;") - lines.append(f"\tand[{and_i}][i].b <== lt[{lt_i+1}][i].out;") - - eq_outputs.append(('and', and_i)) - lt_i += 2 - and_i += 1 - - if lowercase <= vals: - vals -= lowercase - lines.append(f"\tlt[{lt_i}][i] = LessThan(8);") - lines.append(f"\tlt[{lt_i}][i].in[0] <== 96;") - lines.append(f"\tlt[{lt_i}][i].in[1] <== in[i];") - - lines.append(f"\tlt[{lt_i+1}][i] = LessThan(8);") - lines.append(f"\tlt[{lt_i+1}][i].in[0] <== in[i];") - lines.append(f"\tlt[{lt_i+1}][i].in[1] <== 123;") - - lines.append(f"\tand[{and_i}][i] = AND();") - lines.append(f"\tand[{and_i}][i].a <== lt[{lt_i}][i].out;") - lines.append(f"\tand[{and_i}][i].b <== lt[{lt_i+1}][i].out;") - - eq_outputs.append(('and', and_i)) - lt_i += 2 - and_i += 1 - - if digits <= vals: - vals -= digits - lines.append(f"\tlt[{lt_i}][i] = LessThan(8);") - lines.append(f"\tlt[{lt_i}][i].in[0] <== 47;") - lines.append(f"\tlt[{lt_i}][i].in[1] <== in[i];") - - lines.append(f"\tlt[{lt_i+1}][i] = LessThan(8);") - lines.append(f"\tlt[{lt_i+1}][i].in[0] <== in[i];") - lines.append(f"\tlt[{lt_i+1}][i].in[1] <== 58;") - - lines.append(f"\tand[{and_i}][i] = AND();") - lines.append(f"\tand[{and_i}][i].a <== lt[{lt_i}][i].out;") - lines.append(f"\tand[{and_i}][i].b <== lt[{lt_i+1}][i].out;") - - eq_outputs.append(('and', and_i)) - lt_i += 2 - and_i += 1 - - for c in vals: - assert len(c) == 1 - lines.append(f"\teq[{eq_i}][i] = IsEqual();") - lines.append(f"\teq[{eq_i}][i].in[0] <== in[i];") - lines.append(f"\teq[{eq_i}][i].in[1] <== {ord(c)};") - eq_outputs.append(('eq', eq_i)) - eq_i += 1 - - lines.append(f"\tand[{and_i}][i] = AND();") - lines.append(f"\tand[{and_i}][i].a <== states[i][{prev_i}];") - - if len(eq_outputs) == 1: - lines.append(f"\tand[{and_i}][i].b <== {eq_outputs[0][0]}[{eq_outputs[0][1]}][i].out;") - elif len(eq_outputs) > 1: - lines.append(f"\tmulti_or[{multi_or_i}][i] = MultiOR({len(eq_outputs)});") - for output_i in range(len(eq_outputs)): - lines.append(f"\tmulti_or[{multi_or_i}][i].in[{output_i}] <== {eq_outputs[output_i][0]}[{eq_outputs[output_i][1]}][i].out;") - lines.append(f"\tand[{and_i}][i].b <== multi_or[{multi_or_i}][i].out;") - multi_or_i += 1 - - outputs.append(and_i) - and_i += 1 - # print(f"states[i+1][{i}] = states[i][{prev_i}] AND (in[i] == {repr(k)})") - if len(outputs) == 1: - lines.append(f"\tstates[i+1][{i}] <== and[{outputs[0]}][i].out;") - elif len(outputs) > 1: - lines.append(f"\tmulti_or[{multi_or_i}][i] = MultiOR({len(outputs)});") - for output_i in range(len(outputs)): - lines.append(f"\tmulti_or[{multi_or_i}][i].in[{output_i}] <== and[{outputs[output_i]}][i].out;") - lines.append(f"\tstates[i+1][{i}] <== multi_or[{multi_or_i}][i].out;") - multi_or_i += 1 - -lines.append("}") - -declarations = [] - -if eq_i > 0: - declarations.append(f"component eq[{eq_i}][num_bytes];") -if lt_i > 0: - declarations.append(f"component lt[{lt_i}][num_bytes];") -if and_i > 0: - declarations.append(f"component and[{and_i}][num_bytes];") -if multi_or_i > 0: - declarations.append(f"component multi_or[{multi_or_i}][num_bytes];") -declarations.append(f"signal states[num_bytes+1][{N}];") -declarations.append("") - -init_code = [] - -init_code.append("for (var i = 0; i < num_bytes; i++) {") -init_code.append("\tstates[i][0] <== 1;") -init_code.append("}") - -init_code.append(f"for (var i = 1; i < {N}; i++) {{") -init_code.append("\tstates[0][i] <== 0;") -init_code.append("}") - -init_code.append("") - -lines = declarations + init_code + lines - -accept_node = accept_nodes[0] -accept_lines = [""] -accept_lines.append("signal final_state_sum[num_bytes+1];") -accept_lines.append(f"final_state_sum[0] <== states[0][{accept_node}];") -accept_lines.append("for (var i = 1; i <= num_bytes; i++) {") -accept_lines.append(f"\tfinal_state_sum[i] <== final_state_sum[i-1] + states[i][{accept_node}];") -accept_lines.append("}") -accept_lines.append("out <== final_state_sum[num_bytes];") - -lines += accept_lines - -print("\n".join(lines)) diff --git a/libs/regex_to_circom/lexical.js b/libs/regex_to_circom/lexical.js deleted file mode 100644 index 7e0627e60..000000000 --- a/libs/regex_to_circom/lexical.js +++ /dev/null @@ -1,555 +0,0 @@ -/** - * Try parsing simple regular expression to syntax tree. - * - * Basic grammars: - * Empty: S -> ϵ - * Cat: S -> S S - * Or: S -> S | S - * Star: S -> S * - * Text: S -> [0-9a-zA-Z] - * S -> ( S ) - * - * Extension: - * Plus: S -> S + -> S S * - * Ques: S -> S ? -> (S | ϵ) - * - * @param {string} text The input regular expression - * @return {string|object} Returns a string that is an error message if failed to parse the expression, - * otherwise returns an object which is the syntax tree. - * - * Edited from https://github.com/CyberZHG/toolbox/blob/gh-pages/js/lexical.js - */ -function parseRegex(text) { - "use strict"; - function parseSub(text, begin, end, first) { - var i, - sub, - last = 0, - node = { begin: begin, end: end }, - virNode, - tempNode, - stack = 0, - parts = []; - if (text.length === 0) { - return "Error: empty input at " + begin + "."; - } - if (first) { - for (i = 0; i <= text.length; i += 1) { - if (i === text.length || (text[i] === "|" && stack === 0)) { - if (last === 0 && i === text.length) { - return parseSub(text, begin + last, begin + i, false); - } - sub = parseSub(text.slice(last, i), begin + last, begin + i, true); - if (typeof sub === "string") { - return sub; - } - parts.push(sub); - last = i + 1; - } else if (text[i] === "(") { - stack += 1; - } else if (text[i] === ")") { - stack -= 1; - } - } - if (parts.length === 1) { - return parts[0]; - } - node.type = "or"; - node.parts = parts; - } else { - for (i = 0; i < text.length; i += 1) { - if (text[i] === "(") { - last = i + 1; - i += 1; - stack = 1; - while (i < text.length && stack !== 0) { - if (text[i] === "(") { - stack += 1; - } else if (text[i] === ")") { - stack -= 1; - } - i += 1; - } - if (stack !== 0) { - return "Error: missing right bracket for " + (begin + last) + "."; - } - i -= 1; - sub = parseSub(text.slice(last, i), begin + last, begin + i, true); - if (typeof sub === "string") { - return sub; - } - sub.begin -= 1; - sub.end += 1; - parts.push(sub); - } else if (text[i] === "*") { - if (parts.length === 0) { - return "Error: unexpected * at " + (begin + i) + "."; - } - tempNode = { begin: parts[parts.length - 1].begin, end: parts[parts.length - 1].end + 1 }; - tempNode.type = "star"; - tempNode.sub = parts[parts.length - 1]; - parts[parts.length - 1] = tempNode; - } else if (text[i] === "+") { - if (parts.length === 0) { - return "Error: unexpected + at " + (begin + i) + "."; - } - virNode = { begin: parts[parts.length - 1].begin, end: parts[parts.length - 1].end + 1 }; - virNode.type = "star"; - virNode.sub = parts[parts.length - 1]; - tempNode = { begin: parts[parts.length - 1].begin, end: parts[parts.length - 1].end + 1 }; - tempNode.type = "cat"; - tempNode.parts = [parts[parts.length - 1], virNode]; - parts[parts.length - 1] = tempNode; - } else if (text[i] === "?") { - if (parts.length === 0) { - return "Error: unexpected + at " + (begin + i) + "."; - } - virNode = { begin: parts[parts.length - 1].begin, end: parts[parts.length - 1].end + 1 }; - virNode.type = "empty"; - virNode.sub = parts[parts.length - 1]; - tempNode = { begin: parts[parts.length - 1].begin, end: parts[parts.length - 1].end + 1 }; - tempNode.type = "or"; - tempNode.parts = [parts[parts.length - 1], virNode]; - parts[parts.length - 1] = tempNode; - } else if (text[i] === "ϵ") { - tempNode = { begin: begin + i, end: begin + i + 1 }; - tempNode.type = "empty"; - parts.push(tempNode); - } else if (Array.isArray(text[i])) { - tempNode = { begin: begin + i, end: begin + i + 1 }; - tempNode.type = "text"; - tempNode.text = text[i][0]; - parts.push(tempNode); - } else { - tempNode = { begin: begin + i, end: begin + i + 1 }; - tempNode.type = "text"; - tempNode.text = text[i]; - parts.push(tempNode); - } - } - if (parts.length === 1) { - return parts[0]; - } - node.type = "cat"; - node.parts = parts; - } - return node; - } - - let new_text = []; - let i = 0; - while (i < text.length) { - if (text[i] === "\\") { - const escapeMap = { n: "\n", r: "\r", t: "\t", v: "\v", f: "\f", "^": String.fromCharCode(128) }; - const char = text[i + 1]; - new_text.push([escapeMap[char] || char]); - i += 2; - } else { - new_text.push(text[i]); - i += 1; - } - } - return parseSub(new_text, 0, new_text.length, true); -} - -/** - * Convert regular expression to nondeterministic finite automaton. - * - * @param {string} text @see parseRegex() - * @return {object|string} - */ -function regexToNfa(text) { - "use strict"; - function generateGraph(node, start, end, count) { - var i, last, temp, tempStart, tempEnd; - if (!start.hasOwnProperty("id")) { - start.id = count; - count += 1; - } - switch (node.type) { - case "empty": - start.edges.push(["ϵ", end]); - break; - case "text": - start.edges.push([node.text, end]); - break; - case "cat": - last = start; - for (i = 0; i < node.parts.length - 1; i += 1) { - temp = { type: "", edges: [] }; - count = generateGraph(node.parts[i], last, temp, count); - last = temp; - } - count = generateGraph(node.parts[node.parts.length - 1], last, end, count); - break; - case "or": - for (i = 0; i < node.parts.length; i += 1) { - tempStart = { type: "", edges: [] }; - tempEnd = { type: "", edges: [["ϵ", end]] }; - start.edges.push(["ϵ", tempStart]); - count = generateGraph(node.parts[i], tempStart, tempEnd, count); - } - break; - case "star": - tempStart = { type: "", edges: [] }; - tempEnd = { - type: "", - edges: [ - ["ϵ", tempStart], - ["ϵ", end], - ], - }; - start.edges.push(["ϵ", tempStart]); - start.edges.push(["ϵ", end]); - count = generateGraph(node.sub, tempStart, tempEnd, count); - break; - } - if (!end.hasOwnProperty("id")) { - end.id = count; - count += 1; - } - return count; - } - var ast = parseRegex(text), - start = { type: "start", edges: [] }, - accept = { type: "accept", edges: [] }; - if (typeof ast === "string") { - return ast; - } - generateGraph(ast, start, accept, 0); - return start; -} - -/** - * Convert nondeterministic finite automaton to deterministic finite automaton. - * - * @param {object} nfa @see regexToNfa(), the function assumes that the given NFA is valid. - * @return {object} dfa Returns the first element of the DFA. - */ -function nfaToDfa(nfa) { - "use strict"; - function getClosure(nodes) { - var i, - closure = [], - stack = [], - symbols = [], - type = "", - top; - for (i = 0; i < nodes.length; i += 1) { - stack.push(nodes[i]); - closure.push(nodes[i]); - if (nodes[i].type === "accept") { - type = "accept"; - } - } - while (stack.length > 0) { - top = stack.pop(); - // If top is of type string and starts with "Error" then return error - if (typeof top === "string" && top[0] === "E") { - console.log(top); - continue; - } - for (i = 0; i < top.edges.length; i += 1) { - if (top.edges[i][0] === "ϵ") { - if (closure.indexOf(top.edges[i][1]) < 0) { - stack.push(top.edges[i][1]); - closure.push(top.edges[i][1]); - if (top.edges[i][1].type === "accept") { - type = "accept"; - } - } - } else { - if (symbols.indexOf(top.edges[i][0]) < 0) { - symbols.push(top.edges[i][0]); - } - } - } - } - closure.sort(function (a, b) { - return a.id - b.id; - }); - symbols.sort(); - return { - key: closure - .map(function (x) { - return x.id; - }) - .join(","), - items: closure, - symbols: symbols, - type: type, - edges: [], - trans: {}, - }; - } - function getClosedMove(closure, symbol) { - var i, - j, - node, - nexts = []; - for (i = 0; i < closure.items.length; i += 1) { - node = closure.items[i]; - for (j = 0; j < node.edges.length; j += 1) { - if (symbol === node.edges[j][0]) { - if (nexts.indexOf(node.edges[j][1]) < 0) { - nexts.push(node.edges[j][1]); - } - } - } - } - return getClosure(nexts); - } - function toAlphaCount(n) { - var a = "A".charCodeAt(0), - z = "Z".charCodeAt(0), - len = z - a + 1, - s = ""; - while (n >= 0) { - s = String.fromCharCode((n % len) + a) + s; - n = Math.floor(n / len) - 1; - } - return s; - } - var i, - first = getClosure([nfa]), - states = {}, - front = 0, - top, - closure, - queue = [first], - count = 0; - first.id = toAlphaCount(count); - states[first.key] = first; - while (front < queue.length) { - top = queue[front]; - front += 1; - for (i = 0; i < top.symbols.length; i += 1) { - closure = getClosedMove(top, top.symbols[i]); - if (!states.hasOwnProperty(closure.key)) { - count += 1; - closure.id = toAlphaCount(count); - states[closure.key] = closure; - queue.push(closure); - } - top.trans[top.symbols[i]] = states[closure.key]; - top.edges.push([top.symbols[i], states[closure.key]]); - } - } - return first; -} - -/** - * Convert the DFA to its minimum form using Hopcroft's algorithm. - * - * @param {object} dfa @see nfaToDfa(), the function assumes that the given DFA is valid. - * @return {object} dfa Returns the first element of the minimum DFA. - */ -function minDfa(dfa) { - "use strict"; - function getReverseEdges(start) { - var i, - top, - symbol, - next, - front = 0, - queue = [start], - visited = {}, - symbols = {}, // The input alphabet - idMap = {}, // Map id to states - revEdges = {}; // Map id to the ids which connects to the id with an alphabet - visited[start.id] = true; - while (front < queue.length) { - top = queue[front]; - front += 1; - idMap[top.id] = top; - for (i = 0; i < top.symbols.length; i += 1) { - symbol = top.symbols[i]; - if (!symbols.hasOwnProperty(symbol)) { - symbols[symbol] = true; - } - next = top.trans[symbol]; - if (!revEdges.hasOwnProperty(next.id)) { - revEdges[next.id] = {}; - } - if (!revEdges[next.id].hasOwnProperty(symbol)) { - revEdges[next.id][symbol] = []; - } - revEdges[next.id][symbol].push(top.id); - if (!visited.hasOwnProperty(next.id)) { - visited[next.id] = true; - queue.push(next); - } - } - } - return [Object.keys(symbols), idMap, revEdges]; - } - function hopcroft(symbols, idMap, revEdges) { - var i, - j, - k, - keys, - key, - key1, - key2, - top, - group1, - group2, - symbol, - revGroup, - ids = Object.keys(idMap).sort(), - partitions = {}, - front = 0, - queue = [], - visited = {}; - group1 = []; - group2 = []; - for (i = 0; i < ids.length; i += 1) { - if (idMap[ids[i]].type === "accept") { - group1.push(ids[i]); - } else { - group2.push(ids[i]); - } - } - key = group1.join(","); - partitions[key] = group1; - queue.push(key); - visited[key] = 0; - if (group2.length !== 0) { - key = group2.join(","); - partitions[key] = group2; - queue.push(key); - } - while (front < queue.length) { - top = queue[front]; - front += 1; - if (top) { - top = top.split(","); - for (i = 0; i < symbols.length; i += 1) { - symbol = symbols[i]; - revGroup = {}; - for (j = 0; j < top.length; j += 1) { - if (revEdges.hasOwnProperty(top[j]) && revEdges[top[j]].hasOwnProperty(symbol)) { - for (k = 0; k < revEdges[top[j]][symbol].length; k += 1) { - revGroup[revEdges[top[j]][symbol][k]] = true; - } - } - } - keys = Object.keys(partitions); - for (j = 0; j < keys.length; j += 1) { - key = keys[j]; - group1 = []; - group2 = []; - for (k = 0; k < partitions[key].length; k += 1) { - if (revGroup.hasOwnProperty(partitions[key][k])) { - group1.push(partitions[key][k]); - } else { - group2.push(partitions[key][k]); - } - } - if (group1.length !== 0 && group2.length !== 0) { - delete partitions[key]; - key1 = group1.join(","); - key2 = group2.join(","); - partitions[key1] = group1; - partitions[key2] = group2; - if (visited.hasOwnProperty(key1)) { - queue[visited[key1]] = null; - visited[key1] = queue.length; - queue.push(key1); - visited[key2] = queue.length; - queue.push(key2); - } else if (group1.length <= group2.length) { - visited[key1] = queue.length; - queue.push(key1); - } else { - visited[key2] = queue.length; - queue.push(key2); - } - } - } - } - } - } - return Object.values(partitions); - } - function buildMinNfa(start, partitions, idMap, revEdges) { - var i, - j, - temp, - node, - symbol, - nodes = [], - group = {}, - edges = {}; - partitions.sort(function (a, b) { - var ka = a.join(","), - kb = b.join(","); - if (ka < kb) { - return -1; - } - if (ka > kb) { - return 1; - } - return 0; - }); - for (i = 0; i < partitions.length; i += 1) { - if (partitions[i].indexOf(start.id) >= 0) { - if (i > 0) { - temp = partitions[i]; - partitions[i] = partitions[0]; - partitions[0] = temp; - } - break; - } - } - for (i = 0; i < partitions.length; i += 1) { - node = { - id: (i + 1).toString(), - key: partitions[i].join(","), - items: [], - symbols: [], - type: idMap[partitions[i][0]].type, - edges: [], - trans: {}, - }; - for (j = 0; j < partitions[i].length; j += 1) { - node.items.push(idMap[partitions[i][j]]); - group[partitions[i][j]] = i; - } - edges[i] = {}; - nodes.push(node); - } - Object.keys(revEdges).forEach(function (to) { - Object.keys(revEdges[to]).forEach(function (symbol) { - revEdges[to][symbol].forEach(function (from) { - if (!edges[group[from]].hasOwnProperty(group[to])) { - edges[group[from]][group[to]] = {}; - } - edges[group[from]][group[to]][symbol] = true; - }); - }); - }); - Object.keys(edges).forEach(function (from) { - Object.keys(edges[from]).forEach(function (to) { - symbol = JSON.stringify(Object.keys(edges[from][to]).sort()); - nodes[from].symbols.push(symbol); - nodes[from].edges.push([symbol, nodes[to]]); - nodes[from].trans[symbol] = nodes[to]; - }); - }); - return nodes[0]; - } - var edgesTuple = getReverseEdges(dfa), - symbols = edgesTuple[0], - idMap = edgesTuple[1], - revEdges = edgesTuple[2], - partitions = hopcroft(symbols, idMap, revEdges); - return buildMinNfa(dfa, partitions, idMap, revEdges); -} - -if (typeof require === "function") { - exports.parseRegex = parseRegex; - exports.regexToNfa = regexToNfa; - exports.nfaToDfa = nfaToDfa; - exports.minDfa = minDfa; -} diff --git a/libs/regex_to_circom/regex_to_dfa.js b/libs/regex_to_circom/regex_to_dfa.js deleted file mode 100644 index 1bb87beed..000000000 --- a/libs/regex_to_circom/regex_to_dfa.js +++ /dev/null @@ -1,293 +0,0 @@ -/*jslint browser: true*/ -/*global require, exports*/ -// import { STRING_PRESELECTOR } from "../src/helpers/constants.ts"; -import { minDfa, nfaToDfa, regexToNfa } from "./lexical"; - -/** This section defines helper regex components -- to edit the regex used, edit the return - * of the test_regex function. - * All of the relevant regexes are in the main repo README. - */ - -// Helper components -const a2z = "a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z"; -const A2Z = "A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z"; -const r0to9 = "0|1|2|3|4|5|6|7|8|9"; -const alphanum = `${a2z}|${A2Z}|${r0to9}`; - -const key_chars = `(${a2z})`; -const catch_all = - "(0|1|2|3|4|5|6|7|8|9|a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|;|<|=|>|\\?|@|\\[|\\\\|\\]|^|_|`|{|\\||}|~| |\t|\n|\r|\x0b|\x0c)"; -const catch_all_without_semicolon = - "(0|1|2|3|4|5|6|7|8|9|a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|<|=|>|\\?|@|\\[|\\\\|\\]|^|_|`|{|\\||}|~| |\t|\n|\r|\x0b|\x0c)"; - -const email_chars = `${alphanum}|_|.|-`; -const base_64 = `(${alphanum}|\\+|/|=)`; -const word_char = `(${alphanum}|_)`; - -const a2z_nosep = "abcdefghijklmnopqrstuvwxyz"; -const A2Z_nosep = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; -const a2f_nosep = "abcdef"; -const A2F_nosep = "ABCDEF"; -const r0to9_nosep = "0123456789"; - -// TODO: Note that this is replicated code in lexical.js as well -// Note that ^ has to be manually replaced with \x80 in the regex -const escapeMap = { n: "\n", r: "\r", t: "\t", v: "\v", f: "\f" }; -let whitespace = Object.values(escapeMap); -const slash_s = whitespace.join("|"); - -// The test_regex function whose return needs to be edited -// Note that in order to specify some strings in regex, we must use \\ to escape \'s. -// For instance, matching the literal + is represented as \\+. -// However, matching the literal \r (ascii 60) character is still \r -// Matching \ then an r as two characters would be \\r in the js string literal -function test_regex() { - // let to_from_regex_old = '(\r\n|\x80)(to|from):([A-Za-z0-9 _."@-]+<)?[a-zA-Z0-9_.-]+@[a-zA-Z0-9_.]+>?\r\n'; - // let regex = `\r\ndkim-signature:(${key_chars}=${catch_all_without_semicolon}+; )+bh=${base_64}+; `; - // let order_invariant_regex_raw = `((\\n|\x80|^)(((from):([A-Za-z0-9 _."@-]+<)?[a-zA-Z0-9_.-]+@[a-zA-Z0-9_.]+>)?|(subject:[a-zA-Z 0-9]+)?|((to):([A-Za-z0-9 _."@-]+<)?[a-zA-Z0-9_.-]+@[a-zA-Z0-9_.]+>)?|(dkim-signature:((a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z)=(0|1|2|3|4|5|6|7|8|9|a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|!|"|#|$|%|&|\'|\\(|\\)|\\*|\\+|,|-|.|/|:|<|=|>|\\?|@|[|\\\\|]|^|_|`|{|\\||}|~| |\t|\n|\r|\x0B|\f)+; ))?)(\\r))+` // Uses a-z syntax instead of | for each char - let email_address_regex = `([a-zA-Z0-9._%\\+-=]+@[a-zA-Z0-9.-]+)`; - - // ------- HEADER/SIGNATURE REGEX -------- - let order_invariant_header_regex_raw = `(((\\n|^)(((from):([A-Za-z0-9 _."@-]+<)?[a-zA-Z0-9_.-]+@[a-zA-Z0-9_.]+>)?|(subject:[a-zA-Z 0-9]+)?|((to):([A-Za-z0-9 _."@-]+<)?[a-zA-Z0-9_.-]+@[a-zA-Z0-9_.]+>)?)(\\r))+)\\n`; - let sig_regex = `\r\ndkim-signature:(${key_chars}=${catch_all_without_semicolon}+; )+bh=${base_64}+; `; - - // let full_header_regex = order_invariant_header_regex_raw + sig_regex; - // let raw_regex = order_invariant_header_regex_raw; - // let regex = regexToMinDFASpec(raw_regex) + sig_regex; - // console.log(format_regex_printable(sig_regex)); - - // -------- SUBJECT REGEXES -------- - // This raw subject line (with \\ replaced with \) can be put into regexr.com to test new match strings and sanity check that it works - // TODO: Other valid chars in email addresses: #$%!^/&*, outlined at https://ladedu.com/valid-characters-for-email-addresses-the-complete-list/ and in the RFC - - // -- SEND SPECIFIC REGEXES -- - // let send_specific_raw_subject_regex = `((\r\n)|^)subject:[Ss]end (\$)?[0-9]+(.[0-9]+)? [a-zA-Z]+ to (${email_address_regex}|0x[0-9a-fA_F]+)\r\n`; - // let raw_subject_regex = `((\r\n)|^)subject:[a-zA-Z]+ (\\$)?[0-9]+(.[0-9]+)? [a-zA-Z]+ to (([a-zA-Z0-9._%\\+-=]+@[a-zA-Z0-9.-]+)|0x[0-9]+)\r\n`; - // Input: ((\\r\\n)|^)subject:[Ss]end (\$)?[0-9]+(.[0-9]+)? [a-zA-Z]+ to (([a-zA-Z0-9._%\+-=]+@[a-zA-Z0-9.-]+)|0x[0-9]+)\\r\\n - // This can be pasted into the first line of https://zkregex.com/min_dfa (after replacing \\ -> \) - // ((\\r\\n)|\^)subject:[Ss]end (\$)?[0-9]+(\.[0-9])? (ETH|DAI|USDC|eth|usdc|dai) to (([a-zA-Z0-9\._%\+-]+@[a-zA-Z0-9\.-]+.[a-zA-Z0-9]+)|0x[0-9]+)\\r\\n - // console.log(raw_subject_regex); - - // -- GENERIC SUBJECT COMMANDS -- - let raw_subject_regex = `((\r\n)|^)subject:[a-zA-Z]+ (\\$)?[0-9]+(.[0-9]+)? [a-zA-Z]+ to (${email_address_regex}|0x[0-9a-fA_F]+)\r\n`; - - // -------- OTHER FIELD REGEXES -------- - let raw_from_regex = `(\r\n|^)from:([A-Za-z0-9 _.,"@-]+)<[a-zA-Z0-9_.-]+@[a-zA-Z0-9_.-]+>\r\n`; - // let message_id_regex = `(\r\n|^)message-id:<[=@.\\+_-a-zA-Z0-9]+>\r\n`; - - // -------- TWITTER BODY REGEX --------- - // let regex = STRING_PRESELECTOR + `${word_char}+`; - - // ---------- DEPRECATAED REGEXES ---------- - // let order_invariant_header_regex_raw = `(((\\n|^)(((from):([A-Za-z0-9 _."@-]+<)?[a-zA-Z0-9_.-]+@[a-zA-Z0-9_.]+>)?|(subject:[a-zA-Z 0-9]+)?|((to):([A-Za-z0-9 _."@-]+<)?[a-zA-Z0-9_.-]+@[a-zA-Z0-9_.]+>)?)(\\r))+)`; - // let order_invariant_full_regex_raw = `(dkim-signature:((a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z)=(0|1|2|3|4|5|6|7|8|9|a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|!|"|#|$|%|&|\'|\\(|\\)|\\*|\\+|,|-|.|/|:|<|=|>|\\?|@|[|\\\\|]|^|_|\`|{|\\||}|~| |\t|\n|\r|\x0B|\f)+; ))?)(\\r))+` // Uses a-z syntax instead of | for each char - // let old_regex = '(\r\n|\x80)(to|from):([A-Za-z0-9 _."@-]+<)?[a-zA-Z0-9_.-]+@[a-zA-Z0-9_.]+>?\r\n'; - // let regex = `(\n|^)(to|from):((${email_chars}|"|@| )+<)?(${email_chars})+@(${email_chars})+>?\r`; - // let regex = `(\r\n|^)(to|from):((${email_chars}|"|@| )+<)?(${email_chars})+@(${email_chars})+>?\r\n`; - // 'dkim-signature:((a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z)=(0|1|2|3|4|5|6|7|8|9|a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|!|"|#|$|%|&|\'|\\(|\\)|\\*|\\+|,|-|.|/|:|<|=|>|\\?|@|[|\\\\|]|^|_|`|{|\\||}|~| |\t|\n|\r|\x0B|\f)+; )+bh=(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9|\\+|/|=)+; ' - // let regex = 'hello(0|1|2|3|4|5|6|7|8|9)+world'; - - // --------- FINAL CONVERSION --------- - // console.log(format_regex_printable(raw_subject_regex)); - let regex = regexToMinDFASpec(raw_subject_regex); - // console.log(format_regex_printable(regex)); - - return regex; -} - -// Escapes and prints regexes (might be buggy) -function format_regex_printable(s) { - const escaped_string_json = JSON.stringify(s); - const escaped_string = escaped_string_json.slice(1, escaped_string_json.length - 1); - return escaped_string - .replaceAll("\\\\\\\\", "\\") - .replaceAll("\\\\", "\\") - .replaceAll("\\|", "\\\\|") - .replaceAll("/", "\\/") - .replaceAll("\u000b", "\\♥") - .replaceAll("|[|", "|\\[|") - .replaceAll("|]|", "|\\]|") - .replaceAll("|.|", "|\\.|") - .replaceAll("|$|", "|\\$|") - .replaceAll("|^|", "|\\^|"); - // let escaped = escape_whitespace(escape_whitespace(s.replaceAll("\\\\", "ZZZZZZZ"))); - // let fixed = escaped.replaceAll("\\(", "(").replaceAll("\\)", ")").replaceAll("\\+", "+").replaceAll("\\*", "*").replaceAll("\\?", "?"); -} - -// Note that this is not complete and very case specific i.e. can only handle a-z and a-f, and not a-c. -// This function expands [] sections to convert values for https://zkregex.com/min_dfa -// The input is a regex with [] and special characters (i.e. the first line of min_dfa tool) -// The output is expanded regexes without any special characters -function regexToMinDFASpec(str) { - // Replace all A-Z with A2Z etc - // TODO: Upstream this to min_dfa - let combined_nosep = str - .replaceAll("A-Z", A2Z_nosep) - .replaceAll("a-z", a2z_nosep) - .replaceAll("A-F", A2F_nosep) - .replaceAll("a-f", a2f_nosep) - .replaceAll("0-9", r0to9_nosep) - .replaceAll("\\w", A2Z_nosep + r0to9_nosep + a2z_nosep + "_") - .replaceAll("\\d", r0to9_nosep) - .replaceAll("\\s", slash_s); - // .replaceAll("\\w", A2Z_nosep + r0to9_nosep + a2z_nosep); // I think that there's also an underscore here - - function addPipeInsideBrackets(str) { - let result = ""; - let insideBrackets = false; - for (let i = 0; i < str.length; i++) { - if (str[i] === "[") { - result += str[i]; - insideBrackets = true; - continue; - } else if (str[i] === "]") { - insideBrackets = false; - } - let str_to_add = str[i]; - if (str[i] === "\\") { - i++; - str_to_add += str[i]; - } - result += insideBrackets ? "|" + str_to_add : str_to_add; - } - return result.replaceAll("[|", "[").replaceAll("[", "(").replaceAll("]", ")"); - } - - // function makeCurlyBracesFallback(str) { - // let result = ""; - // let insideBrackets = false; - // for (let i = 0; i < str.length; i++) { - // if (str[i] === "{") { - // result += str[i]; - // insideBrackets = true; - // continue; - // } else if (str[i] === "}") { - // insideBrackets = false; - // } - // result += insideBrackets ? "|" + str[i] : str[i]; - // } - // return result.replaceAll("[|", "[").replaceAll("[", "(").replaceAll("]", ")"); - // } - - function checkIfBracketsHavePipes(str) { - let result = true; - let insideBrackets = false; - let insideParens = 0; - let indexAt = 0; - for (let i = 0; i < str.length; i++) { - if (indexAt >= str.length) break; - if (str[indexAt] === "[") { - insideBrackets = true; - indexAt++; - continue; - } else if (str[indexAt] === "]") { - insideBrackets = false; - } - if (str[indexAt] === "(") { - insideParens++; - } else if (str[indexAt] === ")") { - insideParens--; - } - if (insideBrackets) { - if (str[indexAt] === "|") { - indexAt++; - } else { - result = false; - return result; - } - } - if (!insideParens && str[indexAt] === "|") { - console.log("Error: | outside of parens!"); - } - if (str[indexAt] === "\\") { - indexAt++; - } - indexAt++; - } - return result; - } - - let combined; - if (!checkIfBracketsHavePipes(combined_nosep)) { - // console.log("Adding pipes within brackets between everything!"); - combined = addPipeInsideBrackets(combined_nosep); - if (!checkIfBracketsHavePipes(combined)) { - console.log("Did not add brackets correctly!"); - } - } else { - combined = combined_nosep; - } - - return combined; -} - -function toNature(col) { - var i, - j, - base = "ABCDEFGHIJKLMNOPQRSTUVWXYZ", - result = 0; - if ("1" <= col[0] && col[0] <= "9") { - result = parseInt(col, 10); - } else { - for (i = 0, j = col.length - 1; i < col.length; i += 1, j -= 1) { - result += Math.pow(base.length, j) * (base.indexOf(col[i]) + 1); - } - } - return result; -} - -function printGraphForRegex(regex) { - let nfa = regexToNfa(regex); - let dfa = minDfa(nfaToDfa(nfa)); - - var i, - j, - states = {}, - nodes = [], - stack = [dfa], - symbols = [], - top; - - while (stack.length > 0) { - top = stack.pop(); - if (!states.hasOwnProperty(top.id)) { - states[top.id] = top; - top.nature = toNature(top.id); - nodes.push(top); - for (i = 0; i < top.edges.length; i += 1) { - if (top.edges[i][0] !== "ϵ" && symbols.indexOf(top.edges[i][0]) < 0) { - symbols.push(top.edges[i][0]); - } - stack.push(top.edges[i][1]); - } - } - } - nodes.sort(function (a, b) { - return a.nature - b.nature; - }); - symbols.sort(); - - let graph = []; - for (let i = 0; i < nodes.length; i += 1) { - let curr = {}; - curr.type = nodes[i].type; - curr.edges = {}; - for (let j = 0; j < symbols.length; j += 1) { - if (nodes[i].trans.hasOwnProperty(symbols[j])) { - curr.edges[symbols[j]] = nodes[i].trans[symbols[j]].nature - 1; - } - } - graph[nodes[i].nature - 1] = curr; - } - - console.log(JSON.stringify(graph)); - return JSON.stringify(graph); -} - -let regex = test_regex(); -printGraphForRegex(regex); - -if (typeof require === "function") { - exports.regexToMinDFASpec = regexToMinDFASpec; - exports.toNature = toNature; -} From 90fe03df5884b365fd0939c3934e76a62ab6f8e8 Mon Sep 17 00:00:00 2001 From: Saleel Date: Tue, 26 Mar 2024 14:09:30 +0530 Subject: [PATCH 2/2] remove regex_to_circom from docs --- docs/MiscellaniousDocs/twitterREADME.md | 4 +--- docs/README.md | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/docs/MiscellaniousDocs/twitterREADME.md b/docs/MiscellaniousDocs/twitterREADME.md index d1cf70d27..607df13b5 100644 --- a/docs/MiscellaniousDocs/twitterREADME.md +++ b/docs/MiscellaniousDocs/twitterREADME.md @@ -52,9 +52,7 @@ If you encounter any issues while using the Twitter Email Verifier. Feel free to - + + ## Terminology