OLD | NEW |
(Empty) | |
| 1 """JsLex: a lexer for Javascript""" |
| 2 # Originally from https://bitbucket.org/ned/jslex |
| 3 from __future__ import unicode_literals |
| 4 |
| 5 import re |
| 6 |
| 7 |
class Tok(object):
    """
    A specification for a single token class.

    Every instance claims a unique, monotonically increasing ``id`` from a
    class-level counter; the Lexer uses that id to build a distinct named
    regex group per token.
    """
    # Shared counter: the next id to hand out.
    num = 0

    def __init__(self, name, regex, next=None):
        self.name = name
        self.regex = regex
        # Lexer state to switch into after this token matches (None = stay).
        self.next = next
        # Claim a unique id and advance the class-wide counter.
        self.id = Tok.num
        Tok.num += 1
| 20 |
| 21 |
def literals(choices, prefix="", suffix=""):
    """
    Create a regex from a space-separated list of literal `choices`.

    If provided, `prefix` and `suffix` will be attached to each choice
    individually.
    """
    alternatives = ["%s%s%s" % (prefix, re.escape(choice), suffix)
                    for choice in choices.split()]
    return "|".join(alternatives)
| 30 |
| 31 |
class Lexer(object):
    """
    A generic multi-state regex-based lexer.

    `states` maps a state name to a list of Tok rules; for each state one
    combined regex is built, with one named group ("t<id>") per Tok so the
    winning alternative can be mapped back to its Tok.  `first` is the name
    of the state the lexer starts in.
    """

    def __init__(self, states, first):
        # state name -> compiled combined regex for that state
        self.regexes = {}
        # group name ("t<id>") -> Tok, shared across all states
        self.toks = {}

        for state, rules in states.items():
            parts = []
            for tok in rules:
                # Group names must be valid identifiers, hence the "t" prefix.
                groupid = "t%d" % tok.id
                self.toks[groupid] = tok
                parts.append("(?P<%s>%s)" % (groupid, tok.regex))
            # Rule order matters: alternation tries the Toks in list order.
            # VERBOSE lets the token regexes carry comments and layout.
            self.regexes[state] = re.compile("|".join(parts), re.MULTILINE | re.VERBOSE)

        self.state = first

    def lex(self, text):
        """
        Lexically analyze `text`.

        Yields pairs (`name`, `tokentext`).
        """
        end = len(text)
        # Resume from whatever state the previous lex() call ended in.
        state = self.state
        regexes = self.regexes
        toks = self.toks
        start = 0

        while start < end:
            for match in regexes[state].finditer(text, start):
                # lastgroup is the named group that actually matched.
                name = match.lastgroup
                tok = toks[name]
                toktext = match.group(name)
                # NOTE(review): assumes every match starts exactly at `start`
                # (i.e. matches are contiguous); with a catch-all rule like
                # r"." in the state this holds — confirm for any new state.
                start += len(toktext)
                yield (tok.name, toktext)

                if tok.next:
                    # State change: abandon this finditer (it was compiled for
                    # the old state) and restart the scan in the new state.
                    state = tok.next
                    break

        # Persist the final state so a later lex() call continues from it.
        self.state = state
| 76 |
| 77 |
class JsLexer(Lexer):
    """
    A Javascript lexer

    >>> lexer = JsLexer()
    >>> list(lexer.lex("a = 1"))
    [('id', 'a'), ('ws', ' '), ('punct', '='), ('ws', ' '), ('dnum', '1')]

    This doesn't properly handle non-ASCII characters in the Javascript source.
    """

    # Because these tokens are matched as alternatives in a regex, longer
    # possibilities must appear in the list before shorter ones, for example,
    # '>>' before '>'.
    #
    # Note that we don't have to detect malformed Javascript, only properly
    # lex correct Javascript, so much of this is simplified.

    # Details of Javascript lexical structure are taken from
    # http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-262.pdf

    # A useful explanation of automatic semicolon insertion is at
    # http://inimino.org/~inimino/blog/javascript_semicolons

    # Rules shared by both lexer states, tried before the state-specific ones.
    both_before = [
        Tok("comment", r"/\*(.|\n)*?\*/"),
        Tok("linecomment", r"//.*?$"),
        Tok("ws", r"\s+"),
        Tok("keyword", literals("""
                                break case catch class const continue debugger
                                default delete do else enum export extends
                                finally for function if import in instanceof
                                new return super switch this throw try typeof
                                var void while with
                                """, suffix=r"\b"), next='reg'),
        Tok("reserved", literals("null true false", suffix=r"\b"), next='div'),
        # Fixed: the first-char class used to be [a-zA-Z_$ ] — the stray
        # space is significant inside a character class even under VERBOSE —
        # and the \u escape class was [0-9a-fA-Z], accepting non-hex
        # uppercase letters.  Both now match the ECMA-262 IdentifierStart
        # grammar and agree with the rest-chars line below.
        Tok("id", r"""
                  ([a-zA-Z_$]|\\u[0-9a-fA-F]{4})      # first char
                  ([a-zA-Z_$0-9]|\\u[0-9a-fA-F]{4})*  # rest chars
                  """, next='div'),
        Tok("hnum", r"0[xX][0-9a-fA-F]+", next='div'),
        Tok("onum", r"0[0-7]+"),
        Tok("dnum", r"""
                    (   (0|[1-9][0-9]*)     # DecimalIntegerLiteral
                        \.                  # dot
                        [0-9]*              # DecimalDigits-opt
                        ([eE][-+]?[0-9]+)?  # ExponentPart-opt
                     |
                        \.                  # dot
                        [0-9]+              # DecimalDigits
                        ([eE][-+]?[0-9]+)?  # ExponentPart-opt
                     |
                        (0|[1-9][0-9]*)     # DecimalIntegerLiteral
                        ([eE][-+]?[0-9]+)?  # ExponentPart-opt
                    )
                    """, next='div'),
        Tok("punct", literals("""
                              >>>= === !== >>> <<= >>= <= >= == != << >> &&
                              || += -= *= %= &= |= ^=
                              """), next="reg"),
        Tok("punct", literals("++ -- ) ]"), next='div'),
        Tok("punct", literals("{ } ( [ . ; , < > + - * % & | ^ ! ~ ? : ="), next='reg'),
        Tok("string", r'"([^"\\]|(\\(.|\n)))*?"', next='div'),
        Tok("string", r"'([^'\\]|(\\(.|\n)))*?'", next='div'),
    ]

    # Catch-all rule, tried last in every state, so lex() always advances.
    both_after = [
        Tok("other", r"."),
    ]

    states = {
        # slash will mean division
        'div': both_before + [
            Tok("punct", literals("/= /"), next='reg'),
        ] + both_after,

        # slash will mean regex
        'reg': both_before + [
            Tok("regex",
                r"""
                    /                       # opening slash
                    # First character is..
                    (   [^*\\/[]            # anything but * \ / or [
                    |   \\.                 # or an escape sequence
                    |   \[                  # or a class, which has
                            (   [^\]\\]     # anything but \ or ]
                            |   \\.         # or an escape sequence
                            )*              # many times
                        \]
                    )
                    # Following characters are same, except for excluding a star
                    (   [^\\/[]             # anything but \ / or [
                    |   \\.                 # or an escape sequence
                    |   \[                  # or a class, which has
                            (   [^\]\\]     # anything but \ or ]
                            |   \\.         # or an escape sequence
                            )*              # many times
                        \]
                    )*                      # many times
                    /                       # closing slash
                    [a-zA-Z0-9]*            # trailing flags
                """, next='div'),
        ] + both_after,
    }

    def __init__(self):
        # 'reg' is the start state: at the top of a program a slash begins
        # a regex literal, not a division.
        super(JsLexer, self).__init__(self.states, 'reg')
| 185 |
| 186 |
def prepare_js_for_gettext(js):
    """
    Convert the Javascript source `js` into something resembling C for
    xgettext.

    What actually happens is that all the regex literals are replaced with
    "REGEX".
    """
    def escape_quotes(match):
        """Regex callback: escape a bare double quote, pass everything else through."""
        piece = match.group(0)
        return r'\"' if piece == '"' else piece

    chunks = []
    for name, tok in JsLexer().lex(js):
        if name == 'regex':
            # C doesn't grok regexes, and they aren't needed for gettext,
            # so just output a string instead.
            tok = '"REGEX"'
        elif name == 'string' and tok.startswith("'"):
            # C doesn't have single-quoted strings, so make all strings
            # double-quoted.  Walk the guts escape-by-escape so already
            # backslashed characters are left alone.
            tok = '"%s"' % re.sub(r"\\.|.", escape_quotes, tok[1:-1])
        elif name == 'id':
            # C can't deal with Unicode escapes in identifiers. We don't
            # need them for gettext anyway, so replace them with something
            # innocuous
            tok = tok.replace("\\", "U")
        chunks.append(tok)
    return ''.join(chunks)
OLD | NEW |