- """JsLex: a lexer for Javascript"""
- # Originally from https://bitbucket.org/ned/jslex
- from __future__ import unicode_literals
-
- import re
-
-
class Tok(object):
    """
    A specification for a token class.

    `regex` is combined into the state's master regex with re.VERBOSE, and
    `next`, if given, names the lexer state to switch to after a token of
    this class is matched.
    """
    num = 0

    def __init__(self, name, regex, next=None):
        self.id = Tok.num
        Tok.num += 1
        self.name = name
        self.regex = regex
        self.next = next


def literals(choices, prefix="", suffix=""):
    """
    Create a regex from a space-separated list of literal `choices`.

    If provided, `prefix` and `suffix` will be attached to each choice
    individually.
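
    For example, with letters that `re.escape` leaves untouched, the
    resulting pattern is easy to read:

    >>> literals("do else", suffix=r"\b")
    'do\\b|else\\b'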
- """
- return "|".join(prefix + re.escape(c) + suffix for c in choices.split())
-
-
class Lexer(object):
    """
    A generic multi-state regex-based lexer.
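
    For example, a minimal single-state lexer (the token names and patterns
    here are illustrative only, not part of JsLexer):

    >>> toy = Lexer({'start': [Tok("num", r"[0-9]+"),
    ...                        Tok("op", r"[+-]")]}, 'start')
    >>> list(toy.lex("1+2"))
    [('num', '1'), ('op', '+'), ('num', '2')]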
- """
-
- def __init__(self, states, first):
- self.regexes = {}
- self.toks = {}
-
- for state, rules in states.items():
- parts = []
- for tok in rules:
- groupid = "t%d" % tok.id
- self.toks[groupid] = tok
- parts.append("(?P<%s>%s)" % (groupid, tok.regex))
- self.regexes[state] = re.compile("|".join(parts), re.MULTILINE | re.VERBOSE)
-
- self.state = first
-
    def lex(self, text):
        """
        Lexically analyze `text`.

        Yields pairs (`name`, `tokentext`). The lexer's state persists
        across calls, so a later call resumes in the state where the
        previous text ended.
- """
- end = len(text)
- state = self.state
- regexes = self.regexes
- toks = self.toks
- start = 0
-
- while start < end:
- for match in regexes[state].finditer(text, start):
- name = match.lastgroup
- tok = toks[name]
- toktext = match.group(name)
- start += len(toktext)
- yield (tok.name, toktext)
-
- if tok.next:
- state = tok.next
- break
-
- self.state = state
-
-
class JsLexer(Lexer):
    """
    A JavaScript lexer

    >>> lexer = JsLexer()
    >>> list(lexer.lex("a = 1"))
    [('id', 'a'), ('ws', ' '), ('punct', '='), ('ws', ' '), ('dnum', '1')]
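
    Whether a slash starts a regex literal or means division depends on the
    lexer state, which is set by the preceding token:

    >>> list(JsLexer().lex("x = /a/g"))
    [('id', 'x'), ('ws', ' '), ('punct', '='), ('ws', ' '), ('regex', '/a/g')]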

    This doesn't properly handle non-ASCII characters in the JavaScript source.
    """

    # Because these tokens are matched as alternatives in a regex, longer
    # possibilities must appear in the list before shorter ones, for example,
    # '>>' before '>'.
    #
    # Note that we don't have to detect malformed JavaScript, only properly
    # lex correct JavaScript, so much of this is simplified.

    # Details of JavaScript lexical structure are taken from
    # http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-262.pdf

    # A useful explanation of automatic semicolon insertion is at
    # http://inimino.org/~inimino/blog/javascript_semicolons

    both_before = [
        Tok("comment", r"/\*(.|\n)*?\*/"),
        Tok("linecomment", r"//.*?$"),
        Tok("ws", r"\s+"),
        Tok("keyword", literals("""
                                break case catch class const continue debugger
                                default delete do else enum export extends
                                finally for function if import in instanceof
                                new return super switch this throw try typeof
                                var void while with
                                """, suffix=r"\b"), next='reg'),
        Tok("reserved", literals("null true false", suffix=r"\b"), next='div'),
        Tok("id", r"""
                  ([a-zA-Z_$]|\\u[0-9a-fA-F]{4})      # first char
                  ([a-zA-Z_$0-9]|\\u[0-9a-fA-F]{4})*  # rest chars
                  """, next='div'),
        Tok("hnum", r"0[xX][0-9a-fA-F]+", next='div'),
        Tok("onum", r"0[0-7]+"),
        Tok("dnum", r"""
                    (   (0|[1-9][0-9]*)         # DecimalIntegerLiteral
                        \.                      # dot
                        [0-9]*                  # DecimalDigits-opt
                        ([eE][-+]?[0-9]+)?      # ExponentPart-opt
                    |
                        \.                      # dot
                        [0-9]+                  # DecimalDigits
                        ([eE][-+]?[0-9]+)?      # ExponentPart-opt
                    |
                        (0|[1-9][0-9]*)         # DecimalIntegerLiteral
                        ([eE][-+]?[0-9]+)?      # ExponentPart-opt
                    )
                    """, next='div'),
        Tok("punct", literals("""
                              >>>= === !== >>> <<= >>= <= >= == != << >> &&
                              || += -= *= %= &= |= ^=
                              """), next="reg"),
        Tok("punct", literals("++ -- ) ]"), next='div'),
        Tok("punct", literals("{ } ( [ . ; , < > + - * % & | ^ ! ~ ? : ="), next='reg'),
        Tok("string", r'"([^"\\]|(\\(.|\n)))*?"', next='div'),
        Tok("string", r"'([^'\\]|(\\(.|\n)))*?'", next='div'),
    ]

    both_after = [
        Tok("other", r"."),
    ]

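    # For example, keywords switch the lexer to the 'reg' state, so a slash
    # after `return` begins a regex literal rather than division:
    #
    #     >>> list(JsLexer().lex("return /x/"))
    #     [('keyword', 'return'), ('ws', ' '), ('regex', '/x/')]
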
    states = {
        # slash will mean division
        'div': both_before + [
            Tok("punct", literals("/= /"), next='reg'),
        ] + both_after,

        # slash will mean regex
        'reg': both_before + [
            Tok("regex",
                r"""
                /                           # opening slash
                # First character is..
                (   [^*\\/[]                # anything but * \ / or [
                |   \\.                     # or an escape sequence
                |   \[                      # or a class, which has
                    (   [^\]\\]             # anything but \ or ]
                    |   \\.                 # or an escape sequence
                    )*                      # many times
                    \]
                )
                # Following characters are same, except for excluding a star
                (   [^\\/[]                 # anything but \ / or [
                |   \\.                     # or an escape sequence
                |   \[                      # or a class, which has
                    (   [^\]\\]             # anything but \ or ]
                    |   \\.                 # or an escape sequence
                    )*                      # many times
                    \]
                )*                          # many times
                /                           # closing slash
                [a-zA-Z0-9]*                # trailing flags
                """, next='div'),
        ] + both_after,
    }

    def __init__(self):
        super(JsLexer, self).__init__(self.states, 'reg')


def prepare_js_for_gettext(js):
    """
    Convert the JavaScript source `js` into something resembling C for
    xgettext.

    What actually happens is that all the regex literals are replaced with
    "REGEX".
- """
- def escape_quotes(m):
- """Used in a regex to properly escape double quotes."""
- s = m.group(0)
- if s == '"':
- return r'\"'
- else:
- return s
-
- lexer = JsLexer()
- c = []
- for name, tok in lexer.lex(js):
- if name == 'regex':
- # C doesn't grok regexes, and they aren't needed for gettext,
- # so just output a string instead.
- tok = '"REGEX"'
- elif name == 'string':
- # C doesn't have single-quoted strings, so make all strings
- # double-quoted.
- if tok.startswith("'"):
- guts = re.sub(r"\\.|.", escape_quotes, tok[1:-1])
- tok = '"' + guts + '"'
- elif name == 'id':
- # C can't deal with Unicode escapes in identifiers. We don't
- # need them for gettext anyway, so replace them with something
- # innocuous
- tok = tok.replace("\\", "U")
- c.append(tok)
- return ''.join(c)