
jslex.py

  1. """JsLex: a lexer for Javascript"""
  2. # Originally from https://bitbucket.org/ned/jslex
  3. from __future__ import unicode_literals
  4. import re


class Tok(object):
    """
    A specification for a token class.
    """
    num = 0

    def __init__(self, name, regex, next=None):
        self.id = Tok.num
        Tok.num += 1
        self.name = name
        self.regex = regex
        self.next = next


def literals(choices, prefix="", suffix=""):
    """
    Create a regex from a space-separated list of literal `choices`.

    If provided, `prefix` and `suffix` will be attached to each choice
    individually.
    """
    return "|".join(prefix + re.escape(c) + suffix for c in choices.split())


class Lexer(object):
    """
    A generic multi-state regex-based lexer.
    """

    def __init__(self, states, first):
        self.regexes = {}
        self.toks = {}

        for state, rules in states.items():
            parts = []
            for tok in rules:
                groupid = "t%d" % tok.id
                self.toks[groupid] = tok
                parts.append("(?P<%s>%s)" % (groupid, tok.regex))
            self.regexes[state] = re.compile("|".join(parts), re.MULTILINE | re.VERBOSE)

        self.state = first

    def lex(self, text):
        """
        Lexically analyze `text`.

        Yields pairs (`name`, `tokentext`).
        """
        end = len(text)
        state = self.state
        regexes = self.regexes
        toks = self.toks
        start = 0

        while start < end:
            for match in regexes[state].finditer(text, start):
                name = match.lastgroup
                tok = toks[name]
                toktext = match.group(name)
                start += len(toktext)
                yield (tok.name, toktext)

                if tok.next:
                    state = tok.next
                    break

        self.state = state
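
# A minimal sketch, not part of the original module, of driving `Lexer`
# directly: each state maps to a list of `Tok` rules, and a rule's `next`
# attribute switches the active state once that rule matches. The `word` and
# `sep` rules below are illustrative only.
if __name__ == "__main__":
    _word = Tok("word", r"[a-z]+", next="sep")
    _sep = Tok("sep", r",\s*", next="word")
    _demo = Lexer({"word": [_word], "sep": [_sep]}, "word")
    print(list(_demo.lex("foo, bar")))
    # On Python 3 this prints: [('word', 'foo'), ('sep', ', '), ('word', 'bar')]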


class JsLexer(Lexer):
    """
    A Javascript lexer

    >>> lexer = JsLexer()
    >>> list(lexer.lex("a = 1"))
    [('id', 'a'), ('ws', ' '), ('punct', '='), ('ws', ' '), ('dnum', '1')]

    This doesn't properly handle non-ASCII characters in the Javascript source.
    """

    # Because these tokens are matched as alternatives in a regex, longer
    # possibilities must appear in the list before shorter ones, for example,
    # '>>' before '>'.
    #
    # Note that we don't have to detect malformed Javascript, only properly
    # lex correct Javascript, so much of this is simplified.

    # Details of Javascript lexical structure are taken from
    # http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-262.pdf

    # A useful explanation of automatic semicolon insertion is at
    # http://inimino.org/~inimino/blog/javascript_semicolons

    both_before = [
        Tok("comment", r"/\*(.|\n)*?\*/"),
        Tok("linecomment", r"//.*?$"),
        Tok("ws", r"\s+"),
        Tok("keyword", literals("""
                        break case catch class const continue debugger
                        default delete do else enum export extends
                        finally for function if import in instanceof
                        new return super switch this throw try typeof
                        var void while with
                        """, suffix=r"\b"), next='reg'),
        Tok("reserved", literals("null true false", suffix=r"\b"), next='div'),
  89. Tok("id", r"""
  90. ([a-zA-Z_$ ]|\\u[0-9a-fA-Z]{4}) # first char
  91. ([a-zA-Z_$0-9]|\\u[0-9a-fA-F]{4})* # rest chars
  92. """, next='div'),
  93. Tok("hnum", r"0[xX][0-9a-fA-F]+", next='div'),
  94. Tok("onum", r"0[0-7]+"),
  95. Tok("dnum", r"""
  96. ( (0|[1-9][0-9]*) # DecimalIntegerLiteral
  97. \. # dot
  98. [0-9]* # DecimalDigits-opt
  99. ([eE][-+]?[0-9]+)? # ExponentPart-opt
  100. |
  101. \. # dot
  102. [0-9]+ # DecimalDigits
  103. ([eE][-+]?[0-9]+)? # ExponentPart-opt
  104. |
  105. (0|[1-9][0-9]*) # DecimalIntegerLiteral
  106. ([eE][-+]?[0-9]+)? # ExponentPart-opt
  107. )
  108. """, next='div'),
  109. Tok("punct", literals("""
  110. >>>= === !== >>> <<= >>= <= >= == != << >> &&
  111. || += -= *= %= &= |= ^=
  112. """), next="reg"),
  113. Tok("punct", literals("++ -- ) ]"), next='div'),
  114. Tok("punct", literals("{ } ( [ . ; , < > + - * % & | ^ ! ~ ? : ="), next='reg'),
  115. Tok("string", r'"([^"\\]|(\\(.|\n)))*?"', next='div'),
  116. Tok("string", r"'([^'\\]|(\\(.|\n)))*?'", next='div'),
  117. ]
  118. both_after = [
  119. Tok("other", r"."),
  120. ]
  121. states = {
  122. # slash will mean division
  123. 'div': both_before + [
  124. Tok("punct", literals("/= /"), next='reg'),
  125. ] + both_after,
  126. # slash will mean regex
  127. 'reg': both_before + [
  128. Tok("regex",
  129. r"""
  130. / # opening slash
  131. # First character is..
  132. ( [^*\\/[] # anything but * \ / or [
  133. | \\. # or an escape sequence
  134. | \[ # or a class, which has
  135. ( [^\]\\] # anything but \ or ]
  136. | \\. # or an escape sequence
  137. )* # many times
  138. \]
  139. )
  140. # Following characters are same, except for excluding a star
  141. ( [^\\/[] # anything but \ / or [
  142. | \\. # or an escape sequence
  143. | \[ # or a class, which has
  144. ( [^\]\\] # anything but \ or ]
  145. | \\. # or an escape sequence
  146. )* # many times
  147. \]
  148. )* # many times
  149. / # closing slash
  150. [a-zA-Z0-9]* # trailing flags
  151. """, next='div'),
  152. ] + both_after,
  153. }
  154. def __init__(self):
  155. super(JsLexer, self).__init__(self.states, 'reg')
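
# Illustrative sketch, not part of the original module, of the two-state
# trick described by the comments above: after an identifier the lexer is in
# the 'div' state, so a slash is punctuation, while after '=' or ';' it is in
# the 'reg' state, so '/y/g' is read as one regex literal. The sample string
# is an assumption chosen for the demo.
if __name__ == "__main__":
    for _name, _text in JsLexer().lex("x = /y/g; a / b"):
        if _name != 'ws':
            print((_name, _text))
    # '/y/g' comes out as a single ('regex', '/y/g') token, whereas the
    # slash in 'a / b' comes out as ('punct', '/').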


def prepare_js_for_gettext(js):
    """
    Convert the Javascript source `js` into something resembling C for
    xgettext.

    What actually happens is that all the regex literals are replaced with
    "REGEX".
    """
    def escape_quotes(m):
        """Used in a regex to properly escape double quotes."""
        s = m.group(0)
        if s == '"':
            return r'\"'
        else:
            return s

    lexer = JsLexer()
    c = []
    for name, tok in lexer.lex(js):
        if name == 'regex':
            # C doesn't grok regexes, and they aren't needed for gettext,
            # so just output a string instead.
            tok = '"REGEX"'
        elif name == 'string':
            # C doesn't have single-quoted strings, so make all strings
            # double-quoted.
            if tok.startswith("'"):
                guts = re.sub(r"\\.|.", escape_quotes, tok[1:-1])
                tok = '"' + guts + '"'
        elif name == 'id':
            # C can't deal with Unicode escapes in identifiers. We don't
            # need them for gettext anyway, so replace them with something
            # innocuous
            tok = tok.replace("\\", "U")
        c.append(tok)
    return ''.join(c)
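
# Illustrative usage, not part of the original module: running this file
# directly shows `prepare_js_for_gettext` replacing a regex literal with
# "REGEX" and converting a single-quoted string to a double-quoted one, so
# that xgettext can treat the output as C-like source. The sample Javascript
# is an assumption chosen for the demo.
if __name__ == "__main__":
    print(prepare_js_for_gettext("x = y.match(/fred/) ? gettext('Yes') : 'No';"))
    # On Python 3 this prints: x = y.match("REGEX") ? gettext("Yes") : "No";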