You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

html_parser.py 5.0 KiB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128
  1. import re
  2. import sys
  3. from django.utils import six
  4. from django.utils.six.moves import html_parser as _html_parser
  # Version of the running interpreter, exposed as a module-level name.
  current_version = sys.version_info
  # True on interpreters older than 2.7.3; selects the patched parser that
  # is defined later in this module.
  use_workaround = current_version < (2, 7, 3)

  # Re-export the stdlib exception when the parser module still provides it;
  # otherwise create a dummy class for Python 3.5+ where it's been removed.
  HTMLParseError = getattr(_html_parser, 'HTMLParseError', None)
  if HTMLParseError is None:
      class HTMLParseError(Exception):
          pass
  13. if not use_workaround:
  14. if six.PY3:
  class HTMLParser(_html_parser.HTMLParser):
      """Parser whose ``convert_charrefs`` option defaults to False.

      Setting the option explicitly silences a deprecation warning on
      Python 3.4. It has to be done here rather than at each call site
      because Python 2.7's parser does not accept the keyword argument.
      """

      def __init__(self, convert_charrefs=False, **kwargs):
          # Merge the explicit flag back into the keyword arguments so the
          # base initializer receives it exactly once, by keyword.
          kwargs['convert_charrefs'] = convert_charrefs
          _html_parser.HTMLParser.__init__(self, **kwargs)
  23. else:
  24. HTMLParser = _html_parser.HTMLParser
  25. else:
  # Matches a tag name (captured in group 1) and then consumes any trailing
  # whitespace or '/' characters, except a '/' that begins the closing '/>'.
  # Written as a raw string so that '\s' is not an invalid string escape.
  tagfind = re.compile(r'([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')

  class HTMLParser(_html_parser.HTMLParser):
      """
      Patched version of stdlib's HTMLParser with patch from:
      http://bugs.python.org/issue670664

      While a CDATA content element is open, ``cdata_tag`` holds its
      lower-cased name; end tags that do not close that element are then
      passed to ``handle_data()`` as plain text instead of being parsed.
      """

      def __init__(self):
          _html_parser.HTMLParser.__init__(self)
          # Lower-cased name of the CDATA element currently being parsed,
          # or None when the parser is in normal mode.
          self.cdata_tag = None

      def set_cdata_mode(self, tag):
          """Enter CDATA mode for the element named ``tag``."""
          cdata_tag = tag.lower()
          try:
              # Use the stdlib's CDATA scanner when the module provides one.
              self.interesting = _html_parser.interesting_cdata
          except AttributeError:
              # Otherwise only the matching end tag is interesting. The
              # name is escaped because tag names may contain '.', which
              # would otherwise act as a regex wildcard.
              self.interesting = re.compile(
                  r'</\s*%s\s*>' % re.escape(cdata_tag), re.I)
          self.cdata_tag = cdata_tag

      def clear_cdata_mode(self):
          """Leave CDATA mode and restore the normal scanner."""
          self.interesting = _html_parser.interesting_normal
          self.cdata_tag = None

      # Internal -- handle starttag, return end or -1 if not terminated
      def parse_starttag(self, i):
          """Parse the start tag that begins at index ``i`` of ``rawdata``.

          Dispatches to ``handle_startendtag()`` or ``handle_starttag()``
          and returns the index just past the tag. If
          ``check_for_whole_start_tag()`` reports a negative position,
          that value is returned unchanged and no handler is called.
          """
          self.__starttag_text = None
          endpos = self.check_for_whole_start_tag(i)
          if endpos < 0:
              return endpos
          rawdata = self.rawdata
          self.__starttag_text = rawdata[i:endpos]

          # Now parse the data between i+1 and endpos into a tag and attrs
          attrs = []
          match = tagfind.match(rawdata, i + 1)
          assert match, 'unexpected call to parse_starttag()'
          k = match.end()
          self.lasttag = tag = match.group(1).lower()

          while k < endpos:
              m = _html_parser.attrfind.match(rawdata, k)
              if not m:
                  break
              attrname, rest, attrvalue = m.group(1, 2, 3)
              if not rest:
                  # Attribute given without a value, e.g. <input disabled>.
                  attrvalue = None
              elif (attrvalue[:1] == '\'' == attrvalue[-1:] or
                      attrvalue[:1] == '"' == attrvalue[-1:]):
                  # Strip one pair of matching surrounding quotes.
                  attrvalue = attrvalue[1:-1]
              if attrvalue:
                  attrvalue = self.unescape(attrvalue)
              attrs.append((attrname.lower(), attrvalue))
              k = m.end()

          end = rawdata[k:endpos].strip()
          if end not in (">", "/>"):
              # NOTE(review): self.error() is expected to raise here; the
              # line/offset values that used to be computed before this
              # call were never used and have been removed.
              self.error("junk characters in start tag: %r"
                         % (rawdata[k:endpos][:20],))
          if end.endswith('/>'):
              # XHTML-style empty tag: <span attr="value" />
              self.handle_startendtag(tag, attrs)
          else:
              self.handle_starttag(tag, attrs)
              if tag in self.CDATA_CONTENT_ELEMENTS:
                  # Changed from the stdlib: pass the tag name so that only
                  # its own end tag terminates CDATA mode.
                  self.set_cdata_mode(tag)
          return endpos

      # Internal -- parse endtag, return end or -1 if incomplete
      def parse_endtag(self, i):
          """Parse the end tag that begins at index ``i`` of ``rawdata``.

          Returns the index just past the closing '>', or -1 when no '>'
          has been seen yet. In CDATA mode, anything other than the end
          tag of the open CDATA element is passed to ``handle_data()``.
          """
          rawdata = self.rawdata
          assert rawdata[i:i + 2] == "</", "unexpected call to parse_endtag"
          match = _html_parser.endendtag.search(rawdata, i + 1)  # >
          if not match:
              return -1
          j = match.end()
          match = _html_parser.endtagfind.match(rawdata, i)  # </ + tag + >
          if not match:
              if self.cdata_tag is not None:
                  # Added by the patch: a malformed end tag inside CDATA
                  # content is ordinary text.
                  self.handle_data(rawdata[i:j])
                  return j
              self.error("bad end tag: %r" % (rawdata[i:j],))

          tag = match.group(1).strip()
          # Changed by the patch: inside CDATA content only the matching
          # end tag closes the element; any other end tag is text.
          if self.cdata_tag is not None:
              if tag.lower() != self.cdata_tag:
                  self.handle_data(rawdata[i:j])
                  return j

          self.handle_endtag(tag.lower())
          self.clear_cdata_mode()
          return j