From 2b045433b97f834080e4dd37a0dd668ffa07c11f Mon Sep 17 00:00:00 2001
From: renmingshuai
Date: Thu, 4 Nov 2021 09:13:18 +0800
Subject: [PATCH] Fix uncaught exception in get_tokens_unprocessed

(cherry picked from commit c784948b760e1197b5c5ded63d672387e52fbeba)
---
 backport-Fix-raw-token-lexer-Unicode.patch    |  83 +++++++
 backport-fix-pop-from-empty-stack.patch       |  26 +++
 ...upport-Python3-and-handle-exceptions.patch | 216 ++++++++++++++++++
 python-pygments.spec                          |   8 +-
 4 files changed, 332 insertions(+), 1 deletion(-)
 create mode 100644 backport-Fix-raw-token-lexer-Unicode.patch
 create mode 100644 backport-fix-pop-from-empty-stack.patch
 create mode 100644 backport-support-Python3-and-handle-exceptions.patch

diff --git a/backport-Fix-raw-token-lexer-Unicode.patch b/backport-Fix-raw-token-lexer-Unicode.patch
new file mode 100644
index 0000000..8da122a
--- /dev/null
+++ b/backport-Fix-raw-token-lexer-Unicode.patch
@@ -0,0 +1,83 @@
+From f65ac3f1a9511fd802518a41e91ff21c4a2c99ae Mon Sep 17 00:00:00 2001
+From: Georg Brandl
+Date: Thu, 24 Dec 2020 16:59:46 +0100
+Subject: [PATCH] Fix raw token lexer w.r.t. Unicode
+
+Conflict:do not change CHANGES file
+Reference:https://github.com/pygments/pygments/commit/f65ac3f1a9511fd802518a41e91ff21c4a2c99ae
+
+---
+ pygments/lexers/special.py | 36 ++++++++++++++++++------------------
+ 1 file changed, 18 insertions(+), 18 deletions(-)
+
+diff --git a/pygments/lexers/special.py b/pygments/lexers/special.py
+index 4016c59..a1e2200 100644
+--- a/pygments/lexers/special.py
++++ b/pygments/lexers/special.py
+@@ -38,7 +38,7 @@ class TextLexer(Lexer):
+ 
+ _ttype_cache = {}
+ 
+-line_re = re.compile(b'.*?\n')
++line_re = re.compile('.*?\n')
+ 
+ 
+ class RawTokenLexer(Lexer):
+@@ -64,20 +64,20 @@ class RawTokenLexer(Lexer):
+         Lexer.__init__(self, **options)
+ 
+     def get_tokens(self, text):
+-        if isinstance(text, text_type):
+-            # raw token stream never has any non-ASCII characters
+-            text = text.encode('ascii')
+-        if self.compress == 'gz':
+-            import gzip
+-            gzipfile = gzip.GzipFile('', 'rb', 9, BytesIO(text))
+-            text = gzipfile.read()
+-        elif self.compress == 'bz2':
+-            import bz2
+-            text = bz2.decompress(text)
+-
+-        # do not call Lexer.get_tokens() because we do not want Unicode
+-        # decoding to occur, and stripping is not optional.
+-        text = text.strip(b'\n') + b'\n'
++        if self.compress:
++            if isinstance(text, str):
++                text = text.encode('latin1')
++            if self.compress == 'gz':
++                import gzip
++                gzipfile = gzip.GzipFile('', 'rb', 9, BytesIO(text))
++                text = gzipfile.read()
++            elif self.compress == 'bz2':
++                import bz2
++                text = bz2.decompress(text)
++            text = text.decode('latin1')
++
++        # do not call Lexer.get_tokens() because stripping is not optional.
++        text = text.strip('\n') + '\n'
+         for i, t, v in self.get_tokens_unprocessed(text):
+             yield t, v
+ 
+@@ -85,9 +85,9 @@
+         length = 0
+         for match in line_re.finditer(text):
+             try:
+-                ttypestr, val = match.group().split(b'\t', 1)
++                ttypestr, val = match.group().rstrip().split('\t', 1)
+             except ValueError:
+-                val = match.group().decode('ascii', 'replace')
++                val = match.group()
+                 ttype = Error
+             else:
+                 ttype = _ttype_cache.get(ttypestr)
+@@ -99,6 +99,6 @@
+                     raise ValueError('malformed token name')
+                 ttype = getattr(ttype, ttype_)
+                 _ttype_cache[ttypestr] = ttype
+-            val = val[2:-2].decode('unicode-escape')
++            val = val[1:-1].encode().decode('unicode-escape')
+             yield length, ttype, val
+             length += len(val)
+-- 
+2.27.0
+
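This first backport hinges on two reversible conversions, sketched below as standalone Python (illustrative only, not part of the patch): latin1 maps bytes 0-255 one-to-one onto code points, so compressed input can travel through str losslessly, and a repr()'d token value is recovered by stripping its quotes and decoding the escapes.

```python
# A minimal sketch of the two conversions the patch relies on.

# latin1 is a 1:1 byte <-> code point mapping, so it round-trips any bytes.
data = bytes(range(256))
assert data.decode('latin1').encode('latin1') == data

# A raw token value is stored as repr(value); dropping the surrounding
# quotes and decoding the escapes recovers the original string.
raw_val = "'2 + \\u03b1\\n'"          # value field of a raw token line
val = raw_val[1:-1].encode().decode('unicode-escape')
assert val == '2 + α\n'
```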
diff --git a/backport-fix-pop-from-empty-stack.patch b/backport-fix-pop-from-empty-stack.patch
new file mode 100644
index 0000000..8e88fc9
--- /dev/null
+++ b/backport-fix-pop-from-empty-stack.patch
@@ -0,0 +1,26 @@
+From eb39c43b6ef992abadb0d25f0504d0cf2f3ccd86 Mon Sep 17 00:00:00 2001
+From: Georg Brandl
+Date: Mon, 11 Jan 2021 09:53:12 +0100
+Subject: [PATCH] xquery: fix pop from empty stack
+
+Conflict:NA
+Reference:https://github.com/pygments/pygments/commit/eb39c43b6ef992abadb0d25f0504d0cf2f3ccd86
+
+---
+ pygments/lexers/webmisc.py | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/pygments/lexers/webmisc.py b/pygments/lexers/webmisc.py
+index dde8357a1..abc9076c8 100644
+--- a/pygments/lexers/webmisc.py
++++ b/pygments/lexers/webmisc.py
+@@ -128,7 +128,8 @@ def operator_root_callback(lexer, match, ctx):
+ 
+ def popstate_tag_callback(lexer, match, ctx):
+     yield match.start(), Name.Tag, match.group(1)
+-    ctx.stack.append(lexer.xquery_parse_state.pop())
++    if lexer.xquery_parse_state:
++        ctx.stack.append(lexer.xquery_parse_state.pop())
+     ctx.pos = match.end()
+ 
+ def popstate_xmlcomment_callback(lexer, match, ctx):
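The fix above is the classic guarded-pop pattern: pop() on an empty list raises IndexError, which previously escaped from get_tokens_unprocessed() when XQuery input closed more tags than it opened. A minimal sketch of the pattern (not the lexer's actual plumbing):

```python
# Illustrative sketch of the guarded pop the fix applies.
xquery_parse_state = []        # empty when closing tags outnumber opening ones
stack = ['root']

# Unguarded version (pre-fix behavior on malformed input):
#     stack.append(xquery_parse_state.pop())   # IndexError: pop from empty list

# Guarded version (post-fix): skip the state transition instead of raising.
if xquery_parse_state:
    stack.append(xquery_parse_state.pop())
```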
diff --git a/backport-support-Python3-and-handle-exceptions.patch b/backport-support-Python3-and-handle-exceptions.patch
new file mode 100644
index 0000000..a0e0555
--- /dev/null
+++ b/backport-support-Python3-and-handle-exceptions.patch
@@ -0,0 +1,216 @@
+From c2cf688397b0b2adb649e51946c00714b74d2d9e Mon Sep 17 00:00:00 2001
+From: Anders Kaseorg
+Date: Sun, 14 Feb 2021 00:01:47 -0800
+Subject: [PATCH] RawToken{Formatter,Lexer}: support Python 3 and handle exceptions (#1602)
+
+Conflict:do not change CHANGES file
+Reference:https://github.com/pygments/pygments/commit/c2cf688397b0b2adb649e51946c00714b74d2d9e
+
+---
+ pygments/formatters/other.py | 16 ++++-----
+ pygments/lexers/special.py   | 35 ++++++++++---------
+ tests/test_raw_token.py      | 68 ++++++++++++++++++++++++++++++++++++
+ 3 files changed, 94 insertions(+), 25 deletions(-)
+ create mode 100644 tests/test_raw_token.py
+
+diff --git a/pygments/formatters/other.py b/pygments/formatters/other.py
+index c09eff0..aab062e 100644
+--- a/pygments/formatters/other.py
++++ b/pygments/formatters/other.py
+@@ -87,35 +87,33 @@ class RawTokenFormatter(Formatter):
+         if self.compress == 'gz':
+             import gzip
+             outfile = gzip.GzipFile('', 'wb', 9, outfile)
+-
+-            def write(text):
+-                outfile.write(text.encode())
+-            flush = outfile.flush
++
++            write = outfile.write
++            flush = outfile.close
+         elif self.compress == 'bz2':
+             import bz2
+             compressor = bz2.BZ2Compressor(9)
+ 
+             def write(text):
+-                outfile.write(compressor.compress(text.encode()))
++                outfile.write(compressor.compress(text))
+ 
+             def flush():
+                 outfile.write(compressor.flush())
+                 outfile.flush()
+         else:
+-            def write(text):
+-                outfile.write(text.encode())
++            write = outfile.write
+             flush = outfile.flush
+ 
+         if self.error_color:
+             for ttype, value in tokensource:
+-                line = "%s\t%r\n" % (ttype, value)
++                line = b"%r\t%r\n" % (ttype, value)
+                 if ttype is Token.Error:
+                     write(colorize(self.error_color, line))
+                 else:
+                     write(line)
+         else:
+             for ttype, value in tokensource:
+-                write("%s\t%r\n" % (ttype, value))
++                write(b"%r\t%r\n" % (ttype, value))
+         flush()
+ 
+ 
+diff --git a/pygments/lexers/special.py b/pygments/lexers/special.py
+index a1e2200..b6f53ee 100644
+--- a/pygments/lexers/special.py
++++ b/pygments/lexers/special.py
+@@ -9,11 +9,12 @@
+     :license: BSD, see LICENSE for details.
+ """
+ 
++import ast
+ import re
+ 
+ from pygments.lexer import Lexer
+ from pygments.token import Token, Error, Text
+-from pygments.util import get_choice_opt, text_type, BytesIO
++from pygments.util import get_choice_opt
+ 
+ 
+ __all__ = ['TextLexer', 'RawTokenLexer']
+@@ -43,9 +44,7 @@ line_re = re.compile('.*?\n')
+ 
+ 
+ class RawTokenLexer(Lexer):
+     """
+-    Recreate a token stream formatted with the `RawTokenFormatter`. This
+-    lexer raises exceptions during parsing if the token stream in the
+-    file is malformed.
++    Recreate a token stream formatted with the `RawTokenFormatter`.
+ 
+     Additional options accepted:
+@@ -67,13 +66,16 @@
+         if self.compress:
+             if isinstance(text, str):
+                 text = text.encode('latin1')
+-            if self.compress == 'gz':
+-                import gzip
+-                gzipfile = gzip.GzipFile('', 'rb', 9, BytesIO(text))
+-                text = gzipfile.read()
+-            elif self.compress == 'bz2':
+-                import bz2
+-                text = bz2.decompress(text)
++            try:
++                if self.compress == 'gz':
++                    import gzip
++                    text = gzip.decompress(text)
++                elif self.compress == 'bz2':
++                    import bz2
++                    text = bz2.decompress(text)
++            except OSError:
++                yield Error, text.decode('latin1')
++        if isinstance(text, bytes):
+             text = text.decode('latin1')
+ 
+         # do not call Lexer.get_tokens() because stripping is not optional.
+@@ -86,10 +88,6 @@
+         for match in line_re.finditer(text):
+             try:
+                 ttypestr, val = match.group().rstrip().split('\t', 1)
+-            except ValueError:
+-                val = match.group()
+-                ttype = Error
+-            else:
+                 ttype = _ttype_cache.get(ttypestr)
+                 if not ttype:
+                     ttype = Token
+@@ -99,6 +97,11 @@
+                     raise ValueError('malformed token name')
+                 ttype = getattr(ttype, ttype_)
+                 _ttype_cache[ttypestr] = ttype
+-            val = val[1:-1].encode().decode('unicode-escape')
++                val = ast.literal_eval(val)
++                if not isinstance(val, str):
++                    raise ValueError('expected str')
++            except (SyntaxError, ValueError):
++                val = match.group()
++                ttype = Error
+             yield length, ttype, val
+             length += len(val)
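Condensed, the lexer half of this backport routes every malformed raw line (no tab, unparsable or non-string value, bad token path) into a single Error token instead of raising. A rough standalone sketch of that strategy, using a hypothetical parse_raw_line helper rather than the patched method:

```python
import ast

def parse_raw_line(line):
    # Hypothetical helper mirroring the patched control flow: any parse
    # failure degrades to an Error-tagged line rather than an exception.
    try:
        ttypestr, val = line.rstrip().split('\t', 1)
        val = ast.literal_eval(val)
        if not isinstance(val, str):
            raise ValueError('expected str')
    except (SyntaxError, ValueError):
        return 'Token.Error', line
    return ttypestr, val

assert parse_raw_line("Token.Text\t'x'\n") == ('Token.Text', 'x')
assert parse_raw_line("Tolkien\n") == ('Token.Error', 'Tolkien\n')
assert parse_raw_line("Token.Text\t42\n") == ('Token.Error', 'Token.Text\t42\n')
```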
+diff --git a/tests/test_raw_token.py b/tests/test_raw_token.py
+new file mode 100644
+index 0000000..bae5a49
+--- /dev/null
++++ b/tests/test_raw_token.py
+@@ -0,0 +1,68 @@
++import bz2
++import gzip
++
++from pygments import highlight
++from pygments.formatters import HtmlFormatter, RawTokenFormatter
++from pygments.lexers import PythonLexer, RawTokenLexer
++
++
++def test_raw_token():
++    code = "2 + α"
++    raw = highlight(code, PythonLexer(), RawTokenFormatter())
++    html = highlight(code, PythonLexer(), HtmlFormatter())
++
++    assert highlight(raw, RawTokenLexer(), RawTokenFormatter()) == raw
++    assert highlight(raw, RawTokenLexer(), HtmlFormatter()) == html
++    assert highlight(raw.decode(), RawTokenLexer(), HtmlFormatter()) == html
++
++    raw_gz = highlight(code, PythonLexer(), RawTokenFormatter(compress="gz"))
++    assert gzip.decompress(raw_gz) == raw
++    assert highlight(raw_gz, RawTokenLexer(compress="gz"), RawTokenFormatter()) == raw
++    assert (
++        highlight(
++            raw_gz.decode("latin1"), RawTokenLexer(compress="gz"), RawTokenFormatter()
++        )
++        == raw
++    )
++
++    raw_bz2 = highlight(code, PythonLexer(), RawTokenFormatter(compress="bz2"))
++    assert bz2.decompress(raw_bz2) == raw
++    assert highlight(raw_bz2, RawTokenLexer(compress="bz2"), RawTokenFormatter()) == raw
++    assert (
++        highlight(
++            raw_bz2.decode("latin1"), RawTokenLexer(compress="bz2"), RawTokenFormatter()
++        )
++        == raw
++    )
++
++
++def test_invalid_raw_token():
++    # These should not throw exceptions.
++    assert (
++        highlight("Tolkien", RawTokenLexer(), RawTokenFormatter())
++        == b"Token.Error\t'Tolkien\\n'\n"
++    )
++    assert (
++        highlight("Tolkien\t'x'", RawTokenLexer(), RawTokenFormatter())
++        == b"Token\t'x'\n"
++    )
++    assert (
++        highlight("Token.Text\t42", RawTokenLexer(), RawTokenFormatter())
++        == b"Token.Error\t'Token.Text\\t42\\n'\n"
++    )
++    assert (
++        highlight("Token.Text\t'", RawTokenLexer(), RawTokenFormatter())
++        == b'Token.Error\t"Token.Text\\t\'\\n"\n'
++    )
++    assert (
++        highlight("Token.Text\t'α'", RawTokenLexer(), RawTokenFormatter())
++        == b"Token.Text\t'\\u03b1'\n"
++    )
++    assert (
++        highlight("Token.Text\tu'α'", RawTokenLexer(), RawTokenFormatter())
++        == b"Token.Text\t'\\u03b1'\n"
++    )
++    assert (
++        highlight(b"Token.Text\t'\xff'", RawTokenLexer(), RawTokenFormatter())
++        == b"Token.Text\t'\\xff'\n"
++    )
+-- 
+2.27.0
+
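For reference, the format these tests exercise is one line per token: the token type, a tab, then repr(value). A quick usage sketch that mirrors the new tests (the exact token types emitted are an assumption here):

```python
from pygments import highlight
from pygments.formatters import RawTokenFormatter
from pygments.lexers import PythonLexer, RawTokenLexer

# RawTokenFormatter emits bytes: one "Token.Type<TAB>repr(value)" line
# per token, e.g. Token.Name followed by 'x'.
raw = highlight("x = 1", PythonLexer(), RawTokenFormatter())

# With the patches applied, the stream round-trips through RawTokenLexer.
assert highlight(raw, RawTokenLexer(), RawTokenFormatter()) == raw
```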
diff --git a/python-pygments.spec b/python-pygments.spec
index 24dcd72..d36ab54 100644
--- a/python-pygments.spec
+++ b/python-pygments.spec
@@ -16,7 +16,7 @@ need to prettify source code. \
 Highlights are: \
 Name: python-pygments
 Summary: Syntax highlighting engine written in Python
 Version: 2.5.2
-Release: 4
+Release: 5
 License: BSD
 URL: http://pygments.org/
 Source0: https://pypi.org/packages/source/P/Pygments/Pygments-%{version}.tar.gz
@@ -25,6 +25,9 @@ BuildArch: noarch
 
 Patch6000: backport-CVE-2021-20270.patch
 Patch6001: backport-CVE-2021-27291.patch
 Patch6002: backport-weed-out-more-backtracking-string-regexes.patch
+Patch6003: backport-fix-pop-from-empty-stack.patch
+Patch6004: backport-Fix-raw-token-lexer-Unicode.patch
+Patch6005: backport-support-Python3-and-handle-exceptions.patch
 
 %description %{_description}
@@ -85,6 +88,9 @@ cp -r doc/docs doc/reST
 %lang(en) %{_mandir}/man1/pygmentize.1*
 
 %changelog
+* Thu Nov 04 2021 tianwei - 2.5.2-5
+- Fix uncaught exception in get_tokens_unprocessed
+
 * Sun Sep 26 2021 huangduirong - 2.5.2-4
 - Infinite loop in get_tokens_unprocessed
-- 
Gitee
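A possible post-build smoke test for this update (illustrative; it restates what the new upstream tests already assert): malformed raw streams and undecompressable input should surface as Error tokens rather than exceptions.

```python
from pygments.lexers import RawTokenLexer
from pygments.token import Error

# A line with no tab cannot be a raw token line; it should come back as Error.
tokens = list(RawTokenLexer().get_tokens("not a raw token stream"))
assert all(ttype is Error for ttype, value in tokens)

# Garbage where gzip data is expected should also degrade to Error tokens.
tokens = list(RawTokenLexer(compress="gz").get_tokens(b"not gzip data"))
assert any(ttype is Error for ttype, value in tokens)
```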