From 2b045433b97f834080e4dd37a0dd668ffa07c11f Mon Sep 17 00:00:00 2001
From: renmingshuai
Date: Thu, 4 Nov 2021 09:13:18 +0800
Subject: [PATCH] Fix uncaught exception in get_tokens_unprocessed

(cherry picked from commit c784948b760e1197b5c5ded63d672387e52fbeba)
---
 backport-Fix-raw-token-lexer-Unicode.patch    |  83 +++++++
 backport-fix-pop-from-empty-stack.patch       |  26 +++
 ...upport-Python3-and-handle-exceptions.patch | 216 ++++++++++++++++++
 python-pygments.spec                          |   8 +-
 4 files changed, 332 insertions(+), 1 deletion(-)
 create mode 100644 backport-Fix-raw-token-lexer-Unicode.patch
 create mode 100644 backport-fix-pop-from-empty-stack.patch
 create mode 100644 backport-support-Python3-and-handle-exceptions.patch

diff --git a/backport-Fix-raw-token-lexer-Unicode.patch b/backport-Fix-raw-token-lexer-Unicode.patch
new file mode 100644
index 0000000..8da122a
--- /dev/null
+++ b/backport-Fix-raw-token-lexer-Unicode.patch
@@ -0,0 +1,83 @@
+From f65ac3f1a9511fd802518a41e91ff21c4a2c99ae Mon Sep 17 00:00:00 2001
+From: Georg Brandl
+Date: Thu, 24 Dec 2020 16:59:46 +0100
+Subject: [PATCH] Fix raw token lexer w.r.t. Unicode
+
+Conflict:do not change CHANGES file
+Reference:https://github.com/pygments/pygments/commit/f65ac3f1a9511fd802518a41e91ff21c4a2c99ae
+
+---
+ pygments/lexers/special.py | 36 ++++++++++++++++++------------------
+ 1 file changed, 18 insertions(+), 18 deletions(-)
+
+diff --git a/pygments/lexers/special.py b/pygments/lexers/special.py
+index 4016c59..a1e2200 100644
+--- a/pygments/lexers/special.py
++++ b/pygments/lexers/special.py
+@@ -38,7 +38,7 @@ class TextLexer(Lexer):
+ 
+ _ttype_cache = {}
+ 
+-line_re = re.compile(b'.*?\n')
++line_re = re.compile('.*?\n')
+ 
+ 
+ class RawTokenLexer(Lexer):
+@@ -64,20 +64,20 @@ class RawTokenLexer(Lexer):
+         Lexer.__init__(self, **options)
+ 
+     def get_tokens(self, text):
+-        if isinstance(text, text_type):
+-            # raw token stream never has any non-ASCII characters
+-            text = text.encode('ascii')
+-        if self.compress == 'gz':
+-            import gzip
+-            gzipfile = gzip.GzipFile('', 'rb', 9, BytesIO(text))
+-            text = gzipfile.read()
+-        elif self.compress == 'bz2':
+-            import bz2
+-            text = bz2.decompress(text)
+-
+-        # do not call Lexer.get_tokens() because we do not want Unicode
+-        # decoding to occur, and stripping is not optional.
+-        text = text.strip(b'\n') + b'\n'
++        if self.compress:
++            if isinstance(text, str):
++                text = text.encode('latin1')
++            if self.compress == 'gz':
++                import gzip
++                gzipfile = gzip.GzipFile('', 'rb', 9, BytesIO(text))
++                text = gzipfile.read()
++            elif self.compress == 'bz2':
++                import bz2
++                text = bz2.decompress(text)
++            text = text.decode('latin1')
++
++        # do not call Lexer.get_tokens() because stripping is not optional.
++        text = text.strip('\n') + '\n'
+         for i, t, v in self.get_tokens_unprocessed(text):
+             yield t, v
+ 
+@@ -85,9 +85,9 @@
+         length = 0
+         for match in line_re.finditer(text):
+             try:
+-                ttypestr, val = match.group().split(b'\t', 1)
++                ttypestr, val = match.group().rstrip().split('\t', 1)
+             except ValueError:
+-                val = match.group().decode('ascii', 'replace')
++                val = match.group()
+                 ttype = Error
+             else:
+                 ttype = _ttype_cache.get(ttypestr)
+@@ -99,6 +99,6 @@
+                     raise ValueError('malformed token name')
+                 ttype = getattr(ttype, ttype_)
+                 _ttype_cache[ttypestr] = ttype
+-            val = val[2:-2].decode('unicode-escape')
++            val = val[1:-1].encode().decode('unicode-escape')
+             yield length, ttype, val
+             length += len(val)
+-- 
+2.27.0
+
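This first backport hinges on two reversible conversions, sketched below as standalone Python (illustrative only, not part of the patch): latin1 maps bytes 0-255 one-to-one onto code points, so compressed input can travel through str losslessly, and a repr()'d token value is recovered by stripping its quotes and decoding the escapes.

```python
# A minimal sketch of the two conversions the patch relies on.

# latin1 is a 1:1 byte <-> code point mapping, so it round-trips any bytes.
data = bytes(range(256))
assert data.decode('latin1').encode('latin1') == data

# A raw token value is stored as repr(value); dropping the surrounding
# quotes and decoding the escapes recovers the original string.
raw_val = "'2 + \\u03b1\\n'"          # value field of a raw token line
val = raw_val[1:-1].encode().decode('unicode-escape')
assert val == '2 + α\n'
```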
diff --git a/backport-fix-pop-from-empty-stack.patch b/backport-fix-pop-from-empty-stack.patch
new file mode 100644
index 0000000..8e88fc9
--- /dev/null
+++ b/backport-fix-pop-from-empty-stack.patch
@@ -0,0 +1,26 @@
+From eb39c43b6ef992abadb0d25f0504d0cf2f3ccd86 Mon Sep 17 00:00:00 2001
+From: Georg Brandl
+Date: Mon, 11 Jan 2021 09:53:12 +0100
+Subject: [PATCH] xquery: fix pop from empty stack
+
+Conflict:NA
+Reference:https://github.com/pygments/pygments/commit/eb39c43b6ef992abadb0d25f0504d0cf2f3ccd86
+
+---
+ pygments/lexers/webmisc.py | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/pygments/lexers/webmisc.py b/pygments/lexers/webmisc.py
+index dde8357a1..abc9076c8 100644
+--- a/pygments/lexers/webmisc.py
++++ b/pygments/lexers/webmisc.py
+@@ -128,7 +128,8 @@ def operator_root_callback(lexer, match, ctx):
+ 
+ def popstate_tag_callback(lexer, match, ctx):
+     yield match.start(), Name.Tag, match.group(1)
+-    ctx.stack.append(lexer.xquery_parse_state.pop())
++    if lexer.xquery_parse_state:
++        ctx.stack.append(lexer.xquery_parse_state.pop())
+     ctx.pos = match.end()
+ 
+ def popstate_xmlcomment_callback(lexer, match, ctx):
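The fix above is the classic guarded-pop pattern: pop() on an empty list raises IndexError, which previously escaped from get_tokens_unprocessed() when XQuery input closed more tags than it opened. A minimal sketch of the pattern (not the lexer's actual plumbing):

```python
# Illustrative sketch of the guarded pop the fix applies.
xquery_parse_state = []        # empty when closing tags outnumber opening ones
stack = ['root']

# Unguarded version (pre-fix behavior on malformed input):
#     stack.append(xquery_parse_state.pop())   # IndexError: pop from empty list

# Guarded version (post-fix): skip the state transition instead of raising.
if xquery_parse_state:
    stack.append(xquery_parse_state.pop())
```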
diff --git a/backport-support-Python3-and-handle-exceptions.patch b/backport-support-Python3-and-handle-exceptions.patch
new file mode 100644
index 0000000..a0e0555
--- /dev/null
+++ b/backport-support-Python3-and-handle-exceptions.patch
@@ -0,0 +1,216 @@
+From c2cf688397b0b2adb649e51946c00714b74d2d9e Mon Sep 17 00:00:00 2001
+From: Anders Kaseorg
+Date: Sun, 14 Feb 2021 00:01:47 -0800
+Subject: [PATCH] RawToken{Formatter,Lexer}: support Python 3 and handle exceptions (#1602)
+
+Conflict:do not change CHANGES file
+Reference:https://github.com/pygments/pygments/commit/c2cf688397b0b2adb649e51946c00714b74d2d9e
+
+---
+ pygments/formatters/other.py | 16 ++++-----
+ pygments/lexers/special.py   | 35 ++++++++++---------
+ tests/test_raw_token.py      | 68 ++++++++++++++++++++++++++++++++++++
+ 3 files changed, 94 insertions(+), 25 deletions(-)
+ create mode 100644 tests/test_raw_token.py
+
+diff --git a/pygments/formatters/other.py b/pygments/formatters/other.py
+index c09eff0..aab062e 100644
+--- a/pygments/formatters/other.py
++++ b/pygments/formatters/other.py
+@@ -87,35 +87,33 @@ class RawTokenFormatter(Formatter):
+         if self.compress == 'gz':
+             import gzip
+             outfile = gzip.GzipFile('', 'wb', 9, outfile)
+-
+-            def write(text):
+-                outfile.write(text.encode())
+-            flush = outfile.flush
++
++            write = outfile.write
++            flush = outfile.close
+         elif self.compress == 'bz2':
+             import bz2
+             compressor = bz2.BZ2Compressor(9)
+ 
+             def write(text):
+-                outfile.write(compressor.compress(text.encode()))
++                outfile.write(compressor.compress(text))
+ 
+             def flush():
+                 outfile.write(compressor.flush())
+                 outfile.flush()
+         else:
+-            def write(text):
+-                outfile.write(text.encode())
++            write = outfile.write
+             flush = outfile.flush
+ 
+         if self.error_color:
+             for ttype, value in tokensource:
+-                line = "%s\t%r\n" % (ttype, value)
++                line = b"%r\t%r\n" % (ttype, value)
+                 if ttype is Token.Error:
+                     write(colorize(self.error_color, line))
+                 else:
+                     write(line)
+         else:
+             for ttype, value in tokensource:
+-                write("%s\t%r\n" % (ttype, value))
++                write(b"%r\t%r\n" % (ttype, value))
+         flush()
+ 
+ 
+diff --git a/pygments/lexers/special.py b/pygments/lexers/special.py
+index a1e2200..b6f53ee 100644
+--- a/pygments/lexers/special.py
++++ b/pygments/lexers/special.py
+@@ -9,11 +9,12 @@
+     :license: BSD, see LICENSE for details.
+ """
+ 
++import ast
+ import re
+ 
+ from pygments.lexer import Lexer
+ from pygments.token import Token, Error, Text
+-from pygments.util import get_choice_opt, text_type, BytesIO
++from pygments.util import get_choice_opt
+ 
+ 
+ __all__ = ['TextLexer', 'RawTokenLexer']
+@@ -43,9 +44,7 @@ line_re = re.compile('.*?\n')
+ 
+ 
+ class RawTokenLexer(Lexer):
+     """
+-    Recreate a token stream formatted with the `RawTokenFormatter`. This
+-    lexer raises exceptions during parsing if the token stream in the
+-    file is malformed.
++    Recreate a token stream formatted with the `RawTokenFormatter`.
+ 
+     Additional options accepted:
+@@ -67,13 +66,16 @@
+         if self.compress:
+             if isinstance(text, str):
+                 text = text.encode('latin1')
+-            if self.compress == 'gz':
+-                import gzip
+-                gzipfile = gzip.GzipFile('', 'rb', 9, BytesIO(text))
+-                text = gzipfile.read()
+-            elif self.compress == 'bz2':
+-                import bz2
+-                text = bz2.decompress(text)
++            try:
++                if self.compress == 'gz':
++                    import gzip
++                    text = gzip.decompress(text)
++                elif self.compress == 'bz2':
++                    import bz2
++                    text = bz2.decompress(text)
++            except OSError:
++                yield Error, text.decode('latin1')
++        if isinstance(text, bytes):
+             text = text.decode('latin1')
+ 
+         # do not call Lexer.get_tokens() because stripping is not optional.
+@@ -86,10 +88,6 @@
+         for match in line_re.finditer(text):
+             try:
+                 ttypestr, val = match.group().rstrip().split('\t', 1)
+-            except ValueError:
+-                val = match.group()
+-                ttype = Error
+-            else:
+                 ttype = _ttype_cache.get(ttypestr)
+                 if not ttype:
+                     ttype = Token
+@@ -99,6 +97,11 @@
+                     raise ValueError('malformed token name')
+                 ttype = getattr(ttype, ttype_)
+                 _ttype_cache[ttypestr] = ttype
+-            val = val[1:-1].encode().decode('unicode-escape')
++                val = ast.literal_eval(val)
++                if not isinstance(val, str):
++                    raise ValueError('expected str')
++            except (SyntaxError, ValueError):
++                val = match.group()
++                ttype = Error
+             yield length, ttype, val
+             length += len(val)
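Condensed, the lexer half of this backport routes every malformed raw line (no tab, unparsable or non-string value, bad token path) into a single Error token instead of raising. A rough standalone sketch of that strategy, using a hypothetical parse_raw_line helper rather than the patched method:

```python
import ast

def parse_raw_line(line):
    # Hypothetical helper mirroring the patched control flow: any parse
    # failure degrades to an Error-tagged line rather than an exception.
    try:
        ttypestr, val = line.rstrip().split('\t', 1)
        val = ast.literal_eval(val)
        if not isinstance(val, str):
            raise ValueError('expected str')
    except (SyntaxError, ValueError):
        return 'Token.Error', line
    return ttypestr, val

assert parse_raw_line("Token.Text\t'x'\n") == ('Token.Text', 'x')
assert parse_raw_line("Tolkien\n") == ('Token.Error', 'Tolkien\n')
assert parse_raw_line("Token.Text\t42\n") == ('Token.Error', 'Token.Text\t42\n')
```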
+diff --git a/tests/test_raw_token.py b/tests/test_raw_token.py
+new file mode 100644
+index 0000000..bae5a49
+--- /dev/null
++++ b/tests/test_raw_token.py
+@@ -0,0 +1,68 @@
++import bz2
++import gzip
++
++from pygments import highlight
++from pygments.formatters import HtmlFormatter, RawTokenFormatter
++from pygments.lexers import PythonLexer, RawTokenLexer
++
++
++def test_raw_token():
++    code = "2 + α"
++    raw = highlight(code, PythonLexer(), RawTokenFormatter())
++    html = highlight(code, PythonLexer(), HtmlFormatter())
++
++    assert highlight(raw, RawTokenLexer(), RawTokenFormatter()) == raw
++    assert highlight(raw, RawTokenLexer(), HtmlFormatter()) == html
++    assert highlight(raw.decode(), RawTokenLexer(), HtmlFormatter()) == html
++
++    raw_gz = highlight(code, PythonLexer(), RawTokenFormatter(compress="gz"))
++    assert gzip.decompress(raw_gz) == raw
++    assert highlight(raw_gz, RawTokenLexer(compress="gz"), RawTokenFormatter()) == raw
++    assert (
++        highlight(
++            raw_gz.decode("latin1"), RawTokenLexer(compress="gz"), RawTokenFormatter()
++        )
++        == raw
++    )
++
++    raw_bz2 = highlight(code, PythonLexer(), RawTokenFormatter(compress="bz2"))
++    assert bz2.decompress(raw_bz2) == raw
++    assert highlight(raw_bz2, RawTokenLexer(compress="bz2"), RawTokenFormatter()) == raw
++    assert (
++        highlight(
++            raw_bz2.decode("latin1"), RawTokenLexer(compress="bz2"), RawTokenFormatter()
++        )
++        == raw
++    )
++
++
++def test_invalid_raw_token():
++    # These should not throw exceptions.
++    assert (
++        highlight("Tolkien", RawTokenLexer(), RawTokenFormatter())
++        == b"Token.Error\t'Tolkien\\n'\n"
++    )
++    assert (
++        highlight("Tolkien\t'x'", RawTokenLexer(), RawTokenFormatter())
++        == b"Token\t'x'\n"
++    )
++    assert (
++        highlight("Token.Text\t42", RawTokenLexer(), RawTokenFormatter())
++        == b"Token.Error\t'Token.Text\\t42\\n'\n"
++    )
++    assert (
++        highlight("Token.Text\t'", RawTokenLexer(), RawTokenFormatter())
++        == b'Token.Error\t"Token.Text\\t\'\\n"\n'
++    )
++    assert (
++        highlight("Token.Text\t'α'", RawTokenLexer(), RawTokenFormatter())
++        == b"Token.Text\t'\\u03b1'\n"
++    )
++    assert (
++        highlight("Token.Text\tu'α'", RawTokenLexer(), RawTokenFormatter())
++        == b"Token.Text\t'\\u03b1'\n"
++    )
++    assert (
++        highlight(b"Token.Text\t'\xff'", RawTokenLexer(), RawTokenFormatter())
++        == b"Token.Text\t'\\xff'\n"
++    )
+-- 
+2.27.0
+
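For reference, the format these tests exercise is one line per token: the token type, a tab, then repr(value). A quick usage sketch that mirrors the new tests (the exact token types emitted are an assumption here):

```python
from pygments import highlight
from pygments.formatters import RawTokenFormatter
from pygments.lexers import PythonLexer, RawTokenLexer

# RawTokenFormatter emits bytes: one "Token.Type<TAB>repr(value)" line
# per token, e.g. Token.Name followed by 'x'.
raw = highlight("x = 1", PythonLexer(), RawTokenFormatter())

# With the patches applied, the stream round-trips through RawTokenLexer.
assert highlight(raw, RawTokenLexer(), RawTokenFormatter()) == raw
```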
diff --git a/python-pygments.spec b/python-pygments.spec
index 24dcd72..d36ab54 100644
--- a/python-pygments.spec
+++ b/python-pygments.spec
@@ -16,7 +16,7 @@ need to prettify source code. \
 Highlights are: \
 Name: python-pygments
 Summary: Syntax highlighting engine written in Python
 Version: 2.5.2
-Release: 4
+Release: 5
 License: BSD
 URL: http://pygments.org/
 Source0: https://pypi.org/packages/source/P/Pygments/Pygments-%{version}.tar.gz
@@ -25,6 +25,9 @@ BuildArch: noarch
 
 Patch6000: backport-CVE-2021-20270.patch
 Patch6001: backport-CVE-2021-27291.patch
 Patch6002: backport-weed-out-more-backtracking-string-regexes.patch
+Patch6003: backport-fix-pop-from-empty-stack.patch
+Patch6004: backport-Fix-raw-token-lexer-Unicode.patch
+Patch6005: backport-support-Python3-and-handle-exceptions.patch
 
 %description %{_description}
@@ -85,6 +88,9 @@ cp -r doc/docs doc/reST
 %lang(en) %{_mandir}/man1/pygmentize.1*
 
 %changelog
+* Thu Nov 04 2021 tianwei - 2.5.2-5
+- Fix uncaught exception in get_tokens_unprocessed
+
 * Sun Sep 26 2021 huangduirong - 2.5.2-4
 - Infinite loop in get_tokens_unprocessed
-- 
Gitee
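A possible post-build smoke test for this update (illustrative; it restates what the new upstream tests already assert): malformed raw streams and undecompressable input should surface as Error tokens rather than exceptions.

```python
from pygments.lexers import RawTokenLexer
from pygments.token import Error

# A line with no tab cannot be a raw token line; it should come back as Error.
tokens = list(RawTokenLexer().get_tokens("not a raw token stream"))
assert all(ttype is Error for ttype, value in tokens)

# Garbage where gzip data is expected should also degrade to Error tokens.
tokens = list(RawTokenLexer(compress="gz").get_tokens(b"not gzip data"))
assert any(ttype is Error for ttype, value in tokens)
```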