diff --git a/backport-CVE-2024-5206.patch b/backport-CVE-2024-5206.patch
new file mode 100644
index 0000000000000000000000000000000000000000..0c7bfb26cfa0262cd074240afc99c7f491aa91ec
--- /dev/null
+++ b/backport-CVE-2024-5206.patch
@@ -0,0 +1,203 @@
+From 70ca21f106b603b611da73012c9ade7cd8e438b8 Mon Sep 17 00:00:00 2001
+From: Olivier Grisel
+Date: Mon, 22 Apr 2024 15:10:46 +0200
+Subject: [PATCH] FIX remove the computed stop_words_ attribute of text
+ vectorizer (#28823)
+
+Origin:
+https://github.com/scikit-learn/scikit-learn/commit/70ca21f106b603b611da73012c9ade7cd8e438b8
+---
+ sklearn/feature_extraction/tests/test_text.py | 35 ----------------
+ sklearn/feature_extraction/text.py            | 40 +++----------------
+ 2 files changed, 5 insertions(+), 70 deletions(-)
+
+diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py
+index 79c5486..9c4a017 100644
+--- a/sklearn/feature_extraction/tests/test_text.py
++++ b/sklearn/feature_extraction/tests/test_text.py
+@@ -584,14 +584,11 @@ def test_feature_names():
+ @pytest.mark.parametrize('Vectorizer', (CountVectorizer, TfidfVectorizer))
+ def test_vectorizer_max_features(Vectorizer):
+     expected_vocabulary = set(['burger', 'beer', 'salad', 'pizza'])
+-    expected_stop_words = set([u'celeri', u'tomato', u'copyright', u'coke',
+-                               u'sparkling', u'water', u'the'])
+
+     # test bounded number of extracted features
+     vectorizer = Vectorizer(max_df=0.6, max_features=4)
+     vectorizer.fit(ALL_FOOD_DOCS)
+     assert_equal(set(vectorizer.vocabulary_), expected_vocabulary)
+-    assert_equal(vectorizer.stop_words_, expected_stop_words)
+
+
+ def test_count_vectorizer_max_features():
+@@ -626,21 +623,16 @@ def test_vectorizer_max_df():
+     vect.fit(test_data)
+     assert 'a' in vect.vocabulary_.keys()
+     assert_equal(len(vect.vocabulary_.keys()), 6)
+-    assert_equal(len(vect.stop_words_), 0)
+
+     vect.max_df = 0.5  # 0.5 * 3 documents -> max_doc_count == 1.5
+     vect.fit(test_data)
+     assert 'a' not in vect.vocabulary_.keys()  # {ae} ignored
+     assert_equal(len(vect.vocabulary_.keys()), 4)  # {bcdt} remain
+-    assert 'a' in vect.stop_words_
+-    assert_equal(len(vect.stop_words_), 2)
+
+     vect.max_df = 1
+     vect.fit(test_data)
+     assert 'a' not in vect.vocabulary_.keys()  # {ae} ignored
+     assert_equal(len(vect.vocabulary_.keys()), 4)  # {bcdt} remain
+-    assert 'a' in vect.stop_words_
+-    assert_equal(len(vect.stop_words_), 2)
+
+
+ def test_vectorizer_min_df():
+@@ -649,21 +641,16 @@ def test_vectorizer_min_df():
+     vect.fit(test_data)
+     assert 'a' in vect.vocabulary_.keys()
+     assert_equal(len(vect.vocabulary_.keys()), 6)
+-    assert_equal(len(vect.stop_words_), 0)
+
+     vect.min_df = 2
+     vect.fit(test_data)
+     assert 'c' not in vect.vocabulary_.keys()  # {bcdt} ignored
+     assert_equal(len(vect.vocabulary_.keys()), 2)  # {ae} remain
+-    assert 'c' in vect.stop_words_
+-    assert_equal(len(vect.stop_words_), 4)
+
+     vect.min_df = 0.8  # 0.8 * 3 documents -> min_doc_count == 2.4
+     vect.fit(test_data)
+     assert 'c' not in vect.vocabulary_.keys()  # {bcdet} ignored
+     assert_equal(len(vect.vocabulary_.keys()), 1)  # {a} remains
+-    assert 'c' in vect.stop_words_
+-    assert_equal(len(vect.stop_words_), 5)
+
+
+ def test_count_binary_occurrences():
+@@ -936,28 +923,6 @@ def test_countvectorizer_vocab_dicts_when_pickling():
+     assert_equal(cv.get_feature_names(), unpickled_cv.get_feature_names())
+
+
+-def test_stop_words_removal():
+-    # Ensure that deleting the stop_words_ attribute doesn't affect transform
+-
+-    fitted_vectorizers = (
+-        TfidfVectorizer().fit(JUNK_FOOD_DOCS),
+-        CountVectorizer(preprocessor=strip_tags).fit(JUNK_FOOD_DOCS),
+-        CountVectorizer(strip_accents=strip_eacute).fit(JUNK_FOOD_DOCS)
+-    )
+-
+-    for vect in fitted_vectorizers:
+-        vect_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
+-
+-        vect.stop_words_ = None
+-        stop_None_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
+-
+-        delattr(vect, 'stop_words_')
+-        stop_del_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
+-
+-        assert_array_equal(stop_None_transform, vect_transform)
+-        assert_array_equal(stop_del_transform, vect_transform)
+-
+-
+ def test_pickling_transformer():
+     X = CountVectorizer().fit_transform(JUNK_FOOD_DOCS)
+     orig = TfidfTransformer().fit(X)
+diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
+index 19d5c7f..bd9da0b 100644
+--- a/sklearn/feature_extraction/text.py
++++ b/sklearn/feature_extraction/text.py
+@@ -795,15 +795,6 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):
+     vocabulary_ : dict
+         A mapping of terms to feature indices.
+
+-    stop_words_ : set
+-        Terms that were ignored because they either:
+-
+-          - occurred in too many documents (`max_df`)
+-          - occurred in too few documents (`min_df`)
+-          - were cut off by feature selection (`max_features`).
+-
+-        This is only available if no vocabulary was given.
+-
+     Examples
+     --------
+     >>> from sklearn.feature_extraction.text import CountVectorizer
+@@ -827,11 +818,6 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):
+     --------
+     HashingVectorizer, TfidfVectorizer
+
+-    Notes
+-    -----
+-    The ``stop_words_`` attribute can get large and increase the model size
+-    when pickling. This attribute is provided only for introspection and can
+-    be safely removed using delattr or set to None before pickling.
+     """
+
+     def __init__(self, input='content', encoding='utf-8',
+@@ -909,18 +895,16 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):
+             mask = new_mask
+
+         new_indices = np.cumsum(mask) - 1  # maps old indices to new
+-        removed_terms = set()
+         for term, old_index in list(six.iteritems(vocabulary)):
+             if mask[old_index]:
+                 vocabulary[term] = new_indices[old_index]
+             else:
+                 del vocabulary[term]
+-                removed_terms.add(term)
+         kept_indices = np.where(mask)[0]
+         if len(kept_indices) == 0:
+             raise ValueError("After pruning, no terms remain. Try a lower"
+                              " min_df or a higher max_df.")
+-        return X[:, kept_indices], removed_terms
++        return X[:, kept_indices]
+
+     def _count_vocab(self, raw_documents, fixed_vocab):
+         """Create sparse feature matrix, and vocabulary where fixed_vocab=False
+@@ -1046,10 +1030,10 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):
+             if max_doc_count < min_doc_count:
+                 raise ValueError(
+                     "max_df corresponds to < documents than min_df")
+-            X, self.stop_words_ = self._limit_features(X, vocabulary,
+-                                                       max_doc_count,
+-                                                       min_doc_count,
+-                                                       max_features)
++            X = self._limit_features(X, vocabulary,
++                                     max_doc_count,
++                                     min_doc_count,
++                                     max_features)
+
+         self.vocabulary_ = vocabulary
+
+@@ -1459,15 +1443,6 @@ class TfidfVectorizer(CountVectorizer):
+         The inverse document frequency (IDF) vector; only defined
+         if ``use_idf`` is True.
+
+-    stop_words_ : set
+-        Terms that were ignored because they either:
+-
+-          - occurred in too many documents (`max_df`)
+-          - occurred in too few documents (`min_df`)
+-          - were cut off by feature selection (`max_features`).
+-
+-        This is only available if no vocabulary was given.
+-
+     Examples
+     --------
+     >>> from sklearn.feature_extraction.text import TfidfVectorizer
+@@ -1491,11 +1466,6 @@ class TfidfVectorizer(CountVectorizer):
+     TfidfTransformer : Performs the TF-IDF transformation from a provided
+         matrix of counts.
+
+-    Notes
+-    -----
+-    The ``stop_words_`` attribute can get large and increase the model size
+-    when pickling. This attribute is provided only for introspection and can
+-    be safely removed using delattr or set to None before pickling.
+     """
+
+     def __init__(self, input='content', encoding='utf-8',
+--
+2.33.0
+
diff --git a/python-scikit-learn.spec b/python-scikit-learn.spec
index 3209927b86cb978ebe0e993decf0ad6e0934a7b4..380bf031d3b783a84b8bf88173e5f8e3a2930c8c 100644
--- a/python-scikit-learn.spec
+++ b/python-scikit-learn.spec
@@ -3,10 +3,11 @@
 Name: python-scikit-learn
 Summary: A Python module for machine learning built on top of SciPy
 Version: 0.20.4
-Release: 4
+Release: 5
 License: BSD
 URL: https://scikit-learn.org/stable/
 Source0: https://github.com/scikit-learn/scikit-learn/archive/%{version}/scikit-learn-%{version}.tar.gz
+Patch3000: backport-CVE-2024-5206.patch
 
 %global _description\
 scikit-learn is a Python module for machine learning built on top of SciPy\
@@ -19,14 +20,14 @@
 Summary: %summary
 %{?python_provide:%python_provide python3-scikit-learn}
 %{?python_provide:%python_provide python3-sklearn}
-BuildRequires: git python3-devel python3-numpy python3-Cython python3-pytest
+BuildRequires: python3-devel python3-numpy python3-Cython python3-pytest
 Requires: python3 >= 3.5 python3-numpy >= 1.11.0
 Requires: python3-scipy >= 0.17.0 python3-joblib >= 0.11
 
 %description -n python3-scikit-learn %_description
 
 %prep
-%autosetup -n scikit-learn-%{version} -p1 -Sgit
+%autosetup -n scikit-learn-%{version} -p1
 
 %build
 %py3_build
@@ -41,6 +42,9 @@ Requires: python3-scipy >= 0.17.0 python3-joblib >= 0.11
 %{python3_sitearch}/scikit_learn-*.egg-info
 
 %changelog
+* Mon Jun 17 2024 yaoxin - 0.20.4-5
+- Fix CVE-2024-5206
+
 * Mon Sep 27 2021 lingsheng - 0.20.4-4
 - Provides python-scikit-learn and python-sklearn for compatibility
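
Reviewer note: CVE-2024-5206 is a data leak, not a crash. Before this patch, a fitted CountVectorizer/TfidfVectorizer stored every term pruned by max_df, min_df or max_features verbatim in the computed stop_words_ attribute, so a pickled model shared with another party carried raw tokens from the training corpus; the patch stops computing the attribute altogether. The snippet below is a minimal sketch of the difference. It is not part of the patch: the corpus, the min_df=2 threshold and the token_pattern override are made-up values, chosen only so that the pruned tokens are recognizably sensitive.

import pickle

from sklearn.feature_extraction.text import TfidfVectorizer

# Hypothetical training corpus containing per-user identifiers.
docs = [
    "alice.smith@example.com requested a password reset",
    "bob.jones@example.com requested a password reset",
    "carol.lee@example.com requested a password reset",
]

# token_pattern=r"\S+" keeps each e-mail address as a single token;
# min_df=2 then prunes every token seen in fewer than two documents.
vec = TfidfVectorizer(min_df=2, token_pattern=r"\S+").fit(docs)

# Simulate shipping the fitted model to another party.
restored = pickle.loads(pickle.dumps(vec))

if hasattr(restored, "stop_words_"):
    # Unpatched behavior: the pruned tokens (here, the per-user
    # e-mail addresses) ride along inside the pickle.
    print("leaked:", sorted(restored.stop_words_))
else:
    # Patched behavior: the attribute is never stored, nothing leaks.
    print("no stop_words_ attribute; nothing to leak")

This is also why the backport deletes test_stop_words_removal outright: once the attribute is never created, there is no stop_words_ state left whose removal could affect transform().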