diff --git a/backport-CVE-2024-5206.patch b/backport-CVE-2024-5206.patch
deleted file mode 100644
index fb0a8e82027dcdcca6f54652ef17bfbd7f6e35ba..0000000000000000000000000000000000000000
--- a/backport-CVE-2024-5206.patch
+++ /dev/null
@@ -1,235 +0,0 @@
-From 70ca21f106b603b611da73012c9ade7cd8e438b8 Mon Sep 17 00:00:00 2001
-From: Olivier Grisel
-Date: Mon, 22 Apr 2024 15:10:46 +0200
-Subject: [PATCH] FIX remove the computed stop_words_ attribute of text
- vectorizer (#28823)
-
----
- doc/whats_new/v1.4.rst                        | 18 ++++++++
- sklearn/feature_extraction/tests/test_text.py | 42 -------------------
- sklearn/feature_extraction/text.py            | 36 +---------------
- 3 files changed, 20 insertions(+), 76 deletions(-)
-
-diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst
-index ad3cc40..321db3b 100644
---- a/doc/whats_new/v1.4.rst
-+++ b/doc/whats_new/v1.4.rst
-@@ -14,6 +14,24 @@ For a short description of the main highlights of the release, please refer to
- 
- .. include:: changelog_legend.inc
- 
-+Security
-+--------
-+
-+- |Fix| :class:`feature_extraction.text.CountVectorizer` and
-+  :class:`feature_extraction.text.TfidfVectorizer` no longer store discarded
-+  tokens from the training set in their `stop_words_` attribute. This attribute
-+  would hold too frequent (above `max_df`) but also too rare tokens (below
-+  `min_df`). This fixes a potential security issue (data leak) if the discarded
-+  rare tokens hold sensitive information from the training set without the
-+  model developer's knowledge.
-+
-+  Note: users of those classes are encouraged to either retrain their pipelines
-+  with the new scikit-learn version or to manually clear the `stop_words_`
-+  attribute from previously trained instances of those transformers. This
-+  attribute was designed only for model inspection purposes and has no impact
-+  on the behavior of the transformers.
-+  :pr:`28823` by :user:`Olivier Grisel`.
-+
- Changed models
- --------------
- 
-diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py
-index 7c7cac8..b784716 100644
---- a/sklearn/feature_extraction/tests/test_text.py
-+++ b/sklearn/feature_extraction/tests/test_text.py
-@@ -757,21 +757,11 @@ def test_feature_names():
- @pytest.mark.parametrize("Vectorizer", (CountVectorizer, TfidfVectorizer))
- def test_vectorizer_max_features(Vectorizer):
-     expected_vocabulary = {"burger", "beer", "salad", "pizza"}
--    expected_stop_words = {
--        "celeri",
--        "tomato",
--        "copyright",
--        "coke",
--        "sparkling",
--        "water",
--        "the",
--    }
- 
-     # test bounded number of extracted features
-     vectorizer = Vectorizer(max_df=0.6, max_features=4)
-     vectorizer.fit(ALL_FOOD_DOCS)
-     assert set(vectorizer.vocabulary_) == expected_vocabulary
--    assert vectorizer.stop_words_ == expected_stop_words
- 
- 
- def test_count_vectorizer_max_features():
-@@ -806,21 +796,16 @@ def test_vectorizer_max_df():
-     vect.fit(test_data)
-     assert "a" in vect.vocabulary_.keys()
-     assert len(vect.vocabulary_.keys()) == 6
--    assert len(vect.stop_words_) == 0
- 
-     vect.max_df = 0.5  # 0.5 * 3 documents -> max_doc_count == 1.5
-     vect.fit(test_data)
-     assert "a" not in vect.vocabulary_.keys()  # {ae} ignored
-     assert len(vect.vocabulary_.keys()) == 4  # {bcdt} remain
--    assert "a" in vect.stop_words_
--    assert len(vect.stop_words_) == 2
- 
-     vect.max_df = 1
-     vect.fit(test_data)
-     assert "a" not in vect.vocabulary_.keys()  # {ae} ignored
-     assert len(vect.vocabulary_.keys()) == 4  # {bcdt} remain
--    assert "a" in vect.stop_words_
--    assert len(vect.stop_words_) == 2
- 
- 
- def test_vectorizer_min_df():
-@@ -829,21 +814,16 @@ def test_vectorizer_min_df():
-     vect.fit(test_data)
-     assert "a" in vect.vocabulary_.keys()
-     assert len(vect.vocabulary_.keys()) == 6
--    assert len(vect.stop_words_) == 0
- 
-     vect.min_df = 2
-     vect.fit(test_data)
-     assert "c" not in vect.vocabulary_.keys()  # {bcdt} ignored
-     assert len(vect.vocabulary_.keys()) == 2  # {ae} remain
--    assert "c" in vect.stop_words_
--    assert len(vect.stop_words_) == 4
- 
-     vect.min_df = 0.8  # 0.8 * 3 documents -> min_doc_count == 2.4
-     vect.fit(test_data)
-     assert "c" not in vect.vocabulary_.keys()  # {bcdet} ignored
-     assert len(vect.vocabulary_.keys()) == 1  # {a} remains
--    assert "c" in vect.stop_words_
--    assert len(vect.stop_words_) == 5
- 
- 
- def test_count_binary_occurrences():
-@@ -1156,28 +1136,6 @@ def test_countvectorizer_vocab_dicts_when_pickling():
-     )
- 
- 
--def test_stop_words_removal():
--    # Ensure that deleting the stop_words_ attribute doesn't affect transform
--
--    fitted_vectorizers = (
--        TfidfVectorizer().fit(JUNK_FOOD_DOCS),
--        CountVectorizer(preprocessor=strip_tags).fit(JUNK_FOOD_DOCS),
--        CountVectorizer(strip_accents=strip_eacute).fit(JUNK_FOOD_DOCS),
--    )
--
--    for vect in fitted_vectorizers:
--        vect_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
--
--        vect.stop_words_ = None
--        stop_None_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
--
--        delattr(vect, "stop_words_")
--        stop_del_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
--
--        assert_array_equal(stop_None_transform, vect_transform)
--        assert_array_equal(stop_del_transform, vect_transform)
--
--
- def test_pickling_transformer():
-     X = CountVectorizer().fit_transform(JUNK_FOOD_DOCS)
-     orig = TfidfTransformer().fit(X)
-diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
-index 29104c2..e9727ae 100644
---- a/sklearn/feature_extraction/text.py
-+++ b/sklearn/feature_extraction/text.py
-@@ -1081,15 +1081,6 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):
-         True if a fixed vocabulary of term to indices mapping
-         is provided by the user.
- 
--    stop_words_ : set
--        Terms that were ignored because they either:
--
--          - occurred in too many documents (`max_df`)
--          - occurred in too few documents (`min_df`)
--          - were cut off by feature selection (`max_features`).
--
--        This is only available if no vocabulary was given.
--
-     See Also
-     --------
-     HashingVectorizer : Convert a collection of text documents to a
-@@ -1098,12 +1089,6 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):
-     TfidfVectorizer : Convert a collection of raw documents to a matrix
-         of TF-IDF features.
- 
--    Notes
--    -----
--    The ``stop_words_`` attribute can get large and increase the model size
--    when pickling. This attribute is provided only for introspection and can
--    be safely removed using delattr or set to None before pickling.
--
-     Examples
-     --------
-     >>> from sklearn.feature_extraction.text import CountVectorizer
-@@ -1242,19 +1227,17 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):
-             mask = new_mask
- 
-         new_indices = np.cumsum(mask) - 1  # maps old indices to new
--        removed_terms = set()
-         for term, old_index in list(vocabulary.items()):
-             if mask[old_index]:
-                 vocabulary[term] = new_indices[old_index]
-             else:
-                 del vocabulary[term]
--                removed_terms.add(term)
-         kept_indices = np.where(mask)[0]
-         if len(kept_indices) == 0:
-             raise ValueError(
-                 "After pruning, no terms remain. Try a lower min_df or a higher max_df."
-             )
--        return X[:, kept_indices], removed_terms
-+        return X[:, kept_indices]
- 
-     def _count_vocab(self, raw_documents, fixed_vocab):
-         """Create sparse feature matrix, and vocabulary where fixed_vocab=False"""
-@@ -1399,7 +1382,7 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):
-             raise ValueError("max_df corresponds to < documents than min_df")
-         if max_features is not None:
-             X = self._sort_features(X, vocabulary)
--            X, self.stop_words_ = self._limit_features(
-+            X = self._limit_features(
-                 X, vocabulary, max_doc_count, min_doc_count, max_features
-             )
-         if max_features is None:
-@@ -1932,15 +1915,6 @@ class TfidfVectorizer(CountVectorizer):
-         The inverse document frequency (IDF) vector; only defined
-         if ``use_idf`` is True.
- 
--    stop_words_ : set
--        Terms that were ignored because they either:
--
--          - occurred in too many documents (`max_df`)
--          - occurred in too few documents (`min_df`)
--          - were cut off by feature selection (`max_features`).
--
--        This is only available if no vocabulary was given.
--
-     See Also
-     --------
-     CountVectorizer : Transforms text into a sparse matrix of n-gram counts.
-@@ -1948,12 +1922,6 @@ class TfidfVectorizer(CountVectorizer):
-     TfidfTransformer : Performs the TF-IDF transformation from a provided
-         matrix of counts.
- 
--    Notes
--    -----
--    The ``stop_words_`` attribute can get large and increase the model size
--    when pickling. This attribute is provided only for introspection and can
--    be safely removed using delattr or set to None before pickling.
--
-     Examples
-     --------
-     >>> from sklearn.feature_extraction.text import TfidfVectorizer
--- 
-2.27.0
-
diff --git a/python-scikit-learn.spec b/python-scikit-learn.spec
index 36cd388d5fc19e542ddd7cff40f3c335fa150671..8785b39bdb4dc53e58dc3ca73a47a30eb47d7920 100644
--- a/python-scikit-learn.spec
+++ b/python-scikit-learn.spec
@@ -2,12 +2,11 @@
 
 Name:           python-scikit-learn
 Summary:        A Python module for machine learning built on top of SciPy
-Version:        1.4.0
-Release:        2
-License:        BSD
+Version:        1.6.1
+Release:        1
+License:        BSD-3-Clause
 URL:            https://scikit-learn.org/stable/
-Source0:        https://files.pythonhosted.org/packages/source/s/scikit-learn/scikit-learn-%{version}.tar.gz
-Patch3000:      backport-CVE-2024-5206.patch
+Source0:        https://files.pythonhosted.org/packages/source/s/scikit_learn/scikit_learn-%{version}.tar.gz
 
 %global _description\
 scikit-learn is a Python module for machine learning built on top of SciPy\
@@ -21,14 +20,14 @@ Summary: %summary
 %{?python_provide:%python_provide python3-sklearn}
 
 BuildRequires:  python3-devel python3-numpy python3-Cython python3-pytest python3-scipy g++
-BuildRequires:  python3-pip python3-wheel
+BuildRequires:  python3-pip python3-wheel python3-meson-python
 Requires:       python3 >= 3.5 python3-numpy >= 1.11.0
 Requires:       python3-scipy >= 0.17.0 python3-joblib >= 0.11
 
 %description -n python3-scikit-learn %_description
 
 %prep
-%autosetup -n scikit-learn-%{version} -p1
+%autosetup -n scikit_learn-%{version} -p1
 
 %build
 CFLAGS="$RPM_OPT_FLAGS -s"
@@ -44,6 +43,26 @@ CFLAGS="$RPM_OPT_FLAGS -s"
 %{python3_sitearch}/scikit_learn-%{version}.dist-info/
 
 %changelog
+* Sun Apr 27 2025 yaoxin <1024769339@qq.com> - 1.6.1-1
+- Update to 1.6.1:
+  * __sklearn_tags__ was introduced for setting tags in estimators.
+  * Scikit-learn classes and functions can be used while only having
+    an `import sklearn` import line.
+  * Many classes now properly handle Matplotlib aliases for style
+    parameters (e.g., c and color, ls and linestyle, etc.).
+  * `utils.validation.validate_data` is introduced and replaces the
+    previously private base.BaseEstimator._validate_data method.
+    This is intended for third-party estimator developers, who should
+    use this function in most cases instead of `utils.check_array`
+    and `utils.check_X_y`.
+  * Additional estimators and functions have been updated to include
+    support for all Array API compliant inputs.
+  * Preliminary free-threaded CPython 3.13 support.
+  * The tags.input_tags.sparse flag was corrected for a majority of
+    estimators.
+  * _more_tags, _get_tags, and _safe_tags now raise a `DeprecationWarning`
+    instead of a `FutureWarning`, to notify developers rather than end-users.
+
 * Fri Jun 07 2024 xuchenchen - 1.4.0-2
 - Type:CVES
 - ID:CVE-2024-5206
diff --git a/scikit-learn-1.4.0.tar.gz b/scikit_learn-1.6.1.tar.gz
similarity index 48%
rename from scikit-learn-1.4.0.tar.gz
rename to scikit_learn-1.6.1.tar.gz
index 1293e4285f824eec57fa9e4ec62b5a572bfdeb03..afa4f73ddf5ad68e5fb5e3d67fe9cca474774771 100644
Binary files a/scikit-learn-1.4.0.tar.gz and b/scikit_learn-1.6.1.tar.gz differ
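Note on the dropped CVE patch: as its changelog text above states, `stop_words_` was informational only, so clearing it from an already-fitted vectorizer does not change `transform()` output. A minimal sketch of the cleanup recommended for models fitted with older, unpatched scikit-learn builds (the toy corpus is illustrative, not from the test suite):

```python
# Sketch (not part of this commit): drop the stop_words_ attribute from a
# vectorizer pickled under an unpatched scikit-learn, as the changelog advises.
from sklearn.feature_extraction.text import CountVectorizer

docs = ["the pizza beer", "the burger beer", "the celeri copyright"]  # toy corpus
vect = CountVectorizer(max_df=0.6).fit(docs)

before = vect.transform(docs).toarray()
if hasattr(vect, "stop_words_"):   # only present on unpatched versions
    del vect.stop_words_           # equivalently: vect.stop_words_ = None
after = vect.transform(docs).toarray()

assert (before == after).all()     # the attribute never affected transform()
```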
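The 1.6.1 changelog bullet about `utils.validation.validate_data` is aimed at third-party estimator authors. A sketch of the intended pattern, assuming scikit-learn >= 1.6; `MeanCenterer` is a made-up example class, not part of the package:

```python
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted, validate_data


class MeanCenterer(TransformerMixin, BaseEstimator):
    """Toy transformer that subtracts the per-column training mean."""

    def fit(self, X, y=None):
        X = validate_data(self, X)  # public replacement for self._validate_data(X)
        self.mean_ = X.mean(axis=0)
        return self

    def transform(self, X):
        check_is_fitted(self)
        X = validate_data(self, X, reset=False)  # re-validate without resetting n_features_in_
        return X - self.mean_


print(MeanCenterer().fit_transform(np.array([[1.0, 2.0], [3.0, 4.0]])))
```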
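Likewise, the `__sklearn_tags__` and `tags.input_tags.sparse` bullets fit together; a sketch under the same scikit-learn >= 1.6 assumption (the class name is hypothetical):

```python
from sklearn.base import BaseEstimator


class SparseAwareEstimator(BaseEstimator):
    """Toy estimator advertising scipy.sparse input support via the new tag hook."""

    def __sklearn_tags__(self):
        # Replaces the deprecated _more_tags()/_get_tags() mechanism.
        tags = super().__sklearn_tags__()
        tags.input_tags.sparse = True
        return tags
```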