Dans ce défi, appliquez le flux de travail complet appris dans le cours — du prétraitement des données à l'entraînement, jusqu'à l'évaluation du modèle.


import unittest
import pandas as pd
import numpy as np

def _dynamic_test(test_case, condition, success_message, failure_message):
    if condition:
        test_case._testMethodName = success_message
        test_case.assertTrue(True, success_message)
    else:
        test_case._testMethodName = failure_message
        test_case.fail(failure_message)

class TestPipelineWithGridSearch(unittest.TestCase):
    """Grade the learner's ``user_code`` module for the pipeline challenge.

    Every test inspects module-level variables of ``user_code`` and reports
    the outcome through ``_dynamic_test``. A missing or malformed variable is
    reported with the descriptive failure message rather than an unhandled
    exception.

    NOTE: test methods are documented with ``#`` comments instead of
    docstrings on purpose -- unittest includes a test method's docstring in
    the description it reports, which would change the runner output.
    """

    @classmethod
    def setUpClass(cls):
        """Load the penguins dataset and import the learner's module once."""
        cls.df = pd.read_csv(
            'https://codefinity-content-media.s3.eu-west-1.amazonaws.com/a65bbc96-309e-4df9-a790-a1eb8c815a1c/penguins.csv'
        )
        # Keep only the rows that have fewer than two missing values.
        # None of the tests in this class read ``cls.df``; it is kept as-is.
        cls.df = cls.df[cls.df.isna().sum(axis=1) < 2]
        import user_code
        # Expected names: label_enc, y, X_train, X_test, y_train, y_test,
        # ct, param_grid, grid_search, pipe
        cls.uc = user_code

    @staticmethod
    def _column_names(cols):
        # Normalise a ColumnTransformer column selector to a set of names.
        # Returns None when the selector is not a collection of hashable
        # items (e.g. a slice, a callable or a scalar index).
        if isinstance(cols, str):
            return {cols}
        try:
            return set(cols)
        except TypeError:
            return None

    def test_target_encoded_with_labelencoder(self):
        # ``label_enc`` must be a fitted LabelEncoder and ``y`` a 1-D integer
        # array-like.
        from sklearn.preprocessing import LabelEncoder
        uc = self.uc
        label_enc = getattr(uc, 'label_enc', None)
        y = getattr(uc, 'y', None)
        cond = False
        # ``classes_`` only exists once the encoder has been fitted.
        if isinstance(label_enc, LabelEncoder) and hasattr(label_enc, 'classes_') \
                and y is not None:
            y_arr = np.asarray(y)
            cond = y_arr.ndim == 1 and np.issubdtype(y_arr.dtype, np.integer)
        _dynamic_test(
            self,
            cond,
            "Target is encoded with LabelEncoder to integer dtype",
            "Target must be encoded with LabelEncoder to integer dtype"
        )

    def test_train_test_split_33_percent(self):
        # The test part must hold about 33% of the samples (tolerance of 2)
        # and X/y must have matching total sizes.
        uc = self.uc
        has_vars = all(hasattr(uc, v) for v in ['X_train', 'X_test', 'y_train', 'y_test'])
        cond = False
        if has_vars:
            try:
                n_total = len(uc.X_train) + len(uc.X_test)
                expected_test = int(round(0.33 * n_total))
                # An empty split would otherwise satisfy both comparisons.
                cond = n_total > 0 \
                    and len(uc.y_train) + len(uc.y_test) == n_total \
                    and abs(len(uc.X_test) - expected_test) <= 2
            except TypeError:
                # One of the variables has no length (e.g. it is None).
                cond = False
        _dynamic_test(
            self,
            cond,
            "Data split uses approximately 33% for the test set",
            "Data must be split with test_size=0.33"
        )

    def test_columntransformer_ohe_passthrough(self):
        # ``ct`` must one-hot encode exactly {'island', 'sex'} and pass the
        # remaining columns through.
        from sklearn.compose import ColumnTransformer
        from sklearn.preprocessing import OneHotEncoder
        uc = self.uc
        ct = getattr(uc, 'ct', None)
        cond_ct = isinstance(ct, ColumnTransformer)
        found_ohe = False
        if cond_ct:
            for _, trans, cols in ct.transformers:
                if isinstance(trans, OneHotEncoder) \
                        and self._column_names(cols) == {'island', 'sex'}:
                    found_ohe = True
                    break
        cond = cond_ct and found_ohe and getattr(ct, 'remainder', None) == 'passthrough'
        _dynamic_test(
            self,
            cond,
            "ColumnTransformer applies OneHotEncoder to ['island', 'sex'] with remainder='passthrough'",
            "ColumnTransformer must OneHotEncode ['island', 'sex'] and set remainder='passthrough'"
        )

    def test_param_grid_values(self):
        # ``param_grid['n_neighbors']`` must be a non-empty collection whose
        # elements are all odd integers.
        uc = self.uc
        pg = getattr(uc, 'param_grid', None)

        cond = False
        if isinstance(pg, dict) and 'n_neighbors' in pg:
            try:
                n_neighbors_vals = list(pg['n_neighbors'])
            except TypeError:
                # Not iterable (e.g. a single number instead of a list).
                n_neighbors_vals = []
            # NumPy integers are accepted so grids built with np.arange pass;
            # bool is excluded because True is an int subclass and odd.
            cond = bool(n_neighbors_vals) and all(
                isinstance(x, (int, np.integer))
                and not isinstance(x, bool)
                and x % 2 != 0
                for x in n_neighbors_vals
            )

        _dynamic_test(
            self,
            cond,
            "param_grid defines odd values for n_neighbors, allowing experimentation",
            "param_grid must be a dict and include 'n_neighbors' with a list of odd integer values"
        )

    def test_gridsearchcv_with_knn(self):
        # ``grid_search`` must be a GridSearchCV wrapping KNeighborsClassifier.
        from sklearn.model_selection import GridSearchCV
        from sklearn.neighbors import KNeighborsClassifier
        uc = self.uc
        gs = getattr(uc, 'grid_search', None)
        cond = isinstance(gs, GridSearchCV) and isinstance(getattr(gs, 'estimator', None), KNeighborsClassifier)
        _dynamic_test(
            self,
            cond,
            "GridSearchCV is initialized with KNeighborsClassifier and given param_grid",
            "GridSearchCV must be initialized with KNeighborsClassifier and given param_grid"
        )

    def test_pipeline_structure_and_order(self):
        # Expected order: columntransformer -> simpleimputer -> standardscaler -> gridsearchcv
        uc = self.uc
        # A missing ``pipe`` or an object without ``steps`` yields no names.
        steps = getattr(getattr(uc, 'pipe', None), 'steps', None) or []
        try:
            step_names = [name for name, _ in steps]
        except (TypeError, ValueError):
            # ``steps`` is not a sequence of (name, estimator) pairs.
            step_names = []
        expected = ['columntransformer', 'simpleimputer', 'standardscaler', 'gridsearchcv']
        cond = step_names == expected
        _dynamic_test(
            self,
            cond,
            "Pipeline steps are in order: ColumnTransformer, SimpleImputer, StandardScaler, GridSearchCV",
            "Pipeline steps must be: ColumnTransformer, SimpleImputer, StandardScaler, GridSearchCV"
        )

    def test_fitted_on_train_and_scores_on_test(self):
        # ``pipe.score`` on the test split must return a finite float.
        uc = self.uc
        try:
            score = uc.pipe.score(uc.X_test, uc.y_test)
            cond = isinstance(score, (float, np.floating)) and np.isfinite(score)
        except Exception:
            # Any error in learner code counts as a failed check.
            cond = False
        _dynamic_test(
            self,
            cond,
            "Pipeline is fitted on train and computes a finite score on the test set",
            "Pipeline must be fitted on train and compute a finite score on the test set"
        )

    def test_predictions_returned_for_X_test(self):
        # ``pipe.predict`` must return one prediction per row of ``X_test``.
        uc = self.uc
        try:
            y_pred = uc.pipe.predict(uc.X_test)
            cond = isinstance(y_pred, (np.ndarray, list)) and len(y_pred) == len(uc.X_test)
        except Exception:
            # Any error in learner code counts as a failed check.
            cond = False
        _dynamic_test(
            self,
            cond,
            "Pipeline returns predictions for X_test",
            "Pipeline must return predictions for X_test"
        )

    def test_best_estimator_available(self):
        # ``grid_search.best_estimator_`` must be readable and not None.
        uc = self.uc
        try:
            be = uc.grid_search.best_estimator_
            cond = be is not None
        except Exception:
            # Any error while reading the attribute counts as a failed check.
            cond = False
        _dynamic_test(
            self,
            cond,
            "Best estimator is available via grid_search.best_estimator_",
            "Best estimator must be available via grid_search.best_estimator_"
        )

# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()

test_code.py

L'apprentissage automatique est désormais utilisé partout. Vous souhaitez l'apprendre vous-même ? Ce cours constitue une introduction au monde de l'apprentissage automatique afin de vous permettre d'acquérir les concepts de base, de travailler avec Scikit-learn – la bibliothèque la plus populaire pour le ML – et de réaliser votre premier projet d'apprentissage automatique.
Ce cours s'adresse aux étudiants ayant des connaissances de base en Python, Pandas et Numpy.

Découvrez les concepts de l'apprentissage automatique et le flux de travail d'un projet ML.

Le prétraitement est probablement l’étape la plus importante d’un projet ML. Ce chapitre couvre les étapes de prétraitement nécessaires pour presque tout jeu de données.

Un pipeline est une méthode élégante pour combiner toutes les étapes de prétraitement ainsi qu’un modèle. Les pipelines facilitent grandement l’entraînement et l’utilisation d’un modèle.

La modélisation est l'étape la plus intéressante d'un projet ML. Apprenons à construire, ajuster et évaluer le modèle !

Challenge: Putting It All Together

Solution