En este desafío, aplica el flujo de trabajo completo aprendido en el curso: desde el preprocesamiento de datos hasta el entrenamiento y la evaluación del modelo.


import unittest
import pandas as pd
import numpy as np

def _dynamic_test(test_case, condition, success_message, failure_message):
    """Report the outcome of one check through ``test_case``.

    The message matching the outcome is written to
    ``test_case._testMethodName`` before the outcome is signalled
    (presumably so the runner's report shows the message instead of the
    method name -- confirm against the grading harness).

    Args:
        test_case: Object exposing ``assertTrue`` and ``fail``, normally the
            running ``unittest.TestCase`` instance.
        condition: Truthy when the check passed.
        success_message: Message used when ``condition`` is truthy.
        failure_message: Message used when ``condition`` is falsy.
    """
    outcome_message = success_message if condition else failure_message
    test_case._testMethodName = outcome_message
    if condition:
        # Always-true assertion: records a pass that carries the message.
        test_case.assertTrue(True, outcome_message)
    else:
        test_case.fail(outcome_message)

class TestPipelineWithGridSearch(unittest.TestCase):
    """Checks a learner's ``user_code`` module for the pipeline challenge.

    Every test reduces its check to a single boolean and hands it to
    ``_dynamic_test`` together with a success and a failure message. A
    missing or malformed object in ``user_code`` is reported as a failed
    check rather than as an errored test.

    NOTE: the test methods are documented with ``#`` comments instead of
    docstrings on purpose -- unittest adds a test method's docstring to its
    description in verbose reports, which would change the output.
    """

    @classmethod
    def setUpClass(cls):
        """Load the penguins dataset and import the learner's module once."""
        cls.df = pd.read_csv(
            'https://codefinity-content-media.s3.eu-west-1.amazonaws.com/a65bbc96-309e-4df9-a790-a1eb8c815a1c/penguins.csv'
        )
        # Keep only the rows that have fewer than two missing values.
        cls.df = cls.df[cls.df.isna().sum(axis=1) < 2]
        import user_code
        # Expected names in user_code: label_enc, y, X_train, X_test,
        # y_train, y_test, ct, param_grid, grid_search, pipe
        cls.uc = user_code

    def test_target_encoded_with_labelencoder(self):
        # Passes when `label_enc` is a fitted LabelEncoder and `y` is a
        # one-dimensional integer array.
        from sklearn.preprocessing import LabelEncoder
        uc = self.uc
        encoder = getattr(uc, 'label_enc', None)
        cond = (
            isinstance(encoder, LabelEncoder)
            and hasattr(encoder, 'classes_')  # only present after fitting
            and hasattr(uc, 'y')
        )
        if cond:
            # Read `y` only once it is known to exist, so a missing target
            # is reported with the failure message below.
            y_arr = np.asarray(uc.y)
            cond = y_arr.ndim == 1 and np.issubdtype(y_arr.dtype, np.integer)
        _dynamic_test(
            self,
            cond,
            "Target is encoded with LabelEncoder to integer dtype",
            "Target must be encoded with LabelEncoder to integer dtype"
        )

    def test_train_test_split_33_percent(self):
        # Passes when X and y splits have matching totals and the test part
        # is within two rows of 33% of all rows.
        uc = self.uc
        cond = False
        if all(hasattr(uc, v) for v in ['X_train', 'X_test', 'y_train', 'y_test']):
            try:
                n_total = len(uc.X_train) + len(uc.X_test)
                expected_test = int(round(0.33 * n_total))
                cond = (
                    len(uc.y_train) + len(uc.y_test) == n_total
                    and abs(len(uc.X_test) - expected_test) <= 2
                )
            except TypeError:
                # An unsized value (e.g. None) is a failed check, not a
                # crashed test.
                cond = False
        _dynamic_test(
            self,
            cond,
            "Data split uses approximately 33% for the test set",
            "Data must be split with test_size=0.33"
        )

    def test_columntransformer_ohe_passthrough(self):
        # Passes when `ct` is a ColumnTransformer with a OneHotEncoder on
        # exactly {'island', 'sex'} and remainder='passthrough'.
        from sklearn.compose import ColumnTransformer
        from sklearn.preprocessing import OneHotEncoder
        uc = self.uc
        ct = getattr(uc, 'ct', None)
        cond_ct = isinstance(ct, ColumnTransformer)
        found_ohe = False
        if cond_ct:
            for _, trans, cols in ct.transformers:
                if not isinstance(trans, OneHotEncoder):
                    continue
                if isinstance(cols, str):
                    # A bare column name must not be split into characters.
                    cols_set = {cols}
                else:
                    try:
                        cols_set = set(cols)
                    except TypeError:
                        # Non-iterable or unhashable selector (int, slice,
                        # callable, ...): cannot be the expected name list.
                        continue
                if cols_set == {'island', 'sex'}:
                    found_ohe = True
                    break
        cond = cond_ct and found_ohe and getattr(ct, 'remainder', None) == 'passthrough'
        _dynamic_test(
            self,
            cond,
            "ColumnTransformer applies OneHotEncoder to ['island', 'sex'] with remainder='passthrough'",
            "ColumnTransformer must OneHotEncode ['island', 'sex'] and set remainder='passthrough'"
        )

    def test_param_grid_values(self):
        # Passes when `param_grid` is a dict whose 'n_neighbors' entry is a
        # non-empty collection of odd integers.
        uc = self.uc
        pg = getattr(uc, 'param_grid', None)

        cond = False
        if isinstance(pg, dict) and 'n_neighbors' in pg:
            try:
                n_neighbors_vals = list(pg['n_neighbors'])
            except TypeError:
                # A scalar such as `5` is not a collection of candidates.
                n_neighbors_vals = []
            # Non-empty, and every element is an odd integer. NumPy integers
            # count as integers; bool is excluded because True % 2 == 1
            # would otherwise slip through as an "odd number".
            cond = bool(n_neighbors_vals) and all(
                isinstance(x, (int, np.integer))
                and not isinstance(x, bool)
                and x % 2 != 0
                for x in n_neighbors_vals
            )

        _dynamic_test(
            self,
            cond,
            "param_grid defines odd values for n_neighbors, allowing experimentation",
            "param_grid must be a dict and include 'n_neighbors' with a list of odd integer values"
        )

    def test_gridsearchcv_with_knn(self):
        # Passes when `grid_search` is a GridSearchCV wrapping a
        # KNeighborsClassifier. The param_grid handed to GridSearchCV is not
        # inspected here.
        from sklearn.model_selection import GridSearchCV
        from sklearn.neighbors import KNeighborsClassifier
        uc = self.uc
        gs = getattr(uc, 'grid_search', None)
        cond = isinstance(gs, GridSearchCV) and isinstance(getattr(gs, 'estimator', None), KNeighborsClassifier)
        _dynamic_test(
            self,
            cond,
            "GridSearchCV is initialized with KNeighborsClassifier and given param_grid",
            "GridSearchCV must be initialized with KNeighborsClassifier and given param_grid"
        )

    def test_pipeline_structure_and_order(self):
        # Expected order: columntransformer -> simpleimputer -> standardscaler -> gridsearchcv
        uc = self.uc
        # A missing `pipe`, or one without `.steps`, yields no step names and
        # therefore a failed check instead of an AttributeError.
        steps = getattr(getattr(uc, 'pipe', None), 'steps', None) or []
        step_names = [name for name, _ in steps]
        expected = ['columntransformer', 'simpleimputer', 'standardscaler', 'gridsearchcv']
        cond = step_names == expected
        _dynamic_test(
            self,
            cond,
            "Pipeline steps are in order: ColumnTransformer, SimpleImputer, StandardScaler, GridSearchCV",
            "Pipeline steps must be: ColumnTransformer, SimpleImputer, StandardScaler, GridSearchCV"
        )

    def test_fitted_on_train_and_scores_on_test(self):
        # Passes when `pipe.score` on the test split returns a finite float.
        uc = self.uc
        try:
            score = uc.pipe.score(uc.X_test, uc.y_test)
            cond = isinstance(score, (float, np.floating)) and np.isfinite(score)
        except Exception:
            # Deliberately broad: any failure inside the learner's pipeline
            # (missing names, unfitted estimator, ...) is a failed check.
            cond = False
        _dynamic_test(
            self,
            cond,
            "Pipeline is fitted on train and computes a finite score on the test set",
            "Pipeline must be fitted on train and compute a finite score on the test set"
        )

    def test_predictions_returned_for_X_test(self):
        # Passes when `pipe.predict` returns one prediction per test row.
        uc = self.uc
        try:
            y_pred = uc.pipe.predict(uc.X_test)
            cond = isinstance(y_pred, (np.ndarray, list)) and len(y_pred) == len(uc.X_test)
        except Exception:
            # Deliberately broad: see test_fitted_on_train_and_scores_on_test.
            cond = False
        _dynamic_test(
            self,
            cond,
            "Pipeline returns predictions for X_test",
            "Pipeline must return predictions for X_test"
        )

    def test_best_estimator_available(self):
        # Passes when `grid_search.best_estimator_` can be read and is not None.
        uc = self.uc
        try:
            be = uc.grid_search.best_estimator_
            cond = be is not None
        except Exception:
            # Deliberately broad: a missing `grid_search` or an unfitted
            # search is a failed check.
            cond = False
        _dynamic_test(
            self,
            cond,
            "Best estimator is available via grid_search.best_estimator_",
            "Best estimator must be available via grid_search.best_estimator_"
        )

# Run the test suite only when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()

test_code.py

El aprendizaje automático se utiliza actualmente en todas partes. ¿Quieres aprenderlo por ti mismo? Este curso es una introducción al mundo del aprendizaje automático para que puedas aprender los conceptos básicos, trabajar con Scikit-learn – la biblioteca más popular para ML – y construir tu primer proyecto de aprendizaje automático.
Este curso está dirigido a estudiantes con conocimientos básicos de Python, Pandas y NumPy.

Aprende los conceptos de Machine Learning y el flujo de trabajo de un proyecto de ML.

El preprocesamiento es probablemente la etapa más importante de un proyecto de ML. Este capítulo abarca los pasos de preprocesamiento necesarios para casi cualquier conjunto de datos.

Una tubería es una forma ordenada de combinar todos los pasos de preprocesamiento junto con un modelo. Las tuberías facilitan considerablemente el entrenamiento y uso de un modelo.

El modelado es la etapa más divertida de un proyecto de ML. Aprendamos a construir, ajustar y evaluar el modelo.

Desafío: Integrándolo Todo

Solución