Mean imputation is a straightforward technique for handling missing values in numerical data. You replace each missing value in a column with the mean of the non-missing values from that same column. This method is most appropriate when the values are missing completely at random (that is, whether a value is missing does not depend on the data itself) and the distribution of values is not heavily skewed. However, mean imputation distorts your data: because every gap is filled with the same number, it shrinks the column's variance and weakens its relationships with other columns, especially if many values are missing or if the data is not normally distributed. It is important to consider these limitations before choosing mean imputation for your data cleaning workflow.

import pandas as pd
import numpy as np

# Toy dataset for the lesson: five rows, two of which have no score recorded.
data = dict(
    id=[1, 2, 3, 4, 5],
    score=[85, np.nan, 78, np.nan, 92],
)
df = pd.DataFrame(data)

# Show the frame so the NaN entries in "score" are visible before imputation.
print(df)


import unittest
import user_code
import ast
import re   
import unittest
import importlib
import pandas as pd
import numpy as np

class TestTask(unittest.TestCase):
    """Checks for ``user_code.impute_with_mean`` applied to the "score" column.

    Every test reloads ``user_code`` and runs it on a copy of the fixture, then
    reports the outcome through ``_dynamic_test``.
    """

    def setUp(self):
        # Fresh fixture per test: two of the five scores are missing.
        self.data = {
            "id": [1, 2, 3, 4, 5],
            "score": [85, np.nan, 78, np.nan, 92]
        }
        self.df = pd.DataFrame(self.data)
        # Mean of the three observed scores only.
        self.expected_mean = np.mean([85, 78, 92])

    def _impute_scores(self):
        """Reload ``user_code`` and return its result for a copy of the fixture."""
        import user_code
        importlib.reload(user_code)
        return user_code.impute_with_mean(self.df.copy(), "score")

    # NOTE: the test methods are described with comments rather than docstrings
    # because unittest prints a test's docstring in its verbose output.

    def test_missing_values_replaced_with_mean(self):
        # Rows that were NaN in the fixture must now hold the expected mean.
        df_imputed = self._impute_scores()
        was_missing = self.df["score"].isna()
        replaced_values = df_imputed.loc[was_missing, "score"].values
        _dynamic_test(
            self,
            np.allclose(replaced_values, self.expected_mean),
            "All missing values are replaced with the mean",
            f"Expected all missing values to be {self.expected_mean}, got {replaced_values}"
        )

    def test_non_missing_values_unchanged(self):
        # Scores that were present in the fixture must come back untouched.
        df_imputed = self._impute_scores()
        was_present = self.df["score"].notna()
        original = self.df.loc[was_present, "score"].values
        after = df_imputed.loc[was_present, "score"].values
        _dynamic_test(
            self,
            np.allclose(original, after),
            "Non-missing values remain unchanged",
            f"Non-missing values changed: original {original}, after {after}"
        )

    def test_other_columns_unchanged(self):
        # Imputing "score" must not alter the "id" column.
        df_imputed = self._impute_scores()
        ids_match = (df_imputed["id"] == self.df["id"]).all()
        _dynamic_test(
            self,
            ids_match,
            "Other columns remain unchanged",
            f"Column 'id' changed: expected {self.df['id'].tolist()}, got {df_imputed['id'].tolist()}"
        )

    def test_shape_unchanged(self):
        # No rows or columns may be added or dropped.
        df_imputed = self._impute_scores()
        same_shape = df_imputed.shape == self.df.shape
        _dynamic_test(
            self,
            same_shape,
            "DataFrame shape remains unchanged",
            f"Expected shape {self.df.shape}, got {df_imputed.shape}"
        )

    def test_no_missing_after_imputation(self):
        # The imputed column must contain no NaN at all.
        df_imputed = self._impute_scores()
        fully_filled = not df_imputed["score"].isna().any()
        _dynamic_test(
            self,
            fully_filled,
            "No missing values remain in the column after imputation",
            "There are still missing values after imputation."
        )

def _dynamic_test(test_case, condition, success_message, failure_message):
    """Report ``condition`` on ``test_case`` using the matching message.

    The chosen message is written to ``test_case._testMethodName`` before the
    outcome is recorded: ``assertTrue(True, ...)`` when ``condition`` is truthy,
    ``fail(...)`` otherwise.
    """
    message = success_message if condition else failure_message
    test_case._testMethodName = message
    if condition:
        test_case.assertTrue(True, message)
    else:
        test_case.fail(message)

def normalize_text(text):
    """Return ``text`` lower-cased with whitespace and punctuation spacing tidied.

    Runs of two or more whitespace characters become a single space, and each
    ``,``, ``:`` or ``?`` loses any surrounding whitespace and is followed by
    exactly one space. Leading and trailing whitespace is stripped last.
    """
    lowered = text.lower()
    collapsed = re.sub(r"\s{2,}", " ", lowered)
    respaced = re.sub(r"\s*([,:?])\s*", r"\1 ", collapsed)
    return respaced.strip()

def change_var(code: str, var_name: str, value: str) -> str:
    """Rewrite the first top-level assignment to ``var_name`` as ``var_name = value``.

    Only module-level ``ast.Assign`` statements with a plain-name target are
    considered. The whole statement, including any continuation lines of a
    multi-line right-hand side, is replaced by one line. Comments, blank lines
    and decorators that follow the assignment are left in place.

    Args:
        code: Python source to edit; it must parse with ``ast.parse``.
        var_name: Name of the variable whose assignment is replaced.
        value: Source text placed on the right-hand side, inserted verbatim.

    Returns:
        The edited source joined with newlines, or ``code`` unchanged when no
        matching assignment exists. Because the source is split with
        ``splitlines()``, a trailing newline is not kept in the edited result.

    Raises:
        SyntaxError: If ``code`` is not valid Python.
    """
    tree = ast.parse(code)
    lines = code.splitlines()
    for i, node in enumerate(tree.body):
        if not isinstance(node, ast.Assign):
            continue
        if not any(
            isinstance(target, ast.Name) and target.id == var_name
            for target in node.targets
        ):
            continue

        start_line = node.lineno - 1
        line = lines[start_line]
        indent = ' ' * (len(line) - len(line.lstrip()))

        # Prefer the statement's own end line so that only the assignment is
        # replaced; scanning to the next statement would also swallow the
        # comments, blank lines and decorators in between.
        end_line = getattr(node, "end_lineno", None)
        if end_line is None:
            # Interpreters without end_lineno: fall back to "everything up to
            # the next top-level statement".
            following = tree.body[i + 1:]
            end_line = following[0].lineno - 1 if following else len(lines)
        # Always replace at least the assignment's first line.
        end_line = max(end_line, start_line + 1)

        lines[start_line:end_line] = [f"{indent}{var_name} = {value}"]
        # Join with a real newline; a literal backslash-n would corrupt the source.
        return '\n'.join(lines)
    return code

# Run the unittest runner only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    unittest.main()


test_main.py

Master essential data cleaning techniques in Python using powerful libraries and practical tasks. This course is designed for learners with intermediate Python skills who want to efficiently prepare and clean data for analysis and machine learning.

Explore the core concepts of data cleaning, why it matters, and the essential tools and techniques in Python.

Delve deeper into techniques for managing missing and duplicate data using pandas and numpy.

Focus on techniques for making data consistent, correcting errors, and detecting outliers.

Challenge: Impute Missing Values with Mean

Solution