Outliers can significantly impact the quality of your data analysis, especially when they arise from errors or rare events that do not reflect typical patterns. When you want to reduce the influence of extreme values while keeping all your data points, replacing outliers with the **median** of the column is a robust technique. The **median** is resistant to the effect of outliers, so it provides a stable replacement value that maintains the overall distribution of your data. This approach is especially useful when you want to avoid losing data by removing rows, and when the **mean** would be skewed by the very outliers you are trying to address.

import pandas as pd

# Small sample frame; the score of 300 sits far outside the rest of the column
data = dict(
    name=["Alice", "Bob", "Charlie", "David", "Eve"],
    score=[85, 90, 300, 88, 92],
)
df = pd.DataFrame(data)

# Stand-in for a real detection step (such as the IQR method): here every
# score above a fixed threshold of 150 is simply flagged as an outlier.
outlier_mask = df["score"].gt(150)

# Show the input data followed by the boolean mask, one heading per section
print("Original DataFrame:", df, sep="\n")
print("\nOutlier mask:", outlier_mask, sep="\n")



import unittest
import user_code
import ast
import re   
import unittest
import importlib
import pandas as pd

class TestTask(unittest.TestCase):
    """Tests for ``user_code.replace_outliers_with_median``.

    Each test calls the function on a fresh copy of a five-row fixture whose
    ``score`` column contains one outlier (300, at index 2) and reports the
    outcome through ``_dynamic_test``.
    """

    def setUp(self):
        """Build the fixture frame, its outlier mask and an untouched reference copy."""
        self.data = {
            "name": ["Alice", "Bob", "Charlie", "David", "Eve"],
            "score": [85, 90, 300, 88, 92]
        }
        self.df = pd.DataFrame(self.data)
        self.outlier_mask = self.df["score"] > 150
        # Never handed to the function under test; used as the "before" state.
        self.df_original = pd.DataFrame(self.data)

    def _run_user_function(self):
        """Call the function under test on copies of the fixture.

        Returns:
            The DataFrame object that was passed to the function. Its return
            value is ignored, so only changes made in place are visible to
            the tests.
        """
        import user_code
        # NOTE(review): the module is reloaded before every call, presumably
        # so each test runs against a fresh import of user_code -- confirm.
        importlib.reload(user_code)
        df = self.df.copy()
        outlier_mask = self.outlier_mask.copy()
        user_code.replace_outliers_with_median(df, "score", outlier_mask)
        return df

    def test_replaces_outliers_with_median(self):
        """The outlier must become the median of the non-outlier scores."""
        df = self._run_user_function()
        # The expected median is computed with the outlier rows excluded.
        median = self.df_original.loc[~self.outlier_mask, "score"].median()
        # Outlier index is 2 (Charlie)
        replaced = df.loc[2, "score"] == median
        _dynamic_test(
            self,
            replaced,
            "Outlier value is replaced with median correctly.",
            f"Expected outlier at index 2 to be replaced with median {median}, got {df.loc[2, 'score']}"
        )

    def test_non_outliers_not_modified(self):
        """Rows that are not flagged by the mask must keep their scores."""
        df = self._run_user_function()
        # Non-outlier indices: 0,1,3,4
        for idx in [0, 1, 3, 4]:
            unchanged = df.loc[idx, "score"] == self.df_original.loc[idx, "score"]
            _dynamic_test(
                self,
                unchanged,
                f"Non-outlier at index {idx} remains unchanged.",
                f"Non-outlier at index {idx} was changed: expected {self.df_original.loc[idx, 'score']}, got {df.loc[idx, 'score']}"
            )

    def test_inplace_modification(self):
        """The DataFrame object passed to the function must itself be changed."""
        df = self._run_user_function()
        # Comparing id(df) before and after the call cannot detect this: the
        # local name stays bound to the same object whatever the callee does.
        # Instead, check that the flagged values in the passed-in frame no
        # longer equal the original ones.
        before = self.df_original.loc[self.outlier_mask, "score"]
        after = df.loc[self.outlier_mask, "score"]
        modified_in_place = bool((after != before).all())
        _dynamic_test(
            self,
            modified_in_place,
            "DataFrame is modified in place.",
            "DataFrame was not modified in place."
        )

def _dynamic_test(test_case, condition, success_message, failure_message):
    if condition:
        test_case._testMethodName = success_message
        test_case.assertTrue(True, success_message)
    else:
        test_case._testMethodName = failure_message
        test_case.fail(failure_message)

def normalize_text(text):
    """Return ``text`` lower-cased with whitespace and punctuation spacing normalized.

    Runs of two or more whitespace characters collapse to a single space,
    then each ``,``, ``:`` or ``?`` loses any surrounding whitespace and is
    followed by exactly one space. Leading and trailing whitespace is
    stripped from the result.
    """
    substitutions = (
        (r"\s{2,}", " "),
        (r"\s*([,:?])\s*", r"\1 "),
    )
    result = text.lower()
    # Order matters: whitespace runs are collapsed before punctuation spacing.
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result.strip()

def change_var(code: str, var_name: str, value: str) -> str:
    """Rewrite the first top-level assignment to ``var_name`` in ``code``.

    Only plain assignment statements (``ast.Assign``) directly in the module
    body are considered. The first line of the matching statement is replaced
    by ``var_name = value``, and every line after it up to the line where the
    next top-level statement starts is dropped. That removes the continuation
    lines of a multi-line value, along with any blank lines or comments in
    between.

    Args:
        code: Python source to edit; it must be parseable by ``ast.parse``.
        var_name: Name of the variable whose assignment is replaced.
        value: Source text placed on the right-hand side of the assignment.

    Returns:
        The edited source with lines joined by newlines, or ``code``
        unchanged when no matching assignment exists.

    Raises:
        SyntaxError: If ``code`` is not valid Python.
    """
    tree = ast.parse(code)
    lines = code.splitlines()
    for i, node in enumerate(tree.body):
        if not isinstance(node, ast.Assign):
            continue
        if not any(
            isinstance(target, ast.Name) and target.id == var_name
            for target in node.targets
        ):
            continue

        start_line = node.lineno - 1
        line = lines[start_line]
        indent = ' ' * (len(line) - len(line.lstrip()))
        lines[start_line] = f"{indent}{var_name} = {value}"

        # The old statement ends where the next top-level statement begins,
        # or at the end of the file when it is the last one.
        next_line = next(
            (
                following.lineno - 1
                for following in tree.body[i + 1:]
                if hasattr(following, 'lineno')
            ),
            len(lines),
        )
        # Deleting an empty (or inverted) slice is a no-op, so single-line
        # assignments need no special case.
        del lines[start_line + 1:next_line]

        # Join with a real newline; a literal backslash-n would collapse the
        # whole program into one invalid line.
        return '\n'.join(lines)
    return code

# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()


test_main.py

Master essential data cleaning techniques in Python using powerful libraries and practical tasks. This course is designed for learners with intermediate Python skills who want to efficiently prepare and clean data for analysis and machine learning.

Explore the core concepts of data cleaning, why it matters, and the essential tools and techniques in Python.

Delve deeper into techniques for managing missing and duplicate data using pandas and numpy.

Focus on techniques for making data consistent, correcting errors, and detecting outliers.

Challenge: Replace Outliers with Median

Solution