When working with real-world datasets, you often encounter missing values represented as `NaN` (not a number). Deciding when to drop rows with missing data depends on the context and the importance of the missing information. Dropping rows is appropriate when the dataset is large enough that removing some rows will not significantly impact your analysis, or when the missing data is scattered randomly and does not represent a systematic issue. However, this approach can lead to loss of valuable information, especially if missing values are concentrated in a particular group or if the dataset is small. Always consider whether dropping rows could introduce bias or reduce the representativeness of your data.

import pandas as pd
import numpy as np

# Sample records: "age" and "city" each contain one missing value (NaN),
# so the printed frame shows what incomplete rows look like.
names = ["Alice", "Bob", "Charlie", "David"]
ages = [25, np.nan, 30, 22]
cities = ["New York", "Los Angeles", np.nan, "Chicago"]

data = dict(name=names, age=ages, city=cities)

df = pd.DataFrame(data)
print(df)


import unittest
import user_code
import ast
import re   
import unittest
import pandas as pd
import numpy as np
import importlib

class TestTask(unittest.TestCase):
    """Tests for the learner-supplied ``user_code.drop_missing_rows``.

    Each test reloads ``user_code`` before calling it and reports its
    outcome through the module-level ``_dynamic_test`` helper.
    """

    @staticmethod
    def _load_user_code():
        """Import ``user_code`` and reload it so the latest source is tested."""
        import user_code
        return importlib.reload(user_code)

    def test_remove_missing_rows(self):
        """Rows containing any NaN are removed; complete rows are kept."""
        user_code = self._load_user_code()
        data = {
            "name": ["Alice", "Bob", "Charlie", "David"],
            "age": [25, np.nan, 30, 22],
            "city": ["New York", "Los Angeles", np.nan, "Chicago"]
        }
        df = pd.DataFrame(data)
        result = user_code.drop_missing_rows(df)
        expected = pd.DataFrame({
            "name": ["Alice", "David"],
            "age": [25.0, 22.0],
            "city": ["New York", "Chicago"]
        }, index=[0, 3])
        # Both sides are re-indexed, so the result's index labels do not
        # influence the comparison.
        cond = (
            isinstance(result, pd.DataFrame) and
            result.reset_index(drop=True).equals(expected.reset_index(drop=True))
        )
        _dynamic_test(
            self,
            cond,
            "Correctly removes rows with missing values.",
            f"Expected DataFrame with rows 0 and 3 only, got:\n{result}"
        )

    def test_original_not_modified(self):
        """The input DataFrame is left untouched by the call."""
        user_code = self._load_user_code()
        data = {
            "name": ["Alice", "Bob", "Charlie", "David"],
            "age": [25, np.nan, 30, 22],
            "city": ["New York", "Los Angeles", np.nan, "Chicago"]
        }
        df = pd.DataFrame(data)
        df_copy = df.copy(deep=True)
        _ = user_code.drop_missing_rows(df)
        cond = df.equals(df_copy)
        _dynamic_test(
            self,
            cond,
            "Original DataFrame is not modified.",
            "Original DataFrame was modified."
        )

    def test_columns_unchanged(self):
        """The result keeps the same columns, in the same order, as the input."""
        user_code = self._load_user_code()
        data = {
            "col1": [1, np.nan],
            "col2": [2, 3]
        }
        df = pd.DataFrame(data)
        result = user_code.drop_missing_rows(df)
        # Guard against non-DataFrame results (e.g. None) so the test fails
        # with the intended message instead of raising AttributeError.
        is_frame = isinstance(result, pd.DataFrame)
        got_columns = list(result.columns) if is_frame else result
        cond = is_frame and got_columns == list(df.columns)
        _dynamic_test(
            self,
            cond,
            "Returned DataFrame has the same columns as input.",
            f"Expected columns {list(df.columns)}, got {got_columns}."
        )

    def test_no_missing_values(self):
        """A DataFrame without NaN values comes back unchanged."""
        user_code = self._load_user_code()
        data = {
            "A": [1, 2, 3],
            "B": [4, 5, 6]
        }
        df = pd.DataFrame(data)
        result = user_code.drop_missing_rows(df)
        # isinstance guard: a None/other return becomes a clean failure.
        cond = isinstance(result, pd.DataFrame) and result.equals(df)
        _dynamic_test(
            self,
            cond,
            "Returns identical DataFrame if there are no missing values.",
            f"Expected identical DataFrame, got:\n{result}"
        )

    def test_all_rows_missing(self):
        """When every row has a NaN, the result is empty but keeps the columns."""
        user_code = self._load_user_code()
        data = {
            "A": [np.nan, np.nan],
            "B": [np.nan, np.nan]
        }
        df = pd.DataFrame(data)
        result = user_code.drop_missing_rows(df)
        cond = (
            isinstance(result, pd.DataFrame) and
            result.empty and
            list(result.columns) == list(df.columns)
        )
        _dynamic_test(
            self,
            cond,
            "Returns empty DataFrame with same columns when all rows contain missing values.",
            f"Expected empty DataFrame with columns {list(df.columns)}, got:\n{result}"
        )

def _dynamic_test(test_case, condition, success_message, failure_message):
    """Report a check's outcome on ``test_case`` with a descriptive message.

    The test case's ``_testMethodName`` is overwritten with the message that
    matches the outcome. A truthy ``condition`` then records a passing
    assertion; a falsy one fails the test with ``failure_message``.
    """
    passed = bool(condition)
    test_case._testMethodName = success_message if passed else failure_message
    if not passed:
        test_case.fail(failure_message)
    else:
        test_case.assertTrue(True, success_message)

def normalize_text(text):
    """Return ``text`` lower-cased with whitespace and punctuation spacing normalized.

    Runs of two or more whitespace characters collapse to one space, each
    ``,``, ``:`` or ``?`` loses surrounding whitespace and gains exactly one
    trailing space, and the result is stripped at both ends.
    """
    # Order matters: whitespace runs are collapsed before punctuation spacing.
    substitutions = (
        (r"\s{2,}", " "),
        (r"\s*([,:?])\s*", r"\1 "),
    )
    normalized = text.lower()
    for pattern, replacement in substitutions:
        normalized = re.sub(pattern, replacement, normalized)
    return normalized.strip()

def change_var(code: str, var_name: str, value: str) -> str:
    """Rewrite the first top-level assignment to ``var_name`` in ``code``.

    Args:
        code: Python source text; it must parse with ``ast.parse``.
        var_name: Name of the variable whose assignment is replaced.
        value: Source text placed on the right-hand side of the new
            ``var_name = value`` statement.

    Returns:
        The source with the whole assignment statement (including any
        continuation lines) replaced by a single line, joined with real
        newlines. ``code`` is returned unchanged when no top-level
        assignment to ``var_name`` exists.

    Raises:
        SyntaxError: If ``code`` is not valid Python.
    """
    tree = ast.parse(code)
    lines = code.splitlines()
    for node in tree.body:
        if not isinstance(node, ast.Assign):
            continue
        assigns_target = any(
            isinstance(target, ast.Name) and target.id == var_name
            for target in node.targets
        )
        if not assigns_target:
            continue
        start_line = node.lineno - 1
        # end_lineno (Python 3.8+) marks the statement's own last line, so
        # only the assignment is replaced; blank lines, comments and
        # decorators that follow it are preserved.
        end_line = node.end_lineno
        line = lines[start_line]
        indent = ' ' * (len(line) - len(line.lstrip()))
        lines[start_line:end_line] = [f"{indent}{var_name} = {value}"]
        # Join with a real newline; an escaped "\\n" would emit a literal
        # backslash-n and corrupt the returned source.
        return '\n'.join(lines)
    return code

# Run the unittest suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()


test_main.py

Master essential data cleaning techniques in Python using powerful libraries and practical tasks. This course is designed for learners with intermediate Python skills who want to efficiently prepare and clean data for analysis and machine learning.

Explore the core concepts of data cleaning, why it matters, and the essential tools and techniques in Python.

Delve deeper into techniques for managing missing and duplicate data using pandas and numpy.

Focus on techniques for making data consistent, correcting errors, and detecting outliers.

Challenge: Drop Rows with Missing Data

Solution