Ensuring that your data contains only **unique records** is crucial for accurate analysis. Duplicate rows can distort statistics, lead to misleading results, and undermine the reliability of your conclusions. By removing duplicates, you help guarantee that every observation is counted just once, maintaining the integrity of your dataset.

import pandas as pd

# Sample records for the lesson: rows 0 and 2 are identical, as are rows 1 and 4.
data = dict(
    Name=["Alice", "Bob", "Alice", "Charlie", "Bob"],
    Age=[25, 30, 25, 35, 30],
    City=["New York", "Paris", "New York", "London", "Paris"],
)

# Each dictionary key becomes a column; the lists supply the row values.
df = pd.DataFrame.from_dict(data)
print(df)


import unittest
import user_code
import ast
import re   
import unittest
import pandas as pd
import importlib
import io
from contextlib import redirect_stdout

class TestTask(unittest.TestCase):
    """Checks the learner's ``remove_duplicates`` function from ``user_code``.

    Every test passes a small DataFrame to the function and reports the
    outcome through ``_dynamic_test``.
    """

    def setUp(self):
        """Reload ``user_code`` and look up ``remove_duplicates`` before each test."""
        import user_code
        importlib.reload(user_code)
        self.func = getattr(user_code, 'remove_duplicates', None)
        # Without this guard a missing function surfaces in every test as an
        # opaque "'NoneType' object is not callable" error.
        if not callable(self.func):
            self.fail("Function 'remove_duplicates' was not found in user_code.")

    @staticmethod
    def _keeps_schema(result, source):
        """Return True when *result* is a DataFrame with *source*'s columns and dtypes."""
        # The dtype comparison only runs once the column labels are known to
        # match, because comparing differently-labelled Series raises.
        return (
            isinstance(result, pd.DataFrame)
            and list(result.columns) == list(source.columns)
            and all(result.dtypes == source.dtypes)
        )

    def test_removes_duplicate_rows(self):
        """Repeated rows are dropped and the first occurrence of each is kept."""
        data = {
            "Name": ["Alice", "Bob", "Alice", "Charlie", "Bob"],
            "Age": [25, 30, 25, 35, 30],
            "City": ["New York", "Paris", "New York", "London", "Paris"]
        }
        df = pd.DataFrame(data)
        result = self.func(df)
        expected = pd.DataFrame({
            "Name": ["Alice", "Bob", "Charlie"],
            "Age": [25, 30, 35],
            "City": ["New York", "Paris", "London"]
        }, index=[0, 1, 3])
        # Indexes are reset on both sides, so the learner may or may not
        # renumber the surviving rows.
        cond = (
            self._keeps_schema(result, df)
            and result.reset_index(drop=True).equals(expected.reset_index(drop=True))
        )
        _dynamic_test(
            self,
            cond,
            "Duplicate rows are removed, only first occurrence is kept, column order and dtypes preserved.",
            f"Expected DataFrame:\n{expected}\nGot:\n{result}"
        )

    def test_no_duplicates(self):
        """A DataFrame without duplicates comes back unchanged."""
        data = {
            "A": [1, 2, 3],
            "B": ["x", "y", "z"]
        }
        df = pd.DataFrame(data)
        result = self.func(df)
        cond = self._keeps_schema(result, df) and result.equals(df)
        _dynamic_test(
            self,
            cond,
            "No change for DataFrame with no duplicates.",
            f"Expected unchanged DataFrame, got:\n{result}"
        )

    def test_all_rows_duplicated(self):
        """When every row is identical, exactly one row remains."""
        data = {
            "A": [1, 1, 1],
            "B": [2, 2, 2]
        }
        df = pd.DataFrame(data)
        result = self.func(df)
        expected = pd.DataFrame({"A": [1], "B": [2]})
        cond = (
            self._keeps_schema(result, df)
            and result.reset_index(drop=True).equals(expected.reset_index(drop=True))
        )
        _dynamic_test(
            self,
            cond,
            "All duplicate rows removed, only first kept.",
            f"Expected DataFrame:\n{expected}\nGot:\n{result}"
        )

    def test_preserves_column_order_and_types(self):
        """Column order and dtypes survive duplicate removal."""
        data = {
            "first": [1, 2, 1],
            "second": [3.0, 4.5, 3.0],
            "third": [True, False, True]
        }
        df = pd.DataFrame(data)
        result = self.func(df)
        cond = self._keeps_schema(result, df)
        # Build the failure text defensively: reading .columns/.dtypes on a
        # non-DataFrame result would raise before the check is even reported.
        if isinstance(result, pd.DataFrame):
            failure = (
                "Column order or types changed. "
                f"Columns: {list(result.columns)}, Types: {result.dtypes}"
            )
        else:
            failure = f"Expected a DataFrame, got: {result!r}"
        _dynamic_test(
            self,
            cond,
            "Column order and types are preserved after duplicate removal.",
            failure
        )

def _dynamic_test(test_case, condition, success_message, failure_message):
    """Report a check on *test_case*, renaming the test to the outcome message.

    The test's ``_testMethodName`` is overwritten with *success_message* when
    *condition* is truthy, otherwise with *failure_message*; in the latter
    case the test is then failed with that same message.
    """
    passed = bool(condition)
    test_case._testMethodName = success_message if passed else failure_message
    if not passed:
        test_case.fail(failure_message)
    test_case.assertTrue(True, success_message)

def normalize_text(text):
    """Lower-case *text* and tidy its whitespace and punctuation spacing.

    Runs of two or more whitespace characters collapse to one space, then any
    whitespace around ``,``, ``:`` or ``?`` is replaced so that the mark is
    followed by a single space. The result is stripped at both ends.
    """
    collapsed = re.sub(r"\s{2,}", " ", text.lower())
    respaced = re.sub(r"\s*([,:?])\s*", r"\1 ", collapsed)
    return respaced.strip()

def change_var(code: str, var_name: str, value: str) -> str:
    """Return *code* with the first top-level assignment to *var_name* rewritten.

    The assignment statement (including any continuation lines of a
    multi-line value) is replaced by the single line ``var_name = value``.
    Only plain assignments in the module body are considered; if none targets
    *var_name*, *code* is returned unchanged.

    Args:
        code: Python source text to edit.
        var_name: Name of the variable whose assignment should be replaced.
        value: Source text placed on the right-hand side of the new assignment.

    Returns:
        The edited source, with lines joined by newline characters.

    Raises:
        SyntaxError: If *code* cannot be parsed.
    """
    tree = ast.parse(code)
    lines = code.splitlines()
    for i, node in enumerate(tree.body):
        if not isinstance(node, ast.Assign):
            continue
        if not any(
            isinstance(target, ast.Name) and target.id == var_name
            for target in node.targets
        ):
            continue

        start_line = node.lineno - 1
        line = lines[start_line]
        indent = ' ' * (len(line) - len(line.lstrip()))

        # end_lineno marks exactly where the statement stops, so decorators,
        # comments and blank lines after it are left alone.
        end_line = getattr(node, 'end_lineno', None)
        if end_line is None:
            # Interpreters without end_lineno: fall back to trimming up to
            # the start of the next top-level statement.
            end_line = len(lines)
            for next_node in tree.body[i + 1:]:
                if hasattr(next_node, 'lineno'):
                    end_line = next_node.lineno - 1
                    break
        # Always replace at least the assignment's own first line.
        end_line = max(end_line, start_line + 1)

        lines[start_line:end_line] = [f"{indent}{var_name} = {value}"]
        return '\n'.join(lines)
    return code

# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()


test_main.py

Master essential data cleaning techniques in Python using powerful libraries and practical tasks. This course is designed for learners with intermediate Python skills who want to efficiently prepare and clean data for analysis and machine learning.

Explore the core concepts of data cleaning, why it matters, and the essential tools and techniques in Python.

Delve deeper into techniques for managing missing and duplicate data using pandas and numpy.

Focus on techniques for making data consistent, correcting errors, and detecting outliers.

Challenge: Remove Duplicate Rows

Solution