Duplicate data occurs when the same row appears more than once in a dataset. These duplicate entries can skew your analysis by overrepresenting certain values, leading to inaccurate statistics, misleading trends, and unreliable results. Detecting and quantifying duplicate rows is a fundamental part of data cleaning, as it helps you understand the extent of the problem and informs your next steps—such as removing or consolidating these duplicates.

import pandas as pd

# Sample records, one tuple per row; several rows repeat on purpose so the
# resulting DataFrame contains duplicate entries.
_COLUMNS = ("Name", "Age", "City")
_ROWS = [
    ("Alice", 25, "NY"),
    ("Bob", 30, "LA"),
    ("Alice", 25, "NY"),
    ("Charlie", 35, "SF"),
    ("Bob", 30, "LA"),
    ("Alice", 25, "NY"),
]

# Transpose the rows into the column -> values mapping that pandas expects.
data = {
    column: list(values)
    for column, values in zip(_COLUMNS, zip(*_ROWS))
}
df = pd.DataFrame(data)
print(df)


import unittest
import user_code
import ast
import re   
import unittest
import pandas as pd
import importlib
import io
from contextlib import redirect_stdout

class TestTask(unittest.TestCase):
    """Exercise ``user_code.count_duplicates`` on several DataFrames."""

    @staticmethod
    def _fresh_count(frame):
        """Reload ``user_code`` and return ``count_duplicates(frame)``."""
        import user_code
        importlib.reload(user_code)
        return user_code.count_duplicates(frame)

    # NOTE: the test methods use comments rather than docstrings on purpose,
    # because unittest shows a test's docstring in its verbose output.

    def test_multiple_duplicates(self):
        # Two distinct rows repeat (one of them twice): 3 expected.
        frame = pd.DataFrame({
            "Name": ["Alice", "Bob", "Alice", "Charlie", "Bob", "Alice"],
            "Age": [25, 30, 25, 35, 30, 25],
            "City": ["NY", "LA", "NY", "SF", "LA", "NY"]
        })
        result = self._fresh_count(frame)
        _dynamic_test(
            self,
            result == 3,
            "Correctly returns 3 duplicate rows",
            f"Expected 3, got {result} for duplicate rows",
        )

    def test_no_duplicates(self):
        # Every row is unique: 0 expected.
        frame = pd.DataFrame({
            "Name": ["Alice", "Bob", "Charlie"],
            "Age": [25, 30, 35],
            "City": ["NY", "LA", "SF"]
        })
        result = self._fresh_count(frame)
        _dynamic_test(
            self,
            result == 0,
            "Correctly returns 0 when no duplicate rows",
            f"Expected 0, got {result} for duplicate rows",
        )

    def test_all_duplicates(self):
        # Five identical rows: every row after the first counts, so 4 expected.
        frame = pd.DataFrame({
            "Name": ["Alice"] * 5,
            "Age": [25] * 5,
            "City": ["NY"] * 5
        })
        result = self._fresh_count(frame)
        _dynamic_test(
            self,
            result == 4,
            "Correctly returns 4 when all rows are duplicates except the first",
            f"Expected 4, got {result} for duplicate rows",
        )

    def test_empty_dataframe(self):
        # Two columns and no rows: 0 expected.
        frame = pd.DataFrame({"A": [], "B": []})
        result = self._fresh_count(frame)
        _dynamic_test(
            self,
            result == 0,
            "Correctly returns 0 for empty DataFrame",
            f"Expected 0, got {result} for empty DataFrame",
        )

def _dynamic_test(test_case, condition, success_message, failure_message):
    """Report a check on ``test_case`` under a message-based test name.

    The test's ``_testMethodName`` is overwritten with ``success_message``
    when ``condition`` is truthy and with ``failure_message`` otherwise. The
    test then passes via ``assertTrue`` or fails via ``fail`` with that same
    message.
    """
    passed = bool(condition)
    label = success_message if passed else failure_message
    test_case._testMethodName = label
    if passed:
        test_case.assertTrue(True, label)
    else:
        test_case.fail(label)

def normalize_text(text):
    """Return ``text`` lowercased with canonical spacing.

    Runs of two or more whitespace characters collapse to one space. Each
    comma, colon or question mark then loses any whitespace around it and is
    followed by exactly one space. Finally, leading and trailing whitespace
    is stripped.
    """
    substitutions = (
        (r"\s{2,}", " "),
        (r"\s*([,:?])\s*", r"\1 "),
    )
    normalized = text.lower()
    for pattern, replacement in substitutions:
        normalized = re.sub(pattern, replacement, normalized)
    return normalized.strip()

def change_var(code: str, var_name: str, value: str) -> str:
    """Replace the first top-level assignment to ``var_name`` in ``code``.

    Only statements directly in the module body are considered, and only
    plain assignments (``ast.Assign``) with a bare-name target equal to
    ``var_name``.

    Args:
        code: Python source text; it must parse with ``ast.parse``.
        var_name: Name of the variable whose assignment is rewritten.
        value: Source text of the new right-hand side, inserted verbatim.

    Returns:
        The source with the matching assignment rewritten as
        ``var_name = value`` on a single line, joined with newlines. Every
        line between that assignment's first line and the next top-level
        statement is dropped (continuation lines of a multi-line value, and
        any blank lines or comments in between). If no matching assignment
        exists, ``code`` is returned unchanged.

    Raises:
        SyntaxError: If ``code`` cannot be parsed.
    """
    tree = ast.parse(code)
    lines = code.splitlines()
    for i, node in enumerate(tree.body):
        if not isinstance(node, ast.Assign):
            continue
        if not any(
            isinstance(target, ast.Name) and target.id == var_name
            for target in node.targets
        ):
            continue

        start_line = node.lineno - 1  # ast line numbers are 1-based
        line = lines[start_line]
        indent = ' ' * (len(line) - len(line.lstrip()))
        lines[start_line] = f"{indent}{var_name} = {value}"

        # The old statement ends where the next top-level statement starts,
        # or at the end of the file when it is the last one.
        next_line = next(
            (
                following.lineno - 1
                for following in tree.body[i + 1:]
                if hasattr(following, 'lineno')
            ),
            len(lines),
        )
        if next_line > start_line + 1:
            del lines[start_line + 1:next_line]

        # Join with a real newline: the lines came from ``splitlines()``, and
        # joining with a literal backslash-n would corrupt the source.
        return "\n".join(lines)
    return code

# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()


test_main.py

Master essential data cleaning techniques in Python using powerful libraries and practical tasks. This course is designed for learners with intermediate Python skills who want to efficiently prepare and clean data for analysis and machine learning.

Explore the core concepts of data cleaning, why it matters, and the essential tools and techniques in Python.

Delve deeper into techniques for managing missing and duplicate data using pandas and numpy.

Focus on techniques for making data consistent, correcting errors, and detecting outliers.

Challenge: Count Duplicates

Solution