Flagging duplicates within a dataset is a crucial step in many data cleaning workflows, especially when you need to investigate or audit data quality rather than simply removing repeated entries. There are many situations where you might not want to drop duplicates immediately. For instance, you may want to review which records are repeated before deciding on the best course of action, or you may need to report on the prevalence of duplication in your data to stakeholders. Sometimes, duplicate entries can indicate data entry errors, system glitches, or even fraudulent activity, so keeping them flagged allows for further analysis and traceability. By adding a column to your dataset to indicate whether a row is a duplicate, you retain all original information while making it easy to filter, summarize, or visualize duplication patterns later in your workflow.

import pandas as pd

# Sample records for the lesson: the "Bob" row appears twice and the "David"
# row three times, while "Alice" and "Charlie" are unique.
data = dict(
    id=[1, 2, 2, 3, 4, 4, 4],
    name=["Alice", "Bob", "Bob", "Charlie", "David", "David", "David"],
    score=[85, 90, 90, 95, 80, 80, 80],
)

# One column per key; rows keep the order of the lists above.
df = pd.DataFrame.from_dict(data)
print(df)


import unittest
import user_code
import ast
import re   
import importlib
import csv
import unittest
import importlib
import pandas as pd

class TestTask(unittest.TestCase):
    """Checks for the learner's ``user_code.flag_duplicates`` implementation.

    Every test reloads ``user_code``, calls ``flag_duplicates`` on the sample
    frame built in ``setUp`` and reports the verdict through
    ``_dynamic_test``.
    """

    def setUp(self):
        """Build a fresh sample frame for each test.

        Rows 1-2 ("Bob") and 4-6 ("David") are repeated; rows 0 and 3 are
        unique.
        """
        self.data = {
            "id": [1, 2, 2, 3, 4, 4, 4],
            "name": ["Alice", "Bob", "Bob", "Charlie", "David", "David", "David"],
            "score": [85, 90, 90, 95, 80, 80, 80]
        }
        self.df = pd.DataFrame(self.data)

    def _run_flag_duplicates(self):
        """Reload the learner module and call ``flag_duplicates`` on ``self.df``.

        Returns:
            A ``(result, has_column)`` pair. ``has_column`` is True only when
            ``result`` is a DataFrame that contains an ``'is_duplicate'``
            column, so callers never touch ``.columns`` on a non-DataFrame
            return value (``None``, a list, a Series, ...).
        """
        import user_code
        # Reload so each test sees the current version of the learner's code.
        importlib.reload(user_code)
        result = user_code.flag_duplicates(self.df)
        has_column = isinstance(result, pd.DataFrame) and 'is_duplicate' in result.columns
        return result, has_column

    def test_returns_dataframe_with_is_duplicate_column(self):
        """The function returns a DataFrame carrying an 'is_duplicate' column."""
        _, has_column = self._run_flag_duplicates()
        _dynamic_test(
            self,
            has_column,
            "Returned DataFrame includes 'is_duplicate' column.",
            "Returned DataFrame does not include 'is_duplicate' column."
        )

    def test_is_duplicate_marks_all_duplicates_true(self):
        """Every member of a duplicate group, first occurrence included, is True."""
        result, has_column = self._run_flag_duplicates()
        # The rows with index 1,2 and 4,5,6 are duplicates
        expected = [False, True, True, False, True, True, True]
        # An empty list stands in for a missing column or wrong return type.
        actual = list(result['is_duplicate']) if has_column else []
        _dynamic_test(
            self,
            actual == expected,
            "'is_duplicate' column correctly flags duplicates as True.",
            f"Expected {expected}, got {actual}."
        )

    def test_is_duplicate_marks_unique_false(self):
        """Rows that occur only once are flagged False."""
        result, has_column = self._run_flag_duplicates()
        # Indexes 0 and 3 are unique rows
        if has_column:
            unique_flags = [result.loc[0, 'is_duplicate'], result.loc[3, 'is_duplicate']]
        else:
            unique_flags = [None, None]
        _dynamic_test(
            self,
            unique_flags == [False, False],
            "Unique rows are correctly flagged as False.",
            f"Expected unique flags [False, False], got {unique_flags}."
        )
        
def _dynamic_test(test_case, condition, success_message, failure_message):
    """Report a check on *test_case* using a human-readable message.

    The test's ``_testMethodName`` is overwritten with the chosen message
    (presumably so the runner's report shows the message instead of the
    method name -- confirm against the reporting side).

    Args:
        test_case: Test object providing ``assertTrue`` and ``fail``.
        condition: Truthy when the check passed.
        success_message: Message used when ``condition`` is truthy.
        failure_message: Message used when ``condition`` is falsy; also
            passed to ``test_case.fail``.
    """
    passed = bool(condition)
    message = success_message if passed else failure_message
    test_case._testMethodName = message
    if passed:
        # Trivially true assertion; it only records the success message.
        test_case.assertTrue(True, message)
    else:
        test_case.fail(message)

def normalize_text(text):
    """Lower-case *text* and canonicalize whitespace and punctuation spacing.

    Steps, in order:
      1. lower-case the string;
      2. collapse every run of two or more whitespace characters into a
         single space;
      3. remove whitespace before ``,``, ``:`` and ``?`` and leave exactly
         one space after each of them;
      4. strip leading and trailing whitespace.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    text = text.lower()
    # Single backslashes: inside a raw string "\\s" would match a literal
    # backslash followed by "s", not whitespace.
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"\s*([,:?])\s*", r"\1 ", text)
    return text.strip()

def change_var(code: str, var_name: str, value: str) -> str:
    """Rewrite every top-level ``var_name = ...`` assignment in *code*.

    Only plain assignments (``ast.Assign``) that appear directly in the
    module body and have ``var_name`` as a simple name target are rewritten;
    assignments nested in functions, classes or other blocks are left alone.
    Each matching statement is replaced by the single line
    ``var_name = value``. For a multi-line assignment, the lines up to the
    start of the next top-level statement are removed, which includes any
    blank lines or comments in between.

    Args:
        code: Python source text; it must parse with ``ast.parse``.
        var_name: Name of the variable whose assignments are replaced.
        value: Source text placed on the right-hand side, inserted verbatim.

    Returns:
        The rewritten source joined with newline characters, or *code*
        itself when no matching assignment exists.

    Raises:
        SyntaxError: If *code* is not valid Python.
    """
    tree = ast.parse(code)
    lines = code.splitlines()

    # Collect all assignment nodes to modify
    assign_nodes = [
        (i, node)
        for i, node in enumerate(tree.body)
        if isinstance(node, ast.Assign)
        and any(isinstance(target, ast.Name) and target.id == var_name for target in node.targets)
    ]

    # If nothing to change, return unmodified code
    if not assign_nodes:
        return code

    # Work from the last match to the first so that removing lines never
    # shifts the line numbers of assignments still to be processed.
    for i, node in reversed(assign_nodes):
        start_line = node.lineno - 1
        line = lines[start_line]
        indent = ' ' * (len(line) - len(line.lstrip()))
        lines[start_line] = f"{indent}{var_name} = {value}"

        if i + 1 < len(tree.body):
            following = tree.body[i + 1]
            # A decorated def/class may report the line of its ``def``/``class``
            # keyword as ``lineno``; start from its earliest decorator so the
            # decorators are not deleted with the assignment's continuation.
            decorators = getattr(following, 'decorator_list', [])
            next_line = min([following.lineno, *(d.lineno for d in decorators)]) - 1
        else:
            next_line = len(lines)

        # Drop the continuation lines of a multi-line assignment.
        if next_line > start_line + 1:
            del lines[start_line + 1:next_line]

    # Join with a real newline; '\\n' would emit a literal backslash + 'n'.
    return '\n'.join(lines)

# Run the test suite when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()


test_main.py

Master essential data cleaning techniques in Python using powerful libraries and practical tasks. This course is designed for learners with intermediate Python skills who want to efficiently prepare and clean data for analysis and machine learning.

Explore the core concepts of data cleaning, why it matters, and the essential tools and techniques in Python.

Delve deeper into techniques for managing missing and duplicate data using pandas and numpy.

Focus on techniques for making data consistent, correcting errors, and detecting outliers.

Challenge: Flag Duplicate Entries

Solution