データセット内の重複をフラグ付けすることは、多くのデータクリーニング作業において重要なステップです。これは、単に重複したエントリを削除するのではなく、データ品質を調査または監査する必要がある場合に特に有用です。重複をすぐに削除したくない状況も多く存在します。たとえば、どのレコードが重複しているかを確認してから最適な対応策を決定したい場合や、データ内の重複の発生状況をステークホルダーに報告する必要がある場合などです。重複エントリは、データ入力ミスやシステムの不具合、不正行為の兆候であることもあるため、フラグ付けしておくことでさらなる分析や追跡が可能になります。行が重複しているかどうかを示す列をデータセットに追加することで、元の情報をすべて保持しつつ、後のワークフローで重複パターンを簡単にフィルタリング、集計、可視化できるようになります。

import pandas as pd

# Sample records for the lesson: the "Bob" row appears twice and the "David"
# row three times, while "Alice" and "Charlie" each appear once.
data = dict(
    id=[1, 2, 2, 3, 4, 4, 4],
    name=["Alice", "Bob", "Bob", "Charlie", "David", "David", "David"],
    score=[85, 90, 90, 95, 80, 80, 80],
)
df = pd.DataFrame(data)
print(df)


import unittest
import user_code
import ast
import re   
import importlib
import csv
import unittest
import importlib
import pandas as pd

class TestTask(unittest.TestCase):
    """Checks ``user_code.flag_duplicates`` against a small frame with repeated rows.

    The tests expect the function to return a DataFrame carrying an
    ``is_duplicate`` column that is True for every row that has an identical
    copy elsewhere in the frame and False for rows that occur only once.
    """

    def setUp(self):
        # Rows 1-2 are identical to each other, rows 4-6 are identical to
        # each other, and rows 0 and 3 occur exactly once.
        self.data = {
            "id": [1, 2, 2, 3, 4, 4, 4],
            "name": ["Alice", "Bob", "Bob", "Charlie", "David", "David", "David"],
            "score": [85, 90, 90, 95, 80, 80, 80]
        }
        self.df = pd.DataFrame(self.data)

    def _run_flag_duplicates(self):
        """Reload ``user_code`` and return ``flag_duplicates(self.df)``.

        The module is reloaded on every call so that each test runs against
        a freshly executed copy of the submitted code.
        """
        import user_code
        importlib.reload(user_code)
        return user_code.flag_duplicates(self.df)

    @staticmethod
    def _has_flag_column(result):
        """Return True when *result* is a DataFrame with an 'is_duplicate' column.

        Using ``isinstance`` (rather than only ``is not None``) keeps a
        non-DataFrame return value from raising ``AttributeError`` on
        ``.columns``; the test then fails with its intended message instead.
        """
        return isinstance(result, pd.DataFrame) and 'is_duplicate' in result.columns

    def test_returns_dataframe_with_is_duplicate_column(self):
        result = self._run_flag_duplicates()
        _dynamic_test(
            self,
            self._has_flag_column(result),
            "Returned DataFrame includes 'is_duplicate' column.",
            "Returned DataFrame does not include 'is_duplicate' column."
        )

    def test_is_duplicate_marks_all_duplicates_true(self):
        result = self._run_flag_duplicates()
        # The rows with index 1,2 and 4,5,6 are duplicates
        expected = [False, True, True, False, True, True, True]
        # Fall back to an empty list so a missing column fails the comparison.
        actual = list(result['is_duplicate']) if self._has_flag_column(result) else []
        _dynamic_test(
            self,
            actual == expected,
            "'is_duplicate' column correctly flags duplicates as True.",
            f"Expected {expected}, got {actual}."
        )

    def test_is_duplicate_marks_unique_false(self):
        result = self._run_flag_duplicates()
        # Indexes 0 and 3 are unique rows
        if self._has_flag_column(result):
            unique_flags = [result.loc[0, 'is_duplicate'], result.loc[3, 'is_duplicate']]
        else:
            # Placeholder values that can never equal [False, False].
            unique_flags = [None, None]
        _dynamic_test(
            self,
            unique_flags == [False, False],
            "Unique rows are correctly flagged as False.",
            f"Expected unique flags [False, False], got {unique_flags}."
        )
        
def _dynamic_test(test_case, condition, success_message, failure_message):
    """Report *condition* through *test_case* using a human-readable message.

    The test case's ``_testMethodName`` is overwritten with the message that
    matches the outcome before the outcome is reported, so the message is
    what gets associated with the test. A falsy *condition* is reported via
    ``test_case.fail(failure_message)``; a truthy one via a trivially passing
    ``assertTrue``.
    """
    if not condition:
        test_case._testMethodName = failure_message
        test_case.fail(failure_message)
        return

    test_case._testMethodName = success_message
    test_case.assertTrue(True, success_message)

def normalize_text(text):
    """Return *text* lower-cased with whitespace and punctuation spacing normalised.

    Steps, in order:
      1. Lower-case the text.
      2. Collapse every run of two or more whitespace characters into one space.
      3. Rewrite each ``,``, ``:`` or ``?`` so that it has no whitespace before
         it and exactly one space after it.
      4. Strip leading and trailing whitespace.

    The patterns are raw strings with single backslashes; the previously
    doubled backslashes made them match a literal ``\\`` followed by ``s``
    (and emit a literal ``\\1``), so no normalisation ever happened.
    """
    text = text.lower()
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"\s*([,:?])\s*", r"\1 ", text)
    return text.strip()

def change_var(code: str, var_name: str, value: str) -> str:
    """Rewrite every top-level ``var_name = ...`` assignment in *code*.

    Only plain assignments (``ast.Assign``) that appear directly in the module
    body and have *var_name* as one of their targets are rewritten; each one
    becomes the single line ``var_name = value``.

    Args:
        code: Python source text; it is parsed with ``ast.parse``.
        var_name: Name of the variable whose assignments are replaced.
        value: Source text inserted verbatim as the new right-hand side.

    Returns:
        The modified source, with lines joined by a newline (so a trailing
        newline in *code* is not kept), or *code* itself unchanged when no
        matching assignment exists.

    Raises:
        SyntaxError: If *code* cannot be parsed.
    """
    tree = ast.parse(code)
    lines = code.splitlines()

    # Collect all assignment nodes to modify, with their position in the body.
    assign_nodes = [
        (i, node)
        for i, node in enumerate(tree.body)
        if isinstance(node, ast.Assign)
        and any(isinstance(target, ast.Name) and target.id == var_name for target in node.targets)
    ]

    # If nothing to change, return unmodified code
    if not assign_nodes:
        return code

    # Work from the last match to the first so that deleting lines never
    # shifts the line numbers of assignments that are still to be processed.
    for i, node in reversed(assign_nodes):
        start_line = node.lineno - 1
        line = lines[start_line]
        indent = ' ' * (len(line) - len(line.lstrip()))
        lines[start_line] = f"{indent}{var_name} = {value}"

        # The old statement may span several lines. Everything up to the next
        # top-level statement (or the end of the file) is dropped, which also
        # removes any blank or comment lines in that gap.
        next_line = next(
            (
                next_node.lineno - 1
                for next_node in tree.body[i + 1:]
                if hasattr(next_node, 'lineno')
            ),
            len(lines),
        )
        if next_line > start_line + 1:
            del lines[start_line + 1:next_line]

    # Join with a real newline; the former '\\n' literal produced a
    # backslash followed by "n" and collapsed the code onto one line.
    return '\n'.join(lines)

# Run the unittest suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()


test_main.py

強力なライブラリと実践的な課題を用いて、Pythonでのデータクリーニングの基本技術を習得します。本コースは、中級レベルのPythonスキルを持ち、分析や機械学習のために効率的にデータを準備・クリーニングしたい学習者向けに設計されています。

データクリーニングの基本概念、その重要性、そしてPythonにおける主要なツールと手法を学びます。

pandasとnumpyを使用した欠損データおよび重複データの管理手法についてさらに深く学びます。

データの一貫性を確保し、エラーを修正し、外れ値を検出するための手法に焦点を当てます。

チャレンジ：重複エントリのフラグ付け

解答