重複データは、同じ行がデータセット内に複数回出現する場合に発生します。これらの重複エントリは、特定の値が過剰に表現されることで分析結果を歪め、不正確な統計値、誤解を招く傾向、信頼性の低い結果につながります。重複行の検出と定量化はデータクリーニングの基本的な作業であり、問題の規模を把握し、重複の削除や統合といった次の対応策を決定するための指針となります。

import pandas as pd

# Sample dataset for the exercise: rows 2 and 5 repeat row 0 (Alice) and
# row 4 repeats row 1 (Bob), so three rows are repeats of an earlier row.
data = dict(
    Name=["Alice", "Bob", "Alice", "Charlie", "Bob", "Alice"],
    Age=[25, 30, 25, 35, 30, 25],
    City=["NY", "LA", "NY", "SF", "LA", "NY"],
)
df = pd.DataFrame(data=data)
print(df)


import unittest
import user_code
import ast
import re   
import unittest
import pandas as pd
import importlib
import io
from contextlib import redirect_stdout

class TestTask(unittest.TestCase):
    """Checks ``user_code.count_duplicates`` against several small DataFrames."""

    @staticmethod
    def _count_after_reload(frame_source):
        """Reload ``user_code`` and return its duplicate count for *frame_source*.

        *frame_source* is passed to ``pd.DataFrame`` to build the input frame.
        """
        import user_code
        importlib.reload(user_code)
        frame = pd.DataFrame(frame_source)
        return user_code.count_duplicates(frame)

    def test_multiple_duplicates(self):
        """Six rows, three of which repeat an earlier row: expects 3."""
        result = self._count_after_reload(
            {
                "Name": ["Alice", "Bob", "Alice", "Charlie", "Bob", "Alice"],
                "Age": [25, 30, 25, 35, 30, 25],
                "City": ["NY", "LA", "NY", "SF", "LA", "NY"],
            }
        )
        _dynamic_test(
            self,
            result == 3,
            "Correctly returns 3 duplicate rows",
            f"Expected 3, got {result} for duplicate rows",
        )

    def test_no_duplicates(self):
        """Three distinct rows: expects 0."""
        result = self._count_after_reload(
            {
                "Name": ["Alice", "Bob", "Charlie"],
                "Age": [25, 30, 35],
                "City": ["NY", "LA", "SF"],
            }
        )
        _dynamic_test(
            self,
            result == 0,
            "Correctly returns 0 when no duplicate rows",
            f"Expected 0, got {result} for duplicate rows",
        )

    def test_all_duplicates(self):
        """Five identical rows: expects 4 (every row after the first)."""
        result = self._count_after_reload(
            {
                "Name": ["Alice"] * 5,
                "Age": [25] * 5,
                "City": ["NY"] * 5,
            }
        )
        _dynamic_test(
            self,
            result == 4,
            "Correctly returns 4 when all rows are duplicates except the first",
            f"Expected 4, got {result} for duplicate rows",
        )

    def test_empty_dataframe(self):
        """Two columns with no rows: expects 0."""
        result = self._count_after_reload({"A": [], "B": []})
        _dynamic_test(
            self,
            result == 0,
            "Correctly returns 0 for empty DataFrame",
            f"Expected 0, got {result} for empty DataFrame",
        )

def _dynamic_test(test_case, condition, success_message, failure_message):
    """Pass or fail *test_case* based on *condition*, renaming the test to the message.

    Args:
        test_case: Object exposing ``assertTrue`` and ``fail`` (a
            ``unittest.TestCase`` in this file) whose ``_testMethodName``
            attribute is overwritten.
        condition: Truthy value means the check passed.
        success_message: Name/message used when the check passed.
        failure_message: Name/message used when the check failed.
    """
    passed = bool(condition)
    # The private ``_testMethodName`` is replaced with the outcome message --
    # presumably so the runner's report shows it; confirm against the runner.
    test_case._testMethodName = success_message if passed else failure_message
    if not passed:
        test_case.fail(failure_message)
        return
    test_case.assertTrue(True, success_message)

def normalize_text(text):
    """Return *text* lower-cased with whitespace and punctuation spacing normalized.

    Runs of two or more whitespace characters become a single space, any
    whitespace around ``,``, ``:`` or ``?`` is replaced by exactly one space
    after the mark, and leading/trailing whitespace is stripped.
    """
    rewrites = (
        (r"\s{2,}", " "),
        (r"\s*([,:?])\s*", r"\1 "),
    )
    normalized = text.lower()
    # Order matters: whitespace runs are collapsed before punctuation spacing.
    for pattern, replacement in rewrites:
        normalized = re.sub(pattern, replacement, normalized)
    return normalized.strip()

def change_var(code: str, var_name: str, value: str) -> str:
    """Replace the first top-level assignment to *var_name* with ``var_name = value``.

    Only plain assignments (``ast.Assign``) found directly in the module body
    whose target is the bare name *var_name* are considered. The whole
    statement, including its continuation lines, is replaced by a single
    line; every other line is kept.

    Args:
        code: Python source text; it is parsed with ``ast.parse``.
        var_name: Name of the variable whose assignment is rewritten.
        value: Source text for the new right-hand side, inserted verbatim.

    Returns:
        The rewritten source, with lines joined by newline characters (a
        trailing newline in *code* is not preserved), or *code* unchanged
        when no matching assignment exists.

    Raises:
        SyntaxError: If *code* cannot be parsed.
    """
    tree = ast.parse(code)
    lines = code.splitlines()
    for position, node in enumerate(tree.body):
        if not isinstance(node, ast.Assign):
            continue
        if not any(
            isinstance(target, ast.Name) and target.id == var_name
            for target in node.targets
        ):
            continue

        start = node.lineno - 1
        # Exclusive end index of the statement. Using the statement's own end
        # keeps comments, blank lines and decorators that follow it intact.
        end = getattr(node, "end_lineno", None)
        if end is None:
            # Fallback for nodes without ``end_lineno``: stop at the line of
            # the next top-level statement, or at the end of the source.
            later = tree.body[position + 1:]
            end = later[0].lineno - 1 if later else len(lines)
        # Always replace at least the assignment's first line.
        end = max(end, start + 1)

        first_line = lines[start]
        indent = " " * (len(first_line) - len(first_line.lstrip()))
        lines[start:end] = [f"{indent}{var_name} = {value}"]
        # Join with a real newline, not the two-character text "\\n".
        return "\n".join(lines)
    return code

# Run the test suite when this file is executed directly rather than imported.
if __name__ == "__main__":
    unittest.main()


test_main.py

強力なライブラリと実践的な課題を用いて、Pythonでのデータクリーニングの基本技術を習得します。本コースは、中級レベルのPythonスキルを持ち、分析や機械学習のために効率的にデータを準備・クリーニングしたい学習者向けに設計されています。

データクリーニングの基本概念、その重要性、そしてPythonにおける主要なツールと手法を学びます。

pandasとnumpyを使用した欠損データおよび重複データの管理手法についてさらに深く学びます。

データの一貫性を確保し、エラーを修正し、外れ値を検出するための手法に焦点を当てます。

チャレンジ：重複のカウント

解答