欠損データは、実際のデータセットでよく見られる問題であり、一部のエントリが存在しない、不完全である、または「利用不可」と記録されている場合があります。データを分析またはモデリングする前に、これらの欠損値がどこで発生しているかを特定することが重要です。欠損データに対処しないと、不正確な結果や偏った洞察、後続処理でのエラーにつながる可能性があります。欠損値の存在と位置を認識することが、データを分析に適したクリーンで信頼性の高いものにするための第一歩です。

import pandas as pd
import numpy as np

# Build a small example table in which three cells are deliberately left
# missing (one each in "Age", "City" and "Score").
data = dict(
    Name=["Alice", "Bob", "Charlie", "David"],
    Age=[25, np.nan, 30, 22],
    City=["New York", "Los Angeles", np.nan, "Chicago"],
    Score=[85, 90, np.nan, 88],
)

df = pd.DataFrame(data)
# Display the table, including its missing entries.
print(df)


import unittest
import user_code
import ast
import re   
import unittest
import pandas as pd
import numpy as np
import importlib
import io
from contextlib import redirect_stdout

class TestTask(unittest.TestCase):
    """Tests for the learner-supplied ``user_code.detect_missing`` function.

    Every test reloads ``user_code``, calls ``detect_missing`` on a small
    DataFrame and reports the outcome through ``_dynamic_test``. The reference
    answer is ``df.isna()``: a boolean DataFrame with the same shape as the
    input.
    """

    @staticmethod
    def _load_detect_missing():
        """Reload ``user_code`` and return its ``detect_missing`` function."""
        import user_code
        importlib.reload(user_code)
        return user_code.detect_missing

    def _assert_matches_isna(self, data, success_message):
        """Check that ``detect_missing`` agrees with ``DataFrame.isna``.

        Args:
            data: Mapping of column name to values used to build the input.
            success_message: Message reported when the result matches.
        """
        detect_missing = self._load_detect_missing()
        df = pd.DataFrame(data)
        expected = df.isna()
        result = detect_missing(df)
        _dynamic_test(
            self,
            # Check the type first so that a non-DataFrame return value
            # (for example None) is reported as an ordinary failure instead
            # of raising AttributeError on ``.equals``.
            isinstance(result, pd.DataFrame) and result.equals(expected),
            success_message,
            f"Expected:\n{expected}\nGot:\n{result}"
        )

    def test_shape_preserved(self):
        """The result is a DataFrame with the same shape as the input."""
        detect_missing = self._load_detect_missing()
        data = {
            "A": [1, 2, np.nan],
            "B": [4, 5, 6],
            "C": [np.nan, 8, 9]
        }
        df = pd.DataFrame(data)
        result = detect_missing(df)
        _dynamic_test(
            self,
            isinstance(result, pd.DataFrame) and result.shape == df.shape,
            "Returned DataFrame has the same shape as input.",
            f"Returned DataFrame shape {getattr(result, 'shape', None)} does not match input shape {df.shape}."
        )

    def test_correct_missing_locations(self):
        """Missing cells scattered over several columns are flagged."""
        self._assert_matches_isna(
            {
                "A": [1, 2, np.nan],
                "B": [4, np.nan, 6],
                "C": [np.nan, 8, 9]
            },
            "Correct boolean DataFrame for missing values.",
        )

    def test_no_missing_values(self):
        """A fully populated input yields an all-False result."""
        self._assert_matches_isna(
            {
                "A": [1, 2, 3],
                "B": [4, 5, 6],
                "C": [7, 8, 9]
            },
            "Returns all False DataFrame when no missing values.",
        )

    def test_all_missing_values(self):
        """An input made only of NaN yields an all-True result."""
        self._assert_matches_isna(
            {
                "A": [np.nan, np.nan],
                "B": [np.nan, np.nan]
            },
            "Returns all True DataFrame when all values are missing.",
        )

def _dynamic_test(test_case, condition, success_message, failure_message):
    if condition:
        test_case._testMethodName = success_message
        test_case.assertTrue(True, success_message)
    else:
        test_case._testMethodName = failure_message
        test_case.fail(failure_message)

def normalize_text(text):
    """Return ``text`` lower-cased with normalized spacing.

    Runs of two or more whitespace characters become a single space, each
    comma, colon or question mark loses the whitespace around it and is
    followed by exactly one space, and the result is stripped at both ends.
    """
    lowered = text.lower()
    collapsed = re.sub(r"\s{2,}", " ", lowered)
    punctuated = re.sub(r"\s*([,:?])\s*", r"\1 ", collapsed)
    return punctuated.strip()

def change_var(code: str, var_name: str, value: str) -> str:
    """Replace the first top-level assignment to ``var_name`` with ``value``.

    Only module-level statements of ``code`` are inspected. The first plain
    assignment (``name = ...``) that has ``var_name`` among its targets is
    rewritten as ``var_name = value``; a right-hand side spanning several
    lines is collapsed into that single line. Lines that follow the
    assignment (comments, blank lines, decorators, other statements) are
    left untouched.

    Args:
        code: Python source text to rewrite.
        var_name: Name of the variable whose assignment is replaced.
        value: Source text of the new right-hand side, inserted verbatim.

    Returns:
        The rewritten source, with its lines joined by newline characters,
        or ``code`` unchanged when no matching assignment is found.

    Raises:
        SyntaxError: Propagated from ``ast.parse`` when ``code`` is not
            valid Python.
    """
    tree = ast.parse(code)
    lines = code.splitlines()
    for index, node in enumerate(tree.body):
        if not isinstance(node, ast.Assign):
            continue
        if not any(
            isinstance(target, ast.Name) and target.id == var_name
            for target in node.targets
        ):
            continue

        start = node.lineno - 1
        first_line = lines[start]
        indent = ' ' * (len(first_line) - len(first_line.lstrip()))

        # ``end_lineno`` (1-based, inclusive) marks where the assignment itself
        # ends, so only its own continuation lines are dropped.
        end = getattr(node, "end_lineno", None)
        if end is None:
            # Interpreters without end positions: fall back to cutting up to
            # the line where the next top-level statement starts.
            end = next(
                (
                    later.lineno - 1
                    for later in tree.body[index + 1:]
                    if hasattr(later, "lineno")
                ),
                len(lines),
            )
        # Always replace at least the assignment's first line.
        end = max(end, start + 1)

        lines[start:end] = [f"{indent}{var_name} = {value}"]
        # Join with a real newline so the result is still valid source code.
        return "\n".join(lines)
    return code

# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()


test_main.py

強力なライブラリと実践的な課題を用いて、Pythonでのデータクリーニングの基本技術を習得します。本コースは、中級レベルのPythonスキルを持ち、分析や機械学習のために効率的にデータを準備・クリーニングしたい学習者向けに設計されています。

データクリーニングの基本概念、その重要性、そしてPythonにおける主要なツールと手法を学びます。

pandasとnumpyを使用した欠損データおよび重複データの管理手法についてさらに深く学びます。

データの一貫性を確保し、エラーを修正し、外れ値を検出するための手法に焦点を当てます。

チャレンジ：欠損データの特定

解答