実際のデータを扱う際、同じ内容を表しているにもかかわらず、異なる表記で記録されているカテゴリ値に頻繁に遭遇します。例えば、アンケート調査では同じ列に `Yes`、`yes`、`YES` などの回答が記録されることがあります。これらの不一致は、データの分析や集計を行う際に問題となります。なぜなら、Python や pandas ではこれらが異なる値として扱われるためです。これらのエントリを標準化することは、データの一貫性を保ち、正確な結果を得るために不可欠です。

import pandas as pd

# Sample survey answers: the same two categories ("yes" / "no") recorded
# with inconsistent capitalisation.
survey_answers = ["Yes", "no", "YES", "No", "yes", "NO", "nO", "YeS"]
data = {"Response": survey_answers}

# Wrap the raw answers in a single-column DataFrame and display it.
df = pd.DataFrame(data)
print(df)


import unittest
import user_code
import ast
import re   
import unittest
import pandas as pd
import importlib
import user_code

class TestTask(unittest.TestCase):
    """Checks ``user_code.standardize_column_case`` against a small survey frame."""

    def setUp(self):
        # Fresh fixture for every test: mixed-case answers in "Response"
        # plus an unrelated integer column "Other".
        self.data = {
            "Response": ["Yes", "no", "YES", "No", "yes", "NO", "nO", "YeS"],
            "Other": list(range(1, 9)),
        }
        self.df = pd.DataFrame(self.data)

    def _call_user_function(self):
        """Reload ``user_code`` and run its function on the fixture's "Response" column."""
        importlib.reload(user_code)
        return user_code.standardize_column_case(self.df, "Response")

    def test_all_values_lowercase(self):
        """Every value of the target column must equal its lowercase form."""
        result = self._call_user_function()
        expected = list(map(str.lower, self.data["Response"]))
        actual = list(result["Response"])
        success = "All values in the specified column are converted to lowercase."
        failure = f"Expected lowercase values {expected}, got {actual}."
        _dynamic_test(self, actual == expected, success, failure)

    def test_other_columns_unchanged(self):
        """The "Other" column must come back with its original values."""
        result = self._call_user_function()
        actual = list(result["Other"])
        expected = self.data["Other"]
        success = "Other columns are unchanged."
        failure = f"Other column changed: expected {expected}, got {actual}."
        _dynamic_test(self, actual == expected, success, failure)

    def test_shape_unchanged(self):
        """The returned frame must have the same shape as the fixture frame."""
        result = self._call_user_function()
        shapes_match = result.shape == self.df.shape
        success = "Returned DataFrame has the same shape as input."
        failure = f"Expected shape {self.df.shape}, got {result.shape}."
        _dynamic_test(self, shapes_match, success, failure)

def _dynamic_test(test_case, condition, success_message, failure_message):
    """Report a check's outcome through ``test_case`` under a descriptive name.

    The matching message is written to ``test_case._testMethodName`` before
    the outcome is reported, so the message takes the place of the test
    method's name on the test case object.

    Args:
        test_case: Object exposing ``_testMethodName``, ``assertTrue`` and
            ``fail`` (a ``unittest.TestCase`` in this file).
        condition: Truthy when the check passed.
        success_message: Name/message used when ``condition`` is truthy.
        failure_message: Name/message used when ``condition`` is falsy.
    """
    passed = bool(condition)
    message = success_message if passed else failure_message
    test_case._testMethodName = message
    if passed:
        test_case.assertTrue(True, message)
    else:
        test_case.fail(message)

def normalize_text(text):
    """Return ``text`` lowercased with whitespace and punctuation spacing tidied.

    Steps, in order: lowercase; collapse every run of two or more whitespace
    characters into one space; rewrite each ``,``, ``:`` or ``?`` (with any
    surrounding whitespace) as the character followed by a single space;
    strip leading and trailing whitespace.
    """
    collapsed = re.sub(r"\s{2,}", " ", text.lower())
    respaced = re.sub(r"\s*([,:?])\s*", r"\1 ", collapsed)
    return respaced.strip()

def change_var(code: str, var_name: str, value: str) -> str:
    """Replace the first top-level assignment to ``var_name`` in ``code``.

    Only plain assignments (``ast.Assign``) that appear directly in the
    module body are considered. The whole statement, including any
    continuation lines of a multi-line value, is replaced by the single line
    ``<var_name> = <value>``. All other lines, including comments, blank
    lines and decorators that follow the assignment, are left untouched.

    Args:
        code: Python source text to edit.
        var_name: Name of the variable whose assignment should be replaced.
        value: Source text of the new right-hand side; inserted verbatim.

    Returns:
        The edited source with lines joined by real newline characters, or
        ``code`` unchanged when no matching assignment exists.

    Raises:
        SyntaxError: Propagated from ``ast.parse`` when ``code`` is not
            valid Python.
    """
    tree = ast.parse(code)
    lines = code.splitlines()
    for node in tree.body:
        if not isinstance(node, ast.Assign):
            continue
        if not any(
            isinstance(target, ast.Name) and target.id == var_name
            for target in node.targets
        ):
            continue

        start_line = node.lineno - 1
        line = lines[start_line]
        indent = ' ' * (len(line) - len(line.lstrip()))
        # Replace exactly the statement's own span. ``end_lineno`` is the
        # last line of the assignment, so trailing comments, blank lines and
        # decorators of a following definition are no longer swallowed.
        lines[start_line:node.end_lineno] = [f"{indent}{var_name} = {value}"]
        # Join with a real newline; a literal backslash-n would yield
        # unparsable one-line output.
        return '\n'.join(lines)
    return code

# Run the test suite when this file is executed directly rather than imported.
if __name__ == "__main__":
    unittest.main()


test_main.py

強力なライブラリと実践的な課題を用いて、Pythonでのデータクリーニングの基本技術を習得します。本コースは、中級レベルのPythonスキルを持ち、分析や機械学習のために効率的にデータを準備・クリーニングしたい学習者向けに設計されています。

データクリーニングの基本概念、その重要性、そしてPythonにおける主要なツールと手法を学びます。

pandasとnumpyを使用した欠損データおよび重複データの管理手法についてさらに深く学びます。

データの一貫性を確保し、エラーを修正し、外れ値を検出するための手法に焦点を当てます。

チャレンジ：カテゴリ値の標準化

解答