DataFrameでカテゴリカルデータを扱う際、文字列値の先頭や末尾に余分な空白があると、重大な不整合が発生する可能性があります。例えば、`"apple"`、`" apple"`、`"apple "` は見た目はほとんど同じでも、Pythonではそれぞれ異なる文字列として扱われます。これにより、データのグループ化、フィルタリング、比較時に問題が生じ、誤った分析やパターンの見落としにつながることがあります。先頭と末尾の空白を削除してこれらの不整合を解消することは、データ分析の準備における重要な最初のステップです。

import pandas as pd

# Sample records whose text values carry stray leading/trailing spaces;
# the numeric "Count" column is clean.
data = {
    "Fruit": [" apple", "banana ", "  cherry ", "date"],
    "Color": [" red", "yellow ", " red ", "brown"],
    "Count": [10, 5, 7, 3],
}

# Build the frame from the column mapping and show it as-is.
df = pd.DataFrame(data=data)
print(df)

`select_dtypes(include="object")` を使用すると、DataFrame内で文字列データを含む列のみを選択できます。これにより、`str.strip()` などの文字列操作を、テキストを格納している列のみに簡単に適用でき、数値や他の型の列には影響しません。

ノート


import unittest
import user_code
import ast
import re   
import unittest
import pandas as pd
import importlib

class TestTask(unittest.TestCase):
    """Tests for ``user_code.strip_whitespace``."""

    def setUp(self):
        """Create the padded input frame and the cleaned frame to compare with."""
        self.data = {
            "Fruit": [" apple", "banana ", "  cherry ", "date"],
            "Color": [" red", "yellow ", " red ", "brown"],
            "Count": [10, 5, 7, 3],
        }
        self.expected_data = {
            "Fruit": ["apple", "banana", "cherry", "date"],
            "Color": ["red", "yellow", "red", "brown"],
            "Count": [10, 5, 7, 3],
        }
        self.df = pd.DataFrame(self.data)
        self.expected_df = pd.DataFrame(self.expected_data)

    def _stripped_frame(self):
        # Reload the module first so strip_whitespace is looked up on a
        # freshly re-executed user_code, then run it on the padded frame.
        import user_code
        importlib.reload(user_code)
        return user_code.strip_whitespace(self.df)

    def test_whitespace_removed(self):
        # Each text column must equal the expected, whitespace-free values.
        cleaned = self._stripped_frame()
        for column in ("Fruit", "Color"):
            got = list(cleaned[column])
            want = list(self.expected_df[column])
            _dynamic_test(
                self,
                got == want,
                f"Whitespace successfully removed from column '{column}'",
                f"Expected: {want} in column '{column}', got: {got}",
            )

    def test_non_string_columns_unchanged(self):
        # The numeric column must hold the same values as the input frame.
        cleaned = self._stripped_frame()
        got = list(cleaned["Count"])
        want = list(self.df["Count"])
        _dynamic_test(
            self,
            got == want,
            "Non-string columns remain unchanged",
            f"Expected: {want} in 'Count', got: {got}",
        )

    def test_shape_and_column_order(self):
        # Shape is checked first; column order is only checked if it passes.
        cleaned = self._stripped_frame()
        _dynamic_test(
            self,
            cleaned.shape == self.df.shape,
            "Shape of DataFrame is unchanged",
            f"Expected shape {self.df.shape}, got {cleaned.shape}",
        )
        _dynamic_test(
            self,
            list(cleaned.columns) == list(self.df.columns),
            "Column order is unchanged",
            f"Expected columns {list(self.df.columns)}, got {list(cleaned.columns)}",
        )

def _dynamic_test(test_case, condition, success_message, failure_message):
    if condition:
        test_case._testMethodName = success_message
        test_case.assertTrue(True, success_message)
    else:
        test_case._testMethodName = failure_message
        test_case.fail(failure_message)

def normalize_text(text):
    """Return ``text`` lower-cased with whitespace and punctuation spacing normalized.

    Runs of two or more whitespace characters become a single space, each of
    ``,``, ``:`` and ``?`` loses the whitespace around it and is followed by
    exactly one space, and the result is stripped at both ends.
    """
    lowered = text.lower()
    collapsed = re.sub(r"\s{2,}", " ", lowered)
    respaced = re.sub(r"\s*([,:?])\s*", r"\1 ", collapsed)
    return respaced.strip()

def change_var(code: str, var_name: str, value: str) -> str:
    """Replace the first top-level assignment to ``var_name`` in ``code``.

    Args:
        code: Python source text; it must parse with ``ast.parse``.
        var_name: Name of the variable whose assignment should be rewritten.
        value: Source text of the new right-hand side, inserted verbatim.

    Returns:
        The source with the matching assignment statement replaced by the
        single line ``<var_name> = <value>``, joined with real newlines.
        If no top-level ``name = ...`` assignment to ``var_name`` exists,
        ``code`` is returned unchanged.

    Raises:
        SyntaxError: If ``code`` is not valid Python.
    """
    tree = ast.parse(code)
    lines = code.splitlines()
    for node in tree.body:
        if not isinstance(node, ast.Assign):
            continue
        for target in node.targets:
            if not (isinstance(target, ast.Name) and target.id == var_name):
                continue
            start_line = node.lineno - 1
            line = lines[start_line]
            indent = ' ' * (len(line) - len(line.lstrip()))
            # Replace exactly the lines the statement occupies. Cutting up to
            # the next statement's ``lineno`` would also remove comments,
            # blank lines and the decorators of a following def/class.
            end_line = getattr(node, "end_lineno", None) or node.lineno
            lines[start_line:end_line] = [f"{indent}{var_name} = {value}"]
            # Join with a real newline; a literal backslash-n would collapse
            # the result into one unparsable line.
            return "\n".join(lines)
    return code

# Run the tests through unittest's command-line entry point when this file
# is executed directly rather than imported.
if __name__ == "__main__":
    unittest.main()


test_main.py

強力なライブラリと実践的な課題を用いて、Pythonでのデータクリーニングの基本技術を習得します。本コースは、中級レベルのPythonスキルを持ち、分析や機械学習のために効率的にデータを準備・クリーニングしたい学習者向けに設計されています。

データクリーニングの基本概念、その重要性、そしてPythonにおける主要なツールと手法を学びます。

pandasとnumpyを使用した欠損データおよび重複データの管理手法についてさらに深く学びます。

データの一貫性を確保し、エラーを修正し、外れ値を検出するための手法に焦点を当てます。

チャレンジ：文字列から空白を削除する

解答