実際のデータセットの多くでは、クラス不均衡の問題がよく発生します。これは、一方のクラス（多数派）のサンプル数が他方のクラス（少数派）を大きく上回る状況です。この不均衡があると、モデルの予測が多数派クラスに偏りやすくなり、少数派クラスの予測精度が低下します。一般的な解決策の一つが**アンダーサンプリング**であり、多数派クラスのサンプルをランダムに減らして、少数派クラスと同じ数に揃えます。このチャレンジでは、この手法を実践的に体験します。2つのクラスを持つカテゴリカルなターゲット列を含むDataFrameが与えられます。目的は、多数派クラスをランダムにアンダーサンプリングし、両クラスが同数となる新しいDataFrameを返すことです。


import unittest
import user_code
import ast
import re   
import importlib
import csv
import unittest
import pandas as pd
import importlib
import sys

class TestTask(unittest.TestCase):
    """Checks for ``undersample_majority_class(df, column)`` from ``user_code``.

    Every test builds a small two-class DataFrame with a ``feature`` and a
    ``target`` column, calls the function with ``"target"`` as the column
    name and reports the outcome through ``_dynamic_test``.
    """

    def setUp(self):
        # Drop any cached copy so each test runs against a fresh import of
        # ``user_code``.  The fresh import already executes the module once,
        # so no extra ``importlib.reload`` is needed.
        sys.modules.pop("user_code", None)
        import user_code
        self.undersample_majority_class = getattr(user_code, 'undersample_majority_class', None)

    def _undersample(self, df):
        """Call the function under test on ``df``; None when it is not defined."""
        func = self.undersample_majority_class
        return func(df, "target") if func else None

    @staticmethod
    def _counts(balanced):
        """Return ``{label: row count}`` for the ``target`` column."""
        return balanced["target"].value_counts().to_dict()

    def test_balanced_result(self):
        df = pd.DataFrame({
            "feature": list(range(12)),
            "target":  ["A"] * 8 + ["B"] * 4,
        })
        balanced = self._undersample(df)
        counts = None if balanced is None else self._counts(balanced)
        _dynamic_test(
            self,
            # Compare per-label counts: a bare set of counts would also
            # accept a result that contains only one of the two classes.
            counts == {"A": 4, "B": 4},
            "Returned DataFrame contains both classes in equal numbers",
            f"Expected both classes to have 4 samples, got counts: {counts}"
        )

    def test_minority_preserved(self):
        df = pd.DataFrame({
            "feature": list(range(8)),
            "target":  ["A"] * 2 + ["B"] * 6,
        })
        balanced = self._undersample(df)
        if balanced is None:
            _dynamic_test(self, False, "", "Function returned None")
            return
        minority_rows = df[df["target"] == "A"]
        # Every original minority row must appear in the balanced output.
        for _, row in minority_rows.iterrows():
            match = ((balanced["feature"] == row["feature"]) & (balanced["target"] == row["target"]))
            _dynamic_test(
                self,
                match.any(),
                "All original minority class samples are preserved",
                f"Minority sample {row.to_dict()} not found in balanced DataFrame"
            )

    def test_majority_count(self):
        df = pd.DataFrame({
            "feature": list(range(10)),
            "target":  ["X"] * 7 + ["Y"] * 3,
        })
        balanced = self._undersample(df)
        if balanced is None:
            _dynamic_test(self, False, "", "Function returned None")
            return
        counts = self._counts(balanced)
        _dynamic_test(
            self,
            # Dict comparison reports a missing class as a normal failure
            # instead of raising KeyError on the label lookup.
            counts == {"X": 3, "Y": 3},
            "Majority class is undersampled to match minority class count",
            f"Expected 3 samples for each class, got: {counts}"
        )

    def test_already_balanced(self):
        df = pd.DataFrame({
            "feature": list(range(6)),
            "target":  ["A", "A", "A", "B", "B", "B"],
        })
        balanced = self._undersample(df)
        if balanced is None:
            _dynamic_test(self, False, "", "Function returned None")
            return
        counts = self._counts(balanced)
        _dynamic_test(
            self,
            counts == {"A": 3, "B": 3} and len(balanced) == 6,
            "Function works when DataFrame is already balanced",
            f"Expected 3 samples for each class, got: {counts}"
        )

    def test_different_labels(self):
        df = pd.DataFrame({
            "feature": list(range(9)),
            "target":  [1, 1, 1, 1, 0, 0, 0, 0, 0],
        })
        balanced = self._undersample(df)
        if balanced is None:
            _dynamic_test(self, False, "", "Function returned None")
            return
        counts = self._counts(balanced)
        _dynamic_test(
            self,
            # Both integer labels must be present with 4 rows each.
            counts == {1: 4, 0: 4},
            "Function works for any two-class DataFrame, regardless of class labels",
            f"Expected both classes to have 4 samples, got: {counts}"
        )

def _dynamic_test(test_case, condition, success_message, failure_message):
    """Report a check result on ``test_case`` under a descriptive name.

    The test's ``_testMethodName`` is overwritten with the message that
    matches the outcome, then the test is passed via ``assertTrue`` or
    failed via ``fail`` with that same message.

    Args:
        test_case: Test object exposing ``assertTrue`` and ``fail``.
        condition: Truthy when the check succeeded.
        success_message: Name/message used when ``condition`` is truthy.
        failure_message: Name/message used when ``condition`` is falsy.
    """
    passed = bool(condition)
    message = success_message if passed else failure_message
    # The method name is replaced by the message before the outcome is
    # reported on the test case.
    test_case._testMethodName = message
    if passed:
        test_case.assertTrue(True, message)
        return
    test_case.fail(message)

def normalize_text(text):
    """Return ``text`` lower-cased with normalised whitespace and punctuation.

    Runs of two or more whitespace characters collapse to a single space,
    and each ``,``, ``:`` or ``?`` ends up with no whitespace before it and
    exactly one space after it.  Leading and trailing whitespace is removed.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    text = text.lower()
    # Single backslashes in the raw strings: ``\s`` must be the regex
    # whitespace class and ``\1`` the group back-reference.  The doubled
    # form matched a literal backslash and never touched real whitespace.
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"\s*([,:?])\s*", r"\1 ", text)
    return text.strip()

def change_var(code: str, var_name: str, value: str) -> str:
    """Rewrite every top-level ``var_name = ...`` assignment in ``code``.

    Each module-level ``ast.Assign`` statement whose targets include the
    plain name ``var_name`` is replaced by the single line
    ``var_name = value``; an assignment spanning several lines collapses to
    that one line.  Assignments nested inside functions, classes or other
    compound statements are not considered, and lines following the
    assignment (blank lines, comments, decorators) are kept.

    Args:
        code: Python source text; it must parse with ``ast.parse``.
        var_name: Name of the variable to reassign.
        value: Source text of the new right-hand side, inserted verbatim.

    Returns:
        The rewritten source with its lines joined by newlines, or ``code``
        unchanged when no matching assignment exists.

    Raises:
        SyntaxError: If ``code`` is not valid Python.
    """
    tree = ast.parse(code)
    lines = code.splitlines()

    # Collect all top-level assignments that bind ``var_name``.
    assign_nodes = [
        node
        for node in tree.body
        if isinstance(node, ast.Assign)
        and any(isinstance(target, ast.Name) and target.id == var_name for target in node.targets)
    ]

    # If nothing to change, return unmodified code
    if not assign_nodes:
        return code

    # Replace from last to first so earlier line offsets stay valid.
    for node in reversed(assign_nodes):
        start_line = node.lineno - 1
        line = lines[start_line]
        indent = ' ' * (len(line) - len(line.lstrip()))
        # ``end_lineno`` bounds the statement itself, so only the
        # assignment's own continuation lines are dropped; scanning to the
        # next node's ``lineno`` would also swallow comments, blank lines
        # and decorator lines that sit in between.
        lines[start_line:node.end_lineno] = [f"{indent}{var_name} = {value}"]

    # Join with a real newline (the previous separator was a literal
    # backslash followed by "n").
    return '\n'.join(lines)

# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()


test_main.py

実践的かつハンズオンのコースで、現実世界の大規模データ課題に取り組む意欲的なデータサイエンティスト向けです。Pythonと主要なライブラリを用いて、大規模データセットの効率的な処理、サンプリング、分析方法を学びます。各セクションには、分かりやすいビデオ解説とインタラクティブな課題が含まれており、専門知識を身につけることができます。

メモリに収まりきらない大規模データセットを扱うための基礎的な戦略として、チャンク処理やストリーミング手法を学びます。

オーバーサンプリングやアンダーサンプリングを含む、大規模データセットのバランス調整およびサンプリング手法を探求します。

高速かつメモリ効率の良いデータ処理のためにpolarsライブラリを使用する方法を学習します。

チャレンジ：アンダーサンプリングの適用

解答