このチャレンジでは、大規模データセットにおけるクラス不均衡の処理として、**オーバーサンプリング**を実践します。`pandas`のDataFrameが与えられており、ターゲット列にはクラスの不均衡があります。目的は、マイノリティクラスをオーバーサンプリングし、両クラスの行数が同じになる新しいDataFrameを作成することです。この手法は、モデルがマジョリティクラスに偏るのを防ぎたい場合に有効です。


import unittest
import user_code
import ast
import re   
import importlib
import csv
import unittest
import pandas as pd
import importlib

class TestTask(unittest.TestCase):
    """Checks for the learner's ``user_code.oversample_minority(df, target)``.

    The fixture contains three rows of class "A" and two rows of class "B",
    so "B" is the minority class that has to be oversampled.
    """

    def setUp(self):
        self.user_code = importlib.import_module("user_code")
        self.data = {
            "feature1": [1, 2, 3, 4, 5, 6],
            "feature2": [10, 20, 30, 40, 50, 60],
            "target": ["A", "A", "A", "B", "B", "B"]
        }
        self.df = pd.DataFrame(self.data).iloc[:-1]  # Remove one B

    def _oversampled(self):
        """Reload the learner module and return its result for the fixture.

        A copy of the fixture is handed over so that a solution mutating its
        input in place cannot change ``self.df``, which the tests use as the
        reference for the original columns and class counts.
        """
        importlib.reload(self.user_code)
        return self.user_code.oversample_minority(self.df.copy(), "target")

    def test_columns_preserved(self):
        """The result has exactly the same set of columns as the input."""
        balanced_df = self._oversampled()
        _dynamic_test(
            self,
            set(balanced_df.columns) == set(self.df.columns),
            "Returned DataFrame contains the same columns as input DataFrame.",
            f"Returned DataFrame columns {balanced_df.columns.tolist()} do not match input columns {self.df.columns.tolist()}"
        )

    def test_balanced_class_counts(self):
        """Both classes are present and have the same number of rows."""
        balanced_df = self._oversampled()
        counts = balanced_df["target"].value_counts()
        # .get() keeps a missing class from raising KeyError, so the learner
        # sees the "not balanced" message instead of an unrelated traceback.
        a_count = counts.get("A", 0)
        b_count = counts.get("B", 0)
        _dynamic_test(
            self,
            len(set(counts.values)) == 1 and a_count > 0 and a_count == b_count,
            "Each class in the target column is balanced (equal row counts).",
            f"Class counts are not balanced: {counts.to_dict()}"
        )

    def test_total_row_count(self):
        """The result has twice as many rows as the original majority class."""
        majority_count = self.df["target"].value_counts().max()
        balanced_df = self._oversampled()
        expected_total = 2 * majority_count
        _dynamic_test(
            self,
            balanced_df.shape[0] == expected_total,
            "Total number of rows is twice the original majority class count.",
            f"Expected {expected_total} rows, got {balanced_df.shape[0]}"
        )

    def test_no_column_alteration(self):
        """No column is missing from, or added to, the result."""
        balanced_df = self._oversampled()
        _dynamic_test(
            self,
            all(col in balanced_df.columns for col in self.df.columns) and all(col in self.df.columns for col in balanced_df.columns),
            "No columns were dropped or altered in the returned DataFrame.",
            f"Returned columns: {balanced_df.columns.tolist()}, original columns: {self.df.columns.tolist()}"
        )

def _dynamic_test(test_case, condition, success_message, failure_message):
    """Report *condition* through *test_case* under a descriptive name.

    The test's ``_testMethodName`` is overwritten with the message that
    matches the outcome, then the test is passed (``assertTrue``) or failed
    (``fail``) with that same message.
    """
    passed = bool(condition)
    message = success_message if passed else failure_message
    test_case._testMethodName = message
    if passed:
        test_case.assertTrue(True, message)
    else:
        test_case.fail(message)

def normalize_text(text):
    """Return *text* lower-cased with whitespace and punctuation spacing tidied.

    Steps, in order:
      1. lower-case the text;
      2. collapse every run of two or more whitespace characters into one
         space;
      3. remove whitespace before ``,``, ``:`` and ``?`` and leave exactly one
         space after them;
      4. strip leading and trailing whitespace.
    """
    text = text.lower()
    # Single backslashes: in a raw string "\\s" would match a literal
    # backslash followed by "s", not whitespace.
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"\s*([,:?])\s*", r"\1 ", text)
    return text.strip()

def change_var(code: str, var_name: str, value: str) -> str:
    """Rewrite every top-level ``var_name = ...`` assignment in *code*.

    Each matching statement, including the continuation lines of a
    multi-line right-hand side, is replaced by the single line
    ``var_name = value``. Only plain assignments in the module body that
    have the bare name as a target are considered; other statements
    (comments, decorators, following definitions) are left untouched.

    Args:
        code: Python source text; it is parsed with ``ast.parse``.
        var_name: Name of the variable whose assignments are rewritten.
        value: Source text of the new right-hand side, inserted verbatim.

    Returns:
        The rewritten source with lines joined by newline characters (a
        trailing newline is not kept), or *code* itself when there is no
        matching assignment.

    Raises:
        SyntaxError: If *code* cannot be parsed.
    """
    tree = ast.parse(code)
    lines = code.splitlines()

    matches = [
        (index, node)
        for index, node in enumerate(tree.body)
        if isinstance(node, ast.Assign)
        and any(isinstance(target, ast.Name) and target.id == var_name for target in node.targets)
    ]
    if not matches:
        return code

    # Work bottom-up so that collapsing a statement never shifts the line
    # numbers of the matches that are still to be processed.
    for index, node in reversed(matches):
        start = node.lineno - 1
        # end_lineno is the statement's last line (1-based, inclusive), which
        # is exactly the exclusive 0-based slice bound we need.
        stop = getattr(node, "end_lineno", None)
        if stop is None:
            # Older interpreters do not record end positions: fall back to
            # "everything up to the next top-level statement".
            following = tree.body[index + 1:]
            stop = following[0].lineno - 1 if following else len(lines)
        # Always replace at least the assignment's own first line.
        stop = max(stop, start + 1)

        line = lines[start]
        indent = ' ' * (len(line) - len(line.lstrip()))
        lines[start:stop] = [f"{indent}{var_name} = {value}"]

    return '\n'.join(lines)

# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()


test_main.py

実践的かつハンズオンのコースで、現実世界の大規模データ課題に取り組む意欲的なデータサイエンティスト向けです。Pythonと主要なライブラリを用いて、大規模データセットの効率的な処理、サンプリング、分析方法を学びます。各セクションには、分かりやすいビデオ解説とインタラクティブな課題が含まれており、専門知識を身につけることができます。

メモリに収まりきらない大規模データセットを扱うための基礎的な戦略として、チャンク処理やストリーミング手法を学びます。

オーバーサンプリングやアンダーサンプリングを含む、大規模データセットのバランス調整およびサンプリング手法を探求します。

高速かつメモリ効率の良いデータ処理のためにpolarsライブラリを使用する方法を学習します。

チャレンジ：オーバーサンプリングを適用する

解答