大規模なデータセットを扱う際、ファイル全体をメモリに読み込まずに集計処理を行う必要がよくあります。一般的な作業の一つは、非常に大きなCSVファイル内の特定の列の値を合計することです。ファイルがメモリに収まらない場合でも、`pandas` の `read_csv()` 関数と `chunksize` パラメータを使用して、処理可能なチャンクごとにデータを読み込むことができます。

チャンクごとに目的の列の合計を計算し、それらの部分合計を足し合わせて総合計を求めます。この方法は効率的かつスケーラブルであり、各チャンクがメモリに収まる限り、ほぼ任意のサイズのファイルを扱うことができます。


import unittest
import user_code
import ast
import re   
import importlib
import csv
import unittest
import importlib
import os
import csv
import pandas as pd

class TestTask(unittest.TestCase):
    """Exercise ``user_code.sum_column_in_chunks`` against small CSV fixtures.

    Every test gets a fresh set of fixture files (written by ``setUp`` into
    the current working directory and deleted by ``tearDown``) and reloads
    ``user_code`` before calling it.
    """

    def setUp(self):
        """Create the five CSV fixture files used by the tests."""
        self.one_chunk_file = "test_one_chunk.csv"
        self.multi_chunk_file = "test_multi_chunk.csv"
        self.float_file = "test_float.csv"
        self.empty_col_file = "test_empty_col.csv"
        self.no_col_file = "test_no_col.csv"

        # (path, header row, data rows) for each scenario, written in order.
        fixtures = (
            # Three integer rows.
            (
                self.one_chunk_file,
                ["amount", "desc"],
                [[10, "a"], [20, "b"], [30, "c"]],
            ),
            # Six integer rows (1..6).
            (
                self.multi_chunk_file,
                ["value", "desc"],
                [[n, f"row{n}"] for n in range(1, 7)],
            ),
            # Float values.
            (
                self.float_file,
                ["price", "desc"],
                [[1.5, "x"], [2.5, "y"], [3.0, "z"]],
            ),
            # Target column present but every cell is empty.
            (
                self.empty_col_file,
                ["col1", "col2"],
                [["", "a"], ["", "b"]],
            ),
            # File whose header does not contain the column the test asks for.
            (
                self.no_col_file,
                ["foo", "bar"],
                [[1, 2], [3, 4]],
            ),
        )
        for path, header, rows in fixtures:
            with open(path, "w", newline="") as handle:
                out = csv.writer(handle)
                out.writerow(header)
                out.writerows(rows)

    def tearDown(self):
        """Remove whichever fixture files still exist."""
        fixture_paths = (
            self.one_chunk_file,
            self.multi_chunk_file,
            self.float_file,
            self.empty_col_file,
            self.no_col_file,
        )
        for path in fixture_paths:
            if os.path.exists(path):
                os.remove(path)

    def test_sum_one_chunk(self):
        # chunksize=100 is larger than the 3 data rows.
        import user_code
        importlib.reload(user_code)
        total = user_code.sum_column_in_chunks(self.one_chunk_file, "amount", 100)
        is_correct = total == 60
        _dynamic_test(
            self,
            condition=is_correct,
            success_message="Returns correct sum when file fits in one chunk.",
            failure_message=f"Expected 60, got {total} for one chunk file.",
        )

    def test_sum_multiple_chunks(self):
        # chunksize=2 over 6 data rows, so 3 chunks.
        import user_code
        importlib.reload(user_code)
        total = user_code.sum_column_in_chunks(self.multi_chunk_file, "value", 2)
        is_correct = total == 21
        _dynamic_test(
            self,
            condition=is_correct,
            success_message="Returns correct sum when file is read in multiple chunks.",
            failure_message=f"Expected 21, got {total} for multi-chunk file.",
        )

    def test_sum_float_column(self):
        # Compared with a tolerance rather than exact equality.
        import user_code
        importlib.reload(user_code)
        total = user_code.sum_column_in_chunks(self.float_file, "price", 2)
        is_correct = abs(total - 7.0) < 1e-8
        _dynamic_test(
            self,
            condition=is_correct,
            success_message="Returns correct sum for float column.",
            failure_message=f"Expected 7.0, got {total} for float column.",
        )

    def test_sum_empty_column(self):
        # Every cell of "col1" is empty in this fixture.
        import user_code
        importlib.reload(user_code)
        total = user_code.sum_column_in_chunks(self.empty_col_file, "col1", 1)
        is_correct = total == 0
        _dynamic_test(
            self,
            condition=is_correct,
            success_message="Returns 0 for column with no values.",
            failure_message=f"Expected 0, got {total} for empty column.",
        )

    def test_sum_column_not_exist(self):
        # "notacol" is not in the fixture's header row.
        import user_code
        importlib.reload(user_code)
        total = user_code.sum_column_in_chunks(self.no_col_file, "notacol", 1)
        is_correct = total == 0
        _dynamic_test(
            self,
            condition=is_correct,
            success_message="Returns 0 if column does not exist.",
            failure_message=f"Expected 0, got {total} for missing column.",
        )

def _dynamic_test(test_case, condition, success_message, failure_message):
    """Report *condition* on *test_case* under a human-readable name.

    The test case's ``_testMethodName`` is overwritten with the message that
    matches the outcome, then the case is passed (via ``assertTrue``) or
    failed (via ``fail``) with that same message.

    Args:
        test_case: Object exposing ``_testMethodName``, ``assertTrue`` and
            ``fail`` (the tests in this file pass a ``unittest.TestCase``).
        condition: Truthy when the check succeeded.
        success_message: Name/message to use when *condition* is truthy.
        failure_message: Name/message to use when *condition* is falsy.
    """
    passed = bool(condition)
    message = success_message if passed else failure_message

    # Rename first so the message is in place before any failure is raised.
    test_case._testMethodName = message
    if passed:
        test_case.assertTrue(True, message)
    else:
        test_case.fail(message)

def normalize_text(text):
    """Return *text* lower-cased with whitespace and punctuation spacing normalised.

    Steps, in order:
      1. Lower-case the text.
      2. Collapse every run of two or more whitespace characters to one space.
      3. Rewrite each ``,``, ``:`` or ``?`` so that it has no whitespace
         before it and exactly one space after it.
      4. Strip leading and trailing whitespace.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    text = text.lower()
    # Raw strings already pass the backslash through to the regex engine, so
    # the whitespace class is written with a single backslash; doubling it
    # would match a literal backslash followed by the letter "s".
    text = re.sub(r"\s{2,}", " ", text)
    # r"\1" is the back-reference to the captured punctuation mark.
    text = re.sub(r"\s*([,:?])\s*", r"\1 ", text)
    return text.strip()

def change_var(code: str, var_name: str, value: str) -> str:
    """Replace every top-level ``var_name = ...`` assignment in *code*.

    Only plain assignment statements (``ast.Assign``) that sit directly in the
    module body and have a bare name target equal to *var_name* are rewritten;
    annotated or augmented assignments and assignments nested inside
    functions, classes or other blocks are left alone. Each matching
    statement, including all of its continuation lines, is replaced by the
    single line ``var_name = value``. Surrounding comments, blank lines and
    decorators of following definitions are preserved.

    Args:
        code: Python source text to rewrite.
        var_name: Name of the variable whose assignments should be replaced.
        value: Source text of the new right-hand side, inserted verbatim.

    Returns:
        The rewritten source with lines joined by ``"\\n"`` (a trailing
        newline in *code* is not kept), or *code* itself, unchanged, when no
        matching assignment exists.

    Raises:
        SyntaxError: If *code* cannot be parsed by ``ast.parse``.
    """
    tree = ast.parse(code)
    lines = code.splitlines()

    # (position in tree.body, node) for every assignment that must change.
    assign_nodes = [
        (i, node)
        for i, node in enumerate(tree.body)
        if isinstance(node, ast.Assign)
        and any(isinstance(target, ast.Name) and target.id == var_name for target in node.targets)
    ]

    # If nothing to change, return unmodified code
    if not assign_nodes:
        return code

    # Work from the last match to the first so that collapsing a multi-line
    # statement never shifts the line numbers of matches still to be handled.
    for i, node in reversed(assign_nodes):
        start_line = node.lineno - 1

        # Prefer the statement's own end line: inferring the span from the
        # next node's lineno would also swallow comments, blank lines and
        # decorators that sit between the two statements.
        end_line = getattr(node, "end_lineno", None)
        if end_line is None:
            # Fallback for parsers that do not record end_lineno: extend up
            # to the line before the next top-level statement.
            end_line = len(lines)
            for next_node in tree.body[i + 1:]:
                if hasattr(next_node, 'lineno'):
                    end_line = next_node.lineno - 1
                    break
            # Always replace at least the assignment's first line.
            end_line = max(end_line, start_line + 1)

        line = lines[start_line]
        indent = ' ' * (len(line) - len(line.lstrip()))
        lines[start_line:end_line] = [f"{indent}{var_name} = {value}"]

    # Join with a real newline (not the two characters backslash + "n").
    return '\n'.join(lines)

# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()


test_main.py

実践的かつハンズオンのコースで、現実世界の大規模データ課題に取り組む意欲的なデータサイエンティスト向けです。Pythonと主要なライブラリを用いて、大規模データセットの効率的な処理、サンプリング、分析方法を学びます。各セクションには、分かりやすいビデオ解説とインタラクティブな課題が含まれており、専門知識を身につけることができます。

メモリに収まりきらない大規模データセットを扱うための基礎的な戦略として、チャンク処理やストリーミング手法を学びます。

オーバーサンプリングやアンダーサンプリングを含む、大規模データセットのバランス調整およびサンプリング手法を探求します。

高速かつメモリ効率の良いデータ処理のためにpolarsライブラリを使用する方法を学習します。

チャレンジ：チャンク化データの集計

解答