import unittest
import user_code
import ast
import re   
import importlib
import csv
import unittest
import numpy as np
import gymnasium as gym
import importlib
import sys

class TestTask(unittest.TestCase):
    """Checks for ``user_code.q_learning_with_exploration_rate`` on FrozenLake.

    The action-selection tests replace ``np.random.uniform``, ``np.argmax``
    and ``env.action_space.sample`` with recording stubs, and replace
    ``env.reset`` / ``env.step`` so that an episode ends after one step.
    Every global NumPy patch is restored in a ``finally`` block, and the
    assertion is evaluated only after the patches have been undone.
    """

    def setUp(self):
        # Drop any cached copy so the `import user_code` inside each test
        # executes the submission afresh (no extra reload is needed).
        sys.modules.pop('user_code', None)

    def _make_env(self):
        """Create the deterministic FrozenLake env and close it after the test."""
        env = gym.make("FrozenLake-v1", is_slippery=False)
        self.addCleanup(env.close)
        return env

    @staticmethod
    def _stub_single_step(env):
        """Make every episode a single transition: state 0 -> state 1, reward 1, terminated."""
        # Variadic stubs tolerate call styles such as env.reset(seed=...).
        env.reset = lambda *args, **kwargs: (0, {})
        env.step = lambda *args, **kwargs: (1, 1, True, False, {})

    def test_exploration_rate_random_action(self):
        """A draw below exploration_rate must go through ``env.action_space.sample``."""
        import user_code
        env = self._make_env()
        self._stub_single_step(env)

        action_chosen = []

        def sample_patch(*args, **kwargs):
            action_chosen.append(2)
            return 2

        # Capture every original before patching anything, so the `finally`
        # block can always restore all of them.
        original_uniform = np.random.uniform
        original_argmax = np.argmax
        original_sample = env.action_space.sample
        try:
            # 0.05 is less than the exploration_rate of 0.8 passed below.
            np.random.uniform = lambda *args, **kwargs: 0.05
            # If the greedy branch were taken instead, the action would be 1.
            np.argmax = lambda *args, **kwargs: 1
            env.action_space.sample = sample_patch
            user_code.q_learning_with_exploration_rate(env, 1, 0.1, 0.99, 0.8)
        finally:
            np.random.uniform = original_uniform
            np.argmax = original_argmax
            env.action_space.sample = original_sample

        _dynamic_test(
            self,
            action_chosen and action_chosen[0] == 2,
            "Random action is chosen when random value < exploration_rate.",
            f"Expected random action 2, got {action_chosen[0] if action_chosen else None}."
        )

    def test_exploration_rate_best_action(self):
        """A draw above exploration_rate must go through ``np.argmax``."""
        import user_code
        env = self._make_env()
        self._stub_single_step(env)

        best_action_used = []

        def argmax_patch(*args, **kwargs):
            best_action_used.append(3)
            return 3

        original_uniform = np.random.uniform
        original_argmax = np.argmax
        original_sample = env.action_space.sample
        try:
            # 0.95 is greater than the exploration_rate of 0.8 passed below.
            np.random.uniform = lambda *args, **kwargs: 0.95
            np.argmax = argmax_patch
            # The random branch would yield 2; it should NOT be taken here.
            env.action_space.sample = lambda *args, **kwargs: 2
            user_code.q_learning_with_exploration_rate(env, 1, 0.1, 0.99, 0.8)
        finally:
            np.random.uniform = original_uniform
            np.argmax = original_argmax
            env.action_space.sample = original_sample

        _dynamic_test(
            self,
            best_action_used and best_action_used[0] == 3,
            "Best action is chosen when random value >= exploration_rate.",
            f"Expected best action 3, got {best_action_used[0] if best_action_used else None}."
        )

    def test_q_table_update(self):
        """One greedy step from an all-zero table must set Q[0, 0] to lr * reward."""
        import user_code
        env = self._make_env()
        self._stub_single_step(env)

        original_uniform = np.random.uniform
        try:
            # 0.95 is above exploration_rate=0.1, so the best action is selected.
            np.random.uniform = lambda *args, **kwargs: 0.95
            q_table = user_code.q_learning_with_exploration_rate(env, episodes=1, learning_rate=0.5, discount_factor=0.9, exploration_rate=0.1)
        finally:
            np.random.uniform = original_uniform

        # Only one update should have happened: state=0, action=0
        # (since q_table is zeros, argmax is 0).
        expected = 0.5 * (1 + 0.9 * 0)  # reward=1, next_max=0, learning_rate=0.5
        # Guard the lookup so a non-array return value produces the intended
        # failure message instead of a TypeError while building it.
        updated = q_table[0, 0] if isinstance(q_table, np.ndarray) else None
        _dynamic_test(
            self,
            updated is not None and np.isclose(updated, expected),
            "Q-table updated correctly with Q-learning rule.",
            f"Expected Q-table[0,0]={expected}, got {updated}"
        )

    def test_q_table_return_shape(self):
        """The returned table must be an ndarray of shape (n_states, n_actions)."""
        import user_code
        env = self._make_env()
        q_table = user_code.q_learning_with_exploration_rate(env, episodes=10, learning_rate=0.1, discount_factor=0.99, exploration_rate=0.5)
        expected_shape = (env.observation_space.n, env.action_space.n)
        # getattr keeps the failure message printable when the submission
        # returns something without a `.shape` (for example None).
        actual_shape = getattr(q_table, 'shape', None)
        _dynamic_test(
            self,
            isinstance(q_table, np.ndarray) and actual_shape == expected_shape,
            "Returned Q-table has correct shape.",
            f"Expected shape {expected_shape}, got {actual_shape}"
        )

def _dynamic_test(test_case, condition, success_message, failure_message):
    """Report ``condition`` through ``test_case`` with a descriptive label.

    The test's ``_testMethodName`` is overwritten with the message that
    matches the outcome. A truthy ``condition`` then records a trivially
    passing assertion; a falsy one calls ``test_case.fail`` with
    ``failure_message``.
    """
    passed = bool(condition)
    outcome = success_message if passed else failure_message
    test_case._testMethodName = outcome
    if not passed:
        test_case.fail(outcome)
    else:
        test_case.assertTrue(True, outcome)

def normalize_text(text):
    """Return ``text`` lowercased with whitespace and punctuation spacing normalized.

    Runs of two or more whitespace characters collapse to a single space,
    each ``,``, ``:`` or ``?`` loses surrounding whitespace and is followed
    by exactly one space, and the result is stripped at both ends.
    """
    text = text.lower()
    # The patterns use single backslashes: inside a raw string, r"\\s" would
    # match a literal backslash followed by "s", not whitespace.
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"\s*([,:?])\s*", r"\1 ", text)
    return text.strip()

def change_var(code: str, var_name: str, value: str) -> str:
    """Rewrite every top-level assignment to ``var_name`` as ``var_name = value``.

    Args:
        code: Python source text; it must parse with ``ast.parse``.
        var_name: Name of the module-level variable to reassign.
        value: Source text of the new right-hand side, inserted verbatim.

    Returns:
        ``code`` unchanged when no top-level ``ast.Assign`` targets
        ``var_name``. Otherwise the source with each matching statement
        (including its continuation lines) replaced by one line. The result
        is rebuilt from ``splitlines()``, so a trailing newline is not kept.

    Raises:
        SyntaxError: If ``code`` is not valid Python.
    """
    tree = ast.parse(code)
    matches = [
        node
        for node in tree.body
        if isinstance(node, ast.Assign)
        and any(isinstance(target, ast.Name) and target.id == var_name for target in node.targets)
    ]

    # If nothing to change, return unmodified code
    if not matches:
        return code

    lines = code.splitlines()
    # Work from the last match to the first so earlier line numbers stay valid.
    for node in reversed(matches):
        start_line = node.lineno - 1
        line = lines[start_line]
        indent = ' ' * (len(line) - len(line.lstrip()))
        # Replace exactly the statement's own span (lineno..end_lineno), so
        # comments, blank lines and decorators that follow it are preserved.
        lines[start_line:node.end_lineno] = [f"{indent}{var_name} = {value}"]

    # Join with a real newline; '\\n' would insert a literal backslash + "n".
    return '\n'.join(lines)

# Run the unittest command-line runner when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()


test_main.py

Practice implementing Q-learning and SARSA from scratch with Python. Includes step-by-step coding exercises, intuitive explanations, and simple tasks like grid-world navigation.

Explore the foundations of reinforcement learning, implement Q-learning and SARSA from scratch, and apply them to simple environments with hands-on coding, explanations, and challenges.

Challenge: Modify Exploration Rate

解答