import unittest
import user_code
import ast
import re   
import importlib
import csv
import unittest
import numpy as np
import importlib

class TestTask(unittest.TestCase):
    """Checks for ``user_code.q_learning_update(q_table, transitions, alpha, gamma)``.

    Each test reloads ``user_code`` so it runs against a fresh copy of the
    submission, calls the function on a small NumPy Q-table, and reports the
    outcome through ``_dynamic_test``. The expected values in the comments
    treat each transition tuple as (state, action, reward, next_state) and
    apply Q[s, a] += alpha * (reward + gamma * max(Q[next_state]) - Q[s, a]).
    """

    def test_single_transition_update(self):
        """A single transition changes exactly one cell of the table."""
        import user_code
        importlib.reload(user_code)
        q_table = np.array([
            [0.5, 0.2, 0.1],
            [0.0, 0.3, 0.4],
            [0.6, 0.1, 0.2]
        ])
        transitions = [
            (0, 1, 1.0, 2)
        ]
        alpha = 0.1
        gamma = 0.9
        expected = np.copy(q_table)
        # Q[0,1] = 0.2 + 0.1 * (1.0 + 0.9*0.6 - 0.2) = 0.2 + 0.1 * 1.34 = 0.334
        expected[0, 1] = 0.334
        user_code.q_learning_update(q_table, transitions, alpha, gamma)
        _dynamic_test(
            self,
            np.allclose(q_table, expected),
            "Q-table correctly updated for single transition.",
            f"Expected Q-table: {expected}, got: {q_table}"
        )

    def test_multiple_transitions_update(self):
        """Transitions are applied in order, each seeing earlier updates."""
        import user_code
        importlib.reload(user_code)
        q_table = np.array([
            [0.5, 0.2, 0.1],
            [0.0, 0.3, 0.4],
            [0.6, 0.1, 0.2]
        ])
        transitions = [
            (0, 1, 1.0, 2),
            (2, 0, 0.5, 1),
            (1, 2, 0.0, 0)
        ]
        alpha = 0.1
        gamma = 0.9
        expected = np.array([
            [0.5, 0.334, 0.1],  # after first update
            [0.0, 0.3, 0.4],
            [0.6, 0.1, 0.2]
        ])
        # second update: Q[2,0] = 0.6 + 0.1 * (0.5 + 0.9*0.4 - 0.6) = 0.6 + 0.1*0.26 = 0.626
        expected[2, 0] = 0.626
        # third update: Q[1,2] = 0.4 + 0.1 * (0.0 + 0.9*0.5 - 0.4) = 0.4 + 0.1*0.05 = 0.405
        expected[1, 2] = 0.405
        user_code.q_learning_update(q_table, transitions, alpha, gamma)
        _dynamic_test(
            self,
            np.allclose(q_table, expected, atol=1e-6),
            "Q-table correctly updated for all transitions.",
            f"Expected Q-table: {expected}, got: {q_table}"
        )

    def test_inplace_modification(self):
        """The array passed in must itself be mutated by the call."""
        import user_code
        importlib.reload(user_code)
        q_table = np.ones((2, 2))
        transitions = [(0, 1, 2, 1)]
        alpha = 0.5
        gamma = 0.5
        # Comparing id(q_table) before and after the call is always true (the
        # callee cannot rebind this local name), so snapshot the contents and
        # require that the caller's array actually changed instead.
        snapshot = q_table.copy()
        user_code.q_learning_update(q_table, transitions, alpha, gamma)
        # Q[0,1] = 1 + 0.5 * (2 + 0.5*1 - 1) = 1.75, so an in-place update
        # must make the array differ from the all-ones snapshot.
        _dynamic_test(
            self,
            not np.array_equal(q_table, snapshot),
            "Q-table modified in-place.",
            "Q-table was not modified in-place."
        )

    def test_uses_max_q_of_next_state(self):
        """The bootstrap term is the largest Q-value of the next state's row."""
        import user_code
        importlib.reload(user_code)
        q_table = np.array([
            [1.0, 2.0, 3.0],
            [4.0, 5.0, 6.0],
            [7.0, 8.0, 9.0]
        ])
        transitions = [(0, 0, 1.0, 2)]
        alpha = 0.2
        gamma = 0.5
        # max Q of next_state=2 is 9.0
        # Q[0,0] = 1.0 + 0.2 * (1.0 + 0.5*9.0 - 1.0) = 1.0 + 0.2*4.5 = 1.9
        expected = np.array([
            [1.9, 2.0, 3.0],
            [4.0, 5.0, 6.0],
            [7.0, 8.0, 9.0]
        ])
        user_code.q_learning_update(q_table, transitions, alpha, gamma)
        _dynamic_test(
            self,
            np.allclose(q_table, expected, atol=1e-6),
            "Q-table update uses max Q-value of next state.",
            f"Expected Q-table: {expected}, got: {q_table}"
        )

def _dynamic_test(test_case, condition, success_message, failure_message):
    """Record a pass or a failure on *test_case*, relabelling it with a message.

    The test case's ``_testMethodName`` is overwritten with whichever message
    applies before the outcome is reported, so the message replaces the
    method name stored on the test case.

    Args:
        test_case: Object exposing ``assertTrue`` and ``fail`` (the callers in
            this file pass a ``unittest.TestCase``).
        condition: Truthy when the check passed.
        success_message: Label and assertion message used on success.
        failure_message: Label and failure message used otherwise.
    """
    # Handle the failing outcome first so the success path reads straight through.
    if not condition:
        test_case._testMethodName = failure_message
        test_case.fail(failure_message)
        return

    test_case._testMethodName = success_message
    test_case.assertTrue(True, success_message)

def normalize_text(text):
    """Return *text* lower-cased with whitespace and punctuation spacing normalized.

    Steps, in order:
      1. Lower-case the string.
      2. Replace every run of two or more whitespace characters with one space.
      3. Remove whitespace around each ``,``, ``:`` or ``?`` and follow the
         mark with exactly one space.
      4. Strip leading and trailing whitespace.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    text = text.lower()
    # In a raw string a single backslash is already the regex escape; a
    # doubled one would match a literal backslash followed by "s".
    text = re.sub(r"\s{2,}", " ", text)
    # \1 re-emits the captured punctuation mark, then one space.
    text = re.sub(r"\s*([,:?])\s*", r"\1 ", text)
    return text.strip()

def change_var(code: str, var_name: str, value: str) -> str:
    """Rewrite every top-level ``var_name = ...`` assignment in *code*.

    Only module-level ``ast.Assign`` statements with a plain-name target equal
    to *var_name* are considered; assignments nested in functions, classes or
    other blocks are left alone. Each matching statement, including any
    continuation lines, is replaced by the single line ``var_name = value``.

    Args:
        code: Python source text; it must parse with ``ast.parse``.
        var_name: Name of the variable whose assignments are rewritten.
        value: Source text placed on the right-hand side, inserted verbatim.

    Returns:
        The rewritten source with lines joined by newline characters, or
        *code* unchanged when no matching assignment exists.

    Raises:
        SyntaxError: If *code* is not valid Python (raised by ``ast.parse``).
    """
    tree = ast.parse(code)
    matches = [
        (index, node)
        for index, node in enumerate(tree.body)
        if isinstance(node, ast.Assign)
        and any(isinstance(target, ast.Name) and target.id == var_name for target in node.targets)
    ]

    # Nothing to rewrite: hand the input back untouched.
    if not matches:
        return code

    lines = code.splitlines()

    # Work from the last match to the first so removing continuation lines
    # never shifts the line numbers of matches still to be processed.
    for index, node in reversed(matches):
        start = node.lineno - 1
        first_line = lines[start]
        indent = ' ' * (len(first_line) - len(first_line.lstrip()))

        # Prefer the statement's own end line so comments, blank lines and
        # decorators that belong to the following statement are preserved.
        end = getattr(node, 'end_lineno', None)
        if end is None:
            # Older ASTs carry no end position: fall back to everything up to
            # the next top-level statement (or the end of the file).
            end = len(lines)
            for following in tree.body[index + 1:]:
                if hasattr(following, 'lineno'):
                    end = following.lineno - 1
                    break
        # Always replace at least the statement's first line.
        end = max(end, start + 1)

        lines[start:end] = [f"{indent}{var_name} = {value}"]

    # Join with a real newline; an escaped backslash here would emit the two
    # characters "\" and "n" between lines instead of line breaks.
    return '\n'.join(lines)

# Run the test suite through unittest's command-line runner when this file is
# executed directly as a script rather than imported.
if __name__ == "__main__":
    unittest.main()


test_main.py

Practice implementing Q-learning and SARSA from scratch with Python. Includes step-by-step coding exercises, intuitive explanations, and simple tasks like grid-world navigation.

Explore the foundations of reinforcement learning, implement Q-learning and SARSA from scratch, and apply them to simple environments with hands-on coding, explanations, and challenges.

Challenge: Q-table Update with Q-learning

Solution