In this challenge, you will analyze a dataset containing information about individuals' wages and their potential determinants, such as years of education, years of experience, and gender. Your goal is to write a function that fits a linear regression model using this data and then interprets which factor most strongly influences wages. This exercise will help you practice applying regression techniques to real-world economic questions and interpreting the results to inform policy or business decisions.

Start by considering a small dataset that is already structured as a pandas DataFrame. The DataFrame includes the following columns:
- **"wage"**: annual wage in thousands of dollars;
- **"education"**: years of education completed;
- **"experience"**: years of work experience;
- **"gender"**: 1 if male, 0 if female.

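You will use the `scikit-learn` library's `LinearRegression` class to fit the model, and then examine the estimated coefficients to determine which variable has the largest effect on wage. Name your function `analyze_wage_determinants`; it should take the DataFrame as its only argument and return a tuple of two elements: a dictionary mapping each independent variable (`"education"`, `"experience"`, `"gender"`) to its estimated coefficient as a float, and the name of the variable whose coefficient is largest in absolute value.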

import pandas as pd
from sklearn.linear_model import LinearRegression

# Sample of ten individuals used throughout the exercise; every column
# has one entry per person, in the same order.
data = dict(
    wage=[45, 55, 60, 38, 70, 52, 48, 62, 49, 58],
    education=[12, 16, 18, 12, 20, 14, 13, 17, 12, 16],
    experience=[5, 7, 10, 3, 12, 6, 4, 9, 5, 8],
    gender=[1, 0, 1, 0, 1, 0, 0, 1, 0, 1],
)
df = pd.DataFrame(data)

def fit_wage_regression(df):
    """Fit a linear regression of wage on education, experience and gender.

    Args:
        df: DataFrame containing the columns "wage", "education",
            "experience" and "gender".

    Returns:
        dict mapping each regressor name to its estimated coefficient,
        stored as a built-in float.
    """
    X = df[["education", "experience", "gender"]]
    y = df["wage"]
    model = LinearRegression()
    model.fit(X, y)
    # Cast the NumPy scalars in ``coef_`` to plain floats so the dictionary
    # prints cleanly and holds only built-in types.
    return {name: float(coef) for name, coef in zip(X.columns, model.coef_)}


def analyze_wage_determinants(df):
    """Fit the wage regression and identify the strongest determinant.

    Args:
        df: DataFrame containing the columns "wage", "education",
            "experience" and "gender".

    Returns:
        A tuple ``(coefficients, strongest)``: the coefficient dictionary
        produced by ``fit_wage_regression`` and the name of the variable
        whose coefficient is largest in absolute value.
    """
    coefficients = fit_wage_regression(df)
    # Compare magnitudes: a large negative effect counts as much as a
    # large positive one.
    strongest = max(coefficients, key=lambda name: abs(coefficients[name]))
    return coefficients, strongest


coeffs, strongest = analyze_wage_determinants(df)
print("Estimated coefficients:", coeffs)
print("Strongest determinant:", strongest)

After running the function, you will see the estimated coefficients for each variable. The coefficient for **"education"** shows how much the wage is expected to increase for each additional year of education, holding other variables constant. The **"experience"** coefficient reflects the effect of an extra year of work experience, and the **"gender"** coefficient captures the difference in wage between males and females, assuming other factors are equal.

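To interpret which determinant has the largest impact, compare the absolute values of the coefficients. The variable with the largest coefficient (in absolute terms) has the largest estimated effect on wage per one-unit change in this dataset. Keep in mind that the coefficients are measured in different units — a year of education, a year of experience, and a 0/1 gender indicator — so standardize the variables first if you need to compare their relative importance on a common scale. This information is crucial for economists and policymakers when designing interventions to improve earnings or reduce inequality.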


import unittest
import user_code
import ast
import re   
import importlib
import csv
import unittest
import importlib
import pandas as pd

class TestTask(unittest.TestCase):
    """Checks for the learner's ``analyze_wage_determinants`` function."""

    def setUp(self):
        # Fixture: ten observations with wage, education, experience and
        # gender columns, wrapped in a DataFrame.
        self.data = {
            "wage": [45, 55, 60, 38, 70, 52, 48, 62, 49, 58],
            "education": [12, 16, 18, 12, 20, 14, 13, 17, 12, 16],
            "experience": [5, 7, 10, 3, 12, 6, 4, 9, 5, 8],
            "gender": [1, 0, 1, 0, 1, 0, 0, 1, 0, 1]
        }
        self.df = pd.DataFrame(self.data)
        self.expected_vars = {"education", "experience", "gender"}

    def _student_function(self, refresh=True):
        """Look up ``analyze_wage_determinants`` in ``user_code``.

        When *refresh* is true the module is reloaded first. Returns the
        attribute, or ``None`` when the module does not define it.
        """
        import user_code
        if refresh:
            importlib.reload(user_code)
        return getattr(user_code, 'analyze_wage_determinants', None)

    def test_function_exists(self):
        # The function must be present and callable (no reload here).
        candidate = self._student_function(refresh=False)
        _dynamic_test(
            self,
            callable(candidate),
            "Function analyze_wage_determinants is defined.",
            "Function analyze_wage_determinants is not defined or not callable."
        )

    def test_return_type(self):
        # The function must return a 2-tuple.
        candidate = self._student_function()
        if not callable(candidate):
            _dynamic_test(self, False, "Function unavailable", "Function not defined.")
            return
        outcome = candidate(self.df)
        is_pair = isinstance(outcome, tuple) and len(outcome) == 2
        observed_len = len(outcome) if isinstance(outcome, tuple) else 'N/A'
        _dynamic_test(
            self,
            is_pair,
            "Function returns a tuple of length 2.",
            f"Expected a tuple of length 2, got {type(outcome)} with length {observed_len}."
        )

    def test_coefficient_dict(self):
        # First tuple element: dict keyed by the regressors, float values.
        candidate = self._student_function()
        if not callable(candidate):
            _dynamic_test(self, False, "Function unavailable", "Function not defined.")
            return
        outcome = candidate(self.df)
        coeffs = None if outcome is None else outcome[0]
        _dynamic_test(
            self,
            isinstance(coeffs, dict),
            "First element of tuple is a dictionary.",
            f"Expected dict, got {type(coeffs)}."
        )
        if not isinstance(coeffs, dict):
            return
        found_keys = set(coeffs.keys())
        _dynamic_test(
            self,
            found_keys == self.expected_vars,
            "Dictionary keys match independent variables.",
            f"Expected keys {self.expected_vars}, got {found_keys}."
        )
        for name, value in coeffs.items():
            _dynamic_test(
                self,
                isinstance(value, float),
                f"Coefficient for {name} is a float.",
                f"Coefficient for {name} is not a float: {value}."
            )

    def test_strongest_impact(self):
        # Second tuple element: the variable with the largest |coefficient|.
        candidate = self._student_function()
        if not callable(candidate):
            _dynamic_test(self, False, "Function unavailable", "Function not defined.")
            return
        outcome = candidate(self.df)
        if outcome is None:
            coeffs = max_var = None
        else:
            coeffs, max_var = outcome[0], outcome[1]
        _dynamic_test(
            self,
            max_var in self.expected_vars,
            "Second element of tuple is a valid variable name.",
            f"Expected one of {self.expected_vars}, got {max_var}."
        )
        if isinstance(coeffs, dict):
            magnitudes = {name: abs(value) for name, value in coeffs.items()}
            expected_max = max(magnitudes, key=magnitudes.get)
            _dynamic_test(
                self,
                max_var == expected_max,
                f"Variable with largest absolute coefficient is correctly identified: {max_var}.",
                f"Expected {expected_max}, got {max_var}."
            )

def _dynamic_test(test_case, condition, success_message, failure_message):
    if condition:
        test_case._testMethodName = success_message
        test_case.assertTrue(True, success_message)
    else:
        test_case._testMethodName = failure_message
        test_case.fail(failure_message)

def normalize_text(text):
    """Return *text* lower-cased with normalized spacing.

    Runs of two or more whitespace characters collapse to a single space,
    and each comma, colon or question mark loses any surrounding whitespace
    and is followed by exactly one space. Leading and trailing whitespace
    is stripped from the result.
    """
    text = text.lower()
    # Raw strings need a single backslash: r"\s" is the whitespace class,
    # whereas r"\\s" would match a literal backslash followed by "s".
    text = re.sub(r"\s{2,}", " ", text)
    # r"\1" re-inserts the captured punctuation mark.
    text = re.sub(r"\s*([,:?])\s*", r"\1 ", text)
    return text.strip()

def change_var(code: str, var_name: str, value: str) -> str:
    """Rewrite every top-level assignment to *var_name* as ``var_name = value``.

    Only plain ``name = ...`` statements directly in the module body are
    considered; assignments nested inside functions, classes or other
    compound statements are left alone.

    Args:
        code: Python source text; it must parse with ``ast.parse``.
        var_name: Name of the variable whose assignments are replaced.
        value: Source text placed on the right-hand side of the assignment.

    Returns:
        The rewritten source with lines joined by newlines, or *code*
        unchanged when no matching assignment exists.

    Raises:
        SyntaxError: If *code* is not valid Python.
    """
    tree = ast.parse(code)
    lines = code.splitlines()

    # (position in module body, node) for every matching assignment.
    matches = [
        (index, node)
        for index, node in enumerate(tree.body)
        if isinstance(node, ast.Assign)
        and any(isinstance(target, ast.Name) and target.id == var_name for target in node.targets)
    ]
    if not matches:
        return code

    # Work from the last match to the first so that deleting lines never
    # shifts the line numbers of matches still to be processed.
    for index, node in reversed(matches):
        start_line = node.lineno - 1
        line = lines[start_line]
        indent = ' ' * (len(line) - len(line.lstrip()))
        # The whole physical line is overwritten with the new assignment.
        lines[start_line] = f"{indent}{var_name} = {value}"

        # Everything up to the next top-level statement belongs to the old
        # assignment (continuation lines of a multi-line value, plus any
        # blank or comment lines in between) and is dropped.
        next_line = next(
            (
                following.lineno - 1
                for following in tree.body[index + 1:]
                if hasattr(following, 'lineno')
            ),
            len(lines),
        )
        if next_line > start_line + 1:
            del lines[start_line + 1:next_line]

    # Join with a real newline; the escaped form '\\n' would emit a literal
    # backslash followed by "n" and produce unparsable output.
    return '\n'.join(lines)

# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()


test_main.py

A practical course designed for economists to leverage Python for data analysis, economic modeling, and visualization. Learn how to apply Python's powerful libraries and programming techniques to solve real-world economic problems.

Learn how to use Python for analyzing and interpreting economic data, focusing on data structures, manipulation, and summary statistics relevant to economics.

Apply Python to build and interpret economic models, including linear regression and forecasting, using real-world economic data.

Explore advanced techniques for economic analysis, including correlation, multivariate visualization, and basic simulation, using Python.

Challenge: Wage Determinants Regression

Solution