You are often faced with the challenge of comparing environmental data from different locations to determine whether observed differences are meaningful or simply due to random variation. Suppose you have daily PM2.5 (particulate matter with a diameter less than 2.5 micrometers) measurements from two air quality monitoring stations. Your goal is to assess whether there is a statistically significant difference in PM2.5 levels between these stations.

To do this, you will use a statistical hypothesis test known as the **independent two-sample t-test**. This test helps you decide if the means of two independent groups are significantly different from each other. In this context, your two groups are the PM2.5 measurements from each station.

First, create two pandas DataFrames, each containing daily PM2.5 measurements for one station. You will then use the `scipy.stats` module to perform the t-test. The t-test will provide a **p-value**, which tells you the probability of observing a difference in means at least as extreme as the one in your data if there were actually no difference between the stations. A common threshold for statistical significance is 0.05: if the p-value is less than 0.05, you can conclude that the difference is statistically significant.

You will interpret the results by reporting both the p-value and your conclusion about whether the stations differ significantly in PM2.5 levels.


import unittest
import user_code
import ast
import re   
import importlib
import csv
import unittest
import importlib
import sys
import io

class TestTask(unittest.TestCase):
    """Grader checks run against the learner's ``user_code`` module.

    ``setUp`` re-executes ``user_code`` exactly once per test with stdout
    captured, so every test sees a freshly executed module plus everything it
    printed, and nothing the learner prints leaks to the real stdout.
    """

    def setUp(self):
        """Reload ``user_code`` once, keeping the module and its printed output."""
        # Function-scope imports: ``contextlib`` is not imported at file level,
        # and ``user_code`` is imported locally as the original tests did.
        from contextlib import redirect_stdout
        import user_code

        captured = io.StringIO()
        with redirect_stdout(captured):
            # importlib.reload re-runs the module body and returns the module.
            self.user_code = importlib.reload(user_code)
        self.raw_output = captured.getvalue()

    def _check_frame(self, name):
        """Report whether ``user_code.<name>`` has >= 8 rows and a 'PM2.5' column.

        The ``hasattr`` guards make an object of the wrong type (e.g. a list or
        an array without ``columns``) produce a normal test failure with the
        explanatory message instead of an ``AttributeError``.
        """
        frame = getattr(self.user_code, name, None)
        is_valid = (
            frame is not None
            and hasattr(frame, 'shape')
            and hasattr(frame, 'columns')
            and frame.shape[0] >= 8
            and 'PM2.5' in frame.columns
        )
        _dynamic_test(
            self,
            is_valid,
            f"{name} exists with at least 8 rows and column 'PM2.5'",
            f'{name} does not exist, does not have at least 8 rows, or is missing column "PM2.5". Value: {frame}'
        )

    def test_df_x_exists_and_valid(self):
        """``df_x`` must exist with at least 8 rows and a 'PM2.5' column."""
        self._check_frame('df_x')

    def test_df_y_exists_and_valid(self):
        """``df_y`` must exist with at least 8 rows and a 'PM2.5' column."""
        self._check_frame('df_y')

    def test_ttest_ind_used_and_printed(self):
        """The printed output must contain the t-statistic and p-value labels.

        Only the labels are checked (case-insensitively, via ``normalize_text``);
        the numeric values vary with the learner's data.
        """
        output = normalize_text(self.raw_output)
        _dynamic_test(
            self,
            "t-statistic:" in output and "p-value:" in output,
            'Output includes "T-statistic:" and "P-value:"',
            f'Output missing "T-statistic:" or "P-value:". Output: {output}'
        )

    def test_interpretation_printed(self):
        """One of the two fixed interpretation sentences must be printed."""
        output = normalize_text(self.raw_output)
        msg1 = normalize_text('There is a statistically significant difference in PM2.5 levels between the two stations.')
        msg2 = normalize_text('There is no statistically significant difference in PM2.5 levels between the two stations.')
        _dynamic_test(
            self,
            msg1 in output or msg2 in output,
            'Interpretation statement is printed based on p-value.',
            f'Interpretation message missing. Output: {output}'
        )

def _dynamic_test(test_case, condition, success_message, failure_message):
    """Pass or fail *test_case* according to *condition*, relabelling the test.

    The test's ``_testMethodName`` is overwritten with the message matching the
    outcome before the outcome is reported (presumably so the grading UI shows
    the message instead of the method name -- confirm against the runner).

    Args:
        test_case: object exposing ``_testMethodName``, ``assertTrue`` and ``fail``.
        condition: truthy for success, falsy for failure.
        success_message: label and assertion message used on success.
        failure_message: label and failure message used on failure.
    """
    if not condition:
        test_case._testMethodName = failure_message
        test_case.fail(failure_message)
        return

    test_case._testMethodName = success_message
    test_case.assertTrue(True, success_message)

def normalize_text(text):
    """Return *text* lower-cased with uniform whitespace and punctuation spacing.

    Steps, in order:
      1. lower-case the text;
      2. collapse every run of two or more whitespace characters (including
         newlines) into a single space;
      3. rewrite each ``,``, ``:`` or ``?`` so that it has no whitespace before
         it and exactly one space after it;
      4. strip leading and trailing whitespace.

    The patterns previously contained doubled backslashes inside raw strings
    (``r"\\s"``), which match a literal backslash rather than whitespace, so
    steps 2 and 3 never fired on ordinary output.
    """
    text = text.lower()
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"\s*([,:?])\s*", r"\1 ", text)
    return text.strip()

def change_var(code: str, var_name: str, value: str) -> str:
    """Rewrite every top-level ``var_name = ...`` assignment in *code*.

    Each matching statement (an ``ast.Assign`` directly in the module body with
    a bare-name target equal to *var_name*) is replaced by the single line
    ``var_name = value``. A multi-line right-hand side collapses to that one
    line; comments, blank lines and decorators that follow the assignment are
    left untouched.

    Args:
        code: Python source text to rewrite.
        var_name: name of the variable whose assignments are replaced.
        value: source text placed on the right-hand side, inserted verbatim.

    Returns:
        The rewritten source joined with newline characters, or *code*
        unchanged when no matching assignment exists. A trailing newline in
        *code* is not preserved when a replacement is made.

    Raises:
        SyntaxError: if *code* cannot be parsed by ``ast.parse``.

    Note:
        Replacement is line-based: other statements sharing a physical line
        with the assignment (``x = 1; y = 2``) are overwritten as well.
    """
    tree = ast.parse(code)
    matches = [
        (index, node)
        for index, node in enumerate(tree.body)
        if isinstance(node, ast.Assign)
        and any(
            isinstance(target, ast.Name) and target.id == var_name
            for target in node.targets
        )
    ]
    if not matches:
        return code

    lines = code.splitlines()
    # Work bottom-up so earlier line numbers stay valid after each splice.
    for index, node in reversed(matches):
        start = node.lineno - 1
        end = getattr(node, "end_lineno", None)
        if end is None:
            # Interpreters without end positions: fall back to treating
            # everything up to the next top-level statement as the assignment.
            end = len(lines)
            for following in tree.body[index + 1:]:
                if hasattr(following, "lineno"):
                    end = following.lineno - 1
                    break
            end = max(end, start + 1)
        first_line = lines[start]
        indent = ' ' * (len(first_line) - len(first_line.lstrip()))
        lines[start:end] = [f"{indent}{var_name} = {value}"]

    # Join with a real newline; a literal backslash-n would yield invalid code.
    return "\n".join(lines)

# Run the unittest command-line runner when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()


test_main.py

Explore how Python can be leveraged to address real-world environmental science problems. This course guides students through data analysis, visualization, and modeling techniques relevant to environmental research, using hands-on tasks and engaging theory chapters.

Learn how to access, clean, and explore environmental datasets using Python. Gain foundational skills for working with real-world environmental data.

Delve into statistical techniques for analyzing environmental data, including descriptive statistics, correlation, and hypothesis testing.

Apply Python to model and predict environmental processes, such as pollution dispersion and climate trends, using real datasets.

Challenge: Compare Pollution Levels

Solution