Clustering can help you uncover patterns in environmental data that are not immediately obvious. Suppose you have a dataset containing average annual concentrations of several pollutants—such as **nitrogen dioxide (NO₂)**, **sulfur dioxide (SO₂)**, and **particulate matter (PM10)**—from a set of air quality monitoring stations spread across a region. By grouping these stations according to their pollutant profiles, you can identify areas with similar pollution characteristics, which can inform targeted interventions and further study.

To begin, you need a `pandas` DataFrame holding the pollutant concentrations measured at each station, with one row per station and one column per pollutant. Once the data is prepared, you will use the `KMeans` algorithm from **scikit-learn** to cluster the stations. After clustering, you will visualize the results to interpret the spatial and environmental significance of the clusters.

Clustering is an **unsupervised learning** technique, meaning it does not use labeled outcomes but instead finds structure in the data itself. The choice of the number of clusters (`n_clusters`) is important and may require domain knowledge or experimentation. You can try different values to see which grouping makes the most sense for your environmental context.

Note


import unittest
import user_code
import ast
import re   
import importlib
import csv
import unittest
import importlib
import pandas as pd

class TestTask(unittest.TestCase):
    """Checks the ``df`` exposed by the learner's ``user_code`` module.

    Each test re-executes ``user_code`` via ``importlib.reload`` and then
    inspects its module-level ``df`` attribute. Outcomes are reported through
    ``_dynamic_test``, which is called with a success and a failure message.

    NOTE: the test methods are documented with ``#`` comments rather than
    docstrings on purpose: unittest appends the first docstring line of a
    test method to its description in verbose reports.
    """

    def setUp(self):
        # Reference data set; only the station names are compared against
        # the learner's DataFrame.
        self.data = {
            "Station": ["North", "South", "East", "West", "Central", "SuburbA", "SuburbB", "Industrial", "Park", "Airport"],
            "NO2": [32, 45, 28, 55, 38, 22, 25, 70, 18, 60],
            "SO2": [12, 20, 9, 25, 15, 8, 10, 30, 7, 22],
            "PM10": [40, 55, 35, 65, 48, 30, 33, 80, 28, 70]
        }
        self.df = pd.DataFrame(self.data)

    @staticmethod
    def _reload_user_df():
        """Re-execute ``user_code`` and return its ``df`` attribute.

        Returns ``None`` when the module defines no ``df``.
        """
        import user_code
        importlib.reload(user_code)
        return getattr(user_code, 'df', None)

    def test_cluster_column_exists(self):
        # The learner's df must be a DataFrame carrying a 'Cluster' column.
        df = self._reload_user_df()
        _dynamic_test(
            self,
            isinstance(df, pd.DataFrame) and 'Cluster' in df.columns,
            "DataFrame has 'Cluster' column after clustering",
            "DataFrame does not have 'Cluster' column after clustering."
        )

    def test_cluster_column_is_int(self):
        # The 'Cluster' column must exist and have an integer dtype.
        df = self._reload_user_df()
        _dynamic_test(
            self,
            isinstance(df, pd.DataFrame) and 'Cluster' in df.columns,
            "DataFrame has 'Cluster' column",
            "Missing 'Cluster' column in DataFrame."
        )
        types_ok = pd.api.types.is_integer_dtype(df['Cluster'])
        _dynamic_test(
            self,
            types_ok,
            "'Cluster' column has integer dtype",
            f"'Cluster' column does not have integer dtype, got {df['Cluster'].dtype}"
        )

    def test_kmeans_n_clusters(self):
        # User may have KMeans instance or not, but must have 3 clusters in result
        df = self._reload_user_df()
        _dynamic_test(
            self,
            isinstance(df, pd.DataFrame) and 'Cluster' in df.columns,
            "DataFrame has 'Cluster' column",
            "Missing 'Cluster' column in DataFrame."
        )
        unique_clusters = set(df['Cluster'].unique())
        _dynamic_test(
            self,
            len(unique_clusters) == 3,
            "There are exactly 3 unique clusters assigned",
            f"Expected 3 clusters, got {len(unique_clusters)}: {unique_clusters}"
        )

    def test_station_labels_unchanged(self):
        # The set of station names must match the reference data.
        df = self._reload_user_df()
        _dynamic_test(
            self,
            isinstance(df, pd.DataFrame),
            "df is a pandas DataFrame",
            "df is not a pandas DataFrame."
        )
        # Guard the column lookup so a dropped 'Station' column is reported
        # as a readable failure instead of an unhandled KeyError.
        _dynamic_test(
            self,
            'Station' in df.columns,
            "DataFrame has 'Station' column",
            "Missing 'Station' column in DataFrame."
        )
        expected_stations = set(self.df['Station'])
        actual_stations = set(df['Station'])
        _dynamic_test(
            self,
            expected_stations == actual_stations,
            "All station names are present and unchanged",
            f"Station names mismatch. Expected: {expected_stations}, got: {actual_stations}"
        )

def _dynamic_test(test_case, condition, success_message, failure_message):
    """Report *condition* on *test_case* using a human-readable message.

    The message matching the outcome is written to
    ``test_case._testMethodName``; then ``assertTrue(True, ...)`` is called
    when *condition* is truthy, or ``fail(...)`` when it is falsy.
    """
    passed = bool(condition)
    # Overwrite the method name with the message for this outcome.
    test_case._testMethodName = success_message if passed else failure_message
    if passed:
        test_case.assertTrue(True, success_message)
    else:
        test_case.fail(failure_message)

def normalize_text(text):
    """Return *text* lower-cased with whitespace and punctuation spacing normalized.

    Steps, in order:
      1. lower-case the text;
      2. collapse every run of two or more whitespace characters to one space;
      3. rewrite each ``,``, ``:`` or ``?`` (with any surrounding whitespace)
         as the punctuation mark followed by exactly one space;
      4. strip leading and trailing whitespace.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    text = text.lower()
    # The patterns must use single backslashes inside raw strings: a doubled
    # backslash would match a literal "\" followed by the letter "s".
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"\s*([,:?])\s*", r"\1 ", text)
    return text.strip()

def change_var(code: str, var_name: str, value: str) -> str:
    """Rewrite every top-level ``var_name = ...`` assignment in *code*.

    Only plain assignments (``ast.Assign``) that appear directly in the module
    body and have *var_name* as one of their targets are rewritten. Each one
    is replaced, in full, by the single line ``var_name = value``; a
    multi-line right-hand side is collapsed into that line. Text that follows
    the assignment (comments, blank lines, decorators of the next definition)
    is left untouched.

    Args:
        code: Python source to transform.
        var_name: Name of the variable whose assignments are rewritten.
        value: Source text placed on the right-hand side of the assignment.

    Returns:
        The transformed source joined with ``"\n"`` (a trailing newline in
        *code* is therefore not preserved), or *code* itself, unmodified,
        when no matching assignment exists.

    Raises:
        SyntaxError: If *code* cannot be parsed by ``ast.parse``.
    """
    body = ast.parse(code).body
    matches = [
        (index, node)
        for index, node in enumerate(body)
        if isinstance(node, ast.Assign)
        and any(isinstance(target, ast.Name) and target.id == var_name for target in node.targets)
    ]

    # Nothing to rewrite: hand back the input exactly as received.
    if not matches:
        return code

    lines = code.splitlines()

    # Work from the last assignment to the first so that shrinking a
    # multi-line statement never shifts the line numbers of earlier ones.
    for index, node in reversed(matches):
        start = node.lineno - 1
        end = getattr(node, "end_lineno", None)
        if end is None:
            # Interpreters older than 3.8 do not record end_lineno; fall back
            # to "everything up to the next top-level statement".
            end = body[index + 1].lineno - 1 if index + 1 < len(body) else len(lines)
        first_line = lines[start]
        indent = ' ' * (len(first_line) - len(first_line.lstrip()))
        # Replace exactly the lines spanned by the statement itself.
        lines[start:end] = [f"{indent}{var_name} = {value}"]

    return '\n'.join(lines)

# Run the unittest command-line runner when this file is executed directly.
if __name__ == "__main__":
    unittest.main()


test_main.py

Explore how Python can be leveraged to address real-world environmental science problems. This course guides students through data analysis, visualization, and modeling techniques relevant to environmental research, using hands-on tasks and engaging theory chapters.

Learn how to access, clean, and explore environmental datasets using Python. Gain foundational skills for working with real-world environmental data.

Delve into statistical techniques for analyzing environmental data, including descriptive statistics, correlation, and hypothesis testing.

Apply Python to model and predict environmental processes, such as pollution dispersion and climate trends, using real datasets.

Challenge: Cluster Monitoring Stations

Solution

Station	NO2	SO2	PM10
North	32	12	40
South	45	20	55
East	28	9	35
West	55	25	65
Central	38	15	48
SuburbA	22	8	30
SuburbB	25	10	33
Industrial	70	30	80
Park	18	7	28
Airport	60	22	70