When working with government records, inconsistencies in **department names** can have a significant impact on your ability to analyze and report on data accurately. If department names are entered with different capitalization, extra spaces, or other variations, it becomes difficult to group, summarize, or compare records correctly. For instance, `Health Department`, ` health department`, and `HEALTH DEPARTMENT` might all refer to the same entity, but automated analysis would treat them as separate categories. This can lead to misleading results and additional manual work to clean up the data before performing meaningful analysis.

# Example dataset with inconsistent department names.
# Each record is a dict with an integer "id" and a raw "department" string.
# The strings differ only in capitalization and leading/trailing whitespace,
# so once cleaned they collapse to four distinct department names.
records = [
    {"id": 1, "department": "health department "},  # lowercase, trailing space
    {"id": 2, "department": "  Education Department"},  # two leading spaces
    {"id": 3, "department": "TRANSPORTATION department"},  # mixed case
    {"id": 4, "department": "public safety"},  # all lowercase
    {"id": 5, "department": "Health Department"},  # already clean
    {"id": 6, "department": " education department"},  # lowercase, leading space
    {"id": 7, "department": "Public Safety "},  # trailing space
    {"id": 8, "department": "TRANSPORTATION DEPARTMENT"},  # all uppercase
]

To address these inconsistencies, you can use Python's **string methods** to clean and standardize text fields. The `strip()` method removes leading and trailing whitespace, which is useful when entries have extra spaces at the beginning or end. The `title()` method converts a string so that each word starts with an uppercase letter and the remaining letters are lowercase, making capitalization consistent. The two calls can be chained: for example, `"  health department ".strip().title()` returns `"Health Department"`. By combining these methods, you can ensure that department names are formatted uniformly across your dataset, which improves the quality and reliability of your analysis.


import unittest
import user_code
import ast
import re   
import importlib
import csv
import unittest
import importlib

class TestTask(unittest.TestCase):
    """Grade a learner's ``user_code.clean_departments`` implementation.

    Every test re-executes ``user_code`` with ``importlib.reload``, calls
    ``clean_departments`` on the sample records and reports the outcome
    through the module-level ``_dynamic_test`` helper.

    Malformed results (``None``, non-iterables, records without ``.get``,
    non-string department values) are reported as ordinary test failures
    with the descriptive message rather than crashing the test with a
    ``TypeError`` or ``AttributeError``.

    Test methods are documented with ``#`` comments instead of docstrings,
    because unittest prints a test's docstring as its description and that
    would change the reported output.
    """

    def setUp(self):
        """Create the raw input records and the expected cleaned names."""
        self.sample_records = [
            {"id": 1, "department": "health department "},
            {"id": 2, "department": "  Education Department"},
            {"id": 3, "department": "TRANSPORTATION department"},
            {"id": 4, "department": "public safety"},
            {"id": 5, "department": "Health Department"},
            {"id": 6, "department": " education department"},
            {"id": 7, "department": "Public Safety "},
            {"id": 8, "department": "TRANSPORTATION DEPARTMENT"},
        ]
        # Expected "department" values, in the same order as sample_records.
        self.expected_departments = [
            "Health Department",
            "Education Department",
            "Transportation Department",
            "Public Safety",
            "Health Department",
            "Education Department",
            "Public Safety",
            "Transportation Department",
        ]

    def _run_clean_departments(self):
        """Reload ``user_code`` and return ``clean_departments(sample_records)``."""
        import user_code
        importlib.reload(user_code)
        return user_code.clean_departments(self.sample_records)

    @staticmethod
    def _department_values(result):
        """Return the "department" value of each record in *result*.

        Returns ``None`` when *result* cannot be iterated or one of its
        items has no usable ``.get`` method, so callers can report a normal
        failure instead of raising.
        """
        try:
            return [record.get("department") for record in result]
        except (TypeError, AttributeError):
            return None

    def test_output_list_length(self):
        # The result must be a list with one entry per input record.
        result = self._run_clean_departments()
        _dynamic_test(
            self,
            isinstance(result, list) and len(result) == len(self.sample_records),
            "Output list has same length as input records",
            f"Expected length {len(self.sample_records)}, got {len(result) if isinstance(result, list) else type(result)}",
        )

    def test_department_no_spaces(self):
        # Every department must be a string equal to its own stripped form.
        departments = self._department_values(self._run_clean_departments())
        all_clean = departments is not None and all(
            isinstance(dep, str) and dep == dep.strip() for dep in departments
        )
        _dynamic_test(
            self,
            all_clean,
            "All 'department' fields have no leading or trailing spaces",
            "Some 'department' fields have leading or trailing spaces",
        )

    def test_department_title_case(self):
        # Every department must be a string equal to its own title-cased form.
        departments = self._department_values(self._run_clean_departments())
        all_title = departments is not None and all(
            isinstance(dep, str) and dep == dep.title() for dep in departments
        )
        _dynamic_test(
            self,
            all_title,
            "All 'department' fields are in title case",
            "Some 'department' fields are not in title case",
        )

    def test_original_not_modified(self):
        # Snapshot the input first so in-place mutation by the learner's
        # function is detectable afterwards.
        import copy
        original = copy.deepcopy(self.sample_records)
        _ = self._run_clean_departments()
        _dynamic_test(
            self,
            self.sample_records == original,
            "Original input list is not modified",
            "Original input list was modified",
        )

    def test_cleaned_departments_match(self):
        # The cleaned names must match the expected list exactly, in order.
        depts = self._department_values(self._run_clean_departments())
        _dynamic_test(
            self,
            depts == self.expected_departments,
            "Cleaned departments match expected output",
            f"Expected: {self.expected_departments}, got: {depts}",
        )

def _dynamic_test(test_case, condition, success_message, failure_message):
    """Report *condition* on *test_case* under a human-readable name.

    The test's ``_testMethodName`` is overwritten with the message that
    matches the outcome. A truthy *condition* then records a trivially
    passing assertion; a falsy one fails the test with *failure_message*.
    """
    label = success_message if condition else failure_message
    # Rename first, so the chosen message is in place before the test
    # passes or fails.
    test_case._testMethodName = label
    if not condition:
        test_case.fail(failure_message)
        return
    test_case.assertTrue(True, success_message)

def normalize_text(text):
    """Return *text* lower-cased with whitespace and punctuation spacing normalised.

    Steps, in order:
      1. lower-case the text;
      2. collapse every run of two or more whitespace characters into a
         single space;
      3. remove whitespace before ``,``, ``:`` and ``?`` and leave exactly
         one space after each;
      4. strip leading and trailing whitespace.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    text = text.lower()
    # In a raw string the regex escape is a single backslash. A doubled
    # backslash would match a literal "\" followed by "s" characters.
    text = re.sub(r"\s{2,}", " ", text)
    # "\1" re-inserts the captured punctuation mark, followed by one space.
    text = re.sub(r"\s*([,:?])\s*", r"\1 ", text)
    return text.strip()

def change_var(code: str, var_name: str, value: str) -> str:
    """Rewrite every top-level ``var_name = ...`` assignment in *code*.

    Each matching assignment statement, including one that spans several
    lines, is replaced by the single line ``var_name = value``. Only
    statements directly in the module body are considered; assignments
    nested inside functions, classes or other compound statements are not
    touched. Lines that follow the assignment (comments, blank lines,
    decorators of the next definition) are preserved.

    Args:
        code: Python source text; it must be parseable by ``ast.parse``.
        var_name: Name of the variable whose assignments are rewritten.
        value: Source text for the new right-hand side, inserted verbatim.

    Returns:
        The rewritten source, with its lines re-joined by newline characters
        (a trailing newline in *code* is not preserved). When *code* has no
        top-level assignment to *var_name*, *code* is returned unchanged.

    Raises:
        SyntaxError: If *code* is not valid Python.
    """
    tree = ast.parse(code)
    lines = code.splitlines()

    # Top-level assignments that have ``var_name`` as a plain-name target.
    matches = [
        (index, node)
        for index, node in enumerate(tree.body)
        if isinstance(node, ast.Assign)
        and any(
            isinstance(target, ast.Name) and target.id == var_name
            for target in node.targets
        )
    ]
    if not matches:
        return code

    # Work bottom-up so that removing continuation lines never shifts the
    # line numbers of assignments that are still waiting to be rewritten.
    for index, node in reversed(matches):
        first = node.lineno - 1
        last = getattr(node, "end_lineno", None)
        if last is None:
            # Python < 3.8 records no end position. Fall back to treating
            # everything up to the next top-level statement as this one.
            following = tree.body[index + 1:index + 2]
            last = following[0].lineno - 1 if following else len(lines)

        line = lines[first]
        indent = ' ' * (len(line) - len(line.lstrip()))
        lines[first] = f"{indent}{var_name} = {value}"
        # Drop only the continuation lines of the original statement.
        # Deleting up to the next statement's ``lineno`` would also remove
        # the decorators of a following def/class, because ``lineno`` points
        # at the ``def``/``class`` line on Python 3.8+.
        if last > first + 1:
            del lines[first + 1:last]

    # Join with a real newline, not the two characters backslash + "n".
    return "\n".join(lines)

if __name__ == "__main__":
    unittest.main()


test_main.py

A practical Python course tailored for government analysts, focusing on data analysis, visualization, and decision-making using real-world scenarios relevant to public sector work. Learn to harness Python's power to extract insights, visualize trends, and support evidence-based policy.

Explore how Python can be used to analyze government datasets, extract actionable insights, and support evidence-based policy decisions.

Apply statistical methods to government datasets to evaluate policies, measure impact, and support data-driven decisions.

Discover how Python can automate repetitive analytical tasks, streamline reporting, and improve efficiency in government operations.

Challenge: Clean and Standardize Department Names

Solution