Clean, reliable data is critical for journalists who want to build trustworthy media databases. When working with lists of news sources, data often arrives in a messy state: duplicate entries can inflate counts, missing website links can leave gaps in research, and inconsistent capitalization can make automated analysis difficult. Ensuring your data is clean not only saves time but also prevents errors in your reporting.

import pandas as pd

# Example: Messy news sources data
data = {
    "Name": [
        "the daily news", "The Daily News", "Global Times", "global times",
        "Metro Herald", "Metro herald", "Metro Herald", "The Observer", "The Observer"
    ],
    "Website": [
        "www.dailynews.com", None, "www.globaltimes.com", "www.globaltimes.com",
        "www.metroherald.com", None, None, "www.observer.com", None
    ]
}

df = pd.DataFrame(data)

# Standardize news source names to title case FIRST, so entries that differ
# only by capitalization ("global times" vs "Global Times") become identical.
df["Name"] = df["Name"].str.title()

# Fill missing website URLs with 'Unknown' (also before de-duplicating, so
# rows whose only difference was a missing URL collapse together).
df["Website"] = df["Website"].fillna("Unknown")

# Remove duplicate rows based on both columns. This must run after the
# normalization steps above; de-duplicating the raw data would miss rows
# that only become equal once they are cleaned.
df = df.drop_duplicates()

# Output the cleaned DataFrame
print(df)

Cleaning your data in this way makes your media analysis more reliable. By removing duplicates, you ensure each source is only counted once. Filling in missing website URLs with a placeholder like `"Unknown"` allows you to spot gaps without breaking your workflow. Standardizing name capitalization avoids mismatches and makes grouping or filtering sources much easier. **Clean data leads to more accurate reporting and helps maintain the credibility of your findings.**


import unittest
import user_code
import ast
import re   
import importlib
import csv
import unittest
import importlib
import pandas as pd

class TestTask(unittest.TestCase):
    """Exercise ``user_code.clean_news_sources`` against a small messy table.

    The fixture mixes capitalization variants of the same outlet, repeated
    rows and missing website values.
    """

    def setUp(self):
        # Build the raw fixture column by column, then wrap it in a DataFrame.
        names = [
            "the daily news", "The Daily News", "Global Times", "global times",
            "Metro Herald", "Metro herald", "Metro Herald", "The Observer", "The Observer"
        ]
        websites = [
            "www.dailynews.com", None, "www.globaltimes.com", "www.globaltimes.com",
            "www.metroherald.com", None, None, "www.observer.com", None
        ]
        self.data = {"Name": names, "Website": websites}
        self.df = pd.DataFrame(self.data)

    def _cleaned(self):
        """Re-import the submitted module and run it on a copy of the fixture."""
        import user_code
        importlib.reload(user_code)
        return user_code.clean_news_sources(self.df.copy())

    def test_no_duplicates(self):
        # The cleaned frame must not contain any fully identical rows.
        cleaned = self._cleaned()
        duplicate_count = cleaned.duplicated().sum()
        _dynamic_test(
            self,
            duplicate_count == 0,
            "All duplicate rows are removed from the DataFrame.",
            f"Expected 0 duplicate rows, found {duplicate_count}."
        )

    def test_missing_websites_filled(self):
        # No nulls may remain, and the 'Unknown' placeholder must appear
        # at least once in the Website column.
        websites = self._cleaned()["Website"]
        still_missing = websites.isnull().any()
        placeholder_present = (websites == "Unknown").sum() >= 1
        _dynamic_test(
            self,
            not still_missing and placeholder_present,
            "All missing website URLs are replaced with 'Unknown'.",
            "Missing website URLs were not properly filled with 'Unknown'."
        )

    def test_names_title_case(self):
        # Every name must already equal its own title-cased form.
        cleaned_names = self._cleaned()["Name"]
        every_name_titled = cleaned_names.apply(lambda name: name == name.title()).all()
        _dynamic_test(
            self,
            every_name_titled,
            "All news source names are in title case.",
            "Not all news source names are in title case."
        )

def _dynamic_test(test_case, condition, success_message, failure_message):
    """Report a check on ``test_case`` using a human-readable message as its name.

    The test's ``_testMethodName`` is overwritten with the message that
    matches the outcome before the test passes or fails.

    Args:
        test_case: Object providing ``assertTrue`` and ``fail`` (a TestCase).
        condition: Truthy when the check succeeded.
        success_message: Name/message used when ``condition`` is truthy.
        failure_message: Name/message used when ``condition`` is falsy.
    """
    if not condition:
        test_case._testMethodName = failure_message
        test_case.fail(failure_message)
        return

    test_case._testMethodName = success_message
    test_case.assertTrue(True, success_message)

def normalize_text(text):
    """Return a lower-cased, whitespace-normalized version of ``text``.

    Runs of two or more whitespace characters collapse to a single space,
    and each ``,``, ``:`` or ``?`` is rewritten to have no whitespace before
    it and exactly one space after it. Leading and trailing whitespace is
    stripped from the result.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    text = text.lower()
    # Raw strings need a single backslash: r"\s" is the whitespace class,
    # whereas r"\\s" would match a literal backslash followed by "s".
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"\s*([,:?])\s*", r"\1 ", text)
    return text.strip()

def change_var(code: str, var_name: str, value: str) -> str:
    """Rewrite every top-level assignment to ``var_name`` as ``var_name = value``.

    Only plain assignments (``ast.Assign``) in the module body whose targets
    include the bare name ``var_name`` are rewritten. A multi-line assignment
    is replaced by a single line. Statements, comments and decorators that
    follow the assignment are left untouched.

    Args:
        code: Python source code to modify; it must parse successfully.
        var_name: Name of the variable whose assignments are replaced.
        value: Source text placed on the right-hand side of the assignment.

    Returns:
        The modified source with lines joined by newlines, or ``code``
        unchanged when no matching assignment exists.

    Raises:
        SyntaxError: If ``code`` cannot be parsed by ``ast.parse``.
    """
    tree = ast.parse(code)
    lines = code.splitlines()

    # Collect all top-level assignment nodes that target the variable.
    assign_nodes = [
        (i, node)
        for i, node in enumerate(tree.body)
        if isinstance(node, ast.Assign)
        and any(isinstance(target, ast.Name) and target.id == var_name for target in node.targets)
    ]

    # If nothing to change, return unmodified code
    if not assign_nodes:
        return code

    # Replace from last to first so earlier line numbers stay valid while
    # multi-line assignments shrink to a single line.
    for i, node in reversed(assign_nodes):
        start_line = node.lineno - 1
        line = lines[start_line]
        indent = ' ' * (len(line) - len(line.lstrip()))

        # Prefer the statement's own end line so that decorators, comments
        # and blank lines preceding the next statement are not swallowed.
        end_line = getattr(node, "end_lineno", None)
        if end_line is None:
            # Older interpreters without end positions: stop right before
            # the next top-level statement (or at end of file).
            end_line = next(
                (
                    later.lineno - 1
                    for later in tree.body[i + 1:]
                    if hasattr(later, 'lineno')
                ),
                len(lines),
            )
        # Always replace at least the assignment's first line.
        end_line = max(end_line, start_line + 1)

        lines[start_line:end_line] = [f"{indent}{var_name} = {value}"]

    # Join with a real newline (the original joined with a literal "\\n").
    return '\n'.join(lines)

# Run the unittest test runner when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()


test_main.py

A practical course designed for journalists and media professionals to harness the power of Python for data-driven storytelling, news automation, and media analysis. Learn to collect, analyze, and visualize data, automate repetitive newsroom tasks, and uncover insights from large datasets using Python.

Learn how to gather, clean, and prepare data for journalistic investigations and media reporting using Python.

Master the skills to analyze, interpret, and visualize media data for compelling storytelling and reporting.

Discover how Python can automate repetitive newsroom tasks and analyze media content for deeper insights.

Challenge: Clean a List of News Sources

Solution