In healthcare, ensuring the accuracy and completeness of patient records is critical. **Data quality audits** help you identify errors such as negative ages or missing required information, which could impact patient care and reporting. In this challenge, you will use Python to audit a `DataFrame` of patient records for two common issues: negative ages and missing diagnoses. You will then generate a report listing all problematic records and output this report as a CSV file, simulating a real-world data quality assurance workflow.

To begin, you will need to create a `DataFrame` that represents a sample set of patient records. This `DataFrame` should include at least the fields `patient_id`, `age`, and `diagnosis`. The next step is to check for negative values in the `age` column, which are not possible in real patient data. You will also check for missing values in the `diagnosis` field, as every patient record should contain a diagnosis for accurate medical tracking and billing.

Once you have identified records with these issues, you will compile them into a separate `DataFrame` to generate a clear and actionable report. Finally, you will export this report to a CSV file, which is a common format for sharing and reviewing data quality findings in healthcare settings.

import pandas as pd

# Small in-memory sample of patient records to audit.
# Patient 2 has both an impossible age and no diagnosis; patient 5 has no diagnosis.
data = {
    "patient_id": [1, 2, 3, 4, 5],
    "age": [34, -2, 55, 42, 28],
    "diagnosis": ["Hypertension", None, "Diabetes", "Asthma", None]
}
df = pd.DataFrame(data)

# Rule 1: an age below zero cannot be real patient data
has_negative_age = df["age"] < 0
negative_age = df.loc[has_negative_age]

# Rule 2: every record is expected to carry a diagnosis
lacks_diagnosis = df["diagnosis"].isnull()
missing_diagnosis = df.loc[lacks_diagnosis]

# Stack the rows flagged by either rule; a record that breaks both rules
# appears twice after stacking, so collapse identical rows afterwards
flagged_frames = [negative_age, missing_diagnosis]
problematic_records = pd.concat(flagged_frames)
problematic_records = problematic_records.drop_duplicates()

# The report is an independent copy of the flagged rows
report = problematic_records.copy()

# Persist the report without the DataFrame index column
report.to_csv("data_quality_report.csv", index=False)

print("Data quality audit complete. Problematic records:")
print(report)

Data quality audits like this are essential for maintaining trustworthy medical records. By regularly checking for logical errors and missing information, you help ensure patient safety and compliance with healthcare regulations.

Note


import unittest
import user_code
import ast
import re   
import importlib
import csv
import unittest
import pandas as pd
import os
import csv
import importlib

class TestTask(unittest.TestCase):
    """Grade the audit challenge by re-running ``user_code`` and inspecting its CSV report.

    Every test re-executes the learner's module (via ``importlib.reload``) and
    then checks the ``data_quality_report.csv`` file it is expected to write.
    """

    REPORT_PATH = 'data_quality_report.csv'

    def setUp(self):
        # Clean up the CSV file before each test
        self._remove_report()

    def tearDown(self):
        # Clean up the CSV file after each test
        self._remove_report()

    def _remove_report(self):
        """Delete the report file if it is present."""
        if os.path.exists(self.REPORT_PATH):
            os.remove(self.REPORT_PATH)

    def _run_user_code(self):
        """Re-execute the learner's module so it regenerates the report."""
        import user_code
        importlib.reload(user_code)

    def _load_report_rows(
        self,
        exists_message="CSV file 'data_quality_report.csv' is created.",
        missing_message="CSV file 'data_quality_report.csv' was not created.",
    ):
        """Run the learner's code and return the report rows as a list of dicts.

        A missing report is reported through ``_dynamic_test`` with
        ``missing_message`` instead of surfacing as a ``FileNotFoundError``
        from ``open``.
        """
        self._run_user_code()
        _dynamic_test(
            self,
            os.path.exists(self.REPORT_PATH),
            exists_message,
            missing_message
        )
        with open(self.REPORT_PATH, newline="") as csvfile:
            return list(csv.DictReader(csvfile))

    @staticmethod
    def _patient_id(row):
        """Return the row's ``patient_id`` as an int, or ``None`` if it is not a whole number.

        Parsing goes through ``float`` so that float-formatted cells such as
        ``"2.0"`` are accepted as well as ``"2"``.
        """
        try:
            value = float(row.get("patient_id"))
        except (TypeError, ValueError):
            return None
        # is_integer() is False for NaN and infinities, so int() below is safe
        return int(value) if value.is_integer() else None

    @staticmethod
    def _has_negative_age(row):
        """Return True when the row's ``age`` cell parses as a number below zero."""
        try:
            return float(row.get("age")) < 0
        except (TypeError, ValueError):
            return False

    @staticmethod
    def _has_missing_diagnosis(row):
        """Return True when the row has a ``diagnosis`` column whose cell is empty."""
        return "diagnosis" in row and row["diagnosis"] in ('', None)

    def test_csv_file_created(self):
        """The learner's code must create the report file."""
        self._run_user_code()
        _dynamic_test(
            self,
            os.path.exists(self.REPORT_PATH),
            "CSV file 'data_quality_report.csv' is created.",
            "CSV file 'data_quality_report.csv' was not created."
        )

    def test_csv_content(self):
        """The report must list exactly patients 2 and 5, once each."""
        rows = self._load_report_rows(
            "CSV file exists for content test.",
            "CSV file does not exist for content test."
        )
        # Should include records with patient_id 2 (negative age) and 5 (missing diagnosis)
        ids = {self._patient_id(r) for r in rows}
        expected_ids = {2, 5}
        _dynamic_test(
            self,
            ids == expected_ids,
            "CSV file contains correct problematic patient records.",
            f"CSV file does not contain correct records. Expected patient_id {expected_ids}, got {ids}."
        )
        # Check that no duplicate records are present
        _dynamic_test(
            self,
            len(rows) == 2,
            "CSV file contains no duplicate problematic records.",
            f"CSV file contains duplicate or extra records: {rows}"
        )

    def test_negative_age_record(self):
        """Patient 2 must appear in the report with a negative age."""
        rows = self._load_report_rows()
        found = any(
            self._patient_id(r) == 2 and self._has_negative_age(r)
            for r in rows
        )
        _dynamic_test(
            self,
            found,
            "CSV file contains a record with negative age (patient_id 2).",
            "CSV file missing record with negative age (patient_id 2)."
        )

    def test_missing_diagnosis_record(self):
        """Patient 5 must appear in the report with an empty diagnosis."""
        rows = self._load_report_rows()
        found = any(
            self._patient_id(r) == 5 and self._has_missing_diagnosis(r)
            for r in rows
        )
        _dynamic_test(
            self,
            found,
            "CSV file contains a record with missing diagnosis (patient_id 5).",
            "CSV file missing record with missing diagnosis (patient_id 5)."
        )

def _dynamic_test(test_case, condition, success_message, failure_message):
    """Record the outcome of a check on ``test_case`` using a human-readable message.

    The test's ``_testMethodName`` is overwritten with the message matching the
    outcome. When ``condition`` is falsy the test is failed with
    ``failure_message``; otherwise a trivially true assertion carrying
    ``success_message`` is made.
    """
    if not condition:
        test_case._testMethodName = failure_message
        test_case.fail(failure_message)
        return

    test_case._testMethodName = success_message
    test_case.assertTrue(True, success_message)

def normalize_text(text):
    """Return ``text`` lower-cased with whitespace and punctuation spacing normalised.

    Runs of two or more whitespace characters collapse to a single space, and
    each ``,``, ``:`` or ``?`` is rewritten to have no whitespace before it and
    exactly one space after it. Leading and trailing whitespace is stripped.
    """
    text = text.lower()
    # Single backslashes: in a raw string "\\s" would match a literal
    # backslash followed by "s", not whitespace.
    text = re.sub(r"\s{2,}", " ", text)
    # r"\1" is the back-reference to the captured punctuation mark.
    text = re.sub(r"\s*([,:?])\s*", r"\1 ", text)
    return text.strip()

def change_var(code: str, var_name: str, value: str) -> str:
    """Rewrite every top-level assignment to ``var_name`` in ``code`` as ``var_name = value``.

    Args:
        code: Python source text; it must parse with ``ast.parse``.
        var_name: Name of the variable whose top-level assignments are replaced.
        value: Source text of the new right-hand side, inserted verbatim.

    Returns:
        The modified source with lines joined by newlines, or ``code`` itself
        when no top-level assignment to ``var_name`` exists.

    Raises:
        SyntaxError: If ``code`` is not valid Python (raised by ``ast.parse``).
    """
    tree = ast.parse(code)
    lines = code.splitlines()

    # Collect all assignment nodes to modify
    assign_nodes = [
        (i, node)
        for i, node in enumerate(tree.body)
        if isinstance(node, ast.Assign)
        and any(isinstance(target, ast.Name) and target.id == var_name for target in node.targets)
    ]

    # If nothing to change, return unmodified code
    if not assign_nodes:
        return code

    # Perform replacements from last to first so earlier line numbers stay valid
    for i, node in reversed(assign_nodes):
        start_line = node.lineno - 1
        line = lines[start_line]
        indent = ' ' * (len(line) - len(line.lstrip()))
        lines[start_line] = f"{indent}{var_name} = {value}"

        # Drop only the continuation lines of a multi-line assignment.
        # end_lineno (1-based, inclusive) marks where the statement ends, so
        # decorators, comments and blank lines that follow it are preserved.
        stop = getattr(node, 'end_lineno', None)
        if stop is None:
            # Fallback for ASTs without end positions: cut up to the next
            # top-level statement, or to the end of the source.
            stop = next(
                (later.lineno - 1 for later in tree.body[i + 1:] if hasattr(later, 'lineno')),
                len(lines),
            )
        del lines[start_line + 1:stop]

    return '\n'.join(lines)

# Run the unittest command-line runner when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()


test_main.py

A practical Python course tailored for healthcare professionals, focusing on real-world data analysis, visualization, and automation scenarios in the medical field. Learn to harness Python's power to streamline healthcare workflows, analyze patient data, and visualize medical trends.

Explore the essentials of working with healthcare data in Python, including data structures, importing datasets, and basic data cleaning tailored to medical records.

Master the art of visualizing healthcare data to uncover insights and communicate findings effectively using Python's plotting libraries.

Automate repetitive healthcare tasks and generate insightful reports using Python, streamlining workflows and improving efficiency.

Challenge: Data Quality Audit

Solution