-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtransform_food_scores.py
More file actions
90 lines (72 loc) · 2.64 KB
/
transform_food_scores.py
File metadata and controls
90 lines (72 loc) · 2.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/usr/bin/env python3
"""
transform_food_scores.py
Reads the raw food_scores CSV, renames the first 12 columns to meaningful headers,
filters out extraneous header rows or non-data rows, and writes a cleaned CSV.
Usage:
python transform_food_scores.py \
--input food_scores.csv \
--output food_scores_clean.csv
"""
import pandas as pd
import argparse
import sys
# Names applied to the first 12 columns of the raw scrape output:
# inspection fields first, then scrape-provenance fields (ScrapeDate,
# Page, Table, SourceFile).
DEFAULT_HEADERS = [
    "Permit #",
    "Establishment Name",
    "Address",
    "Date",
    "Inspection Type",
    "Food or Retail",
    "Score",
    "Violations",
    "ScrapeDate",
    "Page",
    "Table",
    "SourceFile",
]
def transform(input_csv: str, output_csv: str):
# Read the raw CSV
df = pd.read_csv(input_csv, dtype=str)
# Ensure there are at least 12 columns
if df.shape[1] < len(DEFAULT_HEADERS):
print(f"Error: expected at least {len(DEFAULT_HEADERS)} columns, found {df.shape[1]}", file=sys.stderr)
sys.exit(1)
# Select and rename the first 12 columns
data = df.iloc[:, : len(DEFAULT_HEADERS)].copy()
data.columns = DEFAULT_HEADERS
# Drop rows where Permit # is not purely numeric
data = data[data['Permit #'].str.match(r'^\d+$')]
# Parse date columns
data['Date'] = pd.to_datetime(data['Date'], errors='coerce')
data['ScrapeDate'] = pd.to_datetime(data['ScrapeDate'], errors='coerce')
# Optionally drop rows with invalid dates
data = data.dropna(subset=['Date'])
# Pivot multiple violations (space-separated) into individual rows
data['Violations'] = data['Violations'].fillna('').str.split() # multiple codes to list
data = data.explode('Violations')
# Remove any empty violations entries
data = data[data['Violations'] != '']
# Remove duplicates (can happen during PDF extraction or re-scraping)
before_dedup = len(data)
data = data.drop_duplicates(subset=['Permit #', 'Establishment Name', 'Date', 'Violations'])
after_dedup = len(data)
if before_dedup != after_dedup:
print(f"[INFO] Removed {before_dedup - after_dedup:,} duplicate records")
# Write cleaned CSV
data.to_csv(output_csv, index=False)
print(f"[SUCCESS] Cleaned data written to '{output_csv}' ({after_dedup:,} records)")
def main() -> None:
    """Parse command-line arguments and run the CSV transformation."""
    arg_parser = argparse.ArgumentParser(
        description="Transform raw food_scores CSV into cleaned data with proper headers"
    )
    arg_parser.add_argument(
        "--input", required=True, help="Raw input CSV file (e.g., food_scores.csv)"
    )
    arg_parser.add_argument(
        "--output", required=True, help="Output cleaned CSV file"
    )
    opts = arg_parser.parse_args()
    transform(opts.input, opts.output)
# Run the CLI entry point only when executed as a script, not on import.
if __name__ == '__main__':
    main()