-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtransform_food_scores.py
More file actions
90 lines (72 loc) · 2.64 KB
/
transform_food_scores.py
File metadata and controls
90 lines (72 loc) · 2.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/usr/bin/env python3
"""
transform_food_scores.py
Reads the raw food_scores CSV, renames the first 12 columns to meaningful headers,
filters out extraneous header rows or non-data rows, and writes a cleaned CSV.
Usage:
python transform_food_scores.py \
--input food_scores.csv \
--output food_scores_clean.csv
"""
import pandas as pd
import argparse
import sys
# Names applied to the first 12 columns of the raw scrape output:
# inspection fields first, then scrape-provenance fields (ScrapeDate,
# Page, Table, SourceFile).
DEFAULT_HEADERS = [
    "Permit #",
    "Establishment Name",
    "Address",
    "Date",
    "Inspection Type",
    "Food or Retail",
    "Score",
    "Violations",
    "ScrapeDate",
    "Page",
    "Table",
    "SourceFile",
]
def transform(input_csv: str, output_csv: str):
# Read the raw CSV
df = pd.read_csv(input_csv, dtype=str)
# Ensure there are at least 12 columns
if df.shape[1] < len(DEFAULT_HEADERS):
print(f"Error: expected at least {len(DEFAULT_HEADERS)} columns, found {df.shape[1]}", file=sys.stderr)
sys.exit(1)
# Select and rename the first 12 columns
data = df.iloc[:, : len(DEFAULT_HEADERS)].copy()
data.columns = DEFAULT_HEADERS
# Drop rows where Permit # is not purely numeric
data = data[data['Permit #'].str.match(r'^\d+$')]
# Parse date columns
data['Date'] = pd.to_datetime(data['Date'], errors='coerce')
data['ScrapeDate'] = pd.to_datetime(data['ScrapeDate'], errors='coerce')
# Optionally drop rows with invalid dates
data = data.dropna(subset=['Date'])
# Pivot multiple violations (space-separated) into individual rows
data['Violations'] = data['Violations'].fillna('').str.split() # multiple codes to list
data = data.explode('Violations')
# Remove any empty violations entries
data = data[data['Violations'] != '']
# Remove duplicates (can happen during PDF extraction or re-scraping)
before_dedup = len(data)
data = data.drop_duplicates(subset=['Permit #', 'Establishment Name', 'Date', 'Violations'])
after_dedup = len(data)
if before_dedup != after_dedup:
print(f"[INFO] Removed {before_dedup - after_dedup:,} duplicate records")
# Write cleaned CSV
data.to_csv(output_csv, index=False)
print(f"[SUCCESS] Cleaned data written to '{output_csv}' ({after_dedup:,} records)")
def main() -> None:
    """Parse command-line arguments and run the CSV transformation."""
    arg_parser = argparse.ArgumentParser(
        description="Transform raw food_scores CSV into cleaned data with proper headers"
    )
    arg_parser.add_argument(
        "--input", required=True, help="Raw input CSV file (e.g., food_scores.csv)"
    )
    arg_parser.add_argument(
        "--output", required=True, help="Output cleaned CSV file"
    )
    opts = arg_parser.parse_args()
    transform(opts.input, opts.output)
# Run the CLI entry point only when executed as a script, not on import.
if __name__ == '__main__':
    main()