From 6605e2cc428e3bdaa174ccc432941eab8c5d61cb Mon Sep 17 00:00:00 2001 From: benj Date: Fri, 10 Apr 2026 11:13:57 +0800 Subject: ensure parsers do not parse and store raw XML fields --- scripts/seed.py | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 scripts/seed.py (limited to 'scripts/seed.py') diff --git a/scripts/seed.py b/scripts/seed.py new file mode 100644 index 0000000..72c12bd --- /dev/null +++ b/scripts/seed.py @@ -0,0 +1,81 @@ +""" +Seed raw-layer reference tables with initial values. + +Idempotent — safe to run repeatedly. Uses ON CONFLICT DO UPDATE +so display_name/notes stay current if you change them here. + +Usage: python scripts/seed.py +""" + +import sys +import os + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) +from scripts.common.db import execute + + +def seed_source_system(): + execute(""" + INSERT INTO raw.source_system (code, display_name, url, notes) VALUES + ('irs_xml', 'IRS Bulk XML E-Files', 'https://www.irs.gov/charities-non-profits/form-990-series-downloads', NULL), + ('irs_pdf', 'IRS PDF Filings', NULL, NULL), + ('irs_bmf', 'IRS EO BMF Extract', 'https://www.irs.gov/charities-non-profits/exempt-organizations-business-master-file-extract-eo-bmf', NULL), + ('ny_ag', 'New York Attorney General', 'https://www.charitiesnys.com/RegistrySearch/search_charities.jsp', NULL), + ('ca_ag', 'California Attorney General', 'https://rct.doj.ca.gov/Verification/Web/Search.aspx', NULL), + ('fl_ag', 'Florida Charities', NULL, NULL), + ('tx_ag', 'Texas Secretary of State', NULL, NULL), + ('nj_ag', 'New Jersey Charities', NULL, NULL), + ('il_ag', 'Illinois Attorney General', NULL, NULL), + ('pa_ag', 'Pennsylvania Charities', NULL, NULL), + ('tn_ag', 'Tennessee Secretary of State', NULL, NULL) + ON CONFLICT (code) DO UPDATE SET + display_name = EXCLUDED.display_name, + url = EXCLUDED.url; + """) + + +def seed_form_type(): + execute(""" + INSERT INTO raw.form_type (code, display_name, notes) VALUES + ('990', 'Form 990', NULL), + ('990PF', 'Form 990-PF', NULL), + ('990EZ', 'Form 990-EZ', NULL), + ('990O', 'Form 990-O', NULL), + ('990T', 'Form 990-T', NULL), + ('990A', 'Form 990 (Amended)', NULL), + ('990PA', 'Form 990-PF (Amended)', NULL), + ('990EA', 'Form 990-EZ (Amended)', NULL) + ON CONFLICT (code) DO UPDATE SET + display_name = EXCLUDED.display_name; + """) + + +def seed_grant_detail_status(): + execute(""" + INSERT INTO raw.grant_detail_status (code, display_name, notes) VALUES + ('complete', 'Complete', 'All grants are in raw grant rows'), + ('see_attached', 'See Attached', 'Filing says SEE ATTACHED for grant detail'), + ('schedule_o', 'Schedule O', 'Grant detail is in Schedule O narrative'), + ('placeholder_only', 'Placeholder Only', 'Only placeholder rows (VARIOUS, SEE STATEMENT, etc.)'), + ('no_grants', 'No Grants', 'Filing has no grant schedule or zero grants reported'), + ('unresolved', 'Unresolved', 'Completeness not yet determined'), + ('supplemented', 'Supplemented', 'Detail recovered from supplemental source (PDF, state AG)') + ON CONFLICT (code) DO UPDATE SET + display_name = EXCLUDED.display_name, + notes = EXCLUDED.notes; + """) + + +def main(): + print("Seeding reference tables...") + seed_source_system() + print(" source_system: done") + seed_form_type() + print(" form_type: done") + seed_grant_detail_status() + print(" grant_detail_status: done") + print("All reference tables seeded.") + + +if __name__ == "__main__": + main() -- cgit v1.2.3