From 6605e2cc428e3bdaa174ccc432941eab8c5d61cb Mon Sep 17 00:00:00 2001 From: benj Date: Fri, 10 Apr 2026 11:13:57 +0800 Subject: ensure parsers do not parse and store raw XML fields --- migrations/001_raw_schema.sql | 535 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 535 insertions(+) create mode 100644 migrations/001_raw_schema.sql (limited to 'migrations') diff --git a/migrations/001_raw_schema.sql b/migrations/001_raw_schema.sql new file mode 100644 index 0000000..6661de7 --- /dev/null +++ b/migrations/001_raw_schema.sql @@ -0,0 +1,535 @@ +-- Migration 001: Raw ingestion schema +-- +-- Design principles: +-- - Every row traces back to: which ingest run, which source system, +-- which document, and which filing. +-- - raw.filing is the dedup anchor: one row per unique filing per source system. +-- - Per-form tables (raw.form_990, raw.form_990pf, raw.form_990ez) hold +-- filing-level summary fields, 1:1 with raw.filing. +-- - Grant tables stay split by form type (different column sets). +-- - raw.filing_source tracks every place a filing was seen (which archive, which path). +-- - IRS manifest data lives in its own table for completeness auditing. +-- - Nothing here touches canonicalization — that's a later layer. +-- +-- Parser conventions (EIN normalization, source_document_id derivation, +-- line_number stability, amount parsing, placeholder detection) are +-- enforced in code at scripts/common/, not in documentation. + +BEGIN; + +CREATE SCHEMA IF NOT EXISTS raw; + +-- ============================================================ +-- 1. Reference tables +-- ============================================================ + +-- Data sources. Add rows as new sources come online — no schema changes needed. +CREATE TABLE IF NOT EXISTS raw.source_system ( + code TEXT PRIMARY KEY, -- e.g. 'irs_xml', 'ny_ag' + display_name TEXT NOT NULL, + url TEXT, -- home page or API docs + notes TEXT +); + +-- IRS form types. Includes amended and other variants. +-- New form types from future sources get a row here, not a schema change. +CREATE TABLE IF NOT EXISTS raw.form_type ( + code TEXT PRIMARY KEY, -- e.g. '990', '990PF' + display_name TEXT NOT NULL, + notes TEXT +); + +-- Grant detail completeness status. +-- Tracks whether parsed grant rows fully represent the filing's grants. +CREATE TABLE IF NOT EXISTS raw.grant_detail_status ( + code TEXT PRIMARY KEY, + display_name TEXT NOT NULL, + notes TEXT +); + +-- Reference tables are seeded by scripts/seed.py, not by this migration. + +-- ============================================================ +-- 2. Ingest tracking +-- ============================================================ + +-- One row per invocation of a parser/loader script. +CREATE TABLE IF NOT EXISTS raw.ingest_run ( + id SERIAL PRIMARY KEY, + parser TEXT NOT NULL, -- e.g. 'parse_990pf', 'parse_990_schedule_i' + source_system TEXT NOT NULL REFERENCES raw.source_system(code), + started_at TIMESTAMPTZ NOT NULL DEFAULT now(), + finished_at TIMESTAMPTZ, + status TEXT NOT NULL DEFAULT 'running' + CHECK (status IN ('running', 'done', 'error')), + files_scanned INT, + files_matched INT, + rows_inserted INT, + notes TEXT -- free-form (CLI args, etc.) +); + +-- Per-file errors during ingestion. +CREATE TABLE IF NOT EXISTS raw.ingest_error ( + id SERIAL PRIMARY KEY, + ingest_run_id INT REFERENCES raw.ingest_run(id), + source_archive TEXT, -- zip filename or equivalent container + source_path TEXT, -- file within archive, or standalone file path + source_document_id TEXT, -- parsed document id, if available at error time + stage TEXT, -- processing stage: 'read', 'parse_xml', 'extract', 'db_write', etc. + error TEXT NOT NULL, + created_at TIMESTAMPTZ NOT NULL DEFAULT now() +); + +-- ============================================================ +-- 3. IRS filing manifest (from IRS index CSVs) +-- ============================================================ + +-- One row per (object_id, index_year) pair. The same filing can appear in +-- multiple annual index releases, so object_id alone is not unique here. +CREATE TABLE IF NOT EXISTS raw.irs_manifest ( + id SERIAL PRIMARY KEY, + return_id TEXT, + filing_type TEXT, -- 'EFILE' + ein TEXT NOT NULL, + tax_period TEXT, -- YYYYMM + sub_date TEXT, -- submission date as published + taxpayer_name TEXT, + return_type TEXT NOT NULL, -- '990', '990EZ', '990PF', amended variants + dln TEXT, + object_id TEXT NOT NULL, + index_year INT NOT NULL, -- which year's index CSV this came from + UNIQUE (object_id, index_year) +); + +CREATE INDEX IF NOT EXISTS idx_raw_irs_manifest_ein ON raw.irs_manifest (ein); +CREATE INDEX IF NOT EXISTS idx_raw_irs_manifest_return_type ON raw.irs_manifest (return_type); +CREATE INDEX IF NOT EXISTS idx_raw_irs_manifest_object_id ON raw.irs_manifest (object_id); + +-- ============================================================ +-- 4. Raw filing table (dedup anchor) +-- ============================================================ + +-- One row per unique filing we have actually ingested. +-- Unique on (source_system, source_document_id): object_id works for IRS XML, +-- but other sources will have their own document identifiers. +-- +-- For source_system = 'irs_xml', source_document_id is the object_id, +-- which can be joined to irs_manifest.object_id for reconciliation. +CREATE TABLE IF NOT EXISTS raw.filing ( + id SERIAL PRIMARY KEY, + source_system TEXT NOT NULL REFERENCES raw.source_system(code), + source_document_id TEXT NOT NULL, -- IRS: object_id; NY AG: filing_id; etc. + ein TEXT NOT NULL CHECK (ein ~ '^\d{9}$'), + filer_name TEXT, + form_type TEXT NOT NULL REFERENCES raw.form_type(code), + tax_year INT, + tax_period_begin DATE, -- from XML TaxPeriodBeginDt + tax_period_end DATE, -- from XML TaxPeriodEndDt + return_version TEXT, + return_timestamp TIMESTAMPTZ, -- from XML ReturnTs + source_url TEXT, -- canonical lookup URL where available + ingest_run_id INT REFERENCES raw.ingest_run(id), + created_at TIMESTAMPTZ NOT NULL DEFAULT now(), + UNIQUE (source_system, source_document_id) +); + +CREATE INDEX IF NOT EXISTS idx_raw_filing_ein ON raw.filing (ein); +CREATE INDEX IF NOT EXISTS idx_raw_filing_form_type ON raw.filing (form_type); +CREATE INDEX IF NOT EXISTS idx_raw_filing_source_document_id ON raw.filing (source_document_id); +CREATE INDEX IF NOT EXISTS idx_raw_filing_tax_year ON raw.filing (tax_year); + +-- ============================================================ +-- 5. Filing provenance (where each filing was seen) +-- ============================================================ + +-- A filing can appear in multiple archives (overlapping ZIPs, re-releases). +-- This table records every sighting so we can debug path inconsistencies +-- and understand archive overlap without losing information. +-- +-- The UNIQUE on (source_archive, source_path) prevents the same archive/path +-- from mapping to different filings — a given file in a given ZIP is one filing. +CREATE TABLE IF NOT EXISTS raw.filing_source ( + id SERIAL PRIMARY KEY, + raw_filing_id INT NOT NULL REFERENCES raw.filing(id), + ingest_run_id INT REFERENCES raw.ingest_run(id), + source_archive TEXT NOT NULL, -- e.g. 'download990xml_2020_1.zip' + source_path TEXT NOT NULL, -- e.g. 'Cycles_202042_202052/202013089349101246_public.xml' + seen_at TIMESTAMPTZ NOT NULL DEFAULT now(), + UNIQUE (source_archive, source_path) +); + +CREATE INDEX IF NOT EXISTS idx_raw_filing_source_archive ON raw.filing_source (source_archive); +CREATE INDEX IF NOT EXISTS idx_raw_filing_source_filing ON raw.filing_source (raw_filing_id); + +-- ============================================================ +-- 6. Filing summary tables (per form type, 1:1 with raw.filing) +-- ============================================================ + +-- Form 990: public charity / large exempt org filing summary. +-- Holds Part I summary, balance sheet totals, governance counts, +-- and Schedule I metadata (since Schedule I is 1:1 with the 990 filing). +CREATE TABLE IF NOT EXISTS raw.form_990 ( + id SERIAL PRIMARY KEY, + raw_filing_id INT NOT NULL UNIQUE REFERENCES raw.filing(id), + + -- filer identity (as reported on this specific return) + filer_name2 TEXT, -- BusinessNameLine2Txt + filer_address_line1 TEXT, + filer_address_line2 TEXT, + filer_city TEXT, + filer_state TEXT, + filer_zip TEXT, + filer_country TEXT, -- only for foreign filers + filer_foreign_postal_code TEXT, + phone TEXT, + website TEXT, + + -- classification + is_501c3 BOOLEAN, + section_501c_type TEXT, -- '4', '6', '7', etc. if not 501c3 + org_type TEXT, -- 'corp', 'trust', 'assoc', 'other' + group_return BOOLEAN, + group_exemption_num TEXT, + formation_year TEXT, + legal_domicile_state TEXT, + mission TEXT, + accounting_method TEXT, -- 'cash', 'accrual', 'other' + + -- filing status flags + is_amended BOOLEAN DEFAULT false, + is_final BOOLEAN DEFAULT false, + is_initial BOOLEAN DEFAULT false, + is_terminated BOOLEAN DEFAULT false, + + -- Part I: current year summary + gross_receipts NUMERIC, + cy_contributions_grants NUMERIC, + cy_program_service_revenue NUMERIC, + cy_investment_income NUMERIC, + cy_other_revenue NUMERIC, + cy_total_revenue NUMERIC, + cy_grants_paid NUMERIC, + cy_benefits_to_members NUMERIC, + cy_salaries_benefits NUMERIC, + cy_fundraising_expense NUMERIC, + cy_other_expenses NUMERIC, + cy_total_expenses NUMERIC, + cy_revenue_less_expenses NUMERIC, + + -- Part I: prior year summary (NULL for first-year filers) + py_total_revenue NUMERIC, + py_total_expenses NUMERIC, + + -- balance sheet (Part I summary / Part X) + total_assets_boy NUMERIC, + total_assets_eoy NUMERIC, + total_liabilities_boy NUMERIC, + total_liabilities_eoy NUMERIC, + net_assets_boy NUMERIC, + net_assets_eoy NUMERIC, + + -- governance / workforce + total_employees INT, + total_volunteers INT, + voting_members INT, + independent_voting_members INT, + + -- Part IX: functional expense breakdown + program_services_expense NUMERIC, + management_general_expense NUMERIC, + fundraising_expense_ix NUMERIC, -- Part IX col D total + + -- Part VIII: revenue detail + government_grants NUMERIC, + total_contributions NUMERIC, + total_program_service_rev NUMERIC, + investment_income NUMERIC, + + -- UBI + gross_ubi NUMERIC, + net_ubi NUMERIC, + + -- Schedule I metadata (1:1 with filing) + sched_i_grant_records_maintained BOOLEAN, + sched_i_501c3_org_count INT, + sched_i_other_org_count INT, + sched_i_total_grants_amt NUMERIC, -- for reconciliation vs grant rows + + -- grant detail completeness + grant_detail_status TEXT NOT NULL DEFAULT 'unresolved' + REFERENCES raw.grant_detail_status(code), + + -- officer / signer + principal_officer TEXT, + officer_name TEXT, + officer_title TEXT, + signature_date DATE, + preparer_firm TEXT +); + +-- Form 990-PF: private foundation filing summary. +CREATE TABLE IF NOT EXISTS raw.form_990pf ( + id SERIAL PRIMARY KEY, + raw_filing_id INT NOT NULL UNIQUE REFERENCES raw.filing(id), + + -- filer identity (as reported on this specific return) + filer_name2 TEXT, + filer_address_line1 TEXT, + filer_address_line2 TEXT, + filer_city TEXT, + filer_state TEXT, + filer_zip TEXT, + filer_country TEXT, + filer_foreign_postal_code TEXT, + phone TEXT, + website TEXT, + + -- classification + is_501c3_pf BOOLEAN, + is_4947a1_trust BOOLEAN, + is_private_operating BOOLEAN, + accounting_method TEXT, -- 'cash', 'accrual' + + -- filing status flags + is_amended BOOLEAN DEFAULT false, + is_final BOOLEAN DEFAULT false, + is_initial BOOLEAN DEFAULT false, + + -- Part I: revenue and expenses + contributions_received NUMERIC, + interest_revenue NUMERIC, + dividends_revenue NUMERIC, + net_gain_sale_assets NUMERIC, + total_revenue NUMERIC, + total_net_investment_income NUMERIC, + compensation_officers NUMERIC, + total_operating_expenses NUMERIC, + contributions_paid NUMERIC, + total_expenses NUMERIC, + total_charitable_disbursements NUMERIC, + excess_revenue_over_expenses NUMERIC, + net_investment_income NUMERIC, + adjusted_net_income NUMERIC, + + -- Part II: balance sheets + total_assets_boy NUMERIC, + total_assets_eoy NUMERIC, + total_assets_eoy_fmv NUMERIC, + total_liabilities_boy NUMERIC, + total_liabilities_eoy NUMERIC, + net_assets_boy NUMERIC, + net_assets_eoy NUMERIC, + fmv_assets_eoy NUMERIC, -- cover-page FMV figure + + -- Part X-XII: distribution calculations + minimum_investment_return NUMERIC, + distributable_amount NUMERIC, + qualifying_distributions NUMERIC, + + -- excise tax + excise_tax_amount NUMERIC, + + -- Part XV: supplementary information totals (for reconciliation) + total_grants_paid NUMERIC, + total_grants_approved_future NUMERIC, + + -- state registration + state_of_registration TEXT, + + -- grant detail completeness + grant_detail_status TEXT NOT NULL DEFAULT 'unresolved' + REFERENCES raw.grant_detail_status(code), + + -- officer / signer + officer_name TEXT, + officer_title TEXT, + signature_date DATE, + preparer_firm TEXT +); + +-- Form 990-EZ: smaller exempt org filing summary. +-- Parser not yet built, but schema should exist so the table set is complete. +CREATE TABLE IF NOT EXISTS raw.form_990ez ( + id SERIAL PRIMARY KEY, + raw_filing_id INT NOT NULL UNIQUE REFERENCES raw.filing(id), + + -- filer identity + filer_name2 TEXT, + filer_address_line1 TEXT, + filer_city TEXT, + filer_state TEXT, + filer_zip TEXT, + filer_country TEXT, + phone TEXT, + website TEXT, + + -- classification + is_501c3 BOOLEAN, + section_501c_type TEXT, + group_exemption_num TEXT, + + -- filing status flags + is_amended BOOLEAN DEFAULT false, + is_final BOOLEAN DEFAULT false, + is_initial BOOLEAN DEFAULT false, + + -- Part I: revenue, expenses, assets + gross_receipts NUMERIC, + contributions_gifts_grants NUMERIC, + program_service_revenue NUMERIC, + investment_income NUMERIC, + total_revenue NUMERIC, + grants_paid NUMERIC, + salaries_compensation NUMERIC, + total_expenses NUMERIC, + revenue_less_expenses NUMERIC, + total_assets_boy NUMERIC, + total_assets_eoy NUMERIC, + total_liabilities_boy NUMERIC, + total_liabilities_eoy NUMERIC, + net_assets_boy NUMERIC, + net_assets_eoy NUMERIC, + + -- grant detail completeness + -- 990-EZ grant detail often lives in Schedule O or attachments, + -- not in a structured repeated table like Schedule I. + grant_detail_status TEXT NOT NULL DEFAULT 'unresolved' + REFERENCES raw.grant_detail_status(code), + has_schedule_o BOOLEAN, + + -- officer / signer + officer_name TEXT, + officer_title TEXT, + signature_date DATE, + preparer_firm TEXT +); + +-- ============================================================ +-- 7. Raw grant tables (per form type) +-- ============================================================ + +-- 990-PF Part XV grants (private foundation → recipient) +-- +-- line_number is the ordinal position of this grant within the filing's +-- grant list (1-based). Together with raw_filing_id it forms a natural +-- dedup key: re-ingesting the same filing cannot create duplicate rows. +CREATE TABLE IF NOT EXISTS raw.grant_990pf ( + id SERIAL PRIMARY KEY, + raw_filing_id INT NOT NULL REFERENCES raw.filing(id), + line_number INT NOT NULL, + -- recipient fields + recipient_name TEXT, + recipient_name2 TEXT, + recipient_person_name TEXT, -- non-null = individual grant + address_line1 TEXT, + address_line2 TEXT, + city TEXT, + state TEXT, + zip TEXT, + country TEXT, + foreign_postal_code TEXT, + -- grant fields (raw text preserved, typed column alongside) + amount_raw TEXT, + amount NUMERIC, + purpose TEXT, + foundation_status TEXT, + relationship TEXT, + UNIQUE (raw_filing_id, line_number) +); + +CREATE INDEX IF NOT EXISTS idx_raw_grant_990pf_filing ON raw.grant_990pf (raw_filing_id); + +-- 990 Schedule I grants (public charity → recipient) +CREATE TABLE IF NOT EXISTS raw.grant_990 ( + id SERIAL PRIMARY KEY, + raw_filing_id INT NOT NULL REFERENCES raw.filing(id), + line_number INT NOT NULL, + -- recipient fields + recipient_name TEXT, + recipient_name2 TEXT, + recipient_ein TEXT CHECK (recipient_ein IS NULL OR recipient_ein ~ '^\d{9}$'), + address_line1 TEXT, + address_line2 TEXT, + city TEXT, + state TEXT, + zip TEXT, + country TEXT, + foreign_postal_code TEXT, + -- grant fields (raw text preserved, typed columns alongside) + cash_grant_amt_raw TEXT, + cash_grant_amt NUMERIC, + non_cash_amt_raw TEXT, + non_cash_amt NUMERIC, + non_cash_desc TEXT, + valuation_method TEXT, + purpose TEXT, + irc_section TEXT, + UNIQUE (raw_filing_id, line_number) +); + +CREATE INDEX IF NOT EXISTS idx_raw_grant_990_filing ON raw.grant_990 (raw_filing_id); + +-- ============================================================ +-- 8. Schedule O / narrative text +-- ============================================================ + +-- Preserves narrative text from Schedule O and supplemental information. +-- Both 990 and 990-EZ push grant detail and explanatory text here. +-- Not fully structured — stores raw text blocks with whatever section +-- reference the filing provides. +CREATE TABLE IF NOT EXISTS raw.schedule_o ( + id SERIAL PRIMARY KEY, + raw_filing_id INT NOT NULL REFERENCES raw.filing(id), + line_number INT NOT NULL, -- ordinal within the schedule + form_line_ref TEXT, -- e.g. 'SCHEDULE I, PART I, LINE 2' + explanation TEXT, + UNIQUE (raw_filing_id, line_number) +); + +CREATE INDEX IF NOT EXISTS idx_raw_schedule_o_filing ON raw.schedule_o (raw_filing_id); + +-- ============================================================ +-- 10. BMF (organization registry reference source) +-- ============================================================ + +-- Current-snapshot IRS EO BMF rows, minimally transformed from the published +-- CSVs. This is a registry/reference source, not a filing source. +CREATE TABLE IF NOT EXISTS raw.bmf ( + ein TEXT PRIMARY KEY CHECK (ein ~ '^\d{9}$'), + name TEXT, + ico TEXT, + street TEXT, + city TEXT, + state TEXT, + zip TEXT, + grp TEXT, + subsection TEXT, + affiliation TEXT, + classification TEXT, + ruling TEXT, + deductibility TEXT, + foundation TEXT, + activity TEXT, + organization TEXT, + status TEXT, + tax_period TEXT, + asset_cd TEXT, + income_cd TEXT, + filing_req_cd TEXT, + pf_filing_req_cd TEXT, + acct_pd TEXT, + asset_amt BIGINT, + income_amt BIGINT, + revenue_amt BIGINT, + ntee_cd TEXT, + sort_name TEXT, + source_file TEXT, + ingest_run_id INT REFERENCES raw.ingest_run(id), + created_at TIMESTAMPTZ NOT NULL DEFAULT now() +); + +CREATE INDEX IF NOT EXISTS idx_raw_bmf_name ON raw.bmf (name); +CREATE INDEX IF NOT EXISTS idx_raw_bmf_state ON raw.bmf (state); +CREATE INDEX IF NOT EXISTS idx_raw_bmf_ntee_cd ON raw.bmf (ntee_cd); + +COMMIT; -- cgit v1.2.3