aboutsummaryrefslogtreecommitdiff
path: root/migrations
diff options
context:
space:
mode:
Diffstat (limited to 'migrations')
-rw-r--r--migrations/001_raw_schema.sql535
1 files changed, 535 insertions, 0 deletions
diff --git a/migrations/001_raw_schema.sql b/migrations/001_raw_schema.sql
new file mode 100644
index 0000000..6661de7
--- /dev/null
+++ b/migrations/001_raw_schema.sql
@@ -0,0 +1,535 @@
+-- Migration 001: Raw ingestion schema
+--
+-- Design principles:
+-- - Every row traces back to: which ingest run, which source system,
+-- which document, and which filing.
+-- - raw.filing is the dedup anchor: one row per unique filing per source system.
+-- - Per-form tables (raw.form_990, raw.form_990pf, raw.form_990ez) hold
+-- filing-level summary fields, 1:1 with raw.filing.
+-- - Grant tables stay split by form type (different column sets).
+-- - raw.filing_source tracks every place a filing was seen (which archive, which path).
+-- - IRS manifest data lives in its own table for completeness auditing.
+-- - Nothing here touches canonicalization — that's a later layer.
+--
+-- Parser conventions (EIN normalization, source_document_id derivation,
+-- line_number stability, amount parsing, placeholder detection) are
+-- enforced in code at scripts/common/, not in documentation.
+
+BEGIN;
+
+CREATE SCHEMA IF NOT EXISTS raw;
+
+-- ============================================================
+-- 1. Reference tables
+-- ============================================================
+
+-- Data sources. Add rows as new sources come online — no schema changes needed.
+CREATE TABLE IF NOT EXISTS raw.source_system (
+ code TEXT PRIMARY KEY, -- e.g. 'irs_xml', 'ny_ag'
+ display_name TEXT NOT NULL,
+ url TEXT, -- home page or API docs
+ notes TEXT
+);
+
+-- IRS form types. Includes amended and other variants.
+-- New form types from future sources get a row here, not a schema change.
+CREATE TABLE IF NOT EXISTS raw.form_type (
+ code TEXT PRIMARY KEY, -- e.g. '990', '990PF'
+ display_name TEXT NOT NULL,
+ notes TEXT
+);
+
+-- Grant detail completeness status.
+-- Tracks whether parsed grant rows fully represent the filing's grants.
+CREATE TABLE IF NOT EXISTS raw.grant_detail_status (
+ code TEXT PRIMARY KEY,
+ display_name TEXT NOT NULL,
+ notes TEXT
+);
+
+-- Reference tables are seeded by scripts/seed.py, not by this migration.
+
+-- ============================================================
+-- 2. Ingest tracking
+-- ============================================================
+
+-- One row per invocation of a parser/loader script.
+CREATE TABLE IF NOT EXISTS raw.ingest_run (
+ id SERIAL PRIMARY KEY,
+ parser TEXT NOT NULL, -- e.g. 'parse_990pf', 'parse_990_schedule_i'
+ source_system TEXT NOT NULL REFERENCES raw.source_system(code),
+ started_at TIMESTAMPTZ NOT NULL DEFAULT now(),
+ finished_at TIMESTAMPTZ,
+ status TEXT NOT NULL DEFAULT 'running'
+ CHECK (status IN ('running', 'done', 'error')),
+ files_scanned INT,
+ files_matched INT,
+ rows_inserted INT,
+ notes TEXT -- free-form (CLI args, etc.)
+);
+
+-- Per-file errors during ingestion.
+CREATE TABLE IF NOT EXISTS raw.ingest_error (
+ id SERIAL PRIMARY KEY,
+ ingest_run_id INT REFERENCES raw.ingest_run(id),
+ source_archive TEXT, -- zip filename or equivalent container
+ source_path TEXT, -- file within archive, or standalone file path
+ source_document_id TEXT, -- parsed document id, if available at error time
+ stage TEXT, -- processing stage: 'read', 'parse_xml', 'extract', 'db_write', etc.
+ error TEXT NOT NULL,
+ created_at TIMESTAMPTZ NOT NULL DEFAULT now()
+);
+
+-- ============================================================
+-- 3. IRS filing manifest (from IRS index CSVs)
+-- ============================================================
+
+-- One row per (object_id, index_year) pair. The same filing can appear in
+-- multiple annual index releases, so object_id alone is not unique here.
+CREATE TABLE IF NOT EXISTS raw.irs_manifest (
+ id SERIAL PRIMARY KEY,
+ return_id TEXT,
+ filing_type TEXT, -- 'EFILE'
+ ein TEXT NOT NULL,
+ tax_period TEXT, -- YYYYMM
+ sub_date TEXT, -- submission date as published
+ taxpayer_name TEXT,
+ return_type TEXT NOT NULL, -- '990', '990EZ', '990PF', amended variants
+ dln TEXT,
+ object_id TEXT NOT NULL,
+ index_year INT NOT NULL, -- which year's index CSV this came from
+ UNIQUE (object_id, index_year)
+);
+
+CREATE INDEX IF NOT EXISTS idx_raw_irs_manifest_ein ON raw.irs_manifest (ein);
+CREATE INDEX IF NOT EXISTS idx_raw_irs_manifest_return_type ON raw.irs_manifest (return_type);
+CREATE INDEX IF NOT EXISTS idx_raw_irs_manifest_object_id ON raw.irs_manifest (object_id);
+
+-- ============================================================
+-- 4. Raw filing table (dedup anchor)
+-- ============================================================
+
+-- One row per unique filing we have actually ingested.
+-- Unique on (source_system, source_document_id): object_id works for IRS XML,
+-- but other sources will have their own document identifiers.
+--
+-- For source_system = 'irs_xml', source_document_id is the object_id,
+-- which can be joined to irs_manifest.object_id for reconciliation.
+CREATE TABLE IF NOT EXISTS raw.filing (
+ id SERIAL PRIMARY KEY,
+ source_system TEXT NOT NULL REFERENCES raw.source_system(code),
+ source_document_id TEXT NOT NULL, -- IRS: object_id; NY AG: filing_id; etc.
+ ein TEXT NOT NULL CHECK (ein ~ '^\d{9}$'),
+ filer_name TEXT,
+ form_type TEXT NOT NULL REFERENCES raw.form_type(code),
+ tax_year INT,
+ tax_period_begin DATE, -- from XML TaxPeriodBeginDt
+ tax_period_end DATE, -- from XML TaxPeriodEndDt
+ return_version TEXT,
+ return_timestamp TIMESTAMPTZ, -- from XML ReturnTs
+ source_url TEXT, -- canonical lookup URL where available
+ ingest_run_id INT REFERENCES raw.ingest_run(id),
+ created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
+ UNIQUE (source_system, source_document_id)
+);
+
+CREATE INDEX IF NOT EXISTS idx_raw_filing_ein ON raw.filing (ein);
+CREATE INDEX IF NOT EXISTS idx_raw_filing_form_type ON raw.filing (form_type);
+CREATE INDEX IF NOT EXISTS idx_raw_filing_source_document_id ON raw.filing (source_document_id);
+CREATE INDEX IF NOT EXISTS idx_raw_filing_tax_year ON raw.filing (tax_year);
+
+-- ============================================================
+-- 5. Filing provenance (where each filing was seen)
+-- ============================================================
+
+-- A filing can appear in multiple archives (overlapping ZIPs, re-releases).
+-- This table records every sighting so we can debug path inconsistencies
+-- and understand archive overlap without losing information.
+--
+-- The UNIQUE on (source_archive, source_path) prevents the same archive/path
+-- from mapping to different filings — a given file in a given ZIP is one filing.
+CREATE TABLE IF NOT EXISTS raw.filing_source (
+ id SERIAL PRIMARY KEY,
+ raw_filing_id INT NOT NULL REFERENCES raw.filing(id),
+ ingest_run_id INT REFERENCES raw.ingest_run(id),
+ source_archive TEXT NOT NULL, -- e.g. 'download990xml_2020_1.zip'
+ source_path TEXT NOT NULL, -- e.g. 'Cycles_202042_202052/202013089349101246_public.xml'
+ seen_at TIMESTAMPTZ NOT NULL DEFAULT now(),
+ UNIQUE (source_archive, source_path)
+);
+
+CREATE INDEX IF NOT EXISTS idx_raw_filing_source_archive ON raw.filing_source (source_archive);
+CREATE INDEX IF NOT EXISTS idx_raw_filing_source_filing ON raw.filing_source (raw_filing_id);
+
+-- ============================================================
+-- 6. Filing summary tables (per form type, 1:1 with raw.filing)
+-- ============================================================
+
+-- Form 990: public charity / large exempt org filing summary.
+-- Holds Part I summary, balance sheet totals, governance counts,
+-- and Schedule I metadata (since Schedule I is 1:1 with the 990 filing).
+CREATE TABLE IF NOT EXISTS raw.form_990 (
+ id SERIAL PRIMARY KEY,
+ raw_filing_id INT NOT NULL UNIQUE REFERENCES raw.filing(id),
+
+ -- filer identity (as reported on this specific return)
+ filer_name2 TEXT, -- BusinessNameLine2Txt
+ filer_address_line1 TEXT,
+ filer_address_line2 TEXT,
+ filer_city TEXT,
+ filer_state TEXT,
+ filer_zip TEXT,
+ filer_country TEXT, -- only for foreign filers
+ filer_foreign_postal_code TEXT,
+ phone TEXT,
+ website TEXT,
+
+ -- classification
+ is_501c3 BOOLEAN,
+ section_501c_type TEXT, -- '4', '6', '7', etc. if not 501c3
+ org_type TEXT, -- 'corp', 'trust', 'assoc', 'other'
+ group_return BOOLEAN,
+ group_exemption_num TEXT,
+ formation_year TEXT,
+ legal_domicile_state TEXT,
+ mission TEXT,
+ accounting_method TEXT, -- 'cash', 'accrual', 'other'
+
+ -- filing status flags
+ is_amended BOOLEAN DEFAULT false,
+ is_final BOOLEAN DEFAULT false,
+ is_initial BOOLEAN DEFAULT false,
+ is_terminated BOOLEAN DEFAULT false,
+
+ -- Part I: current year summary
+ gross_receipts NUMERIC,
+ cy_contributions_grants NUMERIC,
+ cy_program_service_revenue NUMERIC,
+ cy_investment_income NUMERIC,
+ cy_other_revenue NUMERIC,
+ cy_total_revenue NUMERIC,
+ cy_grants_paid NUMERIC,
+ cy_benefits_to_members NUMERIC,
+ cy_salaries_benefits NUMERIC,
+ cy_fundraising_expense NUMERIC,
+ cy_other_expenses NUMERIC,
+ cy_total_expenses NUMERIC,
+ cy_revenue_less_expenses NUMERIC,
+
+ -- Part I: prior year summary (NULL for first-year filers)
+ py_total_revenue NUMERIC,
+ py_total_expenses NUMERIC,
+
+ -- balance sheet (Part I summary / Part X)
+ total_assets_boy NUMERIC,
+ total_assets_eoy NUMERIC,
+ total_liabilities_boy NUMERIC,
+ total_liabilities_eoy NUMERIC,
+ net_assets_boy NUMERIC,
+ net_assets_eoy NUMERIC,
+
+ -- governance / workforce
+ total_employees INT,
+ total_volunteers INT,
+ voting_members INT,
+ independent_voting_members INT,
+
+ -- Part IX: functional expense breakdown
+ program_services_expense NUMERIC,
+ management_general_expense NUMERIC,
+ fundraising_expense_ix NUMERIC, -- Part IX col D total
+
+ -- Part VIII: revenue detail
+ government_grants NUMERIC,
+ total_contributions NUMERIC,
+ total_program_service_rev NUMERIC,
+ investment_income NUMERIC,
+
+ -- UBI
+ gross_ubi NUMERIC,
+ net_ubi NUMERIC,
+
+ -- Schedule I metadata (1:1 with filing)
+ sched_i_grant_records_maintained BOOLEAN,
+ sched_i_501c3_org_count INT,
+ sched_i_other_org_count INT,
+ sched_i_total_grants_amt NUMERIC, -- for reconciliation vs grant rows
+
+ -- grant detail completeness
+ grant_detail_status TEXT NOT NULL DEFAULT 'unresolved'
+ REFERENCES raw.grant_detail_status(code),
+
+ -- officer / signer
+ principal_officer TEXT,
+ officer_name TEXT,
+ officer_title TEXT,
+ signature_date DATE,
+ preparer_firm TEXT
+);
+
+-- Form 990-PF: private foundation filing summary.
+CREATE TABLE IF NOT EXISTS raw.form_990pf (
+ id SERIAL PRIMARY KEY,
+ raw_filing_id INT NOT NULL UNIQUE REFERENCES raw.filing(id),
+
+ -- filer identity (as reported on this specific return)
+ filer_name2 TEXT,
+ filer_address_line1 TEXT,
+ filer_address_line2 TEXT,
+ filer_city TEXT,
+ filer_state TEXT,
+ filer_zip TEXT,
+ filer_country TEXT,
+ filer_foreign_postal_code TEXT,
+ phone TEXT,
+ website TEXT,
+
+ -- classification
+ is_501c3_pf BOOLEAN,
+ is_4947a1_trust BOOLEAN,
+ is_private_operating BOOLEAN,
+ accounting_method TEXT, -- 'cash', 'accrual'
+
+ -- filing status flags
+ is_amended BOOLEAN DEFAULT false,
+ is_final BOOLEAN DEFAULT false,
+ is_initial BOOLEAN DEFAULT false,
+
+ -- Part I: revenue and expenses
+ contributions_received NUMERIC,
+ interest_revenue NUMERIC,
+ dividends_revenue NUMERIC,
+ net_gain_sale_assets NUMERIC,
+ total_revenue NUMERIC,
+ total_net_investment_income NUMERIC,
+ compensation_officers NUMERIC,
+ total_operating_expenses NUMERIC,
+ contributions_paid NUMERIC,
+ total_expenses NUMERIC,
+ total_charitable_disbursements NUMERIC,
+ excess_revenue_over_expenses NUMERIC,
+ net_investment_income NUMERIC,
+ adjusted_net_income NUMERIC,
+
+ -- Part II: balance sheets
+ total_assets_boy NUMERIC,
+ total_assets_eoy NUMERIC,
+ total_assets_eoy_fmv NUMERIC,
+ total_liabilities_boy NUMERIC,
+ total_liabilities_eoy NUMERIC,
+ net_assets_boy NUMERIC,
+ net_assets_eoy NUMERIC,
+ fmv_assets_eoy NUMERIC, -- cover-page FMV figure
+
+ -- Part X-XII: distribution calculations
+ minimum_investment_return NUMERIC,
+ distributable_amount NUMERIC,
+ qualifying_distributions NUMERIC,
+
+ -- excise tax
+ excise_tax_amount NUMERIC,
+
+ -- Part XV: supplementary information totals (for reconciliation)
+ total_grants_paid NUMERIC,
+ total_grants_approved_future NUMERIC,
+
+ -- state registration
+ state_of_registration TEXT,
+
+ -- grant detail completeness
+ grant_detail_status TEXT NOT NULL DEFAULT 'unresolved'
+ REFERENCES raw.grant_detail_status(code),
+
+ -- officer / signer
+ officer_name TEXT,
+ officer_title TEXT,
+ signature_date DATE,
+ preparer_firm TEXT
+);
+
+-- Form 990-EZ: smaller exempt org filing summary.
+-- Parser not yet built, but schema should exist so the table set is complete.
+CREATE TABLE IF NOT EXISTS raw.form_990ez (
+ id SERIAL PRIMARY KEY,
+ raw_filing_id INT NOT NULL UNIQUE REFERENCES raw.filing(id),
+
+ -- filer identity
+ filer_name2 TEXT,
+ filer_address_line1 TEXT,
+ filer_city TEXT,
+ filer_state TEXT,
+ filer_zip TEXT,
+ filer_country TEXT,
+ phone TEXT,
+ website TEXT,
+
+ -- classification
+ is_501c3 BOOLEAN,
+ section_501c_type TEXT,
+ group_exemption_num TEXT,
+
+ -- filing status flags
+ is_amended BOOLEAN DEFAULT false,
+ is_final BOOLEAN DEFAULT false,
+ is_initial BOOLEAN DEFAULT false,
+
+ -- Part I: revenue, expenses, assets
+ gross_receipts NUMERIC,
+ contributions_gifts_grants NUMERIC,
+ program_service_revenue NUMERIC,
+ investment_income NUMERIC,
+ total_revenue NUMERIC,
+ grants_paid NUMERIC,
+ salaries_compensation NUMERIC,
+ total_expenses NUMERIC,
+ revenue_less_expenses NUMERIC,
+ total_assets_boy NUMERIC,
+ total_assets_eoy NUMERIC,
+ total_liabilities_boy NUMERIC,
+ total_liabilities_eoy NUMERIC,
+ net_assets_boy NUMERIC,
+ net_assets_eoy NUMERIC,
+
+ -- grant detail completeness
+ -- 990-EZ grant detail often lives in Schedule O or attachments,
+ -- not in a structured repeated table like Schedule I.
+ grant_detail_status TEXT NOT NULL DEFAULT 'unresolved'
+ REFERENCES raw.grant_detail_status(code),
+ has_schedule_o BOOLEAN,
+
+ -- officer / signer
+ officer_name TEXT,
+ officer_title TEXT,
+ signature_date DATE,
+ preparer_firm TEXT
+);
+
+-- ============================================================
+-- 7. Raw grant tables (per form type)
+-- ============================================================
+
+-- 990-PF Part XV grants (private foundation → recipient)
+--
+-- line_number is the ordinal position of this grant within the filing's
+-- grant list (1-based). Together with raw_filing_id it forms a natural
+-- dedup key: re-ingesting the same filing cannot create duplicate rows.
+CREATE TABLE IF NOT EXISTS raw.grant_990pf (
+ id SERIAL PRIMARY KEY,
+ raw_filing_id INT NOT NULL REFERENCES raw.filing(id),
+ line_number INT NOT NULL,
+ -- recipient fields
+ recipient_name TEXT,
+ recipient_name2 TEXT,
+ recipient_person_name TEXT, -- non-null = individual grant
+ address_line1 TEXT,
+ address_line2 TEXT,
+ city TEXT,
+ state TEXT,
+ zip TEXT,
+ country TEXT,
+ foreign_postal_code TEXT,
+ -- grant fields (raw text preserved, typed column alongside)
+ amount_raw TEXT,
+ amount NUMERIC,
+ purpose TEXT,
+ foundation_status TEXT,
+ relationship TEXT,
+ UNIQUE (raw_filing_id, line_number)
+);
+
+CREATE INDEX IF NOT EXISTS idx_raw_grant_990pf_filing ON raw.grant_990pf (raw_filing_id);
+
+-- 990 Schedule I grants (public charity → recipient)
+CREATE TABLE IF NOT EXISTS raw.grant_990 (
+ id SERIAL PRIMARY KEY,
+ raw_filing_id INT NOT NULL REFERENCES raw.filing(id),
+ line_number INT NOT NULL,
+ -- recipient fields
+ recipient_name TEXT,
+ recipient_name2 TEXT,
+ recipient_ein TEXT CHECK (recipient_ein IS NULL OR recipient_ein ~ '^\d{9}$'),
+ address_line1 TEXT,
+ address_line2 TEXT,
+ city TEXT,
+ state TEXT,
+ zip TEXT,
+ country TEXT,
+ foreign_postal_code TEXT,
+ -- grant fields (raw text preserved, typed columns alongside)
+ cash_grant_amt_raw TEXT,
+ cash_grant_amt NUMERIC,
+ non_cash_amt_raw TEXT,
+ non_cash_amt NUMERIC,
+ non_cash_desc TEXT,
+ valuation_method TEXT,
+ purpose TEXT,
+ irc_section TEXT,
+ UNIQUE (raw_filing_id, line_number)
+);
+
+CREATE INDEX IF NOT EXISTS idx_raw_grant_990_filing ON raw.grant_990 (raw_filing_id);
+
+-- ============================================================
+-- 8. Schedule O / narrative text
+-- ============================================================
+
+-- Preserves narrative text from Schedule O and supplemental information.
+-- Both 990 and 990-EZ push grant detail and explanatory text here.
+-- Not fully structured — stores raw text blocks with whatever section
+-- reference the filing provides.
+CREATE TABLE IF NOT EXISTS raw.schedule_o (
+ id SERIAL PRIMARY KEY,
+ raw_filing_id INT NOT NULL REFERENCES raw.filing(id),
+ line_number INT NOT NULL, -- ordinal within the schedule
+ form_line_ref TEXT, -- e.g. 'SCHEDULE I, PART I, LINE 2'
+ explanation TEXT,
+ UNIQUE (raw_filing_id, line_number)
+);
+
+CREATE INDEX IF NOT EXISTS idx_raw_schedule_o_filing ON raw.schedule_o (raw_filing_id);
+
+-- ============================================================
+-- 10. BMF (organization registry reference source)
+-- ============================================================
+
+-- Current-snapshot IRS EO BMF rows, minimally transformed from the published
+-- CSVs. This is a registry/reference source, not a filing source.
+CREATE TABLE IF NOT EXISTS raw.bmf (
+ ein TEXT PRIMARY KEY CHECK (ein ~ '^\d{9}$'),
+ name TEXT,
+ ico TEXT,
+ street TEXT,
+ city TEXT,
+ state TEXT,
+ zip TEXT,
+ grp TEXT,
+ subsection TEXT,
+ affiliation TEXT,
+ classification TEXT,
+ ruling TEXT,
+ deductibility TEXT,
+ foundation TEXT,
+ activity TEXT,
+ organization TEXT,
+ status TEXT,
+ tax_period TEXT,
+ asset_cd TEXT,
+ income_cd TEXT,
+ filing_req_cd TEXT,
+ pf_filing_req_cd TEXT,
+ acct_pd TEXT,
+ asset_amt BIGINT,
+ income_amt BIGINT,
+ revenue_amt BIGINT,
+ ntee_cd TEXT,
+ sort_name TEXT,
+ source_file TEXT,
+ ingest_run_id INT REFERENCES raw.ingest_run(id),
+ created_at TIMESTAMPTZ NOT NULL DEFAULT now()
+);
+
+CREATE INDEX IF NOT EXISTS idx_raw_bmf_name ON raw.bmf (name);
+CREATE INDEX IF NOT EXISTS idx_raw_bmf_state ON raw.bmf (state);
+CREATE INDEX IF NOT EXISTS idx_raw_bmf_ntee_cd ON raw.bmf (ntee_cd);
+
+COMMIT;