diff options
| author | benj <benj@rse8.com> | 2026-04-10 11:13:57 +0800 |
|---|---|---|
| committer | benj <benj@rse8.com> | 2026-04-10 11:13:57 +0800 |
| commit | 6605e2cc428e3bdaa174ccc432941eab8c5d61cb (patch) | |
| tree | 52f9d176c2ce1a80adb2ea2ac31cd12d3a29c0db /scripts/extract/irs_990_pdf.py | |
| parent | 493746b14c1251a45b061d2e3edd9160c929d2b9 (diff) | |
| download | tidyindex-6605e2cc428e3bdaa174ccc432941eab8c5d61cb.tar tidyindex-6605e2cc428e3bdaa174ccc432941eab8c5d61cb.tar.gz tidyindex-6605e2cc428e3bdaa174ccc432941eab8c5d61cb.tar.bz2 tidyindex-6605e2cc428e3bdaa174ccc432941eab8c5d61cb.tar.lz tidyindex-6605e2cc428e3bdaa174ccc432941eab8c5d61cb.tar.xz tidyindex-6605e2cc428e3bdaa174ccc432941eab8c5d61cb.tar.zst tidyindex-6605e2cc428e3bdaa174ccc432941eab8c5d61cb.zip | |
ensure parsers do not parse and store raw XML fields
Diffstat (limited to '')
| -rw-r--r-- | scripts/extract/irs_990_pdf.py | 699 |
1 file changed, 699 insertions, 0 deletions
diff --git a/scripts/extract/irs_990_pdf.py b/scripts/extract/irs_990_pdf.py new file mode 100644 index 0000000..1d1209c --- /dev/null +++ b/scripts/extract/irs_990_pdf.py @@ -0,0 +1,699 @@ +""" +Source-agnostic 990-PF PDF grant extractor. + +Takes a path to a local PDF and returns structured grant data. Side-effect +free with respect to any database; only external call is to Anthropic's API. + +A separate loader consumes ExtractionResult and writes to raw.* tables. + +Usage (CLI): + python -m scripts.extract.irs_990_pdf path/to/file.pdf + python -m scripts.extract.irs_990_pdf path/to/file.pdf --json + python -m scripts.extract.irs_990_pdf path/to/file.pdf --tax-year 2021 + python -m scripts.extract.irs_990_pdf path/to/file.pdf --source-label ny_ag + +Usage (programmatic): + from scripts.extract.irs_990_pdf import extract_from_pdf + result = extract_from_pdf("data/tmp/pdf_test/marley_2017.pdf", tax_year=2017) + for grant in result.grants: + print(grant.recipient_name, grant.amount) +""" + +from __future__ import annotations + +import argparse +import base64 +import json +import re +import subprocess +import sys +from dataclasses import dataclass, field, asdict +from decimal import Decimal, InvalidOperation +from pathlib import Path +from typing import Any + +import anthropic +import fitz # pymupdf +import pdfplumber + +from scripts.common.normalize import is_placeholder, parse_numeric + + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + +MODEL = "claude-haiku-4-5-20251001" + +# Text-path quality heuristic. If the number of grants extracted is less than +# num_grant_pages * MIN_GRANTS_PER_PAGE, we flag the result as low-yield and +# fall back to vision. This is an initial heuristic and may be tuned. +MIN_GRANTS_PER_PAGE = 1 + +# Minimum total text characters across all pages before we trust the text layer. 
+MIN_TEXT_LAYER_CHARS = 200 + +# Standard 990-PF form has 13 pages; grant attachments come after. +ATTACHMENT_START_PAGE_IDX = 13 # 0-based; corresponds to page 14 + +# Vision scan stops after this many consecutive non-grant pages (once any +# grant page has been found). +VISION_CONSECUTIVE_NO_LIMIT = 3 + +# DPIs used for vision rendering. +TRIAGE_DPI = 100 +EXTRACTION_DPI = 150 + +GRANT_PAGE_KEYWORDS = [ + r"GRANTS AND CONTRIBUTIONS PAID", + r"SUPPLEMENTARY INFORMATION", + r"PART XIV", + r"PART XV", + r"SCHEDULE OF.*GRANT", + r"GRANTS PAID", + r"CONTRIBUTIONS PAID", +] +_GRANT_PAGE_RE = re.compile("|".join(GRANT_PAGE_KEYWORDS)) + +# Loose marker for "this PDF is a 990-PF". NY AG PDFs may contain only the +# CHAR500 state cover form with no federal return attached — ~46% of that +# corpus, based on sampling. Those should short-circuit before we waste any +# Haiku calls. +_IS_990PF_RE = re.compile(r"FORM\s*990-?PF") + +TEXT_EXTRACTION_PROMPT = """Extract ALL individual grants from this 990-PF page text. + +Return a JSON array. 
Each grant should have: +- recipient_name: organization name (if grant is to an org) +- recipient_person_name: person's name (if grant is to an individual) +- address_line1: street address (if present) +- address_line2: second address line (if present) +- city: city (if present) +- state: state abbreviation (if present) +- zip: zip code (if present) +- country: country (if present, only when non-US) +- foreign_postal_code: foreign postal code (if present) +- amount: dollar amount as string (digits only, no $ or commas) +- purpose: purpose of grant (if present) +- foundation_status: recipient status like PC, NC, PF (if present) +- relationship: relationship of recipient to foundation (if present) + +IMPORTANT RULES: +- Do NOT include total/subtotal rows +- Do NOT include header rows or column labels +- If there are multiple year columns, extract ONLY the most recent year +- If recipient name is "SEE ATTACHED", "VARIOUS", "N/A", "NONE", or a similar placeholder, skip it +- Return [] if no individual grants are found + +Return ONLY the JSON array, no other text.""" + +VISION_EXTRACTION_PROMPT = """Extract ALL individual grants from this 990-PF page image. + +Return a JSON array. 
Each grant should have: +- recipient_name: organization name (if grant is to an org) +- recipient_person_name: person's name (if grant is to an individual) +- address_line1: street address (if visible) +- address_line2: second address line (if visible) +- city: city +- state: state abbreviation +- zip: zip code (if visible) +- country: country (only if non-US) +- foreign_postal_code: foreign postal code (if present) +- amount: dollar amount as string (digits only, no $ or commas) +- purpose: purpose of grant +- foundation_status: recipient status like PC, NC, PF (if present) +- relationship: relationship of recipient to foundation (if present) + +IMPORTANT RULES: +- Do NOT include total/subtotal rows +- Do NOT include header rows +- If there are multiple year columns, extract ONLY the most recent year +- If recipient name is "SEE ATTACHED", "VARIOUS", "N/A", "NONE", or a similar placeholder, skip it +- Return [] if no individual grants are found + +Return ONLY the JSON array, no other text.""" + +TRIAGE_PROMPT = ( + "Is this page a table of grant or contribution recipients listing " + "individual organization names with addresses and dollar amounts? " + "Answer ONLY yes or no." 
+) + + +# --------------------------------------------------------------------------- +# Anthropic client (lazy singleton so importing this module is cheap) +# --------------------------------------------------------------------------- + +_client: anthropic.Anthropic | None = None + + +def _get_client() -> anthropic.Anthropic: + global _client + if _client is None: + api_key = subprocess.run( + ["pass", "show", "anthropic.com/api.anthropic.com/apikey"], + capture_output=True, text=True, + ).stdout.strip() + _client = anthropic.Anthropic(api_key=api_key) + return _client + + +# --------------------------------------------------------------------------- +# Dataclasses +# --------------------------------------------------------------------------- + +@dataclass +class ExtractedGrant: + line_number: int + recipient_name: str | None = None + recipient_name2: str | None = None + recipient_person_name: str | None = None + address_line1: str | None = None + address_line2: str | None = None + city: str | None = None + state: str | None = None + zip: str | None = None + country: str | None = None + foreign_postal_code: str | None = None + amount_raw: str | None = None + amount: Decimal | None = None + purpose: str | None = None + foundation_status: str | None = None + relationship: str | None = None + + +@dataclass +class ExtractionResult: + success: bool + grants: list[ExtractedGrant] + # 'supplemented' — ≥1 grant extracted + # 'no_grants' — extractor ran end-to-end, found nothing usable + # 'not_a_990pf' — PDF is readable but isn't a 990-PF (e.g. 
CHAR500 cover) + # None — catastrophic failure (success=False) + grant_detail_status: str | None + # 'pdfplumber+haiku_text' — text path was used + # 'haiku_vision_attempted' — vision path was used (success not implied) + # 'skipped_not_990pf' — short-circuited; no API calls made + # 'failed' — catastrophic failure (success=False) + method: str + diagnostics: dict = field(default_factory=dict) + + @property + def total_amount(self) -> Decimal: + return sum((g.amount or Decimal(0)) for g in self.grants) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _parse_haiku_response(text: str) -> list[dict]: + """Parse JSON from Haiku response, handling markdown code blocks.""" + text = text.strip() + if text.startswith("```"): + # Drop opening fence + optional language tag + parts = text.split("\n", 1) + if len(parts) == 2: + text = parts[1] + text = text.rsplit("```", 1)[0] + text = text.strip() + try: + parsed = json.loads(text) + except json.JSONDecodeError: + return [] + if not isinstance(parsed, list): + return [] + return [x for x in parsed if isinstance(x, dict)] + + +def _is_grant_page_text(text: str) -> bool: + return bool(_GRANT_PAGE_RE.search(text.upper())) + + +def _clean_str(value: Any) -> str | None: + if value is None: + return None + s = str(value).strip() + return s or None + + +def _normalize_grant(raw: dict) -> ExtractedGrant | None: + """Map a loose Haiku JSON dict to an ExtractedGrant. + + Returns None if the row looks like a placeholder/header artifact. + line_number is set to 0 here; it's reassigned later after dedupe. + """ + recipient_name = _clean_str(raw.get("recipient_name")) + recipient_person_name = _clean_str(raw.get("recipient_person_name")) + + # Safety net: prompt already filters these, but double-check. 
+ if recipient_name and is_placeholder(recipient_name): + recipient_name = None + if recipient_person_name and is_placeholder(recipient_person_name): + recipient_person_name = None + + amount_raw = _clean_str(raw.get("amount")) + amount_numeric: Decimal | None = None + if amount_raw is not None: + parsed = parse_numeric(amount_raw) + if parsed is not None: + try: + amount_numeric = Decimal(parsed) + except InvalidOperation: + amount_numeric = None + + return ExtractedGrant( + line_number=0, + recipient_name=recipient_name, + recipient_name2=_clean_str(raw.get("recipient_name2")), + recipient_person_name=recipient_person_name, + address_line1=_clean_str(raw.get("address_line1")), + address_line2=_clean_str(raw.get("address_line2")), + city=_clean_str(raw.get("city")), + state=_clean_str(raw.get("state")), + zip=_clean_str(raw.get("zip")), + country=_clean_str(raw.get("country")), + foreign_postal_code=_clean_str(raw.get("foreign_postal_code")), + amount_raw=amount_raw, + amount=amount_numeric, + purpose=_clean_str(raw.get("purpose")), + foundation_status=_clean_str(raw.get("foundation_status")), + relationship=_clean_str(raw.get("relationship")), + ) + + +def _year_hint(tax_year: int | None) -> str: + if tax_year is None: + return "" + return f"\n\nThis filing is for tax year {tax_year}." + + +def _postprocess(grants: list[ExtractedGrant]) -> list[ExtractedGrant]: + """Drop placeholders / amount-less rows, dedupe, and reassign line_number.""" + out: list[ExtractedGrant] = [] + seen: set[tuple] = set() + + for g in grants: + # Drop rows with no recipient at all. + if not g.recipient_name and not g.recipient_person_name: + continue + # Drop placeholder recipients that slipped through (belt-and-suspenders). + if g.recipient_name and is_placeholder(g.recipient_name): + continue + if g.recipient_person_name and is_placeholder(g.recipient_person_name): + continue + # Drop rows with no amount text at all (Haiku didn't see an amount column). 
+ # Keep rows where amount_raw == '0' and amount == Decimal(0). + if g.amount is None and not g.amount_raw: + continue + + key = ( + (g.recipient_name or "").upper(), + (g.city or "").upper(), + (g.state or "").upper(), + g.amount_raw or "", + ) + if key in seen: + continue + seen.add(key) + out.append(g) + + for i, g in enumerate(out, start=1): + g.line_number = i + return out + + +# --------------------------------------------------------------------------- +# Text path +# --------------------------------------------------------------------------- + +def _extract_text_layer( + pdf_path: Path, + tax_year: int | None, + diagnostics: dict, +) -> tuple[list[ExtractedGrant], str, int]: + """Extract grants via pdfplumber text + Haiku text parsing. + + Returns (grants, status, num_grant_pages). + status ∈ {'ok', 'not_a_990pf', 'no_text_layer', 'no_grant_pages', + 'haiku_empty', 'low_yield', 'error'} + + 'not_a_990pf' means the PDF has a readable text layer but doesn't look + like a 990-PF at all (e.g. a standalone NY State CHAR500 cover form). + The caller should short-circuit on this — there's nothing for the vision + path to find either. + """ + try: + pdf = pdfplumber.open(pdf_path) + except Exception as exc: + diagnostics["text_path_error"] = str(exc) + return [], "error", 0 + + try: + page_texts: list[tuple[int, str]] = [] + total_chars = 0 + for idx, page in enumerate(pdf.pages, start=1): + text = page.extract_text() or "" + page_texts.append((idx, text)) + total_chars += len(text) + + diagnostics["text_layer_chars"] = total_chars + diagnostics["pages_total"] = len(page_texts) + + if total_chars < MIN_TEXT_LAYER_CHARS: + return [], "no_text_layer", 0 + + # Must look like a 990-PF at all. If not, don't bother with Haiku — + # and don't fall through to vision, since it'd just burn triage calls + # scanning a non-990-PF document end to end. 
+ joined_upper = "\n".join(t for _, t in page_texts).upper() + if not _IS_990PF_RE.search(joined_upper): + return [], "not_a_990pf", 0 + + grant_pages = [ + (num, text) for num, text in page_texts + if len(text) > 100 and _is_grant_page_text(text) + ] + diagnostics["grant_pages_identified"] = len(grant_pages) + + if not grant_pages: + return [], "no_grant_pages", 0 + finally: + pdf.close() + + client = _get_client() + year_hint = _year_hint(tax_year) + + raw_grants: list[dict] = [] + try: + for page_num, text in grant_pages: + resp = client.messages.create( + model=MODEL, + max_tokens=4096, + messages=[{ + "role": "user", + "content": f"{TEXT_EXTRACTION_PROMPT}{year_hint}\n\nPage text:\n{text}", + }], + ) + raw_grants.extend(_parse_haiku_response(resp.content[0].text)) + except anthropic.APIError as exc: + # Discard partial results — fall back to vision cleanly. + diagnostics["text_path_error"] = f"{type(exc).__name__}: {exc}" + return [], "error", len(grant_pages) + + if not raw_grants: + return [], "haiku_empty", len(grant_pages) + + grants: list[ExtractedGrant] = [] + for raw in raw_grants: + g = _normalize_grant(raw) + if g is not None: + grants.append(g) + + if len(grants) < len(grant_pages) * MIN_GRANTS_PER_PAGE: + return grants, "low_yield", len(grant_pages) + + return grants, "ok", len(grant_pages) + + +# --------------------------------------------------------------------------- +# Vision path +# --------------------------------------------------------------------------- + +def _render_page_b64(page, dpi: int) -> str: + pix = page.get_pixmap(dpi=dpi) + return base64.standard_b64encode(pix.tobytes("png")).decode("utf-8") + + +def _extract_vision( + pdf_path: Path, + tax_year: int | None, + diagnostics: dict, +) -> tuple[list[ExtractedGrant], int, int]: + """Extract grants via Haiku vision over rendered page images. + + Returns (grants, pages_scanned, pages_extracted). Raises anthropic.APIError + on catastrophic API failure (caller handles). 
+ """ + client = _get_client() + year_hint = _year_hint(tax_year) + + doc = fitz.open(pdf_path) + try: + total_pages = len(doc) + diagnostics.setdefault("pages_total", total_pages) + + if total_pages <= ATTACHMENT_START_PAGE_IDX: + diagnostics["vision_pages_scanned"] = 0 + diagnostics["vision_pages_extracted"] = 0 + return [], 0, 0 + + # Phase 1: triage pages to find grant tables. + grant_page_indices: list[int] = [] + consecutive_no = 0 + found_any = False + pages_scanned = 0 + + for i in range(ATTACHMENT_START_PAGE_IDX, total_pages): + pages_scanned += 1 + b64 = _render_page_b64(doc[i], dpi=TRIAGE_DPI) + resp = client.messages.create( + model=MODEL, + max_tokens=10, + messages=[{ + "role": "user", + "content": [ + {"type": "image", + "source": {"type": "base64", "media_type": "image/png", "data": b64}}, + {"type": "text", "text": TRIAGE_PROMPT}, + ], + }], + ) + is_grant = "yes" in resp.content[0].text.lower() + if is_grant: + grant_page_indices.append(i) + consecutive_no = 0 + found_any = True + else: + consecutive_no += 1 + if found_any and consecutive_no >= VISION_CONSECUTIVE_NO_LIMIT: + break + + diagnostics["vision_pages_scanned"] = pages_scanned + + if not grant_page_indices: + diagnostics["vision_pages_extracted"] = 0 + return [], pages_scanned, 0 + + # Phase 2: extract from identified grant pages. 
+ raw_grants: list[dict] = [] + pages_extracted = 0 + for i in grant_page_indices: + b64 = _render_page_b64(doc[i], dpi=EXTRACTION_DPI) + resp = client.messages.create( + model=MODEL, + max_tokens=4096, + messages=[{ + "role": "user", + "content": [ + {"type": "image", + "source": {"type": "base64", "media_type": "image/png", "data": b64}}, + {"type": "text", "text": f"{VISION_EXTRACTION_PROMPT}{year_hint}"}, + ], + }], + ) + page_grants = _parse_haiku_response(resp.content[0].text) + if page_grants: + pages_extracted += 1 + raw_grants.extend(page_grants) + + diagnostics["vision_pages_extracted"] = pages_extracted + finally: + doc.close() + + grants: list[ExtractedGrant] = [] + for raw in raw_grants: + g = _normalize_grant(raw) + if g is not None: + grants.append(g) + + return grants, pages_scanned, pages_extracted + + +# --------------------------------------------------------------------------- +# Top-level extractor +# --------------------------------------------------------------------------- + +def extract_from_pdf( + pdf_path: str | Path, + tax_year: int | None = None, + source_label: str | None = None, +) -> ExtractionResult: + """Extract grants from a single 990-PF PDF. + + Stateless: reads only the file at pdf_path and calls Anthropic's API. + Writes nothing. See module docstring for full contract. + """ + pdf_path = Path(pdf_path) + + diagnostics: dict = { + "tax_year_hint": tax_year, + "source_label": source_label, + } + + if not pdf_path.exists(): + diagnostics["error"] = f"PDF not found: {pdf_path}" + return ExtractionResult( + success=False, + grants=[], + grant_detail_status=None, + method="failed", + diagnostics=diagnostics, + ) + + # Text path + try: + text_grants, text_status, num_grant_pages = _extract_text_layer( + pdf_path, tax_year, diagnostics + ) + except Exception as exc: + # pdfplumber can blow up on malformed PDFs — don't let that kill us, + # fall through to the vision path like any other text failure. 
+ text_grants = [] + text_status = "error" + num_grant_pages = 0 + diagnostics.setdefault("text_path_error", f"{type(exc).__name__}: {exc}") + + diagnostics["text_path_status"] = text_status + + if text_status == "ok": + grants = _postprocess(text_grants) + return ExtractionResult( + success=True, + grants=grants, + grant_detail_status="supplemented" if grants else "no_grants", + method="pdfplumber+haiku_text", + diagnostics=diagnostics, + ) + + # Short-circuit: PDF has a readable text layer but isn't a 990-PF + # (e.g. a standalone NY State CHAR500 cover form). Don't run vision — + # there's nothing in the document for it to find. + if text_status == "not_a_990pf": + return ExtractionResult( + success=True, + grants=[], + grant_detail_status="not_a_990pf", + method="skipped_not_990pf", + diagnostics=diagnostics, + ) + + # Vision fallback — replaces text output entirely. + try: + vision_grants, _scanned, _extracted = _extract_vision( + pdf_path, tax_year, diagnostics + ) + except anthropic.APIError as exc: + diagnostics["error"] = f"{type(exc).__name__}: {exc}" + return ExtractionResult( + success=False, + grants=[], + grant_detail_status=None, + method="failed", + diagnostics=diagnostics, + ) + except Exception as exc: + diagnostics["error"] = f"{type(exc).__name__}: {exc}" + return ExtractionResult( + success=False, + grants=[], + grant_detail_status=None, + method="failed", + diagnostics=diagnostics, + ) + + grants = _postprocess(vision_grants) + return ExtractionResult( + success=True, + grants=grants, + grant_detail_status="supplemented" if grants else "no_grants", + method="haiku_vision_attempted", + diagnostics=diagnostics, + ) + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + +def _result_to_jsonable(result: ExtractionResult) -> dict: + def grant_to_dict(g: ExtractedGrant) -> dict: + d = asdict(g) + if d["amount"] is not None: + 
d["amount"] = str(d["amount"]) + return d + + return { + "success": result.success, + "method": result.method, + "grant_detail_status": result.grant_detail_status, + "diagnostics": result.diagnostics, + "grants": [grant_to_dict(g) for g in result.grants], + "total_amount": str(result.total_amount), + } + + +def _print_table(result: ExtractionResult) -> None: + print(f"success: {result.success}") + print(f"method: {result.method}") + print(f"grant_detail_status: {result.grant_detail_status}") + print(f"grants: {len(result.grants)}") + print(f"total_amount: ${result.total_amount:,}") + print("diagnostics:") + for k, v in result.diagnostics.items(): + print(f" {k}: {v}") + if not result.grants: + return + print() + print(f"{'#':>4} {'recipient':<45} {'city':<20} {'st':<3} {'amount':>12}") + print("-" * 90) + for g in result.grants[:50]: + name = (g.recipient_name or g.recipient_person_name or "")[:45] + city = (g.city or "")[:20] + state = (g.state or "")[:3] + amt = f"${g.amount:,.0f}" if g.amount is not None else (g.amount_raw or "") + print(f"{g.line_number:>4} {name:<45} {city:<20} {state:<3} {amt:>12}") + if len(result.grants) > 50: + print(f"... and {len(result.grants) - 50} more") + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser( + prog="scripts.extract.irs_990_pdf", + description="Extract grant data from a 990-PF PDF.", + ) + parser.add_argument("pdf_path", help="Path to a local PDF file.") + parser.add_argument("--tax-year", type=int, default=None, + help="Tax year hint passed to the extraction prompts.") + parser.add_argument("--source-label", default=None, + help="Diagnostic label for the PDF source (e.g. 
'ny_ag').") + parser.add_argument("--json", action="store_true", + help="Emit the full result as JSON instead of a table.") + args = parser.parse_args(argv) + + result = extract_from_pdf( + args.pdf_path, + tax_year=args.tax_year, + source_label=args.source_label, + ) + + if args.json: + print(json.dumps(_result_to_jsonable(result), indent=2)) + else: + _print_table(result) + + return 0 if result.success else 1 + + +if __name__ == "__main__": + sys.exit(main()) |
