diff options
| author | benj <benj@rse8.com> | 2026-04-10 11:13:57 +0800 |
|---|---|---|
| committer | benj <benj@rse8.com> | 2026-04-10 11:13:57 +0800 |
| commit | 6605e2cc428e3bdaa174ccc432941eab8c5d61cb (patch) | |
| tree | 52f9d176c2ce1a80adb2ea2ac31cd12d3a29c0db /scripts/extract/irs_990_pdf.py | |
| parent | 493746b14c1251a45b061d2e3edd9160c929d2b9 (diff) | |
| download | tidyindex-6605e2cc428e3bdaa174ccc432941eab8c5d61cb.tar tidyindex-6605e2cc428e3bdaa174ccc432941eab8c5d61cb.tar.gz tidyindex-6605e2cc428e3bdaa174ccc432941eab8c5d61cb.tar.bz2 tidyindex-6605e2cc428e3bdaa174ccc432941eab8c5d61cb.tar.lz tidyindex-6605e2cc428e3bdaa174ccc432941eab8c5d61cb.tar.xz tidyindex-6605e2cc428e3bdaa174ccc432941eab8c5d61cb.tar.zst tidyindex-6605e2cc428e3bdaa174ccc432941eab8c5d61cb.zip | |
ensure parsers do not parse and store raw XML fields
Diffstat (limited to '')
| -rw-r--r-- | scripts/extract/irs_990_pdf.py | 699 |
1 file changed, 699 insertions, 0 deletions
diff --git a/scripts/extract/irs_990_pdf.py b/scripts/extract/irs_990_pdf.py new file mode 100644 index 0000000..1d1209c --- /dev/null +++ b/scripts/extract/irs_990_pdf.py @@ -0,0 +1,699 @@ +""" +Source-agnostic 990-PF PDF grant extractor. + +Takes a path to a local PDF and returns structured grant data. Side-effect +free with respect to any database; only external call is to Anthropic's API. + +A separate loader consumes ExtractionResult and writes to raw.* tables. + +Usage (CLI): + python -m scripts.extract.irs_990_pdf path/to/file.pdf + python -m scripts.extract.irs_990_pdf path/to/file.pdf --json + python -m scripts.extract.irs_990_pdf path/to/file.pdf --tax-year 2021 + python -m scripts.extract.irs_990_pdf path/to/file.pdf --source-label ny_ag + +Usage (programmatic): + from scripts.extract.irs_990_pdf import extract_from_pdf + result = extract_from_pdf("data/tmp/pdf_test/marley_2017.pdf", tax_year=2017) + for grant in result.grants: + print(grant.recipient_name, grant.amount) +""" + +from __future__ import annotations + +import argparse +import base64 +import json +import re +import subprocess +import sys +from dataclasses import dataclass, field, asdict +from decimal import Decimal, InvalidOperation +from pathlib import Path +from typing import Any + +import anthropic +import fitz # pymupdf +import pdfplumber + +from scripts.common.normalize import is_placeholder, parse_numeric + + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + +MODEL = "claude-haiku-4-5-20251001" + +# Text-path quality heuristic. If the number of grants extracted is less than +# num_grant_pages * MIN_GRANTS_PER_PAGE, we flag the result as low-yield and +# fall back to vision. This is an initial heuristic and may be tuned. +MIN_GRANTS_PER_PAGE = 1 + +# Minimum total text characters across all pages before we trust the text layer. 
+MIN_TEXT_LAYER_CHARS = 200 + +# Standard 990-PF form has 13 pages; grant attachments come after. +ATTACHMENT_START_PAGE_IDX = 13 # 0-based; corresponds to page 14 + +# Vision scan stops after this many consecutive non-grant pages (once any +# grant page has been found). +VISION_CONSECUTIVE_NO_LIMIT = 3 + +# DPIs used for vision rendering. +TRIAGE_DPI = 100 +EXTRACTION_DPI = 150 + +GRANT_PAGE_KEYWORDS = [ + r"GRANTS AND CONTRIBUTIONS PAID", + r"SUPPLEMENTARY INFORMATION", + r"PART XIV", + r"PART XV", + r"SCHEDULE OF.*GRANT", + r"GRANTS PAID", + r"CONTRIBUTIONS PAID", +] +_GRANT_PAGE_RE = re.compile("|".join(GRANT_PAGE_KEYWORDS)) + +# Loose marker for "this PDF is a 990-PF". NY AG PDFs may contain only the +# CHAR500 state cover form with no federal return attached — ~46% of that +# corpus, based on sampling. Those should short-circuit before we waste any +# Haiku calls. +_IS_990PF_RE = re.compile(r"FORM\s*990-?PF") + +TEXT_EXTRACTION_PROMPT = """Extract ALL individual grants from this 990-PF page text. + +Return a JSON array. 
Each grant should have: +- recipient_name: organization name (if grant is to an org) +- recipient_person_name: person's name (if grant is to an individual) +- address_line1: street address (if present) +- address_line2: second address line (if present) +- city: city (if present) +- state: state abbreviation (if present) +- zip: zip code (if present) +- country: country (if present, only when non-US) +- foreign_postal_code: foreign postal code (if present) +- amount: dollar amount as string (digits only, no $ or commas) +- purpose: purpose of grant (if present) +- foundation_status: recipient status like PC, NC, PF (if present) +- relationship: relationship of recipient to foundation (if present) + +IMPORTANT RULES: +- Do NOT include total/subtotal rows +- Do NOT include header rows or column labels +- If there are multiple year columns, extract ONLY the most recent year +- If recipient name is "SEE ATTACHED", "VARIOUS", "N/A", "NONE", or a similar placeholder, skip it +- Return [] if no individual grants are found + +Return ONLY the JSON array, no other text.""" + +VISION_EXTRACTION_PROMPT = """Extract ALL individual grants from this 990-PF page image. + +Return a JSON array. 
Each grant should have: +- recipient_name: organization name (if grant is to an org) +- recipient_person_name: person's name (if grant is to an individual) +- address_line1: street address (if visible) +- address_line2: second address line (if visible) +- city: city +- state: state abbreviation +- zip: zip code (if visible) +- country: country (only if non-US) +- foreign_postal_code: foreign postal code (if present) +- amount: dollar amount as string (digits only, no $ or commas) +- purpose: purpose of grant +- foundation_status: recipient status like PC, NC, PF (if present) +- relationship: relationship of recipient to foundation (if present) + +IMPORTANT RULES: +- Do NOT include total/subtotal rows +- Do NOT include header rows +- If there are multiple year columns, extract ONLY the most recent year +- If recipient name is "SEE ATTACHED", "VARIOUS", "N/A", "NONE", or a similar placeholder, skip it +- Return [] if no individual grants are found + +Return ONLY the JSON array, no other text.""" + +TRIAGE_PROMPT = ( + "Is this page a table of grant or contribution recipients listing " + "individual organization names with addresses and dollar amounts? " + "Answer ONLY yes or no." 
+) + + +# --------------------------------------------------------------------------- +# Anthropic client (lazy singleton so importing this module is cheap) +# --------------------------------------------------------------------------- + +_client: anthropic.Anthropic | None = None + + +def _get_client() -> anthropic.Anthropic: + global _client + if _client is None: + api_key = subprocess.run( + ["pass", "show", "anthropic.com/api.anthropic.com/apikey"], + capture_output=True, text=True, + ).stdout.strip() + _client = anthropic.Anthropic(api_key=api_key) + return _client + + +# --------------------------------------------------------------------------- +# Dataclasses +# --------------------------------------------------------------------------- + +@dataclass +class ExtractedGrant: + line_number: int + recipient_name: str | None = None + recipient_name2: str | None = None + recipient_person_name: str | None = None + address_line1: str | None = None + address_line2: str | None = None + city: str | None = None + state: str | None = None + zip: str | None = None + country: str | None = None + foreign_postal_code: str | None = None + amount_raw: str | None = None + amount: Decimal | None = None + purpose: str | None = None + foundation_status: str | None = None + relationship: str | None = None + + +@dataclass +class ExtractionResult: + success: bool + grants: list[ExtractedGrant] + # 'supplemented' — ≥1 grant extracted + # 'no_grants' — extractor ran end-to-end, found nothing usable + # 'not_a_990pf' — PDF is readable but isn't a 990-PF (e.g. 
CHAR500 cover) + # None — catastrophic failure (success=False) + grant_detail_status: str | None + # 'pdfplumber+haiku_text' — text path was used + # 'haiku_vision_attempted' — vision path was used (success not implied) + # 'skipped_not_990pf' — short-circuited; no API calls made + # 'failed' — catastrophic failure (success=False) + method: str + diagnostics: dict = field(default_factory=dict) + + @property + def total_amount(self) -> Decimal: + return sum((g.amount or Decimal(0)) for g in self.grants) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _parse_haiku_response(text: str) -> list[dict]: + """Parse JSON from Haiku response, handling markdown code blocks.""" + text = text.strip() + if text.startswith("```"): + # Drop opening fence + optional language tag + parts = text.split("\n", 1) + if len(parts) == 2: + text = parts[1] + text = text.rsplit("```", 1)[0] + text = text.strip() + try: + parsed = json.loads(text) + except json.JSONDecodeError: + return [] + if not isinstance(parsed, list): + return [] + return [x for x in parsed if isinstance(x, dict)] + + +def _is_grant_page_text(text: str) -> bool: + return bool(_GRANT_PAGE_RE.search(text.upper())) + + +def _clean_str(value: Any) -> str | None: + if value is None: + return None + s = str(value).strip() + return s or None + + +def _normalize_grant(raw: dict) -> ExtractedGrant | None: + """Map a loose Haiku JSON dict to an ExtractedGrant. + + Returns None if the row looks like a placeholder/header artifact. + line_number is set to 0 here; it's reassigned later after dedupe. + """ + recipient_name = _clean_str(raw.get("recipient_name")) + recipient_person_name = _clean_str(raw.get("recipient_person_name")) + + # Safety net: prompt already filters these, but double-check. 
+ if recipient_name and is_placeholder(recipient_name): + recipient_name = None + if recipient_person_name and is_placeholder(recipient_person_name): + recipient_person_name = None + + amount_raw = _clean_str(raw.get("amount")) + amount_numeric: Decimal | None = None + if amount_raw is not None: + parsed = parse_numeric(amount_raw) + if parsed is not None: + try: + amount_numeric = Decimal(parsed) + except InvalidOperation: + amount_numeric = None + + return ExtractedGrant( + line_number=0, + recipient_name=recipient_name, + recipient_name2=_clean_str(raw.get("recipient_name2")), + recipient_person_name=recipient_person_name, + address_line1=_clean_str(raw.get("address_line1")), + address_line2=_clean_str(raw.get("address_line2")), + city=_clean_str(raw.get("city")), + state=_clean_str(raw.get("state")), + zip=_clean_str(raw.get("zip")), + country=_clean_str(raw.get("country")), + foreign_postal_code=_clean_str(raw.get("foreign_postal_code")), + amount_raw=amount_raw, + amount=amount_numeric, + purpose=_clean_str(raw.get("purpose")), + foundation_status=_clean_str(raw.get("foundation_status")), + relationship=_clean_str(raw.get("relationship")), + ) + + +def _year_hint(tax_year: int | None) -> str: + if tax_year is None: + return "" + return f"\n\nThis filing is for tax year {tax_year}." + + +def _postprocess(grants: list[ExtractedGrant]) -> list[ExtractedGrant]: + """Drop placeholders / amount-less rows, dedupe, and reassign line_number.""" + out: list[ExtractedGrant] = [] + seen: set[tuple] = set() + + for g in grants: + # Drop rows with no recipient at all. + if not g.recipient_name and not g.recipient_person_name: + continue + # Drop placeholder recipients that slipped through (belt-and-suspenders). + if g.recipient_name and is_placeholder(g.recipient_name): + continue + if g.recipient_person_name and is_placeholder(g.recipient_person_name): + continue + # Drop rows with no amount text at all (Haiku didn't see an amount column). 
+ # Keep rows where amount_raw == '0' and amount == Decimal(0). + if g.amount is None and not g.amount_raw: + continue + + key = ( + (g.recipient_name or "").upper(), + (g.city or "").upper(), + (g.state or "").upper(), + g.amount_raw or "", + ) + if key in seen: + continue + seen.add(key) + out.append(g) + + for i, g in enumerate(out, start=1): + g.line_number = i + return out + + +# --------------------------------------------------------------------------- +# Text path +# --------------------------------------------------------------------------- + +def _extract_text_layer( + pdf_path: Path, + tax_year: int | None, + diagnostics: dict, +) -> tuple[list[ExtractedGrant], str, int]: + """Extract grants via pdfplumber text + Haiku text parsing. + + Returns (grants, status, num_grant_pages). + status ∈ {'ok', 'not_a_990pf', 'no_text_layer', 'no_grant_pages', + 'haiku_empty', 'low_yield', 'error'} + + 'not_a_990pf' means the PDF has a readable text layer but doesn't look + like a 990-PF at all (e.g. a standalone NY State CHAR500 cover form). + The caller should short-circuit on this — there's nothing for the vision + path to find either. + """ + try: + pdf = pdfplumber.open(pdf_path) + except Exception as exc: + diagnostics["text_path_error"] = str(exc) + return [], "error", 0 + + try: + page_texts: list[tuple[int, str]] = [] + total_chars = 0 + for idx, page in enumerate(pdf.pages, start=1): + text = page.extract_text() or "" + page_texts.append((idx, text)) + total_chars += len(text) + + diagnostics["text_layer_chars"] = total_chars + diagnostics["pages_total"] = len(page_texts) + + if total_chars < MIN_TEXT_LAYER_CHARS: + return [], "no_text_layer", 0 + + # Must look like a 990-PF at all. If not, don't bother with Haiku — + # and don't fall through to vision, since it'd just burn triage calls + # scanning a non-990-PF document end to end. 
+ joined_upper = "\n".join(t for _, t in page_texts).upper() + if not _IS_990PF_RE.search(joined_upper): + return [], "not_a_990pf", 0 + + grant_pages = [ + (num, text) for num, text in page_texts + if len(text) > 100 and _is_grant_page_text(text) + ] + diagnostics["grant_pages_identified"] = len(grant_pages) + + if not grant_pages: + return [], "no_grant_pages", 0 + finally: + pdf.close() + + client = _get_client() + year_hint = _year_hint(tax_year) + + raw_grants: list[dict] = [] + try: + for page_num, text in grant_pages: + resp = client.messages.create( + model=MODEL, + max_tokens=4096, + messages=[{ + "role": "user", + "content": f"{TEXT_EXTRACTION_PROMPT}{year_hint}\n\nPage text:\n{text}", + }], + ) + raw_grants.extend(_parse_haiku_response(resp.content[0].text)) + except anthropic.APIError as exc: + # Discard partial results — fall back to vision cleanly. + diagnostics["text_path_error"] = f"{type(exc).__name__}: {exc}" + return [], "error", len(grant_pages) + + if not raw_grants: + return [], "haiku_empty", len(grant_pages) + + grants: list[ExtractedGrant] = [] + for raw in raw_grants: + g = _normalize_grant(raw) + if g is not None: + grants.append(g) + + if len(grants) < len(grant_pages) * MIN_GRANTS_PER_PAGE: + return grants, "low_yield", len(grant_pages) + + return grants, "ok", len(grant_pages) + + +# --------------------------------------------------------------------------- +# Vision path +# --------------------------------------------------------------------------- + +def _render_page_b64(page, dpi: int) -> str: + pix = page.get_pixmap(dpi=dpi) + return base64.standard_b64encode(pix.tobytes("png")).decode("utf-8") + + +def _extract_vision( + pdf_path: Path, + tax_year: int | None, + diagnostics: dict, +) -> tuple[list[ExtractedGrant], int, int]: + """Extract grants via Haiku vision over rendered page images. + + Returns (grants, pages_scanned, pages_extracted). Raises anthropic.APIError + on catastrophic API failure (caller handles). 
+ """ + client = _get_client() + year_hint = _year_hint(tax_year) + + doc = fitz.open(pdf_path) + try: + total_pages = len(doc) + diagnostics.setdefault("pages_total", total_pages) + + if total_pages <= ATTACHMENT_START_PAGE_IDX: + diagnostics["vision_pages_scanned"] = 0 + diagnostics["vision_pages_extracted"] = 0 + return [], 0, 0 + + # Phase 1: triage pages to find grant tables. + grant_page_indices: list[int] = [] + consecutive_no = 0 + found_any = False + pages_scanned = 0 + + for i in range(ATTACHMENT_START_PAGE_IDX, total_pages): + pages_scanned += 1 + b64 = _render_page_b64(doc[i], dpi=TRIAGE_DPI) + resp = client.messages.create( + model=MODEL, + max_tokens=10, + messages=[{ + "role": "user", + "content": [ + {"type": "image", + "source": {"type": "base64", "media_type": "image/png", "data": b64}}, + {"type": "text", "text": TRIAGE_PROMPT}, + ], + }], + ) + is_grant = "yes" in resp.content[0].text.lower() + if is_grant: + grant_page_indices.append(i) + consecutive_no = 0 + found_any = True + else: + consecutive_no += 1 + if found_any and consecutive_no >= VISION_CONSECUTIVE_NO_LIMIT: + break + + diagnostics["vision_pages_scanned"] = pages_scanned + + if not grant_page_indices: + diagnostics["vision_pages_extracted"] = 0 + return [], pages_scanned, 0 + + # Phase 2: extract from identified grant pages. 
+ raw_grants: list[dict] = [] + pages_extracted = 0 + for i in grant_page_indices: + b64 = _render_page_b64(doc[i], dpi=EXTRACTION_DPI) + resp = client.messages.create( + model=MODEL, + max_tokens=4096, + messages=[{ + "role": "user", + "content": [ + {"type": "image", + "source": {"type": "base64", "media_type": "image/png", "data": b64}}, + {"type": "text", "text": f"{VISION_EXTRACTION_PROMPT}{year_hint}"}, + ], + }], + ) + page_grants = _parse_haiku_response(resp.content[0].text) + if page_grants: + pages_extracted += 1 + raw_grants.extend(page_grants) + + diagnostics["vision_pages_extracted"] = pages_extracted + finally: + doc.close() + + grants: list[ExtractedGrant] = [] + for raw in raw_grants: + g = _normalize_grant(raw) + if g is not None: + grants.append(g) + + return grants, pages_scanned, pages_extracted + + +# --------------------------------------------------------------------------- +# Top-level extractor +# --------------------------------------------------------------------------- + +def extract_from_pdf( + pdf_path: str | Path, + tax_year: int | None = None, + source_label: str | None = None, +) -> ExtractionResult: + """Extract grants from a single 990-PF PDF. + + Stateless: reads only the file at pdf_path and calls Anthropic's API. + Writes nothing. See module docstring for full contract. + """ + pdf_path = Path(pdf_path) + + diagnostics: dict = { + "tax_year_hint": tax_year, + "source_label": source_label, + } + + if not pdf_path.exists(): + diagnostics["error"] = f"PDF not found: {pdf_path}" + return ExtractionResult( + success=False, + grants=[], + grant_detail_status=None, + method="failed", + diagnostics=diagnostics, + ) + + # Text path + try: + text_grants, text_status, num_grant_pages = _extract_text_layer( + pdf_path, tax_year, diagnostics + ) + except Exception as exc: + # pdfplumber can blow up on malformed PDFs — don't let that kill us, + # fall through to the vision path like any other text failure. 
+ text_grants = [] + text_status = "error" + num_grant_pages = 0 + diagnostics.setdefault("text_path_error", f"{type(exc).__name__}: {exc}") + + diagnostics["text_path_status"] = text_status + + if text_status == "ok": + grants = _postprocess(text_grants) + return ExtractionResult( + success=True, + grants=grants, + grant_detail_status="supplemented" if grants else "no_grants", + method="pdfplumber+haiku_text", + diagnostics=diagnostics, + ) + + # Short-circuit: PDF has a readable text layer but isn't a 990-PF + # (e.g. a standalone NY State CHAR500 cover form). Don't run vision — + # there's nothing in the document for it to find. + if text_status == "not_a_990pf": + return ExtractionResult( + success=True, + grants=[], + grant_detail_status="not_a_990pf", + method="skipped_not_990pf", + diagnostics=diagnostics, + ) + + # Vision fallback — replaces text output entirely. + try: + vision_grants, _scanned, _extracted = _extract_vision( + pdf_path, tax_year, diagnostics + ) + except anthropic.APIError as exc: + diagnostics["error"] = f"{type(exc).__name__}: {exc}" + return ExtractionResult( + success=False, + grants=[], + grant_detail_status=None, + method="failed", + diagnostics=diagnostics, + ) + except Exception as exc: + diagnostics["error"] = f"{type(exc).__name__}: {exc}" + return ExtractionResult( + success=False, + grants=[], + grant_detail_status=None, + method="failed", + diagnostics=diagnostics, + ) + + grants = _postprocess(vision_grants) + return ExtractionResult( + success=True, + grants=grants, + grant_detail_status="supplemented" if grants else "no_grants", + method="haiku_vision_attempted", + diagnostics=diagnostics, + ) + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + +def _result_to_jsonable(result: ExtractionResult) -> dict: + def grant_to_dict(g: ExtractedGrant) -> dict: + d = asdict(g) + if d["amount"] is not None: + 
d["amount"] = str(d["amount"]) + return d + + return { + "success": result.success, + "method": result.method, + "grant_detail_status": result.grant_detail_status, + "diagnostics": result.diagnostics, + "grants": [grant_to_dict(g) for g in result.grants], + "total_amount": str(result.total_amount), + } + + +def _print_table(result: ExtractionResult) -> None: + print(f"success: {result.success}") + print(f"method: {result.method}") + print(f"grant_detail_status: {result.grant_detail_status}") + print(f"grants: {len(result.grants)}") + print(f"total_amount: ${result.total_amount:,}") + print("diagnostics:") + for k, v in result.diagnostics.items(): + print(f" {k}: {v}") + if not result.grants: + return + print() + print(f"{'#':>4} {'recipient':<45} {'city':<20} {'st':<3} {'amount':>12}") + print("-" * 90) + for g in result.grants[:50]: + name = (g.recipient_name or g.recipient_person_name or "")[:45] + city = (g.city or "")[:20] + state = (g.state or "")[:3] + amt = f"${g.amount:,.0f}" if g.amount is not None else (g.amount_raw or "") + print(f"{g.line_number:>4} {name:<45} {city:<20} {state:<3} {amt:>12}") + if len(result.grants) > 50: + print(f"... and {len(result.grants) - 50} more") + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser( + prog="scripts.extract.irs_990_pdf", + description="Extract grant data from a 990-PF PDF.", + ) + parser.add_argument("pdf_path", help="Path to a local PDF file.") + parser.add_argument("--tax-year", type=int, default=None, + help="Tax year hint passed to the extraction prompts.") + parser.add_argument("--source-label", default=None, + help="Diagnostic label for the PDF source (e.g. 
'ny_ag').") + parser.add_argument("--json", action="store_true", + help="Emit the full result as JSON instead of a table.") + args = parser.parse_args(argv) + + result = extract_from_pdf( + args.pdf_path, + tax_year=args.tax_year, + source_label=args.source_label, + ) + + if args.json: + print(json.dumps(_result_to_jsonable(result), indent=2)) + else: + _print_table(result) + + return 0 if result.success else 1 + + +if __name__ == "__main__": + sys.exit(main()) |
