aboutsummaryrefslogtreecommitdiff
path: root/scripts/extract/irs_990_pdf.py
diff options
context:
space:
mode:
authorbenj <benj@rse8.com>2026-04-10 11:13:57 +0800
committerbenj <benj@rse8.com>2026-04-10 11:13:57 +0800
commit6605e2cc428e3bdaa174ccc432941eab8c5d61cb (patch)
tree52f9d176c2ce1a80adb2ea2ac31cd12d3a29c0db /scripts/extract/irs_990_pdf.py
parent493746b14c1251a45b061d2e3edd9160c929d2b9 (diff)
downloadtidyindex-6605e2cc428e3bdaa174ccc432941eab8c5d61cb.tar
tidyindex-6605e2cc428e3bdaa174ccc432941eab8c5d61cb.tar.gz
tidyindex-6605e2cc428e3bdaa174ccc432941eab8c5d61cb.tar.bz2
tidyindex-6605e2cc428e3bdaa174ccc432941eab8c5d61cb.tar.lz
tidyindex-6605e2cc428e3bdaa174ccc432941eab8c5d61cb.tar.xz
tidyindex-6605e2cc428e3bdaa174ccc432941eab8c5d61cb.tar.zst
tidyindex-6605e2cc428e3bdaa174ccc432941eab8c5d61cb.zip
ensure parsers do not parse and store raw XML fields
Diffstat (limited to 'scripts/extract/irs_990_pdf.py')
-rw-r--r--scripts/extract/irs_990_pdf.py699
1 files changed, 699 insertions, 0 deletions
diff --git a/scripts/extract/irs_990_pdf.py b/scripts/extract/irs_990_pdf.py
new file mode 100644
index 0000000..1d1209c
--- /dev/null
+++ b/scripts/extract/irs_990_pdf.py
@@ -0,0 +1,699 @@
+"""
+Source-agnostic 990-PF PDF grant extractor.
+
+Takes a path to a local PDF and returns structured grant data. Side-effect
+free with respect to any database; only external call is to Anthropic's API.
+
+A separate loader consumes ExtractionResult and writes to raw.* tables.
+
+Usage (CLI):
+ python -m scripts.extract.irs_990_pdf path/to/file.pdf
+ python -m scripts.extract.irs_990_pdf path/to/file.pdf --json
+ python -m scripts.extract.irs_990_pdf path/to/file.pdf --tax-year 2021
+ python -m scripts.extract.irs_990_pdf path/to/file.pdf --source-label ny_ag
+
+Usage (programmatic):
+ from scripts.extract.irs_990_pdf import extract_from_pdf
+ result = extract_from_pdf("data/tmp/pdf_test/marley_2017.pdf", tax_year=2017)
+ for grant in result.grants:
+ print(grant.recipient_name, grant.amount)
+"""
+
+from __future__ import annotations
+
+import argparse
+import base64
+import json
+import re
+import subprocess
+import sys
+from dataclasses import dataclass, field, asdict
+from decimal import Decimal, InvalidOperation
+from pathlib import Path
+from typing import Any
+
+import anthropic
+import fitz # pymupdf
+import pdfplumber
+
+from scripts.common.normalize import is_placeholder, parse_numeric
+
+
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+
# Haiku model used for every call in this module (text parsing, triage, vision).
MODEL = "claude-haiku-4-5-20251001"

# Text-path quality heuristic. If the number of grants extracted is less than
# num_grant_pages * MIN_GRANTS_PER_PAGE, we flag the result as low-yield and
# fall back to vision. This is an initial heuristic and may be tuned.
MIN_GRANTS_PER_PAGE = 1

# Minimum total text characters across all pages before we trust the text layer.
MIN_TEXT_LAYER_CHARS = 200

# Standard 990-PF form has 13 pages; grant attachments come after.
ATTACHMENT_START_PAGE_IDX = 13  # 0-based; corresponds to page 14

# Vision scan stops after this many consecutive non-grant pages (once any
# grant page has been found).
VISION_CONSECUTIVE_NO_LIMIT = 3

# DPIs used for vision rendering. Triage only needs a yes/no, so it renders
# coarser (and cheaper) than extraction.
TRIAGE_DPI = 100
EXTRACTION_DPI = 150

# Heuristics for "this page is a grant attachment table". Matched against
# upper-cased page text (see _is_grant_page_text), so patterns are upper-case.
GRANT_PAGE_KEYWORDS = [
    r"GRANTS AND CONTRIBUTIONS PAID",
    r"SUPPLEMENTARY INFORMATION",
    r"PART XIV",
    r"PART XV",
    r"SCHEDULE OF.*GRANT",
    r"GRANTS PAID",
    r"CONTRIBUTIONS PAID",
]
# Compiled once as a single alternation; reused for every page.
_GRANT_PAGE_RE = re.compile("|".join(GRANT_PAGE_KEYWORDS))

# Loose marker for "this PDF is a 990-PF". NY AG PDFs may contain only the
# CHAR500 state cover form with no federal return attached — ~46% of that
# corpus, based on sampling. Those should short-circuit before we waste any
# Haiku calls.
_IS_990PF_RE = re.compile(r"FORM\s*990-?PF")
+
# Prompt for the text path; the raw pdfplumber page text is appended after it.
TEXT_EXTRACTION_PROMPT = """Extract ALL individual grants from this 990-PF page text.

Return a JSON array. Each grant should have:
- recipient_name: organization name (if grant is to an org)
- recipient_person_name: person's name (if grant is to an individual)
- address_line1: street address (if present)
- address_line2: second address line (if present)
- city: city (if present)
- state: state abbreviation (if present)
- zip: zip code (if present)
- country: country (if present, only when non-US)
- foreign_postal_code: foreign postal code (if present)
- amount: dollar amount as string (digits only, no $ or commas)
- purpose: purpose of grant (if present)
- foundation_status: recipient status like PC, NC, PF (if present)
- relationship: relationship of recipient to foundation (if present)

IMPORTANT RULES:
- Do NOT include total/subtotal rows
- Do NOT include header rows or column labels
- If there are multiple year columns, extract ONLY the most recent year
- If recipient name is "SEE ATTACHED", "VARIOUS", "N/A", "NONE", or a similar placeholder, skip it
- Return [] if no individual grants are found

Return ONLY the JSON array, no other text."""

# Prompt for the vision path; sent alongside a rendered page image.
# Mirrors TEXT_EXTRACTION_PROMPT with wording adjusted for images.
VISION_EXTRACTION_PROMPT = """Extract ALL individual grants from this 990-PF page image.

Return a JSON array. Each grant should have:
- recipient_name: organization name (if grant is to an org)
- recipient_person_name: person's name (if grant is to an individual)
- address_line1: street address (if visible)
- address_line2: second address line (if visible)
- city: city
- state: state abbreviation
- zip: zip code (if visible)
- country: country (only if non-US)
- foreign_postal_code: foreign postal code (if present)
- amount: dollar amount as string (digits only, no $ or commas)
- purpose: purpose of grant
- foundation_status: recipient status like PC, NC, PF (if present)
- relationship: relationship of recipient to foundation (if present)

IMPORTANT RULES:
- Do NOT include total/subtotal rows
- Do NOT include header rows
- If there are multiple year columns, extract ONLY the most recent year
- If recipient name is "SEE ATTACHED", "VARIOUS", "N/A", "NONE", or a similar placeholder, skip it
- Return [] if no individual grants are found

Return ONLY the JSON array, no other text."""

# Cheap yes/no classifier prompt for the vision triage pass (max_tokens=10).
TRIAGE_PROMPT = (
    "Is this page a table of grant or contribution recipients listing "
    "individual organization names with addresses and dollar amounts? "
    "Answer ONLY yes or no."
)
+
+
+# ---------------------------------------------------------------------------
+# Anthropic client (lazy singleton so importing this module is cheap)
+# ---------------------------------------------------------------------------
+
_client: anthropic.Anthropic | None = None


def _get_client() -> anthropic.Anthropic:
    """Return a process-wide Anthropic client, creating it on first use.

    The API key is read from the ``pass`` password store. Raises
    RuntimeError when the key cannot be retrieved, so a misconfigured
    environment fails loudly at the first API use instead of surfacing
    later as a confusing authentication error from the API itself.
    """
    global _client
    if _client is None:
        proc = subprocess.run(
            ["pass", "show", "anthropic.com/api.anthropic.com/apikey"],
            capture_output=True, text=True,
        )
        api_key = proc.stdout.strip()
        # Previously an empty key from a failed `pass` call was passed
        # straight to the SDK; check explicitly and fail with context.
        if proc.returncode != 0 or not api_key:
            detail = proc.stderr.strip() or "empty output"
            raise RuntimeError(
                f"Could not read Anthropic API key from pass: {detail}"
            )
        _client = anthropic.Anthropic(api_key=api_key)
    return _client
+
+
+# ---------------------------------------------------------------------------
+# Dataclasses
+# ---------------------------------------------------------------------------
+
@dataclass
class ExtractedGrant:
    """One grant row extracted from a 990-PF attachment page.

    All fields except line_number are optional: the model returns only
    what the page actually shows. amount_raw preserves the model's string
    form; amount is its Decimal parse (None when unparseable).
    """
    # 1-based position in final output; 0 until _postprocess reassigns it.
    line_number: int
    # Organization recipient (grants to orgs).
    recipient_name: str | None = None
    # Second/overflow organization name line, when present.
    recipient_name2: str | None = None
    # Individual recipient (grants to persons).
    recipient_person_name: str | None = None
    address_line1: str | None = None
    address_line2: str | None = None
    city: str | None = None
    state: str | None = None
    zip: str | None = None
    # Per the prompts, populated only for non-US recipients.
    country: str | None = None
    foreign_postal_code: str | None = None
    # Amount exactly as the model returned it (digits-only string expected).
    amount_raw: str | None = None
    # Decimal parse of amount_raw; None if parsing failed.
    amount: Decimal | None = None
    purpose: str | None = None
    # Recipient foundation status code, e.g. PC, NC, PF.
    foundation_status: str | None = None
    # Relationship of recipient to the foundation, if disclosed.
    relationship: str | None = None
+
+
@dataclass
class ExtractionResult:
    """Outcome of a single-PDF extraction run.

    grants is the postprocessed (deduped, renumbered) list; diagnostics
    carries free-form counters and error strings for observability.
    """
    success: bool
    grants: list[ExtractedGrant]
    # 'supplemented' — ≥1 grant extracted
    # 'no_grants' — extractor ran end-to-end, found nothing usable
    # 'not_a_990pf' — PDF is readable but isn't a 990-PF (e.g. CHAR500 cover)
    # None — catastrophic failure (success=False)
    grant_detail_status: str | None
    # 'pdfplumber+haiku_text' — text path was used
    # 'haiku_vision_attempted' — vision path was used (success not implied)
    # 'skipped_not_990pf' — short-circuited; no API calls made
    # 'failed' — catastrophic failure (success=False)
    method: str
    diagnostics: dict = field(default_factory=dict)

    @property
    def total_amount(self) -> Decimal:
        """Sum of parsed grant amounts, always a Decimal.

        Uses an explicit Decimal start value: sum()'s default start is
        int 0, which previously leaked out as the return value whenever
        grants was empty, violating the annotated return type.
        """
        return sum(((g.amount or Decimal(0)) for g in self.grants), Decimal(0))
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _parse_haiku_response(text: str) -> list[dict]:
+ """Parse JSON from Haiku response, handling markdown code blocks."""
+ text = text.strip()
+ if text.startswith("```"):
+ # Drop opening fence + optional language tag
+ parts = text.split("\n", 1)
+ if len(parts) == 2:
+ text = parts[1]
+ text = text.rsplit("```", 1)[0]
+ text = text.strip()
+ try:
+ parsed = json.loads(text)
+ except json.JSONDecodeError:
+ return []
+ if not isinstance(parsed, list):
+ return []
+ return [x for x in parsed if isinstance(x, dict)]
+
+
def _is_grant_page_text(text: str) -> bool:
    """True when the upper-cased page text matches any grant-table keyword."""
    return _GRANT_PAGE_RE.search(text.upper()) is not None
+
+
+def _clean_str(value: Any) -> str | None:
+ if value is None:
+ return None
+ s = str(value).strip()
+ return s or None
+
+
def _normalize_grant(raw: dict) -> ExtractedGrant | None:
    """Map a loose Haiku JSON dict to an ExtractedGrant.

    Placeholder recipient names (per is_placeholder) are nulled out rather
    than causing the row to be rejected; rows left with no usable recipient
    or amount are dropped later by _postprocess. As written this always
    returns a grant — callers still check for None defensively in case a
    future version rejects rows outright.

    line_number is set to 0 here; it's reassigned after dedupe in
    _postprocess.
    """
    recipient_name = _clean_str(raw.get("recipient_name"))
    recipient_person_name = _clean_str(raw.get("recipient_person_name"))

    # Safety net: prompt already filters these, but double-check.
    if recipient_name and is_placeholder(recipient_name):
        recipient_name = None
    if recipient_person_name and is_placeholder(recipient_person_name):
        recipient_person_name = None

    # Keep the model's raw amount string alongside the Decimal parse so a
    # failed parse is still visible downstream.
    amount_raw = _clean_str(raw.get("amount"))
    amount_numeric: Decimal | None = None
    if amount_raw is not None:
        parsed = parse_numeric(amount_raw)
        if parsed is not None:
            try:
                amount_numeric = Decimal(parsed)
            except InvalidOperation:
                amount_numeric = None

    return ExtractedGrant(
        line_number=0,
        recipient_name=recipient_name,
        recipient_name2=_clean_str(raw.get("recipient_name2")),
        recipient_person_name=recipient_person_name,
        address_line1=_clean_str(raw.get("address_line1")),
        address_line2=_clean_str(raw.get("address_line2")),
        city=_clean_str(raw.get("city")),
        state=_clean_str(raw.get("state")),
        zip=_clean_str(raw.get("zip")),
        country=_clean_str(raw.get("country")),
        foreign_postal_code=_clean_str(raw.get("foreign_postal_code")),
        amount_raw=amount_raw,
        amount=amount_numeric,
        purpose=_clean_str(raw.get("purpose")),
        foundation_status=_clean_str(raw.get("foundation_status")),
        relationship=_clean_str(raw.get("relationship")),
    )
+
+
+def _year_hint(tax_year: int | None) -> str:
+ if tax_year is None:
+ return ""
+ return f"\n\nThis filing is for tax year {tax_year}."
+
+
def _postprocess(grants: list[ExtractedGrant]) -> list[ExtractedGrant]:
    """Drop placeholders / amount-less rows, dedupe, and reassign line_number.

    The dedupe key covers BOTH recipient name fields (org and person) plus
    city, state, and the raw amount string. Previously the person name was
    omitted, so distinct individual recipients sharing a location and
    amount were incorrectly collapsed into one row.
    """
    out: list[ExtractedGrant] = []
    seen: set[tuple] = set()

    for g in grants:
        # Drop rows with no recipient at all.
        if not g.recipient_name and not g.recipient_person_name:
            continue
        # Drop placeholder recipients that slipped through (belt-and-suspenders).
        if g.recipient_name and is_placeholder(g.recipient_name):
            continue
        if g.recipient_person_name and is_placeholder(g.recipient_person_name):
            continue
        # Drop rows with no amount text at all (Haiku didn't see an amount column).
        # Keep rows where amount_raw == '0' and amount == Decimal(0).
        if g.amount is None and not g.amount_raw:
            continue

        key = (
            (g.recipient_name or "").upper(),
            (g.recipient_person_name or "").upper(),
            (g.city or "").upper(),
            (g.state or "").upper(),
            g.amount_raw or "",
        )
        if key in seen:
            continue
        seen.add(key)
        out.append(g)

    # 1-based line numbers reflect final order after filtering and dedupe.
    for i, g in enumerate(out, start=1):
        g.line_number = i
    return out
+
+
+# ---------------------------------------------------------------------------
+# Text path
+# ---------------------------------------------------------------------------
+
def _extract_text_layer(
    pdf_path: Path,
    tax_year: int | None,
    diagnostics: dict,
) -> tuple[list[ExtractedGrant], str, int]:
    """Extract grants via pdfplumber text + Haiku text parsing.

    Args:
        pdf_path: local PDF to read.
        tax_year: optional year hint appended to the extraction prompt.
        diagnostics: mutated in place with counters / error strings.

    Returns (grants, status, num_grant_pages).
    status ∈ {'ok', 'not_a_990pf', 'no_text_layer', 'no_grant_pages',
              'haiku_empty', 'low_yield', 'error'}

    'not_a_990pf' means the PDF has a readable text layer but doesn't look
    like a 990-PF at all (e.g. a standalone NY State CHAR500 cover form).
    The caller should short-circuit on this — there's nothing for the vision
    path to find either.
    """
    try:
        pdf = pdfplumber.open(pdf_path)
    except Exception as exc:
        diagnostics["text_path_error"] = str(exc)
        return [], "error", 0

    try:
        page_texts: list[tuple[int, str]] = []
        total_chars = 0
        for idx, page in enumerate(pdf.pages, start=1):
            text = page.extract_text() or ""
            page_texts.append((idx, text))
            total_chars += len(text)

        diagnostics["text_layer_chars"] = total_chars
        diagnostics["pages_total"] = len(page_texts)

        # Little/no text usually means a scanned-image PDF: bail to vision.
        if total_chars < MIN_TEXT_LAYER_CHARS:
            return [], "no_text_layer", 0

        # Must look like a 990-PF at all. If not, don't bother with Haiku —
        # and don't fall through to vision, since it'd just burn triage calls
        # scanning a non-990-PF document end to end.
        joined_upper = "\n".join(t for _, t in page_texts).upper()
        if not _IS_990PF_RE.search(joined_upper):
            return [], "not_a_990pf", 0

        # len > 100 filters near-empty pages that happen to match a keyword.
        grant_pages = [
            (num, text) for num, text in page_texts
            if len(text) > 100 and _is_grant_page_text(text)
        ]
        diagnostics["grant_pages_identified"] = len(grant_pages)

        if not grant_pages:
            return [], "no_grant_pages", 0
    finally:
        pdf.close()

    client = _get_client()
    year_hint = _year_hint(tax_year)

    # One Haiku call per identified grant page; results are concatenated.
    raw_grants: list[dict] = []
    try:
        for page_num, text in grant_pages:
            resp = client.messages.create(
                model=MODEL,
                max_tokens=4096,
                messages=[{
                    "role": "user",
                    "content": f"{TEXT_EXTRACTION_PROMPT}{year_hint}\n\nPage text:\n{text}",
                }],
            )
            raw_grants.extend(_parse_haiku_response(resp.content[0].text))
    except anthropic.APIError as exc:
        # Discard partial results — fall back to vision cleanly.
        diagnostics["text_path_error"] = f"{type(exc).__name__}: {exc}"
        return [], "error", len(grant_pages)

    if not raw_grants:
        return [], "haiku_empty", len(grant_pages)

    grants: list[ExtractedGrant] = []
    for raw in raw_grants:
        g = _normalize_grant(raw)
        if g is not None:
            grants.append(g)

    # Low-yield heuristic (see MIN_GRANTS_PER_PAGE): fewer grants than grant
    # pages suggests a garbled text layer; the caller discards these grants
    # and retries via the vision path.
    if len(grants) < len(grant_pages) * MIN_GRANTS_PER_PAGE:
        return grants, "low_yield", len(grant_pages)

    return grants, "ok", len(grant_pages)
+
+
+# ---------------------------------------------------------------------------
+# Vision path
+# ---------------------------------------------------------------------------
+
def _render_page_b64(page, dpi: int) -> str:
    """Render a fitz page to a PNG at the given DPI, returned as base64 text."""
    png_bytes = page.get_pixmap(dpi=dpi).tobytes("png")
    return base64.b64encode(png_bytes).decode("ascii")
+
+
def _extract_vision(
    pdf_path: Path,
    tax_year: int | None,
    diagnostics: dict,
) -> tuple[list[ExtractedGrant], int, int]:
    """Extract grants via Haiku vision over rendered page images.

    Two phases: (1) triage each post-form page with a cheap low-DPI yes/no
    call; (2) run full extraction only on pages that triaged as grant tables.

    Args:
        pdf_path: local PDF, rendered with PyMuPDF.
        tax_year: optional year hint appended to the extraction prompt.
        diagnostics: mutated in place with scan/extract counters.

    Returns (grants, pages_scanned, pages_extracted). Raises anthropic.APIError
    on catastrophic API failure (caller handles).
    """
    client = _get_client()
    year_hint = _year_hint(tax_year)

    doc = fitz.open(pdf_path)
    try:
        total_pages = len(doc)
        # setdefault: the text path may already have recorded pages_total.
        diagnostics.setdefault("pages_total", total_pages)

        # Grant attachments only appear after the 13-page core form
        # (ATTACHMENT_START_PAGE_IDX) — nothing to scan in shorter PDFs.
        if total_pages <= ATTACHMENT_START_PAGE_IDX:
            diagnostics["vision_pages_scanned"] = 0
            diagnostics["vision_pages_extracted"] = 0
            return [], 0, 0

        # Phase 1: triage pages to find grant tables.
        grant_page_indices: list[int] = []
        consecutive_no = 0
        found_any = False
        pages_scanned = 0

        for i in range(ATTACHMENT_START_PAGE_IDX, total_pages):
            pages_scanned += 1
            # Low DPI keeps triage cheap; a yes/no answer needs no detail.
            b64 = _render_page_b64(doc[i], dpi=TRIAGE_DPI)
            resp = client.messages.create(
                model=MODEL,
                max_tokens=10,
                messages=[{
                    "role": "user",
                    "content": [
                        {"type": "image",
                         "source": {"type": "base64", "media_type": "image/png", "data": b64}},
                        {"type": "text", "text": TRIAGE_PROMPT},
                    ],
                }],
            )
            is_grant = "yes" in resp.content[0].text.lower()
            if is_grant:
                grant_page_indices.append(i)
                consecutive_no = 0
                found_any = True
            else:
                consecutive_no += 1
                # Once grant pages have been seen, a run of "no" pages means
                # the table is over — stop scanning early to save calls.
                if found_any and consecutive_no >= VISION_CONSECUTIVE_NO_LIMIT:
                    break

        diagnostics["vision_pages_scanned"] = pages_scanned

        if not grant_page_indices:
            diagnostics["vision_pages_extracted"] = 0
            return [], pages_scanned, 0

        # Phase 2: extract from identified grant pages.
        raw_grants: list[dict] = []
        pages_extracted = 0
        for i in grant_page_indices:
            # Higher DPI than triage: extraction needs legible table text.
            b64 = _render_page_b64(doc[i], dpi=EXTRACTION_DPI)
            resp = client.messages.create(
                model=MODEL,
                max_tokens=4096,
                messages=[{
                    "role": "user",
                    "content": [
                        {"type": "image",
                         "source": {"type": "base64", "media_type": "image/png", "data": b64}},
                        {"type": "text", "text": f"{VISION_EXTRACTION_PROMPT}{year_hint}"},
                    ],
                }],
            )
            page_grants = _parse_haiku_response(resp.content[0].text)
            if page_grants:
                pages_extracted += 1
            raw_grants.extend(page_grants)

        diagnostics["vision_pages_extracted"] = pages_extracted
    finally:
        doc.close()

    grants: list[ExtractedGrant] = []
    for raw in raw_grants:
        g = _normalize_grant(raw)
        if g is not None:
            grants.append(g)

    return grants, pages_scanned, pages_extracted
+
+
+# ---------------------------------------------------------------------------
+# Top-level extractor
+# ---------------------------------------------------------------------------
+
def extract_from_pdf(
    pdf_path: str | Path,
    tax_year: int | None = None,
    source_label: str | None = None,
) -> ExtractionResult:
    """Extract grants from a single 990-PF PDF.

    Stateless: reads only the file at pdf_path and calls Anthropic's API.
    Writes nothing. See module docstring for full contract.

    Args:
        pdf_path: path to a local PDF file.
        tax_year: optional year hint forwarded to the extraction prompts.
        source_label: free-form label recorded in diagnostics only.
    """
    pdf_path = Path(pdf_path)

    diagnostics: dict = {
        "tax_year_hint": tax_year,
        "source_label": source_label,
    }

    def _failure(msg: str) -> ExtractionResult:
        # Shared shape for every catastrophic-failure return.
        diagnostics["error"] = msg
        return ExtractionResult(
            success=False,
            grants=[],
            grant_detail_status=None,
            method="failed",
            diagnostics=diagnostics,
        )

    if not pdf_path.exists():
        return _failure(f"PDF not found: {pdf_path}")

    # Text path
    try:
        text_grants, text_status, num_grant_pages = _extract_text_layer(
            pdf_path, tax_year, diagnostics
        )
    except Exception as exc:
        # pdfplumber can blow up on malformed PDFs — don't let that kill us,
        # fall through to the vision path like any other text failure.
        text_grants = []
        text_status = "error"
        num_grant_pages = 0
        diagnostics.setdefault("text_path_error", f"{type(exc).__name__}: {exc}")

    diagnostics["text_path_status"] = text_status

    if text_status == "ok":
        grants = _postprocess(text_grants)
        return ExtractionResult(
            success=True,
            grants=grants,
            grant_detail_status="supplemented" if grants else "no_grants",
            method="pdfplumber+haiku_text",
            diagnostics=diagnostics,
        )

    # Short-circuit: PDF has a readable text layer but isn't a 990-PF
    # (e.g. a standalone NY State CHAR500 cover form). Don't run vision —
    # there's nothing in the document for it to find.
    if text_status == "not_a_990pf":
        return ExtractionResult(
            success=True,
            grants=[],
            grant_detail_status="not_a_990pf",
            method="skipped_not_990pf",
            diagnostics=diagnostics,
        )

    # Vision fallback — replaces text output entirely.
    try:
        vision_grants, _scanned, _extracted = _extract_vision(
            pdf_path, tax_year, diagnostics
        )
    except Exception as exc:
        # One handler suffices: anthropic.APIError is an Exception subclass
        # and both cases produce the identical failed result.
        return _failure(f"{type(exc).__name__}: {exc}")

    grants = _postprocess(vision_grants)
    return ExtractionResult(
        success=True,
        grants=grants,
        grant_detail_status="supplemented" if grants else "no_grants",
        method="haiku_vision_attempted",
        diagnostics=diagnostics,
    )
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
def _result_to_jsonable(result: ExtractionResult) -> dict:
    """Convert an ExtractionResult to a JSON-serializable dict.

    Decimal amounts are stringified, since the json module cannot encode
    Decimal directly.
    """
    serialized_grants = []
    for grant in result.grants:
        row = asdict(grant)
        if row["amount"] is not None:
            row["amount"] = str(row["amount"])
        serialized_grants.append(row)

    return {
        "success": result.success,
        "method": result.method,
        "grant_detail_status": result.grant_detail_status,
        "diagnostics": result.diagnostics,
        "grants": serialized_grants,
        "total_amount": str(result.total_amount),
    }
+
+
def _print_table(result: ExtractionResult) -> None:
    """Print a human-readable summary of an extraction result to stdout."""
    for label, value in (
        ("success", result.success),
        ("method", result.method),
        ("grant_detail_status", result.grant_detail_status),
        ("grants", len(result.grants)),
    ):
        print(f"{label}: {value}")
    print(f"total_amount: ${result.total_amount:,}")
    print("diagnostics:")
    for key, val in result.diagnostics.items():
        print(f"  {key}: {val}")
    if not result.grants:
        return
    print()
    print(f"{'#':>4} {'recipient':<45} {'city':<20} {'st':<3} {'amount':>12}")
    print("-" * 90)
    shown = result.grants[:50]
    for g in shown:
        name = (g.recipient_name or g.recipient_person_name or "")[:45]
        city = (g.city or "")[:20]
        state = (g.state or "")[:3]
        if g.amount is not None:
            amt = f"${g.amount:,.0f}"
        else:
            amt = g.amount_raw or ""
        print(f"{g.line_number:>4} {name:<45} {city:<20} {state:<3} {amt:>12}")
    hidden = len(result.grants) - len(shown)
    if hidden:
        print(f"... and {hidden} more")
+
+
def main(argv: list[str] | None = None) -> int:
    """CLI entry point. Exit code 0 on success, 1 on extraction failure."""
    parser = argparse.ArgumentParser(
        prog="scripts.extract.irs_990_pdf",
        description="Extract grant data from a 990-PF PDF.",
    )
    parser.add_argument("pdf_path", help="Path to a local PDF file.")
    parser.add_argument(
        "--tax-year", type=int, default=None,
        help="Tax year hint passed to the extraction prompts.",
    )
    parser.add_argument(
        "--source-label", default=None,
        help="Diagnostic label for the PDF source (e.g. 'ny_ag').",
    )
    parser.add_argument(
        "--json", action="store_true",
        help="Emit the full result as JSON instead of a table.",
    )
    opts = parser.parse_args(argv)

    result = extract_from_pdf(
        opts.pdf_path,
        tax_year=opts.tax_year,
        source_label=opts.source_label,
    )

    if opts.json:
        print(json.dumps(_result_to_jsonable(result), indent=2))
    else:
        _print_table(result)

    return 0 if result.success else 1
+
+
+if __name__ == "__main__":
+ sys.exit(main())