"""
Source-agnostic 990-PF PDF grant extractor.

Takes a path to a local PDF and returns structured grant data. Side-effect
free with respect to any database; the only external service called is
Anthropic's API (the API key is fetched on first use by running the local
`pass` command). A separate loader consumes ExtractionResult and writes to
raw.* tables.

Usage (CLI):
    python -m scripts.extract.irs_990_pdf path/to/file.pdf
    python -m scripts.extract.irs_990_pdf path/to/file.pdf --json
    python -m scripts.extract.irs_990_pdf path/to/file.pdf --tax-year 2021
    python -m scripts.extract.irs_990_pdf path/to/file.pdf --source-label ny_ag

Usage (programmatic):
    from scripts.extract.irs_990_pdf import extract_from_pdf
    result = extract_from_pdf("data/tmp/pdf_test/marley_2017.pdf", tax_year=2017)
    for grant in result.grants:
        print(grant.recipient_name, grant.amount)
"""

from __future__ import annotations

import argparse
import base64
import json
import re
import subprocess
import sys
from dataclasses import dataclass, field, asdict
from decimal import Decimal, InvalidOperation
from pathlib import Path
from typing import Any

import anthropic
import fitz  # pymupdf
import pdfplumber

from scripts.common.normalize import is_placeholder, parse_numeric

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# Model id passed to every Anthropic messages.create call in this module.
MODEL = "claude-haiku-4-5-20251001"

# Text-path quality heuristic. If the number of grants extracted is less than
# num_grant_pages * MIN_GRANTS_PER_PAGE, we flag the result as low-yield and
# fall back to vision. This is an initial heuristic and may be tuned.
MIN_GRANTS_PER_PAGE = 1

# Minimum total text characters across all pages before we trust the text layer.
MIN_TEXT_LAYER_CHARS = 200

# Standard 990-PF form has 13 pages; grant attachments come after.
ATTACHMENT_START_PAGE_IDX = 13  # 0-based; corresponds to page 14

# Vision scan stops after this many consecutive non-grant pages (once any
# grant page has been found).
VISION_CONSECUTIVE_NO_LIMIT = 3

# DPIs used for vision rendering.
TRIAGE_DPI = 100
EXTRACTION_DPI = 150

# Patterns matched against upper-cased page text to flag candidate grant pages.
GRANT_PAGE_KEYWORDS = [
    r"GRANTS AND CONTRIBUTIONS PAID",
    r"SUPPLEMENTARY INFORMATION",
    r"PART XIV",
    r"PART XV",
    r"SCHEDULE OF.*GRANT",
    r"GRANTS PAID",
    r"CONTRIBUTIONS PAID",
]
_GRANT_PAGE_RE = re.compile("|".join(GRANT_PAGE_KEYWORDS))

# Loose marker for "this PDF is a 990-PF". NY AG PDFs may contain only the
# CHAR500 state cover form with no federal return attached — ~46% of that
# corpus, based on sampling. Those should short-circuit before we waste any
# Haiku calls.
_IS_990PF_RE = re.compile(r"FORM\s*990-?PF")

TEXT_EXTRACTION_PROMPT = """Extract ALL individual grants from this 990-PF page text. Return a JSON array.

Each grant should have:
- recipient_name: organization name (if grant is to an org)
- recipient_person_name: person's name (if grant is to an individual)
- address_line1: street address (if present)
- address_line2: second address line (if present)
- city: city (if present)
- state: state abbreviation (if present)
- zip: zip code (if present)
- country: country (if present, only when non-US)
- foreign_postal_code: foreign postal code (if present)
- amount: dollar amount as string (digits only, no $ or commas)
- purpose: purpose of grant (if present)
- foundation_status: recipient status like PC, NC, PF (if present)
- relationship: relationship of recipient to foundation (if present)

IMPORTANT RULES:
- Do NOT include total/subtotal rows
- Do NOT include header rows or column labels
- If there are multiple year columns, extract ONLY the most recent year
- If recipient name is "SEE ATTACHED", "VARIOUS", "N/A", "NONE", or a similar placeholder, skip it
- Return [] if no individual grants are found

Return ONLY the JSON array, no other text."""

VISION_EXTRACTION_PROMPT = """Extract ALL individual grants from this 990-PF page image. Return a JSON array.

Each grant should have:
- recipient_name: organization name (if grant is to an org)
- recipient_person_name: person's name (if grant is to an individual)
- address_line1: street address (if visible)
- address_line2: second address line (if visible)
- city: city
- state: state abbreviation
- zip: zip code (if visible)
- country: country (only if non-US)
- foreign_postal_code: foreign postal code (if present)
- amount: dollar amount as string (digits only, no $ or commas)
- purpose: purpose of grant
- foundation_status: recipient status like PC, NC, PF (if present)
- relationship: relationship of recipient to foundation (if present)

IMPORTANT RULES:
- Do NOT include total/subtotal rows
- Do NOT include header rows
- If there are multiple year columns, extract ONLY the most recent year
- If recipient name is "SEE ATTACHED", "VARIOUS", "N/A", "NONE", or a similar placeholder, skip it
- Return [] if no individual grants are found

Return ONLY the JSON array, no other text."""

TRIAGE_PROMPT = (
    "Is this page a table of grant or contribution recipients listing "
    "individual organization names with addresses and dollar amounts? "
    "Answer ONLY yes or no."
)

# ---------------------------------------------------------------------------
# Anthropic client (lazy singleton so importing this module is cheap)
# ---------------------------------------------------------------------------

_client: anthropic.Anthropic | None = None


def _get_client() -> anthropic.Anthropic:
    """Return the process-wide Anthropic client, creating it on first use.

    The API key is read by running ``pass show``. If ``pass`` cannot be
    started, exits non-zero, or prints nothing, the key is passed as None so
    the SDK falls back to the ANTHROPIC_API_KEY environment variable, rather
    than caching a client built with an empty-string key.
    """
    global _client
    if _client is None:
        try:
            proc = subprocess.run(
                ["pass", "show", "anthropic.com/api.anthropic.com/apikey"],
                capture_output=True,
                text=True,
            )
        except OSError:
            # `pass` is not installed / not executable.
            api_key = ""
        else:
            api_key = proc.stdout.strip() if proc.returncode == 0 else ""
        _client = anthropic.Anthropic(api_key=api_key or None)
    return _client


# ---------------------------------------------------------------------------
# Dataclasses
# ---------------------------------------------------------------------------

@dataclass
class ExtractedGrant:
    """One grant row as extracted from a PDF page.

    Every field except line_number is optional. amount_raw is the cleaned
    amount string from the model; amount is its parsed Decimal, or None when
    it could not be parsed.
    """

    line_number: int
    recipient_name: str | None = None
    recipient_name2: str | None = None
    recipient_person_name: str | None = None
    address_line1: str | None = None
    address_line2: str | None = None
    city: str | None = None
    state: str | None = None
    zip: str | None = None
    country: str | None = None
    foreign_postal_code: str | None = None
    amount_raw: str | None = None
    amount: Decimal | None = None
    purpose: str | None = None
    foundation_status: str | None = None
    relationship: str | None = None


@dataclass
class ExtractionResult:
    """Outcome of extract_from_pdf for a single PDF."""

    success: bool
    grants: list[ExtractedGrant]
    # 'supplemented'  — ≥1 grant extracted
    # 'no_grants'     — extractor ran end-to-end, found nothing usable
    # 'not_a_990pf'   — PDF is readable but isn't a 990-PF (e.g. CHAR500 cover)
    # None            — catastrophic failure (success=False)
    grant_detail_status: str | None
    # 'pdfplumber+haiku_text'   — text path was used
    # 'haiku_vision_attempted'  — vision path was used (success not implied)
    # 'skipped_not_990pf'       — short-circuited; no API calls made
    # 'failed'                  — catastrophic failure (success=False)
    method: str
    diagnostics: dict = field(default_factory=dict)

    @property
    def total_amount(self) -> Decimal:
        """Sum of all parsed grant amounts; grants without an amount count as 0."""
        # Explicit Decimal start value so an empty grant list yields
        # Decimal(0) rather than the int 0 that bare sum() would return.
        return sum((g.amount or Decimal(0) for g in self.grants), Decimal(0))


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _response_text(resp: Any) -> str:
    """Return the text of the first text-bearing content block of a response.

    Returns "" when the response has no content blocks or none carries text,
    so an empty response is treated as "nothing extracted" instead of raising
    IndexError and aborting the whole PDF.
    """
    for block in getattr(resp, "content", None) or []:
        text = getattr(block, "text", None)
        if text:
            return text
    return ""


def _record_if_truncated(
    resp: Any, diagnostics: dict, key: str, page_number: int
) -> None:
    """Append page_number to diagnostics[key] if the reply hit max_tokens.

    A truncated reply is usually invalid JSON, so the page silently yields no
    grants; recording it makes that loss visible to the caller.
    """
    if getattr(resp, "stop_reason", None) == "max_tokens":
        diagnostics.setdefault(key, []).append(page_number)


def _parse_haiku_response(text: str) -> list[dict]:
    """Parse a JSON array of objects out of a Haiku reply.

    Handles markdown code fences. If the cleaned text is not valid JSON
    (e.g. the model wrapped the array in prose, or used a single-line fence),
    falls back to parsing the span from the first '[' to the last ']'.
    Returns [] when no JSON list can be recovered; non-dict items are dropped.
    """
    text = (text or "").strip()
    if text.startswith("```"):
        # Drop opening fence + optional language tag
        parts = text.split("\n", 1)
        if len(parts) == 2:
            text = parts[1]
        text = text.rsplit("```", 1)[0]
        text = text.strip()
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        start, end = text.find("["), text.rfind("]")
        if start == -1 or end <= start:
            return []
        try:
            parsed = json.loads(text[start:end + 1])
        except json.JSONDecodeError:
            return []
    if not isinstance(parsed, list):
        return []
    return [x for x in parsed if isinstance(x, dict)]


def _is_grant_page_text(text: str) -> bool:
    """Return True if the upper-cased page text matches any grant-page keyword."""
    return bool(_GRANT_PAGE_RE.search(text.upper()))


def _clean_str(value: Any) -> str | None:
    """Stringify and strip value; return None for None or an empty result."""
    if value is None:
        return None
    s = str(value).strip()
    return s or None


def _normalize_grant(raw: dict) -> ExtractedGrant | None:
    """Map a loose Haiku JSON dict to an ExtractedGrant.

    Placeholder recipient names are blanked to None here; rows left with no
    recipient are dropped later by _postprocess. The current implementation
    always returns an ExtractedGrant (the optional return type is kept for
    callers that already check for None).

    line_number is set to 0 here; it's reassigned later after dedupe.
    """
    recipient_name = _clean_str(raw.get("recipient_name"))
    recipient_person_name = _clean_str(raw.get("recipient_person_name"))

    # Safety net: prompt already filters these, but double-check.
    if recipient_name and is_placeholder(recipient_name):
        recipient_name = None
    if recipient_person_name and is_placeholder(recipient_person_name):
        recipient_person_name = None

    amount_raw = _clean_str(raw.get("amount"))
    amount_numeric: Decimal | None = None
    if amount_raw is not None:
        parsed = parse_numeric(amount_raw)
        if parsed is not None:
            try:
                # Go through str() so a float result (if parse_numeric ever
                # returns one) converts to its short decimal form instead of
                # its binary expansion; str/int/Decimal round-trip unchanged.
                amount_numeric = Decimal(str(parsed))
            except (InvalidOperation, ValueError, TypeError):
                amount_numeric = None

    return ExtractedGrant(
        line_number=0,
        recipient_name=recipient_name,
        recipient_name2=_clean_str(raw.get("recipient_name2")),
        recipient_person_name=recipient_person_name,
        address_line1=_clean_str(raw.get("address_line1")),
        address_line2=_clean_str(raw.get("address_line2")),
        city=_clean_str(raw.get("city")),
        state=_clean_str(raw.get("state")),
        zip=_clean_str(raw.get("zip")),
        country=_clean_str(raw.get("country")),
        foreign_postal_code=_clean_str(raw.get("foreign_postal_code")),
        amount_raw=amount_raw,
        amount=amount_numeric,
        purpose=_clean_str(raw.get("purpose")),
        foundation_status=_clean_str(raw.get("foundation_status")),
        relationship=_clean_str(raw.get("relationship")),
    )


def _normalize_all(raw_grants: list[dict]) -> list[ExtractedGrant]:
    """Normalize every raw dict, dropping any that normalize to None."""
    normalized = (_normalize_grant(raw) for raw in raw_grants)
    return [g for g in normalized if g is not None]


def _year_hint(tax_year: int | None) -> str:
    """Return the prompt suffix naming the tax year, or "" when unknown."""
    if tax_year is None:
        return ""
    return f"\n\nThis filing is for tax year {tax_year}."


def _dedupe_key(g: ExtractedGrant) -> tuple[str, str, str, str, str]:
    """Return the case-insensitive identity used to detect duplicate rows.

    Includes the person name as well as the organization name: grants to
    individuals have no recipient_name, so without it distinct people with
    the same amount and no address would be treated as one grant.
    """
    return (
        (g.recipient_name or "").upper(),
        (g.recipient_person_name or "").upper(),
        (g.city or "").upper(),
        (g.state or "").upper(),
        g.amount_raw or "",
    )


def _postprocess(grants: list[ExtractedGrant]) -> list[ExtractedGrant]:
    """Drop placeholders / amount-less rows, dedupe, and reassign line_number.

    The first occurrence of each duplicate is kept. line_number is rewritten
    in place on the surviving grants, starting at 1.
    """
    out: list[ExtractedGrant] = []
    seen: set[tuple] = set()
    for g in grants:
        # Drop rows with no recipient at all.
        if not g.recipient_name and not g.recipient_person_name:
            continue
        # Drop placeholder recipients that slipped through (belt-and-suspenders).
        if g.recipient_name and is_placeholder(g.recipient_name):
            continue
        if g.recipient_person_name and is_placeholder(g.recipient_person_name):
            continue
        # Drop rows with no amount text at all (Haiku didn't see an amount column).
        # Keep rows where amount_raw == '0' and amount == Decimal(0).
        if g.amount is None and not g.amount_raw:
            continue

        key = _dedupe_key(g)
        if key in seen:
            continue
        seen.add(key)
        out.append(g)

    for i, g in enumerate(out, start=1):
        g.line_number = i
    return out


# ---------------------------------------------------------------------------
# Text path
# ---------------------------------------------------------------------------

def _extract_text_layer(
    pdf_path: Path,
    tax_year: int | None,
    diagnostics: dict,
) -> tuple[list[ExtractedGrant], str, int]:
    """Extract grants via pdfplumber text + Haiku text parsing.

    Returns (grants, status, num_grant_pages).
    status ∈ {'ok', 'not_a_990pf', 'no_text_layer', 'no_grant_pages',
              'haiku_empty', 'low_yield', 'error'}

    'not_a_990pf' means the PDF has a readable text layer but doesn't look
    like a 990-PF at all (e.g. a standalone NY State CHAR500 cover form).
    The caller should short-circuit on this — there's nothing for the vision
    path to find either.

    Writes text_layer_chars, pages_total, grant_pages_identified and, on
    failure, text_path_error into diagnostics. Pages whose reply was cut off
    at max_tokens are listed (1-based) under text_pages_truncated.
    """
    try:
        pdf = pdfplumber.open(pdf_path)
    except Exception as exc:
        diagnostics["text_path_error"] = str(exc)
        return [], "error", 0

    # Only the page text is needed, so the PDF is closed before any API call.
    with pdf:
        page_texts: list[tuple[int, str]] = [
            (idx, page.extract_text() or "")
            for idx, page in enumerate(pdf.pages, start=1)
        ]

    total_chars = sum(len(text) for _, text in page_texts)
    diagnostics["text_layer_chars"] = total_chars
    diagnostics["pages_total"] = len(page_texts)

    if total_chars < MIN_TEXT_LAYER_CHARS:
        return [], "no_text_layer", 0

    # Must look like a 990-PF at all. If not, don't bother with Haiku —
    # and don't fall through to vision, since it'd just burn triage calls
    # scanning a non-990-PF document end to end.
    joined_upper = "\n".join(t for _, t in page_texts).upper()
    if not _IS_990PF_RE.search(joined_upper):
        return [], "not_a_990pf", 0

    grant_pages = [
        (num, text)
        for num, text in page_texts
        if len(text) > 100 and _is_grant_page_text(text)
    ]
    diagnostics["grant_pages_identified"] = len(grant_pages)
    if not grant_pages:
        return [], "no_grant_pages", 0

    client = _get_client()
    year_hint = _year_hint(tax_year)
    raw_grants: list[dict] = []
    try:
        for page_num, text in grant_pages:
            resp = client.messages.create(
                model=MODEL,
                max_tokens=4096,
                messages=[{
                    "role": "user",
                    "content": f"{TEXT_EXTRACTION_PROMPT}{year_hint}\n\nPage text:\n{text}",
                }],
            )
            _record_if_truncated(resp, diagnostics, "text_pages_truncated", page_num)
            raw_grants.extend(_parse_haiku_response(_response_text(resp)))
    except anthropic.APIError as exc:
        # Discard partial results — fall back to vision cleanly.
        diagnostics["text_path_error"] = f"{type(exc).__name__}: {exc}"
        return [], "error", len(grant_pages)

    if not raw_grants:
        return [], "haiku_empty", len(grant_pages)

    grants = _normalize_all(raw_grants)
    if len(grants) < len(grant_pages) * MIN_GRANTS_PER_PAGE:
        return grants, "low_yield", len(grant_pages)
    return grants, "ok", len(grant_pages)


# ---------------------------------------------------------------------------
# Vision path
# ---------------------------------------------------------------------------

def _render_page_b64(page, dpi: int) -> str:
    """Render a PyMuPDF page to PNG at the given DPI and base64-encode it."""
    pix = page.get_pixmap(dpi=dpi)
    return base64.standard_b64encode(pix.tobytes("png")).decode("utf-8")


def _image_message(b64: str, prompt: str) -> list[dict]:
    """Build the single-turn messages payload: one PNG image plus a text prompt."""
    return [{
        "role": "user",
        "content": [
            {"type": "image",
             "source": {"type": "base64", "media_type": "image/png", "data": b64}},
            {"type": "text", "text": prompt},
        ],
    }]


def _extract_vision(
    pdf_path: Path,
    tax_year: int | None,
    diagnostics: dict,
) -> tuple[list[ExtractedGrant], int, int]:
    """Extract grants via Haiku vision over rendered page images.

    Only pages from ATTACHMENT_START_PAGE_IDX onward are considered. Each is
    first triaged (yes/no) at TRIAGE_DPI; pages answered "yes" are then
    re-rendered at EXTRACTION_DPI and sent for extraction.

    Returns (grants, pages_scanned, pages_extracted). Raises
    anthropic.APIError on catastrophic API failure (caller handles).
    Pages whose extraction reply was cut off at max_tokens are listed
    (1-based) under diagnostics['vision_pages_truncated'].
    """
    client = _get_client()
    year_hint = _year_hint(tax_year)

    doc = fitz.open(pdf_path)
    try:
        total_pages = len(doc)
        diagnostics.setdefault("pages_total", total_pages)

        if total_pages <= ATTACHMENT_START_PAGE_IDX:
            diagnostics["vision_pages_scanned"] = 0
            diagnostics["vision_pages_extracted"] = 0
            return [], 0, 0

        # Phase 1: triage pages to find grant tables.
        grant_page_indices: list[int] = []
        consecutive_no = 0
        found_any = False
        pages_scanned = 0

        for i in range(ATTACHMENT_START_PAGE_IDX, total_pages):
            pages_scanned += 1
            b64 = _render_page_b64(doc[i], dpi=TRIAGE_DPI)
            resp = client.messages.create(
                model=MODEL,
                max_tokens=10,
                messages=_image_message(b64, TRIAGE_PROMPT),
            )
            is_grant = "yes" in _response_text(resp).lower()
            if is_grant:
                grant_page_indices.append(i)
                consecutive_no = 0
                found_any = True
            else:
                consecutive_no += 1
                # Only stop early once the grant section has been seen and
                # then clearly left behind.
                if found_any and consecutive_no >= VISION_CONSECUTIVE_NO_LIMIT:
                    break

        diagnostics["vision_pages_scanned"] = pages_scanned

        if not grant_page_indices:
            diagnostics["vision_pages_extracted"] = 0
            return [], pages_scanned, 0

        # Phase 2: extract from identified grant pages.
        raw_grants: list[dict] = []
        pages_extracted = 0
        extraction_prompt = f"{VISION_EXTRACTION_PROMPT}{year_hint}"
        for i in grant_page_indices:
            b64 = _render_page_b64(doc[i], dpi=EXTRACTION_DPI)
            resp = client.messages.create(
                model=MODEL,
                max_tokens=4096,
                messages=_image_message(b64, extraction_prompt),
            )
            # i is a 0-based index; diagnostics use 1-based page numbers.
            _record_if_truncated(resp, diagnostics, "vision_pages_truncated", i + 1)
            page_grants = _parse_haiku_response(_response_text(resp))
            if page_grants:
                pages_extracted += 1
                raw_grants.extend(page_grants)

        diagnostics["vision_pages_extracted"] = pages_extracted
    finally:
        doc.close()

    return _normalize_all(raw_grants), pages_scanned, pages_extracted


# ---------------------------------------------------------------------------
# Top-level extractor
# ---------------------------------------------------------------------------

def _failed_result(diagnostics: dict, message: str) -> ExtractionResult:
    """Record message under diagnostics['error'] and build the failure result."""
    diagnostics["error"] = message
    return ExtractionResult(
        success=False,
        grants=[],
        grant_detail_status=None,
        method="failed",
        diagnostics=diagnostics,
    )


def extract_from_pdf(
    pdf_path: str | Path,
    tax_year: int | None = None,
    source_label: str | None = None,
) -> ExtractionResult:
    """Extract grants from a single 990-PF PDF.

    Stateless: reads only the file at pdf_path and calls Anthropic's API.
    Writes nothing. See module docstring for full contract.

    Flow: the text path runs first. An 'ok' text result is returned as-is; a
    'not_a_990pf' result short-circuits without API calls. Any other text
    status falls back to the vision path. If vision yields no usable grants
    but the text path had produced a low-yield set, that set is returned
    rather than reporting no grants.

    Args:
        pdf_path: Path to a local PDF file.
        tax_year: Optional tax year appended to the extraction prompts.
        source_label: Optional label stored in diagnostics only.

    Returns:
        An ExtractionResult. success is False (method 'failed') when the path
        is not an existing regular file or the vision path raised.
    """
    pdf_path = Path(pdf_path)
    diagnostics: dict = {
        "tax_year_hint": tax_year,
        "source_label": source_label,
    }

    if not pdf_path.is_file():
        if pdf_path.exists():
            # e.g. a directory: opening it would only fail later, less clearly.
            return _failed_result(diagnostics, f"Not a regular file: {pdf_path}")
        return _failed_result(diagnostics, f"PDF not found: {pdf_path}")

    # Text path
    try:
        text_grants, text_status, num_grant_pages = _extract_text_layer(
            pdf_path, tax_year, diagnostics
        )
    except Exception as exc:
        # pdfplumber can blow up on malformed PDFs — don't let that kill us,
        # fall through to the vision path like any other text failure.
        text_grants = []
        text_status = "error"
        num_grant_pages = 0
        diagnostics.setdefault("text_path_error", f"{type(exc).__name__}: {exc}")

    diagnostics["text_path_status"] = text_status

    if text_status == "ok":
        grants = _postprocess(text_grants)
        return ExtractionResult(
            success=True,
            grants=grants,
            grant_detail_status="supplemented" if grants else "no_grants",
            method="pdfplumber+haiku_text",
            diagnostics=diagnostics,
        )

    # Short-circuit: PDF has a readable text layer but isn't a 990-PF
    # (e.g. a standalone NY State CHAR500 cover form). Don't run vision —
    # there's nothing in the document for it to find.
    if text_status == "not_a_990pf":
        return ExtractionResult(
            success=True,
            grants=[],
            grant_detail_status="not_a_990pf",
            method="skipped_not_990pf",
            diagnostics=diagnostics,
        )

    # Vision fallback — replaces text output whenever it finds anything.
    try:
        vision_grants, _scanned, _extracted = _extract_vision(
            pdf_path, tax_year, diagnostics
        )
    except Exception as exc:
        # Covers anthropic.APIError as well as rendering / PDF errors.
        return _failed_result(diagnostics, f"{type(exc).__name__}: {exc}")

    grants = _postprocess(vision_grants)

    if not grants and text_status == "low_yield" and text_grants:
        # Vision found nothing (for PDFs with no pages past the standard form
        # it scans zero pages). Keep the low-yield text grants rather than
        # discarding real rows and reporting "no_grants".
        kept = _postprocess(text_grants)
        if kept:
            diagnostics["vision_empty_kept_text"] = True
            return ExtractionResult(
                success=True,
                grants=kept,
                grant_detail_status="supplemented",
                method="pdfplumber+haiku_text",
                diagnostics=diagnostics,
            )

    return ExtractionResult(
        success=True,
        grants=grants,
        grant_detail_status="supplemented" if grants else "no_grants",
        method="haiku_vision_attempted",
        diagnostics=diagnostics,
    )


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

def _result_to_jsonable(result: ExtractionResult) -> dict:
    """Convert an ExtractionResult to a json.dumps-ready dict.

    Decimal amounts (per grant and the total) are rendered as strings.
    """
    def grant_to_dict(g: ExtractedGrant) -> dict:
        d = asdict(g)
        if d["amount"] is not None:
            d["amount"] = str(d["amount"])
        return d

    return {
        "success": result.success,
        "method": result.method,
        "grant_detail_status": result.grant_detail_status,
        "diagnostics": result.diagnostics,
        "grants": [grant_to_dict(g) for g in result.grants],
        "total_amount": str(result.total_amount),
    }


def _print_table(result: ExtractionResult) -> None:
    """Print a human-readable summary and the first 50 grants to stdout."""
    print(f"success:             {result.success}")
    print(f"method:              {result.method}")
    print(f"grant_detail_status: {result.grant_detail_status}")
    print(f"grants:              {len(result.grants)}")
    print(f"total_amount:        ${result.total_amount:,}")
    print("diagnostics:")
    for k, v in result.diagnostics.items():
        print(f"  {k}: {v}")
    if not result.grants:
        return
    print()
    print(f"{'#':>4}  {'recipient':<45}  {'city':<20}  {'st':<3}  {'amount':>12}")
    print("-" * 90)
    for g in result.grants[:50]:
        name = (g.recipient_name or g.recipient_person_name or "")[:45]
        city = (g.city or "")[:20]
        state = (g.state or "")[:3]
        # Fall back to the raw amount text when it could not be parsed.
        amt = f"${g.amount:,.0f}" if g.amount is not None else (g.amount_raw or "")
        print(f"{g.line_number:>4}  {name:<45}  {city:<20}  {state:<3}  {amt:>12}")
    if len(result.grants) > 50:
        print(f"... and {len(result.grants) - 50} more")


def main(argv: list[str] | None = None) -> int:
    """CLI entry point. Returns 0 when extraction succeeded, 1 otherwise."""
    parser = argparse.ArgumentParser(
        prog="scripts.extract.irs_990_pdf",
        description="Extract grant data from a 990-PF PDF.",
    )
    parser.add_argument("pdf_path", help="Path to a local PDF file.")
    parser.add_argument("--tax-year", type=int, default=None,
                        help="Tax year hint passed to the extraction prompts.")
    parser.add_argument("--source-label", default=None,
                        help="Diagnostic label for the PDF source (e.g. 'ny_ag').")
    parser.add_argument("--json", action="store_true",
                        help="Emit the full result as JSON instead of a table.")
    args = parser.parse_args(argv)

    result = extract_from_pdf(
        args.pdf_path,
        tax_year=args.tax_year,
        source_label=args.source_label,
    )

    if args.json:
        print(json.dumps(_result_to_jsonable(result), indent=2))
    else:
        _print_table(result)

    return 0 if result.success else 1


if __name__ == "__main__":
    sys.exit(main())