aboutsummaryrefslogtreecommitdiff
path: root/scripts/extract/irs_990_pdf.py
blob: 1d1209c585b6469675fa90710c2122d2d0d31233 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
"""
Source-agnostic 990-PF PDF grant extractor.

Takes a path to a local PDF and returns structured grant data. Side-effect
free with respect to any database; only external call is to Anthropic's API.

A separate loader consumes ExtractionResult and writes to raw.* tables.

Usage (CLI):
    python -m scripts.extract.irs_990_pdf path/to/file.pdf
    python -m scripts.extract.irs_990_pdf path/to/file.pdf --json
    python -m scripts.extract.irs_990_pdf path/to/file.pdf --tax-year 2021
    python -m scripts.extract.irs_990_pdf path/to/file.pdf --source-label ny_ag

Usage (programmatic):
    from scripts.extract.irs_990_pdf import extract_from_pdf
    result = extract_from_pdf("data/tmp/pdf_test/marley_2017.pdf", tax_year=2017)
    for grant in result.grants:
        print(grant.recipient_name, grant.amount)
"""

from __future__ import annotations

import argparse
import base64
import json
import re
import subprocess
import sys
from dataclasses import dataclass, field, asdict
from decimal import Decimal, InvalidOperation
from pathlib import Path
from typing import Any

import anthropic
import fitz  # pymupdf
import pdfplumber

from scripts.common.normalize import is_placeholder, parse_numeric


# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# Pinned Haiku model id used for both the text and vision calls.
MODEL = "claude-haiku-4-5-20251001"

# Text-path quality heuristic. If the number of grants extracted is less than
# num_grant_pages * MIN_GRANTS_PER_PAGE, we flag the result as low-yield and
# fall back to vision. This is an initial heuristic and may be tuned.
MIN_GRANTS_PER_PAGE = 1

# Minimum total text characters across all pages before we trust the text layer.
MIN_TEXT_LAYER_CHARS = 200

# Standard 990-PF form has 13 pages; grant attachments come after.
ATTACHMENT_START_PAGE_IDX = 13  # 0-based; corresponds to page 14

# Vision scan stops after this many consecutive non-grant pages (once any
# grant page has been found).
VISION_CONSECUTIVE_NO_LIMIT = 3

# DPIs used for vision rendering.
TRIAGE_DPI = 100      # low-res, cheap yes/no triage pass
EXTRACTION_DPI = 150  # higher-res pass for actual extraction

# Regex fragments matched against UPPERCASED page text (see
# _is_grant_page_text); OR-joined into _GRANT_PAGE_RE below.
GRANT_PAGE_KEYWORDS = [
    r"GRANTS AND CONTRIBUTIONS PAID",
    r"SUPPLEMENTARY INFORMATION",
    r"PART XIV",
    r"PART XV",
    r"SCHEDULE OF.*GRANT",
    r"GRANTS PAID",
    r"CONTRIBUTIONS PAID",
]
_GRANT_PAGE_RE = re.compile("|".join(GRANT_PAGE_KEYWORDS))

# Loose marker for "this PDF is a 990-PF". NY AG PDFs may contain only the
# CHAR500 state cover form with no federal return attached — ~46% of that
# corpus, based on sampling. Those should short-circuit before we waste any
# Haiku calls.
_IS_990PF_RE = re.compile(r"FORM\s*990-?PF")

# NOTE: the three prompts below are runtime strings sent to the model; the
# field names they describe must stay in sync with _normalize_grant's keys.
TEXT_EXTRACTION_PROMPT = """Extract ALL individual grants from this 990-PF page text.

Return a JSON array. Each grant should have:
- recipient_name: organization name (if grant is to an org)
- recipient_person_name: person's name (if grant is to an individual)
- address_line1: street address (if present)
- address_line2: second address line (if present)
- city: city (if present)
- state: state abbreviation (if present)
- zip: zip code (if present)
- country: country (if present, only when non-US)
- foreign_postal_code: foreign postal code (if present)
- amount: dollar amount as string (digits only, no $ or commas)
- purpose: purpose of grant (if present)
- foundation_status: recipient status like PC, NC, PF (if present)
- relationship: relationship of recipient to foundation (if present)

IMPORTANT RULES:
- Do NOT include total/subtotal rows
- Do NOT include header rows or column labels
- If there are multiple year columns, extract ONLY the most recent year
- If recipient name is "SEE ATTACHED", "VARIOUS", "N/A", "NONE", or a similar placeholder, skip it
- Return [] if no individual grants are found

Return ONLY the JSON array, no other text."""

VISION_EXTRACTION_PROMPT = """Extract ALL individual grants from this 990-PF page image.

Return a JSON array. Each grant should have:
- recipient_name: organization name (if grant is to an org)
- recipient_person_name: person's name (if grant is to an individual)
- address_line1: street address (if visible)
- address_line2: second address line (if visible)
- city: city
- state: state abbreviation
- zip: zip code (if visible)
- country: country (only if non-US)
- foreign_postal_code: foreign postal code (if present)
- amount: dollar amount as string (digits only, no $ or commas)
- purpose: purpose of grant
- foundation_status: recipient status like PC, NC, PF (if present)
- relationship: relationship of recipient to foundation (if present)

IMPORTANT RULES:
- Do NOT include total/subtotal rows
- Do NOT include header rows
- If there are multiple year columns, extract ONLY the most recent year
- If recipient name is "SEE ATTACHED", "VARIOUS", "N/A", "NONE", or a similar placeholder, skip it
- Return [] if no individual grants are found

Return ONLY the JSON array, no other text."""

# Yes/no prompt for the phase-1 vision triage pass (see _extract_vision).
TRIAGE_PROMPT = (
    "Is this page a table of grant or contribution recipients listing "
    "individual organization names with addresses and dollar amounts? "
    "Answer ONLY yes or no."
)


# ---------------------------------------------------------------------------
# Anthropic client (lazy singleton so importing this module is cheap)
# ---------------------------------------------------------------------------

_client: anthropic.Anthropic | None = None


def _get_client() -> anthropic.Anthropic:
    """Return the process-wide Anthropic client, creating it on first use.

    The API key is read from the ``pass`` password store. Fails fast
    (CalledProcessError from check=True, or RuntimeError on an empty key)
    instead of silently constructing a client with an empty key and
    deferring to a confusing authentication error on the first API call.
    """
    global _client
    if _client is None:
        proc = subprocess.run(
            ["pass", "show", "anthropic.com/api.anthropic.com/apikey"],
            capture_output=True, text=True, check=True,
        )
        api_key = proc.stdout.strip()
        if not api_key:
            raise RuntimeError("pass returned an empty Anthropic API key")
        _client = anthropic.Anthropic(api_key=api_key)
    return _client


# ---------------------------------------------------------------------------
# Dataclasses
# ---------------------------------------------------------------------------

@dataclass
class ExtractedGrant:
    """One grant row extracted from a 990-PF schedule, lightly normalized."""

    line_number: int  # 1-based position after dedupe; 0 until _postprocess runs
    recipient_name: str | None = None          # org recipient (None for person grants)
    recipient_name2: str | None = None         # second/overflow org name line
    recipient_person_name: str | None = None   # individual recipient
    address_line1: str | None = None
    address_line2: str | None = None
    city: str | None = None
    state: str | None = None                   # state abbreviation
    zip: str | None = None
    country: str | None = None                 # per prompts: populated only when non-US
    foreign_postal_code: str | None = None
    amount_raw: str | None = None              # amount string as the model returned it
    amount: Decimal | None = None              # parsed amount; None when unparseable
    purpose: str | None = None
    foundation_status: str | None = None       # recipient status, e.g. PC, NC, PF
    relationship: str | None = None            # recipient's relationship to the foundation


@dataclass
class ExtractionResult:
    """Outcome of one PDF extraction run: grants, status, and diagnostics."""

    success: bool
    grants: list[ExtractedGrant]
    # 'supplemented' — ≥1 grant extracted
    # 'no_grants'    — extractor ran end-to-end, found nothing usable
    # 'not_a_990pf'  — PDF is readable but isn't a 990-PF (e.g. CHAR500 cover)
    # None           — catastrophic failure (success=False)
    grant_detail_status: str | None
    # 'pdfplumber+haiku_text'   — text path was used
    # 'haiku_vision_attempted'  — vision path was used (success not implied)
    # 'skipped_not_990pf'       — short-circuited; no API calls made
    # 'failed'                  — catastrophic failure (success=False)
    method: str
    diagnostics: dict = field(default_factory=dict)

    @property
    def total_amount(self) -> Decimal:
        """Sum of parsed grant amounts; rows with no parsed amount count as 0.

        Seeded with Decimal(0) so the result is a Decimal even when the
        grants list is empty (a bare sum() would return int 0).
        """
        return sum(((g.amount or Decimal(0)) for g in self.grants), Decimal(0))


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _parse_haiku_response(text: str) -> list[dict]:
    """Parse JSON from Haiku response, handling markdown code blocks."""
    text = text.strip()
    if text.startswith("```"):
        # Drop opening fence + optional language tag
        parts = text.split("\n", 1)
        if len(parts) == 2:
            text = parts[1]
        text = text.rsplit("```", 1)[0]
    text = text.strip()
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        return []
    if not isinstance(parsed, list):
        return []
    return [x for x in parsed if isinstance(x, dict)]


def _is_grant_page_text(text: str) -> bool:
    """True when the uppercased page text matches any grant-schedule keyword."""
    return _GRANT_PAGE_RE.search(text.upper()) is not None


def _clean_str(value: Any) -> str | None:
    if value is None:
        return None
    s = str(value).strip()
    return s or None


def _normalize_grant(raw: dict) -> ExtractedGrant | None:
    """Map a loose Haiku JSON dict to an ExtractedGrant.

    Placeholder recipient names (per is_placeholder) are nulled out here, not
    rejected: the current implementation always returns a grant, and rows
    without any usable recipient are actually dropped later in _postprocess.
    Callers' None-checks are defensive against future filtering.
    line_number is set to 0 here; it's reassigned later after dedupe.
    """
    recipient_name = _clean_str(raw.get("recipient_name"))
    recipient_person_name = _clean_str(raw.get("recipient_person_name"))

    # Safety net: prompt already filters these, but double-check.
    if recipient_name and is_placeholder(recipient_name):
        recipient_name = None
    if recipient_person_name and is_placeholder(recipient_person_name):
        recipient_person_name = None

    # Keep the raw amount string even when it doesn't parse to a Decimal,
    # so downstream consumers can see what the model actually returned.
    amount_raw = _clean_str(raw.get("amount"))
    amount_numeric: Decimal | None = None
    if amount_raw is not None:
        parsed = parse_numeric(amount_raw)
        if parsed is not None:
            try:
                amount_numeric = Decimal(parsed)
            except InvalidOperation:
                amount_numeric = None

    return ExtractedGrant(
        line_number=0,
        recipient_name=recipient_name,
        recipient_name2=_clean_str(raw.get("recipient_name2")),
        recipient_person_name=recipient_person_name,
        address_line1=_clean_str(raw.get("address_line1")),
        address_line2=_clean_str(raw.get("address_line2")),
        city=_clean_str(raw.get("city")),
        state=_clean_str(raw.get("state")),
        zip=_clean_str(raw.get("zip")),
        country=_clean_str(raw.get("country")),
        foreign_postal_code=_clean_str(raw.get("foreign_postal_code")),
        amount_raw=amount_raw,
        amount=amount_numeric,
        purpose=_clean_str(raw.get("purpose")),
        foundation_status=_clean_str(raw.get("foundation_status")),
        relationship=_clean_str(raw.get("relationship")),
    )


def _year_hint(tax_year: int | None) -> str:
    if tax_year is None:
        return ""
    return f"\n\nThis filing is for tax year {tax_year}."


def _postprocess(grants: list[ExtractedGrant]) -> list[ExtractedGrant]:
    """Drop placeholders / amount-less rows, dedupe, and reassign line_number.

    Dedupe key covers both recipient names plus city/state/amount. The person
    name must be part of the key: person-only grants have recipient_name=None,
    and keying on (city, state, amount) alone would wrongly collapse distinct
    individuals who happen to share a location and amount.
    """
    out: list[ExtractedGrant] = []
    seen: set[tuple] = set()

    for g in grants:
        # Drop rows with no recipient at all.
        if not g.recipient_name and not g.recipient_person_name:
            continue
        # Drop placeholder recipients that slipped through (belt-and-suspenders).
        if g.recipient_name and is_placeholder(g.recipient_name):
            continue
        if g.recipient_person_name and is_placeholder(g.recipient_person_name):
            continue
        # Drop rows with no amount text at all (Haiku didn't see an amount column).
        # Keep rows where amount_raw == '0' and amount == Decimal(0).
        if g.amount is None and not g.amount_raw:
            continue

        key = (
            (g.recipient_name or "").upper(),
            (g.recipient_person_name or "").upper(),
            (g.city or "").upper(),
            (g.state or "").upper(),
            g.amount_raw or "",
        )
        if key in seen:
            continue
        seen.add(key)
        out.append(g)

    # Reassign 1-based line numbers over the surviving, deduped rows.
    for i, g in enumerate(out, start=1):
        g.line_number = i
    return out


# ---------------------------------------------------------------------------
# Text path
# ---------------------------------------------------------------------------

def _extract_text_layer(
    pdf_path: Path,
    tax_year: int | None,
    diagnostics: dict,
) -> tuple[list[ExtractedGrant], str, int]:
    """Extract grants via pdfplumber text + Haiku text parsing.

    Returns (grants, status, num_grant_pages).
    status ∈ {'ok', 'not_a_990pf', 'no_text_layer', 'no_grant_pages',
              'haiku_empty', 'low_yield', 'error'}

    'not_a_990pf' means the PDF has a readable text layer but doesn't look
    like a 990-PF at all (e.g. a standalone NY State CHAR500 cover form).
    The caller should short-circuit on this — there's nothing for the vision
    path to find either.
    """
    try:
        pdf = pdfplumber.open(pdf_path)
    except Exception as exc:
        diagnostics["text_path_error"] = str(exc)
        return [], "error", 0

    try:
        # Pull the text layer for every page up front; the PDF is closed
        # (in the finally below) before any Haiku calls are made.
        page_texts: list[tuple[int, str]] = []
        total_chars = 0
        for idx, page in enumerate(pdf.pages, start=1):
            text = page.extract_text() or ""  # normalize a missing text layer to ""
            page_texts.append((idx, text))
            total_chars += len(text)

        diagnostics["text_layer_chars"] = total_chars
        diagnostics["pages_total"] = len(page_texts)

        if total_chars < MIN_TEXT_LAYER_CHARS:
            return [], "no_text_layer", 0

        # Must look like a 990-PF at all. If not, don't bother with Haiku —
        # and don't fall through to vision, since it'd just burn triage calls
        # scanning a non-990-PF document end to end.
        joined_upper = "\n".join(t for _, t in page_texts).upper()
        if not _IS_990PF_RE.search(joined_upper):
            return [], "not_a_990pf", 0

        # A grant page needs both a keyword hit and some substance (>100
        # chars) so near-empty pages don't qualify.
        grant_pages = [
            (num, text) for num, text in page_texts
            if len(text) > 100 and _is_grant_page_text(text)
        ]
        diagnostics["grant_pages_identified"] = len(grant_pages)

        if not grant_pages:
            return [], "no_grant_pages", 0
    finally:
        pdf.close()

    client = _get_client()
    year_hint = _year_hint(tax_year)

    # One Haiku call per identified grant page; each reply is a JSON array.
    raw_grants: list[dict] = []
    try:
        for page_num, text in grant_pages:
            resp = client.messages.create(
                model=MODEL,
                max_tokens=4096,
                messages=[{
                    "role": "user",
                    "content": f"{TEXT_EXTRACTION_PROMPT}{year_hint}\n\nPage text:\n{text}",
                }],
            )
            raw_grants.extend(_parse_haiku_response(resp.content[0].text))
    except anthropic.APIError as exc:
        # Discard partial results — fall back to vision cleanly.
        diagnostics["text_path_error"] = f"{type(exc).__name__}: {exc}"
        return [], "error", len(grant_pages)

    if not raw_grants:
        return [], "haiku_empty", len(grant_pages)

    grants: list[ExtractedGrant] = []
    for raw in raw_grants:
        g = _normalize_grant(raw)
        if g is not None:
            grants.append(g)

    # Sanity check: too few grants relative to page count suggests a garbled
    # text layer; 'low_yield' tells the caller to try the vision path.
    if len(grants) < len(grant_pages) * MIN_GRANTS_PER_PAGE:
        return grants, "low_yield", len(grant_pages)

    return grants, "ok", len(grant_pages)


# ---------------------------------------------------------------------------
# Vision path
# ---------------------------------------------------------------------------

def _render_page_b64(page, dpi: int) -> str:
    pix = page.get_pixmap(dpi=dpi)
    return base64.standard_b64encode(pix.tobytes("png")).decode("utf-8")


def _extract_vision(
    pdf_path: Path,
    tax_year: int | None,
    diagnostics: dict,
) -> tuple[list[ExtractedGrant], int, int]:
    """Extract grants via Haiku vision over rendered page images.

    Two phases: (1) triage each page after the standard 13-page form with a
    cheap low-DPI yes/no call; (2) re-render the "yes" pages at higher DPI
    and run the full extraction prompt on them.

    Returns (grants, pages_scanned, pages_extracted). Raises anthropic.APIError
    on catastrophic API failure (caller handles).
    """
    client = _get_client()
    year_hint = _year_hint(tax_year)

    doc = fitz.open(pdf_path)
    try:
        total_pages = len(doc)
        # setdefault: don't clobber a count the text path may have recorded.
        diagnostics.setdefault("pages_total", total_pages)

        # No pages beyond the standard form → no attachments to scan.
        if total_pages <= ATTACHMENT_START_PAGE_IDX:
            diagnostics["vision_pages_scanned"] = 0
            diagnostics["vision_pages_extracted"] = 0
            return [], 0, 0

        # Phase 1: triage pages to find grant tables.
        grant_page_indices: list[int] = []
        consecutive_no = 0
        found_any = False
        pages_scanned = 0

        for i in range(ATTACHMENT_START_PAGE_IDX, total_pages):
            pages_scanned += 1
            # Low DPI + tiny max_tokens keep the yes/no triage call cheap.
            b64 = _render_page_b64(doc[i], dpi=TRIAGE_DPI)
            resp = client.messages.create(
                model=MODEL,
                max_tokens=10,
                messages=[{
                    "role": "user",
                    "content": [
                        {"type": "image",
                         "source": {"type": "base64", "media_type": "image/png", "data": b64}},
                        {"type": "text", "text": TRIAGE_PROMPT},
                    ],
                }],
            )
            is_grant = "yes" in resp.content[0].text.lower()
            if is_grant:
                grant_page_indices.append(i)
                consecutive_no = 0
                found_any = True
            else:
                consecutive_no += 1
                # Early stop: once grant pages have started, a run of "no"
                # pages means the schedule is over.
                if found_any and consecutive_no >= VISION_CONSECUTIVE_NO_LIMIT:
                    break

        diagnostics["vision_pages_scanned"] = pages_scanned

        if not grant_page_indices:
            diagnostics["vision_pages_extracted"] = 0
            return [], pages_scanned, 0

        # Phase 2: extract from identified grant pages.
        raw_grants: list[dict] = []
        pages_extracted = 0
        for i in grant_page_indices:
            b64 = _render_page_b64(doc[i], dpi=EXTRACTION_DPI)
            resp = client.messages.create(
                model=MODEL,
                max_tokens=4096,
                messages=[{
                    "role": "user",
                    "content": [
                        {"type": "image",
                         "source": {"type": "base64", "media_type": "image/png", "data": b64}},
                        {"type": "text", "text": f"{VISION_EXTRACTION_PROMPT}{year_hint}"},
                    ],
                }],
            )
            page_grants = _parse_haiku_response(resp.content[0].text)
            if page_grants:
                pages_extracted += 1
                raw_grants.extend(page_grants)

        diagnostics["vision_pages_extracted"] = pages_extracted
    finally:
        doc.close()

    # Normalization happens outside the try/finally so the document is
    # already closed by this point.
    grants: list[ExtractedGrant] = []
    for raw in raw_grants:
        g = _normalize_grant(raw)
        if g is not None:
            grants.append(g)

    return grants, pages_scanned, pages_extracted


# ---------------------------------------------------------------------------
# Top-level extractor
# ---------------------------------------------------------------------------

def extract_from_pdf(
    pdf_path: str | Path,
    tax_year: int | None = None,
    source_label: str | None = None,
) -> ExtractionResult:
    """Extract grants from a single 990-PF PDF.

    Stateless: reads only the file at pdf_path and calls Anthropic's API.
    Writes nothing. See module docstring for full contract.

    Strategy: try the text path first and return its grants on 'ok'. A
    'not_a_990pf' verdict short-circuits with no vision pass. Any other
    text-path status falls through to the vision path, whose output
    replaces the text path's partial results entirely.
    """
    pdf_path = Path(pdf_path)

    diagnostics: dict = {
        "tax_year_hint": tax_year,
        "source_label": source_label,
    }

    def _failure() -> ExtractionResult:
        # Shared shape for catastrophic failures; the caller records the
        # specific cause in diagnostics["error"] before calling this.
        return ExtractionResult(
            success=False,
            grants=[],
            grant_detail_status=None,
            method="failed",
            diagnostics=diagnostics,
        )

    if not pdf_path.exists():
        diagnostics["error"] = f"PDF not found: {pdf_path}"
        return _failure()

    # Text path
    try:
        text_grants, text_status, num_grant_pages = _extract_text_layer(
            pdf_path, tax_year, diagnostics
        )
    except Exception as exc:
        # pdfplumber can blow up on malformed PDFs — don't let that kill us,
        # fall through to the vision path like any other text failure.
        text_grants = []
        text_status = "error"
        num_grant_pages = 0
        diagnostics.setdefault("text_path_error", f"{type(exc).__name__}: {exc}")

    diagnostics["text_path_status"] = text_status

    if text_status == "ok":
        grants = _postprocess(text_grants)
        return ExtractionResult(
            success=True,
            grants=grants,
            grant_detail_status="supplemented" if grants else "no_grants",
            method="pdfplumber+haiku_text",
            diagnostics=diagnostics,
        )

    # Short-circuit: PDF has a readable text layer but isn't a 990-PF
    # (e.g. a standalone NY State CHAR500 cover form). Don't run vision —
    # there's nothing in the document for it to find.
    if text_status == "not_a_990pf":
        return ExtractionResult(
            success=True,
            grants=[],
            grant_detail_status="not_a_990pf",
            method="skipped_not_990pf",
            diagnostics=diagnostics,
        )

    # Vision fallback — replaces text output entirely. One broad handler:
    # the previous separate anthropic.APIError / Exception handlers had
    # identical bodies, so they are merged.
    try:
        vision_grants, _scanned, _extracted = _extract_vision(
            pdf_path, tax_year, diagnostics
        )
    except Exception as exc:
        diagnostics["error"] = f"{type(exc).__name__}: {exc}"
        return _failure()

    grants = _postprocess(vision_grants)
    return ExtractionResult(
        success=True,
        grants=grants,
        grant_detail_status="supplemented" if grants else "no_grants",
        method="haiku_vision_attempted",
        diagnostics=diagnostics,
    )


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

def _result_to_jsonable(result: ExtractionResult) -> dict:
    def grant_to_dict(g: ExtractedGrant) -> dict:
        d = asdict(g)
        if d["amount"] is not None:
            d["amount"] = str(d["amount"])
        return d

    return {
        "success": result.success,
        "method": result.method,
        "grant_detail_status": result.grant_detail_status,
        "diagnostics": result.diagnostics,
        "grants": [grant_to_dict(g) for g in result.grants],
        "total_amount": str(result.total_amount),
    }


def _print_table(result: ExtractionResult) -> None:
    """Print a human-readable summary plus up to the first 50 grant rows."""
    print(f"success:             {result.success}")
    print(f"method:              {result.method}")
    print(f"grant_detail_status: {result.grant_detail_status}")
    print(f"grants:              {len(result.grants)}")
    print(f"total_amount:        ${result.total_amount:,}")
    print("diagnostics:")
    for k, v in result.diagnostics.items():
        print(f"  {k}: {v}")
    if not result.grants:
        return
    print()
    # Column header for the per-grant rows below.
    print(f"{'#':>4}  {'recipient':<45}  {'city':<20} {'st':<3} {'amount':>12}")
    print("-" * 90)
    for g in result.grants[:50]:
        # Prefer the org name; fall back to the person name. Truncate to fit.
        name = (g.recipient_name or g.recipient_person_name or "")[:45]
        city = (g.city or "")[:20]
        state = (g.state or "")[:3]
        # Fall back to the raw amount text when numeric parsing failed.
        amt = f"${g.amount:,.0f}" if g.amount is not None else (g.amount_raw or "")
        print(f"{g.line_number:>4}  {name:<45}  {city:<20} {state:<3} {amt:>12}")
    if len(result.grants) > 50:
        print(f"... and {len(result.grants) - 50} more")


def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse args, run the extractor, render the result.

    Returns a process exit code — 0 when extraction succeeded, 1 otherwise.
    """
    arg_parser = argparse.ArgumentParser(
        prog="scripts.extract.irs_990_pdf",
        description="Extract grant data from a 990-PF PDF.",
    )
    arg_parser.add_argument("pdf_path", help="Path to a local PDF file.")
    arg_parser.add_argument("--tax-year", type=int, default=None,
                            help="Tax year hint passed to the extraction prompts.")
    arg_parser.add_argument("--source-label", default=None,
                            help="Diagnostic label for the PDF source (e.g. 'ny_ag').")
    arg_parser.add_argument("--json", action="store_true",
                            help="Emit the full result as JSON instead of a table.")
    opts = arg_parser.parse_args(argv)

    result = extract_from_pdf(
        opts.pdf_path,
        tax_year=opts.tax_year,
        source_label=opts.source_label,
    )

    if opts.json:
        print(json.dumps(_result_to_jsonable(result), indent=2))
    else:
        _print_table(result)

    return 0 if result.success else 1


if __name__ == "__main__":
    # Exit code mirrors main()'s return: 0 on success, 1 on failure.
    sys.exit(main())