"""
Source-agnostic 990-PF PDF grant extractor.

Takes a path to a local PDF and returns structured grant data. Side-effect
free with respect to any database; the only external calls are to Anthropic's
API and to the local `pass` CLI, which supplies the API key.

A separate loader consumes ExtractionResult and writes to raw.* tables.

Usage (CLI):
    python -m scripts.extract.irs_990_pdf path/to/file.pdf
    python -m scripts.extract.irs_990_pdf path/to/file.pdf --json
    python -m scripts.extract.irs_990_pdf path/to/file.pdf --tax-year 2021
    python -m scripts.extract.irs_990_pdf path/to/file.pdf --source-label ny_ag

Usage (programmatic):
    from scripts.extract.irs_990_pdf import extract_from_pdf
    result = extract_from_pdf("data/tmp/pdf_test/marley_2017.pdf", tax_year=2017)
    for grant in result.grants:
        print(grant.recipient_name, grant.amount)
"""

from __future__ import annotations

import argparse
import base64
import json
import re
import subprocess
import sys
from dataclasses import dataclass, field, asdict
from decimal import Decimal, InvalidOperation
from pathlib import Path
from typing import Any

import anthropic
import fitz  # pymupdf
import pdfplumber

from scripts.common.normalize import is_placeholder, parse_numeric


# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# Anthropic model used for every call in this module (text extraction,
# vision triage and vision extraction).
MODEL = "claude-haiku-4-5-20251001"

# Text-path quality heuristic. If the number of grants extracted is less than
# num_grant_pages * MIN_GRANTS_PER_PAGE, we flag the result as low-yield and
# fall back to vision. This is an initial heuristic and may be tuned.
MIN_GRANTS_PER_PAGE = 1

# Minimum total text characters across all pages before we trust the text layer.
MIN_TEXT_LAYER_CHARS = 200

# Standard 990-PF form has 13 pages; grant attachments come after.
# The vision path starts triaging at this page index and returns nothing for
# PDFs that have no pages beyond it.
ATTACHMENT_START_PAGE_IDX = 13  # 0-based; corresponds to page 14

# Vision scan stops after this many consecutive non-grant pages (once any
# grant page has been found).
VISION_CONSECUTIVE_NO_LIMIT = 3

# DPIs used for vision rendering: the lower one for the yes/no triage pass,
# the higher one for the extraction pass.
TRIAGE_DPI = 100
EXTRACTION_DPI = 150

# Regex fragments that mark a page as a likely grant schedule. They are
# OR-ed together below and searched in the upper-cased page text.
# NOTE(review): "PART XV" also matches "PART XVI" / "PART XVII" as a
# substring, which may count non-grant form pages — confirm this is intended.
GRANT_PAGE_KEYWORDS = [
    r"GRANTS AND CONTRIBUTIONS PAID",
    r"SUPPLEMENTARY INFORMATION",
    r"PART XIV",
    r"PART XV",
    r"SCHEDULE OF.*GRANT",
    r"GRANTS PAID",
    r"CONTRIBUTIONS PAID",
]
_GRANT_PAGE_RE = re.compile("|".join(GRANT_PAGE_KEYWORDS))

# Loose marker for "this PDF is a 990-PF". NY AG PDFs may contain only the
# CHAR500 state cover form with no federal return attached — ~46% of that
# corpus, based on sampling. Those should short-circuit before we waste any
# Haiku calls. Searched in the upper-cased text of the whole document.
_IS_990PF_RE = re.compile(r"FORM\s*990-?PF")

# Prompt for the text path. The caller appends the optional tax-year hint and
# then the page's extracted text after this prompt.
TEXT_EXTRACTION_PROMPT = """Extract ALL individual grants from this 990-PF page text.

Return a JSON array. Each grant should have:
- recipient_name: organization name (if grant is to an org)
- recipient_person_name: person's name (if grant is to an individual)
- address_line1: street address (if present)
- address_line2: second address line (if present)
- city: city (if present)
- state: state abbreviation (if present)
- zip: zip code (if present)
- country: country (if present, only when non-US)
- foreign_postal_code: foreign postal code (if present)
- amount: dollar amount as string (digits only, no $ or commas)
- purpose: purpose of grant (if present)
- foundation_status: recipient status like PC, NC, PF (if present)
- relationship: relationship of recipient to foundation (if present)

IMPORTANT RULES:
- Do NOT include total/subtotal rows
- Do NOT include header rows or column labels
- If there are multiple year columns, extract ONLY the most recent year
- If recipient name is "SEE ATTACHED", "VARIOUS", "N/A", "NONE", or a similar placeholder, skip it
- Return [] if no individual grants are found

Return ONLY the JSON array, no other text."""

# Prompt for the vision extraction pass. It is sent together with a rendered
# page image; the caller appends the optional tax-year hint.
VISION_EXTRACTION_PROMPT = """Extract ALL individual grants from this 990-PF page image.

Return a JSON array. Each grant should have:
- recipient_name: organization name (if grant is to an org)
- recipient_person_name: person's name (if grant is to an individual)
- address_line1: street address (if visible)
- address_line2: second address line (if visible)
- city: city
- state: state abbreviation
- zip: zip code (if visible)
- country: country (only if non-US)
- foreign_postal_code: foreign postal code (if present)
- amount: dollar amount as string (digits only, no $ or commas)
- purpose: purpose of grant
- foundation_status: recipient status like PC, NC, PF (if present)
- relationship: relationship of recipient to foundation (if present)

IMPORTANT RULES:
- Do NOT include total/subtotal rows
- Do NOT include header rows
- If there are multiple year columns, extract ONLY the most recent year
- If recipient name is "SEE ATTACHED", "VARIOUS", "N/A", "NONE", or a similar placeholder, skip it
- Return [] if no individual grants are found

Return ONLY the JSON array, no other text."""

# Yes/no question used by the vision triage pass. The caller treats any reply
# containing "yes" (case-insensitive) as a grant page.
TRIAGE_PROMPT = (
    "Is this page a table of grant or contribution recipients listing "
    "individual organization names with addresses and dollar amounts? "
    "Answer ONLY yes or no."
)


# ---------------------------------------------------------------------------
# Anthropic client (lazy singleton so importing this module is cheap)
# ---------------------------------------------------------------------------

_client: anthropic.Anthropic | None = None


def _get_client() -> anthropic.Anthropic:
    """Return the shared Anthropic client, creating it on first use.

    The API key is read from the ``pass`` password store. The client is only
    cached once a non-empty key has been obtained, so a failed lookup is
    retried on the next call instead of poisoning the singleton with an
    unusable client.

    Raises:
        RuntimeError: if ``pass`` cannot be executed, exits non-zero, or
            prints no key.
    """
    global _client
    if _client is not None:
        return _client

    try:
        proc = subprocess.run(
            ["pass", "show", "anthropic.com/api.anthropic.com/apikey"],
            capture_output=True, text=True,
        )
    except OSError as exc:
        # e.g. `pass` is not installed / not on PATH.
        raise RuntimeError(
            f"could not run `pass` to read the Anthropic API key: {exc}"
        ) from exc

    # By `pass` convention the secret is the first line of the entry; any
    # further lines are metadata and must not end up in the API key.
    lines = proc.stdout.strip().splitlines()
    api_key = lines[0].strip() if lines else ""

    if proc.returncode != 0 or not api_key:
        detail = proc.stderr.strip() or "no key in output"
        raise RuntimeError(
            f"`pass show` did not return an Anthropic API key "
            f"(exit code {proc.returncode}): {detail}"
        )

    _client = anthropic.Anthropic(api_key=api_key)
    return _client


# ---------------------------------------------------------------------------
# Dataclasses
# ---------------------------------------------------------------------------

@dataclass
class ExtractedGrant:
    """One grant row extracted from a 990-PF page.

    All fields other than ``line_number`` are optional and default to None.
    """
    # Position of the grant in the final list. _normalize_grant sets it to 0
    # and _postprocess renumbers from 1 after filtering and dedupe.
    line_number: int
    # Organization recipient, if the grant went to an organization.
    recipient_name: str | None = None
    # Read from the model output if present; the prompts do not ask for it.
    recipient_name2: str | None = None
    # Individual recipient, if the grant went to a person.
    recipient_person_name: str | None = None
    address_line1: str | None = None
    address_line2: str | None = None
    city: str | None = None
    state: str | None = None
    zip: str | None = None
    country: str | None = None
    foreign_postal_code: str | None = None
    # Amount text as returned by the model (whitespace-stripped).
    amount_raw: str | None = None
    # Numeric form of amount_raw; None when it could not be parsed.
    amount: Decimal | None = None
    purpose: str | None = None
    foundation_status: str | None = None
    relationship: str | None = None


@dataclass
class ExtractionResult:
    """Outcome of one extraction run over a single PDF."""
    # False only on catastrophic failure; see grant_detail_status / method.
    success: bool
    grants: list[ExtractedGrant]
    # 'supplemented' — ≥1 grant extracted
    # 'no_grants'    — extractor ran end-to-end, found nothing usable
    # 'not_a_990pf'  — PDF is readable but isn't a 990-PF (e.g. CHAR500 cover)
    # None           — catastrophic failure (success=False)
    grant_detail_status: str | None
    # 'pdfplumber+haiku_text'   — text path was used
    # 'haiku_vision_attempted'  — vision path was used (success not implied)
    # 'skipped_not_990pf'       — short-circuited; no API calls made
    # 'failed'                  — catastrophic failure (success=False)
    method: str
    # Free-form run details (page counts, path statuses, error messages).
    diagnostics: dict = field(default_factory=dict)

    @property
    def total_amount(self) -> Decimal:
        """Sum of all grant amounts, counting missing amounts as zero.

        Always a Decimal: the explicit Decimal start value keeps ``sum`` from
        returning the int ``0`` when there are no grants.
        """
        return sum(
            (g.amount if g.amount is not None else Decimal(0) for g in self.grants),
            Decimal(0),
        )


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _parse_haiku_response(text: str) -> list[dict]:
    """Parse JSON from Haiku response, handling markdown code blocks."""
    text = text.strip()
    if text.startswith("```"):
        # Drop opening fence + optional language tag
        parts = text.split("\n", 1)
        if len(parts) == 2:
            text = parts[1]
        text = text.rsplit("```", 1)[0]
    text = text.strip()
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        return []
    if not isinstance(parsed, list):
        return []
    return [x for x in parsed if isinstance(x, dict)]


def _is_grant_page_text(text: str) -> bool:
    """Report whether *text*, upper-cased, matches any grant-page keyword."""
    shouted = text.upper()
    return _GRANT_PAGE_RE.search(shouted) is not None


def _clean_str(value: Any) -> str | None:
    if value is None:
        return None
    s = str(value).strip()
    return s or None


def _normalize_grant(raw: dict) -> ExtractedGrant | None:
    """Map a loose Haiku JSON dict to an ExtractedGrant.

    Returns None when the row has no usable recipient — i.e. both
    ``recipient_name`` and ``recipient_person_name`` are missing, blank, or
    placeholders. Dropping such rows here keeps them from counting towards
    the text path's yield check.

    line_number is set to 0 here; it's reassigned later after dedupe.
    """
    def _recipient(key: str) -> str | None:
        # Safety net: prompt already filters placeholders, but double-check.
        value = _clean_str(raw.get(key))
        if value and is_placeholder(value):
            return None
        return value

    recipient_name = _recipient("recipient_name")
    recipient_person_name = _recipient("recipient_person_name")
    if recipient_name is None and recipient_person_name is None:
        return None

    amount_raw = _clean_str(raw.get("amount"))
    amount_numeric: Decimal | None = None
    if amount_raw is not None:
        parsed = parse_numeric(amount_raw)
        if parsed is not None:
            try:
                # parse_numeric's return type isn't visible from here; going
                # through str() keeps the conversion exact should it ever
                # yield a float (Decimal(float) carries binary noise).
                amount_numeric = (
                    parsed if isinstance(parsed, Decimal) else Decimal(str(parsed))
                )
            except (InvalidOperation, TypeError, ValueError):
                amount_numeric = None

    return ExtractedGrant(
        line_number=0,
        recipient_name=recipient_name,
        recipient_name2=_clean_str(raw.get("recipient_name2")),
        recipient_person_name=recipient_person_name,
        address_line1=_clean_str(raw.get("address_line1")),
        address_line2=_clean_str(raw.get("address_line2")),
        city=_clean_str(raw.get("city")),
        state=_clean_str(raw.get("state")),
        zip=_clean_str(raw.get("zip")),
        country=_clean_str(raw.get("country")),
        foreign_postal_code=_clean_str(raw.get("foreign_postal_code")),
        amount_raw=amount_raw,
        amount=amount_numeric,
        purpose=_clean_str(raw.get("purpose")),
        foundation_status=_clean_str(raw.get("foundation_status")),
        relationship=_clean_str(raw.get("relationship")),
    )


def _year_hint(tax_year: int | None) -> str:
    if tax_year is None:
        return ""
    return f"\n\nThis filing is for tax year {tax_year}."


def _postprocess(grants: list[ExtractedGrant]) -> list[ExtractedGrant]:
    """Drop placeholders / amount-less rows, dedupe, and reassign line_number.

    Rows are kept in input order. Two rows are duplicates when organization
    name, person name, city and state (all case-insensitive) and the raw
    amount text all match; only the first is kept. The surviving grants are
    renumbered 1..n in place and returned as a new list.
    """
    out: list[ExtractedGrant] = []
    seen: set[tuple] = set()

    for g in grants:
        # Drop rows with no recipient at all.
        if not g.recipient_name and not g.recipient_person_name:
            continue
        # Drop placeholder recipients that slipped through (belt-and-suspenders).
        if g.recipient_name and is_placeholder(g.recipient_name):
            continue
        if g.recipient_person_name and is_placeholder(g.recipient_person_name):
            continue
        # Drop rows with no amount text at all (Haiku didn't see an amount column).
        # Keep rows where amount_raw == '0' and amount == Decimal(0).
        if g.amount is None and not g.amount_raw:
            continue

        # The person name must be part of the key: grants to individuals have
        # no recipient_name, so without it every same-amount, same-location
        # grant to a different person would collapse into a single row.
        key = (
            (g.recipient_name or "").upper(),
            (g.recipient_person_name or "").upper(),
            (g.city or "").upper(),
            (g.state or "").upper(),
            g.amount_raw or "",
        )
        if key in seen:
            continue
        seen.add(key)
        out.append(g)

    for i, g in enumerate(out, start=1):
        g.line_number = i
    return out


# ---------------------------------------------------------------------------
# Text path
# ---------------------------------------------------------------------------

def _extract_text_layer(
    pdf_path: Path,
    tax_year: int | None,
    diagnostics: dict,
) -> tuple[list[ExtractedGrant], str, int]:
    """Run the text path: pdfplumber page text parsed by Haiku.

    Returns (grants, status, num_grant_pages), where status is one of
    'ok', 'not_a_990pf', 'no_text_layer', 'no_grant_pages', 'haiku_empty',
    'low_yield' or 'error'.

    'not_a_990pf' means the text layer is readable but never mentions
    Form 990-PF (e.g. a standalone NY State CHAR500 cover form); the caller
    should short-circuit on it rather than run the vision path.

    Writes 'text_layer_chars', 'pages_total', 'grant_pages_identified' and,
    on failure, 'text_path_error' into *diagnostics*.
    """
    try:
        pdf = pdfplumber.open(pdf_path)
    except Exception as exc:
        diagnostics["text_path_error"] = str(exc)
        return [], "error", 0

    # The context manager closes the PDF on every exit from this section,
    # including the early returns.
    with pdf:
        page_texts: list[tuple[int, str]] = [
            (page_no, page.extract_text() or "")
            for page_no, page in enumerate(pdf.pages, start=1)
        ]
        char_count = sum(len(body) for _, body in page_texts)

        diagnostics["text_layer_chars"] = char_count
        diagnostics["pages_total"] = len(page_texts)

        if char_count < MIN_TEXT_LAYER_CHARS:
            return [], "no_text_layer", 0

        # A document that never mentions the form is not worth any Haiku
        # calls, on this path or on the vision path.
        whole_doc = "\n".join(body for _, body in page_texts).upper()
        if _IS_990PF_RE.search(whole_doc) is None:
            return [], "not_a_990pf", 0

        grant_pages = [
            (page_no, body)
            for page_no, body in page_texts
            if len(body) > 100 and _is_grant_page_text(body)
        ]
        diagnostics["grant_pages_identified"] = len(grant_pages)

        if not grant_pages:
            return [], "no_grant_pages", 0

    client = _get_client()
    prompt_head = f"{TEXT_EXTRACTION_PROMPT}{_year_hint(tax_year)}\n\nPage text:\n"
    page_count = len(grant_pages)

    raw_rows: list[dict] = []
    try:
        for _page_no, body in grant_pages:
            reply = client.messages.create(
                model=MODEL,
                max_tokens=4096,
                messages=[{"role": "user", "content": prompt_head + body}],
            )
            raw_rows.extend(_parse_haiku_response(reply.content[0].text))
    except anthropic.APIError as exc:
        # Partial results are thrown away so the caller falls back to vision.
        diagnostics["text_path_error"] = f"{type(exc).__name__}: {exc}"
        return [], "error", page_count

    if not raw_rows:
        return [], "haiku_empty", page_count

    grants = [g for g in map(_normalize_grant, raw_rows) if g is not None]

    status = "low_yield" if len(grants) < page_count * MIN_GRANTS_PER_PAGE else "ok"
    return grants, status, page_count


# ---------------------------------------------------------------------------
# Vision path
# ---------------------------------------------------------------------------

def _render_page_b64(page, dpi: int) -> str:
    pix = page.get_pixmap(dpi=dpi)
    return base64.standard_b64encode(pix.tobytes("png")).decode("utf-8")


def _extract_vision(
    pdf_path: Path,
    tax_year: int | None,
    diagnostics: dict,
) -> tuple[list[ExtractedGrant], int, int]:
    """Run the vision path: Haiku over rendered page images.

    Pages from ATTACHMENT_START_PAGE_IDX onwards are first triaged with a
    yes/no question at TRIAGE_DPI; pages answered "yes" are then re-rendered
    at EXTRACTION_DPI and sent for extraction.

    Returns (grants, pages_scanned, pages_extracted) and records
    'vision_pages_scanned' / 'vision_pages_extracted' in *diagnostics*.
    API errors are not caught here; anthropic.APIError propagates to the
    caller.
    """
    client = _get_client()
    year_hint = _year_hint(tax_year)

    doc = fitz.open(pdf_path)

    def _ask(page_index: int, dpi: int, prompt: str, max_tokens: int) -> str:
        # One image + text request for a single page; returns the reply text.
        image_b64 = _render_page_b64(doc[page_index], dpi=dpi)
        reply = client.messages.create(
            model=MODEL,
            max_tokens=max_tokens,
            messages=[{
                "role": "user",
                "content": [
                    {"type": "image",
                     "source": {"type": "base64", "media_type": "image/png", "data": image_b64}},
                    {"type": "text", "text": prompt},
                ],
            }],
        )
        return reply.content[0].text

    try:
        total_pages = len(doc)
        diagnostics.setdefault("pages_total", total_pages)

        # Nothing beyond the standard form pages: no attachments to scan.
        if total_pages <= ATTACHMENT_START_PAGE_IDX:
            diagnostics["vision_pages_scanned"] = 0
            diagnostics["vision_pages_extracted"] = 0
            return [], 0, 0

        # Phase 1: triage. Stop once a run of "no" pages follows a hit.
        hits: list[int] = []
        misses_in_a_row = 0
        pages_scanned = 0

        for page_index in range(ATTACHMENT_START_PAGE_IDX, total_pages):
            pages_scanned += 1
            answer = _ask(page_index, TRIAGE_DPI, TRIAGE_PROMPT, 10)
            if "yes" in answer.lower():
                hits.append(page_index)
                misses_in_a_row = 0
                continue
            misses_in_a_row += 1
            if hits and misses_in_a_row >= VISION_CONSECUTIVE_NO_LIMIT:
                break

        diagnostics["vision_pages_scanned"] = pages_scanned

        if not hits:
            diagnostics["vision_pages_extracted"] = 0
            return [], pages_scanned, 0

        # Phase 2: extraction from the pages triage flagged.
        extraction_prompt = f"{VISION_EXTRACTION_PROMPT}{year_hint}"
        raw_rows: list[dict] = []
        pages_extracted = 0
        for page_index in hits:
            rows = _parse_haiku_response(
                _ask(page_index, EXTRACTION_DPI, extraction_prompt, 4096)
            )
            if rows:
                pages_extracted += 1
                raw_rows.extend(rows)

        diagnostics["vision_pages_extracted"] = pages_extracted
    finally:
        doc.close()

    grants = [g for g in map(_normalize_grant, raw_rows) if g is not None]
    return grants, pages_scanned, pages_extracted


# ---------------------------------------------------------------------------
# Top-level extractor
# ---------------------------------------------------------------------------

def _failed_result(diagnostics: dict) -> ExtractionResult:
    """Build the catastrophic-failure result carrying *diagnostics*."""
    return ExtractionResult(
        success=False,
        grants=[],
        grant_detail_status=None,
        method="failed",
        diagnostics=diagnostics,
    )


def extract_from_pdf(
    pdf_path: str | Path,
    tax_year: int | None = None,
    source_label: str | None = None,
) -> ExtractionResult:
    """Extract grants from a single 990-PF PDF.

    Stateless: reads only the file at pdf_path and calls Anthropic's API.
    Writes nothing.

    Flow:
      1. Text path. Status 'ok' returns its grants; 'not_a_990pf' returns an
         empty, successful result without any vision calls.
      2. Any other text status falls back to the vision path.
      3. If vision yields no usable grants but the text path had produced a
         low-yield set, those text grants are returned instead of nothing
         (diagnostics['text_low_yield_retained'] is set to True).

    Args:
        pdf_path: path to a local PDF.
        tax_year: optional tax year appended to the extraction prompts.
        source_label: optional label recorded in diagnostics only.

    Returns:
        An ExtractionResult. success is False only when the file is missing
        or the vision path raised; the error text is in diagnostics['error'].
    """
    pdf_path = Path(pdf_path)

    diagnostics: dict = {
        "tax_year_hint": tax_year,
        "source_label": source_label,
    }

    if not pdf_path.exists():
        diagnostics["error"] = f"PDF not found: {pdf_path}"
        return _failed_result(diagnostics)

    # Text path
    try:
        text_grants, text_status, _num_grant_pages = _extract_text_layer(
            pdf_path, tax_year, diagnostics
        )
    except Exception as exc:
        # pdfplumber can blow up on malformed PDFs — don't let that kill us,
        # fall through to the vision path like any other text failure.
        text_grants = []
        text_status = "error"
        diagnostics.setdefault("text_path_error", f"{type(exc).__name__}: {exc}")

    diagnostics["text_path_status"] = text_status

    if text_status == "ok":
        grants = _postprocess(text_grants)
        return ExtractionResult(
            success=True,
            grants=grants,
            grant_detail_status="supplemented" if grants else "no_grants",
            method="pdfplumber+haiku_text",
            diagnostics=diagnostics,
        )

    # Short-circuit: PDF has a readable text layer but isn't a 990-PF
    # (e.g. a standalone NY State CHAR500 cover form). Don't run vision —
    # there's nothing in the document for it to find.
    if text_status == "not_a_990pf":
        return ExtractionResult(
            success=True,
            grants=[],
            grant_detail_status="not_a_990pf",
            method="skipped_not_990pf",
            diagnostics=diagnostics,
        )

    # Vision fallback. Its output is preferred over the text path's.
    try:
        vision_grants, _scanned, _extracted = _extract_vision(
            pdf_path, tax_year, diagnostics
        )
    except Exception as exc:
        # Covers anthropic.APIError as well as rendering / PDF errors.
        diagnostics["error"] = f"{type(exc).__name__}: {exc}"
        return _failed_result(diagnostics)

    grants = _postprocess(vision_grants)

    # Vision found nothing (it does not scan PDFs without attachment pages
    # at all), but the text path did extract some grants that merely fell
    # below the yield heuristic. Keep them rather than report no grants.
    if not grants and text_status == "low_yield":
        retained = _postprocess(text_grants)
        if retained:
            diagnostics["text_low_yield_retained"] = True
            return ExtractionResult(
                success=True,
                grants=retained,
                grant_detail_status="supplemented",
                method="pdfplumber+haiku_text",
                diagnostics=diagnostics,
            )

    return ExtractionResult(
        success=True,
        grants=grants,
        grant_detail_status="supplemented" if grants else "no_grants",
        method="haiku_vision_attempted",
        diagnostics=diagnostics,
    )


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

def _result_to_jsonable(result: ExtractionResult) -> dict:
    def grant_to_dict(g: ExtractedGrant) -> dict:
        d = asdict(g)
        if d["amount"] is not None:
            d["amount"] = str(d["amount"])
        return d

    return {
        "success": result.success,
        "method": result.method,
        "grant_detail_status": result.grant_detail_status,
        "diagnostics": result.diagnostics,
        "grants": [grant_to_dict(g) for g in result.grants],
        "total_amount": str(result.total_amount),
    }


def _print_table(result: ExtractionResult) -> None:
    print(f"success:             {result.success}")
    print(f"method:              {result.method}")
    print(f"grant_detail_status: {result.grant_detail_status}")
    print(f"grants:              {len(result.grants)}")
    print(f"total_amount:        ${result.total_amount:,}")
    print("diagnostics:")
    for k, v in result.diagnostics.items():
        print(f"  {k}: {v}")
    if not result.grants:
        return
    print()
    print(f"{'#':>4}  {'recipient':<45}  {'city':<20} {'st':<3} {'amount':>12}")
    print("-" * 90)
    for g in result.grants[:50]:
        name = (g.recipient_name or g.recipient_person_name or "")[:45]
        city = (g.city or "")[:20]
        state = (g.state or "")[:3]
        amt = f"${g.amount:,.0f}" if g.amount is not None else (g.amount_raw or "")
        print(f"{g.line_number:>4}  {name:<45}  {city:<20} {state:<3} {amt:>12}")
    if len(result.grants) > 50:
        print(f"... and {len(result.grants) - 50} more")


def main(argv: list[str] | None = None) -> int:
    """CLI entry point: extract grants from one PDF and print the result.

    *argv* defaults to the process arguments when None. Returns the process
    exit code: 0 when the extraction succeeded, 1 otherwise.
    """
    parser = argparse.ArgumentParser(
        prog="scripts.extract.irs_990_pdf",
        description="Extract grant data from a 990-PF PDF.",
    )
    parser.add_argument("pdf_path", help="Path to a local PDF file.")
    parser.add_argument(
        "--tax-year",
        type=int,
        default=None,
        help="Tax year hint passed to the extraction prompts.",
    )
    parser.add_argument(
        "--source-label",
        default=None,
        help="Diagnostic label for the PDF source (e.g. 'ny_ag').",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        help="Emit the full result as JSON instead of a table.",
    )
    opts = parser.parse_args(argv)

    result = extract_from_pdf(
        opts.pdf_path,
        tax_year=opts.tax_year,
        source_label=opts.source_label,
    )

    if not opts.json:
        _print_table(result)
    else:
        print(json.dumps(_result_to_jsonable(result), indent=2))

    if result.success:
        return 0
    return 1


# Run the CLI when executed as a script or via `python -m`; main()'s return
# value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())