1 files changed, 31 insertions, 13 deletions
diff --git a/scripts/parse/irs_990pf.py b/scripts/parse/irs_990pf.py
index 3d245b8..218c7fb 100644
--- a/scripts/parse/irs_990pf.py
+++ b/scripts/parse/irs_990pf.py
@@ -25,7 +25,7 @@ from scripts.common.ingest import (
     start_ingest_run, finish_ingest_run, fail_ingest_run, log_ingest_error,
 )
 from scripts.common.filing import (
-    upsert_raw_filing, record_raw_filing_source,
+    upsert_raw_filing, record_raw_filing_source, get_seen_source_paths,
 )
 
 PARSER_NAME = "parse_irs_990pf"
@@ -349,10 +349,6 @@ def process_filing(tree, source_document_id, source_archive, source_path, ingest
     form_data = extract_form_990pf(tree)
     form_data["grant_detail_status"] = compute_grant_detail_status(grant_elements, extracted_grants)
 
-    root = tree.getroot()
-    return_header = root.find(f"{{{NS}}}ReturnHeader")
-    return_data = root.find(f"{{{NS}}}ReturnData")
-
     def _do(conn):
         raw_filing_id = upsert_raw_filing(
             SOURCE_SYSTEM, source_document_id, metadata, ingest_run_id, conn=conn
@@ -367,15 +363,15 @@ def process_filing(tree, source_document_id, source_archive, source_path, ingest
             for row in extracted_grants
         ]
 
-        _replace_children(conn, raw_filing_id, filing_form_data, grant_rows, xml_rows)
+        _replace_children(conn, raw_filing_id, filing_form_data, grant_rows)
 
-        # 1 raw_filing + 1 raw_filing_source + 1 raw_form_990pf + grants + xml fields
-        return 3 + len(grant_rows) + len(xml_rows)
+        # 1 raw_filing + 1 raw_filing_source + 1 raw_form_990pf + grants
+        return 3 + len(grant_rows)
 
     return execute_transaction(_do)
 
 
-def _replace_children(conn, raw_filing_id, form_data, grant_rows, xml_rows):
+def _replace_children(conn, raw_filing_id, form_data, grant_rows):
     """Delete and re-insert all child rows for a filing using the caller's transaction."""
     form_columns = list(form_data.keys())
     form_placeholders = ", ".join(["%s"] * len(form_columns))
@@ -451,13 +447,28 @@ def process_zip(zip_path, ingest_run_id):
 
     with zf:
         names = [n for n in zf.namelist() if n.endswith(".xml")]
+        seen_paths = get_seen_source_paths(basename)
+        skipped_existing = 0
 
-        print(f"Processing {basename}: {len(names)} XML files")
+        print(
+            f"Processing {basename}: {len(names)} XML files "
+            f"({len(seen_paths)} already seen)"
+        )
         files_scanned = 0
         files_matched = 0
         total_rows = 0
 
         for i, name in enumerate(names):
+            if name in seen_paths:
+                skipped_existing += 1
+                if (i + 1) % 1000 == 0:
+                    print(
+                        f"  ...{i + 1}/{len(names)} files, "
+                        f"{skipped_existing} skipped existing, "
+                        f"{files_matched} matched, {total_rows} rows"
+                    )
+                continue
+
             # Count every ZIP member as scanned, even ones we fail to read —
             # otherwise read failures silently shrink the scanned total and
             # make run-level metrics misleading.
@@ -479,9 +490,16 @@ def process_zip(zip_path, ingest_run_id):
                 total_rows += rows
 
             if (i + 1) % 1000 == 0:
-                print(f"  ...{i + 1}/{len(names)} files, {files_matched} matched, {total_rows} rows")
-
-    print(f"  Done: {files_scanned} scanned, {files_matched} matched, {total_rows} rows")
+                print(
+                    f"  ...{i + 1}/{len(names)} files, "
+                    f"{skipped_existing} skipped existing, "
+                    f"{files_matched} matched, {total_rows} rows"
+                )
+
+    print(
+        f"  Done: {files_scanned} scanned, {skipped_existing} skipped existing, "
+        f"{files_matched} matched, {total_rows} rows"
+    )
     return files_scanned, files_matched, total_rows