aboutsummaryrefslogtreecommitdiff
path: root/scripts/parse/irs_990.py
diff options
context:
space:
mode:
authorbenj <benj@rse8.com>2026-05-01 09:36:21 +0800
committerbenj <benj@rse8.com>2026-05-01 09:36:21 +0800
commit850f4f826b536d913235e174dc07aef74e51bf60 (patch)
treea2806da6c0ed5c48d21178e0c6c280d5a40ccd38 /scripts/parse/irs_990.py
parent6605e2cc428e3bdaa174ccc432941eab8c5d61cb (diff)
downloadtidyindex-850f4f826b536d913235e174dc07aef74e51bf60.tar
tidyindex-850f4f826b536d913235e174dc07aef74e51bf60.tar.gz
tidyindex-850f4f826b536d913235e174dc07aef74e51bf60.tar.bz2
tidyindex-850f4f826b536d913235e174dc07aef74e51bf60.tar.lz
tidyindex-850f4f826b536d913235e174dc07aef74e51bf60.tar.xz
tidyindex-850f4f826b536d913235e174dc07aef74e51bf60.tar.zst
tidyindex-850f4f826b536d913235e174dc07aef74e51bf60.zip
irs 990 doc parsers and some web stuff (HEAD, master)
Diffstat (limited to '')
-rw-r--r--scripts/parse/irs_990.py40
1 file changed, 31 insertions, 9 deletions
diff --git a/scripts/parse/irs_990.py b/scripts/parse/irs_990.py
index 3a5cc2d..b3751b6 100644
--- a/scripts/parse/irs_990.py
+++ b/scripts/parse/irs_990.py
@@ -25,7 +25,7 @@ from scripts.common.ingest import (
start_ingest_run, finish_ingest_run, fail_ingest_run, log_ingest_error,
)
from scripts.common.filing import (
- upsert_raw_filing, record_raw_filing_source,
+ upsert_raw_filing, record_raw_filing_source, get_seen_source_paths,
)
PARSER_NAME = "parse_irs_990"
@@ -496,16 +496,16 @@ def process_filing(tree, source_document_id, source_archive, source_path, ingest
]
_replace_children(
- conn, raw_filing_id, filing_form_data, grant_rows, schedule_o_rows, xml_rows,
+ conn, raw_filing_id, filing_form_data, grant_rows, schedule_o_rows,
)
- # 1 raw_filing + 1 raw_filing_source + 1 raw_form_990 + grants + schedule_o + xml fields
- return 3 + len(grant_rows) + len(schedule_o_rows) + len(xml_rows)
+ # 1 raw_filing + 1 raw_filing_source + 1 raw_form_990 + grants + schedule_o
+ return 3 + len(grant_rows) + len(schedule_o_rows)
return execute_transaction(_do)
-def _replace_children(conn, raw_filing_id, form_data, grant_rows, schedule_o_rows, xml_rows):
+def _replace_children(conn, raw_filing_id, form_data, grant_rows, schedule_o_rows):
"""Delete and re-insert all child rows for a filing using the caller's transaction."""
form_columns = list(form_data.keys())
form_placeholders = ", ".join(["%s"] * len(form_columns))
@@ -598,13 +598,28 @@ def process_zip(zip_path, ingest_run_id):
with zf:
names = [n for n in zf.namelist() if n.endswith(".xml")]
+ seen_paths = get_seen_source_paths(basename)
+ skipped_existing = 0
- print(f"Processing {basename}: {len(names)} XML files")
+ print(
+ f"Processing {basename}: {len(names)} XML files "
+ f"({len(seen_paths)} already seen)"
+ )
files_scanned = 0
files_matched = 0
total_rows = 0
for i, name in enumerate(names):
+ if name in seen_paths:
+ skipped_existing += 1
+ if (i + 1) % 1000 == 0:
+ print(
+ f" ...{i + 1}/{len(names)} files, "
+ f"{skipped_existing} skipped existing, "
+ f"{files_matched} matched, {total_rows} rows"
+ )
+ continue
+
# Count every ZIP member as scanned, even ones we fail to read —
# otherwise read failures silently shrink the scanned total and
# make run-level metrics misleading.
@@ -626,9 +641,16 @@ def process_zip(zip_path, ingest_run_id):
total_rows += rows
if (i + 1) % 1000 == 0:
- print(f" ...{i + 1}/{len(names)} files, {files_matched} matched, {total_rows} rows")
-
- print(f" Done: {files_scanned} scanned, {files_matched} matched, {total_rows} rows")
+ print(
+ f" ...{i + 1}/{len(names)} files, "
+ f"{skipped_existing} skipped existing, "
+ f"{files_matched} matched, {total_rows} rows"
+ )
+
+ print(
+ f" Done: {files_scanned} scanned, {skipped_existing} skipped existing, "
+ f"{files_matched} matched, {total_rows} rows"
+ )
return files_scanned, files_matched, total_rows