about summary refs log tree commit diff
path: root/scripts/parse
diff options
context:
space:
mode:
authorbenj <benj@rse8.com>2026-05-01 09:36:21 +0800
committerbenj <benj@rse8.com>2026-05-01 09:36:21 +0800
commit850f4f826b536d913235e174dc07aef74e51bf60 (patch)
treea2806da6c0ed5c48d21178e0c6c280d5a40ccd38 /scripts/parse
parent6605e2cc428e3bdaa174ccc432941eab8c5d61cb (diff)
downloadtidyindex-850f4f826b536d913235e174dc07aef74e51bf60.tar
tidyindex-850f4f826b536d913235e174dc07aef74e51bf60.tar.gz
tidyindex-850f4f826b536d913235e174dc07aef74e51bf60.tar.bz2
tidyindex-850f4f826b536d913235e174dc07aef74e51bf60.tar.lz
tidyindex-850f4f826b536d913235e174dc07aef74e51bf60.tar.xz
tidyindex-850f4f826b536d913235e174dc07aef74e51bf60.tar.zst
tidyindex-850f4f826b536d913235e174dc07aef74e51bf60.zip
irs 990 doc parsers and some web stuff HEAD master
Diffstat (limited to '')
-rw-r--r--scripts/parse/irs_990.py40
-rw-r--r--scripts/parse/irs_990ez.py38
-rw-r--r--scripts/parse/irs_990pf.py44
3 files changed, 92 insertions, 30 deletions
diff --git a/scripts/parse/irs_990.py b/scripts/parse/irs_990.py
index 3a5cc2d..b3751b6 100644
--- a/scripts/parse/irs_990.py
+++ b/scripts/parse/irs_990.py
@@ -25,7 +25,7 @@ from scripts.common.ingest import (
start_ingest_run, finish_ingest_run, fail_ingest_run, log_ingest_error,
)
from scripts.common.filing import (
- upsert_raw_filing, record_raw_filing_source,
+ upsert_raw_filing, record_raw_filing_source, get_seen_source_paths,
)
PARSER_NAME = "parse_irs_990"
@@ -496,16 +496,16 @@ def process_filing(tree, source_document_id, source_archive, source_path, ingest
]
_replace_children(
- conn, raw_filing_id, filing_form_data, grant_rows, schedule_o_rows, xml_rows,
+ conn, raw_filing_id, filing_form_data, grant_rows, schedule_o_rows,
)
- # 1 raw_filing + 1 raw_filing_source + 1 raw_form_990 + grants + schedule_o + xml fields
- return 3 + len(grant_rows) + len(schedule_o_rows) + len(xml_rows)
+ # 1 raw_filing + 1 raw_filing_source + 1 raw_form_990 + grants + schedule_o
+ return 3 + len(grant_rows) + len(schedule_o_rows)
return execute_transaction(_do)
-def _replace_children(conn, raw_filing_id, form_data, grant_rows, schedule_o_rows, xml_rows):
+def _replace_children(conn, raw_filing_id, form_data, grant_rows, schedule_o_rows):
"""Delete and re-insert all child rows for a filing using the caller's transaction."""
form_columns = list(form_data.keys())
form_placeholders = ", ".join(["%s"] * len(form_columns))
@@ -598,13 +598,28 @@ def process_zip(zip_path, ingest_run_id):
with zf:
names = [n for n in zf.namelist() if n.endswith(".xml")]
+ seen_paths = get_seen_source_paths(basename)
+ skipped_existing = 0
- print(f"Processing {basename}: {len(names)} XML files")
+ print(
+ f"Processing {basename}: {len(names)} XML files "
+ f"({len(seen_paths)} already seen)"
+ )
files_scanned = 0
files_matched = 0
total_rows = 0
for i, name in enumerate(names):
+ if name in seen_paths:
+ skipped_existing += 1
+ if (i + 1) % 1000 == 0:
+ print(
+ f" ...{i + 1}/{len(names)} files, "
+ f"{skipped_existing} skipped existing, "
+ f"{files_matched} matched, {total_rows} rows"
+ )
+ continue
+
# Count every ZIP member as scanned, even ones we fail to read —
# otherwise read failures silently shrink the scanned total and
# make run-level metrics misleading.
@@ -626,9 +641,16 @@ def process_zip(zip_path, ingest_run_id):
total_rows += rows
if (i + 1) % 1000 == 0:
- print(f" ...{i + 1}/{len(names)} files, {files_matched} matched, {total_rows} rows")
-
- print(f" Done: {files_scanned} scanned, {files_matched} matched, {total_rows} rows")
+ print(
+ f" ...{i + 1}/{len(names)} files, "
+ f"{skipped_existing} skipped existing, "
+ f"{files_matched} matched, {total_rows} rows"
+ )
+
+ print(
+ f" Done: {files_scanned} scanned, {skipped_existing} skipped existing, "
+ f"{files_matched} matched, {total_rows} rows"
+ )
return files_scanned, files_matched, total_rows
diff --git a/scripts/parse/irs_990ez.py b/scripts/parse/irs_990ez.py
index bea4fdd..3606b88 100644
--- a/scripts/parse/irs_990ez.py
+++ b/scripts/parse/irs_990ez.py
@@ -30,7 +30,7 @@ from scripts.common.ingest import (
start_ingest_run, finish_ingest_run, fail_ingest_run, log_ingest_error,
)
from scripts.common.filing import (
- upsert_raw_filing, record_raw_filing_source,
+ upsert_raw_filing, record_raw_filing_source, get_seen_source_paths,
)
PARSER_NAME = "parse_irs_990ez"
@@ -270,11 +270,11 @@ def process_filing(tree, source_document_id, source_archive, source_path, ingest
]
_replace_children(
- conn, raw_filing_id, filing_form_data, schedule_o_rows, xml_rows,
+ conn, raw_filing_id, filing_form_data, schedule_o_rows,
)
- # 1 raw_filing + 1 raw_filing_source + 1 raw_form_990ez + schedule_o + xml fields
- return 3 + len(schedule_o_rows) + len(xml_rows)
+ # 1 raw_filing + 1 raw_filing_source + 1 raw_form_990ez + schedule_o
+ return 3 + len(schedule_o_rows)
return execute_transaction(_do)
@@ -356,13 +356,28 @@ def process_zip(zip_path, ingest_run_id):
with zf:
names = [n for n in zf.namelist() if n.endswith(".xml")]
+ seen_paths = get_seen_source_paths(basename)
+ skipped_existing = 0
- print(f"Processing {basename}: {len(names)} XML files")
+ print(
+ f"Processing {basename}: {len(names)} XML files "
+ f"({len(seen_paths)} already seen)"
+ )
files_scanned = 0
files_matched = 0
total_rows = 0
for i, name in enumerate(names):
+ if name in seen_paths:
+ skipped_existing += 1
+ if (i + 1) % 1000 == 0:
+ print(
+ f" ...{i + 1}/{len(names)} files, "
+ f"{skipped_existing} skipped existing, "
+ f"{files_matched} matched, {total_rows} rows"
+ )
+ continue
+
# Count every ZIP member as scanned, even ones we fail to read —
# otherwise read failures silently shrink the scanned total and
# make run-level metrics misleading.
@@ -384,9 +399,16 @@ def process_zip(zip_path, ingest_run_id):
total_rows += rows
if (i + 1) % 1000 == 0:
- print(f" ...{i + 1}/{len(names)} files, {files_matched} matched, {total_rows} rows")
-
- print(f" Done: {files_scanned} scanned, {files_matched} matched, {total_rows} rows")
+ print(
+ f" ...{i + 1}/{len(names)} files, "
+ f"{skipped_existing} skipped existing, "
+ f"{files_matched} matched, {total_rows} rows"
+ )
+
+ print(
+ f" Done: {files_scanned} scanned, {skipped_existing} skipped existing, "
+ f"{files_matched} matched, {total_rows} rows"
+ )
return files_scanned, files_matched, total_rows
diff --git a/scripts/parse/irs_990pf.py b/scripts/parse/irs_990pf.py
index 3d245b8..218c7fb 100644
--- a/scripts/parse/irs_990pf.py
+++ b/scripts/parse/irs_990pf.py
@@ -25,7 +25,7 @@ from scripts.common.ingest import (
start_ingest_run, finish_ingest_run, fail_ingest_run, log_ingest_error,
)
from scripts.common.filing import (
- upsert_raw_filing, record_raw_filing_source,
+ upsert_raw_filing, record_raw_filing_source, get_seen_source_paths,
)
PARSER_NAME = "parse_irs_990pf"
@@ -349,10 +349,6 @@ def process_filing(tree, source_document_id, source_archive, source_path, ingest
form_data = extract_form_990pf(tree)
form_data["grant_detail_status"] = compute_grant_detail_status(grant_elements, extracted_grants)
- root = tree.getroot()
- return_header = root.find(f"{{{NS}}}ReturnHeader")
- return_data = root.find(f"{{{NS}}}ReturnData")
-
def _do(conn):
raw_filing_id = upsert_raw_filing(
SOURCE_SYSTEM, source_document_id, metadata, ingest_run_id, conn=conn
@@ -367,15 +363,15 @@ def process_filing(tree, source_document_id, source_archive, source_path, ingest
for row in extracted_grants
]
- _replace_children(conn, raw_filing_id, filing_form_data, grant_rows, xml_rows)
+ _replace_children(conn, raw_filing_id, filing_form_data, grant_rows)
- # 1 raw_filing + 1 raw_filing_source + 1 raw_form_990pf + grants + xml fields
- return 3 + len(grant_rows) + len(xml_rows)
+ # 1 raw_filing + 1 raw_filing_source + 1 raw_form_990pf + grants
+ return 3 + len(grant_rows)
return execute_transaction(_do)
-def _replace_children(conn, raw_filing_id, form_data, grant_rows, xml_rows):
+def _replace_children(conn, raw_filing_id, form_data, grant_rows):
"""Delete and re-insert all child rows for a filing using the caller's transaction."""
form_columns = list(form_data.keys())
form_placeholders = ", ".join(["%s"] * len(form_columns))
@@ -451,13 +447,28 @@ def process_zip(zip_path, ingest_run_id):
with zf:
names = [n for n in zf.namelist() if n.endswith(".xml")]
+ seen_paths = get_seen_source_paths(basename)
+ skipped_existing = 0
- print(f"Processing {basename}: {len(names)} XML files")
+ print(
+ f"Processing {basename}: {len(names)} XML files "
+ f"({len(seen_paths)} already seen)"
+ )
files_scanned = 0
files_matched = 0
total_rows = 0
for i, name in enumerate(names):
+ if name in seen_paths:
+ skipped_existing += 1
+ if (i + 1) % 1000 == 0:
+ print(
+ f" ...{i + 1}/{len(names)} files, "
+ f"{skipped_existing} skipped existing, "
+ f"{files_matched} matched, {total_rows} rows"
+ )
+ continue
+
# Count every ZIP member as scanned, even ones we fail to read —
# otherwise read failures silently shrink the scanned total and
# make run-level metrics misleading.
@@ -479,9 +490,16 @@ def process_zip(zip_path, ingest_run_id):
total_rows += rows
if (i + 1) % 1000 == 0:
- print(f" ...{i + 1}/{len(names)} files, {files_matched} matched, {total_rows} rows")
-
- print(f" Done: {files_scanned} scanned, {files_matched} matched, {total_rows} rows")
+ print(
+ f" ...{i + 1}/{len(names)} files, "
+ f"{skipped_existing} skipped existing, "
+ f"{files_matched} matched, {total_rows} rows"
+ )
+
+ print(
+ f" Done: {files_scanned} scanned, {skipped_existing} skipped existing, "
+ f"{files_matched} matched, {total_rows} rows"
+ )
return files_scanned, files_matched, total_rows