From 6605e2cc428e3bdaa174ccc432941eab8c5d61cb Mon Sep 17 00:00:00 2001
From: benj
Date: Fri, 10 Apr 2026 11:13:57 +0800
Subject: ensure parsers do not parse and store raw XML fields

---
 scripts/common/__init__.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 scripts/common/__init__.py

(limited to 'scripts/common/__init__.py')

diff --git a/scripts/common/__init__.py b/scripts/common/__init__.py
new file mode 100644
index 0000000..f06ee26
--- /dev/null
+++ b/scripts/common/__init__.py
@@ -0,0 +1,28 @@
+"""
+Shared infrastructure for the 990 data pipeline (v2).
+
+This package provides the single authoritative implementation of
+normalization, XML helpers, DB access, and ingest tracking for
+all parsers under scripts/parse/, scripts/fetch/, and scripts/extract/.
+
+Old parsers in scripts/ still use scripts/parse_common.py directly.
+"""
+
+import zipfile_deflate64  # noqa: F401
+
+from scripts.common.db import (
+    execute, execute_scalar, execute_all, execute_transaction, copy_rows,
+    # Legacy (shell-based, for old parsers)
+    psql, psql_scalar, psql_query_values, insert_rows,
+)
+from scripts.common.normalize import normalize_ein, parse_numeric, map_form_type, is_placeholder
+from scripts.common.xml import (
+    text, strip_ns, leaf_paths, extract_filing_metadata,
+    derive_source_document_id,
+)
+from scripts.common.ingest import (
+    start_ingest_run, finish_ingest_run, fail_ingest_run, log_ingest_error,
+)
+from scripts.common.filing import (
+    upsert_raw_filing, record_raw_filing_source,
+)
-- 
cgit v1.2.3