aboutsummaryrefslogtreecommitdiff
path: root/scripts/common/normalize.py
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/common/normalize.py')
-rw-r--r--scripts/common/normalize.py120
1 files changed, 120 insertions, 0 deletions
diff --git a/scripts/common/normalize.py b/scripts/common/normalize.py
new file mode 100644
index 0000000..b06afed
--- /dev/null
+++ b/scripts/common/normalize.py
@@ -0,0 +1,120 @@
+"""
+Single authoritative implementations of data normalization.
+
+Every parser must use these — never hand-roll EIN cleanup,
+amount parsing, form type mapping, or placeholder detection.
+"""
+
+import re
+
+
def normalize_ein(raw):
    """Normalize an EIN to a 9-digit, zero-padded string.

    Every non-digit character (hyphens, spaces, letters, ...) is dropped
    from the textual form of ``raw`` and the remaining digits are
    left-padded with zeros to nine characters.

    Args:
        raw: The EIN as a string or number. Integral floats (as produced
            by spreadsheet/CSV readers) are treated like the matching int.

    Returns:
        The 9-character digit string, or None when ``raw`` is falsy,
        is a non-integral / non-finite float, contains no digits, or
        contains more than nine digits.

    >>> normalize_ein('04-3567890')
    '043567890'
    >>> normalize_ein('43567890')
    '043567890'
    >>> normalize_ein(' 04-3567890 ')
    '043567890'
    >>> normalize_ein(43567890.0)
    '043567890'
    >>> normalize_ein(None)
    >>> normalize_ein('')
    >>> normalize_ein('not-an-ein')
    """
    if not raw:
        return None

    if isinstance(raw, float):
        # str(43567890.0) == '43567890.0'; stripping non-digits from that
        # would fold the trailing "0" into the EIN and silently yield a
        # wrong value.  NaN and +/-inf also fail is_integer().
        if not raw.is_integer():
            return None
        raw = int(raw)

    digits = re.sub(r'[^0-9]', '', str(raw).strip())
    if not digits:
        return None

    padded = digits.zfill(9)
    # zfill only pads, so anything longer than 9 digits is not an EIN.
    if len(padded) != 9:
        return None
    return padded
+
+
# Plain ASCII decimal literal with optional sign, fraction and exponent.
# Deliberately narrower than what float() accepts: no 'nan'/'inf', no
# underscore digit grouping, no non-ASCII digits.
_NUMERIC_RE = re.compile(
    r'[+-]?(?:[0-9]+\.?[0-9]*|\.[0-9]+)(?:[eE][+-]?[0-9]+)?'
)


def parse_numeric(raw):
    """Parse a raw value to a clean numeric string for DB insertion.

    Dollar signs and thousands separators are removed, and an
    accounting-style parenthesized amount is rewritten as a negative.
    The cleaned text is returned as a string (not converted to a number)
    so no precision is lost.

    Args:
        raw: The raw value; anything other than None is stringified first,
            so numeric ``0`` is parsed as ``'0'`` rather than treated as
            missing.

    Returns:
        The cleaned numeric string, or None when ``raw`` is None, empty,
        or not a plain decimal number (e.g. 'SEE ATTACHED', 'nan', 'inf').

    >>> parse_numeric('1,234,567')
    '1234567'
    >>> parse_numeric('$1,234.56')
    '1234.56'
    >>> parse_numeric('(500)')
    '-500'
    >>> parse_numeric(0)
    '0'
    >>> parse_numeric('SEE ATTACHED')
    >>> parse_numeric('nan')
    >>> parse_numeric(None)
    """
    if raw is None:
        return None

    text = str(raw).replace('$', '').replace(',', '').strip()

    # Parenthesized negatives: (500) -> -500
    if text.startswith('(') and text.endswith(')'):
        text = '-' + text[1:-1].strip()

    # An explicit pattern instead of float(): float() would also accept
    # 'nan', 'infinity' and '1_000', none of which are clean numerics.
    if _NUMERIC_RE.fullmatch(text):
        return text
    return None
+
+
# IRS ReturnTypeCd -> our form_type reference table code
_FORM_TYPE_MAP = {
    '990': '990',
    '990PF': '990PF',
    '990EZ': '990EZ',
    '990O': '990O',
    '990T': '990T',
    '990A': '990A',
    '990PA': '990PA',
    '990EA': '990EA',
}


def map_form_type(return_type_cd):
    """Map an IRS ReturnTypeCd to our form_type code.

    The code is stringified, stripped of surrounding whitespace and looked
    up case-insensitively in ``_FORM_TYPE_MAP``.

    Args:
        return_type_cd: The raw ReturnTypeCd value. Non-string values
            (e.g. the int ``990``) are converted with ``str()`` so that a
            bad value always surfaces as ValueError, never AttributeError.

    Returns:
        The mapped form_type code.

    Raises:
        ValueError: If the value is None/blank or not a known return
            type — caller should log and skip the filing.

    >>> map_form_type('990PF')
    '990PF'
    >>> map_form_type(' 990ez ')
    '990EZ'
    """
    code = '' if return_type_cd is None else str(return_type_cd).strip()
    if not code:
        raise ValueError("Empty return type")

    try:
        return _FORM_TYPE_MAP[code.upper()]
    except KeyError:
        # "from None": the KeyError adds nothing to the caller's log line.
        raise ValueError(f"Unknown return type: {code!r}") from None
+
+
# Whole-value placeholder patterns (case-insensitive).  "SEE PART" may be
# followed by a part reference ("SEE PART XV"); a bare `PART\s` directly
# before `$` could never match because values are stripped before matching.
_PLACEHOLDER_RE = re.compile(
    r'^(SEE\s+(ATTACHED|STATEMENT|SCHEDULE|PART(\s.*)?|CONTINUATION)|'
    r'VARIOUS|MULTIPLE|N/?A|NONE|--+|\.+)$',
    re.IGNORECASE,
)


def is_placeholder(value):
    """Check if a text value is a placeholder rather than real data.

    The value is stringified and stripped, then matched as a whole
    against ``_PLACEHOLDER_RE`` ('SEE ATTACHED', 'SEE PART XV', 'VARIOUS',
    'N/A', 'NONE', runs of dashes or dots, ...).

    Args:
        value: The text to test. None and blank values are not
            placeholders; non-string values are converted with ``str()``.

    Returns:
        True if the whole value is a recognised placeholder, else False.

    >>> is_placeholder('SEE ATTACHED')
    True
    >>> is_placeholder('SEE PART XV')
    True
    >>> is_placeholder('VARIOUS')
    True
    >>> is_placeholder('BOYS AND GIRLS CLUB')
    False
    >>> is_placeholder(None)
    False
    """
    if value is None:
        return False
    text = str(value).strip()
    if not text:
        return False
    return _PLACEHOLDER_RE.match(text) is not None