diff options
Diffstat (limited to '')
| -rw-r--r-- | scripts/common/normalize.py | 120 |
1 files changed, 120 insertions, 0 deletions
"""
Single authoritative implementations of data normalization.

Every parser must use these — never hand-roll EIN cleanup,
amount parsing, form type mapping, or placeholder detection.
"""

import re


def normalize_ein(raw):
    """Normalize an EIN to a 9-digit zero-padded string.

    Every non-digit character is discarded, then the remaining digits are
    left-padded with zeros to nine characters.

    Returns None if the input is falsy, contains no digits, or contains
    more than nine digits.

    >>> normalize_ein('04-3567890')
    '043567890'
    >>> normalize_ein('43567890')
    '043567890'
    >>> normalize_ein(' 04-3567890 ')
    '043567890'
    >>> normalize_ein(None)
    >>> normalize_ein('')
    >>> normalize_ein('not-an-ein')
    """
    if not raw:
        return None
    digits = re.sub(r'[^0-9]', '', str(raw))
    # Padding can only lengthen the string, so the result is exactly nine
    # characters unless there were no digits or too many of them.
    if not digits or len(digits) > 9:
        return None
    return digits.zfill(9)


# Plain ASCII decimal literal with optional sign, fraction and exponent.
# Deliberately stricter than float(): float() also accepts 'nan', 'inf',
# underscore digit grouping ('1_000') and non-ASCII digits, none of which
# are a clean numeric string.
_NUMERIC_RE = re.compile(
    r'[+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?',
    re.ASCII,
)


def parse_numeric(raw):
    """Parse a raw value to a clean numeric string for DB insertion.

    Dollar signs and thousands separators are removed, and an
    accounting-style parenthesized amount becomes a negative number.
    The cleaned text is returned unchanged (no re-formatting) when it is
    a plain decimal literal.

    Returns None for None, for blank input, and for non-numeric values
    like 'SEE ATTACHED', 'NaN' or 'inf'. A numeric zero is a real value
    and is returned as '0'.

    >>> parse_numeric('1,234,567')
    '1234567'
    >>> parse_numeric('$1,234.56')
    '1234.56'
    >>> parse_numeric('(500)')
    '-500'
    >>> parse_numeric(0)
    '0'
    >>> parse_numeric('SEE ATTACHED')
    >>> parse_numeric('NaN')
    >>> parse_numeric(None)
    """
    # Only None means "missing"; a falsy check would also drop the number 0.
    if raw is None:
        return None

    text = str(raw).strip().replace('$', '').replace(',', '').strip()
    if not text:
        return None

    # Parenthesized negatives: (500) -> -500, tolerating inner padding.
    if text.startswith('(') and text.endswith(')'):
        text = '-' + text[1:-1].strip()

    if _NUMERIC_RE.fullmatch(text) is None:
        return None
    return text


# IRS ReturnTypeCd -> our form_type reference table code
_FORM_TYPE_MAP = {
    '990': '990',
    '990PF': '990PF',
    '990EZ': '990EZ',
    '990O': '990O',
    '990T': '990T',
    '990A': '990A',
    '990PA': '990PA',
    '990EA': '990EA',
}


def map_form_type(return_type_cd):
    """Map an IRS ReturnTypeCd to our form_type code.

    Surrounding whitespace is ignored; the lookup is case-sensitive.

    Raises ValueError if the code is empty, blank, or unknown — caller
    should log and skip the filing.

    >>> map_form_type('990PF')
    '990PF'
    >>> map_form_type(' 990EZ ')
    '990EZ'
    """
    if not return_type_cd:
        raise ValueError("Empty return type")
    # str() keeps the documented ValueError contract for non-string input
    # instead of leaking an AttributeError from .strip().
    code = str(return_type_cd).strip()
    if not code:
        raise ValueError("Empty return type")
    try:
        return _FORM_TYPE_MAP[code]
    except KeyError:
        raise ValueError(f"Unknown return type: {code!r}") from None


# "SEE <keyword> ..." matches as a prefix (the keyword must end at a word
# boundary, so 'SEE PARTNER LIST' is not a placeholder); the remaining
# alternatives must be the entire value. Used with fullmatch(), so no
# explicit anchors are needed.
_PLACEHOLDER_RE = re.compile(
    r'SEE\s+(?:ATTACHED|STATEMENT|SCHEDULE|PART|CONTINUATION)\b.*|'
    r'VARIOUS|MULTIPLE|N/?A|NONE|--+|\.+',
    re.IGNORECASE | re.DOTALL,
)


def is_placeholder(value):
    """Check if a text value is a placeholder rather than real data.

    The value is stripped of surrounding whitespace and compared
    case-insensitively. Falsy values (None, '') are not placeholders.

    >>> is_placeholder('SEE ATTACHED')
    True
    >>> is_placeholder('SEE PART IV')
    True
    >>> is_placeholder('VARIOUS')
    True
    >>> is_placeholder('BOYS AND GIRLS CLUB')
    False
    >>> is_placeholder(None)
    False
    """
    if not value:
        return False
    # str() mirrors the other normalizers so non-string input cannot raise.
    return _PLACEHOLDER_RE.fullmatch(str(value).strip()) is not None
