"""
Single authoritative implementations of data normalization.

Every parser must use these — never hand-roll EIN cleanup, amount parsing,
form type mapping, or placeholder detection.
"""

import re

# Anything that is not an ASCII digit is discarded when cleaning an EIN.
_NON_DIGIT_RE = re.compile(r'[^0-9]')

# Strict ASCII numeric literal: optional sign, digits with an optional
# fractional part (or a bare fraction such as ".5"), optional exponent.
# Used instead of float() for validation because float() also accepts
# 'nan', 'inf', digit-group underscores ('1_000') and non-ASCII digits,
# none of which are clean numeric strings.
_NUMERIC_RE = re.compile(
    r'[+-]?(?:[0-9]+\.?[0-9]*|\.[0-9]+)(?:[eE][+-]?[0-9]+)?'
)


def normalize_ein(raw):
    """Normalize an EIN to a 9-digit zero-padded string.

    Every non-digit character is discarded and the remaining digits are
    left-padded with zeros to nine characters. Returns None when the input
    is falsy, contains no digits, or contains more than nine digits.

    Integral floats are converted to int before cleaning so that the
    trailing ``.0`` of their string form is not mistaken for an extra
    digit; non-integral floats (including NaN and infinities) return None.

    >>> normalize_ein('04-3567890')
    '043567890'
    >>> normalize_ein('43567890')
    '043567890'
    >>> normalize_ein(' 04-3567890 ')
    '043567890'
    >>> normalize_ein(43567890.0)
    '043567890'
    >>> normalize_ein(None)
    >>> normalize_ein('')
    >>> normalize_ein('not-an-ein')
    """
    if not raw:
        return None
    if isinstance(raw, float):
        # str(43567890.0) == '43567890.0'; stripping non-digits from that
        # would yield '435678900', a wrong but valid-looking EIN.
        if not raw.is_integer():
            return None
        raw = int(raw)
    digits = _NON_DIGIT_RE.sub('', str(raw))
    if not digits or len(digits) > 9:
        return None
    return digits.zfill(9)


def parse_numeric(raw):
    """Parse a raw value to a clean numeric string for DB insertion.

    Dollar signs and thousands separators are removed, and an
    accounting-style parenthesized value is converted to a leading minus
    sign. The result is returned as a string (not a float) so no precision
    is lost. Returns None for None, for empty input, and for anything that
    is not a plain ASCII number such as 'SEE ATTACHED', 'nan' or 'inf'.

    >>> parse_numeric('1,234,567')
    '1234567'
    >>> parse_numeric('$1,234.56')
    '1234.56'
    >>> parse_numeric('(500)')
    '-500'
    >>> parse_numeric(0)
    '0'
    >>> parse_numeric('SEE ATTACHED')
    >>> parse_numeric('nan')
    >>> parse_numeric(None)
    """
    # Only None means "no value"; a numeric 0 is real data and must survive.
    if raw is None:
        return None
    text = str(raw).replace('$', '').replace(',', '').strip()
    # Parenthesized negatives: (500) -> -500
    if text.startswith('(') and text.endswith(')'):
        text = '-' + text[1:-1].strip()
    if _NUMERIC_RE.fullmatch(text) is None:
        return None
    return text


# IRS ReturnTypeCd -> our form_type reference table code
_FORM_TYPE_MAP = {
    '990': '990',
    '990PF': '990PF',
    '990EZ': '990EZ',
    '990O': '990O',
    '990T': '990T',
    '990A': '990A',
    '990PA': '990PA',
    '990EA': '990EA',
}


def map_form_type(return_type_cd):
    """Map an IRS ReturnTypeCd to our form_type code.

    Surrounding whitespace is ignored; the lookup is case-sensitive.

    Raises ValueError if the code is empty or unknown — caller should log
    and skip the filing.

    >>> map_form_type('990PF')
    '990PF'
    """
    if not return_type_cd:
        raise ValueError("Empty return type")
    code = return_type_cd.strip()
    mapped = _FORM_TYPE_MAP.get(code)
    if mapped is None:
        raise ValueError(f"Unknown return type: {code!r}")
    return mapped


# "SEE <keyword>" followed by any trailing text (e.g. "SEE PART IV",
# "SEE STATEMENT 3"), or one of the stand-alone filler words/symbols.
# The \b after the keyword stops words that merely start with a keyword
# (e.g. "SEE PARTNERSHIP ...") from being treated as placeholders.
_PLACEHOLDER_RE = re.compile(
    r'^(SEE\s+(ATTACHED|STATEMENT|SCHEDULE|PART|CONTINUATION)\b.*|'
    r'VARIOUS|MULTIPLE|N/?A|NONE|--+|\.+)$',
    re.IGNORECASE,
)


def is_placeholder(value):
    """Check if a text value is a placeholder rather than real data.

    Matching is case-insensitive and ignores surrounding whitespace.
    Falsy input (None, empty string) is not a placeholder.

    >>> is_placeholder('SEE ATTACHED')
    True
    >>> is_placeholder('SEE PART IV')
    True
    >>> is_placeholder('VARIOUS')
    True
    >>> is_placeholder('BOYS AND GIRLS CLUB')
    False
    >>> is_placeholder(None)
    False
    """
    if not value:
        return False
    return _PLACEHOLDER_RE.match(value.strip()) is not None