1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
|
"""
Single authoritative implementations of data normalization.
Every parser must use these — never hand-roll EIN cleanup,
amount parsing, form type mapping, or placeholder detection.
"""
import re
def normalize_ein(raw):
    """Normalize an EIN to a 9-digit zero-padded string.

    All non-digit characters (dashes, spaces, ...) are removed and the
    remaining digits are left-padded with zeros to nine characters.

    Args:
        raw: The EIN as a string or number. Falsy values (``None``, ``''``,
            ``0``) are treated as missing. An integral float such as
            ``43567890.0`` is treated as the integer it represents.

    Returns:
        The 9-digit string, or ``None`` if the input is missing, contains no
        digits, contains more than 9 digits, or is a non-integral or
        non-finite float.

    >>> normalize_ein('04-3567890')
    '043567890'
    >>> normalize_ein('43567890')
    '043567890'
    >>> normalize_ein(' 04-3567890 ')
    '043567890'
    >>> normalize_ein(43567890.0)
    '043567890'
    >>> normalize_ein(None)
    >>> normalize_ein('')
    >>> normalize_ein('not-an-ein')
    """
    if not raw:
        return None
    if isinstance(raw, float):
        # str(43567890.0) is '43567890.0'; stripping the '.' would silently
        # append a bogus trailing zero and yield a wrong 9-digit EIN.
        # is_integer() is False for nan/inf, so those are rejected as well.
        if not raw.is_integer():
            return None
        raw = int(raw)
    digits = re.sub(r'[^0-9]', '', str(raw).strip())
    if not digits:
        return None
    # zfill only pads, so anything longer than 9 digits can never be valid.
    if len(digits) > 9:
        return None
    return digits.zfill(9)
def parse_numeric(raw):
    """Parse a raw value to a clean numeric string for DB insertion.

    Dollar signs and thousands separators are removed, and a value wrapped
    in parentheses is rewritten as a negative number. The cleaned text is
    returned unchanged (not re-formatted) once it is confirmed to be a
    finite number.

    Args:
        raw: The value to parse, usually a string. Numbers are accepted and
            stringified, so ``0`` yields ``'0'``.

    Returns:
        The cleaned numeric string, or ``None`` when the input is ``None``,
        blank, not a number (e.g. ``'SEE ATTACHED'``), not finite
        (``'nan'``, ``'inf'``), or uses Python-only underscore grouping
        (``'1_000'``).

    >>> parse_numeric('1,234,567')
    '1234567'
    >>> parse_numeric('$1,234.56')
    '1234.56'
    >>> parse_numeric('(500)')
    '-500'
    >>> parse_numeric(0)
    '0'
    >>> parse_numeric('SEE ATTACHED')
    >>> parse_numeric('nan')
    >>> parse_numeric(None)
    """
    import math  # local import keeps this block self-contained

    # Test for None explicitly: a truthiness check would discard a real 0.
    if raw is None:
        return None
    text = str(raw).strip()
    if not text:
        return None
    text = text.replace('$', '').replace(',', '').strip()
    # Parenthesized negatives: (500) -> -500, tolerating "( 500 )"
    if text.startswith('(') and text.endswith(')'):
        text = '-' + text[1:-1].strip()
    # float() accepts '1_000', but that spelling is Python syntax, not a
    # number a database will parse.
    if '_' in text:
        return None
    try:
        number = float(text)
    except ValueError:
        return None
    # float() also parses 'nan', 'inf' and overflowing literals; none of
    # those are usable amounts.
    if not math.isfinite(number):
        return None
    return text
# IRS ReturnTypeCd -> our form_type reference table code.
# Every code listed here currently maps to itself; the table is the
# allow-list of return types that map_form_type() accepts.
_FORM_TYPE_MAP = {
    '990': '990',
    '990PF': '990PF',
    '990EZ': '990EZ',
    '990O': '990O',
    '990T': '990T',
    '990A': '990A',
    '990PA': '990PA',
    '990EA': '990EA',
}


def map_form_type(return_type_cd):
    """Map an IRS ReturnTypeCd to our form_type code.

    Surrounding whitespace is ignored. Matching is case-sensitive.

    Args:
        return_type_cd: The return type code, normally a string. Other
            values are stringified before lookup so that a bad input is
            reported as ``ValueError`` rather than ``AttributeError``.

    Returns:
        The form_type code from ``_FORM_TYPE_MAP``.

    Raises:
        ValueError: If the code is empty or unknown — caller should log and
            skip the filing.

    >>> map_form_type('990PF')
    '990PF'
    """
    if not return_type_cd:
        raise ValueError("Empty return type")
    # str() keeps the documented contract for non-string input: the only
    # exception callers have to handle is ValueError.
    code = str(return_type_cd).strip()
    try:
        return _FORM_TYPE_MAP[code]
    except KeyError:
        raise ValueError(f"Unknown return type: {code!r}") from None
# Matches a whole (already stripped) value that stands in for missing data.
# "SEE PART" may be followed by arbitrary text such as a part number; the
# other alternatives must be the entire value.
_PLACEHOLDER_RE = re.compile(
    r'^(SEE\s+(ATTACHED|STATEMENT|SCHEDULE|PART(\s.*)?|CONTINUATION)|'
    r'VARIOUS|MULTIPLE|N/?A|NONE|--+|\.+)$',
    re.IGNORECASE,
)


def is_placeholder(value):
    """Check if a text value is a placeholder rather than real data.

    Matching is case-insensitive and ignores surrounding whitespace.

    Args:
        value: The text to check. Falsy values (``None``, ``''``) are never
            placeholders. Non-string values are stringified before
            matching.

    Returns:
        ``True`` if the whole value is a recognized placeholder
        (``SEE ATTACHED``, ``SEE PART XV``, ``VARIOUS``, ``N/A``, ``--``,
        ``...``, etc.), otherwise ``False``.

    >>> is_placeholder('SEE ATTACHED')
    True
    >>> is_placeholder('SEE PART XV')
    True
    >>> is_placeholder('VARIOUS')
    True
    >>> is_placeholder('BOYS AND GIRLS CLUB')
    False
    >>> is_placeholder(None)
    False
    """
    if not value:
        return False
    # The value is stripped before matching, so the pattern never sees
    # trailing whitespace.
    return bool(_PLACEHOLDER_RE.match(str(value).strip()))
|