aboutsummaryrefslogtreecommitdiff
path: root/scripts/common/normalize.py
blob: b06afed2a8961d449bcb51d2c8641ced001df3c8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
"""
Single authoritative implementations of data normalization.

Every parser must use these — never hand-roll EIN cleanup,
amount parsing, form type mapping, or placeholder detection.
"""

import re


def normalize_ein(raw):
    """Return *raw* as a 9-character, zero-padded, digits-only EIN string.

    Every character that is not an ASCII digit is discarded; what is left
    is left-padded with zeros up to nine characters.  ``None`` is returned
    when the input is falsy, contains no digits at all, or contains more
    than nine digits.

    >>> normalize_ein('04-3567890')
    '043567890'
    >>> normalize_ein('43567890')
    '043567890'
    >>> normalize_ein('  04-3567890 ')
    '043567890'
    >>> normalize_ein(None)
    >>> normalize_ein('')
    >>> normalize_ein('not-an-ein')
    """
    if not raw:
        return None

    text = str(raw).strip()
    digits = re.sub(r'[^0-9]', '', text)

    # zfill() only ever lengthens, so anything longer than nine digits can
    # never become a valid EIN; an empty result means there were no digits.
    if not digits or len(digits) > 9:
        return None
    return digits.zfill(9)


# A plain decimal literal: optional sign, digits with an optional fraction
# (or a bare fraction such as '.5'), and an optional exponent.  Only ASCII
# digits are allowed, so values that float() would accept but that are not
# clean numeric literals ('nan', 'inf', '1_000', non-ASCII digits) are
# rejected instead of being returned unchanged.
_NUMERIC_RE = re.compile(
    r'[+-]?(?:[0-9]+\.?[0-9]*|\.[0-9]+)(?:[eE][+-]?[0-9]+)?'
)


def parse_numeric(raw):
    """Parse a raw value to a clean numeric string for DB insertion.

    Dollar signs and thousands separators are removed, and an
    accounting-style parenthesized value is turned into a negative number.
    The cleaned text is returned as a string (it is not converted to a
    number).

    Returns None when ``raw`` is None or when the cleaned text is not a
    plain decimal literal, e.g. 'SEE ATTACHED', 'nan', 'inf' or '1_000'.
    A numeric zero passed as an int or float is a value, not a missing
    one, and is returned as its string form.

    >>> parse_numeric('1,234,567')
    '1234567'
    >>> parse_numeric('$1,234.56')
    '1234.56'
    >>> parse_numeric('(500)')
    '-500'
    >>> parse_numeric('( 500 )')
    '-500'
    >>> parse_numeric(0)
    '0'
    >>> parse_numeric('SEE ATTACHED')
    >>> parse_numeric('nan')
    >>> parse_numeric(None)
    """
    # Test identity rather than truthiness so that 0 / 0.0 are not dropped.
    if raw is None:
        return None

    s = str(raw).replace('$', '').replace(',', '').strip()

    # Parenthesized negatives: (500) -> -500.  Strip inside the parentheses
    # so '( 500 )' parses and no stray whitespace ends up in the result.
    if s.startswith('(') and s.endswith(')'):
        s = '-' + s[1:-1].strip()

    # fullmatch() also rejects the empty string and a lone '-'.
    if _NUMERIC_RE.fullmatch(s):
        return s
    return None


# Lookup table: IRS ReturnTypeCd value -> code used in our form_type
# reference table.  Codes missing from this table are treated as unknown.
_FORM_TYPE_MAP = {
    '990': '990',
    '990PF': '990PF',
    '990EZ': '990EZ',
    '990O': '990O',
    '990T': '990T',
    '990A': '990A',
    '990PA': '990PA',
    '990EA': '990EA',
}


def map_form_type(return_type_cd):
    """Translate an IRS ReturnTypeCd into our form_type code.

    Surrounding whitespace is ignored; the lookup itself is exact
    (case-sensitive).

    Raises ValueError when the code is empty or is not in the lookup
    table — the caller should log and skip the filing.

    >>> map_form_type('990PF')
    '990PF'
    """
    if not return_type_cd:
        raise ValueError("Empty return type")

    code = return_type_cd.strip()
    form_type = _FORM_TYPE_MAP.get(code)
    # No table value is None, so None here can only mean "key not found".
    if form_type is None:
        raise ValueError(f"Unknown return type: {code!r}")
    return form_type


# Matches a whole, already-stripped value that stands in for real data.
# The "SEE PART" alternative must consume the rest of the value
# (e.g. "SEE PART IV"): the pattern is anchored at both ends and the input
# is stripped, so a bare `PART\s` could never be followed by the end of
# the string and would never match.
_PLACEHOLDER_RE = re.compile(
    r'^(SEE\s+(ATTACHED|STATEMENT|SCHEDULE|PART\s.*|CONTINUATION)|'
    r'VARIOUS|MULTIPLE|N/?A|NONE|--+|\.+)$',
    re.IGNORECASE,
)


def is_placeholder(value):
    """Check if a text value is a placeholder rather than real data.

    The value is stripped of surrounding whitespace and compared
    case-insensitively against the known placeholder forms: "SEE ATTACHED",
    "SEE STATEMENT", "SEE SCHEDULE", "SEE CONTINUATION", "SEE PART <ref>",
    "VARIOUS", "MULTIPLE", "N/A" / "NA", "NONE", two or more dashes, or one
    or more dots.  Falsy input (None, '') is not a placeholder.

    >>> is_placeholder('SEE ATTACHED')
    True
    >>> is_placeholder('VARIOUS')
    True
    >>> is_placeholder('SEE PART IV')
    True
    >>> is_placeholder('BOYS AND GIRLS CLUB')
    False
    >>> is_placeholder(None)
    False
    """
    if not value:
        return False
    return bool(_PLACEHOLDER_RE.match(value.strip()))