generated from UBICenter/analysis-template
-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathpre-processing.py
229 lines (195 loc) · 7.27 KB
/
pre-processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
# if kernel crashes, make sure pywin32 and pipywin32 are installed.
# Followed instructions here: https://github.com/jupyter/notebook/issues/4909
# import win32api
import pandas as pd
import numpy as np
import microdf as mdf
import os
import us
# Import data from Ipums
person = pd.read_csv("cps_00041.csv.gz")
# lower column names
person.columns = person.columns.str.lower()
# Divide by three for three years of data.
person[["asecwt", "spmwt"]] /= 3
# Create booleans for demographics
person["adult"] = person.age >= 18
person["child"] = person.age < 18
# create mutually exclusive white non-hisp/black non-hisp/hispanic groups
person["hispanic"] = person.hispan.between(1, 699)
person["black"] = (person.race == 200) & (~person.hispanic)
person["white"] = (person.race == 100) & (~person.hispanic)
# check to make sure persons are double counted
assert person[["black", "hispanic", "white"]].sum(axis=1).max() == 1
person["pwd"] = person.diffany == 2
person["non_citizen"] = person.citizen == 5
person["non_citizen_child"] = (person.citizen == 5) & person.child
person["non_citizen_adult"] = (person.citizen == 5) & person.adult
# Remove NIUs
person["adjginc"].replace({99999999: 0}, inplace=True)
person["fedtaxac"].replace({99999999: 0}, inplace=True)
person["taxinc"].replace({9999999: 0}, inplace=True)
person["stataxac"].replace({9999999: 0}, inplace=True)
person["incss"].replace({999999: 0}, inplace=True)
person["incunemp"].replace({999999: 0}, inplace=True)
person["incssi"].replace({999999: 0}, inplace=True)
person["ctccrd"].replace({999999: 0}, inplace=True)
person["incunemp"].replace({99999: 0}, inplace=True)
person["actccrd"].replace({99999: 0}, inplace=True)
person["fica"].replace({99999: 0}, inplace=True)
person["eitcred"].replace({9999: 0}, inplace=True)
# Change fip codes to state names
person["state"] = (
person["statefip"].astype(str)
# pad leading zero or wrong number of states
.apply("{:0>2}".format)
# lookup full state name from fips code
.apply(lambda x: us.states.lookup(x))
# change us package formatting to string
.astype(str)
)
# drop original statefip column from dataframe
person.drop(columns=["statefip"], inplace=True)
# Aggregate deductible and refundable child tax credits
person["ctc"] = person.ctccrd + person.actccrd
# Calculate the number of people per smp unit
person["person"] = 1
spm = person.groupby(["spmfamunit", "year"])[["person"]].sum()
spm.columns = ["numper"]
person = person.merge(spm, left_on=["spmfamunit", "year"], right_index=True)
person["weighted_state_tax"] = person.asecwt * person.stataxac
person["weighted_agi"] = person.asecwt * person.adjginc
# Calculate the total taxable income and total people in each state
state_groups_taxinc = person.groupby(["state"])[
["weighted_state_tax", "weighted_agi"]
].sum()
state_groups_taxinc.columns = ["state_tax_revenue", "state_taxable_income"]
person = person.merge(state_groups_taxinc, left_on=["state"], right_index=True)
# Create dataframe with aggregated spm unit data
PERSON_COLUMNS = [
"adjginc",
"fica",
"fedtaxac",
"ctc",
"incssi",
"incunemp",
"eitcred",
"child",
"adult",
"non_citizen",
"non_citizen_child",
"non_citizen_adult",
"person",
"stataxac",
]
SPMU_COLUMNS = [
"spmheat",
"spmsnap",
"spmfamunit",
"spmthresh",
"spmtotres",
"spmwt",
"year",
"state",
"state_tax_revenue",
"state_taxable_income",
]
spmu = person.groupby(SPMU_COLUMNS, observed=False)[PERSON_COLUMNS].sum().reset_index()
spmu[["fica", "fedtaxac", "stataxac"]] *= -1
spmu.rename(columns={"person": "numper"}, inplace=True)
# write pre-processed dfs to csv files
person.to_csv("person.csv.gz", compression="gzip")
spmu.to_csv("spmu.csv.gz", compression="gzip")
# create boolean column for individual's poverty status, 1=poor
person["poor"] = person.spmthresh > person.spmtotres
# create a column for all selected demographic variables
# that will be used to calculate poverty rates
DEMOG_COLS = [
"person",
"adult",
"child",
"black",
"white",
"hispanic",
"pwd",
]
poor_pop = person[person.poor]
# calculate weighted sum of people living in poverty
mdf.weighted_sum(poor_pop, DEMOG_COLS, "asecwt")
# calculate poverty RATE for each DEMOGRAPHIC in US
pov_rate_us = mdf.weighted_sum(poor_pop, DEMOG_COLS, "asecwt") / mdf.weighted_sum(
person, DEMOG_COLS, w="asecwt"
)
# add name to series
pov_rate_us.name = "US"
# calculate poverty RATE for each group by state
pov_rates = mdf.weighted_sum(
poor_pop, DEMOG_COLS, "asecwt", groupby="state"
) / mdf.weighted_sum(person, DEMOG_COLS, w="asecwt", groupby="state")
# append US statistics as additional 'state'
pov_df = pov_rates.append(pov_rate_us)
# melt df from wide to long format
pov_df = pov_df.melt(ignore_index=False, var_name="demog")
# insert column indicating metric in question
pov_df.insert(loc=1, column="metric", value="pov_rate")
##
# calculate POPULATION for each DEMOGRAPHIC in US
pop_us = mdf.weighted_sum(person, DEMOG_COLS, w="asecwt")
# add name to series
pop_us.name = "US"
# calculate POPULATION for each group by state
pop_states = mdf.weighted_sum(person, DEMOG_COLS, w="asecwt", groupby="state")
# append US statistics as additional 'state'
pop_df = pop_states.append(pop_us)
# melt df from wide to long format
pop_df = pop_df.melt(ignore_index=False, var_name="demog")
pop_df.insert(loc=1, column="metric", value="pop")
# concat poverty and population dfs
demog_stats = pd.concat([pov_df, pop_df])
# write to csv file
demog_stats.to_csv("demog_stats.csv.gz", compression="gzip")
# Caluclate original gini
person["spm_resources_per_person"] = person.spmtotres / person.numper
# Caluclate original gini for US
gini_us = pd.Series(mdf.gini(df=person, col="spm_resources_per_person", w="asecwt"))
# add name to series
gini_us.index = ["US"]
# calculate gini for each group by state
gini_states = mdf.gini(
df=person, col="spm_resources_per_person", w="asecwt", groupby="state"
)
# append US statistics as additional 'state'
gini_ser = gini_states.append(gini_us)
gini_ser.name = "gini"
# Calculate the original poverty gap
spmu["poverty_gap"] = np.where(
spmu.spmtotres < spmu.spmthresh,
spmu.spmthresh - spmu.spmtotres,
0,
)
poverty_gap_us = pd.Series(mdf.weighted_sum(spmu, "poverty_gap", w="spmwt"))
# add name to series
poverty_gap_us.index = ["US"]
# calculate gini for each group by state
poverty_gap_states = mdf.weighted_sum(spmu, "poverty_gap", w="spmwt", groupby="state")
# append US statistics as additional 'state'
poverty_gap_ser = poverty_gap_states.append(poverty_gap_us)
poverty_gap_ser.name = "poverty_gap"
# calculate the sum total of everyone's resources in US
total_resources_us = pd.Series(mdf.weighted_sum(spmu, "spmtotres", w="spmwt"))
# add name to series
total_resources_us.index = ["US"]
# calculate gini for each group by state
total_resources_state = mdf.weighted_sum(
df=spmu, col="spmtotres", w="spmwt", groupby="state"
)
# append US statistics as additional 'state'
total_resources_state = total_resources_state.append(total_resources_us)
total_resources_state.name = "total_resources"
# merge "total_resources","gini","poverty gap" into 1 df
all_state_stats = (
poverty_gap_ser.to_frame()
.join(total_resources_state.to_frame())
.join(gini_ser.to_frame())
)
all_state_stats.to_csv("all_state_stats.csv.gz", compression="gzip")