US Census Cleaning Completed

Hello All!

The following is my completed code for this project. Any and all feedback is welcome and appreciated!

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import codecademylib3_seaborn
import glob

# Read every CSV whose name matches state*.csv in the working directory and
# stack the results into one DataFrame.
# glob returns matches in arbitrary order, so sort for a reproducible row order.
files = sorted(glob.glob('state*.csv'))
df_list = [pd.read_csv(states) for states in files]
# Each file is read with its own 0..n-1 index; ignore_index gives the combined
# frame unique row labels so later index-aligned operations are unambiguous.
us_census = pd.concat(df_list, ignore_index=True)
#print(us_census)

#print(us_census.dtypes)
#print(us_census.columns)
#print(us_census.head())

# Strip the "$" and "," characters from Income, then convert it to a number.
us_census['Income'] = us_census['Income'].replace('[$,]', '', regex=True)
us_census['Income'] = pd.to_numeric(us_census['Income'])

# GenderPop is split on "_"; the first part is treated as the male count and
# the second as the female count (presumably values look like "123M_456F" --
# verify against the CSVs).
split_gender = us_census['GenderPop'].str.split('_', expand=True)

# Pull the first run of digits out of each part. str.extract yields NaN when a
# part contains no digits, instead of failing when no row has any.
us_census['Female'] = split_gender[1].str.extract(r'(\d+)', expand=False)
us_census['Male'] = split_gender[0].str.extract(r'(\d+)', expand=False)
us_census['Female'] = pd.to_numeric(us_census['Female'])
us_census['Male'] = pd.to_numeric(us_census['Male'])
#print(us_census.dtypes)

#plt.scatter(‘Female’, ‘Income’)
#plt.show()

# Where Female is missing, fill it with TotalPop minus Male for the same row.
difference = us_census['TotalPop'] - us_census['Male']
us_census['Female'] = us_census['Female'].fillna(value=difference)

#print(us_census.duplicated('Female'))
# Keep the first row seen for each State. The explicit copy makes us_census2
# an independent frame, so the column assignments performed on it later do not
# raise SettingWithCopyWarning.
us_census2 = us_census.drop_duplicates('State').copy()

#print(us_census2.columns)
#print(us_census2.dtypes)
#print(us_census2)

# Scatter plot of the Female column (x axis) against the Income column (y axis)
# of the de-duplicated frame.
plt.scatter(us_census2['Female'], us_census2['Income'])
plt.xlabel('Female Population')
plt.ylabel('Income')
plt.title('Female Population and Income')
plt.show()

# Columns that hold percentages as text; each is cleaned the same way.
race_columns = ['Hispanic', 'White', 'Black', 'Native', 'Asian', 'Pacific']

# Strip "%" and "," characters from each column, then convert it to a number.
for column in race_columns:
    us_census2[column] = us_census2[column].replace('[%,]', '', regex=True)
    us_census2[column] = pd.to_numeric(us_census2[column])

# Where Pacific is missing, fill it with whatever remains of 100 after the
# other five groups are subtracted. Native must be part of that subtraction;
# leaving it out overstates the filled Pacific value.
difference2 = (
    100
    - us_census2['White']
    - us_census2['Hispanic']
    - us_census2['Black']
    - us_census2['Native']
    - us_census2['Asian']
)
us_census2['Pacific'] = us_census2['Pacific'].fillna(value=difference2)
print(us_census2)

#print(us_census2.dtypes)
# Draw one histogram per column, each in its own numbered figure (2 through 7),
# showing every figure before starting the next.
histogram_columns = ('Hispanic', 'White', 'Black', 'Asian', 'Pacific', 'Native')
for figure_number, column_name in enumerate(histogram_columns, start=2):
    plt.figure(figure_number)
    plt.hist(us_census2[column_name])
    plt.show()

1 Like