This notebook contains all the data cleaning of the various datasets used for this project.
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from datetime import datetime
# import datasets
counties_df = pd.read_csv('../data/archive/us-counties.csv')
mask_df = pd.read_csv('../data/archive/mask-use-by-county.csv')
cases_df = pd.read_csv('../data/statewide_cases.csv')
vaccine_moderna=pd.read_csv('../data/COVID-19_Vaccine_Distribution_Allocations_by_Jurisdiction_-_Moderna.csv')
counties_df.head()
date | county | state | fips | cases | deaths | |
---|---|---|---|---|---|---|
0 | 2020-01-21 | Snohomish | Washington | 53061.0 | 1 | 0.0 |
1 | 2020-01-22 | Snohomish | Washington | 53061.0 | 1 | 0.0 |
2 | 2020-01-23 | Snohomish | Washington | 53061.0 | 1 | 0.0 |
3 | 2020-01-24 | Cook | Illinois | 17031.0 | 1 | 0.0 |
4 | 2020-01-24 | Snohomish | Washington | 53061.0 | 1 | 0.0 |
# check for null values
counties_df.isnull().sum()
date 0 county 0 state 0 fips 8664 cases 0 deaths 19775 dtype: int64
# fill in null values
counties_df.fillna(0, inplace = True)
# changing fips to be an integer
counties_df['fips'] = counties_df['fips'].astype(int)
mask_df.head()
COUNTYFP | NEVER | RARELY | SOMETIMES | FREQUENTLY | ALWAYS | |
---|---|---|---|---|---|---|
0 | 1001 | 0.053 | 0.074 | 0.134 | 0.295 | 0.444 |
1 | 1003 | 0.083 | 0.059 | 0.098 | 0.323 | 0.436 |
2 | 1005 | 0.067 | 0.121 | 0.120 | 0.201 | 0.491 |
3 | 1007 | 0.020 | 0.034 | 0.096 | 0.278 | 0.572 |
4 | 1009 | 0.053 | 0.114 | 0.180 | 0.194 | 0.459 |
# join together mask_df and counties_df
counties_mask_df = counties_df.merge(mask_df, how = 'inner', left_on = 'fips',
right_on = 'COUNTYFP')
counties_mask_df.head()
date | county | state | fips | cases | deaths | COUNTYFP | NEVER | RARELY | SOMETIMES | FREQUENTLY | ALWAYS | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2020-01-21 | Snohomish | Washington | 53061 | 1 | 0.0 | 53061 | 0.017 | 0.014 | 0.056 | 0.191 | 0.721 |
1 | 2020-01-22 | Snohomish | Washington | 53061 | 1 | 0.0 | 53061 | 0.017 | 0.014 | 0.056 | 0.191 | 0.721 |
2 | 2020-01-23 | Snohomish | Washington | 53061 | 1 | 0.0 | 53061 | 0.017 | 0.014 | 0.056 | 0.191 | 0.721 |
3 | 2020-01-24 | Snohomish | Washington | 53061 | 1 | 0.0 | 53061 | 0.017 | 0.014 | 0.056 | 0.191 | 0.721 |
4 | 2020-01-25 | Snohomish | Washington | 53061 | 1 | 0.0 | 53061 | 0.017 | 0.014 | 0.056 | 0.191 | 0.721 |
# keep only rows where state is California
counties_mask_df = counties_mask_df[counties_mask_df['state'] == 'California']
counties_mask_df.head()
date | county | state | fips | cases | deaths | COUNTYFP | NEVER | RARELY | SOMETIMES | FREQUENTLY | ALWAYS | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
715 | 2020-01-25 | Orange | California | 6059 | 1 | 0.0 | 6059 | 0.023 | 0.021 | 0.046 | 0.156 | 0.754 |
716 | 2020-01-26 | Orange | California | 6059 | 1 | 0.0 | 6059 | 0.023 | 0.021 | 0.046 | 0.156 | 0.754 |
717 | 2020-01-27 | Orange | California | 6059 | 1 | 0.0 | 6059 | 0.023 | 0.021 | 0.046 | 0.156 | 0.754 |
718 | 2020-01-28 | Orange | California | 6059 | 1 | 0.0 | 6059 | 0.023 | 0.021 | 0.046 | 0.156 | 0.754 |
719 | 2020-01-29 | Orange | California | 6059 | 1 | 0.0 | 6059 | 0.023 | 0.021 | 0.046 | 0.156 | 0.754 |
# drop uncessary columns
counties_mask_df.drop(['COUNTYFP', 'fips', 'date', 'cases', 'deaths', 'state'],
axis = 1, inplace = True)
counties_mask_df.head()
county | NEVER | RARELY | SOMETIMES | FREQUENTLY | ALWAYS | |
---|---|---|---|---|---|---|
715 | Orange | 0.023 | 0.021 | 0.046 | 0.156 | 0.754 |
716 | Orange | 0.023 | 0.021 | 0.046 | 0.156 | 0.754 |
717 | Orange | 0.023 | 0.021 | 0.046 | 0.156 | 0.754 |
718 | Orange | 0.023 | 0.021 | 0.046 | 0.156 | 0.754 |
719 | Orange | 0.023 | 0.021 | 0.046 | 0.156 | 0.754 |
# only keep data mask data for one county each
counties_mask_df.drop_duplicates('county', inplace = True)
counties_mask_df.head()
county | NEVER | RARELY | SOMETIMES | FREQUENTLY | ALWAYS | |
---|---|---|---|---|---|---|
715 | Orange | 0.023 | 0.021 | 0.046 | 0.156 | 0.754 |
1424 | Los Angeles | 0.021 | 0.013 | 0.049 | 0.131 | 0.786 |
1778 | Santa Clara | 0.015 | 0.014 | 0.040 | 0.168 | 0.764 |
2475 | San Francisco | 0.017 | 0.011 | 0.035 | 0.121 | 0.817 |
3166 | San Diego | 0.017 | 0.023 | 0.034 | 0.126 | 0.800 |
counties_mask_df.reset_index(drop = True, inplace = True)
counties_mask_df.head()
county | NEVER | RARELY | SOMETIMES | FREQUENTLY | ALWAYS | |
---|---|---|---|---|---|---|
0 | Orange | 0.023 | 0.021 | 0.046 | 0.156 | 0.754 |
1 | Los Angeles | 0.021 | 0.013 | 0.049 | 0.131 | 0.786 |
2 | Santa Clara | 0.015 | 0.014 | 0.040 | 0.168 | 0.764 |
3 | San Francisco | 0.017 | 0.011 | 0.035 | 0.121 | 0.817 |
4 | San Diego | 0.017 | 0.023 | 0.034 | 0.126 | 0.800 |
# read in cases data
cases_df = pd.read_csv('../data/statewide_cases.csv')
cases_df.head()
county | totalcountconfirmed | totalcountdeaths | newcountconfirmed | newcountdeaths | date | |
---|---|---|---|---|---|---|
0 | Santa Clara | 151.0 | 6.0 | 151 | 6 | 2020-03-18 |
1 | Santa Clara | 183.0 | 8.0 | 32 | 2 | 2020-03-19 |
2 | Santa Clara | 246.0 | 8.0 | 63 | 0 | 2020-03-20 |
3 | Santa Clara | 269.0 | 10.0 | 23 | 2 | 2020-03-21 |
4 | Santa Clara | 284.0 | 13.0 | 15 | 3 | 2020-03-22 |
# check for null values
cases_df.isnull().sum()
county 0 totalcountconfirmed 3 totalcountdeaths 2 newcountconfirmed 0 newcountdeaths 0 date 0 dtype: int64
cases_df[cases_df['totalcountconfirmed'].isnull()]
county | totalcountconfirmed | totalcountdeaths | newcountconfirmed | newcountdeaths | date | |
---|---|---|---|---|---|---|
1263 | Sierra | NaN | NaN | 0 | 0 | 2020-03-29 |
9388 | Lake | NaN | NaN | 0 | 0 | 2020-03-29 |
11631 | Out Of Country | NaN | 0.0 | 0 | 0 | 2020-07-01 |
# as null values are for early on during the pandemic we will impute zeros
cases_df.fillna(0, inplace = True)
# merge mask use with cases
cases_masks_df = cases_df.merge(counties_mask_df, how = 'inner', left_on = 'county',
right_on = 'county')
cases_masks_df.head()
county | totalcountconfirmed | totalcountdeaths | newcountconfirmed | newcountdeaths | date | NEVER | RARELY | SOMETIMES | FREQUENTLY | ALWAYS | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | Santa Clara | 151.0 | 6.0 | 151 | 6 | 2020-03-18 | 0.015 | 0.014 | 0.04 | 0.168 | 0.764 |
1 | Santa Clara | 183.0 | 8.0 | 32 | 2 | 2020-03-19 | 0.015 | 0.014 | 0.04 | 0.168 | 0.764 |
2 | Santa Clara | 246.0 | 8.0 | 63 | 0 | 2020-03-20 | 0.015 | 0.014 | 0.04 | 0.168 | 0.764 |
3 | Santa Clara | 269.0 | 10.0 | 23 | 2 | 2020-03-21 | 0.015 | 0.014 | 0.04 | 0.168 | 0.764 |
4 | Santa Clara | 284.0 | 13.0 | 15 | 3 | 2020-03-22 | 0.015 | 0.014 | 0.04 | 0.168 | 0.764 |
cases_masks_df.to_csv('../clean_data/cases_with_mask_use.csv')
# get county population data
county_pop_df = pd.read_csv('../data/archive/co-est2019-annres-06.csv', skiprows = range(3))
county_pop_df.head()
Unnamed: 0 | Census | Estimates Base | 2010 | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | 2019 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | California | 37,253,956 | 37,254,519 | 37,319,502 | 37,638,369 | 37,948,800 | 38,260,787 | 38,596,972 | 38,918,045 | 39,167,117 | 39,358,497 | 39,461,588 | 39,512,223 |
1 | .Alameda County, California | 1,510,271 | 1,510,258 | 1,512,986 | 1,530,915 | 1,553,764 | 1,579,593 | 1,607,792 | 1,634,538 | 1,650,950 | 1,660,196 | 1,666,756 | 1,671,329 |
2 | .Alpine County, California | 1,175 | 1,175 | 1,161 | 1,093 | 1,110 | 1,128 | 1,080 | 1,077 | 1,047 | 1,111 | 1,089 | 1,129 |
3 | .Amador County, California | 38,091 | 38,091 | 37,886 | 37,543 | 37,104 | 36,620 | 36,726 | 37,031 | 37,429 | 38,529 | 39,405 | 39,752 |
4 | .Butte County, California | 220,000 | 220,005 | 219,949 | 219,975 | 220,869 | 221,641 | 223,516 | 224,631 | 226,231 | 228,696 | 230,339 | 219,186 |
# drop unncessary rows
county_pop_df.drop(['Census', 'Estimates Base', '2010', '2011', '2012', '2013', '2014', '2015',
'2016', '2017', '2018'], axis = 1, inplace = True)
county_pop_df.head()
Unnamed: 0 | 2019 | |
---|---|---|
0 | California | 39,512,223 |
1 | .Alameda County, California | 1,671,329 |
2 | .Alpine County, California | 1,129 |
3 | .Amador County, California | 39,752 |
4 | .Butte County, California | 219,186 |
# clean up county column
county_pop_df['Unnamed: 0'].replace(' County, California', '', inplace = True, regex = True)
county_pop_df['Unnamed: 0'].replace('\.', '', inplace = True, regex = True)
county_pop_df.drop(index = 0, inplace = True)
county_pop_df.head()
Unnamed: 0 | 2019 | |
---|---|---|
1 | Alameda | 1,671,329 |
2 | Alpine | 1,129 |
3 | Amador | 39,752 |
4 | Butte | 219,186 |
5 | Calaveras | 45,905 |
# join population with rest of data
cases_masks_pop_df = cases_masks_df.merge(county_pop_df, how = 'inner', left_on = 'county',
right_on = 'Unnamed: 0')
cases_masks_pop_df.head()
county | totalcountconfirmed | totalcountdeaths | newcountconfirmed | newcountdeaths | date | NEVER | RARELY | SOMETIMES | FREQUENTLY | ALWAYS | Unnamed: 0 | 2019 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Santa Clara | 151.0 | 6.0 | 151 | 6 | 2020-03-18 | 0.015 | 0.014 | 0.04 | 0.168 | 0.764 | Santa Clara | 1,927,852 |
1 | Santa Clara | 183.0 | 8.0 | 32 | 2 | 2020-03-19 | 0.015 | 0.014 | 0.04 | 0.168 | 0.764 | Santa Clara | 1,927,852 |
2 | Santa Clara | 246.0 | 8.0 | 63 | 0 | 2020-03-20 | 0.015 | 0.014 | 0.04 | 0.168 | 0.764 | Santa Clara | 1,927,852 |
3 | Santa Clara | 269.0 | 10.0 | 23 | 2 | 2020-03-21 | 0.015 | 0.014 | 0.04 | 0.168 | 0.764 | Santa Clara | 1,927,852 |
4 | Santa Clara | 284.0 | 13.0 | 15 | 3 | 2020-03-22 | 0.015 | 0.014 | 0.04 | 0.168 | 0.764 | Santa Clara | 1,927,852 |
# clean up final dataframe
cases_masks_pop_df.drop('Unnamed: 0', axis = 1, inplace = True)
cases_masks_pop_df.rename({'2019': 'population'}, axis = 1, inplace = True)
cases_masks_pop_df['population'].replace(',', '', inplace = True, regex = True)
cases_masks_pop_df['population'] = cases_masks_pop_df['population'].astype(int)
cases_masks_pop_df.dtypes
county object totalcountconfirmed float64 totalcountdeaths float64 newcountconfirmed int64 newcountdeaths int64 date object NEVER float64 RARELY float64 SOMETIMES float64 FREQUENTLY float64 ALWAYS float64 population int64 dtype: object
cases_masks_pop_df.head()
county | totalcountconfirmed | totalcountdeaths | newcountconfirmed | newcountdeaths | date | NEVER | RARELY | SOMETIMES | FREQUENTLY | ALWAYS | population | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Santa Clara | 151.0 | 6.0 | 151 | 6 | 2020-03-18 | 0.015 | 0.014 | 0.04 | 0.168 | 0.764 | 1927852 |
1 | Santa Clara | 183.0 | 8.0 | 32 | 2 | 2020-03-19 | 0.015 | 0.014 | 0.04 | 0.168 | 0.764 | 1927852 |
2 | Santa Clara | 246.0 | 8.0 | 63 | 0 | 2020-03-20 | 0.015 | 0.014 | 0.04 | 0.168 | 0.764 | 1927852 |
3 | Santa Clara | 269.0 | 10.0 | 23 | 2 | 2020-03-21 | 0.015 | 0.014 | 0.04 | 0.168 | 0.764 | 1927852 |
4 | Santa Clara | 284.0 | 13.0 | 15 | 3 | 2020-03-22 | 0.015 | 0.014 | 0.04 | 0.168 | 0.764 | 1927852 |
# save cleaned data
cases_masks_pop_df.to_csv('../clean_data/cases_with_mask_use.csv', index = False)
vaccine_moderna.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 66 entries, 0 to 65 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Jurisdiction 66 non-null object 1 HHS Region 64 non-null object 2 Doses allocated week of 12/21 64 non-null object 3 Second Dose Shipment (28 days later) week of 12/21 58 non-null object 4 Doses allocated week of 12/28 57 non-null object 5 Second Dose Shipment (28 days later) week of 12/28 57 non-null object 6 Doses allocated for distribution week of 01/04 64 non-null object 7 Second dose shipment for distribution (28 days later) week of 01/04 58 non-null object 8 Doses allocated for distribution week of 01/10 57 non-null object 9 Second dose shipment for distribution (28 days later) week of 01/10 57 non-null object 10 Doses allocated for distribution week of 01/18 57 non-null object 11 Second dose shipment for distribution (28 days later) week of 01/18 57 non-null object 12 Doses allocated for distribution week of 01/25 57 non-null object 13 Second dose shipment for distribution (28 days later) week of 01/25 57 non-null object 14 Doses allocated for distribution week of 02/01 64 non-null object 15 Second dose shipment for distribution (28 days later) week of 02/01 58 non-null object 16 Total Moderna Allocation "First Dose" Shipments 65 non-null object 17 Total Allocation Moderna"Second Dose" Shipments 65 non-null object dtypes: object(18) memory usage: 9.4+ KB
vaccine_moderna.head()
Jurisdiction | HHS Region | Doses allocated week of 12/21 | Second Dose Shipment (28 days later) week of 12/21 | Doses allocated week of 12/28 | Second Dose Shipment (28 days later) week of 12/28 | Doses allocated for distribution week of 01/04 | Second dose shipment for distribution (28 days later) week of 01/04 | Doses allocated for distribution week of 01/10 | Second dose shipment for distribution (28 days later) week of 01/10 | Doses allocated for distribution week of 01/18 | Second dose shipment for distribution (28 days later) week of 01/18 | Doses allocated for distribution week of 01/25 | Second dose shipment for distribution (28 days later) week of 01/25 | Doses allocated for distribution week of 02/01 | Second dose shipment for distribution (28 days later) week of 02/01 | Total Moderna Allocation "First Dose" Shipments | Total Allocation Moderna"Second Dose" Shipments | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Connecticut | Region 1 | 63,300 | 63,300 | 21,900 | 21,900 | 21,800 | 21,800 | 21,900 | 21,900 | 23,000 | 23,000 | 23,000 | 23,000 | 30,300 | 30,300 | 205,200 | 205,200 |
1 | Maine | Region 1 | 24,200 | 24,200 | 8,400 | 8,400 | 8,300 | 8,300 | 8,400 | 8,400 | 8,800 | 8,800 | 8,800 | 8,800 | 11,600 | 11,600 | 78,500 | 78,500 |
2 | Massachusetts | Region 1 | 121,900 | 121,900 | 42,100 | 42,100 | 42,000 | 42,000 | 42,100 | 42,100 | 44,300 | 44,300 | 44,300 | 44,300 | 58,400 | 58,400 | 395,100 | 395,100 |
3 | New Hampshire | Region 1 | 24,200 | 24,200 | 8,400 | 8,400 | 8,400 | 8,400 | 8,400 | 8,400 | 8,800 | 8,800 | 8,800 | 8,800 | 11,600 | 11,600 | 78,600 | 78,600 |
4 | Rhode Island | Region 1 | 19,000 | 19,000 | 6,600 | 6,600 | 6,600 | 6,600 | 6,600 | 6,600 | 6,900 | 6,900 | 6,900 | 6,900 | 9,100 | 9,100 | 61,700 | 61,700 |
vaccine_moderna.columns
Index(['Jurisdiction', 'HHS Region', 'Doses allocated week of 12/21', 'Second Dose Shipment (28 days later) week of 12/21', 'Doses allocated week of 12/28', 'Second Dose Shipment (28 days later) week of 12/28', 'Doses allocated for distribution week of 01/04', 'Second dose shipment for distribution (28 days later) week of 01/04', 'Doses allocated for distribution week of 01/10', 'Second dose shipment for distribution (28 days later) week of 01/10', 'Doses allocated for distribution week of 01/18', 'Second dose shipment for distribution (28 days later) week of 01/18', 'Doses allocated for distribution week of 01/25', 'Second dose shipment for distribution (28 days later) week of 01/25', 'Doses allocated for distribution week of 02/01', 'Second dose shipment for distribution (28 days later) week of 02/01', 'Total Moderna Allocation "First Dose" Shipments', 'Total Allocation Moderna"Second Dose" Shipments'], dtype='object')
vaccine_moderna.shape
(66, 18)
total_first_not_equal_second=vaccine_moderna[vaccine_moderna["Total Moderna Allocation \"First Dose\" Shipments"]
!=vaccine_moderna["Total Allocation Moderna\"Second Dose\" Shipments"]]
total_first_not_equal_second
Jurisdiction | HHS Region | Doses allocated week of 12/21 | Second Dose Shipment (28 days later) week of 12/21 | Doses allocated week of 12/28 | Second Dose Shipment (28 days later) week of 12/28 | Doses allocated for distribution week of 01/04 | Second dose shipment for distribution (28 days later) week of 01/04 | Doses allocated for distribution week of 01/10 | Second dose shipment for distribution (28 days later) week of 01/10 | Doses allocated for distribution week of 01/18 | Second dose shipment for distribution (28 days later) week of 01/18 | Doses allocated for distribution week of 01/25 | Second dose shipment for distribution (28 days later) week of 01/25 | Doses allocated for distribution week of 02/01 | Second dose shipment for distribution (28 days later) week of 02/01 | Total Moderna Allocation "First Dose" Shipments | Total Allocation Moderna"Second Dose" Shipments | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
54 | American Samoa** | Region 9 | 2,400 | NaN | NaN | NaN | 2,400 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 3,200 | NaN | 8,000 | 0 |
55 | Guam** | Region 9 | 7,600 | NaN | NaN | NaN | 7,200 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 9,200 | NaN | 24,000 | 0 |
56 | Marshall Islands* | Region 9 | 7,200 | NaN | NaN | NaN | 6,400 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 6,600 | NaN | 20,200 | 0 |
57 | Micronesia* | Region 9 | 9,800 | NaN | NaN | NaN | 9,600 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 9,400 | NaN | 28,800 | 0 |
58 | Mariana Islands** | Region 9 | 2,600 | NaN | NaN | NaN | 2,400 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 3,200 | NaN | 8,200 | 0 |
59 | Palau* | Region 9 | 2,800 | NaN | NaN | NaN | 3,200 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2,800 | NaN | 8,800 | 0 |
64 | Federal Entities **** | NaN | 298,200 | 289,200 | 100,000 | 100,000 | 108,400 | 99,600 | 99,900 | 99,900 | 105,000 | 105,000 | 105,000 | 105,000 | 149,400 | 138,600 | 965,900 | 937,300 |
65 | Total | NaN | 5,990,000 | 5,948,600 | 2,047,300 | 2,047,300 | 2,104,400 | 2,064,400 | 2,047,200 | 2,047,200 | 2,150,000 | 2,150,000 | 2,150,000 | 2,150,000 | 2,913,400 | 2,868,200 | NaN | NaN |
# drop rows where first total doses is not equal to the second total doses
vaccine_moderna.drop(total_first_not_equal_second.index.tolist(),inplace=True)
# drop all the second dose columns, since they're all equal to the first dose
#https://stackoverflow.com/questions/19071199/drop-columns-whose-name-contains-a-specific-string-from-pandas-dataframe
vaccine_moderna=vaccine_moderna[vaccine_moderna.columns.drop
(list(vaccine_moderna.filter(regex='Second')))]
def remove_punctuation(x):
try:
x=x.str.replace('[^\w\s]','')
except:
pass
return x
vaccine_moderna=vaccine_moderna.apply(remove_punctuation)
# show rows with Nan value
vaccine_moderna[vaccine_moderna.isna().any(axis=1)]
Jurisdiction | HHS Region | Doses allocated week of 12/21 | Doses allocated week of 12/28 | Doses allocated for distribution week of 01/04 | Doses allocated for distribution week of 01/10 | Doses allocated for distribution week of 01/18 | Doses allocated for distribution week of 01/25 | Doses allocated for distribution week of 02/01 | Total Moderna Allocation "First Dose" Shipments | |
---|---|---|---|---|---|---|---|---|---|---|
38 | San Antonio | Region 6 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
39 | Houston | Region 6 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
60 | Alaska | Region 10 | 26800 | NaN | 25600 | NaN | NaN | NaN | 32300 | 84700 |
vaccine_moderna=vaccine_moderna.drop([38,39])
# fill out the rest of NaN value with 0
vaccine_moderna.fillna(0,inplace=True)
vaccine_moderna.shape
(56, 10)
vaccine_moderna.isnull().sum()
Jurisdiction 0 HHS Region 0 Doses allocated week of 12/21 0 Doses allocated week of 12/28 0 Doses allocated for distribution week of 01/04 0 Doses allocated for distribution week of 01/10 0 Doses allocated for distribution week of 01/18 0 Doses allocated for distribution week of 01/25 0 Doses allocated for distribution week of 02/01 0 Total Moderna Allocation "First Dose" Shipments 0 dtype: int64
vaccine_moderna[['Doses allocated week of 12/21',
'Doses allocated week of 12/28',
'Doses allocated for distribution week of 01/04',
'Doses allocated for distribution week of 01/10',
'Doses allocated for distribution week of 01/18',
'Doses allocated for distribution week of 01/25',
'Doses allocated for distribution week of 02/01',
'Total Moderna Allocation "First Dose" Shipments']]\
=vaccine_moderna[['Doses allocated week of 12/21',
'Doses allocated week of 12/28',
'Doses allocated for distribution week of 01/04',
'Doses allocated for distribution week of 01/10',
'Doses allocated for distribution week of 01/18',
'Doses allocated for distribution week of 01/25',
'Doses allocated for distribution week of 02/01',
'Total Moderna Allocation "First Dose" Shipments']].astype('int64')
vaccine_moderna.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 56 entries, 0 to 63 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Jurisdiction 56 non-null object 1 HHS Region 56 non-null object 2 Doses allocated week of 12/21 56 non-null int64 3 Doses allocated week of 12/28 56 non-null int64 4 Doses allocated for distribution week of 01/04 56 non-null int64 5 Doses allocated for distribution week of 01/10 56 non-null int64 6 Doses allocated for distribution week of 01/18 56 non-null int64 7 Doses allocated for distribution week of 01/25 56 non-null int64 8 Doses allocated for distribution week of 02/01 56 non-null int64 9 Total Moderna Allocation "First Dose" Shipments 56 non-null int64 dtypes: int64(8), object(2) memory usage: 4.8+ KB
# rename the Total Allocation column name for concat later
vaccine_moderna=vaccine_moderna.rename(columns={'Total Moderna Allocation "First Dose" Shipments'
:'Total Allocation "First Dose" Shipments'})
vaccine_moderna.head(1)
Jurisdiction | HHS Region | Doses allocated week of 12/21 | Doses allocated week of 12/28 | Doses allocated for distribution week of 01/04 | Doses allocated for distribution week of 01/10 | Doses allocated for distribution week of 01/18 | Doses allocated for distribution week of 01/25 | Doses allocated for distribution week of 02/01 | Total Allocation "First Dose" Shipments | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Connecticut | Region 1 | 63300 | 21900 | 21800 | 21900 | 23000 | 23000 | 30300 | 205200 |
vaccine_pfizer=pd.read_csv('../data/COVID-19_Vaccine_Distribution_Allocations_by_Jurisdiction_-_Pfizer.csv')
vaccine_pfizer.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 66 entries, 0 to 65 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Jurisdiction 66 non-null object 1 HHS Region 64 non-null object 2 Doses allocated week of 12/14 61 non-null object 3 Second Dose Shipment (21 days later) week of 12/14 58 non-null object 4 Doses allocated week of 12/21 57 non-null object 5 Second Dose Shipment (21 days later) week of 12/21 57 non-null object 6 Doses allocated week of 12/28 57 non-null object 7 Second Dose Shipment (21 days later) week of 12/28 57 non-null object 8 Doses allocated for distribution week of 01/04 61 non-null object 9 Second dose shipment for distribution (21 days later) week of 01/04 58 non-null object 10 Doses allocated for distribution week of 01/10 57 non-null object 11 Second dose shipment for distribution (21 days later) week of 01/10 57 non-null object 12 Doses allocated for distribution week of 01/18 57 non-null object 13 Second dose shipment for distribution (21 days later) week of 01/18 57 non-null object 14 Doses allocated for distribution week of 01/25 57 non-null object 15 Second dose shipment for distribution (21 days later) week of 01/25 57 non-null object 16 Doses allocated for distribution week of 02/01 61 non-null object 17 Second dose shipment for distribution (21 days later) week of 02/21 58 non-null object 18 Total Pfizer Allocation "First Dose" Shipments 65 non-null object 19 Total Allocation Pfizer "Second Dose" Shipments 65 non-null object dtypes: object(20) memory usage: 10.4+ KB
vaccine_pfizer.shape
(66, 20)
vaccine_pfizer.tail()
Jurisdiction | HHS Region | Doses allocated week of 12/14 | Second Dose Shipment (21 days later) week of 12/14 | Doses allocated week of 12/21 | Second Dose Shipment (21 days later) week of 12/21 | Doses allocated week of 12/28 | Second Dose Shipment (21 days later) week of 12/28 | Doses allocated for distribution week of 01/04 | Second dose shipment for distribution (21 days later) week of 01/04 | Doses allocated for distribution week of 01/10 | Second dose shipment for distribution (21 days later) week of 01/10 | Doses allocated for distribution week of 01/18 | Second dose shipment for distribution (21 days later) week of 01/18 | Doses allocated for distribution week of 01/25 | Second dose shipment for distribution (21 days later) week of 01/25 | Doses allocated for distribution week of 02/01 | Second dose shipment for distribution (21 days later) week of 02/21 | Total Pfizer Allocation "First Dose" Shipments | Total Allocation Pfizer "Second Dose" Shipments | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
61 | Idaho | Region 10 | 13,650 | 13,650 | 9,750 | 9,750 | 12,675 | 12,675 | 9,750 | 9,750 | 9,750 | 9,750 | 10,725 | 10,725 | 10,725 | 10,725 | 10,725 | 10,725 | 87,750 | 87,750 |
62 | Oregon*** | Region 10 | 35,100 | 35,100 | 25,350 | 25,350 | 32,175 | 32,175 | 25,350 | 25,350 | 25,350 | 25,350 | 26,325 | 26,325 | 26,325 | 26,325 | 26,325 | 26,325 | 222,300 | 222,300 |
63 | Washington*** | Region 10 | 62,400 | 62,400 | 44,850 | 44,850 | 57,525 | 57,525 | 44,850 | 44,850 | 57,525 | 57,525 | 46,800 | 46,800 | 46,800 | 46,800 | 45,825 | 45,825 | 406,575 | 406,575 |
64 | Federal Entities **** | NaN | 156000 | 142350 | 102,375 | 102,375 | 128,700 | 128,700 | 117,000 | 101,400 | 102,375 | 102,375 | 105,300 | 105,300 | 105,300 | 105,300 | 119,925 | 104,325 | 936,975 | 892,125 |
65 | Total | NaN | 2,980,575 | 2,943,525 | 2,071,875 | 2,071,875 | 2,643,225 | 2,643,225 | 2,131,350 | 2,092,350 | 2,084,550 | 2,084,550 | 2,149,875 | 2,149,875 | 2,149,875 | 2,149,875 | 2,204,475 | 2,165,475 | NaN | NaN |
vaccine_pfizer.columns
Index(['Jurisdiction', 'HHS Region', 'Doses allocated week of 12/14', 'Second Dose Shipment (21 days later) week of 12/14', 'Doses allocated week of 12/21', 'Second Dose Shipment (21 days later) week of 12/21', 'Doses allocated week of 12/28', 'Second Dose Shipment (21 days later) week of 12/28', 'Doses allocated for distribution week of 01/04', 'Second dose shipment for distribution (21 days later) week of 01/04', 'Doses allocated for distribution week of 01/10', 'Second dose shipment for distribution (21 days later) week of 01/10', 'Doses allocated for distribution week of 01/18', 'Second dose shipment for distribution (21 days later) week of 01/18', 'Doses allocated for distribution week of 01/25', 'Second dose shipment for distribution (21 days later) week of 01/25', 'Doses allocated for distribution week of 02/01', 'Second dose shipment for distribution (21 days later) week of 02/21', 'Total Pfizer Allocation "First Dose" Shipments', 'Total Allocation Pfizer "Second Dose" Shipments'], dtype='object')
total_first_not_equal_second_p=vaccine_pfizer[vaccine_pfizer["Total Pfizer Allocation \"First Dose\" Shipments"]
!=vaccine_pfizer["Total Allocation Pfizer \"Second Dose\" Shipments"]]
total_first_not_equal_second_p
Jurisdiction | HHS Region | Doses allocated week of 12/14 | Second Dose Shipment (21 days later) week of 12/14 | Doses allocated week of 12/21 | Second Dose Shipment (21 days later) week of 12/21 | Doses allocated week of 12/28 | Second Dose Shipment (21 days later) week of 12/28 | Doses allocated for distribution week of 01/04 | Second dose shipment for distribution (21 days later) week of 01/04 | Doses allocated for distribution week of 01/10 | Second dose shipment for distribution (21 days later) week of 01/10 | Doses allocated for distribution week of 01/18 | Second dose shipment for distribution (21 days later) week of 01/18 | Doses allocated for distribution week of 01/25 | Second dose shipment for distribution (21 days later) week of 01/25 | Doses allocated for distribution week of 02/01 | Second dose shipment for distribution (21 days later) week of 02/21 | Total Pfizer Allocation "First Dose" Shipments | Total Allocation Pfizer "Second Dose" Shipments | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
54 | American Samoa** | Region 9 | 5,850 | NaN | NaN | NaN | NaN | NaN | 7,800 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 7,800 | NaN | 21,450 | 0 |
55 | Guam** | Region 9 | 11,700 | NaN | NaN | NaN | NaN | NaN | 7,800 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 7,800 | NaN | 27,300 | 0 |
58 | Mariana Islands** | Region 9 | 5,850 | NaN | NaN | NaN | NaN | NaN | 7,800 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 7,800 | NaN | 21,450 | 0 |
64 | Federal Entities **** | NaN | 156000 | 142350 | 102,375 | 102,375 | 128,700 | 128,700 | 117,000 | 101,400 | 102,375 | 102,375 | 105,300 | 105,300 | 105,300 | 105,300 | 119,925 | 104,325 | 936,975 | 892,125 |
65 | Total | NaN | 2,980,575 | 2,943,525 | 2,071,875 | 2,071,875 | 2,643,225 | 2,643,225 | 2,131,350 | 2,092,350 | 2,084,550 | 2,084,550 | 2,149,875 | 2,149,875 | 2,149,875 | 2,149,875 | 2,204,475 | 2,165,475 | NaN | NaN |
# drop rows where first total doses isn't equal to the second total doses
vaccine_pfizer.drop(total_first_not_equal_second_p.index.tolist(),inplace=True)
# drop all the second dose columns, since they're all equal to the first dose
vaccine_pfizer=vaccine_pfizer[vaccine_pfizer.columns.drop
(list(vaccine_pfizer.filter(regex='Second')))]
vaccine_pfizer=vaccine_pfizer.apply(remove_punctuation)
# show rows with Nan value
vaccine_pfizer[vaccine_pfizer.isna().any(axis=1)]
Jurisdiction | HHS Region | Doses allocated week of 12/14 | Doses allocated week of 12/21 | Doses allocated week of 12/28 | Doses allocated for distribution week of 01/04 | Doses allocated for distribution week of 01/10 | Doses allocated for distribution week of 01/18 | Doses allocated for distribution week of 01/25 | Doses allocated for distribution week of 02/01 | Total Pfizer Allocation "First Dose" Shipments | |
---|---|---|---|---|---|---|---|---|---|---|---|
38 | San Antonio | Region 6 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
39 | Houston | Region 6 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
56 | Marshall Islands | Region 9 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
57 | Micronesia | Region 9 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
59 | Palau | Region 9 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
60 | Alaska | Region 10 | 35100 | NaN | NaN | 27300 | NaN | NaN | NaN | 27300 | 89700 |
# drop everything except Alaska
vaccine_pfizer=vaccine_pfizer.drop([38,39,56,57,59])
# fill out the rest of NaN value with 0
vaccine_pfizer.fillna(0,inplace=True)
vaccine_pfizer.shape
(56, 11)
vaccine_pfizer.isnull().sum()
Jurisdiction 0 HHS Region 0 Doses allocated week of 12/14 0 Doses allocated week of 12/21 0 Doses allocated week of 12/28 0 Doses allocated for distribution week of 01/04 0 Doses allocated for distribution week of 01/10 0 Doses allocated for distribution week of 01/18 0 Doses allocated for distribution week of 01/25 0 Doses allocated for distribution week of 02/01 0 Total Pfizer Allocation "First Dose" Shipments 0 dtype: int64
vaccine_pfizer.columns
Index(['Jurisdiction', 'HHS Region', 'Doses allocated week of 12/14', 'Doses allocated week of 12/21', 'Doses allocated week of 12/28', 'Doses allocated for distribution week of 01/04', 'Doses allocated for distribution week of 01/10', 'Doses allocated for distribution week of 01/18', 'Doses allocated for distribution week of 01/25', 'Doses allocated for distribution week of 02/01', 'Total Pfizer Allocation "First Dose" Shipments'], dtype='object')
vaccine_pfizer[['Doses allocated week of 12/14',
'Doses allocated week of 12/21','Doses allocated week of 12/28',
'Doses allocated for distribution week of 01/04',
'Doses allocated for distribution week of 01/10',
'Doses allocated for distribution week of 01/18',
'Doses allocated for distribution week of 01/25',
'Doses allocated for distribution week of 02/01',
'Total Pfizer Allocation "First Dose" Shipments']]\
=vaccine_pfizer[['Doses allocated week of 12/14',
'Doses allocated week of 12/21','Doses allocated week of 12/28',
'Doses allocated for distribution week of 01/04',
'Doses allocated for distribution week of 01/10',
'Doses allocated for distribution week of 01/18',
'Doses allocated for distribution week of 01/25',
'Doses allocated for distribution week of 02/01',
'Total Pfizer Allocation "First Dose" Shipments']].astype('int64')
vaccine_pfizer.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 56 entries, 0 to 63 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Jurisdiction 56 non-null object 1 HHS Region 56 non-null object 2 Doses allocated week of 12/14 56 non-null int64 3 Doses allocated week of 12/21 56 non-null int64 4 Doses allocated week of 12/28 56 non-null int64 5 Doses allocated for distribution week of 01/04 56 non-null int64 6 Doses allocated for distribution week of 01/10 56 non-null int64 7 Doses allocated for distribution week of 01/18 56 non-null int64 8 Doses allocated for distribution week of 01/25 56 non-null int64 9 Doses allocated for distribution week of 02/01 56 non-null int64 10 Total Pfizer Allocation "First Dose" Shipments 56 non-null int64 dtypes: int64(9), object(2) memory usage: 5.2+ KB
# rename the Total Allocation column name for concat later
vaccine_pfizer=vaccine_pfizer.rename(columns={'Total Pfizer Allocation "First Dose" Shipments':
'Total Allocation "First Dose" Shipments'})
vaccine_pfizer.head(1)
Jurisdiction | HHS Region | Doses allocated week of 12/14 | Doses allocated week of 12/21 | Doses allocated week of 12/28 | Doses allocated for distribution week of 01/04 | Doses allocated for distribution week of 01/10 | Doses allocated for distribution week of 01/18 | Doses allocated for distribution week of 01/25 | Doses allocated for distribution week of 02/01 | Total Allocation "First Dose" Shipments | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | Connecticut | Region 1 | 31200 | 22425 | 28275 | 22425 | 22425 | 23400 | 23400 | 23400 | 196950 |
vaccine=pd.concat([vaccine_moderna,vaccine_pfizer]).groupby(['Jurisdiction','HHS Region']).sum().reset_index()
vaccine.head()
Jurisdiction | HHS Region | Doses allocated week of 12/21 | Doses allocated week of 12/28 | Doses allocated for distribution week of 01/04 | Doses allocated for distribution week of 01/10 | Doses allocated for distribution week of 01/18 | Doses allocated for distribution week of 01/25 | Doses allocated for distribution week of 02/01 | Total Allocation "First Dose" Shipments | Doses allocated week of 12/14 | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | Alabama | Region 4 | 113550 | 67125 | 58250 | 58350 | 60825 | 60825 | 70625 | 530500 | 40950.0 |
1 | Alaska | Region 10 | 26800 | 0 | 52900 | 0 | 0 | 0 | 59600 | 174400 | 35100.0 |
2 | Arizona | Region 9 | 161325 | 94925 | 83025 | 83225 | 86300 | 86300 | 100100 | 753700 | 58500.0 |
3 | Arkansas | Region 6 | 69625 | 41100 | 36125 | 36225 | 37125 | 37125 | 43025 | 325700 | 25350.0 |
4 | California | Region 9 | 905625 | 529675 | 463450 | 465325 | 485800 | 485800 | 562825 | 4226100 | 327600.0 |
# rename all the columns name for future convenience
vaccine=vaccine.rename(columns={
'Jurisdiction':'jurisdiction',
'HHS Region':'hhs_region',
'Doses allocated week of 12/21':'doses_allocated_12_21',
'Doses allocated week of 12/28':'doses_allocated_12_28',
'Doses allocated for distribution week of 01/04':'doses_allocated_01_04',
'Doses allocated for distribution week of 01/10':'doses_allocated_01_10',
'Doses allocated for distribution week of 01/18':'doses_allocated_01_18',
'Doses allocated for distribution week of 01/25':'doses_allocated_01_25',
'Doses allocated for distribution week of 02/01':'doses_allocated_02_01',
'Total Allocation "First Dose" Shipments':'total_first_allocation',
'Doses allocated week of 12/14':'doses_allocated_12_14'
})
vaccine.columns
Index(['jurisdiction', 'hhs_region', 'doses_allocated_12_21', 'doses_allocated_12_28', 'doses_allocated_01_04', 'doses_allocated_01_10', 'doses_allocated_01_18', 'doses_allocated_01_25', 'doses_allocated_02_01', 'total_first_allocation', 'doses_allocated_12_14'], dtype='object')
# reset the oder of the columns
vaccine = vaccine.reindex(['jurisdiction','hhs_region','doses_allocated_12_14',
'doses_allocated_12_21','doses_allocated_12_28',
'doses_allocated_01_04','doses_allocated_01_10',
'doses_allocated_01_18','doses_allocated_01_25',
'doses_allocated_02_01','total_first_allocation'],axis=1)
vaccine.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 56 entries, 0 to 55 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 jurisdiction 56 non-null object 1 hhs_region 56 non-null object 2 doses_allocated_12_14 56 non-null float64 3 doses_allocated_12_21 56 non-null int64 4 doses_allocated_12_28 56 non-null int64 5 doses_allocated_01_04 56 non-null int64 6 doses_allocated_01_10 56 non-null int64 7 doses_allocated_01_18 56 non-null int64 8 doses_allocated_01_25 56 non-null int64 9 doses_allocated_02_01 56 non-null int64 10 total_first_allocation 56 non-null int64 dtypes: float64(1), int64(8), object(2) memory usage: 4.9+ KB
# convert the doses_allocated_12_14 from float to interger
vaccine[['doses_allocated_12_14']]=vaccine[['doses_allocated_12_14']].astype('int64')
vaccine.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 56 entries, 0 to 55 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 jurisdiction 56 non-null object 1 hhs_region 56 non-null object 2 doses_allocated_12_14 56 non-null int64 3 doses_allocated_12_21 56 non-null int64 4 doses_allocated_12_28 56 non-null int64 5 doses_allocated_01_04 56 non-null int64 6 doses_allocated_01_10 56 non-null int64 7 doses_allocated_01_18 56 non-null int64 8 doses_allocated_01_25 56 non-null int64 9 doses_allocated_02_01 56 non-null int64 10 total_first_allocation 56 non-null int64 dtypes: int64(9), object(2) memory usage: 4.9+ KB
vaccine.head()
jurisdiction | hhs_region | doses_allocated_12_14 | doses_allocated_12_21 | doses_allocated_12_28 | doses_allocated_01_04 | doses_allocated_01_10 | doses_allocated_01_18 | doses_allocated_01_25 | doses_allocated_02_01 | total_first_allocation | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | Alabama | Region 4 | 40950 | 113550 | 67125 | 58250 | 58350 | 60825 | 60825 | 70625 | 530500 |
1 | Alaska | Region 10 | 35100 | 26800 | 0 | 52900 | 0 | 0 | 0 | 59600 | 174400 |
2 | Arizona | Region 9 | 58500 | 161325 | 94925 | 83025 | 83225 | 86300 | 86300 | 100100 | 753700 |
3 | Arkansas | Region 6 | 25350 | 69625 | 41100 | 36125 | 36225 | 37125 | 37125 | 43025 | 325700 |
4 | California | Region 9 | 327600 | 905625 | 529675 | 463450 | 465325 | 485800 | 485800 | 562825 | 4226100 |
vaccine.to_csv('../data/vaccine_allocation.csv',index=False)
population=pd.read_csv('../data/population_per_state.csv')
population.head()
SUMLEV | REGION | DIVISION | STATE | NAME | CENSUS2010POP | ESTIMATESBASE2010 | POPESTIMATE2010 | POPESTIMATE2011 | POPESTIMATE2012 | ... | RDOMESTICMIG2019 | RNETMIG2011 | RNETMIG2012 | RNETMIG2013 | RNETMIG2014 | RNETMIG2015 | RNETMIG2016 | RNETMIG2017 | RNETMIG2018 | RNETMIG2019 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10 | 0 | 0 | 0 | United States | 308745538 | 308758105 | 309321666 | 311556874 | 313830990 | ... | 0.000000 | 2.493773 | 2.682083 | 2.636187 | 2.921500 | 3.260435 | 3.252788 | 2.871957 | 2.153911 | 1.818059 |
1 | 20 | 1 | 0 | 0 | Northeast Region | 55317240 | 55318443 | 55380134 | 55604223 | 55775216 | ... | -5.254530 | 0.887909 | -0.038355 | -0.469783 | -0.986097 | -2.061965 | -2.490484 | -1.837048 | -2.134447 | -2.859713 |
2 | 20 | 2 | 0 | 0 | Midwest Region | 66927001 | 66929725 | 66974416 | 67157800 | 67336743 | ... | -2.365881 | -0.963930 | -0.973943 | -0.006924 | -0.762969 | -1.388437 | -1.241784 | -0.557370 | -0.922755 | -1.111173 |
3 | 20 | 3 | 0 | 0 | South Region | 114555744 | 114563030 | 114866680 | 116006522 | 117241208 | ... | 3.261349 | 5.130513 | 5.850458 | 5.292073 | 6.161501 | 7.277358 | 7.150074 | 6.198168 | 5.225519 | 5.203720 |
4 | 20 | 4 | 0 | 0 | West Region | 71945553 | 71946907 | 72100436 | 72788329 | 73477823 | ... | 0.614245 | 2.723344 | 3.062896 | 3.162262 | 4.026429 | 4.987285 | 5.261078 | 4.021194 | 3.044951 | 2.312083 |
5 rows × 151 columns
col=[col for col in population.columns if '2019' in col]
col
['POPESTIMATE2019', 'NPOPCHG_2019', 'BIRTHS2019', 'DEATHS2019', 'NATURALINC2019', 'INTERNATIONALMIG2019', 'DOMESTICMIG2019', 'NETMIG2019', 'RESIDUAL2019', 'RBIRTH2019', 'RDEATH2019', 'RNATURALINC2019', 'RINTERNATIONALMIG2019', 'RDOMESTICMIG2019', 'RNETMIG2019']
ppl=population[['NAME','POPESTIMATE2019','NPOPCHG_2019','BIRTHS2019','DEATHS2019',
'NATURALINC2019','INTERNATIONALMIG2019','DOMESTICMIG2019','NETMIG2019','RESIDUAL2019',
'RBIRTH2019','RDEATH2019','RNATURALINC2019','RINTERNATIONALMIG2019','RDOMESTICMIG2019',
'RNETMIG2019']].copy()
ppl.head()
NAME | POPESTIMATE2019 | NPOPCHG_2019 | BIRTHS2019 | DEATHS2019 | NATURALINC2019 | INTERNATIONALMIG2019 | DOMESTICMIG2019 | NETMIG2019 | RESIDUAL2019 | RBIRTH2019 | RDEATH2019 | RNATURALINC2019 | RINTERNATIONALMIG2019 | RDOMESTICMIG2019 | RNETMIG2019 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | United States | 328239523 | 1552022 | 3791712 | 2835038 | 956674 | 595348 | 0 | 595348 | 0 | 11.579037 | 8.657569 | 2.921467 | 1.818059 | 0.000000 | 1.818059 |
1 | Northeast Region | 55982803 | -63817 | 602740 | 505588 | 97152 | 134145 | -294331 | -160186 | -783 | 10.760387 | 9.025986 | 1.734402 | 2.394817 | -5.254530 | -2.859713 |
2 | Midwest Region | 68329004 | 92376 | 792343 | 622854 | 169489 | 85675 | -161549 | -75874 | -1239 | 11.603842 | 9.121680 | 2.482162 | 1.254708 | -2.365881 | -1.111173 |
3 | South Region | 125580448 | 1011015 | 1481244 | 1122130 | 359114 | 242942 | 407913 | 650855 | 1046 | 11.842852 | 8.971661 | 2.871191 | 1.942372 | 3.261349 | 5.203720 |
4 | West Region | 78347268 | 512448 | 915385 | 584466 | 330919 | 132586 | 47967 | 180553 | 976 | 11.722023 | 7.484418 | 4.237605 | 1.697839 | 0.614245 | 2.312083 |
ppl_df=population[['NAME','POPESTIMATE2019']].copy()
ppl_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 57 entries, 0 to 56 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 NAME 57 non-null object 1 POPESTIMATE2019 57 non-null int64 dtypes: int64(1), object(1) memory usage: 1.0+ KB
ppl_df.head(6)
NAME | POPESTIMATE2019 | |
---|---|---|
0 | United States | 328239523 |
1 | Northeast Region | 55982803 |
2 | Midwest Region | 68329004 |
3 | South Region | 125580448 |
4 | West Region | 78347268 |
5 | Alabama | 4903185 |
# drop the total and region population
ppl_df.drop([0,1,2,3,4],inplace=True)
#rename the column name
ppl_df.rename(columns={'NAME':'jurisdiction','POPESTIMATE2019':'2019_population'},inplace=True)
ppl_df.shape
(52, 2)
# set the jurisdiction as index for concat later
ppl_df.set_index('jurisdiction',inplace=True)
vaccine=pd.read_csv('../data/vaccine_allocation.csv')
vaccine.head()
jurisdiction | hhs_region | doses_allocated_12_14 | doses_allocated_12_21 | doses_allocated_12_28 | doses_allocated_01_04 | doses_allocated_01_10 | doses_allocated_01_18 | doses_allocated_01_25 | doses_allocated_02_01 | total_first_allocation | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | Alabama | Region 4 | 40950 | 113550 | 67125 | 58250 | 58350 | 60825 | 60825 | 70625 | 530500 |
1 | Alaska | Region 10 | 35100 | 26800 | 0 | 52900 | 0 | 0 | 0 | 59600 | 174400 |
2 | Arizona | Region 9 | 58500 | 161325 | 94925 | 83025 | 83225 | 86300 | 86300 | 100100 | 753700 |
3 | Arkansas | Region 6 | 25350 | 69625 | 41100 | 36125 | 36225 | 37125 | 37125 | 43025 | 325700 |
4 | California | Region 9 | 327600 | 905625 | 529675 | 463450 | 465325 | 485800 | 485800 | 562825 | 4226100 |
# Merge Illinois row and Chicage row
vaccine.loc[14]+=vaccine.loc[5]
# drop Chicago row
vaccine.drop([5],inplace=True)
#change the cell value from IllinoisChicago to Chicago
vaccine.at[14,'jurisdiction']='Illinois'
# change the cell value from Region 5Region 5 to Region 5
vaccine.at[14,'hhs_region']='Region 5'
# merge New York row and New York City row
vaccine.loc[33]+=vaccine.loc[34]
#drop New York City row
vaccine.drop([34],inplace=True)
# change the cell value from New YorkNew York City to New York
vaccine.at[33,'jurisdiction']='New York'
# change the cell value from Region 2Region 2 to Region 2
vaccine.at[33,'hhs_region']='Region 2'
# merge Pennsylvania row and Philadelphia row
vaccine.loc[40]+=vaccine.loc[41]
# drop Philadelphia row
vaccine.drop([41],inplace=True)
# change the cell value from PennsylvaniaPhiladelphia to Pennsylvania
vaccine.at[40,'jurisdiction']='Pennsylvania'
# change the cell value from Region 3Region 3 to Region 3
vaccine.at[40,'hhs_region']='Region 3'
# drop the US Virgin Islands
vaccine=vaccine[vaccine['jurisdiction'] != 'US Virgin Islands']
vaccine.shape
(52, 11)
# set jurisdiction as the index
vaccine.set_index('jurisdiction',inplace=True)
# concatenate
vaccine_population=pd.concat([ppl_df,vaccine], axis=1,join='inner')
vaccine_population.reset_index(level=0,inplace=True)
vaccine_population.head()
jurisdiction | 2019_population | hhs_region | doses_allocated_12_14 | doses_allocated_12_21 | doses_allocated_12_28 | doses_allocated_01_04 | doses_allocated_01_10 | doses_allocated_01_18 | doses_allocated_01_25 | doses_allocated_02_01 | total_first_allocation | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Alabama | 4903185 | Region 4 | 40950 | 113550 | 67125 | 58250 | 58350 | 60825 | 60825 | 70625 | 530500 |
1 | Alaska | 731545 | Region 10 | 35100 | 26800 | 0 | 52900 | 0 | 0 | 0 | 59600 | 174400 |
2 | Arizona | 7278717 | Region 9 | 58500 | 161325 | 94925 | 83025 | 83225 | 86300 | 86300 | 100100 | 753700 |
3 | Arkansas | 3017804 | Region 6 | 25350 | 69625 | 41100 | 36125 | 36225 | 37125 | 37125 | 43025 | 325700 |
4 | California | 39512223 | Region 9 | 327600 | 905625 | 529675 | 463450 | 465325 | 485800 | 485800 | 562825 | 4226100 |
vaccine_population=vaccine_population.to_csv('../clean_data/vaccine_population.csv',index=False)
#https://stackoverflow.com/questions/3682748/converting-unix-timestamp-string-to-readable-date
datetime.utcfromtimestamp(1610755448).strftime('%Y-%m-%d %H:%M:%S')
'2021-01-16 00:04:08'
# convert unix time to readable timestamp
vac_1=pd.read_csv('../data/vaccinations_1610755448.csv')
vac_1=vac_1[vac_1['state']=='California']
vac_1.insert(1,'date',datetime.utcfromtimestamp(1610755448).strftime('%Y-%m-%d'))
vac_1
state | date | total_doses_distributed | total_doses_administered | doses_distributed_100k | doses_administed_100k | people_1+_dose | doses_1+_100k | people_2_doses | doses_2_100k | |
---|---|---|---|---|---|---|---|---|---|---|
6 | California | 2021-01-16 | 3548575 | 1072959 | 8981.0 | 2716.0 | 865387 | 2190.0 | 204374 | 204374 |
vac_2=pd.read_csv('../data/vaccinations_1610762408.csv')
vac_2=vac_2[vac_2['state']=='California']
vac_2.insert(1,'date',datetime.utcfromtimestamp(1610762408).strftime('%Y-%m-%d'))
vac_2
state | date | total_doses_distributed | total_doses_administered | doses_distributed_100k | doses_administed_100k | people_1+_dose | doses_1+_100k | people_2_doses | doses_2_100k | |
---|---|---|---|---|---|---|---|---|---|---|
6 | California | 2021-01-16 | 3548575 | 1072959 | 8981.0 | 2716.0 | 865387 | 2190.0 | 204374 | 204374 |
vac_3=pd.read_csv('../data/vaccinations_1610866722.csv')
vac_3=vac_3[vac_3['state']=='California']
vac_3.insert(1,'date',datetime.utcfromtimestamp(1610866722).strftime('%Y-%m-%d'))
vac_3
state | date | total_doses_distributed | total_doses_administered | doses_distributed_100k | doses_administed_100k | people_1+_dose | doses_1+_100k | people_2_doses | doses_2_100k | |
---|---|---|---|---|---|---|---|---|---|---|
6 | California | 2021-01-17 | 3548575 | 1072959 | 8981.0 | 2716.0 | 865387 | 2190.0 | 204374 | 204374 |
vac_4=pd.read_csv('../data/vaccinations_1610935207.csv')
vac_4=vac_4[vac_4['state']=='California']
vac_4.insert(1,'date',datetime.utcfromtimestamp(1610935207).strftime('%Y-%m-%d'))
vac_4
state | date | total_doses_distributed | total_doses_administered | doses_distributed_100k | doses_administed_100k | people_1+_dose | doses_1+_100k | people_2_doses | doses_2_100k | |
---|---|---|---|---|---|---|---|---|---|---|
6 | California | 2021-01-18 | 3548575 | 1072959 | 8981.0 | 2716.0 | 865387 | 2190.0 | 204374 | 204374 |
vac_5=pd.read_csv('../data/vaccinations_1611021607.csv')
vac_5=vac_5[vac_5['state']=='California']
vac_5.insert(1,'date',datetime.utcfromtimestamp(1611021607).strftime('%Y-%m-%d'))
vac_5
state | date | total_doses_distributed | total_doses_administered | doses_distributed_100k | doses_administed_100k | people_1+_dose | doses_1+_100k | people_2_doses | doses_2_100k | |
---|---|---|---|---|---|---|---|---|---|---|
6 | California | 2021-01-19 | 3548575 | 1072959 | 8981.0 | 2716.0 | 865387 | 2190.0 | 204374 | 204374 |
vac_6=pd.read_csv('../data/vaccinations_1611108007.csv')
vac_6=vac_6[vac_6['state']=='California']
vac_6.insert(1,'date',datetime.utcfromtimestamp(1611108007).strftime('%Y-%m-%d'))
vac_6
state | date | total_doses_distributed | total_doses_administered | doses_distributed_100k | doses_administed_100k | people_1+_dose | doses_1+_100k | people_2_doses | doses_2_100k | |
---|---|---|---|---|---|---|---|---|---|---|
6 | California | 2021-01-20 | 3548575 | 1072959 | 8981.0 | 2716.0 | 865387 | 2190.0 | 204374 | 204374 |
vac_7=pd.read_csv('../data/vaccinations_1611205257.csv')
vac_7=vac_7[vac_7['state']=='California']
vac_7.insert(1,'date',datetime.utcfromtimestamp(1611205257).strftime('%Y-%m-%d'))
vac_7
state | date | total_doses_distributed | total_doses_administered | doses_distributed_100k | doses_administed_100k | people_1+_dose | doses_1+_100k | people_2_doses | doses_2_100k | |
---|---|---|---|---|---|---|---|---|---|---|
6 | California | 2021-01-21 | 4169450 | 1532329 | 10552.0 | 3878.0 | 1255311 | 3177.0 | 273276 | 273276 |
vac_8=pd.read_csv('../data/vaccinations_1611280808.csv')
vac_8=vac_8[vac_8['state']=='California']
vac_8.insert(1,'date',datetime.utcfromtimestamp(1611280808).strftime('%Y-%m-%d'))
vac_8
state | date | total_doses_distributed | total_doses_administered | doses_distributed_100k | doses_administed_100k | people_1+_dose | doses_1+_100k | people_2_doses | doses_2_100k | |
---|---|---|---|---|---|---|---|---|---|---|
6 | California | 2021-01-22 | 4379500 | 1633875 | 11084.0 | 4135.0 | 1335886 | 3381.0 | 293834 | 293834 |
vac_9=pd.read_csv('../data/vaccinations_1611367208.csv')
vac_9=vac_9[vac_9['state']=='California']
vac_9.insert(1,'date',datetime.utcfromtimestamp(1611367208).strftime('%Y-%m-%d'))
vac_9
state | date | total_doses_distributed | total_doses_administered | doses_distributed_100k | doses_administed_100k | people_1+_dose | doses_1+_100k | people_2_doses | doses_2_100k | |
---|---|---|---|---|---|---|---|---|---|---|
6 | California | 2021-01-23 | 4714625 | 1803679 | 11932.0 | 4565.0 | 1477195 | 3739.0 | 321839 | 321839 |
vac_10=pd.read_csv('../data/vaccinations_1611453608.csv')
vac_10=vac_10[vac_10['state']=='California']
vac_10.insert(1,'date',datetime.utcfromtimestamp(1611453608).strftime('%Y-%m-%d'))
vac_10
state | date | total_doses_distributed | total_doses_administered | doses_distributed_100k | doses_administed_100k | people_1+_dose | doses_1+_100k | people_2_doses | doses_2_100k | |
---|---|---|---|---|---|---|---|---|---|---|
6 | California | 2021-01-24 | 4906525 | 2008220 | 12418.0 | 5083.0 | 1661683 | 4205.0 | 341731 | 341731 |
vac_11=pd.read_csv('../data/vaccinations_1611540008.csv')
vac_11=vac_11[vac_11['state']=='California']
vac_11.insert(1,'date',datetime.utcfromtimestamp(1611540008).strftime('%Y-%m-%d'))
vac_11
state | date | total_doses_distributed | total_doses_administered | doses_distributed_100k | doses_administed_100k | people_1+_dose | doses_1+_100k | people_2_doses | doses_2_100k | |
---|---|---|---|---|---|---|---|---|---|---|
6 | California | 2021-01-25 | 4906525 | 2199908 | 12418.0 | 5568.0 | 1838464 | 4653.0 | 356382 | 356382 |
vac_12=pd.read_csv('../data/vaccinations_1611626408.csv')
vac_12=vac_12[vac_12['state']=='California']
vac_12.insert(1,'date',datetime.utcfromtimestamp(1611626408).strftime('%Y-%m-%d'))
vac_12
state | date | total_doses_distributed | total_doses_administered | doses_distributed_100k | doses_administed_100k | people_1+_dose | doses_1+_100k | people_2_doses | doses_2_100k | |
---|---|---|---|---|---|---|---|---|---|---|
6 | California | 2021-01-26 | 4906525 | 2330633 | 12418.0 | 5899.0 | 1954048 | 4945.0 | 371482 | 371482 |
vac_13=pd.read_csv('../data/vaccinations_1611712808.csv')
vac_13=vac_13[vac_13['state']=='California']
vac_13.insert(1,'date',datetime.utcfromtimestamp(1611712808).strftime('%Y-%m-%d'))
vac_13
state | date | total_doses_distributed | total_doses_administered | doses_distributed_100k | doses_administed_100k | people_1+_dose | doses_1+_100k | people_2_doses | doses_2_100k | |
---|---|---|---|---|---|---|---|---|---|---|
6 | California | 2021-01-27 | 5340275 | 2446577 | 13516.0 | 6192.0 | 2053652 | 5198.0 | 387563 | 387563 |
# combine all the scrapped daily updates together
vac=pd.concat([vac_2,vac_3,vac_4,vac_5,vac_6,vac_7,vac_8,vac_9,vac_10,vac_11,vac_12,vac_13])
vac
state | date | total_doses_distributed | total_doses_administered | doses_distributed_100k | doses_administed_100k | people_1+_dose | doses_1+_100k | people_2_doses | doses_2_100k | |
---|---|---|---|---|---|---|---|---|---|---|
6 | California | 2021-01-16 | 3548575 | 1072959 | 8981.0 | 2716.0 | 865387 | 2190.0 | 204374 | 204374 |
6 | California | 2021-01-17 | 3548575 | 1072959 | 8981.0 | 2716.0 | 865387 | 2190.0 | 204374 | 204374 |
6 | California | 2021-01-18 | 3548575 | 1072959 | 8981.0 | 2716.0 | 865387 | 2190.0 | 204374 | 204374 |
6 | California | 2021-01-19 | 3548575 | 1072959 | 8981.0 | 2716.0 | 865387 | 2190.0 | 204374 | 204374 |
6 | California | 2021-01-20 | 3548575 | 1072959 | 8981.0 | 2716.0 | 865387 | 2190.0 | 204374 | 204374 |
6 | California | 2021-01-21 | 4169450 | 1532329 | 10552.0 | 3878.0 | 1255311 | 3177.0 | 273276 | 273276 |
6 | California | 2021-01-22 | 4379500 | 1633875 | 11084.0 | 4135.0 | 1335886 | 3381.0 | 293834 | 293834 |
6 | California | 2021-01-23 | 4714625 | 1803679 | 11932.0 | 4565.0 | 1477195 | 3739.0 | 321839 | 321839 |
6 | California | 2021-01-24 | 4906525 | 2008220 | 12418.0 | 5083.0 | 1661683 | 4205.0 | 341731 | 341731 |
6 | California | 2021-01-25 | 4906525 | 2199908 | 12418.0 | 5568.0 | 1838464 | 4653.0 | 356382 | 356382 |
6 | California | 2021-01-26 | 4906525 | 2330633 | 12418.0 | 5899.0 | 1954048 | 4945.0 | 371482 | 371482 |
6 | California | 2021-01-27 | 5340275 | 2446577 | 13516.0 | 6192.0 | 2053652 | 5198.0 | 387563 | 387563 |
# create a column for the daily distrubution change
vac.insert(loc=3,column='daily_change_dist',
value=vac['total_doses_distributed']-vac['total_doses_distributed'].shift(1))
vac.insert(loc=5,column='daily_change_admin',
value=vac['total_doses_administered']-vac['total_doses_administered'].shift(1))
vac.insert(loc=7,column='dist_100k_change',
value=vac['doses_distributed_100k']-vac['doses_distributed_100k'].shift(1))
vac.insert(loc=9,column='admin_100k_change',
value=vac['doses_administed_100k']-vac['doses_administed_100k'].shift(1))
vac.insert(loc=11,column='people_1+_dose_change',
value=vac['people_1+_dose']-vac['people_1+_dose'].shift(1))
vac.insert(loc=13,column='doses_1+_100k_change',
value=vac['doses_1+_100k']-vac['doses_1+_100k'].shift(1))
vac.insert(loc=15,column='people_2_doses_change',
value=vac['people_2_doses']-vac['people_2_doses'].shift(1))
vac.insert(loc=17,column='doses_2_100k_change',
value=vac['doses_2_100k']-vac['doses_2_100k'].shift(1))
vac.fillna(0,inplace=True)
vac.to_csv('../clean_data/scrapped_ca_vaccine.csv',index=False)
# importing the data
data = pd.read_csv('../data/hospitals_by_county.csv')
# lets look at sample data
data.head()
county | todays_date | hospitalized_covid_confirmed_patients | hospitalized_suspected_covid_patients | hospitalized_covid_patients | all_hospital_beds | icu_covid_confirmed_patients | icu_suspected_covid_patients | icu_available_beds | |
---|---|---|---|---|---|---|---|---|---|
0 | Plumas | 2020-03-29 | 0.0 | 1.0 | NaN | NaN | 0.0 | 1.0 | NaN |
1 | Tehama | 2020-03-29 | 0.0 | 0.0 | NaN | NaN | 0.0 | 0.0 | 2.0 |
2 | Glenn | 2020-03-29 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | Mono | 2020-03-29 | 0.0 | 1.0 | NaN | NaN | 0.0 | 0.0 | 2.0 |
4 | Marin | 2020-03-29 | 7.0 | 13.0 | NaN | NaN | 2.0 | 6.0 | 11.0 |
# number of null records
data.isna().sum()
county 0 todays_date 0 hospitalized_covid_confirmed_patients 8 hospitalized_suspected_covid_patients 8 hospitalized_covid_patients 1285 all_hospital_beds 1375 icu_covid_confirmed_patients 29 icu_suspected_covid_patients 29 icu_available_beds 804 dtype: int64
data.describe()
hospitalized_covid_confirmed_patients | hospitalized_suspected_covid_patients | hospitalized_covid_patients | all_hospital_beds | icu_covid_confirmed_patients | icu_suspected_covid_patients | icu_available_beds | |
---|---|---|---|---|---|---|---|
count | 16901.000000 | 16901.000000 | 15624.000000 | 15534.000000 | 16880.000000 | 16880.000000 | 16105.00000 |
mean | 113.187622 | 21.330395 | 137.904122 | 1266.172267 | 30.116706 | 3.284893 | 50.46855 |
std | 441.167685 | 72.040450 | 497.901983 | 3020.403507 | 103.721833 | 11.331873 | 137.41660 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -110.00000 |
25% | 1.000000 | 0.000000 | 1.000000 | 56.000000 | 0.000000 | 0.000000 | 3.00000 |
50% | 12.000000 | 2.000000 | 17.000000 | 346.500000 | 3.000000 | 0.000000 | 9.00000 |
75% | 67.000000 | 15.000000 | 84.000000 | 1187.500000 | 18.000000 | 2.000000 | 43.00000 |
max | 8098.000000 | 1350.000000 | 8422.000000 | 23989.000000 | 1731.000000 | 244.000000 | 1502.00000 |
data.groupby(by= 'todays_date')['hospitalized_covid_patients'].count()
todays_date 2020-03-29 0 2020-03-30 0 2020-03-31 0 2020-04-01 0 2020-04-02 0 .. 2021-01-20 56 2021-01-21 56 2021-01-22 56 2021-01-23 56 2021-01-24 56 Name: hospitalized_covid_patients, Length: 302, dtype: int64
data[data['hospitalized_covid_confirmed_patients'].isna()]
county | todays_date | hospitalized_covid_confirmed_patients | hospitalized_suspected_covid_patients | hospitalized_covid_patients | all_hospital_beds | icu_covid_confirmed_patients | icu_suspected_covid_patients | icu_available_beds | |
---|---|---|---|---|---|---|---|---|---|
2 | Glenn | 2020-03-29 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
11 | Colusa | 2020-03-29 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
41 | Inyo | 2020-03-29 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
46 | Butte | 2020-03-29 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
54 | Calaveras | 2020-03-29 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
99 | Inyo | 2020-03-30 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
162 | Inyo | 2020-03-31 | NaN | 83.0 | NaN | NaN | 0.0 | 0.0 | 6.0 |
205 | Glenn | 2020-04-01 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
data[data['hospitalized_covid_patients'].isna()]
county | todays_date | hospitalized_covid_confirmed_patients | hospitalized_suspected_covid_patients | hospitalized_covid_patients | all_hospital_beds | icu_covid_confirmed_patients | icu_suspected_covid_patients | icu_available_beds | |
---|---|---|---|---|---|---|---|---|---|
0 | Plumas | 2020-03-29 | 0.0 | 1.0 | NaN | NaN | 0.0 | 1.0 | NaN |
1 | Tehama | 2020-03-29 | 0.0 | 0.0 | NaN | NaN | 0.0 | 0.0 | 2.0 |
2 | Glenn | 2020-03-29 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | Mono | 2020-03-29 | 0.0 | 1.0 | NaN | NaN | 0.0 | 0.0 | 2.0 |
4 | Marin | 2020-03-29 | 7.0 | 13.0 | NaN | NaN | 2.0 | 6.0 | 11.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1280 | Contra Costa | 2020-04-20 | 37.0 | 28.0 | NaN | NaN | 13.0 | 8.0 | 75.0 |
1281 | Mendocino | 2020-04-20 | 0.0 | 0.0 | NaN | NaN | 0.0 | 0.0 | 7.0 |
1282 | Calaveras | 2020-04-20 | 0.0 | 1.0 | NaN | NaN | 0.0 | 0.0 | 11.0 |
1283 | San Joaquin | 2020-04-20 | 30.0 | 23.0 | NaN | NaN | 16.0 | 4.0 | 18.0 |
1284 | Kings | 2020-04-20 | 8.0 | 0.0 | NaN | NaN | 3.0 | 0.0 | 3.0 |
1285 rows × 9 columns
# the data in hospitalized Covid patients seems incorrect as it should be a sum of confirmed and
# suspected
data['hospitalized_covid_patients'] = data['hospitalized_covid_confirmed_patients'] + \
data['hospitalized_suspected_covid_patients']
# data set
data_train = data[data['all_hospital_beds'].notnull() & data['icu_available_beds'].notnull()] \
[['all_hospital_beds','icu_available_beds']]
X = data_train.drop('all_hospital_beds', axis=1)
y = data_train['all_hospital_beds']
# initiating Linear Regression
hosp_lr = LinearRegression()
# fitting the model
hosp_model = hosp_lr.fit(X, y)
# lets predict 'all_hospital_beds' based on icu_available_beds for all records where all_hospital_beds are null
data_hosp_bed_null = data[data['all_hospital_beds'].isna() & data['icu_available_beds'].notnull()] \
[['all_hospital_beds','icu_available_beds']]
# Run the model on training data
data_hosp_bed_null['all_hospital_beds'] = hosp_model.predict(data_hosp_bed_null[['icu_available_beds']]).round(0)
data_hosp_bed_null
all_hospital_beds | icu_available_beds | |
---|---|---|
1 | 308.0 | 2.0 |
3 | 308.0 | 2.0 |
4 | 493.0 | 11.0 |
5 | 266.0 | 0.0 |
6 | 4502.0 | 205.0 |
... | ... | ... |
1939 | 349.0 | 4.0 |
1964 | 411.0 | 7.0 |
1981 | 308.0 | 2.0 |
2025 | 369.0 | 5.0 |
2036 | 328.0 | 3.0 |
1159 rows × 2 columns
## update the original data frame for null hospital beds
data.loc[data['all_hospital_beds'].isna(),'all_hospital_beds']= data_hosp_bed_null['all_hospital_beds']
# data set
X = data_train.drop('icu_available_beds', axis=1)
y = data_train['icu_available_beds']
# initiating Linear Regression
icu_lr = LinearRegression()
# fitting the model
icu_model = icu_lr.fit(X, y)
# lets predict icu_available_beds based on all_hospital_beds for all records where icu_available_beds are null
data_icu_bed_null = data[data['icu_available_beds'].isna() & data['all_hospital_beds'].notnull()] \
[['all_hospital_beds','icu_available_beds']]
# Run the model on training data
data_icu_bed_null['icu_available_beds'] = icu_model.predict(data_icu_bed_null[['all_hospital_beds']]).round(0)
## update the original data frame for null ICU beds
data.loc[data['icu_available_beds'].isna(),'icu_available_beds']= data_icu_bed_null['icu_available_beds']
data[data['all_hospital_beds'].isna() & data['icu_available_beds'].isna()]
county | todays_date | hospitalized_covid_confirmed_patients | hospitalized_suspected_covid_patients | hospitalized_covid_patients | all_hospital_beds | icu_covid_confirmed_patients | icu_suspected_covid_patients | icu_available_beds | |
---|---|---|---|---|---|---|---|---|---|
0 | Plumas | 2020-03-29 | 0.0 | 1.0 | 1.0 | NaN | 0.0 | 1.0 | NaN |
2 | Glenn | 2020-03-29 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
7 | Modoc | 2020-03-29 | 0.0 | 0.0 | 0.0 | NaN | 0.0 | 0.0 | NaN |
9 | Trinity | 2020-03-29 | 0.0 | 0.0 | 0.0 | NaN | 0.0 | 0.0 | NaN |
11 | Colusa | 2020-03-29 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2967 | Glenn | 2020-05-21 | 0.0 | 0.0 | 0.0 | NaN | 0.0 | 0.0 | NaN |
3068 | Glenn | 2020-05-22 | 0.0 | 0.0 | 0.0 | NaN | 0.0 | 0.0 | NaN |
3098 | Glenn | 2020-05-23 | 0.0 | 0.0 | 0.0 | NaN | 0.0 | 0.0 | NaN |
3139 | Glenn | 2020-05-24 | 0.0 | 0.0 | 0.0 | NaN | 0.0 | 0.0 | NaN |
3222 | Glenn | 2020-05-25 | 0.0 | 0.0 | 0.0 | NaN | 0.0 | 0.0 | NaN |
216 rows × 9 columns
# understand what datasets have NULLs for all_hospital_beds
data[data['all_hospital_beds'].isna() & data['icu_available_beds'].isna()].groupby(by = 'county') \
['todays_date'].count()
county Butte 1 Calaveras 1 Colusa 1 Del Norte 1 Glenn 57 Inyo 2 Lassen 24 Mariposa 21 Modoc 23 Plumas 23 Santa Cruz 1 Shasta 1 Sutter 36 Trinity 23 Tuolumne 1 Name: todays_date, dtype: int64
data[data['county'] == 'Tuolumne' ][['all_hospital_beds' ,'icu_available_beds' ]]
all_hospital_beds | icu_available_beds | |
---|---|---|
18 | 266.0 | 0.0 |
101 | 328.0 | 3.0 |
140 | 349.0 | 4.0 |
188 | 308.0 | 2.0 |
256 | 308.0 | 2.0 |
... | ... | ... |
16663 | 84.0 | 2.0 |
16693 | 84.0 | 0.0 |
16779 | 84.0 | 2.0 |
16839 | 84.0 | 1.0 |
16893 | 84.0 | 2.0 |
302 rows × 2 columns
# update the null to the min value
data.loc[(data['county'] == 'Tuolumne') & (data['all_hospital_beds'].isna()) , 'all_hospital_beds'] = 251.0
# update the null to the min value
data.loc[(data['county'] == 'Tuolumne') & (data['icu_available_beds'].isna()) , 'icu_available_beds'] = 0.0
# icu_covid_confirmed_patients
data.loc[(data['icu_covid_confirmed_patients'].isna()) , 'icu_covid_confirmed_patients'] = \
data['hospitalized_covid_confirmed_patients']
# icu_suspected_covid_patients
data.loc[(data['icu_suspected_covid_patients'].isna()) , 'icu_suspected_covid_patients'] = \
data['hospitalized_suspected_covid_patients']
### hospitalized_covid_confirmed_patients , hospitalized_suspected_covid_patients
data.loc[(data['hospitalized_covid_confirmed_patients'].isna()) , 'hospitalized_covid_confirmed_patients'] = 0.0
data.loc[(data['hospitalized_suspected_covid_patients'].isna()) , 'hospitalized_suspected_covid_patients'] = 0.0
data.loc[(data['hospitalized_covid_patients'].isna()) , 'hospitalized_covid_patients'] = \
data['hospitalized_covid_confirmed_patients']+data['hospitalized_suspected_covid_patients']
data.to_csv('../clean_data/hospitals_by_county.csv', index=False)