##First, install & load packages that we need
#Install everything we need
#install.packages("dplyr")
#install.packages("tidyr")
#install.packages("ggplot2")
#install.packages("tidyverse")
#Load packages into global environment
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.4 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## v purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
##Now, we import the dataset
library(readxl)
Hospital_capacity_RAW <- read_excel("C:/Users/jessi/Downloads/Hospital_capacity_RAW.xls")
StateMedInc <- read_excel("C:/Users/jessi/Downloads/StateMedInc.xls")
StatePop <-
read_excel("C:/Users/Jessi/Downloads/StatePop.xlsx")
##Time to wrangle/tidy up our dataset ####Making our dataset smaller, and narrowing down to only January dates
#Tidying Hospital_capacity_RAW dataset
SelectDataHosp <-Hospital_capacity_RAW %>%
select(state,
date,
critical_staffing_shortage_today_yes,
critical_staffing_shortage_today_no,
critical_staffing_shortage_today_not_reported,
hospital_onset_covid,
inpatient_beds,
inpatient_beds_used,
inpatient_beds_used_covid,
staffed_adult_icu_bed_occupancy,
staffed_icu_adult_patients_confirmed_and_suspected_covid,
staffed_icu_adult_patients_confirmed_covid,
total_adult_patients_hospitalized_confirmed_and_suspected_covid,
total_adult_patients_hospitalized_confirmed_covid,
total_pediatric_patients_hospitalized_confirmed_and_suspected_covid,
total_pediatric_patients_hospitalized_confirmed_covid,
total_staffed_adult_icu_beds,
inpatient_beds_utilization,
percent_of_inpatients_with_covid,
deaths_covid)
#Narrowing dataset to only include January 2021 dates
Data <- SelectDataHosp %>%
filter(year(date)==2021,
month(date)==1)
##Creating datasets with the jan 2021 Average for each variable in each state ####Staffing Shortages
#Getting January 2021 average number of hospitals with staffing shortages by state each day
shortStaffMeans <- Data %>%
group_by(state)%>%
summarize(mean(critical_staffing_shortage_today_yes, na.rm = TRUE))
#Remaming the columns
colnames(shortStaffMeans)<- c("state", "shortStaff")
colnames(StateMedInc) <- c("StateName", "medFamInc", "state")
####COVID Deaths
##Getting January 2021 average daily COVID deaths by state
covidDeathMean <- SelectDataHosp%>%
group_by(state)%>%
filter(year(date)==2021,
month(date)==1)%>%
summarize(mean(deaths_covid, na.rm=TRUE))
##Renaming columns so they make more sense
colnames(covidDeathMean) <- c("state", "avgCovidDeaths")
####Creating three levels for state median household income (low, med, high)
StateMedInc%>%
arrange(medFamInc)
## # A tibble: 51 x 3
## StateName medFamInc state
## <chr> <dbl> <chr>
## 1 Mississippi 45081 MS
## 2 West Virginia 46711 WV
## 3 Arkansas 47597 AR
## 4 Louisiana 49469 LA
## 5 New Mexico 49754 NM
## 6 Alabama 50536 AL
## 7 Kentucky 50589 KY
## 8 Oklahoma 52919 OK
## 9 South Carolina 53199 SC
## 10 Tennessee 53320 TN
## # ... with 41 more rows
low<-StateMedInc%>%
filter(medFamInc <=56602)%>%
mutate(incLev="low")
med<-StateMedInc%>%
filter(medFamInc > 56602 & medFamInc <=65886)%>%
mutate(incLev="med")
high<-StateMedInc%>%
filter(medFamInc >= 67167)%>%
mutate(incLev="high")
stateLev<-rbind(low, med, high)
View(stateLev)
####Joining data together
#Creating final usable data set
covidDeathMean<- covidDeathMean[-c(40,48),]
shortStaffMeans<- shortStaffMeans[-c(40, 48),]
joinedData <- left_join(covidDeathMean, stateLev)
## Joining, by = "state"
joinedData <- left_join(joinedData, shortStaffMeans)
## Joining, by = "state"
#Percent beds used by COVID patients
percentBedsCovid <- Data%>%
group_by(state)%>%
filter(year(date)==2021,
month(date)==1)%>%
summarize(mean(inpatient_beds_used_covid/inpatient_beds_used, na.rm=TRUE))
percentBedsCovid<- percentBedsCovid[-c(40, 48),]
colnames(percentBedsCovid) <- c("state", "percentUsedBedsCovid")
#Avg. confirmed COVID cases in hospitals in each state
totalConfirmedCovid <- Data%>%
group_by(state)%>%
filter(year(date)==2021,
month(date)==1)%>%
summarize(mean(total_adult_patients_hospitalized_confirmed_covid + total_pediatric_patients_hospitalized_confirmed_covid))
totalConfirmedCovid<- totalConfirmedCovid[-c(40, 48),]
colnames(totalConfirmedCovid) <- c("state", "totalConfirmedCovidPatients")
#Percent of inpatients with COVID
percentCovid <- Data%>%
group_by(state)%>%
filter(year(date)==2021,
month(date)==1)%>%
summarize(mean(percent_of_inpatients_with_covid))
percentCovid<- percentCovid[-c(40, 48),]
colnames(percentCovid) <- c("state", "PercentInpatientsCovid")
joinedData <- left_join(joinedData, percentBedsCovid)
## Joining, by = "state"
joinedData <- left_join(joinedData, totalConfirmedCovid)
## Joining, by = "state"
joinedData <- left_join(joinedData, percentCovid)
## Joining, by = "state"
joinedData <- left_join(joinedData, StatePop)
## Joining, by = "StateName"
write.csv(joinedData, "C:/Users/jessi/Downloads/finalProjData.csv",
row.names = FALSE)
joinedData<-read.csv("C:/Users/jessi/Downloads/finalProjData.csv",
header=TRUE)
##Mutating the covid deaths column to change data to be COVID deaths PER CAPITA to account for state population
covid<-joinedData%>%
mutate(avgCDPC=avgCovidDeaths/X2020.Census)
str(covid)
## 'data.frame': 51 obs. of 13 variables:
## $ state : chr "AK" "AL" "AR" "AZ" ...
## $ avgCovidDeaths : num 0.677 50.774 22.516 85.806 451.677 ...
## $ StateName : chr "Alaska" "Alabama" "Arkansas" "Arizona" ...
## $ medFamInc : int 77640 50536 47597 58945 75235 72331 78444 86420 68287 55660 ...
## $ incLev : chr "high" "low" "low" "med" ...
## $ shortStaff : num 1 38 17.6 31.8 173.7 ...
## $ percentUsedBedsCovid : num 0.074 0.245 0.195 0.358 0.376 ...
## $ totalConfirmedCovidPatients: num 59.2 2752.3 1110.3 4208.4 19973.9 ...
## $ PercentInpatientsCovid : num 0.074 0.252 0.196 0.366 0.381 ...
## $ Rank : int 48 24 33 14 1 21 29 NA 45 3 ...
## $ X2020.Census : int 733391 5024279 3011524 7151502 39538223 5773714 3605944 NA 989948 21538187 ...
## $ Percent.of.Total.US.Pop : num 0.0022 0.0148 0.0091 0.0219 0.1191 ...
## $ avgCDPC : num 9.24e-07 1.01e-05 7.48e-06 1.20e-05 1.14e-05 ...
names(covid)
## [1] "state" "avgCovidDeaths"
## [3] "StateName" "medFamInc"
## [5] "incLev" "shortStaff"
## [7] "percentUsedBedsCovid" "totalConfirmedCovidPatients"
## [9] "PercentInpatientsCovid" "Rank"
## [11] "X2020.Census" "Percent.of.Total.US.Pop"
## [13] "avgCDPC"
#Create a Simple Linear Model
### Mod A: ANOVA
ggplot(data=covid, aes(y=PercentInpatientsCovid, fill=incLev))+
geom_boxplot()
modA<-lm(PercentInpatientsCovid~incLev, data=covid)
anova(modA)
## Analysis of Variance Table
##
## Response: PercentInpatientsCovid
## Df Sum Sq Mean Sq F value Pr(>F)
## incLev 2 0.007872 0.0039359 0.79 0.4597
## Residuals 48 0.239147 0.0049822
#to calculate MSE
modASum<- summary(modA)
mean(modASum$residuals^2)
## [1] 0.004689151
#SLR Model that has points color coordinated to the income level of that state (low, med, high)
#create model
ggplot(data=covid, aes(y=PercentInpatientsCovid, x=avgCDPC))+
geom_point()
## Warning: Removed 1 rows containing missing values (geom_point).
modB<-lm(PercentInpatientsCovid~avgCDPC, data=covid)
#to calculate MSE
modBSum<- summary(modB)
mean(modBSum$residuals^2)
## [1] 0.0009018205
#plot the model
ggplot(data=covid, aes(y=PercentInpatientsCovid, x=avgCDPC,color=incLev))+
geom_point()
## Warning: Removed 1 rows containing missing values (geom_point).
#Factoring our categorical variable by creating a “dummy” variable
##Coding dummy variable
covid$incLev <- factor(covid$incLev,
levels = c("high", "med", "low"))
contrasts(covid$incLev)
## med low
## high 0 0
## med 1 0
## low 0 1
#Creating a MLR
#create model
modC<-lm(PercentInpatientsCovid~avgCDPC+incLev, data=covid)
#to calculate MSE
modCSum<- summary(modC)
mean(modCSum$residuals^2)
## [1] 0.0007865488
#plot the model
ggplot(data=covid, aes(x=avgCDPC, y=PercentInpatientsCovid, color=incLev))+
geom_point()+
ggtitle("Scatterplot of Average COVID deaths per cap v. % Inpatients COVID")+
theme_bw()+
geom_abline(slope=modC$coefficients[2], intercept = modC$coefficients[1], col=2)+
geom_abline(slope=modC$coefficients[2], intercept = modC$coefficients[1]+modC$coefficients[3])
## Warning: Removed 1 rows containing missing values (geom_point).
#MLR that includes interaction between numeric and categorical variables
modD<-lm(PercentInpatientsCovid~avgCDPC+incLev, data=covid)
#calculate MSE
modDSum<- summary(modD)
mean(modDSum$residuals^2)
## [1] 0.0007865488
#plot the model
ggplot(data=covid, aes(x=avgCDPC, y=PercentInpatientsCovid, color=incLev))+
geom_point()+
ggtitle("Scatterplot of Average COVID deaths per cap v. % Inpatients COVID")+
theme_bw()+
geom_abline(slope=modD$coefficients[2], intercept = modD$coefficients[1], col=2)+
geom_abline(slope=modD$coefficients[2]+modD$coefficients[4], intercept = modD$coefficients[1]+modD$coefficients[3], col=4)
## Warning: Removed 1 rows containing missing values (geom_point).
#Conclusion
#We learned that the relationship between COVID bed use per capita and COVID deaths are linear, but are not very strongly related to state average family income (within the US).