##Install Packages & load libraries/data
#Install everything we need
#install.packages("dplyr")
#install.packages("tidyr")
#install.packages("ggplot2")
#install.packages("tidyverse")
#install.packages("GGally")
#Load packages into global environment
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.4 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## v purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(GGally)
## Warning: package 'GGally' was built under R version 4.1.2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ISLR)
library(readxl)
Hospital_capacity_RAW <- read_excel("C:/Users/jessi/Downloads/Hospital_capacity_RAW.xls")
StateMedInc <- read_excel("C:/Users/jessi/Downloads/StateMedInc.xls")
StatePop <-
read_excel("C:/Users/Jessi/Downloads/StatePop.xlsx")
#Data Wrangling
SelectDataHosp <-Hospital_capacity_RAW %>%
select(state,
date,
critical_staffing_shortage_today_yes,
critical_staffing_shortage_today_no,
critical_staffing_shortage_today_not_reported,
hospital_onset_covid,
inpatient_beds,
inpatient_beds_used,
inpatient_beds_used_covid,
staffed_adult_icu_bed_occupancy,
staffed_icu_adult_patients_confirmed_and_suspected_covid,
staffed_icu_adult_patients_confirmed_covid,
total_adult_patients_hospitalized_confirmed_and_suspected_covid,
total_adult_patients_hospitalized_confirmed_covid,
total_pediatric_patients_hospitalized_confirmed_and_suspected_covid,
total_pediatric_patients_hospitalized_confirmed_covid,
total_staffed_adult_icu_beds,
inpatient_beds_utilization,
percent_of_inpatients_with_covid,
deaths_covid)
#Narrowing dataset to only include January 2021 dates
Data <- SelectDataHosp %>%
filter(year(date)==2021,
month(date)==1)
#Getting January 2021 average number of hospitals with staffing shortages by state each day
shortStaffMeans <- Data %>%
group_by(state)%>%
summarize(mean(critical_staffing_shortage_today_yes, na.rm = TRUE))
#Remaming the columns
colnames(shortStaffMeans)<- c("state", "shortStaff")
colnames(StateMedInc) <- c("StateName", "medFamInc", "state")
##Getting January 2021 average daily COVID deaths by state
covidDeathMean <- SelectDataHosp%>%
group_by(state)%>%
filter(year(date)==2021,
month(date)==1)%>%
summarize(mean(deaths_covid, na.rm=TRUE))
##Renaming columns so they make more sense
colnames(covidDeathMean) <- c("state", "avgCovidDeaths")
StateMedInc%>%
arrange(medFamInc)
## # A tibble: 51 x 3
## StateName medFamInc state
## <chr> <dbl> <chr>
## 1 Mississippi 45081 MS
## 2 West Virginia 46711 WV
## 3 Arkansas 47597 AR
## 4 Louisiana 49469 LA
## 5 New Mexico 49754 NM
## 6 Alabama 50536 AL
## 7 Kentucky 50589 KY
## 8 Oklahoma 52919 OK
## 9 South Carolina 53199 SC
## 10 Tennessee 53320 TN
## # ... with 41 more rows
low<-StateMedInc%>%
filter(medFamInc <=56602)%>%
mutate(incLev="low")
med<-StateMedInc%>%
filter(medFamInc > 56602 & medFamInc <=65886)%>%
mutate(incLev="med")
high<-StateMedInc%>%
filter(medFamInc >= 67167)%>%
mutate(incLev="high")
stateLev<-rbind(low, med, high)
View(stateLev)
#Creating final usable data set
covidDeathMean<- covidDeathMean[-c(40,48),]
shortStaffMeans<- shortStaffMeans[-c(40, 48),]
joinedData <- left_join(covidDeathMean, stateLev)
## Joining, by = "state"
joinedData <- left_join(joinedData, shortStaffMeans)
## Joining, by = "state"
#Percent beds used by COVID patients
percentBedsCovid <- Data%>%
group_by(state)%>%
filter(year(date)==2021,
month(date)==1)%>%
summarize(mean(inpatient_beds_used_covid/inpatient_beds_used, na.rm=TRUE))
percentBedsCovid<- percentBedsCovid[-c(40, 48),]
colnames(percentBedsCovid) <- c("state", "percentUsedBedsCovid")
#Avg. confirmed COVID cases in hospitals in each state
totalConfirmedCovid <- Data%>%
group_by(state)%>%
filter(year(date)==2021,
month(date)==1)%>%
summarize(mean(total_adult_patients_hospitalized_confirmed_covid + total_pediatric_patients_hospitalized_confirmed_covid))
totalConfirmedCovid<- totalConfirmedCovid[-c(40, 48),]
colnames(totalConfirmedCovid) <- c("state", "totalConfirmedCovidPatients")
#Percent of inpatients with COVID
percentCovid <- Data%>%
group_by(state)%>%
filter(year(date)==2021,
month(date)==1)%>%
summarize(mean(percent_of_inpatients_with_covid))
percentCovid<- percentCovid[-c(40, 48),]
colnames(percentCovid) <- c("state", "PercentInpatientsCovid")
joinedData <- left_join(joinedData, percentBedsCovid)
## Joining, by = "state"
joinedData <- left_join(joinedData, totalConfirmedCovid)
## Joining, by = "state"
joinedData <- left_join(joinedData, percentCovid)
## Joining, by = "state"
joinedData <- left_join(joinedData, StatePop)
## Joining, by = "StateName"
colnames(joinedData)<-c("state", "avgCovidDeaths", "stateName", "medFamInc", "incLev", "shortStaff", "percentUsedBedsCovid", "totalConfirmedCovidPatients", "percentInpatientsCovid", "rank", "census2020", "percentTotalUSPop")
covid<-joinedData%>%
mutate(avgCDPC=avgCovidDeaths/census2020)
covid<-joinedData%>%
mutate(avgSSPC=shortStaff/census2020)
avgSSPC <- covid%>%
select("shortStaff", "census2020")%>%
mutate(avgSSPC=shortStaff/census2020)
avgCDPC <- covid%>%
select("avgCovidDeaths", "census2020")%>%
mutate(avgCDPC=avgCovidDeaths/census2020)
covid <- left_join(covid, avgSSPC)
## Joining, by = c("shortStaff", "census2020", "avgSSPC")
covid <- left_join(covid, avgCDPC)
## Joining, by = c("avgCovidDeaths", "census2020")
#Step 1
#STEP 1
#Staffing shortages is the number of hospitals in the state that are under staffed. Staffing Shortages becomes dichotomized as Extreme(ly short staffed) (top â…“), Somewhat (short staffed)(bottom â…”). The numeric variable becomes categorical so the information can be more easily used. The data is split into higher and lower staffing shortage groups (labeled Extremely and Somewhat).
#Dichotomize the Data
delta <- covid%>%
select("state", "avgSSPC")
delta%>%
arrange(avgSSPC)
## # A tibble: 51 x 2
## state avgSSPC
## <chr> <dbl>
## 1 KY 0.000000179
## 2 UT 0.000000221
## 3 FL 0.000000400
## 4 NY 0.000000442
## 5 OR 0.000000472
## 6 NC 0.000000516
## 7 MD 0.000000522
## 8 NJ 0.00000102
## 9 NV 0.00000104
## 10 CO 0.00000106
## # ... with 41 more rows
extremely<-covid%>%
filter(avgSSPC > 4.39264e-06)%>%
mutate(staffLev=1)
somewhat<-covid%>%
filter(avgSSPC <=4.39264e-06)%>%
mutate(staffLev=0)
shortageLev<-rbind(somewhat, extremely)
view(shortageLev)
#Step 2
qplot(x = avgCDPC, y = staffLev, data = shortageLev,
geom = "point", alpha = I(.5), ylab = "Staff Shortage Level") +
stat_smooth(method = "lm",
se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
qplot(x = avgCDPC, y = staffLev, data = shortageLev,
geom = "point", alpha = I(.5), ylab = "Staff Shortage Level") +
stat_smooth(method = "glm", method.args = list(family = "binomial"),
se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
m1 <- glm(staffLev ~ avgCDPC, data = shortageLev, family = "binomial")
summary(m1)
##
## Call:
## glm(formula = staffLev ~ avgCDPC, family = "binomial", data = shortageLev)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4741 -0.8683 -0.5582 1.0430 1.9882
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.330e+00 7.584e-01 -3.072 0.00213 **
## avgCDPC 3.113e+05 1.218e+05 2.556 0.01059 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 64.104 on 49 degrees of freedom
## Residual deviance: 56.212 on 48 degrees of freedom
## AIC: 60.212
##
## Number of Fisher Scoring iterations: 3
#The model is staff shortage level as a binary (either extreme or somewhat) as compared to Covid Deaths Per Capita. -2.330e+00 + 3.113+05x=y is the equation of the line. There is a positive slope, so there is a positive association between CDPC and short staffing.
#Step 3
#randomly split data
set.seed(1)
train_indices <- sample(1:nrow(shortageLev), size = floor(nrow(shortageLev)/2))
train_data <- shortageLev %>%
slice(train_indices)
test_data <- shortageLev %>%
slice(-train_indices)
#train
mod1 <- glm(avgCDPC ~ staffLev, data = train_data, family = "binomial")
mod2 <- glm(avgCDPC ~ staffLev + percentUsedBedsCovid, data = train_data, family = "binomial")
#test
test1 <- predict(mod1, newdata = test_data, type = "response")
test_mat1<-data.frame(staffLev=test_data$staffLev, predstaffLev=test1>.5)%>%
group_by(staffLev, predstaffLev)%>%
summarise(n=n())
## `summarise()` has grouped output by 'staffLev'. You can override using the `.groups` argument.
test_mat1
## # A tibble: 2 x 3
## # Groups: staffLev [2]
## staffLev predstaffLev n
## <dbl> <lgl> <int>
## 1 0 FALSE 17
## 2 1 FALSE 8
test2 <- predict(mod2, newdata = test_data, type = "response")
test_mat2<-data.frame(staffLev=test_data$staffLev, predstaffLev=test2>.5)%>%
group_by(staffLev, predstaffLev)%>%
summarise(n=n())
## `summarise()` has grouped output by 'staffLev'. You can override using the `.groups` argument.
test_mat2
## # A tibble: 2 x 3
## # Groups: staffLev [2]
## staffLev predstaffLev n
## <dbl> <lgl> <int>
## 1 0 FALSE 17
## 2 1 FALSE 8
#Confusion Matrix
pred1 <- predict(mod1, newdata = shortageLev, type = "response")
conf_mat1<-data.frame(staffLev=shortageLev$staffLev, predstaffLev=pred1>.5)%>%
group_by(staffLev, predstaffLev)%>%
summarise(n=n())
## `summarise()` has grouped output by 'staffLev'. You can override using the `.groups` argument.
conf_mat1
## # A tibble: 2 x 3
## # Groups: staffLev [2]
## staffLev predstaffLev n
## <dbl> <lgl> <int>
## 1 0 FALSE 33
## 2 1 FALSE 17
#they both say false but we can't figure out why
pred2 <- predict(mod2, newdata = , type = "response")
conf_mat2<-data.frame(staffLev=shortageLev$staffLev, predstaffLev=pred2>.5)%>%
group_by(staffLev, predstaffLev)%>%
summarise(n=n())
## Warning in data.frame(staffLev = shortageLev$staffLev, predstaffLev = pred2 > :
## row names were found from a short variable and have been discarded
## `summarise()` has grouped output by 'staffLev'. You can override using the `.groups` argument.
conf_mat2
## # A tibble: 2 x 3
## # Groups: staffLev [2]
## staffLev predstaffLev n
## <dbl> <lgl> <int>
## 1 0 FALSE 33
## 2 1 FALSE 17