Milestone 5

##Install Packages & load libraries/data

#Install everything we need
#install.packages("dplyr")
#install.packages("tidyr")
#install.packages("ggplot2")
#install.packages("tidyverse")
#install.packages("GGally")

#Load packages into global environment
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(ggplot2)
library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v tibble  3.1.4     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## v purrr   0.3.4

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(GGally)

## Warning: package 'GGally' was built under R version 4.1.2

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

library(ISLR)

library(readxl)

Hospital_capacity_RAW <- read_excel("C:/Users/jessi/Downloads/Hospital_capacity_RAW.xls")

StateMedInc <- read_excel("C:/Users/jessi/Downloads/StateMedInc.xls")

StatePop <-
read_excel("C:/Users/Jessi/Downloads/StatePop.xlsx")

#Data Wrangling

SelectDataHosp <-Hospital_capacity_RAW %>%
  select(state,
         date,
         critical_staffing_shortage_today_yes,
         critical_staffing_shortage_today_no,
         critical_staffing_shortage_today_not_reported,
         hospital_onset_covid,
         inpatient_beds,
         inpatient_beds_used,
         inpatient_beds_used_covid,
         staffed_adult_icu_bed_occupancy,
         staffed_icu_adult_patients_confirmed_and_suspected_covid,
         staffed_icu_adult_patients_confirmed_covid,
         total_adult_patients_hospitalized_confirmed_and_suspected_covid,
         total_adult_patients_hospitalized_confirmed_covid,
         total_pediatric_patients_hospitalized_confirmed_and_suspected_covid,
         total_pediatric_patients_hospitalized_confirmed_covid,
         total_staffed_adult_icu_beds,
         inpatient_beds_utilization,
         percent_of_inpatients_with_covid,
         deaths_covid)

#Narrowing dataset to only include January 2021 dates
Data <- SelectDataHosp %>%
  filter(year(date)==2021, 
         month(date)==1)

#Getting January 2021 average number of hospitals with staffing shortages by state each day 
shortStaffMeans <- Data %>%
  group_by(state)%>%
  summarize(mean(critical_staffing_shortage_today_yes, na.rm = TRUE))

#Remaming the columns
colnames(shortStaffMeans)<- c("state", "shortStaff")
colnames(StateMedInc) <- c("StateName", "medFamInc", "state")

##Getting January 2021 average daily COVID deaths by state
covidDeathMean <- SelectDataHosp%>% 
  group_by(state)%>%
  filter(year(date)==2021, 
         month(date)==1)%>%
  summarize(mean(deaths_covid, na.rm=TRUE)) 

##Renaming columns so they make more sense
colnames(covidDeathMean) <- c("state", "avgCovidDeaths")

StateMedInc%>%
  arrange(medFamInc)

## # A tibble: 51 x 3
##    StateName      medFamInc state
##    <chr>              <dbl> <chr>
##  1 Mississippi        45081 MS   
##  2 West Virginia      46711 WV   
##  3 Arkansas           47597 AR   
##  4 Louisiana          49469 LA   
##  5 New Mexico         49754 NM   
##  6 Alabama            50536 AL   
##  7 Kentucky           50589 KY   
##  8 Oklahoma           52919 OK   
##  9 South Carolina     53199 SC   
## 10 Tennessee          53320 TN   
## # ... with 41 more rows

low<-StateMedInc%>%
  filter(medFamInc <=56602)%>%
  mutate(incLev="low")

med<-StateMedInc%>%
  filter(medFamInc > 56602 & medFamInc <=65886)%>%
  mutate(incLev="med")

high<-StateMedInc%>%
  filter(medFamInc >= 67167)%>%
  mutate(incLev="high")

stateLev<-rbind(low, med, high)
View(stateLev)

#Creating final usable data set
covidDeathMean<- covidDeathMean[-c(40,48),]
shortStaffMeans<- shortStaffMeans[-c(40, 48),]

joinedData <- left_join(covidDeathMean, stateLev)

## Joining, by = "state"

joinedData <- left_join(joinedData, shortStaffMeans)

## Joining, by = "state"

#Percent beds used by COVID patients
percentBedsCovid <- Data%>%
  group_by(state)%>%
  filter(year(date)==2021, 
         month(date)==1)%>%
  summarize(mean(inpatient_beds_used_covid/inpatient_beds_used, na.rm=TRUE)) 
percentBedsCovid<- percentBedsCovid[-c(40, 48),]
colnames(percentBedsCovid) <- c("state", "percentUsedBedsCovid")

#Avg. confirmed COVID cases in hospitals in each state
totalConfirmedCovid <- Data%>%
  group_by(state)%>%
  filter(year(date)==2021, 
         month(date)==1)%>%
  summarize(mean(total_adult_patients_hospitalized_confirmed_covid + total_pediatric_patients_hospitalized_confirmed_covid))
totalConfirmedCovid<- totalConfirmedCovid[-c(40, 48),]
colnames(totalConfirmedCovid) <- c("state", "totalConfirmedCovidPatients")

#Percent of inpatients with COVID
percentCovid <- Data%>%
  group_by(state)%>%
  filter(year(date)==2021, 
         month(date)==1)%>%
  summarize(mean(percent_of_inpatients_with_covid))
percentCovid<- percentCovid[-c(40, 48),]
colnames(percentCovid) <- c("state", "PercentInpatientsCovid")

joinedData <- left_join(joinedData, percentBedsCovid)

## Joining, by = "state"

joinedData <- left_join(joinedData, totalConfirmedCovid)

## Joining, by = "state"

joinedData <- left_join(joinedData, percentCovid)

## Joining, by = "state"

joinedData <- left_join(joinedData, StatePop)

## Joining, by = "StateName"

colnames(joinedData)<-c("state", "avgCovidDeaths", "stateName", "medFamInc", "incLev", "shortStaff", "percentUsedBedsCovid", "totalConfirmedCovidPatients", "percentInpatientsCovid", "rank", "census2020", "percentTotalUSPop")

covid<-joinedData%>%
  mutate(avgCDPC=avgCovidDeaths/census2020) 

covid<-joinedData%>%
  mutate(avgSSPC=shortStaff/census2020)

avgSSPC <- covid%>%
  select("shortStaff", "census2020")%>%
  mutate(avgSSPC=shortStaff/census2020)


avgCDPC <- covid%>%
  select("avgCovidDeaths", "census2020")%>%
  mutate(avgCDPC=avgCovidDeaths/census2020)

covid <- left_join(covid, avgSSPC)

## Joining, by = c("shortStaff", "census2020", "avgSSPC")

covid <- left_join(covid, avgCDPC)

## Joining, by = c("avgCovidDeaths", "census2020")

#Step 1

#STEP 1
#Staffing shortages is the number of hospitals in the state that are under staffed. Staffing Shortages becomes dichotomized as Extreme(ly short staffed) (top ⅓), Somewhat (short staffed)(bottom ⅔). The numeric variable becomes categorical so the information can be more easily used. The data is split into higher and lower staffing shortage groups (labeled Extremely and Somewhat).

#Dichotomize the Data
delta <- covid%>%
  select("state", "avgSSPC")

delta%>% 
  arrange(avgSSPC)

## # A tibble: 51 x 2
##    state     avgSSPC
##    <chr>       <dbl>
##  1 KY    0.000000179
##  2 UT    0.000000221
##  3 FL    0.000000400
##  4 NY    0.000000442
##  5 OR    0.000000472
##  6 NC    0.000000516
##  7 MD    0.000000522
##  8 NJ    0.00000102 
##  9 NV    0.00000104 
## 10 CO    0.00000106 
## # ... with 41 more rows

extremely<-covid%>%
  filter(avgSSPC > 4.39264e-06)%>%
  mutate(staffLev=1)

somewhat<-covid%>%
  filter(avgSSPC <=4.39264e-06)%>%
  mutate(staffLev=0)

shortageLev<-rbind(somewhat, extremely)
view(shortageLev)

#Step 2

qplot(x = avgCDPC, y = staffLev, data = shortageLev, 
      geom = "point", alpha = I(.5), ylab = "Staff Shortage Level") +
  stat_smooth(method = "lm",
              se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

qplot(x = avgCDPC, y = staffLev, data = shortageLev, 
      geom = "point", alpha = I(.5), ylab = "Staff Shortage Level") +
  stat_smooth(method = "glm", method.args = list(family = "binomial"),
              se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

m1 <- glm(staffLev ~ avgCDPC, data = shortageLev, family = "binomial")
summary(m1)

## 
## Call:
## glm(formula = staffLev ~ avgCDPC, family = "binomial", data = shortageLev)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4741  -0.8683  -0.5582   1.0430   1.9882  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -2.330e+00  7.584e-01  -3.072  0.00213 **
## avgCDPC      3.113e+05  1.218e+05   2.556  0.01059 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 64.104  on 49  degrees of freedom
## Residual deviance: 56.212  on 48  degrees of freedom
## AIC: 60.212
## 
## Number of Fisher Scoring iterations: 3

#The model is staff shortage level as a binary (either extreme or somewhat) as compared to Covid Deaths Per Capita. -2.330e+00 + 3.113+05x=y is the equation of the line. There is a positive slope, so there is a positive association between CDPC and short staffing.

#Step 3

#randomly split data
set.seed(1)

train_indices <- sample(1:nrow(shortageLev), size = floor(nrow(shortageLev)/2))

train_data <- shortageLev %>%
  slice(train_indices)

test_data <- shortageLev %>%
  slice(-train_indices)  

#train
mod1 <- glm(avgCDPC ~ staffLev, data = train_data, family = "binomial")
mod2 <- glm(avgCDPC ~ staffLev + percentUsedBedsCovid, data = train_data, family = "binomial")

#test
test1 <- predict(mod1, newdata = test_data, type = "response")

test_mat1<-data.frame(staffLev=test_data$staffLev, predstaffLev=test1>.5)%>%
  group_by(staffLev, predstaffLev)%>%
  summarise(n=n())

## `summarise()` has grouped output by 'staffLev'. You can override using the `.groups` argument.

test_mat1

## # A tibble: 2 x 3
## # Groups:   staffLev [2]
##   staffLev predstaffLev     n
##      <dbl> <lgl>        <int>
## 1        0 FALSE           17
## 2        1 FALSE            8

test2 <- predict(mod2, newdata = test_data, type = "response")

test_mat2<-data.frame(staffLev=test_data$staffLev, predstaffLev=test2>.5)%>%
  group_by(staffLev, predstaffLev)%>%
  summarise(n=n())

## `summarise()` has grouped output by 'staffLev'. You can override using the `.groups` argument.

test_mat2

## # A tibble: 2 x 3
## # Groups:   staffLev [2]
##   staffLev predstaffLev     n
##      <dbl> <lgl>        <int>
## 1        0 FALSE           17
## 2        1 FALSE            8

#Confusion Matrix
pred1 <- predict(mod1, newdata = shortageLev, type = "response")

conf_mat1<-data.frame(staffLev=shortageLev$staffLev, predstaffLev=pred1>.5)%>%
  group_by(staffLev, predstaffLev)%>%
  summarise(n=n())

## `summarise()` has grouped output by 'staffLev'. You can override using the `.groups` argument.

conf_mat1

## # A tibble: 2 x 3
## # Groups:   staffLev [2]
##   staffLev predstaffLev     n
##      <dbl> <lgl>        <int>
## 1        0 FALSE           33
## 2        1 FALSE           17

#they both say false but we can't figure out why

pred2 <- predict(mod2, newdata = , type = "response")

conf_mat2<-data.frame(staffLev=shortageLev$staffLev, predstaffLev=pred2>.5)%>%
  group_by(staffLev, predstaffLev)%>%
  summarise(n=n())

## Warning in data.frame(staffLev = shortageLev$staffLev, predstaffLev = pred2 > :
## row names were found from a short variable and have been discarded

## `summarise()` has grouped output by 'staffLev'. You can override using the `.groups` argument.

conf_mat2

## # A tibble: 2 x 3
## # Groups:   staffLev [2]
##   staffLev predstaffLev     n
##      <dbl> <lgl>        <int>
## 1        0 FALSE           33
## 2        1 FALSE           17

Milestone 5

Jessie & Allison

12/2/2021