##Install Packages & load libraries/data

#Install everything we need
#install.packages("dplyr")
#install.packages("tidyr")
#install.packages("ggplot2")
#install.packages("tidyverse")
#install.packages("GGally")

#Load packages into global environment
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble  3.1.4     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## v purrr   0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(GGally)
## Warning: package 'GGally' was built under R version 4.1.2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(ISLR)

library(readxl)

Hospital_capacity_RAW <- read_excel("C:/Users/jessi/Downloads/Hospital_capacity_RAW.xls")

StateMedInc <- read_excel("C:/Users/jessi/Downloads/StateMedInc.xls")

StatePop <-
read_excel("C:/Users/Jessi/Downloads/StatePop.xlsx")

#Data Wrangling

SelectDataHosp <-Hospital_capacity_RAW %>%
  select(state,
         date,
         critical_staffing_shortage_today_yes,
         critical_staffing_shortage_today_no,
         critical_staffing_shortage_today_not_reported,
         hospital_onset_covid,
         inpatient_beds,
         inpatient_beds_used,
         inpatient_beds_used_covid,
         staffed_adult_icu_bed_occupancy,
         staffed_icu_adult_patients_confirmed_and_suspected_covid,
         staffed_icu_adult_patients_confirmed_covid,
         total_adult_patients_hospitalized_confirmed_and_suspected_covid,
         total_adult_patients_hospitalized_confirmed_covid,
         total_pediatric_patients_hospitalized_confirmed_and_suspected_covid,
         total_pediatric_patients_hospitalized_confirmed_covid,
         total_staffed_adult_icu_beds,
         inpatient_beds_utilization,
         percent_of_inpatients_with_covid,
         deaths_covid)

#Narrowing dataset to only include January 2021 dates
Data <- SelectDataHosp %>%
  filter(year(date)==2021, 
         month(date)==1)

#Getting January 2021 average number of hospitals with staffing shortages by state each day 
shortStaffMeans <- Data %>%
  group_by(state)%>%
  summarize(mean(critical_staffing_shortage_today_yes, na.rm = TRUE))

#Remaming the columns
colnames(shortStaffMeans)<- c("state", "shortStaff")
colnames(StateMedInc) <- c("StateName", "medFamInc", "state")

##Getting January 2021 average daily COVID deaths by state
covidDeathMean <- SelectDataHosp%>% 
  group_by(state)%>%
  filter(year(date)==2021, 
         month(date)==1)%>%
  summarize(mean(deaths_covid, na.rm=TRUE)) 

##Renaming columns so they make more sense
colnames(covidDeathMean) <- c("state", "avgCovidDeaths")

StateMedInc%>%
  arrange(medFamInc)
## # A tibble: 51 x 3
##    StateName      medFamInc state
##    <chr>              <dbl> <chr>
##  1 Mississippi        45081 MS   
##  2 West Virginia      46711 WV   
##  3 Arkansas           47597 AR   
##  4 Louisiana          49469 LA   
##  5 New Mexico         49754 NM   
##  6 Alabama            50536 AL   
##  7 Kentucky           50589 KY   
##  8 Oklahoma           52919 OK   
##  9 South Carolina     53199 SC   
## 10 Tennessee          53320 TN   
## # ... with 41 more rows
low<-StateMedInc%>%
  filter(medFamInc <=56602)%>%
  mutate(incLev="low")

med<-StateMedInc%>%
  filter(medFamInc > 56602 & medFamInc <=65886)%>%
  mutate(incLev="med")

high<-StateMedInc%>%
  filter(medFamInc >= 67167)%>%
  mutate(incLev="high")

stateLev<-rbind(low, med, high)
View(stateLev)

#Creating final usable data set
covidDeathMean<- covidDeathMean[-c(40,48),]
shortStaffMeans<- shortStaffMeans[-c(40, 48),]

joinedData <- left_join(covidDeathMean, stateLev)
## Joining, by = "state"
joinedData <- left_join(joinedData, shortStaffMeans)
## Joining, by = "state"
#Percent beds used by COVID patients
percentBedsCovid <- Data%>%
  group_by(state)%>%
  filter(year(date)==2021, 
         month(date)==1)%>%
  summarize(mean(inpatient_beds_used_covid/inpatient_beds_used, na.rm=TRUE)) 
percentBedsCovid<- percentBedsCovid[-c(40, 48),]
colnames(percentBedsCovid) <- c("state", "percentUsedBedsCovid")

#Avg. confirmed COVID cases in hospitals in each state
totalConfirmedCovid <- Data%>%
  group_by(state)%>%
  filter(year(date)==2021, 
         month(date)==1)%>%
  summarize(mean(total_adult_patients_hospitalized_confirmed_covid + total_pediatric_patients_hospitalized_confirmed_covid))
totalConfirmedCovid<- totalConfirmedCovid[-c(40, 48),]
colnames(totalConfirmedCovid) <- c("state", "totalConfirmedCovidPatients")

#Percent of inpatients with COVID
percentCovid <- Data%>%
  group_by(state)%>%
  filter(year(date)==2021, 
         month(date)==1)%>%
  summarize(mean(percent_of_inpatients_with_covid))
percentCovid<- percentCovid[-c(40, 48),]
colnames(percentCovid) <- c("state", "PercentInpatientsCovid")

joinedData <- left_join(joinedData, percentBedsCovid)
## Joining, by = "state"
joinedData <- left_join(joinedData, totalConfirmedCovid)
## Joining, by = "state"
joinedData <- left_join(joinedData, percentCovid)
## Joining, by = "state"
joinedData <- left_join(joinedData, StatePop)
## Joining, by = "StateName"
colnames(joinedData)<-c("state", "avgCovidDeaths", "stateName", "medFamInc", "incLev", "shortStaff", "percentUsedBedsCovid", "totalConfirmedCovidPatients", "percentInpatientsCovid", "rank", "census2020", "percentTotalUSPop")

covid<-joinedData%>%
  mutate(avgCDPC=avgCovidDeaths/census2020) 

covid<-joinedData%>%
  mutate(avgSSPC=shortStaff/census2020)

avgSSPC <- covid%>%
  select("shortStaff", "census2020")%>%
  mutate(avgSSPC=shortStaff/census2020)


avgCDPC <- covid%>%
  select("avgCovidDeaths", "census2020")%>%
  mutate(avgCDPC=avgCovidDeaths/census2020)

covid <- left_join(covid, avgSSPC)
## Joining, by = c("shortStaff", "census2020", "avgSSPC")
covid <- left_join(covid, avgCDPC)
## Joining, by = c("avgCovidDeaths", "census2020")

#Step 1

#STEP 1
#Staffing shortages is the number of hospitals in the state that are under staffed. Staffing Shortages becomes dichotomized as Extreme(ly short staffed) (top â…“), Somewhat (short staffed)(bottom â…”). The numeric variable becomes categorical so the information can be more easily used. The data is split into higher and lower staffing shortage groups (labeled Extremely and Somewhat).

#Dichotomize the Data
delta <- covid%>%
  select("state", "avgSSPC")

delta%>% 
  arrange(avgSSPC)
## # A tibble: 51 x 2
##    state     avgSSPC
##    <chr>       <dbl>
##  1 KY    0.000000179
##  2 UT    0.000000221
##  3 FL    0.000000400
##  4 NY    0.000000442
##  5 OR    0.000000472
##  6 NC    0.000000516
##  7 MD    0.000000522
##  8 NJ    0.00000102 
##  9 NV    0.00000104 
## 10 CO    0.00000106 
## # ... with 41 more rows
extremely<-covid%>%
  filter(avgSSPC > 4.39264e-06)%>%
  mutate(staffLev=1)

somewhat<-covid%>%
  filter(avgSSPC <=4.39264e-06)%>%
  mutate(staffLev=0)

shortageLev<-rbind(somewhat, extremely)
view(shortageLev)

#Step 2

qplot(x = avgCDPC, y = staffLev, data = shortageLev, 
      geom = "point", alpha = I(.5), ylab = "Staff Shortage Level") +
  stat_smooth(method = "lm",
              se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

qplot(x = avgCDPC, y = staffLev, data = shortageLev, 
      geom = "point", alpha = I(.5), ylab = "Staff Shortage Level") +
  stat_smooth(method = "glm", method.args = list(family = "binomial"),
              se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

m1 <- glm(staffLev ~ avgCDPC, data = shortageLev, family = "binomial")
summary(m1)
## 
## Call:
## glm(formula = staffLev ~ avgCDPC, family = "binomial", data = shortageLev)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4741  -0.8683  -0.5582   1.0430   1.9882  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -2.330e+00  7.584e-01  -3.072  0.00213 **
## avgCDPC      3.113e+05  1.218e+05   2.556  0.01059 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 64.104  on 49  degrees of freedom
## Residual deviance: 56.212  on 48  degrees of freedom
## AIC: 60.212
## 
## Number of Fisher Scoring iterations: 3
#The model is staff shortage level as a binary (either extreme or somewhat) as compared to Covid Deaths Per Capita. -2.330e+00 + 3.113+05x=y is the equation of the line. There is a positive slope, so there is a positive association between CDPC and short staffing.

#Step 3

#randomly split data
set.seed(1)

train_indices <- sample(1:nrow(shortageLev), size = floor(nrow(shortageLev)/2))

train_data <- shortageLev %>%
  slice(train_indices)

test_data <- shortageLev %>%
  slice(-train_indices)  

#train
mod1 <- glm(avgCDPC ~ staffLev, data = train_data, family = "binomial")
mod2 <- glm(avgCDPC ~ staffLev + percentUsedBedsCovid, data = train_data, family = "binomial")

#test
test1 <- predict(mod1, newdata = test_data, type = "response")

test_mat1<-data.frame(staffLev=test_data$staffLev, predstaffLev=test1>.5)%>%
  group_by(staffLev, predstaffLev)%>%
  summarise(n=n())
## `summarise()` has grouped output by 'staffLev'. You can override using the `.groups` argument.
test_mat1
## # A tibble: 2 x 3
## # Groups:   staffLev [2]
##   staffLev predstaffLev     n
##      <dbl> <lgl>        <int>
## 1        0 FALSE           17
## 2        1 FALSE            8
test2 <- predict(mod2, newdata = test_data, type = "response")

test_mat2<-data.frame(staffLev=test_data$staffLev, predstaffLev=test2>.5)%>%
  group_by(staffLev, predstaffLev)%>%
  summarise(n=n())
## `summarise()` has grouped output by 'staffLev'. You can override using the `.groups` argument.
test_mat2
## # A tibble: 2 x 3
## # Groups:   staffLev [2]
##   staffLev predstaffLev     n
##      <dbl> <lgl>        <int>
## 1        0 FALSE           17
## 2        1 FALSE            8
#Confusion Matrix
pred1 <- predict(mod1, newdata = shortageLev, type = "response")

conf_mat1<-data.frame(staffLev=shortageLev$staffLev, predstaffLev=pred1>.5)%>%
  group_by(staffLev, predstaffLev)%>%
  summarise(n=n())
## `summarise()` has grouped output by 'staffLev'. You can override using the `.groups` argument.
conf_mat1
## # A tibble: 2 x 3
## # Groups:   staffLev [2]
##   staffLev predstaffLev     n
##      <dbl> <lgl>        <int>
## 1        0 FALSE           33
## 2        1 FALSE           17
#they both say false but we can't figure out why

pred2 <- predict(mod2, newdata = , type = "response")

conf_mat2<-data.frame(staffLev=shortageLev$staffLev, predstaffLev=pred2>.5)%>%
  group_by(staffLev, predstaffLev)%>%
  summarise(n=n())
## Warning in data.frame(staffLev = shortageLev$staffLev, predstaffLev = pred2 > :
## row names were found from a short variable and have been discarded
## `summarise()` has grouped output by 'staffLev'. You can override using the `.groups` argument.
conf_mat2
## # A tibble: 2 x 3
## # Groups:   staffLev [2]
##   staffLev predstaffLev     n
##      <dbl> <lgl>        <int>
## 1        0 FALSE           33
## 2        1 FALSE           17