#Load everything we need
#install.packages("dplyr")
#install.packages("tidyr")
#install.packages("ggplot2")
#install.packages("tidyverse")

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble  3.1.4     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## v purrr   0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(readxl)

Hospital_capacity_RAW <- read_excel("C:/Users/jessi/Downloads/Hospital_capacity_RAW.xls")

StateMedInc <- read_excel("C:/Users/jessi/Downloads/StateMedInc.xls")

#Narrowing the dataset from 117 variables to 20
SelectDataHosp <-Hospital_capacity_RAW %>%
  select(state,
         date,
         critical_staffing_shortage_today_yes,
         critical_staffing_shortage_today_no,
         critical_staffing_shortage_today_not_reported,
         hospital_onset_covid,
         inpatient_beds,
         inpatient_beds_used,
         inpatient_beds_used_covid,
         staffed_adult_icu_bed_occupancy,
         staffed_icu_adult_patients_confirmed_and_suspected_covid,
         staffed_icu_adult_patients_confirmed_covid,
         total_adult_patients_hospitalized_confirmed_and_suspected_covid,
         total_adult_patients_hospitalized_confirmed_covid,
         total_pediatric_patients_hospitalized_confirmed_and_suspected_covid,
         total_pediatric_patients_hospitalized_confirmed_covid,
         total_staffed_adult_icu_beds,
         inpatient_beds_utilization,
         percent_of_inpatients_with_covid,
         deaths_covid)

#This is THE data set (Only jan data for our 20 variables)
Data <- SelectDataHosp %>%
  filter(year(date)==2021, 
         month(date)==1)
##Getting January 2021 average # of hospitals with staffing shortages by state
shortStaffMeans <- Data %>%
  group_by(state)%>%
  summarize(mean(critical_staffing_shortage_today_yes, na.rm = TRUE))
###I ended up making a new set for the means that we wanted, this makes it easier to merge with the Med. Income Data

##The columns had weird names, so I renamed them
colnames(shortStaffMeans)<- c("state", "shortStaff")
colnames(StateMedInc) <- c("StateName", "medFamInc", "state")


##Merged average number of hospitals with staffing shortages in each state data with median household income by state data
EDAQuestion1 <- merge(shortStaffMeans, StateMedInc)
###This is the set we will use to answer Q1

##Made a basic plot to see if staffing shortages might be impacted by a state's median household income
ggplot(EDAQuestion1, aes(x = medFamInc, y = shortStaff))+
  geom_point(color="blue")

#Now to prep for Q2 plot...

##Getting January 2021 average daily COVID deaths by state
covidDeathMean <- SelectDataHosp%>% #sorry jdog I changed this
  group_by(state)%>%
  filter(year(date)==2021, 
         month(date)==1)%>%
  summarize(mean(deaths_covid, na.rm=TRUE)) 

##Renaming columns so they make more sense
colnames(covidDeathMean) <- c("state", "avgCovidDeaths")

##Merged Jan 2021 daily COVID death average data by state with median household income data by state
EDAQuestion2 <- merge(covidDeathMean, StateMedInc)
###This is the data set we will use to answer Q2

##Making a basic plot to see if there is a relationship between Avg. daily deaths and median household income by state
ggplot(EDAQuestion2, aes(x = medFamInc, y = avgCovidDeaths))+
  geom_point(color='purple')

#Now, for EDA Q3...

##Merging our two data sets together, average daily COVID deaths by state and average number of hospitals with staffing shortages by state
EDAQuestion3 <- merge(covidDeathMean, shortStaffMeans)

##Making a plot to see if there is a relationship between average daily COVID deaths and average # of hospitals with staffing shortages daily
ggplot(EDAQuestion3, aes(x = shortStaff, y = avgCovidDeaths))+
  geom_point(color="red")+
  scale_x_log10()+
  scale_y_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis

m1 <- lm(shortStaff~avgCovidDeaths, data = EDAQuestion3)

slope <- m1$coefficients[2]
slope
## avgCovidDeaths 
##      0.3921884
intercept <- m1$coefficients[1]
intercept
## (Intercept) 
##    3.618213
ggplot(EDAQuestion3, aes(avgCovidDeaths, m1$residuals))+
  geom_point()+
  geom_hline(yintercept = 0,
             color = "blue")

##Making a plot to see if switching the axes the variables are on reveals a new relationship
ggplot(EDAQuestion3, aes(x = avgCovidDeaths, y = shortStaff))+
  geom_point(color="orange")+
  geom_abline(slope=m1$coefficients[2], intercept=m1$coefficients[1],color="blue", lty=2, lwd=1)

names(m1)
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"
qqnorm(m1$residuals)
qqline(m1$residuals)

dev.off()
## null device 
##           1
ggplot(EDAQuestion3, aes(avgCovidDeaths, m1$residual))+
  geom_point()+
  ggtitle("Residual Plot")+
  xlab("avgCovidDeaths")+
  ylab("Residuals")+
  theme_bw()+
  geom_hline(yintercept = 0,
             color="blue", lty=2, lwd=1)

plot(m1)

summary(m1)
## 
## Call:
## lm(formula = shortStaff ~ avgCovidDeaths, data = EDAQuestion3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -47.135  -5.187  -1.129   5.196  66.959 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     3.61821    2.52695   1.432    0.158    
## avgCovidDeaths  0.39219    0.03145  12.469   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.93 on 51 degrees of freedom
## Multiple R-squared:  0.753,  Adjusted R-squared:  0.7481 
## F-statistic: 155.5 on 1 and 51 DF,  p-value: < 2.2e-16
anova(m1)
## Analysis of Variance Table
## 
## Response: shortStaff
##                Df Sum Sq Mean Sq F value    Pr(>F)    
## avgCovidDeaths  1  39453   39453  155.47 < 2.2e-16 ***
## Residuals      51  12942     254                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1