#Load everything we need
#install.packages("dplyr")
#install.packages("tidyr")
#install.packages("ggplot2")
#install.packages("tidyverse")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.4 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## v purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(readxl)
Hospital_capacity_RAW <- read_excel("C:/Users/jessi/Downloads/Hospital_capacity_RAW.xls")
StateMedInc <- read_excel("C:/Users/jessi/Downloads/StateMedInc.xls")
#Narrowing the dataset from 117 variables to 20
SelectDataHosp <-Hospital_capacity_RAW %>%
select(state,
date,
critical_staffing_shortage_today_yes,
critical_staffing_shortage_today_no,
critical_staffing_shortage_today_not_reported,
hospital_onset_covid,
inpatient_beds,
inpatient_beds_used,
inpatient_beds_used_covid,
staffed_adult_icu_bed_occupancy,
staffed_icu_adult_patients_confirmed_and_suspected_covid,
staffed_icu_adult_patients_confirmed_covid,
total_adult_patients_hospitalized_confirmed_and_suspected_covid,
total_adult_patients_hospitalized_confirmed_covid,
total_pediatric_patients_hospitalized_confirmed_and_suspected_covid,
total_pediatric_patients_hospitalized_confirmed_covid,
total_staffed_adult_icu_beds,
inpatient_beds_utilization,
percent_of_inpatients_with_covid,
deaths_covid)
#This is THE data set (Only jan data for our 20 variables)
Data <- SelectDataHosp %>%
filter(year(date)==2021,
month(date)==1)
##Getting January 2021 average # of hospitals with staffing shortages by state
shortStaffMeans <- Data %>%
group_by(state)%>%
summarize(mean(critical_staffing_shortage_today_yes, na.rm = TRUE))
###I ended up making a new set for the means that we wanted, this makes it easier to merge with the Med. Income Data
##The columns had weird names, so I renamed them
colnames(shortStaffMeans)<- c("state", "shortStaff")
colnames(StateMedInc) <- c("StateName", "medFamInc", "state")
##Merged average number of hospitals with staffing shortages in each state data with median household income by state data
EDAQuestion1 <- merge(shortStaffMeans, StateMedInc)
###This is the set we will use to answer Q1
##Made a basic plot to see if staffing shortages might be impacted by a state's median household income
ggplot(EDAQuestion1, aes(x = medFamInc, y = shortStaff))+
geom_point(color="blue")

#Now to prep for Q2 plot...
##Getting January 2021 average daily COVID deaths by state
covidDeathMean <- SelectDataHosp%>% #sorry jdog I changed this
group_by(state)%>%
filter(year(date)==2021,
month(date)==1)%>%
summarize(mean(deaths_covid, na.rm=TRUE))
##Renaming columns so they make more sense
colnames(covidDeathMean) <- c("state", "avgCovidDeaths")
##Merged Jan 2021 daily COVID death average data by state with median household income data by state
EDAQuestion2 <- merge(covidDeathMean, StateMedInc)
###This is the data set we will use to answer Q2
##Making a basic plot to see if there is a relationship between Avg. daily deaths and median household income by state
ggplot(EDAQuestion2, aes(x = medFamInc, y = avgCovidDeaths))+
geom_point(color='purple')

#Now, for EDA Q3...
##Merging our two data sets together, average daily COVID deaths by state and average number of hospitals with staffing shortages by state
EDAQuestion3 <- merge(covidDeathMean, shortStaffMeans)
##Making a plot to see if there is a relationship between average daily COVID deaths and average # of hospitals with staffing shortages daily
ggplot(EDAQuestion3, aes(x = shortStaff, y = avgCovidDeaths))+
geom_point(color="red")+
scale_x_log10()+
scale_y_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis

m1 <- lm(shortStaff~avgCovidDeaths, data = EDAQuestion3)
slope <- m1$coefficients[2]
slope
## avgCovidDeaths
## 0.3921884
intercept <- m1$coefficients[1]
intercept
## (Intercept)
## 3.618213
ggplot(EDAQuestion3, aes(avgCovidDeaths, m1$residuals))+
geom_point()+
geom_hline(yintercept = 0,
color = "blue")

##Making a plot to see if switching the axes the variables are on reveals a new relationship
ggplot(EDAQuestion3, aes(x = avgCovidDeaths, y = shortStaff))+
geom_point(color="orange")+
geom_abline(slope=m1$coefficients[2], intercept=m1$coefficients[1],color="blue", lty=2, lwd=1)

names(m1)
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
qqnorm(m1$residuals)
qqline(m1$residuals)

dev.off()
## null device
## 1
ggplot(EDAQuestion3, aes(avgCovidDeaths, m1$residual))+
geom_point()+
ggtitle("Residual Plot")+
xlab("avgCovidDeaths")+
ylab("Residuals")+
theme_bw()+
geom_hline(yintercept = 0,
color="blue", lty=2, lwd=1)
plot(m1)
summary(m1)
##
## Call:
## lm(formula = shortStaff ~ avgCovidDeaths, data = EDAQuestion3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -47.135 -5.187 -1.129 5.196 66.959
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.61821 2.52695 1.432 0.158
## avgCovidDeaths 0.39219 0.03145 12.469 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.93 on 51 degrees of freedom
## Multiple R-squared: 0.753, Adjusted R-squared: 0.7481
## F-statistic: 155.5 on 1 and 51 DF, p-value: < 2.2e-16
anova(m1)
## Analysis of Variance Table
##
## Response: shortStaff
## Df Sum Sq Mean Sq F value Pr(>F)
## avgCovidDeaths 1 39453 39453 155.47 < 2.2e-16 ***
## Residuals 51 12942 254
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1