Milestone 4

##First, install & load packages that we need

#Install everything we need
#install.packages("dplyr")
#install.packages("tidyr")
#install.packages("ggplot2")
#install.packages("tidyverse")

#Load packages into global environment
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(ggplot2)
library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v tibble  3.1.4     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## v purrr   0.3.4

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

##Now, we import the dataset

library(readxl)

Hospital_capacity_RAW <- read_excel("C:/Users/jessi/Downloads/Hospital_capacity_RAW.xls")

StateMedInc <- read_excel("C:/Users/jessi/Downloads/StateMedInc.xls")

StatePop <-
read_excel("C:/Users/Jessi/Downloads/StatePop.xlsx")

##Time to wrangle/tidy up our dataset ####Making our dataset smaller, and narrowing down to only January dates

#Tidying Hospital_capacity_RAW dataset
SelectDataHosp <-Hospital_capacity_RAW %>%
  select(state,
         date,
         critical_staffing_shortage_today_yes,
         critical_staffing_shortage_today_no,
         critical_staffing_shortage_today_not_reported,
         hospital_onset_covid,
         inpatient_beds,
         inpatient_beds_used,
         inpatient_beds_used_covid,
         staffed_adult_icu_bed_occupancy,
         staffed_icu_adult_patients_confirmed_and_suspected_covid,
         staffed_icu_adult_patients_confirmed_covid,
         total_adult_patients_hospitalized_confirmed_and_suspected_covid,
         total_adult_patients_hospitalized_confirmed_covid,
         total_pediatric_patients_hospitalized_confirmed_and_suspected_covid,
         total_pediatric_patients_hospitalized_confirmed_covid,
         total_staffed_adult_icu_beds,
         inpatient_beds_utilization,
         percent_of_inpatients_with_covid,
         deaths_covid)

#Narrowing dataset to only include January 2021 dates
Data <- SelectDataHosp %>%
  filter(year(date)==2021, 
         month(date)==1)

##Creating datasets with the jan 2021 Average for each variable in each state ####Staffing Shortages

#Getting January 2021 average number of hospitals with staffing shortages by state each day 
shortStaffMeans <- Data %>%
  group_by(state)%>%
  summarize(mean(critical_staffing_shortage_today_yes, na.rm = TRUE))

#Remaming the columns
colnames(shortStaffMeans)<- c("state", "shortStaff")
colnames(StateMedInc) <- c("StateName", "medFamInc", "state")

####COVID Deaths

##Getting January 2021 average daily COVID deaths by state
covidDeathMean <- SelectDataHosp%>% 
  group_by(state)%>%
  filter(year(date)==2021, 
         month(date)==1)%>%
  summarize(mean(deaths_covid, na.rm=TRUE)) 

##Renaming columns so they make more sense
colnames(covidDeathMean) <- c("state", "avgCovidDeaths")

####Creating three levels for state median household income (low, med, high)

StateMedInc%>%
  arrange(medFamInc)

## # A tibble: 51 x 3
##    StateName      medFamInc state
##    <chr>              <dbl> <chr>
##  1 Mississippi        45081 MS   
##  2 West Virginia      46711 WV   
##  3 Arkansas           47597 AR   
##  4 Louisiana          49469 LA   
##  5 New Mexico         49754 NM   
##  6 Alabama            50536 AL   
##  7 Kentucky           50589 KY   
##  8 Oklahoma           52919 OK   
##  9 South Carolina     53199 SC   
## 10 Tennessee          53320 TN   
## # ... with 41 more rows

low<-StateMedInc%>%
  filter(medFamInc <=56602)%>%
  mutate(incLev="low")

med<-StateMedInc%>%
  filter(medFamInc > 56602 & medFamInc <=65886)%>%
  mutate(incLev="med")

high<-StateMedInc%>%
  filter(medFamInc >= 67167)%>%
  mutate(incLev="high")

stateLev<-rbind(low, med, high)
View(stateLev)

####Joining data together

#Creating final usable data set
covidDeathMean<- covidDeathMean[-c(40,48),]
shortStaffMeans<- shortStaffMeans[-c(40, 48),]

joinedData <- left_join(covidDeathMean, stateLev)

## Joining, by = "state"

joinedData <- left_join(joinedData, shortStaffMeans)

## Joining, by = "state"

#Percent beds used by COVID patients
percentBedsCovid <- Data%>%
  group_by(state)%>%
  filter(year(date)==2021, 
         month(date)==1)%>%
  summarize(mean(inpatient_beds_used_covid/inpatient_beds_used, na.rm=TRUE)) 
percentBedsCovid<- percentBedsCovid[-c(40, 48),]
colnames(percentBedsCovid) <- c("state", "percentUsedBedsCovid")

#Avg. confirmed COVID cases in hospitals in each state
totalConfirmedCovid <- Data%>%
  group_by(state)%>%
  filter(year(date)==2021, 
         month(date)==1)%>%
  summarize(mean(total_adult_patients_hospitalized_confirmed_covid + total_pediatric_patients_hospitalized_confirmed_covid))
totalConfirmedCovid<- totalConfirmedCovid[-c(40, 48),]
colnames(totalConfirmedCovid) <- c("state", "totalConfirmedCovidPatients")

#Percent of inpatients with COVID
percentCovid <- Data%>%
  group_by(state)%>%
  filter(year(date)==2021, 
         month(date)==1)%>%
  summarize(mean(percent_of_inpatients_with_covid))
percentCovid<- percentCovid[-c(40, 48),]
colnames(percentCovid) <- c("state", "PercentInpatientsCovid")

joinedData <- left_join(joinedData, percentBedsCovid)

## Joining, by = "state"

joinedData <- left_join(joinedData, totalConfirmedCovid)

## Joining, by = "state"

joinedData <- left_join(joinedData, percentCovid)

## Joining, by = "state"

joinedData <- left_join(joinedData, StatePop)

## Joining, by = "StateName"

write.csv(joinedData, "C:/Users/jessi/Downloads/finalProjData.csv",
         row.names = FALSE)

joinedData<-read.csv("C:/Users/jessi/Downloads/finalProjData.csv", 
                     header=TRUE)

##Mutating the covid deaths column to change data to be COVID deaths PER CAPITA to account for state population

covid<-joinedData%>%
  mutate(avgCDPC=avgCovidDeaths/X2020.Census)

str(covid)

## 'data.frame':    51 obs. of  13 variables:
##  $ state                      : chr  "AK" "AL" "AR" "AZ" ...
##  $ avgCovidDeaths             : num  0.677 50.774 22.516 85.806 451.677 ...
##  $ StateName                  : chr  "Alaska" "Alabama" "Arkansas" "Arizona" ...
##  $ medFamInc                  : int  77640 50536 47597 58945 75235 72331 78444 86420 68287 55660 ...
##  $ incLev                     : chr  "high" "low" "low" "med" ...
##  $ shortStaff                 : num  1 38 17.6 31.8 173.7 ...
##  $ percentUsedBedsCovid       : num  0.074 0.245 0.195 0.358 0.376 ...
##  $ totalConfirmedCovidPatients: num  59.2 2752.3 1110.3 4208.4 19973.9 ...
##  $ PercentInpatientsCovid     : num  0.074 0.252 0.196 0.366 0.381 ...
##  $ Rank                       : int  48 24 33 14 1 21 29 NA 45 3 ...
##  $ X2020.Census               : int  733391 5024279 3011524 7151502 39538223 5773714 3605944 NA 989948 21538187 ...
##  $ Percent.of.Total.US.Pop    : num  0.0022 0.0148 0.0091 0.0219 0.1191 ...
##  $ avgCDPC                    : num  9.24e-07 1.01e-05 7.48e-06 1.20e-05 1.14e-05 ...

names(covid)

##  [1] "state"                       "avgCovidDeaths"             
##  [3] "StateName"                   "medFamInc"                  
##  [5] "incLev"                      "shortStaff"                 
##  [7] "percentUsedBedsCovid"        "totalConfirmedCovidPatients"
##  [9] "PercentInpatientsCovid"      "Rank"                       
## [11] "X2020.Census"                "Percent.of.Total.US.Pop"    
## [13] "avgCDPC"

#Create a Simple Linear Model

### Mod A: ANOVA
ggplot(data=covid, aes(y=PercentInpatientsCovid, fill=incLev))+
  geom_boxplot()

modA<-lm(PercentInpatientsCovid~incLev, data=covid)
anova(modA)

## Analysis of Variance Table
## 
## Response: PercentInpatientsCovid
##           Df   Sum Sq   Mean Sq F value Pr(>F)
## incLev     2 0.007872 0.0039359    0.79 0.4597
## Residuals 48 0.239147 0.0049822

#to calculate MSE
modASum<- summary(modA)
mean(modASum$residuals^2)

## [1] 0.004689151

#SLR Model that has points color coordinated to the income level of that state (low, med, high)

#create model
ggplot(data=covid, aes(y=PercentInpatientsCovid, x=avgCDPC))+
  geom_point()

## Warning: Removed 1 rows containing missing values (geom_point).

modB<-lm(PercentInpatientsCovid~avgCDPC, data=covid)

#to calculate MSE
modBSum<- summary(modB)
mean(modBSum$residuals^2)

## [1] 0.0009018205

#plot the model
ggplot(data=covid, aes(y=PercentInpatientsCovid, x=avgCDPC,color=incLev))+
  geom_point()

## Warning: Removed 1 rows containing missing values (geom_point).

#Factoring our categorical variable by creating a “dummy” variable

##Coding dummy variable
covid$incLev <- factor(covid$incLev,
                       levels = c("high", "med", "low"))
contrasts(covid$incLev)

##      med low
## high   0   0
## med    1   0
## low    0   1

#Creating a MLR

#create model
modC<-lm(PercentInpatientsCovid~avgCDPC+incLev, data=covid)

#to calculate MSE
modCSum<- summary(modC)
mean(modCSum$residuals^2)

## [1] 0.0007865488

#plot the model
ggplot(data=covid, aes(x=avgCDPC, y=PercentInpatientsCovid, color=incLev))+
  geom_point()+
  ggtitle("Scatterplot of Average COVID deaths per cap v. % Inpatients COVID")+
  theme_bw()+
  geom_abline(slope=modC$coefficients[2], intercept = modC$coefficients[1], col=2)+
  geom_abline(slope=modC$coefficients[2], intercept = modC$coefficients[1]+modC$coefficients[3])

## Warning: Removed 1 rows containing missing values (geom_point).

#MLR that includes interaction between numeric and categorical variables

modD<-lm(PercentInpatientsCovid~avgCDPC+incLev, data=covid)

#calculate MSE
modDSum<- summary(modD)
mean(modDSum$residuals^2)

## [1] 0.0007865488

#plot the model
ggplot(data=covid, aes(x=avgCDPC, y=PercentInpatientsCovid, color=incLev))+
  geom_point()+
  ggtitle("Scatterplot of Average COVID deaths per cap v. % Inpatients COVID")+
  theme_bw()+
  geom_abline(slope=modD$coefficients[2], intercept = modD$coefficients[1], col=2)+
  geom_abline(slope=modD$coefficients[2]+modD$coefficients[4], intercept = modD$coefficients[1]+modD$coefficients[3], col=4)

## Warning: Removed 1 rows containing missing values (geom_point).

#Conclusion

#We learned that the relationship between COVID bed use per capita and COVID deaths are linear, but are not very strongly related to state average family income (within the US).

Milestone 4

Jessie & Allison

11/8/2021