Loading packages & Data we Need

#Install everything we need

#install.packages("dplyr")
#install.packages("tidyr")
#install.packages("ggplot2")
#install.packages("tidyverse")
#install.packages("GGally")

#Load packages into global environment
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble  3.1.4     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## v purrr   0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(GGally)
## Warning: package 'GGally' was built under R version 4.1.2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(ISLR)

library(readxl)

#Load our data sets
Hospital_capacity_RAW <- read_excel("C:/Users/jessi/Downloads/Hospital_capacity_RAW.xls")

StateMedInc <- read_excel("C:/Users/jessi/Downloads/StateMedInc.xls")

StatePop <-
  read_excel("C:/Users/Jessi/Downloads/StatePop.xlsx")

Data Wrangling

Narrowing variables down

SelectDataHosp <-Hospital_capacity_RAW %>%
  select(state,
         date,
         critical_staffing_shortage_today_yes,
         critical_staffing_shortage_today_no,
         critical_staffing_shortage_today_not_reported,
         hospital_onset_covid,
         inpatient_beds,
         inpatient_beds_used,
         inpatient_beds_used_covid,
         staffed_adult_icu_bed_occupancy,
         staffed_icu_adult_patients_confirmed_and_suspected_covid,
         staffed_icu_adult_patients_confirmed_covid,
         total_adult_patients_hospitalized_confirmed_and_suspected_covid,
         total_adult_patients_hospitalized_confirmed_covid,
         total_pediatric_patients_hospitalized_confirmed_and_suspected_covid,
         total_pediatric_patients_hospitalized_confirmed_covid,
         total_staffed_adult_icu_beds,
         inpatient_beds_utilization,
         percent_of_inpatients_with_covid,
         deaths_covid)

Narrowing data to include only January 2021 averages

Data <- SelectDataHosp %>%
  filter(year(date)==2021, 
         month(date)==1)

#Getting January 2021 average number of hospitals with staffing shortages by state each day 
shortStaffMeans <- Data %>%
  group_by(state)%>%
  summarize(mean(critical_staffing_shortage_today_yes, na.rm = TRUE))

#Renaming the columns
colnames(shortStaffMeans)<- c("state", "shortStaff")
colnames(StateMedInc) <- c("StateName", "medFamInc", "state")

#Getting January 2021 average daily COVID deaths by state
covidDeathMean <- SelectDataHosp%>% 
  group_by(state)%>%
  filter(year(date)==2021, 
         month(date)==1)%>%
  summarize(mean(deaths_covid, na.rm=TRUE)) 

#Renaming the columns 
colnames(covidDeathMean) <- c("state", "avgCovidDeaths")

#Percent beds used by COVID patients
percentBedsCovid <- Data%>%
  group_by(state)%>%
  filter(year(date)==2021, 
         month(date)==1)%>%
  summarize(mean(inpatient_beds_used_covid/inpatient_beds_used, na.rm=TRUE)) 
percentBedsCovid<- percentBedsCovid[-c(40, 48),]
colnames(percentBedsCovid) <- c("state", "percentUsedBedsCovid")

#Avg. confirmed COVID cases in hospitals in each state
totalConfirmedCovid <- Data%>%
  group_by(state)%>%
  filter(year(date)==2021, 
         month(date)==1)%>%
  summarize(mean(total_adult_patients_hospitalized_confirmed_covid + total_pediatric_patients_hospitalized_confirmed_covid))
totalConfirmedCovid<- totalConfirmedCovid[-c(40, 48),]
colnames(totalConfirmedCovid) <- c("state", "totalConfirmedCovidPatients")

#Percent of inpatients with COVID
percentCovid <- Data%>%
  group_by(state)%>%
  filter(year(date)==2021, 
         month(date)==1)%>%
  summarize(mean(percent_of_inpatients_with_covid))
percentCovid<- percentCovid[-c(40, 48),]
colnames(percentCovid) <- c("state", "PercentInpatientsCovid")

Creating levels (categorical variables)

#Creating median family income levels 
StateMedInc%>%
  arrange(medFamInc)
## # A tibble: 51 x 3
##    StateName      medFamInc state
##    <chr>              <dbl> <chr>
##  1 Mississippi        45081 MS   
##  2 West Virginia      46711 WV   
##  3 Arkansas           47597 AR   
##  4 Louisiana          49469 LA   
##  5 New Mexico         49754 NM   
##  6 Alabama            50536 AL   
##  7 Kentucky           50589 KY   
##  8 Oklahoma           52919 OK   
##  9 South Carolina     53199 SC   
## 10 Tennessee          53320 TN   
## # ... with 41 more rows
low<-StateMedInc%>%
  filter(medFamInc <=56602)%>%
  mutate(incLev="low")

med<-StateMedInc%>%
  filter(medFamInc > 56602 & medFamInc <=65886)%>%
  mutate(incLev="med")

high<-StateMedInc%>%
  filter(medFamInc >= 67167)%>%
  mutate(incLev="high")

stateLev<-rbind(low, med, high)
View(stateLev)

Joining data

#Creating final usable data set
covidDeathMean<- covidDeathMean[-c(40,48),]
shortStaffMeans<- shortStaffMeans[-c(40, 48),]

joinedData <- left_join(covidDeathMean, stateLev)
## Joining, by = "state"
joinedData <- left_join(joinedData, shortStaffMeans)
## Joining, by = "state"
joinedData <- left_join(joinedData, percentBedsCovid)
## Joining, by = "state"
joinedData <- left_join(joinedData, totalConfirmedCovid)
## Joining, by = "state"
joinedData <- left_join(joinedData, percentCovid)
## Joining, by = "state"
joinedData <- left_join(joinedData, StatePop)
## Joining, by = "StateName"
#Renaming columns 
colnames(joinedData)<-c("state", "avgCovidDeaths", "stateName", "medFamInc", "incLev", "shortStaff", "percentUsedBedsCovid", "totalConfirmedCovidPatients", "percentInpatientsCovid", "rank", "census2020", "percentTotalUSPop")

Making Necessary variables per capita

covid<-joinedData%>%
  mutate(avgCDPC=avgCovidDeaths/census2020) 

covid<-joinedData%>%
  mutate(avgSSPC=shortStaff/census2020)

avgSSPC <- covid%>%
  select("shortStaff", "census2020")%>%
  mutate(avgSSPC=shortStaff/census2020)


avgCDPC <- covid%>%
  select("avgCovidDeaths", "census2020")%>%
  mutate(avgCDPC=avgCovidDeaths/census2020)

Joining all of this data

covid <- left_join(covid, avgSSPC)
## Joining, by = c("shortStaff", "census2020", "avgSSPC")
covid <- left_join(covid, avgCDPC)
## Joining, by = c("avgCovidDeaths", "census2020")

Looking at our final data set’s structure & variables

str(covid)
## tibble [51 x 14] (S3: tbl_df/tbl/data.frame)
##  $ state                      : chr [1:51] "AK" "AL" "AR" "AZ" ...
##  $ avgCovidDeaths             : num [1:51] 0.677 50.774 22.516 85.806 451.677 ...
##  $ stateName                  : chr [1:51] "Alaska" "Alabama" "Arkansas" "Arizona" ...
##  $ medFamInc                  : num [1:51] 77640 50536 47597 58945 75235 ...
##  $ incLev                     : chr [1:51] "high" "low" "low" "med" ...
##  $ shortStaff                 : num [1:51] 1 38 17.6 31.8 173.7 ...
##  $ percentUsedBedsCovid       : num [1:51] 0.074 0.245 0.195 0.358 0.376 ...
##  $ totalConfirmedCovidPatients: num [1:51] 59.2 2752.3 1110.3 4208.4 19973.9 ...
##  $ percentInpatientsCovid     : num [1:51] 0.074 0.252 0.196 0.366 0.381 ...
##  $ rank                       : num [1:51] 48 24 33 14 1 21 29 NA 45 3 ...
##  $ census2020                 : num [1:51] 733391 5024279 3011524 7151502 39538223 ...
##  $ percentTotalUSPop          : num [1:51] 0.0022 0.0148 0.0091 0.0219 0.1191 ...
##  $ avgSSPC                    : num [1:51] 1.36e-06 7.57e-06 5.86e-06 4.45e-06 4.39e-06 ...
##  $ avgCDPC                    : num [1:51] 9.24e-07 1.01e-05 7.48e-06 1.20e-05 1.14e-05 ...
names(covid)
##  [1] "state"                       "avgCovidDeaths"             
##  [3] "stateName"                   "medFamInc"                  
##  [5] "incLev"                      "shortStaff"                 
##  [7] "percentUsedBedsCovid"        "totalConfirmedCovidPatients"
##  [9] "percentInpatientsCovid"      "rank"                       
## [11] "census2020"                  "percentTotalUSPop"          
## [13] "avgSSPC"                     "avgCDPC"

Exploratory Data Analysis

Question 1

Merged average number of hospitals with staffing shortages in each state data with median household income by state data. Made a basic plot to see if staffing shortages might be impacted by a state’s median household income.

EDAQuestion1 <- merge(shortStaffMeans, StateMedInc)

ggplot(EDAQuestion1, aes(x = medFamInc, y = shortStaff))+
  geom_point(color="blue")

Question 2

Merged Jan 2021 daily COVID death average data by state with median household income data by state. Making a basic plot to see if there is a relationship between Avg. daily deaths and median household income by state.

EDAQuestion2 <- merge(covidDeathMean, StateMedInc)

ggplot(EDAQuestion2, aes(x = medFamInc, y = avgCovidDeaths))+
  geom_point(color='purple')

Question 3

Merging our two data sets together, average daily COVID deaths by state and average number of hospitals with staffing shortages by state. Making a plot to see if there is a relationship between average daily COVID deaths and average # of hospitals with staffing shortages daily.

EDAQuestion3 <- merge(covidDeathMean, shortStaffMeans)

ggplot(EDAQuestion3, aes(x = shortStaff, y = avgCovidDeaths))+
  geom_point(color="red")+
  scale_x_log10()+
  scale_y_log10()
## Warning: Transformation introduced infinite values in continuous x-axis

Creating a linear model…

model1 <- lm(shortStaff~avgCovidDeaths, data = EDAQuestion3)
summary(model1)
## 
## Call:
## lm(formula = shortStaff ~ avgCovidDeaths, data = EDAQuestion3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -47.305  -5.375  -1.347   4.916  66.894 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     3.94995    2.62041   1.507    0.138    
## avgCovidDeaths  0.39098    0.03202  12.209   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.16 on 49 degrees of freedom
## Multiple R-squared:  0.7526, Adjusted R-squared:  0.7476 
## F-statistic: 149.1 on 1 and 49 DF,  p-value: < 2.2e-16
slope <- model1$coefficients[2]
slope
## avgCovidDeaths 
##      0.3909832
intercept <- model1$coefficients[1]
intercept
## (Intercept) 
##    3.949949

…and plotting this model

ggplot(EDAQuestion3, aes(avgCovidDeaths, model1$residuals))+
  geom_point()+
  geom_hline(yintercept = 0,
             color = "blue")

Making a plot to see if switching the axes the variables are on reveals a new relationship

ggplot(EDAQuestion3, aes(x = avgCovidDeaths, y = shortStaff))+
  geom_point(color="orange")+
  geom_abline(slope=model1$coefficients[2], intercept=model1$coefficients[1],color="blue", lty=2, lwd=1)

Creating a qqplot

qqnorm(model1$residuals)
qqline(model1$residuals)

dev.off()
## null device 
##           1

Creating a residual plot

ggplot(EDAQuestion3, aes(avgCovidDeaths, model1$residuals))+
  geom_point()+
  ggtitle("Residual Plot")+
  xlab("avgCovidDeaths")+
  ylab("Residuals")+
  theme_bw()+
  geom_hline(yintercept = 0,
             color="blue", lty=2, lwd=1)

#### Creating

plot(model1)

print(model1)
## 
## Call:
## lm(formula = shortStaff ~ avgCovidDeaths, data = EDAQuestion3)
## 
## Coefficients:
##    (Intercept)  avgCovidDeaths  
##          3.950           0.391

Summary and anova table of this model

summary(model1)
## 
## Call:
## lm(formula = shortStaff ~ avgCovidDeaths, data = EDAQuestion3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -47.305  -5.375  -1.347   4.916  66.894 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     3.94995    2.62041   1.507    0.138    
## avgCovidDeaths  0.39098    0.03202  12.209   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.16 on 49 degrees of freedom
## Multiple R-squared:  0.7526, Adjusted R-squared:  0.7476 
## F-statistic: 149.1 on 1 and 49 DF,  p-value: < 2.2e-16
anova(model1)
## Analysis of Variance Table
## 
## Response: shortStaff
##                Df Sum Sq Mean Sq F value    Pr(>F)    
## avgCovidDeaths  1  38916   38916  149.06 < 2.2e-16 ***
## Residuals      49  12792     261                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Variable Selection

Pairs plot

#Data-set we'll use for pairs plot 
COVID <- covid%>%
  select("medFamInc", "percentUsedBedsCovid", "percentInpatientsCovid", "avgSSPC", "avgCDPC")

#create a pairs plot to find correlations
ggpairs(COVID)+ theme_bw()
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_density).

Creating a linear model

modA

Building and plotting model A

## Mod A: ANOVA
ggplot(data=covid, aes(y=percentInpatientsCovid, fill=incLev))+
  geom_boxplot()

modA<-lm(percentInpatientsCovid~incLev, data=covid)
anova(modA)
## Analysis of Variance Table
## 
## Response: percentInpatientsCovid
##           Df   Sum Sq   Mean Sq F value Pr(>F)
## incLev     2 0.007872 0.0039359    0.79 0.4597
## Residuals 48 0.239147 0.0049822

Calculating the MSE value and building a simple linear regression model

#to calculate MSE
summary(modA)
## 
## Call:
## lm(formula = percentInpatientsCovid ~ incLev, data = covid)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.111767 -0.043854 -0.006698  0.035112  0.210524 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.17022    0.01712   9.943 3.05e-13 ***
## incLevlow    0.02574    0.02421   1.063    0.293    
## incLevmed   -0.00119    0.02421  -0.049    0.961    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.07058 on 48 degrees of freedom
## Multiple R-squared:  0.03187,    Adjusted R-squared:  -0.008472 
## F-statistic:  0.79 on 2 and 48 DF,  p-value: 0.4597
mean(modA$residuals^2)
## [1] 0.004689151
#Creating a SLR Model that has points color coordinated to the income level of that state (low, med, high)
#create model
ggplot(data=covid, aes(y=percentInpatientsCovid, x=avgCDPC))+
  geom_point()
## Warning: Removed 1 rows containing missing values (geom_point).

modB

Building and plotting model B

modB<-lm(percentInpatientsCovid~avgCDPC, data=covid)

summary(modB)
## 
## Call:
## lm(formula = percentInpatientsCovid ~ avgCDPC, data = covid)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.054595 -0.019257 -0.001573  0.014418  0.075097 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 7.121e-02  8.602e-03   8.277 8.42e-11 ***
## avgCDPC     2.163e+04  1.485e+03  14.564  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03065 on 48 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.8155, Adjusted R-squared:  0.8116 
## F-statistic: 212.1 on 1 and 48 DF,  p-value: < 2.2e-16
anova(modB)
## Analysis of Variance Table
## 
## Response: percentInpatientsCovid
##           Df   Sum Sq  Mean Sq F value    Pr(>F)    
## avgCDPC    1 0.199254 0.199254  212.11 < 2.2e-16 ***
## Residuals 48 0.045091 0.000939                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Calculating the MSE value and building a simple linear regression model

#to calculate MSE
summary(modB)
## 
## Call:
## lm(formula = percentInpatientsCovid ~ avgCDPC, data = covid)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.054595 -0.019257 -0.001573  0.014418  0.075097 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 7.121e-02  8.602e-03   8.277 8.42e-11 ***
## avgCDPC     2.163e+04  1.485e+03  14.564  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03065 on 48 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.8155, Adjusted R-squared:  0.8116 
## F-statistic: 212.1 on 1 and 48 DF,  p-value: < 2.2e-16
mean(modB$residuals^2)
## [1] 0.0009018205
#plot the model
ggplot(data=covid, aes(y=percentInpatientsCovid, x=avgCDPC))+
  geom_point()
## Warning: Removed 1 rows containing missing values (geom_point).

Factoring our categorical variable (median family income by state)

#Factoring our categorical variable by creating a "dummy" variable
##Coding dummy variable
covid$incLev <- factor(covid$incLev,
                       levels = c("high", "med", "low"))
contrasts(covid$incLev)
##      med low
## high   0   0
## med    1   0
## low    0   1
##Coding dummy variable
covid$incLev <- factor(covid$incLev,
                       levels = c("high", "med", "low"))
contrasts(covid$incLev)
##      med low
## high   0   0
## med    1   0
## low    0   1

Creating a multiple linear regression model

modC

Building, exploring, plotting, and analyzing this model

modC<-lm(percentInpatientsCovid~avgCDPC+incLev, data=covid)

#to calculate MSE
summary(modC)
## 
## Call:
## lm(formula = percentInpatientsCovid ~ avgCDPC + incLev, data = covid)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.058781 -0.019145 -0.002274  0.014701  0.065998 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.932e-02  9.554e-03   8.302 1.07e-10 ***
## avgCDPC      2.284e+04  1.502e+03  15.213  < 2e-16 ***
## incLevmed   -1.383e-02  1.021e-02  -1.355   0.1820    
## incLevlow   -2.783e-02  1.072e-02  -2.596   0.0126 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02924 on 46 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.839,  Adjusted R-squared:  0.8286 
## F-statistic: 79.93 on 3 and 46 DF,  p-value: < 2.2e-16
mean(modC$residuals^2)
## [1] 0.0007865488
#plot the model
ggplot(data=covid, aes(x=avgCDPC, y=percentInpatientsCovid, color=incLev))+
  geom_point()+
  ggtitle("Scatterplot of Average COVID deaths per cap v. % Inpatients COVID")+
  theme_bw()+
  geom_abline(slope=modC$coefficients[2], intercept = modC$coefficients[1], col=2)+
  geom_abline(slope=modC$coefficients[2], intercept = modC$coefficients[1]+modC$coefficients[3])
## Warning: Removed 1 rows containing missing values (geom_point).

Dichotomizing the staffing shortage data into a binary categorical variable

Creating a binary categorical variable to show staffing shortage levels, which we divided between “extremely” and “somewhat”

delta <- covid%>%
  select("state", "avgSSPC")

delta%>% 
  arrange(avgSSPC)
## # A tibble: 51 x 2
##    state     avgSSPC
##    <chr>       <dbl>
##  1 KY    0.000000179
##  2 UT    0.000000221
##  3 FL    0.000000400
##  4 NY    0.000000442
##  5 OR    0.000000472
##  6 NC    0.000000516
##  7 MD    0.000000522
##  8 NJ    0.00000102 
##  9 NV    0.00000104 
## 10 CO    0.00000106 
## # ... with 41 more rows
extremely<-covid%>%
  filter(avgSSPC > 4.39264e-06)%>%
  mutate(staffLev=1)

somewhat<-covid%>%
  filter(avgSSPC <=4.39264e-06)%>%
  mutate(staffLev=0)

shortageLev<-rbind(somewhat, extremely)
view(shortageLev)
covid<- left_join(covid, shortageLev)
## Joining, by = c("state", "avgCovidDeaths", "stateName", "medFamInc", "incLev", "shortStaff", "percentUsedBedsCovid", "totalConfirmedCovidPatients", "percentInpatientsCovid", "rank", "census2020", "percentTotalUSPop", "avgSSPC", "avgCDPC")

GLMs (generalized linear models)

Creating a qplot

qplot(x = avgCDPC, y = staffLev, data = shortageLev, 
      geom = "point", alpha = I(.5), ylab = "Staff Shortage Level") +
  stat_smooth(method = "glm", method.args = list(family = "binomial"),
              se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

Building the glm

m1 <- glm(staffLev ~ avgCDPC, data = shortageLev, family = "binomial")
summary(m1)
## 
## Call:
## glm(formula = staffLev ~ avgCDPC, family = "binomial", data = shortageLev)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4741  -0.8683  -0.5582   1.0430   1.9882  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -2.330e+00  7.584e-01  -3.072  0.00213 **
## avgCDPC      3.113e+05  1.218e+05   2.556  0.01059 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 64.104  on 49  degrees of freedom
## Residual deviance: 56.212  on 48  degrees of freedom
## AIC: 60.212
## 
## Number of Fisher Scoring iterations: 3

Testing and training the model

Setting the seed and then randomly splitting our data so we can test and train

#Set seed
set.seed(1)

#Randomly splitting the data
train_indices <- sample(1:nrow(shortageLev), size = floor(nrow(shortageLev)/2))

train_data <- shortageLev %>%
  slice(train_indices)

test_data <- shortageLev %>%
  slice(-train_indices) 

Training the data

mod1 <- glm(avgCDPC ~ staffLev, data = train_data, family = "binomial")
mod2 <- glm(avgCDPC ~ staffLev + percentUsedBedsCovid, data = train_data, family = "binomial")

Testing the data

test1 <- predict(mod1, newdata = test_data, type = "response")

test_mat1<-data.frame(staffLev=test_data$staffLev, predstaffLev=test1>.5)%>%
  group_by(staffLev, predstaffLev)%>%
  summarise(n=n())
## `summarise()` has grouped output by 'staffLev'. You can override using the `.groups` argument.
test_mat1
## # A tibble: 2 x 3
## # Groups:   staffLev [2]
##   staffLev predstaffLev     n
##      <dbl> <lgl>        <int>
## 1        0 FALSE           17
## 2        1 FALSE            8
test2 <- predict(mod2, newdata = test_data, type = "response")

test_mat2<-data.frame(staffLev=test_data$staffLev, predstaffLev=test2>.5)%>%
  group_by(staffLev, predstaffLev)%>%
  summarise(n=n())
## `summarise()` has grouped output by 'staffLev'. You can override using the `.groups` argument.
test_mat2
## # A tibble: 2 x 3
## # Groups:   staffLev [2]
##   staffLev predstaffLev     n
##      <dbl> <lgl>        <int>
## 1        0 FALSE           17
## 2        1 FALSE            8

Confusion Matrix

pred1 <- predict(mod1, newdata = shortageLev, type = "response")

conf_mat1<-data.frame(staffLev=shortageLev$staffLev, predstaffLev=pred1>.5)%>%
  group_by(staffLev, predstaffLev)%>%
  summarise(n=n())
## `summarise()` has grouped output by 'staffLev'. You can override using the `.groups` argument.
conf_mat1
## # A tibble: 2 x 3
## # Groups:   staffLev [2]
##   staffLev predstaffLev     n
##      <dbl> <lgl>        <int>
## 1        0 FALSE           33
## 2        1 FALSE           17
pred2 <- predict(mod2, newdata = , type = "response")

conf_mat2<-data.frame(staffLev=shortageLev$staffLev, predstaffLev=pred2>.5)%>%
  group_by(staffLev, predstaffLev)%>%
  summarise(n=n())
## Warning in data.frame(staffLev = shortageLev$staffLev, predstaffLev = pred2 > :
## row names were found from a short variable and have been discarded
## `summarise()` has grouped output by 'staffLev'. You can override using the `.groups` argument.
conf_mat2
## # A tibble: 2 x 3
## # Groups:   staffLev [2]
##   staffLev predstaffLev     n
##      <dbl> <lgl>        <int>
## 1        0 FALSE           33
## 2        1 FALSE           17