#Install everything we need
#install.packages("dplyr")
#install.packages("tidyr")
#install.packages("ggplot2")
#install.packages("tidyverse")
#install.packages("GGally")
#Load packages into global environment
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.4 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## v purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(GGally)
## Warning: package 'GGally' was built under R version 4.1.2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ISLR)
library(readxl)
#Load our data sets
Hospital_capacity_RAW <- read_excel("C:/Users/jessi/Downloads/Hospital_capacity_RAW.xls")
StateMedInc <- read_excel("C:/Users/jessi/Downloads/StateMedInc.xls")
StatePop <-
read_excel("C:/Users/Jessi/Downloads/StatePop.xlsx")
SelectDataHosp <-Hospital_capacity_RAW %>%
select(state,
date,
critical_staffing_shortage_today_yes,
critical_staffing_shortage_today_no,
critical_staffing_shortage_today_not_reported,
hospital_onset_covid,
inpatient_beds,
inpatient_beds_used,
inpatient_beds_used_covid,
staffed_adult_icu_bed_occupancy,
staffed_icu_adult_patients_confirmed_and_suspected_covid,
staffed_icu_adult_patients_confirmed_covid,
total_adult_patients_hospitalized_confirmed_and_suspected_covid,
total_adult_patients_hospitalized_confirmed_covid,
total_pediatric_patients_hospitalized_confirmed_and_suspected_covid,
total_pediatric_patients_hospitalized_confirmed_covid,
total_staffed_adult_icu_beds,
inpatient_beds_utilization,
percent_of_inpatients_with_covid,
deaths_covid)
Data <- SelectDataHosp %>%
filter(year(date)==2021,
month(date)==1)
#Getting January 2021 average number of hospitals with staffing shortages by state each day
shortStaffMeans <- Data %>%
group_by(state)%>%
summarize(mean(critical_staffing_shortage_today_yes, na.rm = TRUE))
#Renaming the columns
colnames(shortStaffMeans)<- c("state", "shortStaff")
colnames(StateMedInc) <- c("StateName", "medFamInc", "state")
#Getting January 2021 average daily COVID deaths by state
covidDeathMean <- SelectDataHosp%>%
group_by(state)%>%
filter(year(date)==2021,
month(date)==1)%>%
summarize(mean(deaths_covid, na.rm=TRUE))
#Renaming the columns
colnames(covidDeathMean) <- c("state", "avgCovidDeaths")
#Percent beds used by COVID patients
percentBedsCovid <- Data%>%
group_by(state)%>%
filter(year(date)==2021,
month(date)==1)%>%
summarize(mean(inpatient_beds_used_covid/inpatient_beds_used, na.rm=TRUE))
percentBedsCovid<- percentBedsCovid[-c(40, 48),]
colnames(percentBedsCovid) <- c("state", "percentUsedBedsCovid")
#Avg. confirmed COVID cases in hospitals in each state
totalConfirmedCovid <- Data%>%
group_by(state)%>%
filter(year(date)==2021,
month(date)==1)%>%
summarize(mean(total_adult_patients_hospitalized_confirmed_covid + total_pediatric_patients_hospitalized_confirmed_covid))
totalConfirmedCovid<- totalConfirmedCovid[-c(40, 48),]
colnames(totalConfirmedCovid) <- c("state", "totalConfirmedCovidPatients")
#Percent of inpatients with COVID
percentCovid <- Data%>%
group_by(state)%>%
filter(year(date)==2021,
month(date)==1)%>%
summarize(mean(percent_of_inpatients_with_covid))
percentCovid<- percentCovid[-c(40, 48),]
colnames(percentCovid) <- c("state", "PercentInpatientsCovid")
#Creating median family income levels
StateMedInc%>%
arrange(medFamInc)
## # A tibble: 51 x 3
## StateName medFamInc state
## <chr> <dbl> <chr>
## 1 Mississippi 45081 MS
## 2 West Virginia 46711 WV
## 3 Arkansas 47597 AR
## 4 Louisiana 49469 LA
## 5 New Mexico 49754 NM
## 6 Alabama 50536 AL
## 7 Kentucky 50589 KY
## 8 Oklahoma 52919 OK
## 9 South Carolina 53199 SC
## 10 Tennessee 53320 TN
## # ... with 41 more rows
low<-StateMedInc%>%
filter(medFamInc <=56602)%>%
mutate(incLev="low")
med<-StateMedInc%>%
filter(medFamInc > 56602 & medFamInc <=65886)%>%
mutate(incLev="med")
high<-StateMedInc%>%
filter(medFamInc >= 67167)%>%
mutate(incLev="high")
stateLev<-rbind(low, med, high)
View(stateLev)
#Creating final usable data set
covidDeathMean<- covidDeathMean[-c(40,48),]
shortStaffMeans<- shortStaffMeans[-c(40, 48),]
joinedData <- left_join(covidDeathMean, stateLev)
## Joining, by = "state"
joinedData <- left_join(joinedData, shortStaffMeans)
## Joining, by = "state"
joinedData <- left_join(joinedData, percentBedsCovid)
## Joining, by = "state"
joinedData <- left_join(joinedData, totalConfirmedCovid)
## Joining, by = "state"
joinedData <- left_join(joinedData, percentCovid)
## Joining, by = "state"
joinedData <- left_join(joinedData, StatePop)
## Joining, by = "StateName"
#Renaming columns
colnames(joinedData)<-c("state", "avgCovidDeaths", "stateName", "medFamInc", "incLev", "shortStaff", "percentUsedBedsCovid", "totalConfirmedCovidPatients", "percentInpatientsCovid", "rank", "census2020", "percentTotalUSPop")
covid<-joinedData%>%
mutate(avgCDPC=avgCovidDeaths/census2020)
covid<-joinedData%>%
mutate(avgSSPC=shortStaff/census2020)
avgSSPC <- covid%>%
select("shortStaff", "census2020")%>%
mutate(avgSSPC=shortStaff/census2020)
avgCDPC <- covid%>%
select("avgCovidDeaths", "census2020")%>%
mutate(avgCDPC=avgCovidDeaths/census2020)
covid <- left_join(covid, avgSSPC)
## Joining, by = c("shortStaff", "census2020", "avgSSPC")
covid <- left_join(covid, avgCDPC)
## Joining, by = c("avgCovidDeaths", "census2020")
Looking at our final data set’s structure & variables
str(covid)
## tibble [51 x 14] (S3: tbl_df/tbl/data.frame)
## $ state : chr [1:51] "AK" "AL" "AR" "AZ" ...
## $ avgCovidDeaths : num [1:51] 0.677 50.774 22.516 85.806 451.677 ...
## $ stateName : chr [1:51] "Alaska" "Alabama" "Arkansas" "Arizona" ...
## $ medFamInc : num [1:51] 77640 50536 47597 58945 75235 ...
## $ incLev : chr [1:51] "high" "low" "low" "med" ...
## $ shortStaff : num [1:51] 1 38 17.6 31.8 173.7 ...
## $ percentUsedBedsCovid : num [1:51] 0.074 0.245 0.195 0.358 0.376 ...
## $ totalConfirmedCovidPatients: num [1:51] 59.2 2752.3 1110.3 4208.4 19973.9 ...
## $ percentInpatientsCovid : num [1:51] 0.074 0.252 0.196 0.366 0.381 ...
## $ rank : num [1:51] 48 24 33 14 1 21 29 NA 45 3 ...
## $ census2020 : num [1:51] 733391 5024279 3011524 7151502 39538223 ...
## $ percentTotalUSPop : num [1:51] 0.0022 0.0148 0.0091 0.0219 0.1191 ...
## $ avgSSPC : num [1:51] 1.36e-06 7.57e-06 5.86e-06 4.45e-06 4.39e-06 ...
## $ avgCDPC : num [1:51] 9.24e-07 1.01e-05 7.48e-06 1.20e-05 1.14e-05 ...
names(covid)
## [1] "state" "avgCovidDeaths"
## [3] "stateName" "medFamInc"
## [5] "incLev" "shortStaff"
## [7] "percentUsedBedsCovid" "totalConfirmedCovidPatients"
## [9] "percentInpatientsCovid" "rank"
## [11] "census2020" "percentTotalUSPop"
## [13] "avgSSPC" "avgCDPC"
Merged average number of hospitals with staffing shortages in each state data with median household income by state data. Made a basic plot to see if staffing shortages might be impacted by a state’s median household income.
EDAQuestion1 <- merge(shortStaffMeans, StateMedInc)
ggplot(EDAQuestion1, aes(x = medFamInc, y = shortStaff))+
geom_point(color="blue")
Merged Jan 2021 daily COVID death average data by state with median household income data by state. Making a basic plot to see if there is a relationship between Avg. daily deaths and median household income by state.
EDAQuestion2 <- merge(covidDeathMean, StateMedInc)
ggplot(EDAQuestion2, aes(x = medFamInc, y = avgCovidDeaths))+
geom_point(color='purple')
Merging our two data sets together, average daily COVID deaths by state and average number of hospitals with staffing shortages by state. Making a plot to see if there is a relationship between average daily COVID deaths and average # of hospitals with staffing shortages daily.
EDAQuestion3 <- merge(covidDeathMean, shortStaffMeans)
ggplot(EDAQuestion3, aes(x = shortStaff, y = avgCovidDeaths))+
geom_point(color="red")+
scale_x_log10()+
scale_y_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
Creating a linear model…
model1 <- lm(shortStaff~avgCovidDeaths, data = EDAQuestion3)
summary(model1)
##
## Call:
## lm(formula = shortStaff ~ avgCovidDeaths, data = EDAQuestion3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -47.305 -5.375 -1.347 4.916 66.894
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.94995 2.62041 1.507 0.138
## avgCovidDeaths 0.39098 0.03202 12.209 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.16 on 49 degrees of freedom
## Multiple R-squared: 0.7526, Adjusted R-squared: 0.7476
## F-statistic: 149.1 on 1 and 49 DF, p-value: < 2.2e-16
slope <- model1$coefficients[2]
slope
## avgCovidDeaths
## 0.3909832
intercept <- model1$coefficients[1]
intercept
## (Intercept)
## 3.949949
…and plotting this model
ggplot(EDAQuestion3, aes(avgCovidDeaths, model1$residuals))+
geom_point()+
geom_hline(yintercept = 0,
color = "blue")
Making a plot to see if switching the axes the variables are on reveals a new relationship
ggplot(EDAQuestion3, aes(x = avgCovidDeaths, y = shortStaff))+
geom_point(color="orange")+
geom_abline(slope=model1$coefficients[2], intercept=model1$coefficients[1],color="blue", lty=2, lwd=1)
qqnorm(model1$residuals)
qqline(model1$residuals)
dev.off()
## null device
## 1
ggplot(EDAQuestion3, aes(avgCovidDeaths, model1$residuals))+
geom_point()+
ggtitle("Residual Plot")+
xlab("avgCovidDeaths")+
ylab("Residuals")+
theme_bw()+
geom_hline(yintercept = 0,
color="blue", lty=2, lwd=1)
#### Creating
plot(model1)
print(model1)
##
## Call:
## lm(formula = shortStaff ~ avgCovidDeaths, data = EDAQuestion3)
##
## Coefficients:
## (Intercept) avgCovidDeaths
## 3.950 0.391
Summary and anova table of this model
summary(model1)
##
## Call:
## lm(formula = shortStaff ~ avgCovidDeaths, data = EDAQuestion3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -47.305 -5.375 -1.347 4.916 66.894
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.94995 2.62041 1.507 0.138
## avgCovidDeaths 0.39098 0.03202 12.209 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.16 on 49 degrees of freedom
## Multiple R-squared: 0.7526, Adjusted R-squared: 0.7476
## F-statistic: 149.1 on 1 and 49 DF, p-value: < 2.2e-16
anova(model1)
## Analysis of Variance Table
##
## Response: shortStaff
## Df Sum Sq Mean Sq F value Pr(>F)
## avgCovidDeaths 1 38916 38916 149.06 < 2.2e-16 ***
## Residuals 49 12792 261
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Data-set we'll use for pairs plot
COVID <- covid%>%
select("medFamInc", "percentUsedBedsCovid", "percentInpatientsCovid", "avgSSPC", "avgCDPC")
#create a pairs plot to find correlations
ggpairs(COVID)+ theme_bw()
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_density).
Building and plotting model A
## Mod A: ANOVA
ggplot(data=covid, aes(y=percentInpatientsCovid, fill=incLev))+
geom_boxplot()
modA<-lm(percentInpatientsCovid~incLev, data=covid)
anova(modA)
## Analysis of Variance Table
##
## Response: percentInpatientsCovid
## Df Sum Sq Mean Sq F value Pr(>F)
## incLev 2 0.007872 0.0039359 0.79 0.4597
## Residuals 48 0.239147 0.0049822
Calculating the MSE value and building a simple linear regression model
#to calculate MSE
summary(modA)
##
## Call:
## lm(formula = percentInpatientsCovid ~ incLev, data = covid)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.111767 -0.043854 -0.006698 0.035112 0.210524
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.17022 0.01712 9.943 3.05e-13 ***
## incLevlow 0.02574 0.02421 1.063 0.293
## incLevmed -0.00119 0.02421 -0.049 0.961
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.07058 on 48 degrees of freedom
## Multiple R-squared: 0.03187, Adjusted R-squared: -0.008472
## F-statistic: 0.79 on 2 and 48 DF, p-value: 0.4597
mean(modA$residuals^2)
## [1] 0.004689151
#Creating a SLR Model that has points color coordinated to the income level of that state (low, med, high)
#create model
ggplot(data=covid, aes(y=percentInpatientsCovid, x=avgCDPC))+
geom_point()
## Warning: Removed 1 rows containing missing values (geom_point).
Building and plotting model B
modB<-lm(percentInpatientsCovid~avgCDPC, data=covid)
summary(modB)
##
## Call:
## lm(formula = percentInpatientsCovid ~ avgCDPC, data = covid)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.054595 -0.019257 -0.001573 0.014418 0.075097
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.121e-02 8.602e-03 8.277 8.42e-11 ***
## avgCDPC 2.163e+04 1.485e+03 14.564 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03065 on 48 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.8155, Adjusted R-squared: 0.8116
## F-statistic: 212.1 on 1 and 48 DF, p-value: < 2.2e-16
anova(modB)
## Analysis of Variance Table
##
## Response: percentInpatientsCovid
## Df Sum Sq Mean Sq F value Pr(>F)
## avgCDPC 1 0.199254 0.199254 212.11 < 2.2e-16 ***
## Residuals 48 0.045091 0.000939
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Calculating the MSE value and building a simple linear regression model
#to calculate MSE
summary(modB)
##
## Call:
## lm(formula = percentInpatientsCovid ~ avgCDPC, data = covid)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.054595 -0.019257 -0.001573 0.014418 0.075097
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.121e-02 8.602e-03 8.277 8.42e-11 ***
## avgCDPC 2.163e+04 1.485e+03 14.564 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03065 on 48 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.8155, Adjusted R-squared: 0.8116
## F-statistic: 212.1 on 1 and 48 DF, p-value: < 2.2e-16
mean(modB$residuals^2)
## [1] 0.0009018205
#plot the model
ggplot(data=covid, aes(y=percentInpatientsCovid, x=avgCDPC))+
geom_point()
## Warning: Removed 1 rows containing missing values (geom_point).
#Factoring our categorical variable by creating a "dummy" variable
##Coding dummy variable
covid$incLev <- factor(covid$incLev,
levels = c("high", "med", "low"))
contrasts(covid$incLev)
## med low
## high 0 0
## med 1 0
## low 0 1
##Coding dummy variable
covid$incLev <- factor(covid$incLev,
levels = c("high", "med", "low"))
contrasts(covid$incLev)
## med low
## high 0 0
## med 1 0
## low 0 1
Building, exploring, plotting, and analyzing this model
modC<-lm(percentInpatientsCovid~avgCDPC+incLev, data=covid)
#to calculate MSE
summary(modC)
##
## Call:
## lm(formula = percentInpatientsCovid ~ avgCDPC + incLev, data = covid)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.058781 -0.019145 -0.002274 0.014701 0.065998
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.932e-02 9.554e-03 8.302 1.07e-10 ***
## avgCDPC 2.284e+04 1.502e+03 15.213 < 2e-16 ***
## incLevmed -1.383e-02 1.021e-02 -1.355 0.1820
## incLevlow -2.783e-02 1.072e-02 -2.596 0.0126 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02924 on 46 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.839, Adjusted R-squared: 0.8286
## F-statistic: 79.93 on 3 and 46 DF, p-value: < 2.2e-16
mean(modC$residuals^2)
## [1] 0.0007865488
#plot the model
ggplot(data=covid, aes(x=avgCDPC, y=percentInpatientsCovid, color=incLev))+
geom_point()+
ggtitle("Scatterplot of Average COVID deaths per cap v. % Inpatients COVID")+
theme_bw()+
geom_abline(slope=modC$coefficients[2], intercept = modC$coefficients[1], col=2)+
geom_abline(slope=modC$coefficients[2], intercept = modC$coefficients[1]+modC$coefficients[3])
## Warning: Removed 1 rows containing missing values (geom_point).
Creating a binary categorical variable to show staffing shortage levels, which we divided between “extremely” and “somewhat”
delta <- covid%>%
select("state", "avgSSPC")
delta%>%
arrange(avgSSPC)
## # A tibble: 51 x 2
## state avgSSPC
## <chr> <dbl>
## 1 KY 0.000000179
## 2 UT 0.000000221
## 3 FL 0.000000400
## 4 NY 0.000000442
## 5 OR 0.000000472
## 6 NC 0.000000516
## 7 MD 0.000000522
## 8 NJ 0.00000102
## 9 NV 0.00000104
## 10 CO 0.00000106
## # ... with 41 more rows
extremely<-covid%>%
filter(avgSSPC > 4.39264e-06)%>%
mutate(staffLev=1)
somewhat<-covid%>%
filter(avgSSPC <=4.39264e-06)%>%
mutate(staffLev=0)
shortageLev<-rbind(somewhat, extremely)
view(shortageLev)
covid<- left_join(covid, shortageLev)
## Joining, by = c("state", "avgCovidDeaths", "stateName", "medFamInc", "incLev", "shortStaff", "percentUsedBedsCovid", "totalConfirmedCovidPatients", "percentInpatientsCovid", "rank", "census2020", "percentTotalUSPop", "avgSSPC", "avgCDPC")
Creating a qplot
qplot(x = avgCDPC, y = staffLev, data = shortageLev,
geom = "point", alpha = I(.5), ylab = "Staff Shortage Level") +
stat_smooth(method = "glm", method.args = list(family = "binomial"),
se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
Building the glm
m1 <- glm(staffLev ~ avgCDPC, data = shortageLev, family = "binomial")
summary(m1)
##
## Call:
## glm(formula = staffLev ~ avgCDPC, family = "binomial", data = shortageLev)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4741 -0.8683 -0.5582 1.0430 1.9882
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.330e+00 7.584e-01 -3.072 0.00213 **
## avgCDPC 3.113e+05 1.218e+05 2.556 0.01059 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 64.104 on 49 degrees of freedom
## Residual deviance: 56.212 on 48 degrees of freedom
## AIC: 60.212
##
## Number of Fisher Scoring iterations: 3
Setting the seed and then randomly splitting our data so we can test and train
#Set seed
set.seed(1)
#Randomly splitting the data
train_indices <- sample(1:nrow(shortageLev), size = floor(nrow(shortageLev)/2))
train_data <- shortageLev %>%
slice(train_indices)
test_data <- shortageLev %>%
slice(-train_indices)
Training the data
mod1 <- glm(avgCDPC ~ staffLev, data = train_data, family = "binomial")
mod2 <- glm(avgCDPC ~ staffLev + percentUsedBedsCovid, data = train_data, family = "binomial")
Testing the data
test1 <- predict(mod1, newdata = test_data, type = "response")
test_mat1<-data.frame(staffLev=test_data$staffLev, predstaffLev=test1>.5)%>%
group_by(staffLev, predstaffLev)%>%
summarise(n=n())
## `summarise()` has grouped output by 'staffLev'. You can override using the `.groups` argument.
test_mat1
## # A tibble: 2 x 3
## # Groups: staffLev [2]
## staffLev predstaffLev n
## <dbl> <lgl> <int>
## 1 0 FALSE 17
## 2 1 FALSE 8
test2 <- predict(mod2, newdata = test_data, type = "response")
test_mat2<-data.frame(staffLev=test_data$staffLev, predstaffLev=test2>.5)%>%
group_by(staffLev, predstaffLev)%>%
summarise(n=n())
## `summarise()` has grouped output by 'staffLev'. You can override using the `.groups` argument.
test_mat2
## # A tibble: 2 x 3
## # Groups: staffLev [2]
## staffLev predstaffLev n
## <dbl> <lgl> <int>
## 1 0 FALSE 17
## 2 1 FALSE 8
pred1 <- predict(mod1, newdata = shortageLev, type = "response")
conf_mat1<-data.frame(staffLev=shortageLev$staffLev, predstaffLev=pred1>.5)%>%
group_by(staffLev, predstaffLev)%>%
summarise(n=n())
## `summarise()` has grouped output by 'staffLev'. You can override using the `.groups` argument.
conf_mat1
## # A tibble: 2 x 3
## # Groups: staffLev [2]
## staffLev predstaffLev n
## <dbl> <lgl> <int>
## 1 0 FALSE 33
## 2 1 FALSE 17
pred2 <- predict(mod2, newdata = , type = "response")
conf_mat2<-data.frame(staffLev=shortageLev$staffLev, predstaffLev=pred2>.5)%>%
group_by(staffLev, predstaffLev)%>%
summarise(n=n())
## Warning in data.frame(staffLev = shortageLev$staffLev, predstaffLev = pred2 > :
## row names were found from a short variable and have been discarded
## `summarise()` has grouped output by 'staffLev'. You can override using the `.groups` argument.
conf_mat2
## # A tibble: 2 x 3
## # Groups: staffLev [2]
## staffLev predstaffLev n
## <dbl> <lgl> <int>
## 1 0 FALSE 33
## 2 1 FALSE 17