This is the code for Group 4’s Final Project for Applications of Data Mining, written and commented by Teresa Ortyl:
#importing the main library (this has ggplot2 in it too)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.2
## corrplot 0.92 loaded
#load the project data into `dataset`
#the relative path means Final_Data.csv must sit in the current working directory
dataset <- read_delim(file = "./Final_Data.csv", delim = ",")
## Rows: 3043 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Facility Name, City, State, Ownership, Median Household income
## dbl (15): Facility ID, Zipcode, Patient Survey Star Rating, Hospital overall...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#quick per-column overview of the data set (type, or min/quartiles/mean/max for numeric columns)
summary(object = dataset)
## Facility ID Facility Name City, State Zipcode
## Min. : 10001 Length:3043 Length:3043 Min. : 674
## 1st Qu.:120021 Class :character Class :character 1st Qu.:30116
## Median :250002 Mode :character Mode :character Median :49801
## Mean :260004 Mean :51961
## 3rd Qu.:390049 3rd Qu.:76050
## Max. :670122 Max. :99901
## Ownership Patient Survey Star Rating Hospital overall rating
## Length:3043 Min. :1.000 Min. :1.000
## Class :character 1st Qu.:3.000 1st Qu.:2.000
## Mode :character Median :4.000 Median :3.000
## Mean :3.438 Mean :3.234
## 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000
## Nurse Communication Doctor Communication Staff Responsiveness
## Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:3.000
## Median :4.000 Median :3.000 Median :3.000
## Mean :3.438 Mean :3.026 Mean :3.308
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000
## Communication About Medicines Discharge Information Care Transition
## Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:2.000
## Median :3.000 Median :3.000 Median :3.000
## Mean :3.126 Mean :3.119 Mean :2.663
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:3.000
## Max. :5.000 Max. :5.000 Max. :5.000
## Cleanliness Quietness Overall Hospital Rating Recommend Hospital
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:3.000 1st Qu.:3.000
## Median :3.000 Median :3.000 Median :3.000 Median :3.000
## Mean :3.027 Mean :2.941 Mean :3.237 Mean :3.193
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## Summary Median Household income
## Min. :1.00 Length:3043
## 1st Qu.:3.00 Class :character
## Median :3.00 Mode :character
## Mean :3.13
## 3rd Qu.:4.00
## Max. :5.00
#the summary above reports Median Household income as a character column,
#so it is converted to integer here before it is used in the linear models below
#(entries that cannot be read as a number become NA, hence the coercion warning)
dataset[["Median Household income"]] <- as.integer(dataset[["Median Household income"]])
## Warning: NAs introduced by coercion
#correlation check on columns 6 through 19 (the rating columns and income, per the summary above)
#rows containing any missing value are dropped before computing the correlations
nona <- na.omit(dataset[, 6:19])
corro <- cor(x = nona)
#author's reading of the result: none of the ratings seem to be strongly correlated
#with median household income
#plot only the lower triangle, leaving out the diagonal
corrplot(corr = corro, type = "lower", diag = FALSE)
#a series of single-predictor linear models of median household income
#the intercept is not discussed for any of them, since it is always significant at the .001 level
model1 <- lm(
  formula = `Median Household income` ~ `Patient Survey Star Rating`,
  data = dataset
)
#slope is not significant (p-value = .828)
summary(model1)
##
## Call:
## lm(formula = `Median Household income` ~ `Patient Survey Star Rating`,
## data = dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -48006 -16045 -6390 10159 147919
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 62782 1753 35.822 <2e-16 ***
## `Patient Survey Star Rating` -107 493 -0.217 0.828
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24660 on 3019 degrees of freedom
## (22 observations deleted due to missingness)
## Multiple R-squared: 1.56e-05, Adjusted R-squared: -0.0003156
## F-statistic: 0.04709 on 1 and 3019 DF, p-value: 0.8282
model2 <- lm(
  formula = `Median Household income` ~ `Hospital overall rating`,
  data = dataset
)
#slope is significant at the .001 level (p-value < 2e-16)
summary(model2)
##
## Call:
## lm(formula = `Median Household income` ~ `Hospital overall rating`,
## data = dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50297 -15795 -6067 9635 148854
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 50112.1 1357.8 36.907 <2e-16 ***
## `Hospital overall rating` 3804.8 397.1 9.582 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24290 on 3019 degrees of freedom
## (22 observations deleted due to missingness)
## Multiple R-squared: 0.02952, Adjusted R-squared: 0.02919
## F-statistic: 91.82 on 1 and 3019 DF, p-value: < 2.2e-16
#however, adjusted r-squared is .02919, which is very low
model3 <- lm(
  formula = `Median Household income` ~ `Nurse Communication`,
  data = dataset
)
#slope is not significant (p-value = .828)
#NOTE(review): every number in this output matches model1 exactly; presumably
#`Nurse Communication` and `Patient Survey Star Rating` hold the same values - confirm in the data
summary(model3)
##
## Call:
## lm(formula = `Median Household income` ~ `Nurse Communication`,
## data = dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -48006 -16045 -6390 10159 147919
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 62782 1753 35.822 <2e-16 ***
## `Nurse Communication` -107 493 -0.217 0.828
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24660 on 3019 degrees of freedom
## (22 observations deleted due to missingness)
## Multiple R-squared: 1.56e-05, Adjusted R-squared: -0.0003156
## F-statistic: 0.04709 on 1 and 3019 DF, p-value: 0.8282
model4 <- lm(
  formula = `Median Household income` ~ `Doctor Communication`,
  data = dataset
)
#slope is significant only at the .1 level (p-value = .0506)
summary(model4)
##
## Call:
## lm(formula = `Median Household income` ~ `Doctor Communication`,
## data = dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -47982 -16118 -6353 10466 147943
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 65013.4 1402.8 46.344 <2e-16 ***
## `Doctor Communication` -858.9 439.3 -1.955 0.0506 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24640 on 3019 degrees of freedom
## (22 observations deleted due to missingness)
## Multiple R-squared: 0.001265, Adjusted R-squared: 0.000934
## F-statistic: 3.823 on 1 and 3019 DF, p-value: 0.05064
#adjusted r square is also very low at .000934
model5 <- lm(
  formula = `Median Household income` ~ `Staff Responsiveness`,
  data = dataset
)
#slope is significant at the .001 level (p-value = 5.75e-05)
summary(model5)
##
## Call:
## lm(formula = `Median Household income` ~ `Staff Responsiveness`,
## data = dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50163 -16004 -6077 10482 147445
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 67983.6 1453.1 46.787 < 2e-16 ***
## `Staff Responsiveness` -1682.9 417.7 -4.029 5.75e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24590 on 3019 degrees of freedom
## (22 observations deleted due to missingness)
## Multiple R-squared: 0.005347, Adjusted R-squared: 0.005018
## F-statistic: 16.23 on 1 and 3019 DF, p-value: 5.749e-05
#however, adjusted r-squared is .005018, which is very low
model6 <- lm(
  formula = `Median Household income` ~ `Communication About Medicines`,
  data = dataset
)
#slope is significant at the .01 level (p-value = .00655)
summary(model6)
##
## Call:
## lm(formula = `Median Household income` ~ `Communication About Medicines`,
## data = dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49501 -16121 -6302 10492 147793
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 66692.1 1635.0 40.790 < 2e-16 ***
## `Communication About Medicines` -1368.3 502.9 -2.721 0.00655 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24630 on 3019 degrees of freedom
## (22 observations deleted due to missingness)
## Multiple R-squared: 0.002446, Adjusted R-squared: 0.002115
## F-statistic: 7.401 on 1 and 3019 DF, p-value: 0.006554
#however, adjusted r-squared is .002115, which is very low
model7 <- lm(
  formula = `Median Household income` ~ `Discharge Information`,
  data = dataset
)
#slope is not significant (p-value = .678)
summary(model7)
##
## Call:
## lm(formula = `Median Household income` ~ `Discharge Information`,
## data = dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -47935 -16083 -6426 10126 148199
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 61764.9 1624.6 38.018 <2e-16 ***
## `Discharge Information` 208.3 500.9 0.416 0.678
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24660 on 3019 degrees of freedom
## (22 observations deleted due to missingness)
## Multiple R-squared: 5.727e-05, Adjusted R-squared: -0.0002739
## F-statistic: 0.1729 on 1 and 3019 DF, p-value: 0.6776
model8 <- lm(
  formula = `Median Household income` ~ `Care Transition`,
  data = dataset
)
#slope is significant at the .01 level (p-value = .00353)
summary(model8)
##
## Call:
## lm(formula = `Median Household income` ~ `Care Transition`, data = dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -47828 -16015 -6275 10175 148834
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 58913.7 1280.0 46.027 < 2e-16 ***
## `Care Transition` 1315.9 450.8 2.919 0.00353 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24620 on 3019 degrees of freedom
## (22 observations deleted due to missingness)
## Multiple R-squared: 0.002815, Adjusted R-squared: 0.002485
## F-statistic: 8.523 on 1 and 3019 DF, p-value: 0.003533
#however, adjusted r-squared is .002485, which is very low
model9 <- lm(
  formula = `Median Household income` ~ Cleanliness,
  data = dataset
)
#slope is not significant (p-value = .503)
summary(model9)
##
## Call:
## lm(formula = `Median Household income` ~ Cleanliness, data = dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -48562 -15990 -6432 10209 147957
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 63314.6 1418.5 44.636 <2e-16 ***
## Cleanliness -297.2 444.1 -0.669 0.503
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24660 on 3019 degrees of freedom
## (22 observations deleted due to missingness)
## Multiple R-squared: 0.0001483, Adjusted R-squared: -0.0001829
## F-statistic: 0.4477 on 1 and 3019 DF, p-value: 0.5035
model10 <- lm(
  formula = `Median Household income` ~ Quietness,
  data = dataset
)
#slope is significant at the .001 level (p-value = 2.63e-09)
summary(model10)
##
## Call:
## lm(formula = `Median Household income` ~ Quietness, data = dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50257 -16053 -6303 10276 145668
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 69586.3 1281.2 54.314 < 2e-16 ***
## Quietness -2437.2 408.1 -5.971 2.63e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24510 on 3019 degrees of freedom
## (22 observations deleted due to missingness)
## Multiple R-squared: 0.01167, Adjusted R-squared: 0.01135
## F-statistic: 35.66 on 1 and 3019 DF, p-value: 2.625e-09
#however, adjusted r-squared is .01135, which is very low
model11 <- lm(
  formula = `Median Household income` ~ `Overall Hospital Rating`,
  data = dataset
)
#slope is significant at the .001 level (p-value = 2.39e-09)
summary(model11)
##
## Call:
## lm(formula = `Median Household income` ~ `Overall Hospital Rating`,
## data = dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49689 -15763 -6322 9755 148675
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52652 1690 31.147 < 2e-16 ***
## `Overall Hospital Rating` 3018 504 5.987 2.39e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24510 on 3019 degrees of freedom
## (22 observations deleted due to missingness)
## Multiple R-squared: 0.01173, Adjusted R-squared: 0.01141
## F-statistic: 35.85 on 1 and 3019 DF, p-value: 2.386e-09
#however, adjusted r-squared is .01141, which is very low
#this one is named model115 because it was forgotten in the first round of model-building
model115 <- lm(
  formula = `Median Household income` ~ `Recommend Hospital`,
  data = dataset
)
#slope is significant at the .001 level (p-value < 2e-16)
summary(model115)
##
## Call:
## lm(formula = `Median Household income` ~ `Recommend Hospital`,
## data = dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -51147 -15588 -5898 9459 148846
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 47596.0 1612.0 29.526 <2e-16 ***
## `Recommend Hospital` 4646.2 486.1 9.559 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24290 on 3019 degrees of freedom
## (22 observations deleted due to missingness)
## Multiple R-squared: 0.02938, Adjusted R-squared: 0.02905
## F-statistic: 91.37 on 1 and 3019 DF, p-value: < 2.2e-16
#however, adjusted r-squared is .02905, which is very low
model12 <- lm(
  formula = `Median Household income` ~ Summary,
  data = dataset
)
#slope is not significant (p-value = .681)
summary(model12)
##
## Call:
## lm(formula = `Median Household income` ~ Summary, data = dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -48207 -15980 -6402 10142 147937
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 63100.5 1729.1 36.492 <2e-16 ***
## Summary -219.3 533.5 -0.411 0.681
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24660 on 3019 degrees of freedom
## (22 observations deleted due to missingness)
## Multiple R-squared: 5.595e-05, Adjusted R-squared: -0.0002753
## F-statistic: 0.1689 on 1 and 3019 DF, p-value: 0.6811
#Additionally, a multiple regression to see whether median household income can be
#predicted from all of the hospital survey ratings taken together.
model13 <- lm(
  formula = `Median Household income` ~ Summary + `Overall Hospital Rating` +
    Quietness + Cleanliness + `Care Transition` + `Discharge Information` +
    `Communication About Medicines` + `Staff Responsiveness` +
    `Doctor Communication` + `Nurse Communication` + `Hospital overall rating` +
    `Recommend Hospital` + `Patient Survey Star Rating`,
  data = dataset
)
#Adjusted R-squared was .1301
#NOTE(review): the coefficient for `Patient Survey Star Rating` is reported as NA
#("not defined because of singularities"), i.e. lm() treated it as redundant given the
#other predictors - presumably it duplicates another rating column; confirm in the data
summary(model13)
##
## Call:
## lm(formula = `Median Household income` ~ Summary + `Overall Hospital Rating` +
## Quietness + Cleanliness + `Care Transition` + `Discharge Information` +
## `Communication About Medicines` + `Staff Responsiveness` +
## `Doctor Communication` + `Nurse Communication` + `Hospital overall rating` +
## `Recommend Hospital` + `Patient Survey Star Rating`, data = dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -56240 -14905 -3844 9845 143686
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 58003.8 1986.8 29.195 < 2e-16 ***
## Summary 1154.3 1452.3 0.795 0.426780
## `Overall Hospital Rating` 3869.3 1132.7 3.416 0.000644 ***
## Quietness -4175.8 522.0 -8.000 1.76e-15 ***
## Cleanliness 634.3 576.9 1.099 0.271687
## `Care Transition` 723.1 824.7 0.877 0.380693
## `Discharge Information` -1795.6 695.3 -2.583 0.009853 **
## `Communication About Medicines` -3065.5 780.6 -3.927 8.78e-05 ***
## `Staff Responsiveness` -3604.5 697.5 -5.167 2.53e-07 ***
## `Doctor Communication` -2257.1 689.8 -3.272 0.001079 **
## `Nurse Communication` -1513.1 869.6 -1.740 0.081966 .
## `Hospital overall rating` 4838.7 482.7 10.025 < 2e-16 ***
## `Recommend Hospital` 6339.5 944.7 6.711 2.31e-11 ***
## `Patient Survey Star Rating` NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22990 on 3008 degrees of freedom
## (22 observations deleted due to missingness)
## Multiple R-squared: 0.1336, Adjusted R-squared: 0.1301
## F-statistic: 38.65 on 12 and 3008 DF, p-value: < 2.2e-16
#this is the best model made
#Model 14 keeps only the model 13 predictors whose p-values were about .001 or smaller
#(`Doctor Communication`, at p = .00108 in model 13, sits marginally above the .001 cutoff)
model14 <- lm(
  formula = `Median Household income` ~ `Overall Hospital Rating` + Quietness +
    `Communication About Medicines` + `Staff Responsiveness` + `Doctor Communication` +
    `Hospital overall rating` + `Recommend Hospital`,
  data = dataset
)
#Adjusted R-squared was .1281
summary(model14)
##
## Call:
## lm(formula = `Median Household income` ~ `Overall Hospital Rating` +
## Quietness + `Communication About Medicines` + `Staff Responsiveness` +
## `Doctor Communication` + `Hospital overall rating` + `Recommend Hospital`,
## data = dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57151 -14792 -3866 9899 145909
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 55824.1 1760.9 31.701 < 2e-16 ***
## `Overall Hospital Rating` 3667.1 1088.7 3.368 0.000766 ***
## Quietness -3823.1 499.9 -7.648 2.73e-14 ***
## `Communication About Medicines` -3163.9 726.1 -4.358 1.36e-05 ***
## `Staff Responsiveness` -3712.5 613.2 -6.054 1.59e-09 ***
## `Doctor Communication` -2394.2 628.5 -3.809 0.000142 ***
## `Hospital overall rating` 4731.8 476.7 9.927 < 2e-16 ***
## `Recommend Hospital` 6302.6 911.4 6.915 5.69e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23020 on 3013 degrees of freedom
## (22 observations deleted due to missingness)
## Multiple R-squared: 0.1301, Adjusted R-squared: 0.1281
## F-statistic: 64.38 on 7 and 3013 DF, p-value: < 2.2e-16
#which is not as good as using all the features
#but better than any of the models for predicting star ratings
#Some Visualizations
#one histogram of median household income per overall hospital rating value, side by side,
#meant to illustrate why all the models are terrible
#(zoom the plot to read the numbers on the x-axis)
ggplot(data = dataset) +
  geom_histogram(
    mapping = aes(x = `Median Household income`),
    color = "black",
    fill = "navyblue"
  ) +
  facet_grid(cols = vars(`Overall Hospital Rating`)) +
  theme_bw() +
  labs(
    title = "Distribution of Median Household Income in Zip Code by Overall Hospital Rating",
    x = "Median Household Income in Zip Code"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 22 rows containing non-finite values (`stat_bin()`).
#Diagnostic plots for model 14, which is probably the most usable model
#(swap in any other model object to get the same plots for it)
#the plots are laid out on a 2x2 grid with slightly reduced top/right margins
#par() returns the settings it replaces, so they are saved here and restored afterwards;
#this puts back both mfrow and mar instead of only resetting mfrow
old_par <- par(mfrow = c(2, 2), mar = c(5, 4, 2, 2))
#plot model14 so the code matches the model named in the comment above
plot(model14)
par(old_par)
#box-plots of median household income for each hospital ownership type
#(again, zoom the plot to see the x-axis labels more clearly)
#the author's reading: type of hospital has no real effect on median household income,
#so ownership did not need to be accounted for in the analyses
ggplot(data = dataset) +
  geom_boxplot(
    mapping = aes(x = Ownership, y = `Median Household income`, fill = Ownership)
  ) +
  theme_bw() +
  labs(title = "Boxplots of Median Household Income by Type of Hospital")
## Warning: Removed 22 rows containing non-finite values (`stat_boxplot()`).
#histogram of median household income across the whole data set
ggplot(data = dataset) +
  geom_histogram(
    mapping = aes(x = `Median Household income`),
    color = "black",
    fill = "violet"
  ) +
  theme_bw() +
  labs(
    title = "Distribution of Median Household Income in Zip Code",
    x = "Median Household Income in Zip Code"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 22 rows containing non-finite values (`stat_bin()`).