##Load libraries
library(haven)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
library(broom)
library(ggplot2)
library(readxl)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(nortest)
##Load data
hospital <- read.delim("C:/Users/okabe/OneDrive/Stats of Dem/hospital.txt")
View(hospital)
#Hospital administrators are interested in determining how hospitalization cost (y = cost) is related to length of stay (x = los) in the hospital. The dataset contains the reimbursed hospital costs and associated lengths of stay for a sample of 33 people. Use hospital.txt data to answer the following questions:
##1. Obtain the mean, standard deviation, min and max values for both x and y variables.
##describe() from the psych package works as well (see the sketch below)
summary(hospital$los)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 6.00 12.00 16.39 19.00 85.00
summary(hospital$cost)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1233 3193 5870 8810 12213 35381
sd(hospital$los)
## [1] 16.89478
sd(hospital$cost)
## [1] 7875.072
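##A sketch of the describe() alternative mentioned above (the psych package is loaded at the top); it reports the mean, sd, min and max for both variables in one call.
describe(hospital[, c("los", "cost")])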
##2. Fit a simple linear regression model and write down the equation.
fit<-lm(cost~los, data=hospital)
summary(fit)
##
## Call:
## lm(formula = cost ~ los, data = hospital)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7590 -2742 -1214 2377 16052
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2706.96 1176.67 2.301 0.0283 *
## los 372.29 50.38 7.390 2.54e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4815 on 31 degrees of freedom
## Multiple R-squared: 0.6379, Adjusted R-squared: 0.6262
## F-statistic: 54.61 on 1 and 31 DF, p-value: 2.542e-08
coef(fit)
## (Intercept) los
## 2706.9613 372.2852
anova(fit)
## Analysis of Variance Table
##
## Response: cost
## Df Sum Sq Mean Sq F value Pr(>F)
## los 1 1265921342 1265921342 54.61 2.542e-08 ***
## Residuals 31 718614701 23181119
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## y-hat = b0 + b1*x
##cost-hat = 2706.96 + 372.29*los
##3. What is the SST, SSE, and SSR for this model, respectively?
##SST = 1984536043 (SSR + SSE)
##SSR = 1265921342 (the los row of the ANOVA table)
##SSE = 718614701 (the Residuals row of the ANOVA table)
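##A sketch that pulls these sums of squares out of the ANOVA table (the object fit comes from question 2):
ss <- anova(fit)
SSR <- ss["los", "Sum Sq"]        # regression sum of squares
SSE <- ss["Residuals", "Sum Sq"]  # error (residual) sum of squares
SST <- SSR + SSE                  # total sum of squares
c(SSR = SSR, SSE = SSE, SST = SST)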
##4. How do we interpret the coefficient and the intercept, respectively?
##Coefficient: for every one-day increase in length of stay, the hospitalization cost is expected to increase by $372.29.
##Intercept: when the length of stay (los) is zero, the predicted hospitalization cost is $2706.96.
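##As a sketch of what these coefficients mean, the predicted cost for a hypothetical 10-day stay (10 is an arbitrary example value) is 2706.96 + 372.29*10, roughly $6,430:
predict(fit, newdata = data.frame(los = 10))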
##5. What is the R2? How do we interpret this?
## R-squared = 0.6379
##About 63.8% of the total variation in hospitalization cost is explained by length of stay.
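##A sketch confirming that the R-squared reported above equals SSR/SST from the ANOVA table:
ss <- anova(fit)
ss["los", "Sum Sq"] / sum(ss[, "Sum Sq"])  # SSR / SST
summary(fit)$r.squared                     # same value, 0.6379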
##6. Does a linear function appear to fit the data well? If not, does the plot suggest any other potential problems with the model?
ggplot(hospital, aes(x=los, y=cost))+geom_point()+geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
##No, the linear function does not fit well. The plot suggests problems with linearity and a violation of normality.
##Because a straight line does not fit the data well (please see the plots below), we need to fix the non-linearity by transforming the independent variable, using the log or square root of x, before testing the normality and constant-variance assumptions.
##After that, we re-examine the relationship between x and y to see whether the transformation helped (see the sketch that follows this comment block).
##In addition, there is a violation of constant variance, which is fixed in question 10.
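##A sketch of the two candidate x-transformations mentioned above (log and square root of los), to eyeball which one straightens the relationship with cost:
par(mfrow = c(1, 2))
plot(log(hospital$los), hospital$cost, xlab = "log(los)", ylab = "cost")
plot(sqrt(hospital$los), hospital$cost, xlab = "sqrt(los)", ylab = "cost")
par(mfrow = c(1, 1))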
attach(hospital)
plot(y=cost, x=los)
fit1 <-lm(cost~los)
plot(fit1, which=1)
summary(fit)
##
## Call:
## lm(formula = cost ~ los, data = hospital)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7590 -2742 -1214 2377 16052
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2706.96 1176.67 2.301 0.0283 *
## los 372.29 50.38 7.390 2.54e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4815 on 31 degrees of freedom
## Multiple R-squared: 0.6379, Adjusted R-squared: 0.6262
## F-statistic: 54.61 on 1 and 31 DF, p-value: 2.542e-08
anova(fit1)
## Analysis of Variance Table
##
## Response: cost
## Df Sum Sq Mean Sq F value Pr(>F)
## los 1 1265921342 1265921342 54.61 2.542e-08 ***
## Residuals 31 718614701 23181119
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##7. Make a residuals vs. fits plot. Interpret the residuals vs. fits plot — which model assumption does it suggest is violated? Elaborate your answer.
plot(fit, which=1)
plot(fit)
Hosp1<-lm(cost~los, data=hospital)
summary(Hosp1)
##
## Call:
## lm(formula = cost ~ los, data = hospital)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7590 -2742 -1214 2377 16052
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2706.96 1176.67 2.301 0.0283 *
## los 372.29 50.38 7.390 2.54e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4815 on 31 degrees of freedom
## Multiple R-squared: 0.6379, Adjusted R-squared: 0.6262
## F-statistic: 54.61 on 1 and 31 DF, p-value: 2.542e-08
plot(x=Hosp1$fitted.values,y=Hosp1$residuals,xlab = "Fitted Values", ylab = "Residuals",pch=10, col="orange")
abline(h = 0)  # horizontal reference line at zero residual
bptest(fit)
##
## studentized Breusch-Pagan test
##
## data: fit
## BP = 0.16079, df = 1, p-value = 0.6884
##The residuals vs. fitted plot suggests that the constant-variance assumption is violated: the residuals become more spread out as the fitted values increase. (The Breusch-Pagan test above does not reject constant variance, p = 0.69, but the fan shape is visible in the plot.)
##There is also a violation of normality, and it is the dependent variable that we need to transform to fix it.
##This is fixed in question 9.
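##A sketch of a complementary visual check on the constant-variance assumption: the scale-location plot (square root of the standardized residuals vs. fitted values) for the same model.
plot(fit, which = 3)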
##8. Test the normality of residuals assumption. Does the test provide evidence that the residuals are not normally distributed?
plot(fit, which=2)
bptest(fit)
##
## studentized Breusch-Pagan test
##
## data: fit
## BP = 0.16079, df = 1, p-value = 0.6884
ad.test(resid(fit))
##
## Anderson-Darling normality test
##
## data: resid(fit)
## A = 0.91897, p-value = 0.01714
##Yes. The Anderson-Darling test on the residuals gives p = 0.017, which is below .05, so we reject the null hypothesis of normality: the residuals are not normally distributed.
##It is the dependent variable that we need to transform to fix this violation of normality.
##The test that provides this evidence is the AD test.
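##A sketch of complementary normality checks on the same residuals (the Shapiro-Wilk test from base stats plus a histogram); they should tell the same story as the AD test above.
shapiro.test(resid(fit))
hist(resid(fit), breaks = 10, main = "Histogram of residuals", xlab = "Residual")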
##9. To fix the problem identified in (7), what transformation would you use to fix the problem?
##Since the linear function does not fit well (please see the plots below), we need to fix the non-linearity and transform the variables before testing the normality and constant-variance assumptions. This is done with a log or square-root transformation.
##After that, we test the relationship between x and y by examining the linearity between x and y to see whether the transformation helped.
##attach(hospital)
logcost<-log(cost)
plot5<-plot(los,logcost)
lines(lowess(los,logcost),col="purple")
fitr<- lm(los~logcost)
##10. To fix the problem identified in (6), what transformation would you use?
##To fix the violation of constant variance, we run a box cox analysis
Hosp1<-lm(cost~los, data=hospital)
summary(Hosp1)
##
## Call:
## lm(formula = cost ~ los, data = hospital)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7590 -2742 -1214 2377 16052
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2706.96 1176.67 2.301 0.0283 *
## los 372.29 50.38 7.390 2.54e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4815 on 31 degrees of freedom
## Multiple R-squared: 0.6379, Adjusted R-squared: 0.6262
## F-statistic: 54.61 on 1 and 31 DF, p-value: 2.542e-08
boxcox(Hosp1)
##The Box-Cox plot shows that lambda is close to 0, so we would transform y with the log. This is seen by using boxcox, which shows that the likelihood peaks near zero, hence we take the log of y.
##The code above fixed the non-linearity.
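##A sketch that reads the Box-Cox lambda off numerically instead of from the plot (bc$x is the grid of lambda values, bc$y the corresponding log-likelihoods):
bc <- boxcox(Hosp1, plotit = FALSE)
bc$x[which.max(bc$y)]  # the lambda that maximizes the likelihood; close to 0, so log(y)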
attach(hospital)
## The following objects are masked from hospital (pos = 3):
##
## cost, los
logcost<-log(cost)
loglos<-log(los)
plot5<-plot(loglos,logcost)
lines(lowess(loglos,logcost),col="purple")
fitr<- lm(loglos~logcost)
ggplot(hospital, aes(x=loglos, y=logcost))+geom_point()+geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
##11. Does the transformation appear to have helped rectify the original problems with the model? Why? Justify your answer by providing visual evidence and statistical tests results.
summary(fitr)
##
## Call:
## lm(formula = loglos ~ logcost)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.35743 -0.40968 0.07815 0.41572 1.01576
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.3042 1.1134 -4.764 4.22e-05 ***
## logcost 0.8792 0.1269 6.927 9.07e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6247 on 31 degrees of freedom
## Multiple R-squared: 0.6075, Adjusted R-squared: 0.5948
## F-statistic: 47.98 on 1 and 31 DF, p-value: 9.066e-08
plot(fitr, which=1)
bptest(fitr)
##
## studentized Breusch-Pagan test
##
## data: fitr
## BP = 0.86886, df = 1, p-value = 0.3513
ad.test(resid(fitr))
##
## Anderson-Darling normality test
##
## data: resid(fitr)
## A = 0.28325, p-value = 0.6116
detach(hospital)
## The transformation fixed the problem, because the residuals are now approximately normal.
##This is seen in the Anderson-Darling test results.
##Before the transformation the residuals were non-normal (AD p-value below .05), but now the AD p-value is 0.61, above .05, so we fail to reject the null hypothesis of normality and conclude the residuals are consistent with a normal distribution.
##Likewise, the Breusch-Pagan p-value of 0.35 is above .05, so we fail to reject the null hypothesis of constant variance; there is no evidence of a constant-variance violation in the transformed model.
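##A compact sketch collecting the before/after p-values referenced above (fit is the original model, fitr the transformed one):
c(AD_before = ad.test(resid(fit))$p.value,
  AD_after  = ad.test(resid(fitr))$p.value,
  BP_before = unname(bptest(fit)$p.value),
  BP_after  = unname(bptest(fitr)$p.value))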
##12. Is there an association between hospitalization cost and length of stay? Justify your answer.
##run regression
fit1<-lm(logcost~los, data=hospital)
summary(fit1)
##
## Call:
## lm(formula = logcost ~ los, data = hospital)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.07050 -0.46336 0.01929 0.29820 1.35821
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.152469 0.157570 51.739 < 2e-16 ***
## los 0.035232 0.006746 5.223 1.13e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6447 on 31 degrees of freedom
## Multiple R-squared: 0.468, Adjusted R-squared: 0.4509
## F-statistic: 27.27 on 1 and 31 DF, p-value: 1.135e-05
## Yes.
##There is a positive association between length of stay and hospitalization cost: as the length of stay increases, the hospitalization cost increases.
##The association is statistically significant, since the p-value for los (1.13e-05) is less than .05.
##Fitted equation: log(cost)-hat = 8.152469 + 0.035232*los
##Because the response is on the log scale, each additional day of stay multiplies the expected cost by exp(0.035232):
exp(0.035232)
## [1] 1.03586
##i.e. about a 3.6% (3.586%) increase in cost for every one-day increase in length of stay
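##A sketch converting the slope of the log-cost model (fit1 above) into a percent change in cost per additional day, with a 95% confidence interval on the same scale:
(exp(coef(fit1)["los"]) - 1) * 100       # percent increase in cost per extra day of stay
(exp(confint(fit1)["los", ]) - 1) * 100  # 95% CI for that percent increase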