# Descriptive statistics for every variable in hos (here: Cost and Los).
# NOTE(review): the output columns below (vars, n, mean, sd, ..., se) look like
# psych::describe() -- confirm which attached package provides describe().
describe(hos)
## vars n mean sd median trimmed mad min max range skew
## Cost 1 33 8810.18 7875.07 5870 7435.67 5252.85 1233 35381 34148 1.58
## Los 2 33 16.39 16.89 12 13.26 10.38 1 85 84 2.31
## kurtosis se
## Cost 2.25 1370.87
## Los 6.13 2.94
#Cost: mean is 8810.18, standard deviation is 7875.07, min is 1233, max is 35381.
#Los: mean is 16.39, standard deviation is 16.89, min is 1, max is 85.
# Simple linear regression of hospitalization cost (Cost) on length of
# stay (Los), followed by the coefficient table and fit statistics.
fit <- lm(formula = Cost ~ Los, data = hos)
summary(fit)
##
## Call:
## lm(formula = Cost ~ Los, data = hos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7590 -2742 -1214 2377 16052
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2706.96 1176.67 2.301 0.0283 *
## Los 372.29 50.38 7.390 2.54e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4815 on 31 degrees of freedom
## Multiple R-squared: 0.6379, Adjusted R-squared: 0.6262
## F-statistic: 54.61 on 1 and 31 DF, p-value: 2.542e-08
# Extract the estimated intercept and slope of fit at full printed precision.
coef(fit)
## (Intercept) Los
## 2706.9613 372.2852
# Fitted model: Cost = 2706.96 + 372.29 * Los + E (E is the error term)
# ANOVA table for fit: sums of squares for Los and for the residuals.
anova(fit)
## Analysis of Variance Table
##
## Response: Cost
## Df Sum Sq Mean Sq F value Pr(>F)
## Los 1 1265921342 1265921342 54.61 2.542e-08 ***
## Residuals 31 718614701 23181119
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#SST is 1984536043 (sse+ssr)
#SSE is 1265921342
#SSR is 718614701
#The estimated coefficient for length of stay (Los) is 372.29: for each additional day spent in the hospital, the predicted cost increases by $372.29.
#The intercept is 2706.96: this is the predicted cost when x (Los) is zero.
#R2=0.6379, approximately 64% of sample variation in cost is explained by length of stay.
#ggplot
# Scatterplot of Cost against Los with the least-squares line overlaid.
ggplot(hos, aes(x = Los, y = Cost)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
# Same scatterplot with geom_smooth()'s default smoother, to check whether a
# straight line is adequate. FALSE is spelled out because F is an ordinary
# variable that can be reassigned.
ggplot(hos, aes(x = Los, y = Cost)) +
  geom_point() +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#Based on the plot, no, the linear function does not appear to fit the data well. There is non-linearity. Potential problems include violations of the linearity and normality assumptions.
# attach() is kept because the statements below (and later ones in this
# script) refer to Cost and Los as bare names.
attach(hos)
# Scatterplot with the predictor (Los) on x and the response (Cost) on y.
plot(x = Los, y = Cost)
# Same regression as before, fitted on the attached columns.
f1 <- lm(Cost ~ Los)
# Residuals-vs-fitted plot for f1.
plot(f1, which = 1)
# Redraw the scatterplot with Los on the x-axis so that abline(f1), which
# draws Cost as a function of Los, is plotted on matching axes.
with(hos, plot(Los, Cost))
abline(f1, col = "purple")
summary(f1)
##
## Call:
## lm(formula = Cost ~ Los)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7590 -2742 -1214 2377 16052
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2706.96 1176.67 2.301 0.0283 *
## Los 372.29 50.38 7.390 2.54e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4815 on 31 degrees of freedom
## Multiple R-squared: 0.6379, Adjusted R-squared: 0.6262
## F-statistic: 54.61 on 1 and 31 DF, p-value: 2.542e-08
# ANOVA table for f1: sums of squares for Los and for the residuals.
anova(f1)
## Analysis of Variance Table
##
## Response: Cost
## Df Sum Sq Mean Sq F value Pr(>F)
## Los 1 1265921342 1265921342 54.61 2.542e-08 ***
## Residuals 31 718614701 23181119
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Scatterplot of Cost against log(Los) with a least-squares line, to see
# whether logging the predictor straightens the relationship.
ggplot(data = hos, mapping = aes(x = log(Los), y = Cost)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
# 7. Make a residuals vs. fits plot. Interpret the residuals vs. fits plot — which model assumption does it suggest is violated? Elaborate your answer.
#Plots of residuals vs fits, normal QQ, Scale-Location, Residuals vs Leverage show there is no constant variance. The assumption of constant variance is violated.
# Residuals-vs-fitted plot for fit on its own.
plot(fit, which = 1)
# The full default set of lm diagnostic plots (plot.lm's default `which`):
# residuals vs fitted, normal Q-Q, scale-location, residuals vs leverage.
plot(fit, which = c(1, 2, 3, 5))
#Yes, it violates the assumption of normality.
# Refit the untransformed model Cost ~ Los under the name fit1 and print
# its coefficient table and fit statistics.
fit1 <- lm(formula = Cost ~ Los, data = hos)
summary(fit1)
##
## Call:
## lm(formula = Cost ~ Los, data = hos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7590 -2742 -1214 2377 16052
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2706.96 1176.67 2.301 0.0283 *
## Los 372.29 50.38 7.390 2.54e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4815 on 31 degrees of freedom
## Multiple R-squared: 0.6379, Adjusted R-squared: 0.6262
## F-statistic: 54.61 on 1 and 31 DF, p-value: 2.542e-08
# ANOVA table for fit1: sums of squares for Los and for the residuals.
anova(fit1)
## Analysis of Variance Table
##
## Response: Cost
## Df Sum Sq Mean Sq F value Pr(>F)
## Los 1 1265921342 1265921342 54.61 2.542e-08 ***
## Residuals 31 718614701 23181119
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Default lm diagnostic plots for fit1: residuals vs fitted, normal Q-Q,
# scale-location, residuals vs leverage.
plot(fit1, which = c(1, 2, 3, 5))
# Box-Cox plot for fit1, used further down to choose a transformation of
# the response.
boxcox(fit1)
# Studentized Breusch-Pagan test for non-constant error variance in fit1.
bptest(fit1)
##
## studentized Breusch-Pagan test
##
## data: fit1
## BP = 0.16079, df = 1, p-value = 0.6884
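#The residuals vs. fits, normal Q-Q, scale-location, and residuals vs. leverage plots suggest that the variance is not constant. Note, however, that the Breusch-Pagan test above (BP = 0.16079, p-value = 0.6884) does not reject constant variance at the 5% level, so the formal test does not confirm the violation seen in the plots.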
# The untransformed model Cost ~ Los once more, stored as hos1, with its
# coefficient table and fit statistics.
hos1 <- lm(Cost ~ Los, data = hos)
summary(hos1)
##
## Call:
## lm(formula = Cost ~ Los, data = hos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7590 -2742 -1214 2377 16052
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2706.96 1176.67 2.301 0.0283 *
## Los 372.29 50.38 7.390 2.54e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4815 on 31 degrees of freedom
## Multiple R-squared: 0.6379, Adjusted R-squared: 0.6262
## F-statistic: 54.61 on 1 and 31 DF, p-value: 2.542e-08
#visual test
# Normal Q-Q plot of the residuals of fit (which = 2 selects that panel).
# fit and fit1 are both created by lm(Cost~Los, data=hos), so the plot and
# the test below look at the same residuals.
plot(fit, which = 2)
#formal test
# Anderson-Darling normality test applied to the residuals of fit1.
# NOTE(review): ad.test() is presumably nortest::ad.test -- confirm.
ad.test(resid(fit1))
##
## Anderson-Darling normality test
##
## data: resid(fit1)
## A = 0.91897, p-value = 0.01714
#Both formal and visual tests provide evidence that the residuals are not normally distributed.
#problem in number 7 is no constant variance.
# Log-transform the response. Cost and Los are found through the earlier
# attach(hos).
logCost <- log(Cost)
# Scatterplot with the predictor (Los) on the x-axis and log(Cost) on the
# y-axis, matching the model fitted below and the earlier Cost-vs-Los plots.
# The lowess curve uses the same orientation.
# plot() returns NULL, so plot2 holds no plot object; the assignment is kept
# only so the name stays defined.
plot2 <- plot(Los, logCost)
lines(lowess(Los, logCost), col = "orange")
#fitr<- lm(Cost~Los)
# Log transformation of y (the response), not x: regress log(Cost) on Los.
lx <- lm(logCost ~ Los)
summary(lx)
##
## Call:
## lm(formula = logCost ~ Los)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.07050 -0.46336 0.01929 0.29820 1.35821
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.152469 0.157570 51.739 < 2e-16 ***
## Los 0.035232 0.006746 5.223 1.13e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6447 on 31 degrees of freedom
## Multiple R-squared: 0.468, Adjusted R-squared: 0.4509
## F-statistic: 27.27 on 1 and 31 DF, p-value: 1.135e-05
# Residuals-vs-fitted and normal Q-Q plots for the log(Cost) ~ Los model lx.
plot(lx, which = 1)
plot(lx, which = 2)
#log trans of y
# The same log(Cost) ~ Los regression, rebuilt under the names ly / LY, and
# its residuals-vs-fitted plot.
ly <- log(Cost)
LY <- lm(formula = ly ~ Los, data = hos)
plot(LY, which = 1)
# Box-Cox plot for the untransformed model fit1.
boxcox(fit1)
#The transformation I would use is Box-Cox.
#The value of lambda will be used to determine the transformation to use for the Y variable.
#Look at the estimated value (middle line) and determine the closest convenient value of lambda. It looks close to zero, so use a log transformation of y.
#problem in 6 is nonlinearity, to fix it transform both variables with log.
#transform both variables
# Log-transform both variables, reading the columns straight from hos.
# hos is not attached a second time: a repeated attach() only stacks another
# copy on the search path and triggers the "objects are masked" message.
logx <- log(hos$Los)
logy <- log(hos$Cost)
# Log-log model: log(Cost) regressed on log(Los).
plotof2 <- lm(logy ~ logx, data = hos)
plot(logy ~ logx, data = hos)
# Map columns of hos inside aes() rather than free-standing global vectors,
# so the plot always reflects the data frame passed to ggplot().
ggplot(hos, aes(x = log(Los), y = log(Cost))) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
# Log-linear model: log(Cost) against untransformed Los. The predictor goes
# on the x-axis so the scatter and lowess curve match the model's direction.
# plot() returns NULL, so p1 holds no plot object; the assignment is kept
# only so the name stays defined.
lgc <- log(hos$Cost)
p1 <- with(hos, plot(Los, lgc))
lines(lowess(hos$Los, lgc), col = "green")
f3 <- lm(lgc ~ Los, data = hos)
# Residuals-vs-fitted plot and summary for the log-log model.
plot(plotof2, which = 1)
summary(plotof2)
##
## Call:
## lm(formula = logy ~ logx, data = hos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.20000 -0.38642 -0.01729 0.40145 1.05618
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.09154 0.25544 27.762 < 2e-16 ***
## logx 0.69096 0.09975 6.927 9.07e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5538 on 31 degrees of freedom
## Multiple R-squared: 0.6075, Adjusted R-squared: 0.5948
## F-statistic: 47.98 on 1 and 31 DF, p-value: 9.066e-08
# Anderson-Darling normality test on the residuals of the log-log model.
# NOTE(review): ad.test() is presumably nortest::ad.test -- confirm.
ad.test(resid(plotof2))
##
## Anderson-Darling normality test
##
## data: resid(plotof2)
## A = 0.12663, p-value = 0.9831
#Yes, the transformation appears to have helped rectify the original problems with the model normality and linearity
# The p-value of the AD test (0.9831) is greater than 0.05, so we fail to reject the null hypothesis of normality; there is no evidence that the normality assumption is violated.
#Yes, there is an association between hospitalization cost and length of stay.
#run regression
# Log-linear model: lgc (the log of Cost) regressed on Los, with its
# coefficient table and fit statistics.
f3 <- lm(formula = lgc ~ Los, data = hos)
summary(f3)
##
## Call:
## lm(formula = lgc ~ Los, data = hos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.07050 -0.46336 0.01929 0.29820 1.35821
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.152469 0.157570 51.739 < 2e-16 ***
## Los 0.035232 0.006746 5.223 1.13e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6447 on 31 degrees of freedom
## Multiple R-squared: 0.468, Adjusted R-squared: 0.4509
## F-statistic: 27.27 on 1 and 31 DF, p-value: 1.135e-05
#log(y) = 8.152469 + 0.035232x
# Back-transform the Los slope of the log-cost model f3: exp(slope) is the
# multiplicative change in cost per additional day. The slope is read from
# the fitted model instead of being retyped, so it cannot drift from f3 and
# is not subject to the rounding of the printed coefficient.
exp(coef(f3)[["Los"]])
## [1] 1.03586
#(1.03586 - 1) * 100 = 3.586
#interpret
#For every one-unit increase in the independent variable (Los), the dependent variable (cost) increases by 3.6 percent. The association is statistically significant given the p value.