## Load packages to use
# The script calls read_csv/mutate/summarise/ggplot (tidyverse), augment
# (broom), ad.test (nortest) and bptest (lmtest). Attach them up front so the
# script also runs in a fresh R session.
library(tidyverse)
library(broom)
library(nortest)
library(lmtest)

# Import the dataset and add three derived columns:
#   sq_los   - length of stay squared
#   exp_los  - exponential of length of stay
#   sqr_cost - square root of hospitalization cost
Hos_los <- read_csv("Hospital.csv") %>%
  mutate(
    sq_los = los^2,
    exp_los = exp(los),
    sqr_cost = sqrt(cost)
  )
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## cost = col_double(),
## los = col_double()
## )
#Summary Statistics for Hospitalization cost(Y)
# Minimum, maximum, mean and standard deviation of cost, skipping missing
# values. `TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned, whereas `TRUE` is a reserved word.
summarise(
  Hos_los,
  Min_value = min(cost, na.rm = TRUE),
  Max_value = max(cost, na.rm = TRUE),
  Mean_value = mean(cost, na.rm = TRUE),
  sd_value = sd(cost, na.rm = TRUE)
)
##Summary Statistics for Length of stay(X)
# Minimum, maximum, mean and standard deviation of los, skipping missing
# values. `TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned, whereas `TRUE` is a reserved word.
summarise(
  Hos_los,
  Min_value = min(los, na.rm = TRUE),
  Max_value = max(los, na.rm = TRUE),
  Mean_value = mean(los, na.rm = TRUE),
  sd_value = sd(los, na.rm = TRUE)
)
#simple linear regression model
#Hospitalization cost(Y) = β0 + β1*X(length of stay) + ϵ
# Least-squares regression of cost on length of stay. Evaluating the bare
# object afterwards displays the call and the estimated coefficients.
fit <- lm(formula = cost ~ los, data = Hos_los)
fit
##
## Call:
## lm(formula = cost ~ los, data = Hos_los)
##
## Coefficients:
## (Intercept) los
## 2702.6 371.2
# Y = 2702.6 + 371.2X + ϵ
# Analysis-of-variance table for the simple linear model; the sums of squares
# quoted in the notes that follow are read off this table.
anova(object = fit)
#SST =723596123 + 1260939920 =1984536043
#SSE =723596123
#SSR =1260939920
# For every additional day spent in the hospital, the cost of hospitalization increases by 371.2 on average.
# The intercept is the expected cost of hospitalization when the length of hospital stay is zero. Hence, in this case, a stay of zero days corresponds to an average hospitalization cost of 2702.6.
# Full model summary: residual quantiles, coefficient table with standard
# errors and p-values, residual standard error, R-squared and F-statistic.
summary(object = fit)
##
## Call:
## lm(formula = cost ~ los, data = Hos_los)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7552 -2723 -1187 2390 16076
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2702.6 1182.3 2.286 0.0293 *
## los 371.2 50.5 7.350 2.83e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4831 on 31 degrees of freedom
## Multiple R-squared: 0.6354, Adjusted R-squared: 0.6236
## F-statistic: 54.02 on 1 and 31 DF, p-value: 2.834e-08
# r-square =0.635
#Interpretation of r-square
#In the case under observation, about 64% of variation in the Cost of Hospitalization can be explained by the length of hospital stay.
# Scatter plot of cost against length of stay with the default smoother
# overlaid, used to judge visually whether a straight line is adequate.
ggplot(data = Hos_los, mapping = aes(x = los, y = cost)) +
  geom_point() +
  geom_smooth() +
  labs(x = "Length of Hospital Stay", y = "Hospitalization Cost")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# No, the linear function does not appear to fit the data well.
# augment(fit) supplies the `.fitted` column that the segments below end at.
metric <- augment(fit)

# Observed points, the fitted regression line (no confidence band), and a thin
# red vertical segment from each observation to its fitted value.
ggplot(data = metric, mapping = aes(x = los, y = cost)) +
  geom_point() +
  stat_smooth(method = lm, se = FALSE) +
  geom_segment(
    mapping = aes(xend = los, yend = .fitted),
    colour = "red",
    size = 0.3
  )
## `geom_smooth()` using formula 'y ~ x'
# Non Constant variance.
# Non normality of the residuals
# First lm diagnostic plot (residuals against fitted values).
plot(fit, which = 1)
# The residual plot shows a pattern: the red line is not approximately horizontal at zero. Because there is a pattern in the plot, the residuals may not have constant variance across the fitted values. Hence, we can assume that the assumption of homoscedasticity might not hold.
# Second lm diagnostic plot (normal Q-Q plot of the residuals), followed by
# the Anderson-Darling test of the residuals for normality.
plot(fit, which = 2)
ad.test(x = fit$residuals)
##
## Anderson-Darling normality test
##
## data: fit$residuals
## A = 0.88385, p-value = 0.02103
#Yes, the test clearly shows that the normality-of-residuals assumption has been violated. The residual points do not align with the dotted line; for the assumption of normality to hold, the residual points should approximately follow the straight dashed line. The p-value below 0.05 is another indication that the assumption of normality does not hold.
# A possible solution to reduce the non-normality problem is to use a log or square transformation of the outcome variable (Cost of Hospitalization).
# To fix the linearity problem, I will transform the independent variable by taking its exponential or its square. This means the square or exponential of length of hospital stay will be used to fit the model.
# No, the linear function does not appear to fit the data well. Even after the transformation, the model looks like it needs further transformation.
# Refit the model with squared length of stay as the predictor.
fit2 <- lm(cost ~ sq_los, data = Hos_los)

# Scatter plot of cost against the squared predictor with the default
# smoother. The x axis shows sq_los, so the label says "Squared" rather than
# reusing the untransformed variable's label.
ggplot(Hos_los, aes(x = sq_los, y = cost)) +
  geom_point() +
  geom_smooth() +
  labs(y = "Hospitalization Cost", x = "Squared Length of Hospital Stay")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# No, the linear function does not appear to fit the data well.
# Anderson-Darling normality test on the residuals of the squared-predictor
# model.
ad.test(x = fit2$residuals)
##
## Anderson-Darling normality test
##
## data: fit2$residuals
## A = 1.792, p-value = 0.0001086
# Studentized Breusch-Pagan test for non-constant residual variance in the
# squared-predictor model (bptest accepts a fitted lm as its first argument).
bptest(formula = fit2)
##
## studentized Breusch-Pagan test
##
## data: fit2
## BP = 0.026712, df = 1, p-value = 0.8702
#Yes
# Correlation coefficient recovered as the square root of R-squared. 0.6354 is
# the Multiple R-squared reported by summary(fit) for the cost ~ los model.
sqrt(x = 0.6354)
## [1] 0.7971198
#H0: There is no correlation between the two variables: ρ = 0
#Ha: There is a nonzero correlation between the two variables: ρ ≠ 0
# Two-sided Pearson correlation test between cost and squared length of stay.
# NOTE(review): the sqrt(R-squared) value computed earlier comes from the
# cost ~ los model, while this test uses sq_los - confirm which predictor the
# hypothesis is meant to address.
cor.test(x = Hos_los$cost, y = Hos_los$sq_los, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: Hos_los$cost and Hos_los$sq_los
## t = 6.276, df = 31, p-value = 5.609e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5446045 0.8683559
## sample estimates:
## cor
## 0.748052
# Since r = 0.75 and the p-value is less than 0.05, at the 5% significance level (95% confidence) we reject the null hypothesis and conclude that there is an association between hospitalization cost and length of stay.