##load packages to use
library(readr)    # read_csv()
library(dplyr)    # %>%, mutate(), summarise()
library(ggplot2)  # ggplot(), geom_*(), stat_smooth(), labs()
library(broom)    # augment()
library(nortest)  # ad.test()
library(lmtest)   # bptest()

Question 1. Obtain the mean, standard deviation, min and max values for both x and y variables

# Import the dataset and, in the same step, add the transformed columns:
# squared and exponentiated length of stay, plus square-root cost.
Hos_los <- mutate(
  read_csv("Hospital.csv"),
  sq_los = los^2,
  exp_los = exp(los),
  sqr_cost = sqrt(cost)
)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   cost = col_double(),
##   los = col_double()
## )
# Summary statistics for hospitalization cost (Y): min, max, mean and
# standard deviation, ignoring missing values.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
summarise(
  Hos_los,
  Min_value = min(cost, na.rm = TRUE),
  Max_value = max(cost, na.rm = TRUE),
  Mean_value = mean(cost, na.rm = TRUE),
  sd_value = sd(cost, na.rm = TRUE)
)
# Summary statistics for length of stay (X): same four measures.
summarise(
  Hos_los,
  Min_value = min(los, na.rm = TRUE),
  Max_value = max(los, na.rm = TRUE),
  Mean_value = mean(los, na.rm = TRUE),
  sd_value = sd(los, na.rm = TRUE)
)

Question 2. Fit a simple linear regression model and write down the equation.

# Simple linear regression of hospitalization cost (Y) on length of stay (X):
#   Y = β0 + β1 * X + ϵ
fit <- lm(formula = cost ~ los, data = Hos_los)
print(fit)
## 
## Call:
## lm(formula = cost ~ los, data = Hos_los)
## 
## Coefficients:
## (Intercept)          los  
##      2702.6        371.2
# Fitted equation: Ŷ = 2702.6 + 371.2X  (Ŷ = predicted cost, X = length of stay; the fitted line itself has no error term ϵ)

Question 3. What is the SST, SSE, and SSR for this model, respectively?

# ANOVA table of the fitted model; the sums of squares below are read from it.
anova(object = fit)
# SST (total sum of squares) = SSE + SSR = 723596123 + 1260939920 = 1984536043

# SSE (error / residual sum of squares) = 723596123

# SSR (regression sum of squares) = 1260939920
# Check: SSR / SST = 1260939920 / 1984536043 ≈ 0.6354

Question 4. How do we interpret the coefficient and the intercept, respectively?

# Slope: for every additional day spent in the hospital, the cost of hospitalization increases by 371.2 on average.
# Intercept: the expected cost of hospitalization when the length of hospital stay is zero. In this case, when the length of hospital stay equals zero, the average cost of hospitalization would be 2702.6.

Question 5. What is the R2? How do we interpret this?

# Full model summary; R-squared is read from its "Multiple R-squared" line.
summary(object = fit)
## 
## Call:
## lm(formula = cost ~ los, data = Hos_los)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -7552  -2723  -1187   2390  16076 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2702.6     1182.3   2.286   0.0293 *  
## los            371.2       50.5   7.350 2.83e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4831 on 31 degrees of freedom
## Multiple R-squared:  0.6354, Adjusted R-squared:  0.6236 
## F-statistic: 54.02 on 1 and 31 DF,  p-value: 2.834e-08
# R-squared = 0.6354
# Interpretation of R-squared
# About 64% of the variation in the cost of hospitalization is explained by the length of hospital stay.

Question 6. Does a linear function appear to fit the data well? If not, does the plot suggest any other potential problems with the model?

# Scatter plot of cost against length of stay with ggplot2's default smoother.
ggplot(data = Hos_los, mapping = aes(x = los, y = cost)) +
  geom_point() +
  geom_smooth() +
  labs(x = "Length of Hospital Stay", y = "Hospitalization Cost")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# No, the linear function does not appear to fit the data well.
# Observed points, the least-squares line, and a thin red segment joining each
# observation to its fitted value (the `.fitted` column of the augmented data).
metric <- augment(fit)
ggplot(data = metric, mapping = aes(x = los, y = cost)) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE) +
  geom_segment(
    mapping = aes(xend = los, yend = .fitted),
    colour = "red",
    size = 0.3
  )
## `geom_smooth()` using formula 'y ~ x'

# Non Constant variance.
# Non normality of the residuals 

Question 7. Make a residuals vs. fits plot. Interpret the residuals vs. fits plot — which model assumption does it suggest is violated? Elaborate your answer.

# Residuals vs. fitted values diagnostic plot (first of the lm diagnostics).
plot(fit, which = 1)

# The residual plot shows a pattern: the red line is not approximately horizontal at zero. Since there is a pattern in the plot, it might indicate that the variance of the residuals is not constant across the fitted values. Hence, we can assume that the assumption of homoscedasticity might not hold.

Question 8. Test the normality of residuals assumption. Does the test provide evidence that the residuals are not normally distributed?

# Normal Q-Q plot of the residuals (second of the lm diagnostics).
plot(fit, which = 2)

# Anderson-Darling test applied to the model residuals.
ad.test(fit$residuals)
## 
##  Anderson-Darling normality test
## 
## data:  fit$residuals
## A = 0.88385, p-value = 0.02103
# Yes, the test shows that the normality-of-residuals assumption has been violated. In the Q-Q plot the residual points do not align with the dotted line; for the assumption of normality to hold, the points should approximately follow the straight dashed line. The p-value (0.021) being less than 0.05 is a further indication that the assumption of normality does not hold.

Question 9. To fix the problem identified in (7), what transformation would you use to fix the problem?

# A possible solution to reduce the non-normality problem is to use a log or square-root transformation of the outcome variable (Cost of Hospitalization)

Question 10. To fix the problem identified in (6), what transformation would you use?

# To fix the linearity problem, I would transform the independent variable by taking its exponential or its square. This means the square or exponential of the length of hospital stay would be used to fit the model.

Question 11. Does the transformation appear to have helped rectify the original problems with the model? Why? Justify your answer by providing visual evidence and statistical tests results.

# No, the linear function still does not appear to fit the data well. After the transformation, the model looks like it needs further transformation.
# Refit the model with squared length of stay as the predictor, then plot
# cost against that transformed predictor with the default smoother.
fit2 <- lm(cost ~ sq_los, data = Hos_los)
ggplot(Hos_los, aes(x = sq_los, y = cost)) +
  geom_point() +
  geom_smooth() +
  # The x aesthetic is sq_los, so the axis label names the squared variable
  # rather than the untransformed length of stay.
  labs(y = "Hospitalization Cost", x = "Squared Length of Hospital Stay")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# No, the linear function does not appear to fit the data well.
# Anderson-Darling normality test on the residuals of the squared-los model.
# The output below reports p = 0.0001086 (< 0.05), so normality of the
# residuals is still rejected after the transformation.
ad.test(fit2$residuals)
## 
##  Anderson-Darling normality test
## 
## data:  fit2$residuals
## A = 1.792, p-value = 0.0001086
# Studentized Breusch-Pagan test on fit2. The output that follows reports
# p = 0.8702, i.e. no evidence of non-constant variance at the 5% level.
bptest(fit2)
## 
##  studentized Breusch-Pagan test
## 
## data:  fit2
## BP = 0.026712, df = 1, p-value = 0.8702

Question 12. Is there an association between hospitalization cost and length of stay? Justify your answer.

#Yes 
# Correlation between cost and los, recovered as the square root of the
# Multiple R-squared (0.6354) reported by summary(fit); the positive root is
# used because the fitted slope is positive.
sqrt(0.6354)
## [1] 0.7971198
#H0: There is no correlation between the two variables: ρ = 0
#Ha: There is a nonzero correlation between the two variables: ρ ≠ 0
# NOTE(review): this test correlates cost with sq_los (squared length of
# stay), not with los itself, so its estimate differs from sqrt(0.6354)
# above. Confirm which variable is intended for this question.
cor.test(Hos_los$cost, Hos_los$sq_los, method="pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  Hos_los$cost and Hos_los$sq_los
## t = 6.276, df = 31, p-value = 5.609e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5446045 0.8683559
## sample estimates:
##      cor 
## 0.748052
# Since r = 0.75 (cost vs. squared length of stay) and the p-value is less than 0.05, at the 5% significance level we reject the null hypothesis and conclude that there is an association between hospitalization cost and length of stay.