##load libraries
> library(haven)
> library(readr)
> library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
> library(knitr)
> library(tidyverse)
-- Attaching packages ----------------------------------------------------- tidyverse 1.3.0 --
v ggplot2 3.3.2 v purrr 0.3.4
v tibble 3.0.1 v stringr 1.4.0
v tidyr 1.1.0 v forcats 0.5.0
-- Conflicts -------------------------------------------------------- tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
> library(readxl)
> library(ggplot2)
> library(broom)
> library(nortest)
Warning: package 'nortest' was built under R version 4.0.3
> library(lmtest)
Loading required package: zoo
Attaching package: 'zoo'
The following objects are masked from 'package:base':
as.Date, as.Date.numeric
> library(MASS)
Attaching package: 'MASS'
The following object is masked from 'package:dplyr':
select
> library(readr)
> library(car)
Loading required package: carData
Attaching package: 'car'
The following object is masked from 'package:purrr':
some
The following object is masked from 'package:dplyr':
recode
> hospital <- read.delim("C:/Users/drayr/OneDrive/Desktop/DEM Fall 2020/STATS 7273/Data/hospital.txt")
##1
> library(psych)
Attaching package: 'psych'
The following object is masked from 'package:car':
logit
The following objects are masked from 'package:ggplot2':
%+%, alpha
> describe(hospital)
vars n mean sd median trimmed mad min max range skew
cost 1 33 8810.18 7875.07 5870 7435.67 5252.85 1233 35381 34148 1.58
los 2 33 16.39 16.89 12 13.26 10.38 1 85 84 2.31
kurtosis se
cost 2.25 1370.87
los 6.13 2.94
##2
> fit<- lm(cost~los , data= hospital)
> summary(fit)
Call:
lm(formula = cost ~ los, data = hospital)
Residuals:
Min 1Q Median 3Q Max
-7590 -2742 -1214 2377 16052
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2706.96 1176.67 2.301 0.0283 *
los 372.29 50.38 7.390 2.54e-08 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 4815 on 31 degrees of freedom
Multiple R-squared: 0.6379, Adjusted R-squared: 0.6262
F-statistic: 54.61 on 1 and 31 DF, p-value: 2.542e-08
> anova(fit)
Analysis of Variance Table
Response: cost
Df Sum Sq Mean Sq F value Pr(>F)
los 1 1265921342 1265921342 54.61 2.542e-08 ***
Residuals 31 718614701 23181119
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
> ##3
> SST<-1265921342 + 718614701
> SSE<-1265921342
> SSR<-718614701
>
> SST
[1] 1984536043
> SSE
[1] 1265921342
> SSR
[1] 718614701
> ##4 Coefficient: We estimate that as length of stay increases by one unit, cost will increase by $372.29
> ## Intercept: When length of stay is zero, cost is $2,706.96
>
> ##5 R^2= .64; Approximately 64 percent of the variance in cost is explained by length of stay.
##6
> attach(hospital)
> plot(los, cost, main="Scatterplot",
+ xlab="Length of Stay ", ylab="Cost ")
> abline(lm(cost~los), col="red")
> ggplot(hospital, aes(x=los, y=cost))+geom_point()+geom_smooth(method = "lm", se = FALSE)
`geom_smooth()` using formula 'y ~ x'
##7
> plot(fit, which=1)
> #Constant variance of residuals is violated. There is more variance for lower fitted values as indicated by the red line deviating from the horizontal line.
##8
> ad.test(resid(fit))
Anderson-Darling normality test
data: resid(fit)
A = 0.91897, p-value = 0.01714
> #Since p-value is <.05, there is evidence that the residuals are not normally distributed
##9
> #Transform the dependent variable to normalize error and equalize variance of errors
##10
> #Transform the independent variable to linearize a model
> attach(hospital)
The following objects are masked from hospital (pos = 3):
cost, los
> plot(los, cost, main="Scatterplot",
+ xlab="Length of Stay ", ylab="Cost ")
> lines(lowess(los, cost), col="blue") # lowess line (x,y)
##11
> hospital2<-hospital%>%
+ mutate(explos = los^2)
>
> attach(hospital2)
The following objects are masked from hospital (pos = 3):
cost, los
The following objects are masked from hospital (pos = 4):
cost, los
> plot(explos, cost, main="Scatterplot",
+ xlab="Length of Stay ", ylab="Cost ")
> abline(lm(cost~explos), col="red")
> fit2<- lm(cost~explos , data= hospital2)
> summary(fit2)
Call:
lm(formula = cost ~ explos, data = hospital2)
Residuals:
Min 1Q Median 3Q Max
-5150 -3632 -2102 2495 17637
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6378.6867 1001.6691 6.368 4.32e-07 ***
explos 4.4570 0.7095 6.282 5.52e-07 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 5307 on 31 degrees of freedom
Multiple R-squared: 0.56, Adjusted R-squared: 0.5458
F-statistic: 39.46 on 1 and 31 DF, p-value: 5.52e-07
> anova(fit2)
Analysis of Variance Table
Response: cost
Df Sum Sq Mean Sq F value Pr(>F)
explos 1 1111395127 1111395127 39.459 5.52e-07 ***
Residuals 31 873140916 28165836
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
> boxcox(fit)
##11(cont)
> hospital3<-hospital%>%
+ mutate(costlog = log(cost))
>
> attach(hospital3)
The following objects are masked from hospital2:
cost, los
The following objects are masked from hospital (pos = 4):
cost, los
The following objects are masked from hospital (pos = 5):
cost, los
> plot(los, costlog, main="Scatterplot",
+ xlab="Length of Stay ", ylab="Cost ")
> abline(lm(costlog~los), col="red")
> fit3<- lm(costlog~los , data= hospital3)
> summary(fit3)
Call:
lm(formula = costlog ~ los, data = hospital3)
Residuals:
Min 1Q Median 3Q Max
-1.07050 -0.46336 0.01929 0.29820 1.35821
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 8.152469 0.157570 51.739 < 2e-16 ***
los 0.035232 0.006746 5.223 1.13e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.6447 on 31 degrees of freedom
Multiple R-squared: 0.468, Adjusted R-squared: 0.4509
F-statistic: 27.27 on 1 and 31 DF, p-value: 1.135e-05
> anova(fit3)
Analysis of Variance Table
Response: costlog
Df Sum Sq Mean Sq F value Pr(>F)
los 1 11.338 11.3381 27.275 1.135e-05 ***
Residuals 31 12.886 0.4157
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
> plot(fit3, which=1)
> bptest(fit3)
studentized Breusch-Pagan test
data: fit3
BP = 0.55063, df = 1, p-value = 0.4581
> ad.test(resid(fit3))
Anderson-Darling normality test
data: resid(fit3)
A = 0.28101, p-value = 0.6188
##11(cont)
> hospital4<-hospital%>%
+ mutate(costlog = log(cost),
+ loslog = log(los))
>
> attach(hospital4)
The following objects are masked from hospital3:
cost, costlog, los
The following objects are masked from hospital2:
cost, los
The following objects are masked from hospital (pos = 5):
cost, los
The following objects are masked from hospital (pos = 6):
cost, los
> plot(loslog, costlog, main="Scatterplot",
+ xlab="Length of Stay ", ylab="Cost ")
> abline(lm(costlog~loslog), col="red")
> fit4<- lm(costlog~loslog , data= hospital4)
> summary(fit4)
Call:
lm(formula = costlog ~ loslog, data = hospital4)
Residuals:
Min 1Q Median 3Q Max
-1.20000 -0.38642 -0.01729 0.40145 1.05618
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.09154 0.25544 27.762 < 2e-16 ***
loslog 0.69096 0.09975 6.927 9.07e-08 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.5538 on 31 degrees of freedom
Multiple R-squared: 0.6075, Adjusted R-squared: 0.5948
F-statistic: 47.98 on 1 and 31 DF, p-value: 9.066e-08
> anova(fit4)
Analysis of Variance Table
Response: costlog
Df Sum Sq Mean Sq F value Pr(>F)
loslog 1 14.7164 14.7164 47.98 9.066e-08 ***
Residuals 31 9.5082 0.3067
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
> plot(fit4, which=1)
> bptest(fit4)
studentized Breusch-Pagan test
data: fit4
BP = 0.0093499, df = 1, p-value = 0.923
> ad.test(resid(fit4))
Anderson-Darling normality test
data: resid(fit4)
A = 0.12663, p-value = 0.9831