##load libraries

> library(haven)
> library(readr)
> library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
> library(knitr)
> library(tidyverse)
-- Attaching packages ----------------------------------------------------- tidyverse 1.3.0 --
v ggplot2 3.3.2     v purrr   0.3.4
v tibble  3.0.1     v stringr 1.4.0
v tidyr   1.1.0     v forcats 0.5.0
-- Conflicts -------------------------------------------------------- tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
> library(readxl)
> library(ggplot2)
> library(broom)
> library(nortest)
Warning: package 'nortest' was built under R version 4.0.3
> library(lmtest)
Loading required package: zoo

Attaching package: 'zoo'
The following objects are masked from 'package:base':

    as.Date, as.Date.numeric
> library(MASS)

Attaching package: 'MASS'
The following object is masked from 'package:dplyr':

    select
> library(readr)
> library(car)
Loading required package: carData

Attaching package: 'car'
The following object is masked from 'package:purrr':

    some
The following object is masked from 'package:dplyr':

    recode
> hospital <- read.delim("C:/Users/drayr/OneDrive/Desktop/DEM Fall 2020/STATS 7273/Data/hospital.txt")

##1

> library(psych)

Attaching package: 'psych'
The following object is masked from 'package:car':

    logit
The following objects are masked from 'package:ggplot2':

    %+%, alpha
> describe(hospital)
     vars  n    mean      sd median trimmed     mad  min   max range skew
cost    1 33 8810.18 7875.07   5870 7435.67 5252.85 1233 35381 34148 1.58
los     2 33   16.39   16.89     12   13.26   10.38    1    85    84 2.31
     kurtosis      se
cost     2.25 1370.87
los      6.13    2.94

##2

> fit<- lm(cost~los , data= hospital)
> summary(fit)

Call:
lm(formula = cost ~ los, data = hospital)

Residuals:
   Min     1Q Median     3Q    Max 
 -7590  -2742  -1214   2377  16052 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  2706.96    1176.67   2.301   0.0283 *  
los           372.29      50.38   7.390 2.54e-08 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4815 on 31 degrees of freedom
Multiple R-squared:  0.6379,    Adjusted R-squared:  0.6262 
F-statistic: 54.61 on 1 and 31 DF,  p-value: 2.542e-08
> anova(fit)
Analysis of Variance Table

Response: cost
          Df     Sum Sq    Mean Sq F value    Pr(>F)    
los        1 1265921342 1265921342   54.61 2.542e-08 ***
Residuals 31  718614701   23181119                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
> ##3
> SST<-1265921342 + 718614701   
> SSE<-1265921342
> SSR<-718614701
> 
> SST
[1] 1984536043
> SSE
[1] 1265921342
> SSR
[1] 718614701
> ##4 Coefficient: We estimate that as length of stay increases by one unit, cost will increase by $372.29
> ##  Intercept: When length of stay is zero, cost is $2,706.96
> 
> ##5 R^2= .64; Approximately 64 percent of the variance in cost is explained by length of stay.

##6

> attach(hospital)
> plot(los, cost, main="Scatterplot",
+    xlab="Length of Stay ", ylab="Cost ")
>    abline(lm(cost~los), col="red") 

>    ggplot(hospital, aes(x=los, y=cost))+geom_point()+geom_smooth(method = "lm", se = FALSE)
`geom_smooth()` using formula 'y ~ x'

##7

> plot(fit, which=1)

> #Constant variance of residuals is violated. There is more variance for lower fitted values as indicated by the red line deviating from the horizontal line.

##8

> ad.test(resid(fit))

    Anderson-Darling normality test

data:  resid(fit)
A = 0.91897, p-value = 0.01714
> #Since p-value is <.05, there is evidence that the residuals are not normally distributed

##9

> #Transform the dependent variable to normalize error and equalize variance of errors

##10

> #Transform the independent variable to linearize a model
> attach(hospital)
The following objects are masked from hospital (pos = 3):

    cost, los
> plot(los, cost, main="Scatterplot",
+    xlab="Length of Stay ", ylab="Cost ")
> lines(lowess(los, cost), col="blue") # lowess line (x,y)

##11

> hospital2<-hospital%>%
+ mutate(explos = los^2)
> 
> attach(hospital2)
The following objects are masked from hospital (pos = 3):

    cost, los
The following objects are masked from hospital (pos = 4):

    cost, los
> plot(explos, cost, main="Scatterplot",
+    xlab="Length of Stay ", ylab="Cost ")
>    abline(lm(cost~explos), col="red") 

>    fit2<- lm(cost~explos , data= hospital2)
> summary(fit2)

Call:
lm(formula = cost ~ explos, data = hospital2)

Residuals:
   Min     1Q Median     3Q    Max 
 -5150  -3632  -2102   2495  17637 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 6378.6867  1001.6691   6.368 4.32e-07 ***
explos         4.4570     0.7095   6.282 5.52e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 5307 on 31 degrees of freedom
Multiple R-squared:   0.56, Adjusted R-squared:  0.5458 
F-statistic: 39.46 on 1 and 31 DF,  p-value: 5.52e-07
> anova(fit2)
Analysis of Variance Table

Response: cost
          Df     Sum Sq    Mean Sq F value   Pr(>F)    
explos     1 1111395127 1111395127  39.459 5.52e-07 ***
Residuals 31  873140916   28165836                     
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
> boxcox(fit)

##11(cont)

> hospital3<-hospital%>%
+ mutate(costlog = log(cost))
> 
> attach(hospital3)
The following objects are masked from hospital2:

    cost, los
The following objects are masked from hospital (pos = 4):

    cost, los
The following objects are masked from hospital (pos = 5):

    cost, los
> plot(los, costlog, main="Scatterplot",
+    xlab="Length of Stay ", ylab="Cost ")
>    abline(lm(costlog~los), col="red") 

>    fit3<- lm(costlog~los , data= hospital3)
> summary(fit3)

Call:
lm(formula = costlog ~ los, data = hospital3)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.07050 -0.46336  0.01929  0.29820  1.35821 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 8.152469   0.157570  51.739  < 2e-16 ***
los         0.035232   0.006746   5.223 1.13e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.6447 on 31 degrees of freedom
Multiple R-squared:  0.468, Adjusted R-squared:  0.4509 
F-statistic: 27.27 on 1 and 31 DF,  p-value: 1.135e-05
> anova(fit3)
Analysis of Variance Table

Response: costlog
          Df Sum Sq Mean Sq F value    Pr(>F)    
los        1 11.338 11.3381  27.275 1.135e-05 ***
Residuals 31 12.886  0.4157                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
>    plot(fit3, which=1)

>    bptest(fit3)

    studentized Breusch-Pagan test

data:  fit3
BP = 0.55063, df = 1, p-value = 0.4581
>    ad.test(resid(fit3))

    Anderson-Darling normality test

data:  resid(fit3)
A = 0.28101, p-value = 0.6188

##11(cont)

> hospital4<-hospital%>%
+ mutate(costlog = log(cost),
+        loslog = log(los))
> 
> attach(hospital4)
The following objects are masked from hospital3:

    cost, costlog, los
The following objects are masked from hospital2:

    cost, los
The following objects are masked from hospital (pos = 5):

    cost, los
The following objects are masked from hospital (pos = 6):

    cost, los
> plot(loslog, costlog, main="Scatterplot",
+    xlab="Length of Stay ", ylab="Cost ")
>    abline(lm(costlog~loslog), col="red") 

>    fit4<- lm(costlog~loslog , data= hospital4)
> summary(fit4)

Call:
lm(formula = costlog ~ loslog, data = hospital4)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.20000 -0.38642 -0.01729  0.40145  1.05618 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  7.09154    0.25544  27.762  < 2e-16 ***
loslog       0.69096    0.09975   6.927 9.07e-08 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.5538 on 31 degrees of freedom
Multiple R-squared:  0.6075,    Adjusted R-squared:  0.5948 
F-statistic: 47.98 on 1 and 31 DF,  p-value: 9.066e-08
> anova(fit4)
Analysis of Variance Table

Response: costlog
          Df  Sum Sq Mean Sq F value    Pr(>F)    
loslog     1 14.7164 14.7164   47.98 9.066e-08 ***
Residuals 31  9.5082  0.3067                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
>    plot(fit4, which=1)

>    bptest(fit4)

    studentized Breusch-Pagan test

data:  fit4
BP = 0.0093499, df = 1, p-value = 0.923
>    ad.test(resid(fit4))

    Anderson-Darling normality test

data:  resid(fit4)
A = 0.12663, p-value = 0.9831