# NOTE: the former `rm(list = ls())` workspace wipe was dropped. It deletes
# every object in the caller's global environment when this script is
# sourced, and every object used below is created by this script anyway.
# Run a garbage collection and print the current memory usage.
gc()
##          used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 515521 27.6    1138248 60.8         NA   669337 35.8
## Vcells 946180  7.3    8388608 64.0      16384  1839792 14.1
# Read the hospital data set (first line of the file holds the column names)
# and preview the first six rows.
df <- read.csv(
  file = "/Users/bellajean/Desktop/week 6 data-1.csv",
  header = TRUE
)
head(df, n = 6L)
##   Expenditures Enrolled       RVUs    FTEs Quality.Score
## 1    114948144    25294  402703.73  954.91          0.67
## 2    116423140    42186  638251.99  949.25          0.58
## 3    119977702    23772  447029.54  952.51          0.52
## 4     19056531     2085   43337.26  199.98          0.93
## 5    246166031    67258 1579789.36 2162.15          0.96
## 6    152125186    23752  673036.55 1359.07          0.56
# Outlier screen: standardise Expenditures and RVUs, then count the rows whose
# z-score lies more than 3 standard deviations away from the mean.

# library() stops immediately if psych is not installed; require() would only
# return FALSE and let the script fail later at describe().
library(psych)

# Per-column summary statistics of the raw data; only "mean" and "sd" are
# used below.
hospitaldata <- describe(df)

# z-score of every row's expenditure.
df$outlier_Expenditures <-
  (df$Expenditures - hospitaldata["Expenditures", "mean"]) /
  hospitaldata["Expenditures", "sd"]

# Count |z| > 3. The z-scores are stored in df (not in the describe() table),
# and abs() has to wrap the z-score itself, not the logical comparison,
# otherwise values below -3 are never counted.
sum(abs(df$outlier_Expenditures) > 3)
## (re-run to refresh: the "[1] 0" recorded earlier was computed from
## hospitaldata, which has no outlier_Expenditures column)

# z-score of every row's RVUs.
df$outlier_RVUs <-
  (df$RVUs - hospitaldata["RVUs", "mean"]) / hospitaldata["RVUs", "sd"]

sum(abs(df$outlier_RVUs) > 3)
## (re-run to refresh: the "[1] 12" recorded earlier counted only z > 3)
# Histogram of the raw expenditures.
hist(df$Expenditures, main = "Hospital Expenditure", xlab = "")

# Histogram of the raw RVUs.
hist(df$RVUs, main = "Standard Outpatient Workload (RVU)", xlab = "")

# Pairwise correlations of the five measured variables only. The
# outlier_* columns in df are linear rescalings of Expenditures and RVUs, so
# including them merely duplicates existing rows and columns of the matrix.
measured_vars <- c("Expenditures", "Enrolled", "RVUs", "FTEs", "Quality.Score")
cor(df[, measured_vars])
##               Expenditures  Enrolled      RVUs      FTEs Quality.Score
## Expenditures     1.0000000 0.7707756 0.9217239 0.9796506     0.2749501
## Enrolled         0.7707756 1.0000000 0.9152024 0.8148491     0.2526991
## RVUs             0.9217239 0.9152024 1.0000000 0.9504093     0.3075742
## FTEs             0.9796506 0.8148491 0.9504093 1.0000000     0.2769058
## Quality.Score    0.2749501 0.2526991 0.3075742 0.2769058     1.0000000

# Scatter-plot matrix of the same five variables.
plot(df[, measured_vars])

Strong positive linear relationships between Expenditures and RVUs (r = 0.92), Expenditures and FTEs (r = 0.98), and Expenditures and Enrolled (r = 0.77).

Weak positive correlation between Expenditures and Quality.Score (r = 0.27).

# Scatter plot of expenditures against RVUs.
plot(df$RVUs, df$Expenditures, ylab = "Expenditures", xlab = "RVUs")

# Simple linear regression of Expenditures on RVUs.
univariate_reg <- lm(formula = df$Expenditures ~ df$RVUs)

# Coefficient table and fit statistics of the linear model.
summary(object = univariate_reg)
## 
## Call:
## lm(formula = df$Expenditures ~ df$RVUs)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -185723026  -14097620    2813431   11919781  642218316 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.785e+06  4.413e+06  -0.858    0.392    
## df$RVUs      2.351e+02  5.061e+00  46.449   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 67350000 on 382 degrees of freedom
## Multiple R-squared:  0.8496, Adjusted R-squared:  0.8492 
## F-statistic:  2157 on 1 and 382 DF,  p-value: < 2.2e-16
# Penalise scientific notation heavily so numbers print in fixed notation,
# then show the linear-model summary again in that format.
options(scipen = 999)
summary(object = univariate_reg)
## 
## Call:
## lm(formula = df$Expenditures ~ df$RVUs)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -185723026  -14097620    2813431   11919781  642218316 
## 
## Coefficients:
##                 Estimate   Std. Error t value            Pr(>|t|)    
## (Intercept) -3785072.158  4412905.480  -0.858               0.392    
## df$RVUs          235.072        5.061  46.449 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 67350000 on 382 degrees of freedom
## Multiple R-squared:  0.8496, Adjusted R-squared:  0.8492 
## F-statistic:  2157 on 1 and 382 DF,  p-value: < 0.00000000000000022
# Add the fitted regression line, in pink, to the currently active plot.
abline(col = "pink", reg = univariate_reg)

# Default lm diagnostic plots for the linear model.
plot(univariate_reg)

# Normal Q-Q plot of the raw expenditures ...
qqnorm(df$Expenditures)

# ... and of the log-transformed expenditures.
qqnorm(log(df$Expenditures))

We cannot assume that the residuals follow a normal distribution: the points do not align with the reference line of the normal Q-Q plot. The variance of the residuals is also not constant across the fitted values.

# Histograms of both variables on the log scale.
hist(x = log(df$Expenditures), xlab = "", main = "Log Hospital Expenditure")

hist(
  x = log(df$RVUs),
  xlab = "",
  main = "Log Standard Outpatient Workload (RVU)"
)

# Log expenditures against raw RVUs.
plot(
  x = df$RVUs,
  y = log(df$Expenditures),
  xlab = "RVUs",
  ylab = "Log of Expenditures"
)

# Log-linear model: log(Expenditures) regressed on RVUs.
univariate_reg_transformedY <- lm(formula = log(df$Expenditures) ~ df$RVUs)

# Overlay the fitted line on the scatter plot above. (A stray interactive
# `?abline` help lookup that preceded this call was removed; it has no place
# in a script and does not affect the analysis.)
abline(reg = univariate_reg_transformedY, col = "pink")

# Print the untransformed linear model's summary again.
summary(univariate_reg)
## 
## Call:
## lm(formula = df$Expenditures ~ df$RVUs)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -185723026  -14097620    2813431   11919781  642218316 
## 
## Coefficients:
##                 Estimate   Std. Error t value            Pr(>|t|)    
## (Intercept) -3785072.158  4412905.480  -0.858               0.392    
## df$RVUs          235.072        5.061  46.449 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 67350000 on 382 degrees of freedom
## Multiple R-squared:  0.8496, Adjusted R-squared:  0.8492 
## F-statistic:  2157 on 1 and 382 DF,  p-value: < 0.00000000000000022
# Coefficient table and fit statistics of the log(Expenditures) ~ RVUs model.
summary(object = univariate_reg_transformedY)
## 
## Call:
## lm(formula = log(df$Expenditures) ~ df$RVUs)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.59439 -0.29504  0.06135  0.35333  1.20871 
## 
## Coefficients:
##                   Estimate     Std. Error t value            Pr(>|t|)    
## (Intercept) 17.29584389074  0.03325414655  520.11 <0.0000000000000002 ***
## df$RVUs      0.00000134911  0.00000003814   35.38 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5076 on 382 degrees of freedom
## Multiple R-squared:  0.7661, Adjusted R-squared:  0.7655 
## F-statistic:  1251 on 1 and 382 DF,  p-value: < 0.00000000000000022
# Default lm diagnostic plots for the log-linear model.
plot(x = univariate_reg_transformedY)

# Third diagnostic plot only (which = 3), first for the linear model ...
plot(x = univariate_reg, which = 3)

# ... then for the log-linear model.
plot(x = univariate_reg_transformedY, which = 3)

# Histograms of both variables on the log scale.
hist(log(df$Expenditures), main = "Log Hospital Expenditure", xlab = "")

hist(log(df$RVUs), main = "Log Standard Outpatient Workload (RVU)", xlab = "")

# Log-log model: log(Expenditures) regressed on log(RVUs).
log_log_reg <- lm(formula = log(df$Expenditures) ~ log(df$RVUs))
summary(object = log_log_reg)
## 
## Call:
## lm(formula = log(df$Expenditures) ~ log(df$RVUs))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.74657 -0.19864 -0.02431  0.18642  0.93551 
## 
## Coefficients:
##              Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)   6.91487    0.16621   41.60 <0.0000000000000002 ***
## log(df$RVUs)  0.88444    0.01317   67.17 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2932 on 382 degrees of freedom
## Multiple R-squared:  0.9219, Adjusted R-squared:  0.9217 
## F-statistic:  4512 on 1 and 382 DF,  p-value: < 0.00000000000000022
# Default lm diagnostic plots for the log-log model.
plot(x = log_log_reg)

# Log expenditures against raw RVUs.
plot(
  df$RVUs,
  log(df$Expenditures),
  ylab = "Log of Expenditures",
  xlab = "RVUs"
)

# Squared RVUs, stored as a new column so it can enter the model below.
df[["RVUs2"]] <- df$RVUs^2

# Quadratic model: log(Expenditures) regressed on RVUs and RVUs squared.
univariate_reg_transformedY2 <- lm(
  formula = log(df$Expenditures) ~ df$RVUs + df$RVUs2
)

summary(object = univariate_reg_transformedY2)
## 
## Call:
## lm(formula = log(df$Expenditures) ~ df$RVUs + df$RVUs2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.14013 -0.26450  0.01464  0.25301  0.84276 
## 
## Coefficients:
##                         Estimate           Std. Error t value
## (Intercept) 16.95069300158269243  0.03006920638916719  563.72
## df$RVUs      0.00000275479307647  0.00000007917710136   34.79
## df$RVUs2    -0.00000000000055708  0.00000000000002944  -18.92
##                        Pr(>|t|)    
## (Intercept) <0.0000000000000002 ***
## df$RVUs     <0.0000000000000002 ***
## df$RVUs2    <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3649 on 381 degrees of freedom
## Multiple R-squared:  0.8794, Adjusted R-squared:  0.8788 
## F-statistic:  1390 on 2 and 381 DF,  p-value: < 0.00000000000000022
# Default lm diagnostic plots for the quadratic log-expenditure model.
plot(x = univariate_reg_transformedY2)