# Start from an empty workspace: remove every object that ls() reports in the
# current environment, then run the garbage collector (gc() also prints the
# memory-usage table).
# NOTE(review): rm(list = ls()) silently discards whatever the caller had in
# their session, and it does not detach packages or reset options. Consider
# running the script in a fresh R session instead.
rm(list = ls())
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 515521 27.6 1138248 60.8 NA 669337 35.8
## Vcells 946180 7.3 8388608 64.0 16384 1839792 14.1
# Read the week 6 CSV into a data frame and preview its first six rows.
# NOTE(review): the path is absolute and user-specific, so the script only
# runs unchanged on the original author's machine.
df <- read.csv(file = "/Users/bellajean/Desktop/week 6 data-1.csv")
head(x = df, n = 6L)
## Expenditures Enrolled RVUs FTEs Quality.Score
## 1 114948144 25294 402703.73 954.91 0.67
## 2 116423140 42186 638251.99 949.25 0.58
## 3 119977702 23772 447029.54 952.51 0.52
## 4 19056531 2085 43337.26 199.98 0.93
## 5 246166031 67258 1579789.36 2162.15 0.96
## 6 152125186 23752 673036.55 1359.07 0.56
# Load psych for describe(). library() stops with an error when the package is
# missing, whereas require() only returns FALSE and lets the script carry on.
library(psych)

# Summary statistics for df, indexed below as [variable name, statistic].
hospitaldata <- describe(df)

# Standardise Expenditures into z-scores using the mean and sd from describe().
df$outlier_Expenditures <-
  (df$Expenditures - hospitaldata["Expenditures", "mean"]) /
  hospitaldata["Expenditures", "sd"]

# Count observations lying more than 3 standard deviations from the mean in
# either direction. The z-scores are stored in `df` (not in `hospitaldata`),
# and abs() must be applied to the z-scores before the comparison with 3.
sum(abs(df$outlier_Expenditures) > 3)
# The count recorded before this fix was 0: the column was looked up in
# `hospitaldata`, which has no such column, so the sum ran over an empty
# vector. Re-run to obtain the actual count.

# Same standardisation and two-sided outlier count for RVUs.
df$outlier_RVUs <-
  (df$RVUs - hospitaldata["RVUs", "mean"]) /
  hospitaldata["RVUs", "sd"]
sum(abs(df$outlier_RVUs) > 3)
# The count recorded before this fix was 12, which included only z-scores
# above +3 (abs() had been applied to the logical result of the comparison).
# Re-run to refresh; the two-sided count is at least 12.
# Histograms of the raw expenditure and workload variables.
hist(df$Expenditures, main = "Hospital Expenditure", xlab = "")
hist(df$RVUs, main = "Standard Outpatient Workload (RVU)", xlab = "")

# Pairwise correlations across every column of df.
# NOTE(review): by this point df also holds the derived outlier_* z-score
# columns, so they appear in the matrix alongside the original variables.
cor(x = df)
## Expenditures Enrolled RVUs FTEs Quality.Score
## Expenditures 1.0000000 0.7707756 0.9217239 0.9796506 0.2749501
## Enrolled 0.7707756 1.0000000 0.9152024 0.8148491 0.2526991
## RVUs 0.9217239 0.9152024 1.0000000 0.9504093 0.3075742
## FTEs 0.9796506 0.8148491 0.9504093 1.0000000 0.2769058
## Quality.Score 0.2749501 0.2526991 0.3075742 0.2769058 1.0000000
## outlier_Expenditures 1.0000000 0.7707756 0.9217239 0.9796506 0.2749501
## outlier_RVUs 0.9217239 0.9152024 1.0000000 0.9504093 0.3075742
## outlier_Expenditures outlier_RVUs
## Expenditures 1.0000000 0.9217239
## Enrolled 0.7707756 0.9152024
## RVUs 0.9217239 1.0000000
## FTEs 0.9796506 0.9504093
## Quality.Score 0.2749501 0.3075742
## outlier_Expenditures 1.0000000 0.9217239
## outlier_RVUs 0.9217239 1.0000000
# Plot the whole data frame to eyeball pairwise relationships.
plot(x = df)
Strong linear relationship between:
- Expenditures and RVUs (r ≈ 0.92)
- Expenditures and FTEs (r ≈ 0.98)
- Expenditures and Enrolled (r ≈ 0.77)

Low positive correlation between:
- Expenditures and Quality.Score (r ≈ 0.27)
# Scatter plot of expenditures against workload, followed by the level-level
# simple linear regression of Expenditures on RVUs.
plot(df$RVUs, df$Expenditures, xlab = "RVUs", ylab = "Expenditures")
univariate_reg <- lm(formula = df$Expenditures ~ df$RVUs)
summary(object = univariate_reg)
##
## Call:
## lm(formula = df$Expenditures ~ df$RVUs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -185723026 -14097620 2813431 11919781 642218316
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.785e+06 4.413e+06 -0.858 0.392
## df$RVUs 2.351e+02 5.061e+00 46.449 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 67350000 on 382 degrees of freedom
## Multiple R-squared: 0.8496, Adjusted R-squared: 0.8492
## F-statistic: 2157 on 1 and 382 DF, p-value: < 2.2e-16
# Penalise scientific notation heavily so numbers print in fixed notation,
# then print the same model summary again in that format.
# NOTE(review): the option stays changed for the rest of the session.
options(scipen = 999)
summary(object = univariate_reg)
##
## Call:
## lm(formula = df$Expenditures ~ df$RVUs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -185723026 -14097620 2813431 11919781 642218316
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3785072.158 4412905.480 -0.858 0.392
## df$RVUs 235.072 5.061 46.449 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 67350000 on 382 degrees of freedom
## Multiple R-squared: 0.8496, Adjusted R-squared: 0.8492
## F-statistic: 2157 on 1 and 382 DF, p-value: < 0.00000000000000022
# Overlay the fitted regression line on the current plot.
abline(univariate_reg, col = "pink")

# Standard lm diagnostic plots for the level-level model.
plot(univariate_reg)

# Normal Q-Q plots of Expenditures, raw and on the log scale.
qqnorm(df$Expenditures)
qqnorm(log(df$Expenditures))
The residuals cannot be assumed to follow a normal distribution: the points
are not aligned with the normal Q-Q plot line. The variance of the residuals
is also not constant across the fitted values.
# Distributions of both variables after a natural-log transform.
hist(x = log(df$Expenditures), xlab = "", main = "Log Hospital Expenditure")
hist(
  x = log(df$RVUs),
  xlab = "",
  main = "Log Standard Outpatient Workload (RVU)"
)

# Log-level model: log(Expenditures) regressed on untransformed RVUs.
plot(
  x = df$RVUs,
  y = log(df$Expenditures),
  xlab = "RVUs",
  ylab = "Log of Expenditures"
)
univariate_reg_transformedY <- lm(formula = log(df$Expenditures) ~ df$RVUs)

# Overlay the fitted line on the scatter plot drawn just above. The stray
# interactive help lookup (`?abline`) that preceded this call was removed: it
# is not part of the analysis and opens or prints a help page on every run.
abline(reg = univariate_reg_transformedY, col = "pink")

# Re-print the level-level fit so it can be compared with the log-level fit.
summary(univariate_reg)
##
## Call:
## lm(formula = df$Expenditures ~ df$RVUs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -185723026 -14097620 2813431 11919781 642218316
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3785072.158 4412905.480 -0.858 0.392
## df$RVUs 235.072 5.061 46.449 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 67350000 on 382 degrees of freedom
## Multiple R-squared: 0.8496, Adjusted R-squared: 0.8492
## F-statistic: 2157 on 1 and 382 DF, p-value: < 0.00000000000000022
# Coefficients and fit statistics of the log-level model.
summary(object = univariate_reg_transformedY)
##
## Call:
## lm(formula = log(df$Expenditures) ~ df$RVUs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.59439 -0.29504 0.06135 0.35333 1.20871
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.29584389074 0.03325414655 520.11 <0.0000000000000002 ***
## df$RVUs 0.00000134911 0.00000003814 35.38 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5076 on 382 degrees of freedom
## Multiple R-squared: 0.7661, Adjusted R-squared: 0.7655
## F-statistic: 1251 on 1 and 382 DF, p-value: < 0.00000000000000022
# Standard lm diagnostic plots for the log-level model.
plot(x = univariate_reg_transformedY)

# Plot number 3 of the lm diagnostics for the level-level and log-level fits,
# drawn one after the other for comparison.
plot(x = univariate_reg, which = 3)
plot(x = univariate_reg_transformedY, which = 3)

# Histograms of the log-transformed variables.
hist(log(df$Expenditures), main = "Log Hospital Expenditure", xlab = "")
hist(log(df$RVUs), main = "Log Standard Outpatient Workload (RVU)", xlab = "")

# Log-log model: log(Expenditures) regressed on log(RVUs).
log_log_reg <- lm(formula = log(df$Expenditures) ~ log(df$RVUs))
summary(object = log_log_reg)
##
## Call:
## lm(formula = log(df$Expenditures) ~ log(df$RVUs))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.74657 -0.19864 -0.02431 0.18642 0.93551
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.91487 0.16621 41.60 <0.0000000000000002 ***
## log(df$RVUs) 0.88444 0.01317 67.17 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2932 on 382 degrees of freedom
## Multiple R-squared: 0.9219, Adjusted R-squared: 0.9217
## F-statistic: 4512 on 1 and 382 DF, p-value: < 0.00000000000000022
# Standard lm diagnostic plots for the log-log model.
plot(x = log_log_reg)

# Scatter of log(Expenditures) against untransformed RVUs.
plot(df$RVUs, log(df$Expenditures), xlab = "RVUs", ylab = "Log of Expenditures")

# Add a squared RVUs column and fit log(Expenditures) on RVUs and RVUs squared.
df[["RVUs2"]] <- df[["RVUs"]]^2
univariate_reg_transformedY2 <- lm(
  formula = log(df$Expenditures) ~ df$RVUs + df$RVUs2
)
summary(object = univariate_reg_transformedY2)
##
## Call:
## lm(formula = log(df$Expenditures) ~ df$RVUs + df$RVUs2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.14013 -0.26450 0.01464 0.25301 0.84276
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 16.95069300158269243 0.03006920638916719 563.72
## df$RVUs 0.00000275479307647 0.00000007917710136 34.79
## df$RVUs2 -0.00000000000055708 0.00000000000002944 -18.92
## Pr(>|t|)
## (Intercept) <0.0000000000000002 ***
## df$RVUs <0.0000000000000002 ***
## df$RVUs2 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3649 on 381 degrees of freedom
## Multiple R-squared: 0.8794, Adjusted R-squared: 0.8788
## F-statistic: 1390 on 2 and 381 DF, p-value: < 0.00000000000000022
# Standard lm diagnostic plots for the quadratic log-level model.
plot(x = univariate_reg_transformedY2)