rm(list = ls()) # Clear all files from your environment
gc() # Clear unused memory
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 526633 28.2 1169955 62.5 NA 669420 35.8
## Vcells 971421 7.5 8388608 64.0 16384 1851933 14.2
cat("\f") # Clear the console
graphics.off() # Clear all graphs
library(readr)
setwd("~/Desktop/Data Analysis/Discussion 13 - Simple Linear Regressions/HW 6")
dt <- read_csv("week 6 data.csv")
## Rows: 384 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): Expenditures, Enrolled, RVUs, FTEs, Quality Score
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
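As the message above suggests, the import can be rerun with the column-type output suppressed; a minimal variant (same file, purely optional):
# Same import with the column-type message suppressed (optional)
dt <- read_csv("week 6 data.csv", show_col_types = FALSE)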
correlation <- cor(dt$Expenditures,
dt$RVUs,
use = "complete.obs")
print(correlation)
## [1] 0.9217239
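For a simple linear regression, R-squared equals the squared Pearson correlation, so this value can be cross-checked against the model summary that follows (a quick sanity check, not part of the original output):
# Squared correlation; should match the Multiple R-squared below (~0.8496)
correlation^2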
lm_model <- lm(Expenditures ~ RVUs,
data = dt)
# Summary
summary(lm_model)
##
## Call:
## lm(formula = Expenditures ~ RVUs, data = dt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -185723026 -14097620 2813431 11919781 642218316
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.785e+06 4.413e+06 -0.858 0.392
## RVUs 2.351e+02 5.061e+00 46.449 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 67350000 on 382 degrees of freedom
## Multiple R-squared: 0.8496, Adjusted R-squared: 0.8492
## F-statistic: 2157 on 1 and 382 DF, p-value: < 2.2e-16
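The RVUs coefficient of 2.351e+02 means each additional RVU is associated with roughly 235 additional units of expenditure. A hedged sketch for pulling the slope and its 95% confidence interval directly, should they be needed:
# Extract the RVUs slope and its 95% confidence interval
coef(lm_model)["RVUs"]
confint(lm_model, "RVUs", level = 0.95)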
plot(lm_model)
Residuals vs Fitted: the residuals follow a roughly linear pattern but show a funnel shape that widens to the right (heteroscedasticity). As RVUs increase, the variability of the errors also increases, so the linear fit is tighter at lower RVU values.
Q-Q Residuals: the large majority of residuals fall along the diagonal line and appear normally distributed, with a few points deviating from normality in the tails.
Scale-Location: similar to Residuals vs Fitted, we observe a funnel-shaped heteroscedastic pattern, with points concentrated in the lower left.
Residuals vs Leverage: this plot helps identify influential outliers. Two points in the top right fall outside the Cook's distance lines and may need to be removed, as sketched below.
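A minimal sketch for flagging those influential points programmatically; the 4/n cutoff for Cook's distance is a common convention assumed here, not part of the assignment:
# Cook's distance for each observation in the original model
cd <- cooks.distance(lm_model)
# Flag observations above the conventional 4/n threshold (assumed cutoff)
which(cd > 4 / nrow(dt))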
library(ggplot2)
ggplot(dt, aes(x = RVUs,
y = Expenditures)) +
geom_point(size = 2,
shape = 18,
col = "red") +
stat_smooth(method = lm,
linetype = "dashed") +
xlab("RVUs") +
ylab("Expenditures")
## `geom_smooth()` using formula = 'y ~ x'
# Fit linear model of ln(Expenditures) ~ RVUs
lm_log_model <- lm(log(Expenditures) ~ RVUs,
data = dt)
# Summary
summary(lm_log_model)
##
## Call:
## lm(formula = log(Expenditures) ~ RVUs, data = dt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.59439 -0.29504 0.06135 0.35333 1.20871
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.730e+01 3.325e-02 520.11 <2e-16 ***
## RVUs 1.349e-06 3.814e-08 35.38 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5076 on 382 degrees of freedom
## Multiple R-squared: 0.7661, Adjusted R-squared: 0.7655
## F-statistic: 1251 on 1 and 382 DF, p-value: < 2.2e-16
The residual standard error has decreased (note that it is now on the log scale), and the spread of the residuals at higher RVU values is noticeably smaller.
plot(lm_log_model)
Once we take the log of Expenditures, the overall shape of the fitted relationship changes from linear to curved, so linearity no longer holds. The transformation has helped normalize the distribution of the residuals: in the Q-Q plot, the residuals align along the diagonal line with fewer outliers.
While homoscedasticity has improved, it still does not hold: most points remain clustered on the left side of the x-axis in the Scale-Location plot. Lastly, there are no influential outliers; although some points still appear as outliers, they no longer fall outside the Cook's distance lines.
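A Breusch-Pagan test is not part of the original workflow, but it could put a number on the remaining heteroscedasticity; a sketch assuming the lmtest package is installed:
# Breusch-Pagan test on the log model: a small p-value indicates heteroscedasticity remains
library(lmtest)
bptest(lm_log_model)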
ggplot(dt, aes(x = RVUs,
y = log(Expenditures))) +
geom_point(size = 2,
shape = 18,
col = "red") +
stat_smooth(method = lm,
linetype = "dashed") +
xlab("RVUs") +
ylab("Log(Expenditures)")
## `geom_smooth()` using formula = 'y ~ x'
# Residual plot analysis with log transformation
# Fit linear model of ln(Expenditures) ~ ln(RVUs)
lm_log_model2 <- lm(log(Expenditures) ~ log(RVUs),
data = dt)
par(mfrow=c(2, 2))
plot(lm_log_model2)
# Summary
summary(lm_log_model2)
##
## Call:
## lm(formula = log(Expenditures) ~ log(RVUs), data = dt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.74657 -0.19864 -0.02431 0.18642 0.93551
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.91487 0.16621 41.60 <2e-16 ***
## log(RVUs) 0.88444 0.01317 67.17 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2932 on 382 degrees of freedom
## Multiple R-squared: 0.9219, Adjusted R-squared: 0.9217
## F-statistic: 4512 on 1 and 382 DF, p-value: < 2.2e-16
We can see that the ln(Expenditures) ~ ln(RVUs) specification normalized the data further and produced a more linear shape, as observed in the top two diagnostic plots. It also further improved homoscedasticity, as seen in the Scale-Location plot. Compared with the two previous models, this one gives the best results, with the lowest residual standard error (0.2932, on the log scale) and the highest R-squared (0.9219).
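Because both variables are logged, the slope can be read as an elasticity: a 1% increase in RVUs is associated with roughly a 0.88% increase in expenditures. A small sketch of that calculation:
# Elasticity interpretation of the log-log slope
b <- coef(lm_log_model2)["log(RVUs)"]
# Exact predicted percent change in Expenditures for a 1% increase in RVUs
(1.01^b - 1) * 100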
ggplot(dt, aes(x = log(RVUs),
y = log(Expenditures))) +
geom_point(size = 2,
shape = 18,
col = "red") +
stat_smooth(method = lm,
linetype = "dashed") +
xlab("Log(RVUs)") +
ylab("Log(Expenditures)")
## `geom_smooth()` using formula = 'y ~ x'
Based on the relationship observed in the data between RVUs and expenditures, increasing RVUs would increase expenditures at a relatively high rate of roughly 235 per RVU (the slope of the original model, 2.351e+02). Under the third, log-log specification, the slope of 0.88 implies that expenditures grow slightly less than proportionally with RVUs, which is a far more attractive cost relationship. While this model is informative, I would also need to understand the revenue relationship to get a more holistic view of profitability; without it, it is very hard to recommend whether or not to expand the hospital.
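To make this concrete, the sketch below contrasts what the two models imply for a hypothetical 10% increase in RVUs from the sample mean (the 10% figure is assumed purely for illustration):
# Hypothetical 10% increase in RVUs from the sample mean (illustrative only)
delta_rvus <- 0.10 * mean(dt$RVUs, na.rm = TRUE)
# Original model: implied change in expenditures (units of Expenditures)
coef(lm_model)["RVUs"] * delta_rvus
# Log-log model: implied percent change in expenditures
(1.10^coef(lm_log_model2)["log(RVUs)"] - 1) * 100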