install.packages("tidyverse", repos = "https://cloud.r-project.org")
## also installing the dependencies 'systemfonts', 'textshaping', 'conflicted', 'lubridate', 'ragg', 'readxl'
## 
## The downloaded binary packages are in
##  /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//RtmpplI7X7/downloaded_packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.1.8
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
install.packages("psych", repos = "https://cloud.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//RtmpplI7X7/downloaded_packages
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
mydata <- read_csv("week 6 data-1.csv")
## Rows: 384 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): Expenditures, Enrolled, RVUs, FTEs, Quality Score
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(mydata)
## spc_tbl_ [384 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Expenditures : num [1:384] 1.15e+08 1.16e+08 1.20e+08 1.91e+07 2.46e+08 ...
##  $ Enrolled     : num [1:384] 25294 42186 23772 2085 67258 ...
##  $ RVUs         : num [1:384] 402704 638252 447030 43337 1579789 ...
##  $ FTEs         : num [1:384] 955 949 953 200 2162 ...
##  $ Quality Score: num [1:384] 0.67 0.58 0.52 0.93 0.96 0.56 0.84 0.82 0.76 0.81 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Expenditures = col_double(),
##   ..   Enrolled = col_double(),
##   ..   RVUs = col_double(),
##   ..   FTEs = col_double(),
##   ..   `Quality Score` = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
head(mydata)
## # A tibble: 6 × 5
##   Expenditures Enrolled     RVUs  FTEs `Quality Score`
##          <dbl>    <dbl>    <dbl> <dbl>           <dbl>
## 1   114948144.    25294  402704.  955.            0.67
## 2   116423140.    42186  638252.  949.            0.58
## 3   119977702.    23772  447030.  953.            0.52
## 4    19056531.     2085   43337.  200.            0.93
## 5   246166031.    67258 1579789. 2162.            0.96
## 6   152125186.    23752  673037. 1359.            0.56
describe(mydata)
##               vars   n         mean           sd      median     trimmed
## Expenditures     1 384 124765816.51 173435868.06 52796103.84 84600936.46
## Enrolled         2 384     24713.82     22756.42    16466.00    20647.21
## RVUs             3 384    546857.77    680047.21   246701.71   398680.18
## FTEs             4 384      1060.86      1336.29      483.07      750.11
## Quality Score    5 384         0.71         0.11        0.73        0.72
##                       mad        min          max        range  skew kurtosis
## Expenditures  47444814.57 7839562.52 1.301994e+09 1.294154e+09  3.04    11.27
## Enrolled         13092.10    1218.00 1.198500e+05 1.186320e+05  1.94     4.14
## RVUs            237760.93   23218.01 3.574006e+06 3.550788e+06  2.10     4.27
## FTEs               401.38     116.29 7.518630e+03 7.402340e+03  2.55     6.94
## Quality Score        0.10       0.31 9.600000e-01 6.500000e-01 -0.55     0.40
##                       se
## Expenditures  8850612.08
## Enrolled         1161.28
## RVUs            34703.51
## FTEs               68.19
## Quality Score       0.01
#Absence of outliers for  expenditures 

mean_exp <- mean(mydata$Expenditures)

sd_exp <- sd(mydata$Expenditures)

mydata_exp_out <- (mydata$Expenditures - mean_exp)/ sd_exp

sum(abs((mydata_exp_out)>3))
## [1] 13
#Absence of outliers for rvus 

mean_rvus <- mean(mydata$RVUs)

sd_rvus <- sd(mydata$RVUs)

mydata_rvus_out <- (mydata$RVUs - mean_rvus)/ sd_rvus

sum(abs((mydata_rvus_out)>3))
## [1] 12
hist(x = mydata$Expenditures, main = "Expenditure")

hist(x = mydata$RVUs, main = "RVU")

cor(mydata)
##               Expenditures  Enrolled      RVUs      FTEs Quality Score
## Expenditures     1.0000000 0.7707756 0.9217239 0.9796506     0.2749501
## Enrolled         0.7707756 1.0000000 0.9152024 0.8148491     0.2526991
## RVUs             0.9217239 0.9152024 1.0000000 0.9504093     0.3075742
## FTEs             0.9796506 0.8148491 0.9504093 1.0000000     0.2769058
## Quality Score    0.2749501 0.2526991 0.3075742 0.2769058     1.0000000

We can see that we have a strong correlation of 0.92 between expenditures and RVUS.

## Creating a plot 

plot(x = mydata$RVUs, y = mydata$Expenditures, xlab = "RVUS", ylab = "Expenditures")

#Linear regression 

reg <- lm(mydata$Expenditures ~ mydata$RVUs)
summary(reg)
## 
## Call:
## lm(formula = mydata$Expenditures ~ mydata$RVUs)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -185723026  -14097620    2813431   11919781  642218316 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.785e+06  4.413e+06  -0.858    0.392    
## mydata$RVUs  2.351e+02  5.061e+00  46.449   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 67350000 on 382 degrees of freedom
## Multiple R-squared:  0.8496, Adjusted R-squared:  0.8492 
## F-statistic:  2157 on 1 and 382 DF,  p-value: < 2.2e-16
options(scipen = 999 )
summary(reg)
## 
## Call:
## lm(formula = mydata$Expenditures ~ mydata$RVUs)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -185723026  -14097620    2813431   11919781  642218316 
## 
## Coefficients:
##                 Estimate   Std. Error t value            Pr(>|t|)    
## (Intercept) -3785072.158  4412905.480  -0.858               0.392    
## mydata$RVUs      235.072        5.061  46.449 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 67350000 on 382 degrees of freedom
## Multiple R-squared:  0.8496, Adjusted R-squared:  0.8492 
## F-statistic:  2157 on 1 and 382 DF,  p-value: < 0.00000000000000022

Interpret the linear model

1 unit increase in RVUs will increase expenditure by 235.072

When RVU is 0, expenditure is expected to be -3785000 (unrelastic)

T value = 46.449, and p value = 2e-16, suggesting a postive correlation.

The model explained 84.96% varience in Expenditures

plot(reg)

Residuals vs fitted: In our graph we can observate that our residuals shows hetorasticity. You can see the point fanning out at the end.

Normal Q-Q: In our normal qq graph we can see that the residuals does not follow a straight line very well. It has skewness issue.

Scale-Location: In our scale-location graph we can see how the residuals are not spread equally along the range of predictors proving the assumption of homoskedasticity is not validated.

Residual vs leverage: In our graph for residual vs levaregewe can see that we have 3 outilers that influences our linear regression analysis. Meaning that 3 of our cases are outside the Cook’s distance.

Did the Gauss Markow Assumptions hold ?

The Gauss Markow Assumptions did not hold because the regresion is only validating two out of the 4 assumptions. We saw how the residuals does not fiitend well in linear line and that we have some extreme values at the Cook’s distance.

linear model with transformation

reg2 <- lm(log(mydata$Expenditures) ~ mydata$RVUs)
summary(reg2)
## 
## Call:
## lm(formula = log(mydata$Expenditures) ~ mydata$RVUs)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.59439 -0.29504  0.06135  0.35333  1.20871 
## 
## Coefficients:
##                   Estimate     Std. Error t value            Pr(>|t|)    
## (Intercept) 17.29584389074  0.03325414655  520.11 <0.0000000000000002 ***
## mydata$RVUs  0.00000134911  0.00000003814   35.38 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5076 on 382 degrees of freedom
## Multiple R-squared:  0.7661, Adjusted R-squared:  0.7655 
## F-statistic:  1251 on 1 and 382 DF,  p-value: < 0.00000000000000022
plot(x = mydata$RVUs, y = log (mydata$Expenditures), xlab = "RVUS", ylab = "log of Expenditures")

plot(reg2)

1) How did this log transformation affect the Gauss Markov Assumptions (sure, the residual analysis diagnostic charts will change but what is your takeaway - which assumptions are better met or are some assumptions not met now ) ?

T value = 35.38, p value = 2e-16.

The model explained 76.61% varience in Expenditure, which is lower than the linear model.

A 1 unit increace in RVUS causes a 0.000134911 % increase in expenditure. Using log-level.

Linearity assumption is still violted, looks like a curvilinear relationship.

Normal residuals: The residual for the linear regression for log expenditures ~ rdu is slightly better than the normal distribution.

Constant variability = This grahp is also slightly better than the normal Distribution.

2) Are you happy with this functional form capturing the relationship between Y and X or would like to keep some different functional form (EG - ln(Expenditures)~RVUs + RVUs^2 ) ? Why (4 lines maximum) ?

log_log_reg <- lm(formula = log(mydata$Expenditures) ~ log(mydata$RVUs))

summary(log_log_reg)
## 
## Call:
## lm(formula = log(mydata$Expenditures) ~ log(mydata$RVUs))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.74657 -0.19864 -0.02431  0.18642  0.93551 
## 
## Coefficients:
##                  Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)       6.91487    0.16621   41.60 <0.0000000000000002 ***
## log(mydata$RVUs)  0.88444    0.01317   67.17 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2932 on 382 degrees of freedom
## Multiple R-squared:  0.9219, Adjusted R-squared:  0.9217 
## F-statistic:  4512 on 1 and 382 DF,  p-value: < 0.00000000000000022
plot(log_log_reg)