install.packages("tidyverse", repos = "https://cloud.r-project.org")
## also installing the dependencies 'systemfonts', 'textshaping', 'conflicted', 'lubridate', 'ragg', 'readxl'
##
## The downloaded binary packages are in
## /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//RtmpplI7X7/downloaded_packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.1.8
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
install.packages("psych", repos = "https://cloud.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//RtmpplI7X7/downloaded_packages
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
mydata <- read_csv("week 6 data-1.csv")
## Rows: 384 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): Expenditures, Enrolled, RVUs, FTEs, Quality Score
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(mydata)
## spc_tbl_ [384 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Expenditures : num [1:384] 1.15e+08 1.16e+08 1.20e+08 1.91e+07 2.46e+08 ...
## $ Enrolled : num [1:384] 25294 42186 23772 2085 67258 ...
## $ RVUs : num [1:384] 402704 638252 447030 43337 1579789 ...
## $ FTEs : num [1:384] 955 949 953 200 2162 ...
## $ Quality Score: num [1:384] 0.67 0.58 0.52 0.93 0.96 0.56 0.84 0.82 0.76 0.81 ...
## - attr(*, "spec")=
## .. cols(
## .. Expenditures = col_double(),
## .. Enrolled = col_double(),
## .. RVUs = col_double(),
## .. FTEs = col_double(),
## .. `Quality Score` = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
head(mydata)
## # A tibble: 6 × 5
## Expenditures Enrolled RVUs FTEs `Quality Score`
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 114948144. 25294 402704. 955. 0.67
## 2 116423140. 42186 638252. 949. 0.58
## 3 119977702. 23772 447030. 953. 0.52
## 4 19056531. 2085 43337. 200. 0.93
## 5 246166031. 67258 1579789. 2162. 0.96
## 6 152125186. 23752 673037. 1359. 0.56
describe(mydata)
## vars n mean sd median trimmed
## Expenditures 1 384 124765816.51 173435868.06 52796103.84 84600936.46
## Enrolled 2 384 24713.82 22756.42 16466.00 20647.21
## RVUs 3 384 546857.77 680047.21 246701.71 398680.18
## FTEs 4 384 1060.86 1336.29 483.07 750.11
## Quality Score 5 384 0.71 0.11 0.73 0.72
## mad min max range skew kurtosis
## Expenditures 47444814.57 7839562.52 1.301994e+09 1.294154e+09 3.04 11.27
## Enrolled 13092.10 1218.00 1.198500e+05 1.186320e+05 1.94 4.14
## RVUs 237760.93 23218.01 3.574006e+06 3.550788e+06 2.10 4.27
## FTEs 401.38 116.29 7.518630e+03 7.402340e+03 2.55 6.94
## Quality Score 0.10 0.31 9.600000e-01 6.500000e-01 -0.55 0.40
## se
## Expenditures 8850612.08
## Enrolled 1161.28
## RVUs 34703.51
## FTEs 68.19
## Quality Score 0.01
#Absence of outliers for expenditures
mean_exp <- mean(mydata$Expenditures)
sd_exp <- sd(mydata$Expenditures)
mydata_exp_out <- (mydata$Expenditures - mean_exp)/ sd_exp
sum(abs((mydata_exp_out)>3))
## [1] 13
#Absence of outliers for rvus
mean_rvus <- mean(mydata$RVUs)
sd_rvus <- sd(mydata$RVUs)
mydata_rvus_out <- (mydata$RVUs - mean_rvus)/ sd_rvus
sum(abs((mydata_rvus_out)>3))
## [1] 12
hist(x = mydata$Expenditures, main = "Expenditure")

hist(x = mydata$RVUs, main = "RVU")

cor(mydata)
## Expenditures Enrolled RVUs FTEs Quality Score
## Expenditures 1.0000000 0.7707756 0.9217239 0.9796506 0.2749501
## Enrolled 0.7707756 1.0000000 0.9152024 0.8148491 0.2526991
## RVUs 0.9217239 0.9152024 1.0000000 0.9504093 0.3075742
## FTEs 0.9796506 0.8148491 0.9504093 1.0000000 0.2769058
## Quality Score 0.2749501 0.2526991 0.3075742 0.2769058 1.0000000
We can see that we have a strong correlation of 0.92 between
expenditures and RVUS.
## Creating a plot
plot(x = mydata$RVUs, y = mydata$Expenditures, xlab = "RVUS", ylab = "Expenditures")

#Linear regression
reg <- lm(mydata$Expenditures ~ mydata$RVUs)
summary(reg)
##
## Call:
## lm(formula = mydata$Expenditures ~ mydata$RVUs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -185723026 -14097620 2813431 11919781 642218316
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.785e+06 4.413e+06 -0.858 0.392
## mydata$RVUs 2.351e+02 5.061e+00 46.449 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 67350000 on 382 degrees of freedom
## Multiple R-squared: 0.8496, Adjusted R-squared: 0.8492
## F-statistic: 2157 on 1 and 382 DF, p-value: < 2.2e-16
options(scipen = 999 )
summary(reg)
##
## Call:
## lm(formula = mydata$Expenditures ~ mydata$RVUs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -185723026 -14097620 2813431 11919781 642218316
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3785072.158 4412905.480 -0.858 0.392
## mydata$RVUs 235.072 5.061 46.449 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 67350000 on 382 degrees of freedom
## Multiple R-squared: 0.8496, Adjusted R-squared: 0.8492
## F-statistic: 2157 on 1 and 382 DF, p-value: < 0.00000000000000022
Interpret the linear model
1 unit increase in RVUs will increase expenditure by 235.072
When RVU is 0, expenditure is expected to be -3785000
(unrelastic)
T value = 46.449, and p value = 2e-16, suggesting a postive
correlation.
Residuals vs fitted: In our graph we can observate that our
residuals shows hetorasticity. You can see the point fanning out at the
end.
Normal Q-Q: In our normal qq graph we can see that the residuals
does not follow a straight line very well. It has skewness issue.
Scale-Location: In our scale-location graph we can see how the
residuals are not spread equally along the range of predictors proving
the assumption of homoskedasticity is not validated.
Residual vs leverage: In our graph for residual vs levaregewe can
see that we have 3 outilers that influences our linear regression
analysis. Meaning that 3 of our cases are outside the Cook’s
distance.
Did the Gauss Markow Assumptions hold ?
The Gauss Markow Assumptions did not hold because the regresion is
only validating two out of the 4 assumptions. We saw how the residuals
does not fiitend well in linear line and that we have some extreme
values at the Cook’s distance.