data("College")
head(College) %>% gt()
| Private | Apps | Accept | Enroll | Top10perc | Top25perc | F.Undergrad | P.Undergrad | Outstate | Room.Board | Books | Personal | PhD | Terminal | S.F.Ratio | perc.alumni | Expend | Grad.Rate |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Yes | 1660 | 1232 | 721 | 23 | 52 | 2885 | 537 | 7440 | 3300 | 450 | 2200 | 70 | 78 | 18.1 | 12 | 7041 | 60 |
| Yes | 2186 | 1924 | 512 | 16 | 29 | 2683 | 1227 | 12280 | 6450 | 750 | 1500 | 29 | 30 | 12.2 | 16 | 10527 | 56 |
| Yes | 1428 | 1097 | 336 | 22 | 50 | 1036 | 99 | 11250 | 3750 | 400 | 1165 | 53 | 66 | 12.9 | 30 | 8735 | 54 |
| Yes | 417 | 349 | 137 | 60 | 89 | 510 | 63 | 12960 | 5450 | 450 | 875 | 92 | 97 | 7.7 | 37 | 19016 | 59 |
| Yes | 193 | 146 | 55 | 16 | 44 | 249 | 869 | 7560 | 4120 | 800 | 1500 | 76 | 72 | 11.9 | 2 | 10922 | 15 |
| Yes | 587 | 479 | 158 | 38 | 62 | 678 | 41 | 13500 | 3335 | 500 | 675 | 67 | 73 | 9.4 | 11 | 9727 | 55 |
Questions
1-Do colleges with larger full_time enrollments have lower graduation rates?
2-Is this different for public/private institutions?
3-Is it different for more selective school?
ggplot(College, aes(x= Grad.Rate))+
geom_histogram()
Note: weird outlier
suspicious <- filter(College, Grad.Rate>=100)
View(suspicious)
ggplot(College, aes(x=F.Undergrad,
y=Grad.Rate))+
geom_point()
Not clear, not linear, try to make transformation
ggplot(College, aes(x=log10(F.Undergrad),
y=Grad.Rate))+
geom_point()
So insert this transformation in data
college_sm <- College %>%
mutate(log_full=log10(F.Undergrad)) %>%
select(Grad.Rate,
log_full,
Private,
Top25perc)
After log transform, we have a good candidate for linear regression modelling
let’s start by visualization
ggplot(College, aes(x=log10(F.Undergrad),
y=Grad.Rate))+
geom_point()+
geom_smooth(method="lm")
looks pretty flat indicating F.undergrad maybe isn’t good predictor,
check by numbers!
model_undergrad <- lm(Grad.Rate ~ log_full,
data=college_sm)
summary(model_undergrad)
##
## Call:
## lm(formula = Grad.Rate ~ log_full, data = college_sm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -55.501 -12.433 -0.434 12.539 52.564
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 65.17524 4.62722 14.085 <2e-16 ***
## log_full 0.08688 1.38301 0.063 0.95
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.19 on 775 degrees of freedom
## Multiple R-squared: 5.092e-06, Adjusted R-squared: -0.001285
## F-statistic: 0.003946 on 1 and 775 DF, p-value: 0.9499
first question answer–
high p value for log full, then not sig for the model
ggplot(College, aes(x=log10(F.Undergrad),
y=Grad.Rate,
color=Private))+
geom_point()+
geom_smooth(method="lm",
se=FALSE)
model_private <- lm(Grad.Rate ~ Private + log_full,
data= college_sm)
summary(model_private)
##
## Call:
## lm(formula = Grad.Rate ~ Private + log_full, data = college_sm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -55.826 -9.581 -0.128 10.752 51.007
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.784 6.330 -0.282 0.778
## PrivateYes 23.007 1.646 13.978 <2e-16 ***
## log_full 15.235 1.644 9.266 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.37 on 774 degrees of freedom
## Multiple R-squared: 0.2016, Adjusted R-squared: 0.1995
## F-statistic: 97.7 on 2 and 774 DF, p-value: < 2.2e-16
Showing significance of the 2 variables
model_private_int <- lm(Grad.Rate ~ Private*log_full,
data= college_sm)
summary(model_private_int)
##
## Call:
## lm(formula = Grad.Rate ~ Private * log_full, data = college_sm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60.179 -9.488 -0.285 10.789 51.447
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.204 10.755 2.158 0.03127 *
## PrivateYes -12.468 12.482 -0.999 0.31815
## log_full 8.652 2.820 3.068 0.00223 **
## PrivateYes:log_full 9.928 3.463 2.867 0.00426 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.3 on 773 degrees of freedom
## Multiple R-squared: 0.21, Adjusted R-squared: 0.2069
## F-statistic: 68.48 on 3 and 773 DF, p-value: < 2.2e-16
Adj R squared somewhat larger but not too much,so prefer not to complicate the model
anova(model_private_int)
## Analysis of Variance Table
##
## Response: Grad.Rate
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 25876 25875.6 110.5689 < 2.2e-16 ***
## log_full 1 20279 20278.9 86.6536 < 2.2e-16 ***
## Private:log_full 1 1924 1923.5 8.2193 0.004257 **
## Residuals 773 180899 234.0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
model_top <- lm(Grad.Rate ~ Private+ Top25perc+log_full,
data= college_sm)
summary(model_top)
##
## Call:
## lm(formula = Grad.Rate ~ Private + Top25perc + log_full, data = college_sm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -56.702 -9.314 0.144 9.349 57.401
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.55059 5.91262 1.954 0.0511 .
## PrivateYes 16.11989 1.61439 9.985 < 2e-16 ***
## Top25perc 0.34060 0.02819 12.084 < 2e-16 ***
## log_full 6.99253 1.65593 4.223 2.7e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.1 on 773 degrees of freedom
## Multiple R-squared: 0.3284, Adjusted R-squared: 0.3258
## F-statistic: 126 on 3 and 773 DF, p-value: < 2.2e-16
all very significant, adj R squared is higherr, so it’s good to add Top25perc to the model
plot(model_top)
1- residuals vs fitted,line is not parabolic
2- Q-Q , normal
3- scale location, homoscadicity, not horn shape
4- residuals vs leverage, no extreme outliers
library(broom)
tidy(model_top)
## # A tibble: 4 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 11.6 5.91 1.95 5.11e- 2
## 2 PrivateYes 16.1 1.61 9.99 3.64e-22
## 3 Top25perc 0.341 0.0282 12.1 6.43e-31
## 4 log_full 6.99 1.66 4.22 2.70e- 5
model_aug <- augment(model_top)
head(model_aug) %>% gt()
| .rownames | Grad.Rate | Private | Top25perc | log_full | .fitted | .resid | .hat | .sigma | .cooksd | .std.resid |
|---|---|---|---|---|---|---|---|---|---|---|
| Abilene Christian University | 60 | Yes | 52 | 3.460146 | 69.57690 | -9.576903 | 0.003627838 | 14.10915 | 0.0004212070 | -0.6802439 |
| Adelphi University | 56 | Yes | 29 | 3.428621 | 61.52264 | -5.522639 | 0.007653934 | 14.11196 | 0.0002979151 | -0.3930660 |
| Adrian College | 54 | Yes | 50 | 3.015360 | 65.78552 | -11.785521 | 0.002001436 | 14.10698 | 0.0003507681 | -0.8364388 |
| Agnes Scott College | 59 | Yes | 89 | 2.707570 | 76.91673 | -17.916732 | 0.011051909 | 14.09847 | 0.0045587798 | -1.2773867 |
| Alaska Pacific University | 15 | Yes | 44 | 2.396199 | 59.41242 | -44.412418 | 0.008130290 | 14.02181 | 0.0204854887 | -3.1617494 |
| Albertson College | 55 | Yes | 62 | 2.831230 | 68.58520 | -13.585198 | 0.003448018 | 14.10487 | 0.0008052713 | -0.9648646 |
model_glance <- glance(model_top)
head(model_glance) %>% gt()
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.3284373 | 0.325831 | 14.10424 | 126.0156 | 1.882661e-66 | 3 | -3156.821 | 6323.643 | 6346.92 | 153772.5 | 773 | 777 |