Topics for today!
- Topic: Multiple Regression with catagorical variables
- Topic: Multiple Regression with quadratic terms
- Plotting polynomial regression
- Topic: Multiple Regression with mixed interaction terms
Cleaning and Examining the Data
setwd("~/Desktop/R Materials/mih140/Assignments/'20 Assignments/Data")
cba = read.table("cba_admissions_1999.txt", sep = "\t", header = T, quote = "", allowEscapes = T)
cba = cba[!is.na(cba$SAT_math), ]
cba$rank = cba$HS_rank/cba$HS_class_size
cba = cba[!is.na(cba$rank),]
library(gpairs)
gpairs(cba[,c("SAT_math", "SAT_verbal", "rank")])

Topic 1: Multiple Regression with catagorical variables
auto_model = lm(data = cba, rank ~ SAT_verbal + Race)
unique(cba$Race)
## [1] "white" "Asian" "not_indicated" "black"
## [5] "Hispanic" "American_Indian"
library(dummies)
## dummies-1.5.6 provided by Decision Patterns
race_tab = dummy(cba$Race)
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
race_tab = data.frame(race_tab)
cba_2 = data.frame(cba, race_tab)
names(cba_2)[31:36] = c("RaceAmerican_Indian", "RaceAsian", "Raceblack", "RaceHispanic", "Racenot_indicated", "Racewhite")
explicit_model = lm(data = cba_2, rank ~ SAT_verbal + RaceAmerican_Indian + RaceAsian + Raceblack + RaceHispanic + Racewhite)
summary(explicit_model)
##
## Call:
## lm(formula = rank ~ SAT_verbal + RaceAmerican_Indian + RaceAsian +
## Raceblack + RaceHispanic + Racewhite, data = cba_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.27743 -0.11969 -0.02393 0.10203 0.52202
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.330e-01 6.583e-02 8.096 2.61e-15 ***
## SAT_verbal -4.388e-04 8.429e-05 -5.206 2.56e-07 ***
## RaceAmerican_Indian -5.228e-02 1.141e-01 -0.458 0.6470
## RaceAsian -1.045e-01 4.957e-02 -2.107 0.0354 *
## Raceblack -2.855e-02 4.887e-02 -0.584 0.5593
## RaceHispanic -8.146e-02 6.386e-02 -1.276 0.2025
## Racewhite -5.126e-02 4.356e-02 -1.177 0.2397
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1491 on 684 degrees of freedom
## Multiple R-squared: 0.05003, Adjusted R-squared: 0.04169
## F-statistic: 6.003 on 6 and 684 DF, p-value: 3.931e-06
small_model = lm(data = cba_2, rank ~ SAT_verbal + RaceAsian)
summary(small_model)
##
## Call:
## lm(formula = rank ~ SAT_verbal + RaceAsian, data = cba_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.27967 -0.12152 -0.02531 0.10384 0.52024
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.852e-01 4.728e-02 10.261 < 2e-16 ***
## SAT_verbal -4.415e-04 8.368e-05 -5.276 1.77e-07 ***
## RaceAsian -5.516e-02 2.519e-02 -2.190 0.0289 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.149 on 688 degrees of freedom
## Multiple R-squared: 0.04626, Adjusted R-squared: 0.04349
## F-statistic: 16.69 on 2 and 688 DF, p-value: 8.383e-08
Topic 2: Multiple Regression with quadratic terms
plot(data = cba_2[cba_2$RaceAsian == 1,], rank ~ Max_Test_Score, xlim = c(800, 1600), ylim = c(0, .6))

lin_model = lm(data = cba_2[cba_2$RaceAsian == 1,], rank ~ Max_Test_Score)
quad_model = lm(data = cba_2[cba_2$RaceAsian == 1,], rank ~ Max_Test_Score + I(Max_Test_Score^2))
summary(lin_model)
##
## Call:
## lm(formula = rank ~ Max_Test_Score, data = cba_2[cba_2$RaceAsian ==
## 1, ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.20884 -0.09794 -0.04038 0.06625 0.35250
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3935097 0.2118803 1.857 0.0717 .
## Max_Test_Score -0.0001784 0.0001745 -1.022 0.3137
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1493 on 35 degrees of freedom
## Multiple R-squared: 0.02899, Adjusted R-squared: 0.00125
## F-statistic: 1.045 on 1 and 35 DF, p-value: 0.3137
summary(quad_model)
##
## Call:
## lm(formula = rank ~ Max_Test_Score + I(Max_Test_Score^2), data = cba_2[cba_2$RaceAsian ==
## 1, ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.19505 -0.08836 -0.04849 0.06227 0.32456
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.812e+00 1.106e+00 -1.639 0.1105
## Max_Test_Score 3.464e-03 1.803e-03 1.921 0.0631 .
## I(Max_Test_Score^2) -1.484e-06 7.312e-07 -2.029 0.0504 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.143 on 34 degrees of freedom
## Multiple R-squared: 0.1339, Adjusted R-squared: 0.08291
## F-statistic: 2.627 on 2 and 34 DF, p-value: 0.0869
Plotting polynomial regression
new_dat = data.frame(Max_Test_Score = seq(800,1600,10))
lin_fit = predict(lin_model, new_dat)
quad_fit = predict(quad_model, new_dat)
plot(data = cba_2[cba_2$RaceAsian == 1,], rank ~ Max_Test_Score, xlim = c(800, 1600), ylim = c(0, .6))
par(new = T)
plot(seq(800,1600,10), lin_fit, xlim = c(800, 1600), ylim = c(0, .6), type = "l", xlab = "", ylab = "", col = "red")
par(new = T)
plot(seq(800,1600,10), quad_fit, xlim = c(800, 1600), ylim = c(0, .6), type = "l", xlab = "", ylab = "", col = "blue")

Topic 3: Multiple Regression with mixed interaction terms
no_interaction_lm = lm(data = cba, SAT_math ~ SAT_verbal + Female)
interaction_lm = lm(data = cba, SAT_math ~ SAT_verbal + Female + SAT_verbal:Female)
summary(no_interaction_lm)
##
## Call:
## lm(formula = SAT_math ~ SAT_verbal + Female, data = cba)
##
## Residuals:
## Min 1Q Median 3Q Max
## -153.498 -41.726 -1.164 41.323 199.363
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 351.89984 19.51020 18.037 < 2e-16 ***
## SAT_verbal 0.43611 0.03394 12.850 < 2e-16 ***
## Female -25.70743 4.61991 -5.564 3.77e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 60.13 on 688 degrees of freedom
## Multiple R-squared: 0.2366, Adjusted R-squared: 0.2344
## F-statistic: 106.6 on 2 and 688 DF, p-value: < 2.2e-16
summary(interaction_lm)
##
## Call:
## lm(formula = SAT_math ~ SAT_verbal + Female + SAT_verbal:Female,
## data = cba)
##
## Residuals:
## Min 1Q Median 3Q Max
## -154.380 -42.086 -1.366 41.487 203.071
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 362.36999 25.78049 14.056 <2e-16 ***
## SAT_verbal 0.41766 0.04509 9.263 <2e-16 ***
## Female -49.54612 38.62331 -1.283 0.200
## SAT_verbal:Female 0.04260 0.06852 0.622 0.534
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 60.15 on 687 degrees of freedom
## Multiple R-squared: 0.237, Adjusted R-squared: 0.2337
## F-statistic: 71.15 on 3 and 687 DF, p-value: < 2.2e-16
no_interaction_lm = lm(data = cba, rank ~ SAT_math + Female)
interaction_lm = lm(data = cba, rank ~ SAT_math + Female + SAT_math:Female)
summary(no_interaction_lm)
##
## Call:
## lm(formula = rank ~ SAT_math + Female, data = cba)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.27450 -0.11133 -0.01985 0.09174 0.45865
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.485e-01 4.986e-02 11.001 < 2e-16 ***
## SAT_math -4.665e-04 8.225e-05 -5.672 2.08e-08 ***
## Female -9.088e-02 1.135e-02 -8.011 4.88e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1444 on 688 degrees of freedom
## Multiple R-squared: 0.1037, Adjusted R-squared: 0.1011
## F-statistic: 39.8 on 2 and 688 DF, p-value: < 2.2e-16
summary(interaction_lm)
##
## Call:
## lm(formula = rank ~ SAT_math + Female + SAT_math:Female, data = cba)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.26749 -0.11146 -0.02049 0.09236 0.45862
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.5204299 0.0665296 7.823 1.96e-14 ***
## SAT_math -0.0004196 0.0001103 -3.805 0.000155 ***
## Female -0.0294093 0.0970127 -0.303 0.761868
## SAT_math:Female -0.0001057 0.0001656 -0.638 0.523655
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1445 on 687 degrees of freedom
## Multiple R-squared: 0.1042, Adjusted R-squared: 0.1003
## F-statistic: 26.64 on 3 and 687 DF, p-value: 2.606e-16
no_interaction_lm = lm(data = cba, rank ~ SAT_math + Race)
interaction_lm = lm(data = cba, rank ~ SAT_math + Race + SAT_math:Race)
summary(no_interaction_lm)
##
## Call:
## lm(formula = rank ~ SAT_math + Race, data = cba)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.2484 -0.1222 -0.0224 0.1031 0.4965
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.379e-01 1.193e-01 3.670 0.000261 ***
## SAT_math -2.873e-04 8.497e-05 -3.381 0.000762 ***
## RaceAsian -7.906e-02 1.095e-01 -0.722 0.470423
## Raceblack -1.059e-02 1.093e-01 -0.097 0.922822
## RaceHispanic -7.690e-02 1.169e-01 -0.658 0.510844
## Racenot_indicated 7.472e-03 1.152e-01 0.065 0.948308
## Racewhite -3.471e-02 1.069e-01 -0.325 0.745469
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1508 on 684 degrees of freedom
## Multiple R-squared: 0.02863, Adjusted R-squared: 0.02011
## F-statistic: 3.36 on 6 and 684 DF, p-value: 0.002854
summary(interaction_lm)
##
## Call:
## lm(formula = rank ~ SAT_math + Race + SAT_math:Race, data = cba)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.24841 -0.12303 -0.02223 0.10259 0.49628
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.302e-01 1.218e-01 3.532 0.00044 ***
## SAT_math -2.750e-04 9.253e-05 -2.972 0.00306 **
## RaceAsian -8.461e-02 2.359e-01 -0.359 0.71994
## Raceblack 4.836e-02 2.381e-01 0.203 0.83914
## RaceHispanic -4.460e-02 5.614e-01 -0.079 0.93671
## Racenot_indicated 2.294e-01 3.973e-01 0.577 0.56397
## Racewhite -3.415e-02 1.072e-01 -0.319 0.75014
## SAT_math:RaceAsian 8.871e-06 3.323e-04 0.027 0.97871
## SAT_math:Raceblack -1.041e-04 3.763e-04 -0.277 0.78207
## SAT_math:RaceHispanic -5.462e-05 9.459e-04 -0.058 0.95396
## SAT_math:Racenot_indicated -3.707e-04 6.356e-04 -0.583 0.55991
## SAT_math:Racewhite NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1512 on 680 degrees of freedom
## Multiple R-squared: 0.02922, Adjusted R-squared: 0.01494
## F-statistic: 2.047 on 10 and 680 DF, p-value: 0.02669