Topics for today!

  1. Topic: Multiple Regression with catagorical variables
  2. Topic: Multiple Regression with quadratic terms
    • Plotting polynomial regression
  3. Topic: Multiple Regression with mixed interaction terms

Cleaning and Examining the Data

# Loading the data
setwd("~/Desktop/R Materials/mih140/Assignments/'20 Assignments/Data")
cba = read.table("cba_admissions_1999.txt", sep = "\t", header = T, quote = "", allowEscapes = T)
# Cleaning out the NA's
cba = cba[!is.na(cba$SAT_math), ]
cba$rank = cba$HS_rank/cba$HS_class_size # make new feature called "rank"
cba = cba[!is.na(cba$rank),] # prune students w/out a rank

# Examining the data for correlations
library(gpairs)
gpairs(cba[,c("SAT_math", "SAT_verbal", "rank")])

Topic 1: Multiple Regression with catagorical variables

# Demo: Let's try and predict class rank using SAT_verbal and also the racial catagorical variable.

auto_model = lm(data = cba, rank ~ SAT_verbal + Race)
unique(cba$Race) # Note American_Indian binary is not in model, it's the reference group chosen by R!
## [1] "white"           "Asian"           "not_indicated"   "black"          
## [5] "Hispanic"        "American_Indian"
# Make your own reference group
library(dummies)
## dummies-1.5.6 provided by Decision Patterns
race_tab = dummy(cba$Race) # Ignore warning message
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
race_tab = data.frame(race_tab)
# Now I can add these columns to the dataframe
cba_2 = data.frame(cba, race_tab) 

names(cba_2)[31:36] = c("RaceAmerican_Indian", "RaceAsian", "Raceblack", "RaceHispanic", "Racenot_indicated", "Racewhite") # Line just to fix notebook, please ignore

explicit_model = lm(data = cba_2, rank ~ SAT_verbal + RaceAmerican_Indian + RaceAsian + Raceblack + RaceHispanic + Racewhite) # I choose to omit the binary for race not indicated, making that the reference group.

summary(explicit_model) # Model is terrible
## 
## Call:
## lm(formula = rank ~ SAT_verbal + RaceAmerican_Indian + RaceAsian + 
##     Raceblack + RaceHispanic + Racewhite, data = cba_2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.27743 -0.11969 -0.02393  0.10203  0.52202 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          5.330e-01  6.583e-02   8.096 2.61e-15 ***
## SAT_verbal          -4.388e-04  8.429e-05  -5.206 2.56e-07 ***
## RaceAmerican_Indian -5.228e-02  1.141e-01  -0.458   0.6470    
## RaceAsian           -1.045e-01  4.957e-02  -2.107   0.0354 *  
## Raceblack           -2.855e-02  4.887e-02  -0.584   0.5593    
## RaceHispanic        -8.146e-02  6.386e-02  -1.276   0.2025    
## Racewhite           -5.126e-02  4.356e-02  -1.177   0.2397    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1491 on 684 degrees of freedom
## Multiple R-squared:  0.05003,    Adjusted R-squared:  0.04169 
## F-statistic: 6.003 on 6 and 684 DF,  p-value: 3.931e-06
# Regression Equation is:
# rank = 5.330e-01 - 4.388e-04*SAT_verbal - 5.228e-02*RaceAmerican_Indian + ... -  5.126e-02*Racewhite

small_model = lm(data = cba_2, rank ~ SAT_verbal + RaceAsian) # Reference group is now non-asian
summary(small_model) # Model is still terrible
## 
## Call:
## lm(formula = rank ~ SAT_verbal + RaceAsian, data = cba_2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.27967 -0.12152 -0.02531  0.10384  0.52024 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.852e-01  4.728e-02  10.261  < 2e-16 ***
## SAT_verbal  -4.415e-04  8.368e-05  -5.276 1.77e-07 ***
## RaceAsian   -5.516e-02  2.519e-02  -2.190   0.0289 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.149 on 688 degrees of freedom
## Multiple R-squared:  0.04626,    Adjusted R-squared:  0.04349 
## F-statistic: 16.69 on 2 and 688 DF,  p-value: 8.383e-08

Topic 2: Multiple Regression with quadratic terms

# Example: Suppose you hypothesize a non-linear relationship between Max_Test_Score and rank among Asian applicants. 

# You plot the relationship to confirm
plot(data = cba_2[cba_2$RaceAsian == 1,], rank ~ Max_Test_Score, xlim = c(800, 1600), ylim = c(0, .6)) # Can imagine a sort of quadratic relationship here

# You then train two models, one linear and one quadratic.

lin_model = lm(data = cba_2[cba_2$RaceAsian == 1,], rank ~ Max_Test_Score)
quad_model = lm(data = cba_2[cba_2$RaceAsian == 1,], rank ~ Max_Test_Score + I(Max_Test_Score^2))

# Regression Equation of the quadratic model is:
# rank = -1.812 + 3.464e-03 * Max_Test_Score - -1.484e-06 * Max_Test_Score^2

# Then you check to see which model is better, and note the linear one is almost insignif. and the quad. is poor but signif.

summary(lin_model) # R^2: 0.029,    Adj. R^2:  0.0013 
## 
## Call:
## lm(formula = rank ~ Max_Test_Score, data = cba_2[cba_2$RaceAsian == 
##     1, ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.20884 -0.09794 -0.04038  0.06625  0.35250 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)  
## (Intercept)     0.3935097  0.2118803   1.857   0.0717 .
## Max_Test_Score -0.0001784  0.0001745  -1.022   0.3137  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1493 on 35 degrees of freedom
## Multiple R-squared:  0.02899,    Adjusted R-squared:  0.00125 
## F-statistic: 1.045 on 1 and 35 DF,  p-value: 0.3137
summary(quad_model) # R^2: 0.134,   Adj. R^2:  0.083 
## 
## Call:
## lm(formula = rank ~ Max_Test_Score + I(Max_Test_Score^2), data = cba_2[cba_2$RaceAsian == 
##     1, ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.19505 -0.08836 -0.04849  0.06227  0.32456 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)  
## (Intercept)         -1.812e+00  1.106e+00  -1.639   0.1105  
## Max_Test_Score       3.464e-03  1.803e-03   1.921   0.0631 .
## I(Max_Test_Score^2) -1.484e-06  7.312e-07  -2.029   0.0504 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.143 on 34 degrees of freedom
## Multiple R-squared:  0.1339, Adjusted R-squared:  0.08291 
## F-statistic: 2.627 on 2 and 34 DF,  p-value: 0.0869
# You conclude that in this data the relationship is better described by a quadratic however you are careful to note that this fit is quite poor and it's prediction for low scoring students seems very strange.

Plotting polynomial regression

new_dat = data.frame(Max_Test_Score = seq(800,1600,10))
lin_fit = predict(lin_model, new_dat)
quad_fit = predict(quad_model, new_dat)

 # Make plot with both regression lines
plot(data = cba_2[cba_2$RaceAsian == 1,], rank ~ Max_Test_Score, xlim = c(800, 1600), ylim = c(0, .6))
par(new = T)
plot(seq(800,1600,10), lin_fit, xlim = c(800, 1600), ylim = c(0, .6), type = "l", xlab = "", ylab = "", col = "red")
par(new = T)
plot(seq(800,1600,10), quad_fit, xlim = c(800, 1600), ylim = c(0, .6), type = "l", xlab = "", ylab = "", col = "blue")

Topic 3: Multiple Regression with mixed interaction terms

# Lets look at generating linear models with interaction terms!

# Ex. 1: Predict SAT_math using SAT_verbal and Sex
no_interaction_lm = lm(data = cba, SAT_math ~ SAT_verbal + Female)
interaction_lm = lm(data = cba, SAT_math ~ SAT_verbal + Female + SAT_verbal:Female)
summary(no_interaction_lm)
## 
## Call:
## lm(formula = SAT_math ~ SAT_verbal + Female, data = cba)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -153.498  -41.726   -1.164   41.323  199.363 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 351.89984   19.51020  18.037  < 2e-16 ***
## SAT_verbal    0.43611    0.03394  12.850  < 2e-16 ***
## Female      -25.70743    4.61991  -5.564 3.77e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 60.13 on 688 degrees of freedom
## Multiple R-squared:  0.2366, Adjusted R-squared:  0.2344 
## F-statistic: 106.6 on 2 and 688 DF,  p-value: < 2.2e-16
summary(interaction_lm)
## 
## Call:
## lm(formula = SAT_math ~ SAT_verbal + Female + SAT_verbal:Female, 
##     data = cba)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -154.380  -42.086   -1.366   41.487  203.071 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       362.36999   25.78049  14.056   <2e-16 ***
## SAT_verbal          0.41766    0.04509   9.263   <2e-16 ***
## Female            -49.54612   38.62331  -1.283    0.200    
## SAT_verbal:Female   0.04260    0.06852   0.622    0.534    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 60.15 on 687 degrees of freedom
## Multiple R-squared:  0.237,  Adjusted R-squared:  0.2337 
## F-statistic: 71.15 on 3 and 687 DF,  p-value: < 2.2e-16
# Regression Equation of the model with interaction terms is:
# SAT_math = 362.3 + 0.41766 * SAT_verbal -49.54 * Female + 0.04 * SAT_verbal*Female

# Ex. 2: Predict rank using SAT_math and Sex
no_interaction_lm = lm(data = cba, rank ~ SAT_math + Female)
interaction_lm = lm(data = cba, rank ~ SAT_math + Female + SAT_math:Female)
summary(no_interaction_lm)
## 
## Call:
## lm(formula = rank ~ SAT_math + Female, data = cba)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.27450 -0.11133 -0.01985  0.09174  0.45865 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.485e-01  4.986e-02  11.001  < 2e-16 ***
## SAT_math    -4.665e-04  8.225e-05  -5.672 2.08e-08 ***
## Female      -9.088e-02  1.135e-02  -8.011 4.88e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1444 on 688 degrees of freedom
## Multiple R-squared:  0.1037, Adjusted R-squared:  0.1011 
## F-statistic:  39.8 on 2 and 688 DF,  p-value: < 2.2e-16
summary(interaction_lm)
## 
## Call:
## lm(formula = rank ~ SAT_math + Female + SAT_math:Female, data = cba)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.26749 -0.11146 -0.02049  0.09236  0.45862 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      0.5204299  0.0665296   7.823 1.96e-14 ***
## SAT_math        -0.0004196  0.0001103  -3.805 0.000155 ***
## Female          -0.0294093  0.0970127  -0.303 0.761868    
## SAT_math:Female -0.0001057  0.0001656  -0.638 0.523655    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1445 on 687 degrees of freedom
## Multiple R-squared:  0.1042, Adjusted R-squared:  0.1003 
## F-statistic: 26.64 on 3 and 687 DF,  p-value: 2.606e-16
# Ex. 3: Predict rank using SAT_verbal, SAT_math and Race
no_interaction_lm = lm(data = cba, rank ~ SAT_math + Race)
interaction_lm = lm(data = cba, rank ~ SAT_math + Race + SAT_math:Race)
summary(no_interaction_lm)
## 
## Call:
## lm(formula = rank ~ SAT_math + Race, data = cba)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.2484 -0.1222 -0.0224  0.1031  0.4965 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        4.379e-01  1.193e-01   3.670 0.000261 ***
## SAT_math          -2.873e-04  8.497e-05  -3.381 0.000762 ***
## RaceAsian         -7.906e-02  1.095e-01  -0.722 0.470423    
## Raceblack         -1.059e-02  1.093e-01  -0.097 0.922822    
## RaceHispanic      -7.690e-02  1.169e-01  -0.658 0.510844    
## Racenot_indicated  7.472e-03  1.152e-01   0.065 0.948308    
## Racewhite         -3.471e-02  1.069e-01  -0.325 0.745469    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1508 on 684 degrees of freedom
## Multiple R-squared:  0.02863,    Adjusted R-squared:  0.02011 
## F-statistic:  3.36 on 6 and 684 DF,  p-value: 0.002854
summary(interaction_lm)
## 
## Call:
## lm(formula = rank ~ SAT_math + Race + SAT_math:Race, data = cba)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.24841 -0.12303 -0.02223  0.10259  0.49628 
## 
## Coefficients: (1 not defined because of singularities)
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 4.302e-01  1.218e-01   3.532  0.00044 ***
## SAT_math                   -2.750e-04  9.253e-05  -2.972  0.00306 ** 
## RaceAsian                  -8.461e-02  2.359e-01  -0.359  0.71994    
## Raceblack                   4.836e-02  2.381e-01   0.203  0.83914    
## RaceHispanic               -4.460e-02  5.614e-01  -0.079  0.93671    
## Racenot_indicated           2.294e-01  3.973e-01   0.577  0.56397    
## Racewhite                  -3.415e-02  1.072e-01  -0.319  0.75014    
## SAT_math:RaceAsian          8.871e-06  3.323e-04   0.027  0.97871    
## SAT_math:Raceblack         -1.041e-04  3.763e-04  -0.277  0.78207    
## SAT_math:RaceHispanic      -5.462e-05  9.459e-04  -0.058  0.95396    
## SAT_math:Racenot_indicated -3.707e-04  6.356e-04  -0.583  0.55991    
## SAT_math:Racewhite                 NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1512 on 680 degrees of freedom
## Multiple R-squared:  0.02922,    Adjusted R-squared:  0.01494 
## F-statistic: 2.047 on 10 and 680 DF,  p-value: 0.02669
# In all three examples interaction terms are not signif. 
# Moral: Make sure you have a good reason before including interaction terms in model