Lesson35

Dusty Turner

April 25, 2017

General Guidelines to Model Building in MA206x

  1. Exploratory Data Analysis
    1. Pairs
    2. One Factor at a Time
  2. Full model
  3. Systematically remove predictors
    1. Collapse factors if necessary
  4. If uncertain, try both ways
  5. Compare Models
    1. R Squared
    2. Adj R Squared
    3. AIC
    4. BIC

While model building, always monitor your four assumptions

Read In the Data

# Work from the lesson folder, load the cadet data set, and attach it so the
# model formulas below can refer to its columns (y, x1, ..., x5) by name.
setwd("//usmaedu/apollo/math/Userdirs/Turner/MA206x/Lessons/Block 4/Lesson 35 Assessing Model Adequacy II (Transformations)")
lsn36 <- read.csv(file = "cadet16.csv", header = TRUE)
attach(lsn36)

Exploratory Data Analysis

# Peek at the first six rows of the raw data.
head(lsn36, n = 6L)
##   X        y       x1        x2       x3 x4         x5
## 1 1 407.4573 108.1452 0.6826014 2.036272  2  -99.34688
## 2 2 403.3364 100.8490 0.7362477 1.956096  2 -101.17498
## 3 3 356.4369 116.8831 0.4324598 1.460949  1 -100.87674
## 4 4 343.8545 105.3687 0.4921432 1.361149  2 -100.02094
## 5 5 546.6986 104.5681 1.2583366 2.292721  1  -99.63886
## 6 6 481.4378 105.5678 1.0564692 2.102945  2  -98.38004
# Drop the first column (the `X` column shown by head() above), then print
# the per-column summary of what remains.
lsn36 <- lsn36[, -1]
summary(lsn36)
##        y                x1              x2                x3       
##  Min.   : 238.7   Min.   : 80.3   Min.   :0.02071   Min.   :1.003  
##  1st Qu.: 372.0   1st Qu.:100.0   1st Qu.:0.49252   1st Qu.:1.546  
##  Median : 440.3   Median :105.1   Median :0.85012   Median :2.040  
##  Mean   : 480.7   Mean   :105.2   Mean   :1.00342   Mean   :2.022  
##  3rd Qu.: 556.4   3rd Qu.:110.1   3rd Qu.:1.29954   3rd Qu.:2.514  
##  Max.   :1367.6   Max.   :131.7   Max.   :4.74919   Max.   :2.999  
##        x4              x5         
##  Min.   :0.000   Min.   :-107.22  
##  1st Qu.:1.000   1st Qu.:-101.33  
##  Median :2.000   Median : -99.96  
##  Mean   :1.501   Mean   : -99.97  
##  3rd Qu.:2.000   3rd Qu.: -98.64  
##  Max.   :2.000   Max.   : -93.16

Exploratory Data Analysis (Pairs)

# Scatterplot matrix of every pair of columns in the data set.
library(GGally)
ggpairs(data = lsn36)

Exploratory Data Analysis (OFAT)

# One factor at a time: y regressed on x1 alone.
mod.x1 <- lm(formula = y ~ x1)
summary(mod.x1)
## 
## Call:
## lm(formula = y ~ x1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -247.13 -108.49  -36.15   73.62  867.52 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 236.4076    73.1302   3.233 0.001266 ** 
## x1            2.3226     0.6938   3.347 0.000846 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 153.7 on 998 degrees of freedom
## Multiple R-squared:  0.0111, Adjusted R-squared:  0.01011 
## F-statistic: 11.21 on 1 and 998 DF,  p-value: 0.0008461
# One factor at a time: y regressed on x2 alone.
mod.x2 <- lm(formula = y ~ x2)
summary(mod.x2)
## 
## Call:
## lm(formula = y ~ x2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -100.870  -24.056   -2.174   23.581  135.764 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  261.135      1.822   143.3   <2e-16 ***
## x2           218.782      1.497   146.2   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32.65 on 998 degrees of freedom
## Multiple R-squared:  0.9554, Adjusted R-squared:  0.9553 
## F-statistic: 2.137e+04 on 1 and 998 DF,  p-value: < 2.2e-16
# One factor at a time: y regressed on x3 alone.
mod.x3 <- lm(formula = y ~ x3)
summary(mod.x3)
## 
## Call:
## lm(formula = y ~ x3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -221.89 -111.69  -39.17   70.25  909.00 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  409.935     17.840   22.98  < 2e-16 ***
## x3            34.985      8.492    4.12 4.11e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 153.3 on 998 degrees of freedom
## Multiple R-squared:  0.01672,    Adjusted R-squared:  0.01574 
## F-statistic: 16.97 on 1 and 998 DF,  p-value: 4.11e-05
# One factor at a time: y regressed on x4, with x4 treated as a categorical
# predictor rather than a numeric one.
mod.x4 <- lm(formula = y ~ as.factor(x4))
summary(mod.x4)
## 
## Call:
## lm(formula = y ~ as.factor(x4))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -241.98 -108.46  -37.63   73.80  886.90 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      506.53      18.87  26.842   <2e-16 ***
## as.factor(x4)1   -30.62      20.53  -1.492    0.136    
## as.factor(x4)2   -25.85      19.95  -1.296    0.195    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 154.5 on 997 degrees of freedom
## Multiple R-squared:  0.002227,   Adjusted R-squared:  0.0002253 
## F-statistic: 1.113 on 2 and 997 DF,  p-value: 0.3291
# One factor at a time: y regressed on x5 alone.
mod.x5 <- lm(formula = y ~ x5)
summary(mod.x5)
## 
## Call:
## lm(formula = y ~ x5)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -251.17 -108.95  -39.40   74.35  873.77 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  123.220    241.899   0.509    0.611
## x5            -3.575      2.419  -1.478    0.140
## 
## Residual standard error: 154.4 on 998 degrees of freedom
## Multiple R-squared:  0.002184,   Adjusted R-squared:  0.001184 
## F-statistic: 2.184 on 1 and 998 DF,  p-value: 0.1397

Full Model

# Full model: every main effect plus all two-way interactions (the `^2`
# expands the sum to second order), with x4 entered as a factor.
full.mod <- lm(
  formula = y ~ (x1 + x2 + x3 + as.factor(x4) + x5)^2
)
summary(full.mod)
## 
## Call:
## lm(formula = y ~ (x1 + x2 + x3 + as.factor(x4) + x5)^2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.3931  -3.6358   0.1204   3.7687  17.9787 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       530.871870 141.463356   3.753 0.000185 ***
## x1                 -2.794896   1.264551  -2.210 0.027323 *  
## x2                  8.056210  12.740270   0.632 0.527311    
## x3                 35.894342  16.356107   2.195 0.028430 *  
## as.factor(x4)1    -31.533878  37.086882  -0.850 0.395383    
## as.factor(x4)2    -42.345294  36.176262  -1.171 0.242074    
## x5                  4.762028   1.399833   3.402 0.000696 ***
## x1:x2               2.040457   0.037334  54.655  < 2e-16 ***
## x1:x3              -0.057793   0.045983  -1.257 0.209109    
## x1:as.factor(x4)1  -0.049636   0.101554  -0.489 0.625117    
## x1:as.factor(x4)2  -0.001523   0.098004  -0.016 0.987601    
## x1:x5              -0.039672   0.012472  -3.181 0.001514 ** 
## x2:x3              -0.391679   0.469339  -0.835 0.404184    
## x2:as.factor(x4)1   0.883615   1.075839   0.821 0.411660    
## x2:as.factor(x4)2   0.731151   1.039301   0.704 0.481910    
## x2:x5               0.023918   0.118399   0.202 0.839947    
## x3:as.factor(x4)1  -1.036078   1.308250  -0.792 0.428577    
## x3:as.factor(x4)2  -0.252031   1.262359  -0.200 0.841795    
## x3:x5              -0.111999   0.159492  -0.702 0.482707    
## as.factor(x4)1:x5  -0.376500   0.356740  -1.055 0.291507    
## as.factor(x4)2:x5  -0.421979   0.346067  -1.219 0.223003    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.648 on 979 degrees of freedom
## Multiple R-squared:  0.9987, Adjusted R-squared:  0.9987 
## F-statistic: 3.732e+04 on 20 and 979 DF,  p-value: < 2.2e-16

Systematically Remove Predictors

  1. remove x2*x5
## 
## Call:
## lm(formula = y ~ (x1 + x2 + x3 + as.factor(x4) + x5 + x1 * x2 + 
##     x1 * x3 + x1 * as.factor(x4) + x1 * x5 + x2 * x3 + x2 * as.factor(x4) + 
##     x3 * as.factor(x4) + x3 * x5 + as.factor(x4) * x5))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.4177  -3.6139   0.1201   3.7799  18.0037 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       533.329140 140.870440   3.786 0.000162 ***
## x1                 -2.796134   1.263917  -2.212 0.027178 *  
## x2                  5.637983   4.359141   1.293 0.196188    
## x3                 35.907360  16.347974   2.196 0.028294 *  
## as.factor(x4)1    -31.315712  37.053009  -0.845 0.398228    
## as.factor(x4)2    -42.092802  36.136967  -1.165 0.244378    
## x5                  4.786621   1.393846   3.434 0.000619 ***
## x1:x2               2.040331   0.037310  54.686  < 2e-16 ***
## x1:x3              -0.057639   0.045954  -1.254 0.210040    
## x1:as.factor(x4)1  -0.049800   0.101501  -0.491 0.623797    
## x1:as.factor(x4)2  -0.001873   0.097941  -0.019 0.984745    
## x1:x5              -0.039686   0.012465  -3.184 0.001500 ** 
## x2:x3              -0.374838   0.461649  -0.812 0.417015    
## x2:as.factor(x4)1   0.885415   1.075275   0.823 0.410463    
## x2:as.factor(x4)2   0.726116   1.038493   0.699 0.484592    
## x3:as.factor(x4)1  -1.049851   1.305833  -0.804 0.421609    
## x3:as.factor(x4)2  -0.265634   1.259945  -0.211 0.833064    
## x3:x5              -0.111693   0.159407  -0.701 0.483669    
## as.factor(x4)1:x5  -0.374712   0.356455  -1.051 0.293418    
## as.factor(x4)2:x5  -0.420069   0.345769  -1.215 0.224703    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.645 on 980 degrees of freedom
## Multiple R-squared:  0.9987, Adjusted R-squared:  0.9987 
## F-statistic: 3.933e+04 on 19 and 980 DF,  p-value: < 2.2e-16
  2. remove x1*x4
## 
## Call:
## lm(formula = y ~ (x1 + x2 + x3 + as.factor(x4) + x5 + x1 * x2 + 
##     x1 * x3 + x1 * x5 + x2 * x3 + x2 * as.factor(x4) + x3 * as.factor(x4) + 
##     x3 * x5 + as.factor(x4) * x5))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.3510  -3.7314   0.1233   3.7924  18.0890 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       544.54306  138.36926   3.935 8.89e-05 ***
## x1                 -2.90414    1.23687  -2.348 0.019074 *  
## x2                  5.71187    4.35473   1.312 0.189946    
## x3                 36.11253   16.33634   2.211 0.027296 *  
## as.factor(x4)1    -35.37173   35.93952  -0.984 0.325259    
## as.factor(x4)2    -41.86535   34.89978  -1.200 0.230589    
## x5                  4.88350    1.38188   3.534 0.000428 ***
## x1:x2               2.04009    0.03728  54.717  < 2e-16 ***
## x1:x3              -0.05979    0.04586  -1.304 0.192569    
## x1:x5              -0.04062    0.01233  -3.294 0.001023 ** 
## x2:x3              -0.39208    0.46097  -0.851 0.395219    
## x2:as.factor(x4)1   0.90091    1.07317   0.839 0.401406    
## x2:as.factor(x4)2   0.71194    1.03627   0.687 0.492231    
## x3:as.factor(x4)1  -1.01841    1.30077  -0.783 0.433860    
## x3:as.factor(x4)2  -0.25473    1.25446  -0.203 0.839134    
## x3:x5              -0.11186    0.15931  -0.702 0.482731    
## as.factor(x4)1:x5  -0.36215    0.35570  -1.018 0.308869    
## as.factor(x4)2:x5  -0.41580    0.34533  -1.204 0.228860    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.641 on 982 degrees of freedom
## Multiple R-squared:  0.9987, Adjusted R-squared:  0.9987 
## F-statistic: 4.401e+04 on 17 and 982 DF,  p-value: < 2.2e-16

  3. remove x3*x4

## 
## Call:
## lm(formula = y ~ (x1 + x2 + x3 + as.factor(x4) + x5 + x1 * x2 + 
##     x1 * x3 + x1 * x5 + x2 * x3 + x2 * as.factor(x4) + x3 * x5 + 
##     as.factor(x4) * x5))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.2661  -3.7261   0.0607   3.8993  18.3106 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       547.13160  138.28658   3.957 8.15e-05 ***
## x1                 -2.89628    1.23651  -2.342  0.01936 *  
## x2                  5.83878    4.35228   1.342  0.18005    
## x3                 34.34425   16.18568   2.122  0.03410 *  
## as.factor(x4)1    -37.87554   35.81522  -1.058  0.29053    
## as.factor(x4)2    -41.94412   34.77820  -1.206  0.22809    
## x5                  4.89823    1.38143   3.546  0.00041 ***
## x1:x2               2.03861    0.03725  54.722  < 2e-16 ***
## x1:x3              -0.05903    0.04581  -1.288  0.19791    
## x1:x5              -0.04054    0.01233  -3.288  0.00104 ** 
## x2:x3              -0.37233    0.46053  -0.808  0.41900    
## x2:as.factor(x4)1   0.91538    1.07273   0.853  0.39369    
## x2:as.factor(x4)2   0.70708    1.03590   0.683  0.49503    
## x3:x5              -0.12350    0.15882  -0.778  0.43697    
## as.factor(x4)1:x5  -0.36641    0.35559  -1.030  0.30307    
## as.factor(x4)2:x5  -0.41141    0.34521  -1.192  0.23365    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.64 on 984 degrees of freedom
## Multiple R-squared:  0.9987, Adjusted R-squared:  0.9987 
## F-statistic: 4.99e+04 on 15 and 984 DF,  p-value: < 2.2e-16

  4. remove x2*x3

## 
## Call:
## lm(formula = y ~ (x1 + x2 + x3 + as.factor(x4) + x5 + x1 * x2 + 
##     x1 * x3 + x1 * x5 + x2 * as.factor(x4) + x3 * x5 + as.factor(x4) * 
##     x5))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.2985  -3.7530   0.0978   3.8673  18.4095 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       547.90882  138.25893   3.963 7.94e-05 ***
## x1                 -2.92455    1.23580  -2.367 0.018148 *  
## x2                  4.56079    4.05437   1.125 0.260902    
## x3                 34.56488   16.18054   2.136 0.032909 *  
## as.factor(x4)1    -36.32265   35.75739  -1.016 0.309970    
## as.factor(x4)2    -40.48493   34.72523  -1.166 0.243951    
## x5                  4.89350    1.38118   3.543 0.000414 ***
## x1:x2               2.04314    0.03682  55.487  < 2e-16 ***
## x1:x3              -0.05900    0.04581  -1.288 0.198043    
## x1:x5              -0.04077    0.01232  -3.309 0.000970 ***
## x2:as.factor(x4)1   0.97040    1.07038   0.907 0.364846    
## x2:as.factor(x4)2   0.76183    1.03350   0.737 0.461216    
## x3:x5              -0.11755    0.15862  -0.741 0.458811    
## as.factor(x4)1:x5  -0.35048    0.35498  -0.987 0.323724    
## as.factor(x4)2:x5  -0.39629    0.34465  -1.150 0.250479    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.639 on 985 degrees of freedom
## Multiple R-squared:  0.9987, Adjusted R-squared:  0.9987 
## F-statistic: 5.348e+04 on 14 and 985 DF,  p-value: < 2.2e-16

  5. remove x2*x4

## 
## Call:
## lm(formula = y ~ (x1 + x2 + x3 + as.factor(x4) + x5 + x1 * x2 + 
##     x1 * x3 + x1 * x5 + x3 * x5 + as.factor(x4) * x5))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.3851  -3.7656   0.0504   3.8684  18.4020 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       544.50618  138.12098   3.942 8.64e-05 ***
## x1                 -2.91892    1.23478  -2.364 0.018275 *  
## x2                  5.61552    3.86277   1.454 0.146332    
## x3                 33.77689   16.14753   2.092 0.036714 *  
## as.factor(x4)1    -31.90684   35.36297  -0.902 0.367135    
## as.factor(x4)2    -36.19666   34.35465  -1.054 0.292316    
## x5                  4.86807    1.37991   3.528 0.000438 ***
## x1:x2               2.04054    0.03669  55.617  < 2e-16 ***
## x1:x3              -0.05814    0.04574  -1.271 0.203966    
## x1:x5              -0.04072    0.01231  -3.307 0.000976 ***
## x3:x5              -0.12446    0.15832  -0.786 0.431976    
## as.factor(x4)1:x5  -0.31701    0.35250  -0.899 0.368709    
## as.factor(x4)2:x5  -0.36205    0.34230  -1.058 0.290450    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.636 on 987 degrees of freedom
## Multiple R-squared:  0.9987, Adjusted R-squared:  0.9987 
## F-statistic: 6.247e+04 on 12 and 987 DF,  p-value: < 2.2e-16

  6. remove x3*x5

## 
## Call:
## lm(formula = y ~ (x1 + x2 + x3 + as.factor(x4) + x5 + x1 * x2 + 
##     x1 * x3 + x1 * x5 + as.factor(x4) * x5))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.3649  -3.7606   0.0664   3.8277  18.7300 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       518.64992  134.12189   3.867 0.000117 ***
## x1                 -2.92859    1.23448  -2.372 0.017867 *  
## x2                  5.69140    3.86082   1.474 0.140762    
## x3                 45.89739    4.79853   9.565  < 2e-16 ***
## as.factor(x4)1    -29.65644   35.24010  -0.842 0.400241    
## as.factor(x4)2    -34.63480   34.29053  -1.010 0.312723    
## x5                  4.60305    1.33784   3.441 0.000605 ***
## x1:x2               2.03972    0.03667  55.628  < 2e-16 ***
## x1:x3              -0.05501    0.04555  -1.208 0.227473    
## x1:x5              -0.04075    0.01231  -3.311 0.000964 ***
## as.factor(x4)1:x5  -0.29449    0.35127  -0.838 0.402031    
## as.factor(x4)2:x5  -0.34638    0.34165  -1.014 0.310906    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.634 on 988 degrees of freedom
## Multiple R-squared:  0.9987, Adjusted R-squared:  0.9987 
## F-statistic: 6.818e+04 on 11 and 988 DF,  p-value: < 2.2e-16

  7. remove x4*x5

## 
## Call:
## lm(formula = y ~ (x1 + x2 + x3 + as.factor(x4) + x5 + x1 * x2 + 
##     x1 * x3 + x1 * x5))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.3505  -3.7964   0.0813   3.8120  18.8757 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    487.74797  130.49176   3.738 0.000196 ***
## x1              -2.92156    1.23323  -2.369 0.018026 *  
## x2               5.64258    3.85771   1.463 0.143874    
## x3              45.89092    4.79617   9.568  < 2e-16 ***
## as.factor(x4)1  -0.08795    0.75128  -0.117 0.906829    
## as.factor(x4)2   0.12132    0.72904   0.166 0.867866    
## x5               4.29407    1.30153   3.299 0.001004 ** 
## x1:x2            2.04035    0.03663  55.696  < 2e-16 ***
## x1:x3           -0.05500    0.04553  -1.208 0.227322    
## x1:x5           -0.04067    0.01230  -3.308 0.000974 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.632 on 990 degrees of freedom
## Multiple R-squared:  0.9987, Adjusted R-squared:  0.9987 
## F-statistic: 8.341e+04 on 9 and 990 DF,  p-value: < 2.2e-16

  8. remove x4

## 
## Call:
## lm(formula = y ~ (x1 + x2 + x3 + x5 + x1 * x2 + x1 * x3 + x1 * 
##     x5))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.4713  -3.7879   0.0671   3.8169  18.9612 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 483.40821  130.13689   3.715 0.000215 ***
## x1           -2.88133    1.23004  -2.342 0.019354 *  
## x2            5.56036    3.85042   1.444 0.149029    
## x3           45.92148    4.79130   9.584  < 2e-16 ***
## x5            4.25025    1.29800   3.274 0.001095 ** 
## x1:x2         2.04113    0.03656  55.828  < 2e-16 ***
## x1:x3        -0.05528    0.04549  -1.215 0.224527    
## x1:x5        -0.04027    0.01226  -3.284 0.001061 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.627 on 992 degrees of freedom
## Multiple R-squared:  0.9987, Adjusted R-squared:  0.9987 
## F-statistic: 1.074e+05 on 7 and 992 DF,  p-value: < 2.2e-16

  9. remove x1*x3

## 
## Call:
## lm(formula = y ~ (x1 + x2 + x3 + x5 + x1 * x2 + x1 * x5))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.4956  -3.7021   0.0265   3.8025  18.7154 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 492.44574  129.95546   3.789  0.00016 ***
## x1           -2.97051    1.22815  -2.419  0.01576 *  
## x2            5.48855    3.85089   1.425  0.15439    
## x3           40.11084    0.31261 128.310  < 2e-16 ***
## x5            4.22101    1.29809   3.252  0.00119 ** 
## x1:x2         2.04184    0.03657  55.841  < 2e-16 ***
## x1:x5        -0.04002    0.01226  -3.263  0.00114 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.628 on 993 degrees of freedom
## Multiple R-squared:  0.9987, Adjusted R-squared:  0.9987 
## F-statistic: 1.253e+05 on 6 and 993 DF,  p-value: < 2.2e-16

Test Assumptions

library(car)

# Final reduced model reached by the removal steps above: main effects x1, x2,
# x3 and x5 plus the x1:x2 and x1:x5 interactions (the formula printed for the
# last step, after removing x1*x3). It is fitted here explicitly so that
# `full.mod.final` is defined before the diagnostics below use it.
full.mod.final <- lm(y ~ (x1 + x2 + x3 + x5 + x1 * x2 + x1 * x5))

# Residual plots using standardized residuals, drawn one panel per page.
residualPlots(full.mod.final, type = "rstandard", layout = c(1, 1), test = FALSE)

# Quantile-comparison plot for the fitted model.
qqPlot(full.mod.final)

After Transformation

# Same reduced model, but with x3 entered as a second-degree polynomial.
full.mod.final.1 <- lm(
  formula = y ~ (x1 + x2 + poly(x3, 2) + x5 + x1 * x2 + x1 * x5)
)
summary(full.mod.final.1)
## 
## Call:
## lm(formula = y ~ (x1 + x2 + poly(x3, 2) + x5 + x1 * x2 + x1 * 
##     x5))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.3960  -3.2202  -0.0167   3.0723  14.6984 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  434.19265  111.11736   3.908 9.96e-05 ***
## x1            -1.61351    1.05015  -1.536   0.1247    
## x2             7.80546    3.28759   2.374   0.0178 *  
## poly(x3, 2)1 723.63582    4.81312 150.347  < 2e-16 ***
## poly(x3, 2)2  92.94391    4.81725  19.294  < 2e-16 ***
## x5             2.85703    1.10972   2.575   0.0102 *  
## x1:x2          2.01997    0.03122  64.709  < 2e-16 ***
## x1:x5         -0.02673    0.01049  -2.549   0.0109 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.802 on 992 degrees of freedom
## Multiple R-squared:  0.999,  Adjusted R-squared:  0.999 
## F-statistic: 1.476e+05 on 7 and 992 DF,  p-value: < 2.2e-16
# Re-check the assumptions on the transformed model: standardized residual
# plots (one panel per page), then a quantile-comparison plot.
residualPlots(
  full.mod.final.1,
  type = "rstandard",
  layout = c(1, 1),
  test = FALSE
)

qqPlot(full.mod.final.1)

Let's say we ended with this model

Not too bad…

# Simpler alternative: main effects of x1, x2 and x3 only, no interactions.
alternate.model <- lm(formula = y ~ x1 + x2 + x3)
summary(alternate.model)
## 
## Call:
## lm(formula = y ~ x1 + x2 + x3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -68.980  -5.853  -0.107   5.979  73.821 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -143.07792    5.69878  -25.11   <2e-16 ***
## x1             3.06499    0.05173   59.25   <2e-16 ***
## x2           220.02667    0.52479  419.27   <2e-16 ***
## x3            39.88070    0.63476   62.83   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.44 on 996 degrees of freedom
## Multiple R-squared:  0.9945, Adjusted R-squared:  0.9945 
## F-statistic: 6.036e+04 on 3 and 996 DF,  p-value: < 2.2e-16

…how about our assumptions?

Assumptions for alternate model…

# Assumption checks for the alternate model: standardized residual plots
# (one panel per page), then a quantile-comparison plot.
residualPlots(
  alternate.model,
  type = "rstandard",
  layout = c(1, 1),
  test = FALSE
)

qqPlot(alternate.model)

Let's transform our alternate model…

# Alternate model with x3 entered as a second-degree polynomial.
alternate.model.trans <- lm(formula = y ~ x1 + x2 + poly(x3, 2))
summary(alternate.model.trans)
## 
## Call:
## lm(formula = y ~ x1 + x2 + poly(x3, 2))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -67.282  -5.556   0.339   5.404  75.511 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -63.13755    5.26038 -12.002   <2e-16 ***
## x1             3.07136    0.04955  61.990   <2e-16 ***
## x2           220.04566    0.50259 437.819   <2e-16 ***
## poly(x3, 2)1 719.78684   10.97089  65.609   <2e-16 ***
## poly(x3, 2)2 104.50250   10.95938   9.535   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.96 on 995 degrees of freedom
## Multiple R-squared:  0.995,  Adjusted R-squared:  0.995 
## F-statistic: 4.938e+04 on 4 and 995 DF,  p-value: < 2.2e-16

And now look at our assumptions

# Assumption checks for the transformed alternate model: standardized
# residual plots (one panel per page), then a quantile-comparison plot.
residualPlots(
  alternate.model.trans,
  type = "rstandard",
  layout = c(1, 1),
  test = FALSE
)

qqPlot(alternate.model.trans)

Compare Models

Let's say we needed to pick between these two models. Which is best?

# First candidate: the reduced interaction model with the quadratic x3 term.
summary(object = full.mod.final.1)
## 
## Call:
## lm(formula = y ~ (x1 + x2 + poly(x3, 2) + x5 + x1 * x2 + x1 * 
##     x5))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.3960  -3.2202  -0.0167   3.0723  14.6984 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  434.19265  111.11736   3.908 9.96e-05 ***
## x1            -1.61351    1.05015  -1.536   0.1247    
## x2             7.80546    3.28759   2.374   0.0178 *  
## poly(x3, 2)1 723.63582    4.81312 150.347  < 2e-16 ***
## poly(x3, 2)2  92.94391    4.81725  19.294  < 2e-16 ***
## x5             2.85703    1.10972   2.575   0.0102 *  
## x1:x2          2.01997    0.03122  64.709  < 2e-16 ***
## x1:x5         -0.02673    0.01049  -2.549   0.0109 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.802 on 992 degrees of freedom
## Multiple R-squared:  0.999,  Adjusted R-squared:  0.999 
## F-statistic: 1.476e+05 on 7 and 992 DF,  p-value: < 2.2e-16
# Second candidate: the main-effects model with the quadratic x3 term.
summary(object = alternate.model.trans)
## 
## Call:
## lm(formula = y ~ x1 + x2 + poly(x3, 2))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -67.282  -5.556   0.339   5.404  75.511 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -63.13755    5.26038 -12.002   <2e-16 ***
## x1             3.07136    0.04955  61.990   <2e-16 ***
## x2           220.04566    0.50259 437.819   <2e-16 ***
## poly(x3, 2)1 719.78684   10.97089  65.609   <2e-16 ***
## poly(x3, 2)2 104.50250   10.95938   9.535   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.96 on 995 degrees of freedom
## Multiple R-squared:  0.995,  Adjusted R-squared:  0.995 
## F-statistic: 4.938e+04 on 4 and 995 DF,  p-value: < 2.2e-16

Remember These?

  1. R^2
  2. Adj R^2
  3. AIC
  4. BIC

Let's make a table of our metrics

# Comparison table: one row per criterion, one column per candidate model.
# Each row is the pair (full.mod.final.1, alternate.model.trans).
criteria <- rbind(
  "R^2" = c(
    summary(full.mod.final.1)$r.squared,
    summary(alternate.model.trans)$r.squared
  ),
  "Adj R^2" = c(
    summary(full.mod.final.1)$adj.r.squared,
    summary(alternate.model.trans)$adj.r.squared
  ),
  "AIC" = c(AIC(full.mod.final.1), AIC(alternate.model.trans)),
  "BIC" = c(BIC(full.mod.final.1), BIC(alternate.model.trans))
)
colnames(criteria) <- c("1st Model", "2nd Model")
print(criteria)
##            1st Model    2nd Model
## R^2        0.9990406    0.9949881
## Adj R^2    0.9990338    0.9949679
## AIC     5985.7889607 7633.0624780
## BIC     6029.9587582 7662.5090097