library(MASS)
library(olsrr)
## 
## Attaching package: 'olsrr'
## The following object is masked from 'package:MASS':
## 
##     cement
## The following object is masked from 'package:datasets':
## 
##     rivers
library(leaps)
library(DAAG)
## Loading required package: lattice
## 
## Attaching package: 'DAAG'
## The following object is masked from 'package:MASS':
## 
##     hills

(a) 9.10 b

Data=read.table("http://users.stat.ufl.edu/~rrandles/sta4210/Rclassnotes/data/textdatasets/KutnerData/Chapter%20%209%20Data%20Sets/CH09PR10.txt")
names(Data) = c("y", "x1", "x2", "x3", "x4")   # y = job proficiency score; x1-x4 = scores on tests 1-4
n=nrow(Data)
cor(Data)
##            y        x1        x2        x3        x4
## y  1.0000000 0.5144107 0.4970057 0.8970645 0.8693865
## x1 0.5144107 1.0000000 0.1022689 0.1807692 0.3266632
## x2 0.4970057 0.1022689 1.0000000 0.5190448 0.3967101
## x3 0.8970645 0.1807692 0.5190448 1.0000000 0.7820385
## x4 0.8693865 0.3266632 0.3967101 0.7820385 1.0000000
plot(Data)

It appears that all four predictors are linearly associated with y. From both outputs, we might suspect some multicollinearity between x2 and x3 (correlation of .519) and a larger multicollinearity issue between x3 and x4 (correlation of .782); the scatterplots show the same pattern.
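As an optional check that goes beyond what the exercise asks for, the suspected multicollinearity can be quantified with hand-computed variance inflation factors, \(VIF_j = 1/(1 - R^2_j)\), where \(R^2_j\) comes from regressing predictor j on the other three. A minimal sketch using the Data frame loaded above:

#Optional multicollinearity check: hand-computed VIFs for x1-x4
predictors <- c("x1", "x2", "x3", "x4")
vif_manual <- sapply(predictors, function(v) {
  r2 <- summary(lm(reformulate(setdiff(predictors, v), response = v), data = Data))$r.squared
  1 / (1 - r2)
})
round(vif_manual, 2)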

(a) 9.10 c

fit_f = lm(y~x1+x2+x3+x4,data=Data)
summary(fit_f)
## 
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4, data = Data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.9779 -3.4506  0.0941  2.4749  5.9959 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -124.38182    9.94106 -12.512 6.48e-11 ***
## x1             0.29573    0.04397   6.725 1.52e-06 ***
## x2             0.04829    0.05662   0.853  0.40383    
## x3             1.30601    0.16409   7.959 1.26e-07 ***
## x4             0.51982    0.13194   3.940  0.00081 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.099 on 20 degrees of freedom
## Multiple R-squared:  0.9629, Adjusted R-squared:  0.9555 
## F-statistic: 129.7 on 4 and 20 DF,  p-value: 5.262e-14

From the summary of the full model, I suspect that we need all predictors except test 2 (x2): x1, x3, and x4 are all highly significant, while x2 is not (p = 0.404). A partial F-test for dropping x2 is sketched below.
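As a quick sketch that goes beyond what the exercise asks, the suspicion about x2 can be checked with an extra-sum-of-squares (partial) F-test comparing the model without x2 to the full model; since only one coefficient is dropped, its p-value equals the t-test p-value for x2 above (0.404).

#Partial F-test for dropping x2 from the full model
anova(lm(y~x1+x3+x4, data=Data), fit_f)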

(b) \(R^2_{a,p}\)

#All possible combinations
fit_n = lm(y~1,data=Data)
fit_x1 = lm(y~x1, data=Data)
fit_x2 = lm(y~x2, data=Data)
fit_x3 = lm(y~x3, data=Data)
fit_x4 = lm(y~x4, data=Data)
fit_x1x2 = lm(y~x1+x2, data=Data)
fit_x1x3 = lm(y~x1+x3, data=Data)
fit_x1x4 = lm(y~x1+x4, data=Data)
fit_x2x3 = lm(y~x2+x3, data=Data)
fit_x2x4 = lm(y~x2+x4, data=Data)
fit_x3x4 = lm(y~x3+x4, data=Data)
fit_x1x2x3 = lm(y~x1+x2+x3, data=Data)
fit_x1x2x4 = lm(y~x1+x2+x4, data=Data)
fit_x1x3x4 = lm(y~x1+x3+x4, data=Data)
fit_x2x3x4 = lm(y~x2+x3+x4, data=Data)
fit_f = lm(y~x1+x2+x3+x4,data=Data)

#1 variable
Rsq <- summary(fit_x1)$r.squared;Rsq
## [1] 0.2646184
AdjRsq <- summary(fit_x1)$adj.r.squared;AdjRsq
## [1] 0.2326452
Rsq <- summary(fit_x2)$r.squared;Rsq
## [1] 0.2470147
AdjRsq <- summary(fit_x2)$adj.r.squared;AdjRsq
## [1] 0.2142762
Rsq <- summary(fit_x3)$r.squared;Rsq
## [1] 0.8047247
AdjRsq <- summary(fit_x3)$adj.r.squared;AdjRsq
## [1] 0.7962344
Rsq <- summary(fit_x4)$r.squared;Rsq
## [1] 0.7558329
AdjRsq <- summary(fit_x4)$adj.r.squared;AdjRsq
## [1] 0.745217

So the best one-variable model is y~x3 (fit_x3).

2 Variables

AdjRsq <- summary(fit_x1x2)$adj.r.squared;AdjRsq
## [1] 0.4154853
AdjRsq <- summary(fit_x1x3)$adj.r.squared;AdjRsq
## [1] 0.9269043
AdjRsq <- summary(fit_x1x4)$adj.r.squared;AdjRsq
## [1] 0.7984716
AdjRsq <- summary(fit_x2x3)$adj.r.squared;AdjRsq
## [1] 0.7884436
AdjRsq <- summary(fit_x2x4)$adj.r.squared;AdjRsq
## [1] 0.7635916
AdjRsq <- summary(fit_x3x4)$adj.r.squared;AdjRsq
## [1] 0.8660988

So the best two-variable model is y~x1+x3 (fit_x1x3).

3 Variables

AdjRsq <- summary(fit_x1x2x3)$adj.r.squared;AdjRsq
## [1] 0.9246779
AdjRsq <- summary(fit_x1x2x4)$adj.r.squared;AdjRsq
## [1] 0.8232664
AdjRsq <- summary(fit_x1x3x4)$adj.r.squared;AdjRsq
## [1] 0.9560482
AdjRsq <- summary(fit_x2x3x4)$adj.r.squared;AdjRsq
## [1] 0.8616797

So the best three-variable model is y~x1+x3+x4 (fit_x1x3x4).

#Comparing the best model of each size: the null model, 1 variable, 2 variables, 3 variables, and all 4
AdjRsq <- summary(fit_n)$adj.r.squared;AdjRsq
## [1] 0
AdjRsq <- summary(fit_x3)$adj.r.squared;AdjRsq
## [1] 0.7962344
AdjRsq <- summary(fit_x1x3)$adj.r.squared;AdjRsq
## [1] 0.9269043
AdjRsq <- summary(fit_x1x3x4)$adj.r.squared;AdjRsq
## [1] 0.9560482
AdjRsq <- summary(fit_f)$adj.r.squared;AdjRsq
## [1] 0.9554702

The highest adjusted \(R^2\) is .956, which comes from the model with tests 1, 3, and 4. It appears that test 2 should be dropped from the model, as we suspected at the start.

Doing the same thing, but in a much quicker way:

ols_step_all_possible(fit_f)   # R-square, adjusted R-square, and Mallows' Cp for every subset
## # A tibble: 15 x 6
##    Index     N Predictors  `R-Square` `Adj. R-Square` `Mallow's Cp`
##  * <int> <int> <chr>            <dbl>           <dbl>         <dbl>
##  1     1     1 x3               0.805           0.796         84.2 
##  2     2     1 x4               0.756           0.745        111.  
##  3     3     1 x1               0.265           0.233        375.  
##  4     4     1 x2               0.247           0.214        385.  
##  5     5     2 x1 x3            0.933           0.927         17.1 
##  6     6     2 x3 x4            0.877           0.866         47.2 
##  7     7     2 x1 x4            0.815           0.798         80.6 
##  8     8     2 x2 x3            0.806           0.788         85.5 
##  9     9     2 x2 x4            0.783           0.764         97.8 
## 10    10     2 x1 x2            0.464           0.415        270.  
## 11    11     3 x1 x3 x4         0.962           0.956          3.73
## 12    12     3 x1 x2 x3         0.934           0.925         18.5 
## 13    13     3 x2 x3 x4         0.879           0.862         48.2 
## 14    14     3 x1 x2 x4         0.845           0.823         66.3 
## 15    15     4 x1 x2 x3 x4      0.963           0.955          5
k <- ols_step_all_possible(fit_f)
plot(k)

As shown above, using adjusted \(R^2\) (which we maximize), the best one-variable model is y~x3, the best two-variable model is y~x1+x3, and the best three-variable model is y~x1+x3+x4.
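The leaps package loaded above can produce the same ranking of best subsets by size; a minimal sketch:

#Best subset of each size via leaps::regsubsets
best_sub <- regsubsets(y ~ x1 + x2 + x3 + x4, data = Data, nbest = 1)
best_sum <- summary(best_sub)
cbind(best_sum$which, adjr2 = round(best_sum$adjr2, 4), cp = round(best_sum$cp, 2))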

(b) Mallows’ \(C_p\)

Using Mallows' \(C_p\) (which we minimize, looking for \(C_p \approx p\), the number of parameters): the best one-variable model, y~x3, has \(C_2 = 84.25\) (too high for p = 2); the best two-variable model, y~x1+x3, has \(C_3 = 17.11\) (still too high); the best three-variable model, y~x1+x3+x4, has \(C_4 = 3.73 \approx 4\), which is good; and the full model, y~x1+x2+x3+x4, has \(C_5 = 5\) exactly.
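As a check (a sketch, not required by the exercise), \(C_p = SSE_p/MSE_{full} - (n - 2p)\) can be verified by hand for the three-variable model:

#Hand check of Mallows' Cp for the x1+x3+x4 model (p = 4 parameters)
sse_p <- sum(resid(fit_x1x3x4)^2)   # SSE of the subset model, 348.197
mse_full <- sigma(fit_f)^2          # MSE of the full four-predictor model
sse_p / mse_full - (n - 2*4)        # approximately 3.73, matching the table above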

(b) AIC

attach(Data)
k <- as.data.frame(k)
sse1 = (1-k$rsquare)*var(y)*(n-1)        # SSE_p = (1 - R^2) * SSTO
aic1 <- k$aic-n*log(2*pi)-n-2            # rescale olsrr's AIC to the textbook AIC_p = n*ln(SSE_p) - n*ln(n) + 2p
bic1 <- aic1 + (log(n)-2)*(k[,2]+1)      # convert AIC_p to the textbook SBC/BIC_p (penalty of log(n) per parameter)
press1 <- (1-k$predrsq)*var(y)*(n-1)     # PRESS_p = (1 - predicted R^2) * SSTO
k1 <- cbind(k,aic1,bic1,press1,sse1)
k1
##    mindex n  predictors   rsquare      adjr   predrsq         cp      aic
## 3       1 1          x3 0.8047247 0.7962344 0.7719685  84.246496 183.4155
## 4       2 1          x4 0.7558329 0.7452170 0.7185073 110.597414 189.0015
## 1       3 1          x1 0.2646184 0.2326452 0.1394302 375.344689 216.5649
## 2       4 1          x2 0.2470147 0.2142762 0.1173960 384.832454 217.1563
## 6       5 2       x1 x3 0.9329956 0.9269043 0.9159516  17.112978 158.6741
## 10      6 2       x3 x4 0.8772573 0.8660988 0.8398940  47.153985 173.8075
## 7       7 2       x1 x4 0.8152656 0.7984716 0.7669652  80.565307 184.0282
## 8       8 2       x2 x3 0.8060733 0.7884436 0.7562794  85.519650 185.2422
## 9       9 2       x2 x4 0.7832923 0.7635916 0.7247849  97.797790 188.0189
## 5      10 2       x1 x2 0.4641948 0.4154853 0.2882658 269.780029 210.6495
## 13     11 3    x1 x3 x4 0.9615422 0.9560482 0.9479289   3.727399 146.7942
## 11     12 3    x1 x2 x3 0.9340931 0.9246779 0.9082006  18.521465 160.2613
## 14     13 3    x2 x3 x4 0.8789698 0.8616797 0.8265340  48.231020 175.4562
## 12     14 3    x1 x2 x4 0.8453581 0.8232664 0.7917114  66.346500 181.5830
## 15     15 4 x1 x2 x3 x4 0.9628918 0.9554702 0.9426785   5.000000 147.9011
##         sbic      sbc      msep       fpe        apc        hsp      aic1
## 3  108.25599 187.0721  83.57926  83.02020 0.22923626  3.4941163 110.46853
## 4  113.50220 192.6581 104.50528 103.80625 0.28663091  4.3689499 116.05459
## 1  140.11466 220.2216 314.74869 312.64335 0.86327410 13.1583900 143.61801
## 2  140.69464 220.8130 322.28319 320.12745 0.88393929 13.4733777 144.20941
## 6   85.69145 163.5496  31.40962  30.88438 0.08527827  1.3131114  85.72721
## 10  98.35397 178.6830  57.53809  56.57591 0.15621801  2.4054385 100.86053
## 7  107.46607 188.9037  86.59792  85.14979 0.23511651  3.6203144 111.08125
## 8  108.57275 190.1177  90.90701  89.38683 0.24681584  3.8004603 112.29528
## 9  111.12083 192.8944 101.58604  99.88728 0.27580981  4.2469082 115.07201
## 5  132.55326 215.5250 251.16932 246.96916 0.68193384 10.5003895 137.70254
## 13  77.41140 152.8886  19.83065  19.23374 0.05310840  0.8290405  73.84732
## 11  86.76926 166.3556  33.98466  32.96171 0.09101423  1.4207635  87.31433
## 14  98.81455 181.5506  62.40886  60.53034 0.16713700  2.6090663 102.50928
## 12 104.05556 187.6774  79.74060  77.34038 0.21355306  3.3336370 108.63607
## 15  79.32921 155.2144  21.14890  20.15865 0.05566228  0.8841514  74.95421
##         bic1    press1      sse1
## 3  112.90629 2064.5976 1768.0228
## 4  118.49234 2548.6349 2210.6887
## 1  146.05576 7791.5994 6658.1453
## 2  146.64717 7991.0964 6817.5291
## 6   89.38384  760.9744  606.6574
## 10 104.51716 1449.6001 1111.3126
## 7  114.73788 2109.8967 1672.5853
## 8  115.95191 2206.6460 1755.8127
## 9  118.72864 2491.7979 1962.0716
## 5  141.35916 6444.0411 4851.1799
## 13  78.72282  471.4520  348.1970
## 11  92.18984  831.1521  596.7207
## 14 107.38479 1570.5610 1095.8078
## 12 113.51157 1885.8454 1400.1275
## 15  81.04859  518.9885  335.9775

We want to minimize AIC. The best one-variable model, y~x3, has AIC = 183.42 (AIC1 = 110.47); the best two-variable model, y~x1+x3, has AIC = 158.67 (AIC1 = 85.73); the best three-variable model, y~x1+x3+x4, has AIC = 146.79 (AIC1 = 73.85); and the full model, y~x1+x2+x3+x4, has AIC = 147.90 (AIC1 = 74.95). (AIC here is olsrr's value and AIC1 is the textbook-scale value computed above; the two differ only by a constant, so they rank the models identically.)
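As a quick check, the textbook formula \(AIC_p = n\ln(SSE_p) - n\ln(n) + 2p\) reproduces the AIC1 value for the best three-variable model (its SSE, 348.197, is the sse1 value in the table above):

#Textbook AIC_p for the x1+x3+x4 model (p = 4 parameters)
n*log(348.197) - n*log(n) + 2*4    # approximately 73.85, matching aic1 above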

(b) BIC

We want to minimize BIC. The best one-variable model, y~x3, has BIC = 112.91; the best two-variable model, y~x1+x3, has BIC = 89.38; the best three-variable model, y~x1+x3+x4, has BIC = 78.72; and the full model, y~x1+x2+x3+x4, has BIC = 81.05.
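The textbook \(SBC_p = n\ln(SSE_p) - n\ln(n) + p\ln(n)\) can be checked the same way for the three-variable model:

#Textbook SBC/BIC_p for the x1+x3+x4 model (p = 4 parameters)
n*log(348.197) - n*log(n) + 4*log(n)    # approximately 78.72, matching bic1 above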

(b) PRESS

We want to minimize PRESS. The best one-variable model, y~x3, has PRESS = 2064.60; the best two-variable model, y~x1+x3, has PRESS = 760.97; the best three-variable model, y~x1+x3+x4, has PRESS = 471.45; and the full model, y~x1+x2+x3+x4, has PRESS = 518.99.
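PRESS can also be computed directly from the hat values as the sum of squared deleted residuals; a minimal sketch for the three-variable model:

#PRESS = sum of squared deleted residuals e_i / (1 - h_ii)
sum((resid(fit_x1x3x4) / (1 - hatvalues(fit_x1x3x4)))^2)    # approximately 471.45, matching the table above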

(c) 9.18 a

ols_step_forward_p(fit_f,pent=.05)
## Forward Selection Method    
## ---------------------------
## 
## Candidate Terms: 
## 
## 1. x1 
## 2. x2 
## 3. x3 
## 4. x4 
## 
## We are selecting variables based on p value...
## 
## Variables Entered: 
## 
## - x3 
## - x1 
## - x4 
## 
## No more variables to be added.
## 
## Final Model Output 
## ------------------
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.981       RMSE                4.072 
## R-Squared               0.962       Coef. Var           4.416 
## Adj. R-Squared          0.956       MSE                16.581 
## Pred R-Squared          0.948       MAE                 3.096 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                 ANOVA                                 
## ---------------------------------------------------------------------
##                 Sum of                                               
##                Squares        DF    Mean Square       F         Sig. 
## ---------------------------------------------------------------------
## Regression    8705.803         3       2901.934    175.018    0.0000 
## Residual       348.197        21         16.581                      
## Total         9054.000        24                                     
## ---------------------------------------------------------------------
## 
##                                       Parameter Estimates                                       
## -----------------------------------------------------------------------------------------------
##       model        Beta    Std. Error    Std. Beta       t        Sig        lower       upper 
## -----------------------------------------------------------------------------------------------
## (Intercept)    -124.200         9.874                 -12.578    0.000    -144.734    -103.666 
##          x3       1.357         0.152        0.619      8.937    0.000       1.041       1.673 
##          x1       0.296         0.044        0.310      6.784    0.000       0.205       0.387 
##          x4       0.517         0.131        0.284      3.948    0.001       0.245       0.790 
## -----------------------------------------------------------------------------------------------
## 
##                             Selection Summary                             
## -------------------------------------------------------------------------
##         Variable                  Adj.                                       
## Step    Entered     R-Square    R-Square     C(p)        AIC        RMSE     
## -------------------------------------------------------------------------
##    1    x3            0.8047      0.7962    84.2465    183.4155    8.7676    
##    2    x1            0.9330      0.9269    17.1130    158.6741    5.2512    
##    3    x4            0.9615      0.9560     3.7274    146.7942    4.0720    
## -------------------------------------------------------------------------
ols_step_forward_p(fit_f,pent=.1)
## Forward Selection Method    
## ---------------------------
## 
## Candidate Terms: 
## 
## 1. x1 
## 2. x2 
## 3. x3 
## 4. x4 
## 
## We are selecting variables based on p value...
## 
## Variables Entered: 
## 
## - x3 
## - x1 
## - x4 
## 
## No more variables to be added.
## 
## Final Model Output 
## ------------------
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.981       RMSE                4.072 
## R-Squared               0.962       Coef. Var           4.416 
## Adj. R-Squared          0.956       MSE                16.581 
## Pred R-Squared          0.948       MAE                 3.096 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                 ANOVA                                 
## ---------------------------------------------------------------------
##                 Sum of                                               
##                Squares        DF    Mean Square       F         Sig. 
## ---------------------------------------------------------------------
## Regression    8705.803         3       2901.934    175.018    0.0000 
## Residual       348.197        21         16.581                      
## Total         9054.000        24                                     
## ---------------------------------------------------------------------
## 
##                                       Parameter Estimates                                       
## -----------------------------------------------------------------------------------------------
##       model        Beta    Std. Error    Std. Beta       t        Sig        lower       upper 
## -----------------------------------------------------------------------------------------------
## (Intercept)    -124.200         9.874                 -12.578    0.000    -144.734    -103.666 
##          x3       1.357         0.152        0.619      8.937    0.000       1.041       1.673 
##          x1       0.296         0.044        0.310      6.784    0.000       0.205       0.387 
##          x4       0.517         0.131        0.284      3.948    0.001       0.245       0.790 
## -----------------------------------------------------------------------------------------------
## 
##                             Selection Summary                             
## -------------------------------------------------------------------------
##         Variable                  Adj.                                       
## Step    Entered     R-Square    R-Square     C(p)        AIC        RMSE     
## -------------------------------------------------------------------------
##    1    x3            0.8047      0.7962    84.2465    183.4155    8.7676    
##    2    x1            0.9330      0.9269    17.1130    158.6741    5.2512    
##    3    x4            0.9615      0.9560     3.7274    146.7942    4.0720    
## -------------------------------------------------------------------------

Final Model: \[\widehat{\text{Job Proficiency}} = -124.2 + 0.296\,\text{Test1} + 1.357\,\text{Test3} + 0.517\,\text{Test4}\]

\[\hat{Y} = -124.2 + 0.296x_1 + 1.357x_3 + 0.517x_4\]
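As a usage sketch, the final model can be used to predict job proficiency for a new applicant; the test scores below are purely illustrative values, not taken from the data.

#Prediction for a hypothetical applicant (illustrative scores only)
predict(fit_x1x3x4, newdata = data.frame(x1 = 100, x3 = 110, x4 = 100),
        interval = "prediction")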

(d) Backward Elimination by AIC

ols_step_backward_aic(fit_f)
## Backward Elimination Method 
## ---------------------------
## 
## Candidate Terms: 
## 
## 1 . x1 
## 2 . x2 
## 3 . x3 
## 4 . x4 
## 
## 
## Variables Removed: 
## 
## - x2 
## 
## No more variables to be removed.
## 
## 
##                      Backward Elimination Summary                     
## --------------------------------------------------------------------
## Variable        AIC        RSS       Sum Sq      R-Sq      Adj. R-Sq 
## --------------------------------------------------------------------
## Full Model    147.901    335.978    8718.022    0.96289      0.95547 
## x2            146.794    348.197    8705.803    0.96154      0.95605 
## --------------------------------------------------------------------

This yields the same model as in part (c). The same backward search can be cross-checked with base R's step(), sketched below.
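As a cross-check sketch, the same backward search can be run with base R's step(); note that step() reports AIC on a different scale than olsrr (the two differ only by a constant), so it ranks the models identically.

#Backward elimination using base R's step()
step(fit_f, direction = "backward")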

(e) 9.21

press(fit_x1x3x4)
## [1] 471.452
SSE <- sigma(fit_x1x3x4)^2*(n-4);SSE   # residual df = n - 4 (intercept + 3 predictors)
## [1] 348.197

PRESS (471.45) is noticeably larger than SSE (348.20); this suggests that MSE somewhat understates the true prediction error and is not a fully valid indicator of the predictive ability of the fitted model.

(f) 9.22 a

Data2=read.table("http://users.stat.ufl.edu/~rrandles/sta4210/Rclassnotes/data/textdatasets/KutnerData/Chapter%20%209%20Data%20Sets/CH09PR22.txt")
names(Data2) = c("y", "x1", "x2", "x3", "x4")   # y = job proficiency; x1-x4 = test scores (validation data)
n=nrow(Data2)
cor(Data)
##            y        x1        x2        x3        x4
## y  1.0000000 0.5144107 0.4970057 0.8970645 0.8693865
## x1 0.5144107 1.0000000 0.1022689 0.1807692 0.3266632
## x2 0.4970057 0.1022689 1.0000000 0.5190448 0.3967101
## x3 0.8970645 0.1807692 0.5190448 1.0000000 0.7820385
## x4 0.8693865 0.3266632 0.3967101 0.7820385 1.0000000
cor(Data2)
##            y         x1         x2        x3        x4
## y  1.0000000 0.53707787 0.34477442 0.8880519 0.8879388
## x1 0.5370779 1.00000000 0.01057088 0.1772891 0.3196395
## x2 0.3447744 0.01057088 1.00000000 0.3437441 0.2207638
## x3 0.8880519 0.17728907 0.34374413 1.0000000 0.8714466
## x4 0.8879388 0.31963945 0.22076377 0.8714466 1.0000000
plot(Data)

plot(Data2)

The two correlation matrices differ noticeably in the entries involving test 2 (x2), but the remaining correlations are quite similar between the training and validation data; see the difference matrix sketched below.
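A sketch of the element-wise difference between the two correlation matrices makes the comparison easier to scan:

#Difference between training and validation correlation matrices
round(cor(Data) - cor(Data2), 3)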

(f) 9.22 b

fit_x1x3x4v = lm(y~x1+x3+x4, data=Data2)
fit_x1x3x4
## 
## Call:
## lm(formula = y ~ x1 + x3 + x4, data = Data)
## 
## Coefficients:
## (Intercept)           x1           x3           x4  
##   -124.2000       0.2963       1.3570       0.5174
fit_x1x3x4v
## 
## Call:
## lm(formula = y ~ x1 + x3 + x4, data = Data2)
## 
## Coefficients:
## (Intercept)           x1           x3           x4  
##   -122.7671       0.3124       1.4068       0.4284

The estimates of all four regression coefficients (the intercept and the slopes for x1, x3, and x4) are relatively close for the training data and the validation data; a side-by-side comparison is sketched below.
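A minimal side-by-side sketch of the two sets of estimates:

#Training vs validation coefficient estimates
round(cbind(training = coef(fit_x1x3x4), validation = coef(fit_x1x3x4v)), 4)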

sigma(fit_x1x3x4)^2                      # MSE       
## [1] 16.58081
sigma(fit_x1x3x4v)^2
## [1] 18.35493
summary(fit_x1x3x4)
## 
## Call:
## lm(formula = y ~ x1 + x3 + x4, data = Data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.4579 -3.1563 -0.2057  1.8070  6.6083 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -124.20002    9.87406 -12.578 3.04e-11 ***
## x1             0.29633    0.04368   6.784 1.04e-06 ***
## x3             1.35697    0.15183   8.937 1.33e-08 ***
## x4             0.51742    0.13105   3.948 0.000735 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.072 on 21 degrees of freedom
## Multiple R-squared:  0.9615, Adjusted R-squared:  0.956 
## F-statistic:   175 on 3 and 21 DF,  p-value: 5.16e-15
summary(fit_x1x3x4v)
## 
## Call:
## lm(formula = y ~ x1 + x3 + x4, data = Data2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.4619 -2.3836  0.6834  2.1123  7.2394 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -122.76705   11.84783 -10.362 1.04e-09 ***
## x1             0.31238    0.04729   6.605 1.54e-06 ***
## x3             1.40676    0.23262   6.048 5.31e-06 ***
## x4             0.42838    0.19749   2.169   0.0417 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.284 on 21 degrees of freedom
## Multiple R-squared:  0.9489, Adjusted R-squared:  0.9416 
## F-statistic:   130 on 3 and 21 DF,  p-value: 1.017e-13

The MSE is similar for the two data sets (16.58 for the training data vs. 18.35 for the validation data), and the \(R^2\) is also close (0.962 vs. 0.949).

From all this, we can say that the fit to the validation set is reasonably consistent with the fit to the training (model-building) data set.

(f) 9.22 c

fitY_t1 <- predict(fit_x1x3x4, newdata = Data2)   # predictions from the training fit on the validation data
MSPR <-mean((Data2$y-fitY_t1)^2); MSPR  #MSPR
## [1] 15.70972
sigma(fit_x1x3x4)^2                 #MSE       
## [1] 16.58081

MSPR (15.71) is close to MSE (16.58), so this does NOT indicate a substantial bias problem; the model predicts about as well on the validation data as on the training data. However, this differs somewhat from our conclusion in 9.21.

(f) 9.22 d

total <- rbind(Data, Data2)
names(total) = c("y", "x1", "x2", "x3", "x4")   # combined training + validation data (same column names)
n=nrow(total)
fit_x1x3x4t = lm(y~x1+x3+x4, data=total)
summary(fit_x1x3x4)
## 
## Call:
## lm(formula = y ~ x1 + x3 + x4, data = Data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.4579 -3.1563 -0.2057  1.8070  6.6083 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -124.20002    9.87406 -12.578 3.04e-11 ***
## x1             0.29633    0.04368   6.784 1.04e-06 ***
## x3             1.35697    0.15183   8.937 1.33e-08 ***
## x4             0.51742    0.13105   3.948 0.000735 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.072 on 21 degrees of freedom
## Multiple R-squared:  0.9615, Adjusted R-squared:  0.956 
## F-statistic:   175 on 3 and 21 DF,  p-value: 5.16e-15
summary(fit_x1x3x4t)
## 
## Call:
## lm(formula = y ~ x1 + x3 + x4, data = total)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.7192 -2.7369  0.1278  2.0971  7.0657 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -123.44104    7.16508 -17.228  < 2e-16 ***
## x1             0.30364    0.03072   9.886 5.86e-13 ***
## x3             1.36906    0.12280  11.148 1.15e-14 ***
## x4             0.48735    0.10475   4.652 2.79e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.006 on 46 degrees of freedom
## Multiple R-squared:  0.9567, Adjusted R-squared:  0.9539 
## F-statistic: 338.9 on 3 and 46 DF,  p-value: < 2.2e-16

When the two data sets are combined, the standard errors of all the regression coefficients are noticeably smaller than in the fit to the training (model-building) data alone, while the point estimates remain essentially unchanged; a side-by-side comparison of the standard errors is sketched below.
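A minimal sketch comparing the standard errors from the training fit and the combined fit:

#Standard errors: training fit vs combined (training + validation) fit
round(cbind(training = summary(fit_x1x3x4)$coefficients[, "Std. Error"],
            combined  = summary(fit_x1x3x4t)$coefficients[, "Std. Error"]), 4)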