patient=read.table("C:/Users/tusha/OneDrive/Documents/Columbia Fall 2018 Courses/Linear Regression Models/Data Sets/Chapter  6 Data Sets/CH06PR15.txt")

names(patient) = c("y", "x1", "x2", "x3")

9.9 (a)

#Evaluate all 2^P subsets of the columns of X and compute model-selection
#criteria (SSE, R2, R2a, Cp, AIC, BIC, PRESS) for each subset.
bestsubset <- function(X, y){
  P = ncol(X)
  #each row of 'subsets' is a 0/1 indicator of which predictors enter the model
  subsets = expand.grid( rep( list( 0:1), P))
  names(subsets)=paste('X',1:P,sep='')
  stat = NULL
  SST = sum((y-mean(y))^2)
  
  #MSE of the full model, used in the denominator of Mallows' Cp
  fitall = lm(y~X)
  n = nrow(X)
  MSEP = deviance(fitall)/(n-P-1)
  for(i in 1:nrow(subsets)){
    subs = which(subsets[i,]>0)
    if(length(subs)==0) fit = lm(y~1)
    else {
      subX = X[,which(subsets[i,]>0)]
      fit = lm(y~subX)
    }
    p = length(subs)+1  #number of parameters, including the intercept
    SSE = deviance(fit)
    R2 = 1-SSE/SST
    R2a = 1-SSE/SST*(n-1)/(n-p)
    Cp = SSE/MSEP - (n-2*p)
    AIC = n*log(SSE)-n*log(n)+2*p
    BIC = n*log(SSE)-n*log(n)+log(n)*p
    
    #PRESS = sum of squared deleted residuals e_i/(1 - h_ii),
    #where h_ii is the i-th diagonal element of the hat matrix
    X1 = as.matrix(cbind(1,X[,subs]))
    hatMat = X1%*%solve(t(X1)%*%X1)%*%t(X1)
    eList = fit$residuals
    dList = eList/(1-diag(hatMat))
    PRESS = sum(dList^2)
    
    
    criList = unlist(c(length(subs)+1, subsets[i,], SSE, R2, R2a, Cp, AIC, BIC, PRESS))
    
    stat=rbind(stat,criList)
    
  }
  rownames(stat)=NULL
  colnames(stat)=c('p',names(subsets),'SSE','R2','R2a','Cp','AIC','BIC','PRESS')
  
  
  #row index of the best model under each criterion
  model = NULL
  model$R2 = which.max(stat[,P+3])
  model$R2a = which.max(stat[,P+4])
  model$Cp = which.min(stat[,P+5])
  model$AIC = which.min(stat[,P+6])
  model$BIC = which.min(stat[,P+7])
  model$PRESS = which.min(stat[,P+8])
  return(list(model=model, stat=stat))
}

X = cbind(patient$x1, patient$x2, patient$x3)
y = patient$y

subset = bestsubset(X,y)
subset$stat
##      p X1 X2 X3 SSE      R2            R2a           Cp       AIC     
## [1,] 1 0  0  0  13369.3  -2.220446e-16 -2.220446e-16 88.15623 262.9155
## [2,] 2 1  0  0  5093.915 0.6189843     0.6103248     8.353606 220.5294
## [3,] 2 0  1  0  8509.044 0.3635387     0.3490737     42.11232 244.1312
## [4,] 3 1  1  0  4613     0.6549559     0.6389073     5.599735 217.9676
## [5,] 2 0  0  1  7814.391 0.4154975     0.4022134     35.24564 240.2137
## [6,] 3 1  0  1  4330.5   0.6760864     0.6610206     2.807204 215.0607
## [7,] 3 0  1  1  7106.394 0.4684545     0.4437314     30.24706 237.845 
## [8,] 4 1  1  1  4248.841 0.6821943     0.6594939     4        216.185 
##      BIC      PRESS   
## [1,] 264.7441 13970.1 
## [2,] 224.1867 5569.562
## [3,] 247.7885 9254.489
## [4,] 223.4536 5235.192
## [5,] 243.871  8451.432
## [6,] 220.5466 4902.751
## [7,] 243.3309 8115.912
## [8,] 223.4995 5057.886
require(leaps)
## Loading required package: leaps
leaps = regsubsets(y~., nbest = 3, data = patient)
summary(leaps)
## Subset selection object
## Call: regsubsets.formula(y ~ ., nbest = 3, data = patient)
## 3 Variables  (and intercept)
##    Forced in Forced out
## x1     FALSE      FALSE
## x2     FALSE      FALSE
## x3     FALSE      FALSE
## 3 subsets of each size up to 3
## Selection Algorithm: exhaustive
##          x1  x2  x3 
## 1  ( 1 ) "*" " " " "
## 1  ( 2 ) " " " " "*"
## 1  ( 3 ) " " "*" " "
## 2  ( 1 ) "*" " " "*"
## 2  ( 2 ) "*" "*" " "
## 2  ( 3 ) " " "*" "*"
## 3  ( 1 ) "*" "*" "*"
plot(leaps, scale="adjr2")

plot(leaps, scale="bic")

plot(leaps, scale="Cp")

As seen from the results, row [6,] indicates that the variables x1 and x3 should be included in the model while the variable x2 should be discarded. This model has the largest \(R^2_{a,p}=0.661\) and the smallest values of \(C_p\), \(AIC_p\), \(BIC_p\), and \(PRESS_p\).
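
For reference, the bestsubset() helper also returns the row index of the winning model under each criterion; printing it reproduces what we read off the table (row 6 for \(R^2_{a,p}\), \(C_p\), \(AIC_p\), \(BIC_p\), and \(PRESS_p\), while \(R^2_p\) alone always favors the full model, row 8):

subset$model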

Reading the plots: the shading indicates whether a variable is included in a given model. For example, if we look at the BIC plot at the level of about -40, we see that (Intercept), x1, and x3 are shaded black, which means they are included in the model, whereas x2 is not shaded at all, so we can drop it from our model.

The plots for adjr2, bic, and Cp confirm our results. Note that plots based on \(AIC_p\) and \(PRESS_p\) would single out the same model as \(BIC_p\) here.
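
The numeric criteria behind these plots can also be pulled directly from the leaps summary object (a sketch; which, adjr2, cp, and bic are documented components of summary.regsubsets):

s.leaps = summary(leaps)
cbind(s.leaps$which, adjr2 = s.leaps$adjr2, cp = s.leaps$cp, bic = s.leaps$bic)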

9.9 (b)

Yes, the four criteria in part (a) identify the same best subset, {x1, x3}. This does not always happen, since the criteria penalize model size differently and can disagree.

9.9 (c)

Yes, forward stepwise regression would be a good screening procedure compared with the all-possible-regressions procedure: it is much faster because it does not have to evaluate all \(2^P\) subsets, and it can both add and drop variables at each step. It also reports the F-statistics and p-values for the candidate predictors, which is useful for judging their significance.
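
As a quick check (a sketch; method = "forward" is a documented option of regsubsets), forward selection via leaps picks the same sequence of variables as the exhaustive search on this data set:

fwd = regsubsets(y ~ ., data = patient, method = "forward")
summary(fwd)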


9.17 (a) Forward Stepwise Regression

library(MASS)
full = lm(y~., data=patient)
null = lm(y~1, data=patient)

#starting with empty model and adding variables
addterm(null, scope = full, test="F")
## Single term additions
## 
## Model:
## y ~ 1
##        Df Sum of Sq     RSS    AIC F Value     Pr(F)    
## <none>              13369.3 262.92                      
## x1      1    8275.4  5093.9 220.53  71.481 9.058e-11 ***
## x2      1    4860.3  8509.0 244.13  25.132 9.230e-06 ***
## x3      1    5554.9  7814.4 240.21  31.278 1.335e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#the model is still empty, so there is nothing to drop; variables can be added
#add the variable with the largest F value, provided F > 3.0 --> add x1
fsr = update(null, .~. + x1)

#check to see if terms can be added/dropped
addterm(fsr, scope = full, test="F")
## Single term additions
## 
## Model:
## y ~ x1
##        Df Sum of Sq    RSS    AIC F Value   Pr(F)   
## <none>              5093.9 220.53                   
## x2      1    480.92 4613.0 217.97  4.4828 0.04006 * 
## x3      1    763.42 4330.5 215.06  7.5804 0.00861 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
dropterm(fsr, test = "F")
## Single term deletions
## 
## Model:
## y ~ x1
##        Df Sum of Sq     RSS    AIC F Value     Pr(F)    
## <none>               5093.9 220.53                      
## x1      1    8275.4 13369.3 262.92  71.481 9.058e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#still no variables can be dropped (F = 71.5 > 2.9 for x1); we can add x3 since it has the largest F value > 3.0
fsr = update(fsr, .~. + x3)

#check to see if terms can be added/dropped
addterm(fsr, scope = full, test="F")
## Single term additions
## 
## Model:
## y ~ x1 + x3
##        Df Sum of Sq    RSS    AIC F Value  Pr(F)
## <none>              4330.5 215.06               
## x2      1    81.659 4248.8 216.19  0.8072 0.3741
dropterm(fsr, test = "F")
## Single term deletions
## 
## Model:
## y ~ x1 + x3
##        Df Sum of Sq    RSS    AIC F Value     Pr(F)    
## <none>              4330.5 215.06                      
## x1      1    3483.9 7814.4 240.21  34.594 5.434e-07 ***
## x3      1     763.4 5093.9 220.53   7.580   0.00861 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#No variables can be added or dropped at this point

Since no more variables can be added or dropped at this point, we conclude that the final model contains only the predictors x1 and x3. This is confirmed by the automated step() procedure below (which selects by AIC rather than by the F limits, which is why above we added and removed variables manually and compared the F values to the limits provided).

step(null, scope=list(upper=full, lower=null), direction='both', test = 'F', trace=TRUE)
## Start:  AIC=262.92
## y ~ 1
## 
##        Df Sum of Sq     RSS    AIC F value    Pr(>F)    
## + x1    1    8275.4  5093.9 220.53  71.481 9.058e-11 ***
## + x3    1    5554.9  7814.4 240.21  31.278 1.335e-06 ***
## + x2    1    4860.3  8509.0 244.13  25.132 9.230e-06 ***
## <none>              13369.3 262.92                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=220.53
## y ~ x1
## 
##        Df Sum of Sq     RSS    AIC F value    Pr(>F)    
## + x3    1     763.4  4330.5 215.06  7.5804   0.00861 ** 
## + x2    1     480.9  4613.0 217.97  4.4828   0.04006 *  
## <none>               5093.9 220.53                      
## - x1    1    8275.4 13369.3 262.92 71.4808 9.058e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=215.06
## y ~ x1 + x3
## 
##        Df Sum of Sq    RSS    AIC F value    Pr(>F)    
## <none>              4330.5 215.06                      
## + x2    1      81.7 4248.8 216.19  0.8072   0.37407    
## - x3    1     763.4 5093.9 220.53  7.5804   0.00861 ** 
## - x1    1    3483.9 7814.4 240.21 34.5935 5.434e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Call:
## lm(formula = y ~ x1 + x3, data = patient)
## 
## Coefficients:
## (Intercept)           x1           x3  
##      145.94        -1.20       -16.74

9.17 (b)

#implied level of significance for the F limit of 3.0, with 1 numerator df
#and n - p = 42 error df (as in the final step of part (a))
round(1 - pf(3, 1, 42), 3)
## [1] 0.091

The implied level of significance is approximately 0.09 for an F limit of 3.0 (it varies slightly from step to step as the error degrees of freedom change).
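
Since \(F(1,\nu) = t(\nu)^2\), the F limit of 3.0 is equivalent to a limit of \(\sqrt{3}\) on the absolute t-statistic of a candidate coefficient:

sqrt(3)
## [1] 1.732051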

9.17 (c) Forward Selection

#start with empty model
addterm(null, scope = full, test="F")
## Single term additions
## 
## Model:
## y ~ 1
##        Df Sum of Sq     RSS    AIC F Value     Pr(F)    
## <none>              13369.3 262.92                      
## x1      1    8275.4  5093.9 220.53  71.481 9.058e-11 ***
## x2      1    4860.3  8509.0 244.13  25.132 9.230e-06 ***
## x3      1    5554.9  7814.4 240.21  31.278 1.335e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#add variable with largest F value > 3.0 --> add x1
NewMod = update( null, .~. + x1 )
addterm(NewMod, scope = full, test="F")
## Single term additions
## 
## Model:
## y ~ x1
##        Df Sum of Sq    RSS    AIC F Value   Pr(F)   
## <none>              5093.9 220.53                   
## x2      1    480.92 4613.0 217.97  4.4828 0.04006 * 
## x3      1    763.42 4330.5 215.06  7.5804 0.00861 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#add variable with largest F value > 3.0 --> add x3
NewMod = update( NewMod, .~. + x3 )
addterm(NewMod, scope = full, test="F")
## Single term additions
## 
## Model:
## y ~ x1 + x3
##        Df Sum of Sq    RSS    AIC F Value  Pr(F)
## <none>              4330.5 215.06               
## x2      1    81.659 4248.8 216.19  0.8072 0.3741
#after adding x1 and x3, the F value for x2 is 0.8072, which is < 3.0, so we stop
#and select x1 and x3 as the final predictors in the model
summary(NewMod)
## 
## Call:
## lm(formula = y ~ x1 + x3, data = patient)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.4453  -7.3285   0.6733   8.5126  18.0534 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 145.9412    11.5251  12.663 4.21e-16 ***
## x1           -1.2005     0.2041  -5.882 5.43e-07 ***
## x3          -16.7421     6.0808  -2.753  0.00861 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.04 on 43 degrees of freedom
## Multiple R-squared:  0.6761, Adjusted R-squared:  0.661 
## F-statistic: 44.88 on 2 and 43 DF,  p-value: 2.98e-11

9.17 (d) Backward Elimination

#start with full model
dropterm(full, test = "F")
## Single term deletions
## 
## Model:
## y ~ x1 + x2 + x3
##        Df Sum of Sq    RSS    AIC F Value    Pr(F)    
## <none>              4248.8 216.19                     
## x1      1   2857.55 7106.4 237.84 28.2471 3.81e-06 ***
## x2      1     81.66 4330.5 215.06  0.8072  0.37407    
## x3      1    364.16 4613.0 217.97  3.5997  0.06468 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#drop variable with smallest F value < 2.9 --> drop variable x2
NewModbackward = update( full, .~. - x2)

#check to see if any more variables can be dropped
dropterm(NewModbackward, test = "F")
## Single term deletions
## 
## Model:
## y ~ x1 + x3
##        Df Sum of Sq    RSS    AIC F Value     Pr(F)    
## <none>              4330.5 215.06                      
## x1      1    3483.9 7814.4 240.21  34.594 5.434e-07 ***
## x3      1     763.4 5093.9 220.53   7.580   0.00861 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#all F values > 2.9 at this point so we stop and select the model without x2 as the final model
summary(NewModbackward)
## 
## Call:
## lm(formula = y ~ x1 + x3, data = patient)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.4453  -7.3285   0.6733   8.5126  18.0534 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 145.9412    11.5251  12.663 4.21e-16 ***
## x1           -1.2005     0.2041  -5.882 5.43e-07 ***
## x3          -16.7421     6.0808  -2.753  0.00861 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.04 on 43 degrees of freedom
## Multiple R-squared:  0.6761, Adjusted R-squared:  0.661 
## F-statistic: 44.88 on 2 and 43 DF,  p-value: 2.98e-11

9.17 (e)

The results from all three selection procedures are the same: the final model contains x1 and x3. The all-possible-regressions procedure in Problem 9.9 identified exactly the same subset, so the automatic search procedures agree with the exhaustive search on this data set.


11.6

data11.6 = read.table("C:/Users/tusha/OneDrive/Documents/Columbia Fall 2018 Courses/Linear Regression Models/Data Sets/Chapter 11 Data Sets/CH11PR06.txt")

x = data11.6[,2]
y = data11.6[,1]

#(a) OLS
lmfit = lm(y~x)

plot(x, y)
abline(lmfit)

resi = residuals(lmfit)
par(mfrow=c(1,2))
plot(x,resi)
abline(0,0)

#(c) plot the absolute residuals against x (drawn in the second panel set up above)
plot(x,abs(resi))

#(b) BF test: split the residuals into two groups by the level of x and
#compare the mean absolute deviations of the residuals from the group medians
ord = order(x)
e.ord = resi[ord]
g1 = e.ord[1:6]    #residuals for the 6 smallest x values
g2 = e.ord[7:12]   #residuals for the 6 largest x values
d1 = abs(g1 - median(g1))
d2 = abs(g2 - median(g2))
#pooled variance of the absolute deviations, with n - 2 = 10 df
s = sqrt((sum((d1-mean(d1))^2)+sum((d2-mean(d2))^2))/10)

tstar_bf <- abs((mean(d1)-mean(d2))/(s*sqrt(1/length(g1)+1/length(g2))))
tstar_bf
qt(0.975, 10)
## [1] 2.228139
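
For reference, the statistic computed above is

\[ t^*_{BF} = \frac{\bar{d}_1-\bar{d}_2}{s\sqrt{\frac{1}{n_1}+\frac{1}{n_2}}}, \qquad s^2=\frac{\sum_i (d_{i1}-\bar{d}_1)^2+\sum_i (d_{i2}-\bar{d}_2)^2}{n-2}, \]

where \(d_{ij}=|e_{ij}-\tilde{e}_j|\) and \(\tilde{e}_j\) is the median residual of group \(j\); under constant error variance, \(t^*_{BF}\) follows approximately the \(t\) distribution with \(n-2=10\) degrees of freedom.
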
#(d) regress absolute residuals on X
vfit = lm(abs(resi)~x)
abline(vfit)

#use the fitted values to get the weights
wlist = vfit$fitted^(-2)
wlist
##          1          2          3          4          5          6 
## 0.05517614 0.07665099 0.02607360 0.18556213 0.07665099 0.04767612 
##          7          8          9         10         11         12 
## 0.18556213 0.09242488 0.03662794 0.11362048 0.04160751 0.14304022
min(wlist)
## [1] 0.0260736
max(wlist)
## [1] 0.1855621
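
The weights come from the estimated standard deviation function: with \(\hat{s}_i\) the fitted values from regressing \(|e_i|\) on \(x\), each case receives weight \(w_i = 1/\hat{s}_i^2\), so cases with larger estimated error variance are downweighted.
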
#(e) use weighted least squares
wlfit = lm(y~x, weights=wlist)

#(f) compare the two fits
summary(lmfit)
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -6.389 -3.536 -0.334  3.319  6.418 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  19.4727     5.5162   3.530  0.00545 ** 
## x             3.2689     0.3651   8.955 4.33e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.598 on 10 degrees of freedom
## Multiple R-squared:  0.8891, Adjusted R-squared:  0.878 
## F-statistic: 80.19 on 1 and 10 DF,  p-value: 4.33e-06
summary(wlfit)
## 
## Call:
## lm(formula = y ~ x, weights = wlist)
## 
## Weighted Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.48741 -0.96167 -0.04198  1.10930  1.50265 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  17.3006     4.8277   3.584  0.00498 ** 
## x             3.4211     0.3703   9.238 3.27e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.159 on 10 degrees of freedom
## Multiple R-squared:  0.8951, Adjusted R-squared:  0.8846 
## F-statistic: 85.35 on 1 and 10 DF,  p-value: 3.269e-06
#(g) iterate: re-estimate the standard deviation function from the residuals
#of the weighted fit, then recompute the weights and refit
res = wlfit$residuals
vfit.new = lm(abs(res)~x)
wlist.new = vfit.new$fitted^(-2)

wlfit.new = lm(y~x, weights = wlist.new)
summary(wlfit.new)
  1. The plot of the residuals against the predictor values indicates possible nonconstant error variance: the spread of the residuals changes with the level of x.

  2. The BF test formalizes this check: it splits the cases into two non-overlapping groups by the level of x and tests whether the mean absolute deviations of the residuals from the group medians differ between the groups. Essentially we check the same thing as the residual plot in part (a), but with a formal test; a \(|t^*_{BF}|\) below qt(0.975, 10) = 2.228139 does not contradict constancy of the error variance.

  3. From the weights in (d), the 3rd case gets the smallest weight of 0.02607, while cases 4 and 7 get the largest weight of 0.18556.

  4. The weighted least squares estimates (b0 = 17.30, b1 = 3.421) differ from the OLS estimates found in (a) (b0 = 19.47, b1 = 3.269), but not dramatically.

  5. The estimated standard errors of the weighted least squares coefficients (4.83 for the intercept, 0.370 for the slope) are likewise close to those from OLS (5.52 and 0.365); the intercept is estimated somewhat more precisely under weighted least squares.

  6. After the iterative step in (g), we expect no substantial change in the estimated regression coefficients, which would mean the algorithm has converged. If there were a substantial change, we would iterate further until the estimates stabilize (see the sketch after this list).
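
If the coefficients did keep changing, the re-weighting could be wrapped in a loop until they stabilize. A minimal sketch, using the same absolute-residual regression as above (the tolerance 1e-6 is an arbitrary choice, and the fitted standard deviations are assumed to stay positive):

fit = lmfit
for (it in 1:10) {
  sfit = lm(abs(residuals(fit)) ~ x)  #re-estimate the standard deviation function
  w = fitted(sfit)^(-2)               #new weights
  newfit = lm(y ~ x, weights = w)
  if (max(abs(coef(newfit) - coef(fit))) < 1e-6) break  #stop when estimates stabilize
  fit = newfit
}
coef(fit)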