Loading the libraries and the dataset

# Setup: load the modelling packages and read the US crime data.
#
# The former `rm(list = ls())` call was dropped on purpose: it silently wipes
# whatever the caller has in the workspace, yet does not detach packages or
# reset options, so it does not give a clean session. Restart R for a fresh run.
# install.packages("MASS")
# install.packages("glmnet")
library(MASS)    # stepAIC() for stepwise model selection
library(glmnet)  # glmnet() / cv.glmnet() for lasso and elastic net

# NOTE(review): absolute, machine-specific path -- the script only runs where
# this exact file exists. Consider a project-relative path.
uscrime <- read.table(
  "/Users/helenalindsay/Documents/Fall_23/ISYE6501/hw7/uscrime.txt",
  header = TRUE
)

Stepwise Regression

# Stepwise selection in both directions, starting from the full model
# (Crime regressed on every other column) and scored with stepAIC()'s
# default AIC penalty. The search trace is printed as it runs.
stepwiseAIC <- stepAIC(
  object = lm(formula = Crime ~ ., data = uscrime),
  direction = "both"
)
## Start:  AIC=514.65
## Crime ~ M + So + Ed + Po1 + Po2 + LF + M.F + Pop + NW + U1 + 
##     U2 + Wealth + Ineq + Prob + Time
## 
##          Df Sum of Sq     RSS    AIC
## - So      1        29 1354974 512.65
## - LF      1      8917 1363862 512.96
## - Time    1     10304 1365250 513.00
## - Pop     1     14122 1369068 513.14
## - NW      1     18395 1373341 513.28
## - M.F     1     31967 1386913 513.74
## - Wealth  1     37613 1392558 513.94
## - Po2     1     37919 1392865 513.95
## <none>                1354946 514.65
## - U1      1     83722 1438668 515.47
## - Po1     1    144306 1499252 517.41
## - U2      1    181536 1536482 518.56
## - M       1    193770 1548716 518.93
## - Prob    1    199538 1554484 519.11
## - Ed      1    402117 1757063 524.86
## - Ineq    1    423031 1777977 525.42
## 
## Step:  AIC=512.65
## Crime ~ M + Ed + Po1 + Po2 + LF + M.F + Pop + NW + U1 + U2 + 
##     Wealth + Ineq + Prob + Time
## 
##          Df Sum of Sq     RSS    AIC
## - Time    1     10341 1365315 511.01
## - LF      1     10878 1365852 511.03
## - Pop     1     14127 1369101 511.14
## - NW      1     21626 1376600 511.39
## - M.F     1     32449 1387423 511.76
## - Po2     1     37954 1392929 511.95
## - Wealth  1     39223 1394197 511.99
## <none>                1354974 512.65
## - U1      1     96420 1451395 513.88
## + So      1        29 1354946 514.65
## - Po1     1    144302 1499277 515.41
## - U2      1    189859 1544834 516.81
## - M       1    195084 1550059 516.97
## - Prob    1    204463 1559437 517.26
## - Ed      1    403140 1758114 522.89
## - Ineq    1    488834 1843808 525.13
## 
## Step:  AIC=511.01
## Crime ~ M + Ed + Po1 + Po2 + LF + M.F + Pop + NW + U1 + U2 + 
##     Wealth + Ineq + Prob
## 
##          Df Sum of Sq     RSS    AIC
## - LF      1     10533 1375848 509.37
## - NW      1     15482 1380797 509.54
## - Pop     1     21846 1387161 509.75
## - Po2     1     28932 1394247 509.99
## - Wealth  1     36070 1401385 510.23
## - M.F     1     41784 1407099 510.42
## <none>                1365315 511.01
## - U1      1     91420 1456735 512.05
## + Time    1     10341 1354974 512.65
## + So      1        65 1365250 513.00
## - Po1     1    134137 1499452 513.41
## - U2      1    184143 1549458 514.95
## - M       1    186110 1551425 515.01
## - Prob    1    237493 1602808 516.54
## - Ed      1    409448 1774763 521.33
## - Ineq    1    502909 1868224 523.75
## 
## Step:  AIC=509.37
## Crime ~ M + Ed + Po1 + Po2 + M.F + Pop + NW + U1 + U2 + Wealth + 
##     Ineq + Prob
## 
##          Df Sum of Sq     RSS    AIC
## - NW      1     11675 1387523 507.77
## - Po2     1     21418 1397266 508.09
## - Pop     1     27803 1403651 508.31
## - M.F     1     31252 1407100 508.42
## - Wealth  1     35035 1410883 508.55
## <none>                1375848 509.37
## - U1      1     80954 1456802 510.06
## + LF      1     10533 1365315 511.01
## + Time    1      9996 1365852 511.03
## + So      1      3046 1372802 511.26
## - Po1     1    123896 1499744 511.42
## - U2      1    190746 1566594 513.47
## - M       1    217716 1593564 514.27
## - Prob    1    226971 1602819 514.54
## - Ed      1    413254 1789103 519.71
## - Ineq    1    500944 1876792 521.96
## 
## Step:  AIC=507.77
## Crime ~ M + Ed + Po1 + Po2 + M.F + Pop + U1 + U2 + Wealth + Ineq + 
##     Prob
## 
##          Df Sum of Sq     RSS    AIC
## - Po2     1     16706 1404229 506.33
## - Pop     1     25793 1413315 506.63
## - M.F     1     26785 1414308 506.66
## - Wealth  1     31551 1419073 506.82
## <none>                1387523 507.77
## - U1      1     83881 1471404 508.52
## + NW      1     11675 1375848 509.37
## + So      1      7207 1380316 509.52
## + LF      1      6726 1380797 509.54
## + Time    1      4534 1382989 509.61
## - Po1     1    118348 1505871 509.61
## - U2      1    201453 1588976 512.14
## - Prob    1    216760 1604282 512.59
## - M       1    309214 1696737 515.22
## - Ed      1    402754 1790276 517.74
## - Ineq    1    589736 1977259 522.41
## 
## Step:  AIC=506.33
## Crime ~ M + Ed + Po1 + M.F + Pop + U1 + U2 + Wealth + Ineq + 
##     Prob
## 
##          Df Sum of Sq     RSS    AIC
## - Pop     1     22345 1426575 505.07
## - Wealth  1     32142 1436371 505.39
## - M.F     1     36808 1441037 505.54
## <none>                1404229 506.33
## - U1      1     86373 1490602 507.13
## + Po2     1     16706 1387523 507.77
## + NW      1      6963 1397266 508.09
## + So      1      3807 1400422 508.20
## + LF      1      1986 1402243 508.26
## + Time    1       575 1403654 508.31
## - U2      1    205814 1610043 510.76
## - Prob    1    218607 1622836 511.13
## - M       1    307001 1711230 513.62
## - Ed      1    389502 1793731 515.83
## - Ineq    1    608627 2012856 521.25
## - Po1     1   1050202 2454432 530.57
## 
## Step:  AIC=505.07
## Crime ~ M + Ed + Po1 + M.F + U1 + U2 + Wealth + Ineq + Prob
## 
##          Df Sum of Sq     RSS    AIC
## - Wealth  1     26493 1453068 503.93
## <none>                1426575 505.07
## - M.F     1     84491 1511065 505.77
## - U1      1     99463 1526037 506.24
## + Pop     1     22345 1404229 506.33
## + Po2     1     13259 1413315 506.63
## + NW      1      5927 1420648 506.87
## + So      1      5724 1420851 506.88
## + LF      1      5176 1421398 506.90
## + Time    1      3913 1422661 506.94
## - Prob    1    198571 1625145 509.20
## - U2      1    208880 1635455 509.49
## - M       1    320926 1747501 512.61
## - Ed      1    386773 1813348 514.35
## - Ineq    1    594779 2021354 519.45
## - Po1     1   1127277 2553852 530.44
## 
## Step:  AIC=503.93
## Crime ~ M + Ed + Po1 + M.F + U1 + U2 + Ineq + Prob
## 
##          Df Sum of Sq     RSS    AIC
## <none>                1453068 503.93
## + Wealth  1     26493 1426575 505.07
## - M.F     1    103159 1556227 505.16
## + Pop     1     16697 1436371 505.39
## + Po2     1     14148 1438919 505.47
## + So      1      9329 1443739 505.63
## + LF      1      4374 1448694 505.79
## + NW      1      3799 1449269 505.81
## + Time    1      2293 1450775 505.86
## - U1      1    127044 1580112 505.87
## - Prob    1    247978 1701046 509.34
## - U2      1    255443 1708511 509.55
## - M       1    296790 1749858 510.67
## - Ed      1    445788 1898855 514.51
## - Ineq    1    738244 2191312 521.24
## - Po1     1   1672038 3125105 537.93
# Coefficient table and fit statistics of the model kept by the AIC search.
summary(stepwiseAIC)
## 
## Call:
## lm(formula = Crime ~ M + Ed + Po1 + M.F + U1 + U2 + Ineq + Prob, 
##     data = uscrime)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -444.70 -111.07    3.03  122.15  483.30 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -6426.10    1194.61  -5.379 4.04e-06 ***
## M              93.32      33.50   2.786  0.00828 ** 
## Ed            180.12      52.75   3.414  0.00153 ** 
## Po1           102.65      15.52   6.613 8.26e-08 ***
## M.F            22.34      13.60   1.642  0.10874    
## U1          -6086.63    3339.27  -1.823  0.07622 .  
## U2            187.35      72.48   2.585  0.01371 *  
## Ineq           61.33      13.96   4.394 8.63e-05 ***
## Prob        -3796.03    1490.65  -2.547  0.01505 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 195.5 on 38 degrees of freedom
## Multiple R-squared:  0.7888, Adjusted R-squared:  0.7444 
## F-statistic: 17.74 on 8 and 38 DF,  p-value: 1.159e-10
# Same both-direction search from the full model, but with a per-parameter
# penalty of k = log(n), which turns the criterion into BIC. stepAIC() still
# labels the column "AIC" in the printed trace.
stepwiseBIC <- stepAIC(
  object = lm(formula = Crime ~ ., data = uscrime),
  direction = "both",
  k = log(nrow(uscrime))
)
## Start:  AIC=544.25
## Crime ~ M + So + Ed + Po1 + Po2 + LF + M.F + Pop + NW + U1 + 
##     U2 + Wealth + Ineq + Prob + Time
## 
##          Df Sum of Sq     RSS    AIC
## - So      1        29 1354974 540.40
## - LF      1      8917 1363862 540.71
## - Time    1     10304 1365250 540.76
## - Pop     1     14122 1369068 540.89
## - NW      1     18395 1373341 541.03
## - M.F     1     31967 1386913 541.50
## - Wealth  1     37613 1392558 541.69
## - Po2     1     37919 1392865 541.70
## - U1      1     83722 1438668 543.22
## <none>                1354946 544.25
## - Po1     1    144306 1499252 545.16
## - U2      1    181536 1536482 546.31
## - M       1    193770 1548716 546.68
## - Prob    1    199538 1554484 546.86
## - Ed      1    402117 1757063 552.62
## - Ineq    1    423031 1777977 553.17
## 
## Step:  AIC=540.4
## Crime ~ M + Ed + Po1 + Po2 + LF + M.F + Pop + NW + U1 + U2 + 
##     Wealth + Ineq + Prob + Time
## 
##          Df Sum of Sq     RSS    AIC
## - Time    1     10341 1365315 536.91
## - LF      1     10878 1365852 536.93
## - Pop     1     14127 1369101 537.04
## - NW      1     21626 1376600 537.30
## - M.F     1     32449 1387423 537.66
## - Po2     1     37954 1392929 537.85
## - Wealth  1     39223 1394197 537.89
## - U1      1     96420 1451395 539.78
## <none>                1354974 540.40
## - Po1     1    144302 1499277 541.31
## - U2      1    189859 1544834 542.72
## - M       1    195084 1550059 542.87
## - Prob    1    204463 1559437 543.16
## + So      1        29 1354946 544.25
## - Ed      1    403140 1758114 548.79
## - Ineq    1    488834 1843808 551.03
## 
## Step:  AIC=536.91
## Crime ~ M + Ed + Po1 + Po2 + LF + M.F + Pop + NW + U1 + U2 + 
##     Wealth + Ineq + Prob
## 
##          Df Sum of Sq     RSS    AIC
## - LF      1     10533 1375848 533.42
## - NW      1     15482 1380797 533.59
## - Pop     1     21846 1387161 533.81
## - Po2     1     28932 1394247 534.04
## - Wealth  1     36070 1401385 534.28
## - M.F     1     41784 1407099 534.48
## - U1      1     91420 1456735 536.11
## <none>                1365315 536.91
## - Po1     1    134137 1499452 537.46
## - U2      1    184143 1549458 539.01
## - M       1    186110 1551425 539.07
## + Time    1     10341 1354974 540.40
## - Prob    1    237493 1602808 540.60
## + So      1        65 1365250 540.76
## - Ed      1    409448 1774763 545.39
## - Ineq    1    502909 1868224 547.80
## 
## Step:  AIC=533.42
## Crime ~ M + Ed + Po1 + Po2 + M.F + Pop + NW + U1 + U2 + Wealth + 
##     Ineq + Prob
## 
##          Df Sum of Sq     RSS    AIC
## - NW      1     11675 1387523 529.97
## - Po2     1     21418 1397266 530.30
## - Pop     1     27803 1403651 530.51
## - M.F     1     31252 1407100 530.63
## - Wealth  1     35035 1410883 530.75
## - U1      1     80954 1456802 532.26
## <none>                1375848 533.42
## - Po1     1    123896 1499744 533.62
## - U2      1    190746 1566594 535.67
## - M       1    217716 1593564 536.47
## - Prob    1    226971 1602819 536.75
## + LF      1     10533 1365315 536.91
## + Time    1      9996 1365852 536.93
## + So      1      3046 1372802 537.17
## - Ed      1    413254 1789103 541.91
## - Ineq    1    500944 1876792 544.16
## 
## Step:  AIC=529.97
## Crime ~ M + Ed + Po1 + Po2 + M.F + Pop + U1 + U2 + Wealth + Ineq + 
##     Prob
## 
##          Df Sum of Sq     RSS    AIC
## - Po2     1     16706 1404229 526.68
## - Pop     1     25793 1413315 526.98
## - M.F     1     26785 1414308 527.02
## - Wealth  1     31551 1419073 527.17
## - U1      1     83881 1471404 528.88
## - Po1     1    118348 1505871 529.96
## <none>                1387523 529.97
## - U2      1    201453 1588976 532.49
## - Prob    1    216760 1604282 532.94
## + NW      1     11675 1375848 533.42
## + So      1      7207 1380316 533.57
## + LF      1      6726 1380797 533.59
## + Time    1      4534 1382989 533.66
## - M       1    309214 1696737 535.57
## - Ed      1    402754 1790276 538.10
## - Ineq    1    589736 1977259 542.76
## 
## Step:  AIC=526.68
## Crime ~ M + Ed + Po1 + M.F + Pop + U1 + U2 + Wealth + Ineq + 
##     Prob
## 
##          Df Sum of Sq     RSS    AIC
## - Pop     1     22345 1426575 523.57
## - Wealth  1     32142 1436371 523.89
## - M.F     1     36808 1441037 524.05
## - U1      1     86373 1490602 525.64
## <none>                1404229 526.68
## - U2      1    205814 1610043 529.26
## - Prob    1    218607 1622836 529.63
## + Po2     1     16706 1387523 529.97
## + NW      1      6963 1397266 530.30
## + So      1      3807 1400422 530.40
## + LF      1      1986 1402243 530.46
## + Time    1       575 1403654 530.51
## - M       1    307001 1711230 532.12
## - Ed      1    389502 1793731 534.34
## - Ineq    1    608627 2012856 539.75
## - Po1     1   1050202 2454432 549.07
## 
## Step:  AIC=523.57
## Crime ~ M + Ed + Po1 + M.F + U1 + U2 + Wealth + Ineq + Prob
## 
##          Df Sum of Sq     RSS    AIC
## - Wealth  1     26493 1453068 520.59
## - M.F     1     84491 1511065 522.43
## - U1      1     99463 1526037 522.89
## <none>                1426575 523.57
## - Prob    1    198571 1625145 525.85
## - U2      1    208880 1635455 526.14
## + Pop     1     22345 1404229 526.68
## + Po2     1     13259 1413315 526.98
## + NW      1      5927 1420648 527.23
## + So      1      5724 1420851 527.23
## + LF      1      5176 1421398 527.25
## + Time    1      3913 1422661 527.29
## - M       1    320926 1747501 529.26
## - Ed      1    386773 1813348 531.00
## - Ineq    1    594779 2021354 536.10
## - Po1     1   1127277 2553852 547.09
## 
## Step:  AIC=520.59
## Crime ~ M + Ed + Po1 + M.F + U1 + U2 + Ineq + Prob
## 
##          Df Sum of Sq     RSS    AIC
## - M.F     1    103159 1556227 519.96
## <none>                1453068 520.59
## - U1      1    127044 1580112 520.68
## + Wealth  1     26493 1426575 523.57
## + Pop     1     16697 1436371 523.89
## + Po2     1     14148 1438919 523.98
## + So      1      9329 1443739 524.13
## - Prob    1    247978 1701046 524.14
## + LF      1      4374 1448694 524.29
## + NW      1      3799 1449269 524.31
## - U2      1    255443 1708511 524.35
## + Time    1      2293 1450775 524.36
## - M       1    296790 1749858 525.47
## - Ed      1    445788 1898855 529.31
## - Ineq    1    738244 2191312 536.04
## - Po1     1   1672038 3125105 552.73
## 
## Step:  AIC=519.96
## Crime ~ M + Ed + Po1 + U1 + U2 + Ineq + Prob
## 
##          Df Sum of Sq     RSS    AIC
## - U1      1     54830 1611057 517.74
## <none>                1556227 519.96
## + M.F     1    103159 1453068 520.59
## - U2      1    194750 1750977 521.65
## + Pop     1     66223 1490004 521.77
## + Wealth  1     45162 1511065 522.43
## - Prob    1    239705 1795931 522.84
## + Po2     1     29979 1526248 522.90
## + Time    1     22501 1533726 523.13
## + LF      1     10865 1545361 523.48
## + So      1      3867 1552360 523.69
## + NW      1       147 1556080 523.81
## - M       1    413318 1969545 527.18
## - Ed      1    815182 2371408 535.91
## - Ineq    1    906629 2462856 537.69
## - Po1     1   1811722 3367949 552.40
## 
## Step:  AIC=517.74
## Crime ~ M + Ed + Po1 + U2 + Ineq + Prob
## 
##          Df Sum of Sq     RSS    AIC
## <none>                1611057 517.74
## - U2      1    192233 1803290 519.18
## + Wealth  1     59910 1551147 519.81
## + U1      1     54830 1556227 519.96
## + Pop     1     51320 1559737 520.07
## - Prob    1    249308 1860365 520.65
## + M.F     1     30945 1580112 520.68
## + Po2     1     25017 1586040 520.85
## + So      1     17958 1593098 521.06
## + LF      1     13179 1597878 521.20
## + Time    1      7159 1603898 521.38
## + NW      1       359 1610698 521.58
## - M       1    400611 2011667 524.32
## - Ed      1    776207 2387264 532.37
## - Ineq    1    949221 2560278 535.66
## - Po1     1   2817067 4428124 561.41
# Coefficient table and fit statistics of the model kept by the BIC search.
summary(stepwiseBIC)
## 
## Call:
## lm(formula = Crime ~ M + Ed + Po1 + U2 + Ineq + Prob, data = uscrime)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -470.68  -78.41  -19.68  133.12  556.23 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5040.50     899.84  -5.602 1.72e-06 ***
## M             105.02      33.30   3.154  0.00305 ** 
## Ed            196.47      44.75   4.390 8.07e-05 ***
## Po1           115.02      13.75   8.363 2.56e-10 ***
## U2             89.37      40.91   2.185  0.03483 *  
## Ineq           67.65      13.94   4.855 1.88e-05 ***
## Prob        -3801.84    1528.10  -2.488  0.01711 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 200.7 on 40 degrees of freedom
## Multiple R-squared:  0.7659, Adjusted R-squared:  0.7307 
## F-statistic: 21.81 on 6 and 40 DF,  p-value: 3.418e-11

Lasso

Scaling the data
# Predictor matrix: every column except the last, standardised column-wise.
# NOTE(review): this relies on the response (Crime) being the last column of
# uscrime -- confirm against the data file.
uscrime_x <- scale(as.matrix(uscrime[-ncol(uscrime)]))
# The response is centred and scaled too, so coefficients are in SD units.
uscrime_y <- scale(uscrime[["Crime"]])
# alpha = 1 selects the pure lasso (L1) penalty.
lasso <- glmnet(x = uscrime_x, y = uscrime_y, alpha = 1)
# Lists the components stored in the fitted glmnet object.
summary(lasso)
##           Length Class     Mode   
## a0          88   -none-    numeric
## beta      1320   dgCMatrix S4     
## df          88   -none-    numeric
## dim          2   -none-    numeric
## lambda      88   -none-    numeric
## dev.ratio   88   -none-    numeric
## nulldev      1   -none-    numeric
## npasses      1   -none-    numeric
## jerr         1   -none-    numeric
## offset       1   -none-    logical
## call         4   -none-    call   
## nobs         1   -none-    numeric
# Plot the fitted lasso path; label = TRUE asks for the curves to be labelled.
plot(lasso, label = TRUE)

Lasso: cross-validation
# Fix the RNG so the random fold assignment made by cv.glmnet() is repeatable.
set.seed(123)
lasso_cv <- cv.glmnet(x = uscrime_x, y = uscrime_y, alpha = 1)
# Keep the lambda with the lowest mean cross-validated error and the
# coefficients of the lasso fit at that lambda.
lasso_best_lambda <- lasso_cv[["lambda.min"]]
lasso_coefs <- coef(object = lasso_cv, s = lasso_best_lambda)
lasso_best_lambda
## [1] 0.008583784
# Print the lasso coefficients at the selected lambda; entries shown as "."
# in the sparse matrix are predictors whose coefficient is zero.
lasso_coefs
## 16 x 1 sparse Matrix of class "dgCMatrix"
##                        s1
## (Intercept) -3.353617e-16
## M            2.700905e-01
## So           3.917902e-02
## Ed           4.527764e-01
## Po1          7.661597e-01
## Po2          .           
## LF           .           
## M.F          1.353429e-01
## Pop         -4.898014e-02
## NW           3.702627e-02
## U1          -1.837816e-01
## U2           3.005588e-01
## Wealth       1.389017e-01
## Ineq         6.468310e-01
## Prob        -2.306323e-01
## Time         .

Elastic net

# alpha = 0.5 weights the ridge and lasso penalties equally (elastic net).
elasticnet <- glmnet(x = uscrime_x, y = uscrime_y, alpha = 0.5)
# Lists the components stored in the fitted glmnet object.
summary(elasticnet)
##           Length Class     Mode   
## a0          96   -none-    numeric
## beta      1440   dgCMatrix S4     
## df          96   -none-    numeric
## dim          2   -none-    numeric
## lambda      96   -none-    numeric
## dev.ratio   96   -none-    numeric
## nulldev      1   -none-    numeric
## npasses      1   -none-    numeric
## jerr         1   -none-    numeric
## offset       1   -none-    logical
## call         4   -none-    call   
## nobs         1   -none-    numeric
# Plot the fitted elastic-net path; label = TRUE asks for labelled curves.
plot(elasticnet, label = TRUE)

Elastic net: cross-validation
# Fix the RNG so the random fold assignment made by cv.glmnet() is repeatable.
set.seed(123)
elasticnet_cv <- cv.glmnet(x = uscrime_x, y = uscrime_y, alpha = 0.5)
# Keep the lambda with the lowest mean cross-validated error and the
# coefficients of the elastic-net fit at that lambda.
elasticnet_best_lambda <- elasticnet_cv[["lambda.min"]]
elasticnet_coefs <- coef(object = elasticnet_cv, s = elasticnet_best_lambda)
elasticnet_best_lambda
## [1] 0.01183294
# Print the elastic-net coefficients at the selected lambda; entries shown as
# "." in the sparse matrix are predictors whose coefficient is zero.
elasticnet_coefs
## 16 x 1 sparse Matrix of class "dgCMatrix"
##                        s1
## (Intercept) -3.384054e-16
## M            2.677695e-01
## So           4.370161e-02
## Ed           4.518333e-01
## Po1          7.464564e-01
## Po2          .           
## LF           .           
## M.F          1.434832e-01
## Pop         -4.966532e-02
## NW           4.920026e-02
## U1          -1.958952e-01
## U2           3.109392e-01
## Wealth       1.523322e-01
## Ineq         6.370044e-01
## Prob        -2.356296e-01
## Time         .

Comparing different alpha values

# Cross-validate one glmnet fit per alpha in 0, 0.1, ..., 1 (ridge -> lasso).
#
# Every fit reuses the same fold assignment. Without a shared `foldid` each
# cv.glmnet() call draws its own random folds, so differences between alphas
# are mixed up with fold-to-fold noise and change on every run. The seed makes
# the fold assignment itself repeatable.
set.seed(123)
cv_fold_id <- sample(rep(seq_len(10), length.out = nrow(uscrime_x)))

alpha_grid <- (0:10) / 10
list.of.fits <- lapply(alpha_grid, function(alpha_value) {
  cv.glmnet(uscrime_x, uscrime_y, alpha = alpha_value, foldid = cv_fold_id)
})
# Names are "alpha0", "alpha0.1", ..., "alpha1", as looked up further down.
names(list.of.fits) <- paste0("alpha", alpha_grid)

# In-sample R-squared of each cross-validated fit, evaluated at its lambda.1se.
#
# NOTE: predictions are made on the same matrix the models were fitted on
# (uscrime_x), so these are training-set figures, not a held-out test estimate.
alpha_grid <- (0:10) / 10
fit_names <- paste0("alpha", alpha_grid)

# Total sum of squares does not depend on the model, so compute it once.
ss_total <- sum((uscrime_y - mean(uscrime_y))^2)

r_squared <- vapply(fit_names, function(fit_name) {
  cv_fit <- list.of.fits[[fit_name]]
  predicted <- predict(cv_fit, s = cv_fit$lambda.1se, newx = uscrime_x)
  ss_residual <- sum((uscrime_y - predicted)^2)
  1 - (ss_residual / ss_total)
}, numeric(1))

# One row per alpha, built in a single step instead of rbind() in a loop.
results <- data.frame(
  alpha = alpha_grid,
  r_squared = unname(r_squared),
  fit.name = fit_names
)
results
##    alpha r_squared fit.name
## 1    0.0 0.5500243   alpha0
## 2    0.1 0.6080307 alpha0.1
## 3    0.2 0.6750612 alpha0.2
## 4    0.3 0.6079146 alpha0.3
## 5    0.4 0.6427385 alpha0.4
## 6    0.5 0.5847286 alpha0.5
## 7    0.6 0.5512065 alpha0.6
## 8    0.7 0.6678918 alpha0.7
## 9    0.8 0.6103203 alpha0.8
## 10   0.9 0.6322980 alpha0.9
## 11   1.0 0.6223538   alpha1
# Scatter plot of R-squared (y axis) against alpha (x axis) from `results`.
plot(results$r_squared~results$alpha)