Simple Linear Regression

library("ISLR")
library("car")
library("MASS")
library("leaps")
library("glmnet")
library("pls")
library("splines")
library("gam")
library("akima")
library("tree")
library("randomForest")
library("gbm")
library("e1071")
library("ROCR")

lm_fit<-lm(medv~lstat,data=Boston)
summary(lm_fit)
## 
## Call:
## lm(formula = medv ~ lstat, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.168  -3.990  -1.318   2.034  24.500 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 34.55384    0.56263   61.41   <2e-16 ***
## lstat       -0.95005    0.03873  -24.53   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared:  0.5441, Adjusted R-squared:  0.5432 
## F-statistic: 601.6 on 1 and 504 DF,  p-value: < 2.2e-16
attach(Boston)
plot(lstat,medv)
abline(lm_fit)

par(mfrow=c(2,2))
plot(lm_fit)

Multiple Linear Regression

lm_fit<-lm(medv~lstat+age,data=Boston)
summary(lm_fit)
## 
## Call:
## lm(formula = medv ~ lstat + age, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.981  -3.978  -1.283   1.968  23.158 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 33.22276    0.73085  45.458  < 2e-16 ***
## lstat       -1.03207    0.04819 -21.416  < 2e-16 ***
## age          0.03454    0.01223   2.826  0.00491 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.173 on 503 degrees of freedom
## Multiple R-squared:  0.5513, Adjusted R-squared:  0.5495 
## F-statistic:   309 on 2 and 503 DF,  p-value: < 2.2e-16
lm_fit<-lm(medv~.,data=Boston)
summary(lm_fit)
## 
## Call:
## lm(formula = medv ~ ., data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.595  -2.730  -0.518   1.777  26.199 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.646e+01  5.103e+00   7.144 3.28e-12 ***
## crim        -1.080e-01  3.286e-02  -3.287 0.001087 ** 
## zn           4.642e-02  1.373e-02   3.382 0.000778 ***
## indus        2.056e-02  6.150e-02   0.334 0.738288    
## chas         2.687e+00  8.616e-01   3.118 0.001925 ** 
## nox         -1.777e+01  3.820e+00  -4.651 4.25e-06 ***
## rm           3.810e+00  4.179e-01   9.116  < 2e-16 ***
## age          6.922e-04  1.321e-02   0.052 0.958229    
## dis         -1.476e+00  1.995e-01  -7.398 6.01e-13 ***
## rad          3.060e-01  6.635e-02   4.613 5.07e-06 ***
## tax         -1.233e-02  3.760e-03  -3.280 0.001112 ** 
## ptratio     -9.527e-01  1.308e-01  -7.283 1.31e-12 ***
## black        9.312e-03  2.686e-03   3.467 0.000573 ***
## lstat       -5.248e-01  5.072e-02 -10.347  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.745 on 492 degrees of freedom
## Multiple R-squared:  0.7406, Adjusted R-squared:  0.7338 
## F-statistic: 108.1 on 13 and 492 DF,  p-value: < 2.2e-16
vif(lm_fit)
##     crim       zn    indus     chas      nox       rm      age      dis 
## 1.792192 2.298758 3.991596 1.073995 4.393720 1.933744 3.100826 3.955945 
##      rad      tax  ptratio    black    lstat 
## 7.484496 9.008554 1.799084 1.348521 2.941491

Non-Linear Transformations

lm_fit1<-lm(medv~lstat+I(lstat^2),data=Boston)
summary(lm_fit1)
plot(lm_fit1)
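
A sketch of the usual follow-up (lm_fit0 is a throwaway name for the plain linear fit, since lm_fit has been overwritten above): anova() tests whether the quadratic term improves on the simple linear model.

lm_fit0<-lm(medv~lstat,data=Boston)
anova(lm_fit0,lm_fit1)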

names(Carseats)
##  [1] "Sales"       "CompPrice"   "Income"      "Advertising" "Population" 
##  [6] "Price"       "ShelveLoc"   "Age"         "Education"   "Urban"      
## [11] "US"
lm_fit=lm(Sales~.+Income:Advertising+Price:Age,data=Carseats)
summary(lm_fit)
## 
## Call:
## lm(formula = Sales ~ . + Income:Advertising + Price:Age, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.9208 -0.7503  0.0177  0.6754  3.3413 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         6.5755654  1.0087470   6.519 2.22e-10 ***
## CompPrice           0.0929371  0.0041183  22.567  < 2e-16 ***
## Income              0.0108940  0.0026044   4.183 3.57e-05 ***
## Advertising         0.0702462  0.0226091   3.107 0.002030 ** 
## Population          0.0001592  0.0003679   0.433 0.665330    
## Price              -0.1008064  0.0074399 -13.549  < 2e-16 ***
## ShelveLocGood       4.8486762  0.1528378  31.724  < 2e-16 ***
## ShelveLocMedium     1.9532620  0.1257682  15.531  < 2e-16 ***
## Age                -0.0579466  0.0159506  -3.633 0.000318 ***
## Education          -0.0208525  0.0196131  -1.063 0.288361    
## UrbanYes            0.1401597  0.1124019   1.247 0.213171    
## USYes              -0.1575571  0.1489234  -1.058 0.290729    
## Income:Advertising  0.0007510  0.0002784   2.698 0.007290 ** 
## Price:Age           0.0001068  0.0001333   0.801 0.423812    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.011 on 386 degrees of freedom
## Multiple R-squared:  0.8761, Adjusted R-squared:  0.8719 
## F-statistic:   210 on 13 and 386 DF,  p-value: < 2.2e-16
names(Smarket)
## [1] "Year"      "Lag1"      "Lag2"      "Lag3"      "Lag4"      "Lag5"     
## [7] "Volume"    "Today"     "Direction"
cor(Smarket[,-9])
##              Year         Lag1         Lag2         Lag3         Lag4
## Year   1.00000000  0.029699649  0.030596422  0.033194581  0.035688718
## Lag1   0.02969965  1.000000000 -0.026294328 -0.010803402 -0.002985911
## Lag2   0.03059642 -0.026294328  1.000000000 -0.025896670 -0.010853533
## Lag3   0.03319458 -0.010803402 -0.025896670  1.000000000 -0.024051036
## Lag4   0.03568872 -0.002985911 -0.010853533 -0.024051036  1.000000000
## Lag5   0.02978799 -0.005674606 -0.003557949 -0.018808338 -0.027083641
## Volume 0.53900647  0.040909908 -0.043383215 -0.041823686 -0.048414246
## Today  0.03009523 -0.026155045 -0.010250033 -0.002447647 -0.006899527
##                Lag5      Volume        Today
## Year    0.029787995  0.53900647  0.030095229
## Lag1   -0.005674606  0.04090991 -0.026155045
## Lag2   -0.003557949 -0.04338321 -0.010250033
## Lag3   -0.018808338 -0.04182369 -0.002447647
## Lag4   -0.027083641 -0.04841425 -0.006899527
## Lag5    1.000000000 -0.02200231 -0.034860083
## Volume -0.022002315  1.00000000  0.014591823
## Today  -0.034860083  0.01459182  1.000000000

Logistic Regression

Fitting a logistic regression model to predict the direction of the market (Up or Down) from the lag returns and trading volume.

attach(Smarket)
glm_fit<-glm(Direction~.-Year-Direction-Today,data = Smarket,family = "binomial")
summary(glm_fit)
## 
## Call:
## glm(formula = Direction ~ . - Year - Direction - Today, family = "binomial", 
##     data = Smarket)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.446  -1.203   1.065   1.145   1.326  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.126000   0.240736  -0.523    0.601
## Lag1        -0.073074   0.050167  -1.457    0.145
## Lag2        -0.042301   0.050086  -0.845    0.398
## Lag3         0.011085   0.049939   0.222    0.824
## Lag4         0.009359   0.049974   0.187    0.851
## Lag5         0.010313   0.049511   0.208    0.835
## Volume       0.135441   0.158360   0.855    0.392
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1731.2  on 1249  degrees of freedom
## Residual deviance: 1727.6  on 1243  degrees of freedom
## AIC: 1741.6
## 
## Number of Fisher Scoring iterations: 3
glm_probs<-predict(glm_fit,type="response")
glm_probs[1:10]
##         1         2         3         4         5         6         7 
## 0.5070841 0.4814679 0.4811388 0.5152224 0.5107812 0.5069565 0.4926509 
##         8         9        10 
## 0.5092292 0.5176135 0.4888378
contrasts(Direction)
##      Up
## Down  0
## Up    1
glm_pred<-rep("Down",1250)
glm_pred[glm_probs>0.5]="Up"
sum(glm_pred==Direction)
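
A minimal sketch (not part of the original output): tabulating the training-set predictions against the observed direction gives the full confusion matrix behind the count above.

table(glm_pred,Direction)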

Linear Discriminant Analysis

Using only the observations before 2005 for training; 2005 is held out as a test set.

train<-(Year<2005)
Smarket.2005<-Smarket[!train,]
Direction.2005<-Direction[!train]

lda_fit<-lda(Direction~Lag1+Lag2,data=Smarket,subset = train)
lda_fit
## Call:
## lda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
## 
## Prior probabilities of groups:
##     Down       Up 
## 0.491984 0.508016 
## 
## Group means:
##             Lag1        Lag2
## Down  0.04279022  0.03389409
## Up   -0.03954635 -0.03132544
## 
## Coefficients of linear discriminants:
##             LD1
## Lag1 -0.6420190
## Lag2 -0.5135293
lda_pred<-predict(lda_fit,Smarket.2005)
names(lda_pred)
## [1] "class"     "posterior" "x"
lda_class<-lda_pred$class
table(lda_class,Direction.2005)
##          Direction.2005
## lda_class Down  Up
##      Down   35  35
##      Up     76 106
head(lda_pred$posterior)
##           Down        Up
## 999  0.4901792 0.5098208
## 1000 0.4792185 0.5207815
## 1001 0.4668185 0.5331815
## 1002 0.4740011 0.5259989
## 1003 0.4927877 0.5072123
## 1004 0.4938562 0.5061438
lda_pred$posterior[1:20,1]
##       999      1000      1001      1002      1003      1004      1005 
## 0.4901792 0.4792185 0.4668185 0.4740011 0.4927877 0.4938562 0.4951016 
##      1006      1007      1008      1009      1010      1011      1012 
## 0.4872861 0.4907013 0.4844026 0.4906963 0.5119988 0.4895152 0.4706761 
##      1013      1014      1015      1016      1017      1018 
## 0.4744593 0.4799583 0.4935775 0.5030894 0.4978806 0.4886331
lda_class[1:20]
##  [1] Up   Up   Up   Up   Up   Up   Up   Up   Up   Up   Up   Down Up   Up  
## [15] Up   Up   Up   Down Up   Up  
## Levels: Down Up

Quadratic Discriminant Analysis

qda_fit<-qda(Direction~Lag1+Lag2,data = Smarket,subset = train)
qda_fit
## Call:
## qda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
## 
## Prior probabilities of groups:
##     Down       Up 
## 0.491984 0.508016 
## 
## Group means:
##             Lag1        Lag2
## Down  0.04279022  0.03389409
## Up   -0.03954635 -0.03132544
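
A short sketch following the same pattern as for LDA above (qda_class is a name introduced here): predict on the 2005 observations and compare against the true direction.

qda_class<-predict(qda_fit,Smarket.2005)$class
table(qda_class,Direction.2005)
mean(qda_class==Direction.2005)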

KNN

library("class")
train_X=cbind(Lag1,Lag2)[train,]
test_X=cbind(Lag1,Lag2)[!train,]
train_Direction=Direction[train]
set.seed(1)
knn_pred<-knn(train_X,test_X,train_Direction,k=3)
table(knn_pred,Direction.2005)
##         Direction.2005
## knn_pred Down Up
##     Down   48 55
##     Up     63 86
attach(Caravan)
standardized_X=scale(Caravan[,-86])

test=1:1000
train_x=standardized_X[-test,]
test_X=standardized_X[test,]

train_Y=Purchase[-test]
test_Y=Purchase[test]
set.seed(1)

knn_pred=knn(train_x,test_X,train_Y,k=1)
mean(test_Y!=knn_pred)
## [1] 0.118
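
The raw error rate is somewhat flattering here, because only a small fraction of customers purchase insurance, so always predicting "No" would do nearly as well; a quick sketch for comparison.

mean(test_Y!="No")
table(knn_pred,test_Y)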

Sampling and Bootstrapping Methods

set.seed(1)
train=sample(392,196)
attach(Auto)
lm_fit<-lm(mpg~horsepower,data=Auto,subset = train)
mean((mpg- predict(lm_fit,Auto))[-train]^2)
## [1] 26.14142
library("boot")
## 
## Attaching package: 'boot'
## The following object is masked from 'package:car':
## 
##     logit
glm_fit<-glm(mpg~horsepower,data = Auto)
cv.err=cv.glm(Auto,glm_fit)
cv.err$K
## [1] 392
cv.error=rep(0,5)

for(i in 1:5){
  glm_fit<-glm(mpg~poly(horsepower,i),data = Auto)
  cv.error[i]=cv.glm(Auto,glm_fit)$delta[1]
}

cv.error
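
A related sketch (cv_error_10 is a name introduced here and the seed is arbitrary): passing K = 10 to cv.glm() gives 10-fold cross-validation, which is far cheaper than LOOCV and typically yields similar error estimates.

set.seed(17)
cv_error_10=rep(0,5)
for(i in 1:5){
  glm_fit<-glm(mpg~poly(horsepower,i),data = Auto)
  cv_error_10[i]=cv.glm(Auto,glm_fit,K=10)$delta[1]
}
cv_error_10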

Bootstrap

alpha_fn=function(data,index){
  X=data$X[index]
  Y=data$Y[index]
  return((var(Y)-cov(X,Y))/(var(X)+var(Y)-2*cov(X,Y)))
}
alpha_fn(Portfolio,1:100)
## [1] 0.5758321
boot(Portfolio,alpha_fn,R=1000)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Portfolio, statistic = alpha_fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##      original     bias    std. error
## t1* 0.5758321 0.00107223  0.08860155
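
The same machinery can estimate the standard errors of regression coefficients; a short sketch using the Auto data attached earlier (boot_fn is a name introduced here, not part of the original code).

boot_fn<-function(data,index){
  coef(lm(mpg~horsepower,data=data,subset=index))
}
boot(Auto,boot_fn,R=1000)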

Best Subset Selection Methods

Hitters=na.omit(Hitters)
regfit_full=regsubsets(Salary~.,Hitters)
summary(regfit_full)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., Hitters)
## 19 Variables  (and intercept)
##            Forced in Forced out
## AtBat          FALSE      FALSE
## Hits           FALSE      FALSE
## HmRun          FALSE      FALSE
## Runs           FALSE      FALSE
## RBI            FALSE      FALSE
## Walks          FALSE      FALSE
## Years          FALSE      FALSE
## CAtBat         FALSE      FALSE
## CHits          FALSE      FALSE
## CHmRun         FALSE      FALSE
## CRuns          FALSE      FALSE
## CRBI           FALSE      FALSE
## CWalks         FALSE      FALSE
## LeagueN        FALSE      FALSE
## DivisionW      FALSE      FALSE
## PutOuts        FALSE      FALSE
## Assists        FALSE      FALSE
## Errors         FALSE      FALSE
## NewLeagueN     FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
##          AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns
## 1  ( 1 ) " "   " "  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 2  ( 1 ) " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 3  ( 1 ) " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 4  ( 1 ) " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 5  ( 1 ) "*"   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 6  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    " "  
## 7  ( 1 ) " "   "*"  " "   " "  " " "*"   " "   "*"    "*"   "*"    " "  
## 8  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   " "    " "   "*"    "*"  
##          CRBI CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1  ( 1 ) "*"  " "    " "     " "       " "     " "     " "    " "       
## 2  ( 1 ) "*"  " "    " "     " "       " "     " "     " "    " "       
## 3  ( 1 ) "*"  " "    " "     " "       "*"     " "     " "    " "       
## 4  ( 1 ) "*"  " "    " "     "*"       "*"     " "     " "    " "       
## 5  ( 1 ) "*"  " "    " "     "*"       "*"     " "     " "    " "       
## 6  ( 1 ) "*"  " "    " "     "*"       "*"     " "     " "    " "       
## 7  ( 1 ) " "  " "    " "     "*"       "*"     " "     " "    " "       
## 8  ( 1 ) " "  "*"    " "     "*"       "*"     " "     " "    " "

An asterisk indicates that a variable is included in the best model of the corresponding size. By default regsubsets() reports the best model of each size up to eight variables; in the table above, Hits appears in every best model from size two onward, while CRBI and PutOuts each appear in six of the eight models.

regfit_full<-regsubsets(Salary~.,data = Hitters,nvmax = 19)
reg_summary=summary(regfit_full)

reg_summary$rsq
##  [1] 0.3214501 0.4252237 0.4514294 0.4754067 0.4908036 0.5087146 0.5141227
##  [8] 0.5285569 0.5346124 0.5404950 0.5426153 0.5436302 0.5444570 0.5452164
## [15] 0.5454692 0.5457656 0.5459518 0.5460945 0.5461159
par(mfrow=c(2,2))

plot(reg_summary$rss,xlab="Number of variables",ylab="RSS",type="l")
plot(reg_summary$adjr2,xlab="Number of variables",ylab="Adjusted Rsq",type="l")

which.max(reg_summary$adjr2)
## [1] 11
plot(regfit_full,scale="adjr2")
points(11,reg_summary$adjr2[11],col="red",cex=2,pch=20)

plot(regfit_full,scale="r2")

Forward and Backward Stepwise selection

regfit_fwd<-regsubsets(Salary~.,data = Hitters,nvmax = 19,method = "forward")
summary(regfit_fwd)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters, nvmax = 19, method = "forward")
## 19 Variables  (and intercept)
##            Forced in Forced out
## AtBat          FALSE      FALSE
## Hits           FALSE      FALSE
## HmRun          FALSE      FALSE
## Runs           FALSE      FALSE
## RBI            FALSE      FALSE
## Walks          FALSE      FALSE
## Years          FALSE      FALSE
## CAtBat         FALSE      FALSE
## CHits          FALSE      FALSE
## CHmRun         FALSE      FALSE
## CRuns          FALSE      FALSE
## CRBI           FALSE      FALSE
## CWalks         FALSE      FALSE
## LeagueN        FALSE      FALSE
## DivisionW      FALSE      FALSE
## PutOuts        FALSE      FALSE
## Assists        FALSE      FALSE
## Errors         FALSE      FALSE
## NewLeagueN     FALSE      FALSE
## 1 subsets of each size up to 19
## Selection Algorithm: forward
##           AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns
## 1  ( 1 )  " "   " "  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 2  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 3  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 4  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 5  ( 1 )  "*"   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 6  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    " "  
## 7  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    " "  
## 8  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    "*"  
## 9  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"  
## 10  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"  
## 11  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"  
## 12  ( 1 ) "*"   "*"  " "   "*"  " " "*"   " "   "*"    " "   " "    "*"  
## 13  ( 1 ) "*"   "*"  " "   "*"  " " "*"   " "   "*"    " "   " "    "*"  
## 14  ( 1 ) "*"   "*"  "*"   "*"  " " "*"   " "   "*"    " "   " "    "*"  
## 15  ( 1 ) "*"   "*"  "*"   "*"  " " "*"   " "   "*"    "*"   " "    "*"  
## 16  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   " "   "*"    "*"   " "    "*"  
## 17  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   " "   "*"    "*"   " "    "*"  
## 18  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   "*"   "*"    "*"   " "    "*"  
## 19  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   "*"   "*"    "*"   "*"    "*"  
##           CRBI CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1  ( 1 )  "*"  " "    " "     " "       " "     " "     " "    " "       
## 2  ( 1 )  "*"  " "    " "     " "       " "     " "     " "    " "       
## 3  ( 1 )  "*"  " "    " "     " "       "*"     " "     " "    " "       
## 4  ( 1 )  "*"  " "    " "     "*"       "*"     " "     " "    " "       
## 5  ( 1 )  "*"  " "    " "     "*"       "*"     " "     " "    " "       
## 6  ( 1 )  "*"  " "    " "     "*"       "*"     " "     " "    " "       
## 7  ( 1 )  "*"  "*"    " "     "*"       "*"     " "     " "    " "       
## 8  ( 1 )  "*"  "*"    " "     "*"       "*"     " "     " "    " "       
## 9  ( 1 )  "*"  "*"    " "     "*"       "*"     " "     " "    " "       
## 10  ( 1 ) "*"  "*"    " "     "*"       "*"     "*"     " "    " "       
## 11  ( 1 ) "*"  "*"    "*"     "*"       "*"     "*"     " "    " "       
## 12  ( 1 ) "*"  "*"    "*"     "*"       "*"     "*"     " "    " "       
## 13  ( 1 ) "*"  "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 14  ( 1 ) "*"  "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 15  ( 1 ) "*"  "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 16  ( 1 ) "*"  "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 17  ( 1 ) "*"  "*"    "*"     "*"       "*"     "*"     "*"    "*"       
## 18  ( 1 ) "*"  "*"    "*"     "*"       "*"     "*"     "*"    "*"       
## 19  ( 1 ) "*"  "*"    "*"     "*"       "*"     "*"     "*"    "*"
set.seed(1)
train=sample(c(TRUE,FALSE),nrow(Hitters),rep=TRUE)
test=(!train)

regfit_best=regsubsets(Salary~.,data=Hitters[train,],nvmax = 19)
test_mat=model.matrix(Salary~.,data = Hitters[test,])

val_errors=rep(NA,19)
for(i in 1:19){
  coefi=coef(regfit_best,id=i)
  pred_test=test_mat[,names(coefi)] %*% coefi
  val_errors[i]=mean((Hitters$Salary[test]-pred_test)^2)
  
  
}

val_errors
##  [1] 220968.0 169157.1 178518.2 163426.1 168418.1 171270.6 162377.1
##  [8] 157909.3 154055.7 148162.1 151156.4 151742.5 152214.5 157358.7
## [15] 158541.4 158743.3 159972.7 159859.8 160105.6

Since there is no predict() method for regsubsets objects, we write our own:

predict_regsubsets<-function(object,newdata,id,...){
  # rebuild the design matrix from the formula stored in the regsubsets call
  form=as.formula(object$call[[2]])
  mat=model.matrix(form,newdata)
  # keep only the columns used by the model of size `id` and multiply by its coefficients
  coefi=coef(object,id=id)
  xvars=names(coefi)
  mat[,xvars]%*%coefi
}
regfit_best=regsubsets(Salary~.,data=Hitters,nvmax=19)
coef(regfit_best,10)
##  (Intercept)        AtBat         Hits        Walks       CAtBat 
##  162.5354420   -2.1686501    6.9180175    5.7732246   -0.1300798 
##        CRuns         CRBI       CWalks    DivisionW      PutOuts 
##    1.4082490    0.7743122   -0.8308264 -112.3800575    0.2973726 
##      Assists 
##    0.2831680
k=10
set.seed(1)
folds=sample(1:k,nrow(Hitters),replace = TRUE)
cv_errors=matrix(NA,k,19,dimnames=list(NULL,paste(1:19)))


for(j in 1:k){
  best_fit=regsubsets(Salary~.,data = Hitters[folds!=j,],nvmax=19)
  
  for(i in 1:19){
    pred=predict_regsubsets(best_fit,Hitters[folds==j,],id=i)
    cv_errors[j,i]=mean((Hitters$Salary[folds==j]-pred)^2)
  }
}
mean_cv_errors=apply(cv_errors, 2, mean)
plot(mean_cv_errors,type="b")
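
A sketch of the usual final step (best_size and reg_best are names introduced here): refit best subset selection on the full data set and inspect the coefficients of the model size chosen by cross-validation.

best_size=which.min(mean_cv_errors)
reg_best=regsubsets(Salary~.,data=Hitters,nvmax=19)
coef(reg_best,best_size)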

Ridge Regression and Lasso

x=model.matrix(Salary~.,Hitters)[,-1]
y=Hitters$Salary

model.matrix() converts qualitative (factor) variables into numeric dummy variables; this matters because glmnet() only accepts a numeric matrix of predictors.
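
A quick illustration (a sketch, not part of the original lab): the two-level factors League and Division each become a single 0/1 dummy column in the design matrix.

head(model.matrix(~League+Division,data=Hitters))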

Ridge Regression

grid=10^seq(10,-2,length=100)
ridge_mod=glmnet(x,y,alpha=0,lambda = grid)

ridge_mod$lambda[50]
## [1] 11497.57
coef(ridge_mod)[,50]
##   (Intercept)         AtBat          Hits         HmRun          Runs 
## 407.356050200   0.036957182   0.138180344   0.524629976   0.230701523 
##           RBI         Walks         Years        CAtBat         CHits 
##   0.239841459   0.289618741   1.107702929   0.003131815   0.011653637 
##        CHmRun         CRuns          CRBI        CWalks       LeagueN 
##   0.087545670   0.023379882   0.024138320   0.025015421   0.085028114 
##     DivisionW       PutOuts       Assists        Errors    NewLeagueN 
##  -6.215440973   0.016482577   0.002612988  -0.020502690   0.301433531

Calculating the l2 norm of the coefficients (excluding the intercept):

sqrt(sum(coef(ridge_mod)[-1,50]^2))
## [1] 6.360612

The larger the value of lambda, the more the coefficients are shrunk towards zero and the smaller their l2 norm. Let us calculate it for a smaller value of lambda (a larger index in the grid):

sqrt(sum(coef(ridge_mod)[-1,60]^2))
## [1] 57.11001
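
A quick sketch (not in the original output; cf and l2_norm are names introduced here): computing the l2 norm at every lambda in the grid shows the coefficients shrinking as lambda grows.

cf=coef(ridge_mod)  # 20 x 100 matrix: one column of coefficients per lambda
l2_norm=sapply(seq_along(ridge_mod$lambda),function(j) sqrt(sum(cf[-1,j]^2)))
plot(log10(ridge_mod$lambda),l2_norm,type="l",xlab="log10(lambda)",ylab="l2 norm of coefficients")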

We can also obtain the coefficients for a new value of lambda with the predict() function by setting type = "coefficients":

predict(ridge_mod,type = "coefficients",s=50)[1:20,]
##   (Intercept)         AtBat          Hits         HmRun          Runs 
##  4.876610e+01 -3.580999e-01  1.969359e+00 -1.278248e+00  1.145892e+00 
##           RBI         Walks         Years        CAtBat         CHits 
##  8.038292e-01  2.716186e+00 -6.218319e+00  5.447837e-03  1.064895e-01 
##        CHmRun         CRuns          CRBI        CWalks       LeagueN 
##  6.244860e-01  2.214985e-01  2.186914e-01 -1.500245e-01  4.592589e+01 
##     DivisionW       PutOuts       Assists        Errors    NewLeagueN 
## -1.182011e+02  2.502322e-01  1.215665e-01 -3.278600e+00 -9.496680e+00
set.seed(1)
train=sample(1:nrow(x),nrow(x)/2)
test=-train
y_test=y[test]

Fitting a ridge regression model on the training set and computing the test MSE at lambda = 4; for comparison, the second value is the MSE of simply predicting the training-set mean.

ridge_mod=glmnet(x[train,],y[train],alpha=0,lambda = grid,thresh = 1e-12)
ridge_pred=predict(ridge_mod,s=4,newx = x[test,])
mean((ridge_pred-y_test)^2)
## [1] 101036.8
mean((mean(y[train])-y_test)^2)
## [1] 193253.1
#ridge_pred=predict(ridge_mod,s=0,newx = x[test,],exact = T)

#mean((ridge_pred-y_test)^2)

#lm(y~x,subset = train)

# Note: in recent versions of glmnet, predict() with exact = TRUE also requires the
# original data to be resupplied (e.g. x = x[train,], y = y[train]), which is why
# the calls below do not run as written.

# predict(object = ridge_mod,s=0,exact = T,type="coefficients")

# predict.glmnet(ridge_mod,s=0,exact = T,type="coefficients")[1:20,]

Using cross-validation to find the best value of lambda

set.seed(1)
cv_out=cv.glmnet(x[train,],y[train],alpha=0)

plot(cv_out)

bestlam=cv_out$lambda.min
bestlam
## [1] 211.7416

The best lambda value is approximately 212.

ridge_pred=predict(ridge_mod,s=bestlam,newx=x[test,])
mean((ridge_pred-y_test)^2)
## [1] 96015.51

Lasso Model

lasso_mod=glmnet(x[train,],y[train],alpha=1,lambda=grid)
plot(lasso_mod)

set.seed(1)
cv_out=cv.glmnet(x[train,],y[train],alpha=1)
plot(cv_out)

bestlam=cv_out$lambda.min
lasso_pred=predict(lasso_mod,s=bestlam,newx=x[test,])
mean((lasso_pred-y_test)^2)
## [1] 100743.4

The test MSE is comparable to that of ridge regression, but whereas ridge only shrinks the coefficients towards zero, the lasso sets some of them exactly to zero.

out=glmnet(x,y,alpha=1,lambda = grid)
lasso_coef=predict(out,type="coefficients",s=bestlam)[1:20,]

lasso_coef
##  (Intercept)        AtBat         Hits        HmRun         Runs 
##   18.5394844    0.0000000    1.8735390    0.0000000    0.0000000 
##          RBI        Walks        Years       CAtBat        CHits 
##    0.0000000    2.2178444    0.0000000    0.0000000    0.0000000 
##       CHmRun        CRuns         CRBI       CWalks      LeagueN 
##    0.0000000    0.2071252    0.4130132    0.0000000    3.2666677 
##    DivisionW      PutOuts      Assists       Errors   NewLeagueN 
## -103.4845458    0.2204284    0.0000000    0.0000000    0.0000000

As we can see, most of the coefficients are set exactly to zero, which makes the lasso model easier to interpret.
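
A quick check (a sketch, not part of the original output): counting how many of the 19 predictors the lasso zeroes out at the chosen lambda.

sum(lasso_coef[-1]==0)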

PCR and PLS

set.seed(2)
pcr_fit<-pcr(Salary~.,data=Hitters,scale=TRUE,validation="CV")
summary(pcr_fit)
## Data:    X dimension: 263 19 
##  Y dimension: 263 1
## Fit method: svdpc
## Number of components considered: 19
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV             452    348.9    352.2    353.5    352.8    350.1    349.1
## adjCV          452    348.7    351.8    352.9    352.1    349.3    348.0
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV       349.6    350.9    352.9     353.8     355.0     356.2     363.5
## adjCV    348.5    349.8    351.6     352.3     353.4     354.5     361.6
##        14 comps  15 comps  16 comps  17 comps  18 comps  19 comps
## CV        355.2     357.4     347.6     350.1     349.2     352.6
## adjCV     352.8     355.2     345.5     347.6     346.7     349.8
## 
## TRAINING: % variance explained
##         1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
## X         38.31    60.16    70.84    79.03    84.29    88.63    92.26
## Salary    40.63    41.58    42.17    43.22    44.90    46.48    46.69
##         8 comps  9 comps  10 comps  11 comps  12 comps  13 comps  14 comps
## X         94.96    96.28     97.26     97.98     98.65     99.15     99.47
## Salary    46.75    46.86     47.76     47.82     47.85     48.10     50.40
##         15 comps  16 comps  17 comps  18 comps  19 comps
## X          99.75     99.89     99.97     99.99    100.00
## Salary     50.55     53.01     53.85     54.61     54.61
validationplot(pcr_fit,val.type = "MSEP")

set.seed(1)

pcr_fit<-pcr(Salary~.,data=Hitters,subset=train,scale=TRUE,validation="CV")
validationplot(pcr_fit,val.type = "MSEP")
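
A sketch of the follow-up step (pcr_pred is a name introduced here; seven components is one reasonable choice given the validation plot): compute the test MSE of the training-set PCR fit.

pcr_pred=predict(pcr_fit,x[test,],ncomp=7)
mean((pcr_pred-y_test)^2)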

pcr_fit=pcr(y~x,scale=TRUE,ncomp=7)
summary(pcr_fit)
## Data:    X dimension: 263 19 
##  Y dimension: 263 1
## Fit method: svdpc
## Number of components considered: 7
## TRAINING: % variance explained
##    1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
## X    38.31    60.16    70.84    79.03    84.29    88.63    92.26
## y    40.63    41.58    42.17    43.22    44.90    46.48    46.69

Partial Least Squares

set.seed(1)
pls_fit=plsr(Salary~.,data=Hitters,subset=train,scale=TRUE,validation="CV")
summary(pls_fit)
## Data:    X dimension: 131 19 
##  Y dimension: 131 1
## Fit method: kernelpls
## Number of components considered: 19
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV           464.6    394.2    391.5    393.1    395.0    415.0    424.0
## adjCV        464.6    393.4    390.2    391.1    392.9    411.5    418.8
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV       424.5    415.8    404.6     407.1     412.0     414.4     410.3
## adjCV    418.9    411.4    400.7     402.2     407.2     409.3     405.6
##        14 comps  15 comps  16 comps  17 comps  18 comps  19 comps
## CV        406.2     408.6     410.5     408.8     407.8     410.2
## adjCV     401.8     403.9     405.6     404.1     403.2     405.5
## 
## TRAINING: % variance explained
##         1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
## X         38.12    53.46    66.05    74.49    79.33    84.56    87.09
## Salary    33.58    38.96    41.57    42.43    44.04    45.59    47.05
##         8 comps  9 comps  10 comps  11 comps  12 comps  13 comps  14 comps
## X         90.74    92.55     93.94     97.23     97.88     98.35     98.85
## Salary    47.53    48.42     49.68     50.04     50.54     50.78     50.92
##         15 comps  16 comps  17 comps  18 comps  19 comps
## X          99.11     99.43     99.78     99.99    100.00
## Salary     51.04     51.11     51.15     51.16     51.18
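
A corresponding sketch for PLS (pls_pred is a name introduced here; the cross-validated error above is lowest at around two components): compute the test MSE of the training-set PLS fit.

pls_pred=predict(pls_fit,x[test,],ncomp=2)
mean((pls_pred-y_test)^2)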

Non-Linear Modelling

attach(Wage)
## The following object is masked from Auto:
## 
##     year
## The following object is masked from Boston:
## 
##     age
fit=lm(wage~poly(age,4),data=Wage)
coef(summary(fit))
##                 Estimate Std. Error    t value     Pr(>|t|)
## (Intercept)    111.70361  0.7287409 153.283015 0.000000e+00
## poly(age, 4)1  447.06785 39.9147851  11.200558 1.484604e-28
## poly(age, 4)2 -478.31581 39.9147851 -11.983424 2.355831e-32
## poly(age, 4)3  125.52169 39.9147851   3.144742 1.678622e-03
## poly(age, 4)4  -77.91118 39.9147851  -1.951938 5.103865e-02
fit=lm(wage~poly(age,4,raw=T),data=Wage)
coef(summary(fit))
##                             Estimate   Std. Error   t value     Pr(>|t|)
## (Intercept)            -1.841542e+02 6.004038e+01 -3.067172 0.0021802539
## poly(age, 4, raw = T)1  2.124552e+01 5.886748e+00  3.609042 0.0003123618
## poly(age, 4, raw = T)2 -5.638593e-01 2.061083e-01 -2.735743 0.0062606446
## poly(age, 4, raw = T)3  6.810688e-03 3.065931e-03  2.221409 0.0263977518
## poly(age, 4, raw = T)4 -3.203830e-05 1.641359e-05 -1.951938 0.0510386498

Important: wrap raw polynomial terms in I() inside a formula, because ^ has a special meaning in formulas. For example, age^2 should be written as I(age^2).
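
A minimal illustration of this point (fit_a and fit_b are throwaway names): wrapping age^2 in I() and using a raw polynomial basis produce the same fitted model.

fit_a=lm(wage~age+I(age^2),data=Wage)
fit_b=lm(wage~poly(age,2,raw=TRUE),data=Wage)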

agelims=range(age)
age_grid=seq(from=agelims[1],to=agelims[2])
preds=predict(fit,newdata = list(age=age_grid),se=TRUE)
se_bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
par(mfrow=c(1,2),mar=c(4.5,4.5,1,1),oma=c(0,0,4,0))
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Degree - 4 Polynomial", outer = T)
lines(age_grid,preds$fit,lwd=2,col="blue")
matlines(age_grid,se_bands,lwd=1,col="blue",lty=3)

How to determine what degree of polynomial to use?

The anova() function works well here: it compares the nested polynomial fits with an F-test.

The following code is slightly different from the book: I wrote a small function rather than typing five nearly identical lines of code.

polynomial_R<-function(degree_i){
  # fit a polynomial of the given degree and assign it to fit_<degree> in the caller
  assign(paste0("fit_",degree_i),lm(wage~poly(age,degree_i)),envir = parent.frame())
}

Either I can call the function 5 times or I can run a for loop

for (i in 1:5){
polynomial_R(i)
    }
anova(fit_1,fit_2,fit_3,fit_4,fit_5)
## Analysis of Variance Table
## 
## Model 1: wage ~ poly(age, degree_i)
## Model 2: wage ~ poly(age, degree_i)
## Model 3: wage ~ poly(age, degree_i)
## Model 4: wage ~ poly(age, degree_i)
## Model 5: wage ~ poly(age, degree_i)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8888  0.001679 ** 
## 4   2995 4771604  1      6070   3.8098  0.051046 .  
## 5   2994 4770322  1      1283   0.8050  0.369682    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fit<-glm(I(wage>250)~poly(age,4),data=Wage,family = binomial)
preds=predict(fit,newdata = list(age=age_grid),se=T)
pfit=exp(preds$fit)/(1+exp(preds$fit))
se_bands_logit=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
se_bands=exp(se_bands_logit)/(1+exp(se_bands_logit))


preds=predict(fit,newdata=list(age=age_grid),type = "response",se=T)
plot(age,I(wage>250),xlim=agelims,type = "n",ylim=c(0,.2))
points(jitter(age),I((wage>250)/5),cex=0.5,pch="|",col="darkgrey")
lines(age_grid,pfit,lwd=2,col="blue")
matlines(age_grid,se_bands,lwd=1,col="blue",lty=3)

table(cut(age,4))
## 
## (17.9,33.5]   (33.5,49]   (49,64.5] (64.5,80.1] 
##         750        1399         779          72
fit=lm(wage~cut(age,4),data=Wage)
coef(summary(fit))
##                         Estimate Std. Error   t value     Pr(>|t|)
## (Intercept)            94.158392   1.476069 63.789970 0.000000e+00
## cut(age, 4)(33.5,49]   24.053491   1.829431 13.148074 1.982315e-38
## cut(age, 4)(49,64.5]   23.664559   2.067958 11.443444 1.040750e-29
## cut(age, 4)(64.5,80.1]  7.640592   4.987424  1.531972 1.256350e-01

Splines

fit=lm(wage~bs(age,knots = c(25,40,60)),data=Wage)
pred=predict(fit,newdata = list(age=age_grid),se=T)
{plot(age,wage,col="gray")
lines(age_grid,pred$fit,lwd=2)
lines(age_grid ,pred$fit +2* pred$se ,lty ="dashed")
lines(age_grid ,pred$fit -2* pred$se ,lty ="dashed")
}

Fitting a natural spline

fit2=lm(wage~ns(age,df=4),data=Wage)

summary(fit2)
## 
## Call:
## lm(formula = wage ~ ns(age, df = 4), data = Wage)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -98.737 -24.477  -5.083  15.371 204.874 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        58.556      4.235  13.827   <2e-16 ***
## ns(age, df = 4)1   60.462      4.190  14.430   <2e-16 ***
## ns(age, df = 4)2   41.963      4.372   9.597   <2e-16 ***
## ns(age, df = 4)3   97.020     10.386   9.341   <2e-16 ***
## ns(age, df = 4)4    9.773      8.657   1.129    0.259    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39.92 on 2995 degrees of freedom
## Multiple R-squared:  0.08598,    Adjusted R-squared:  0.08476 
## F-statistic: 70.43 on 4 and 2995 DF,  p-value: < 2.2e-16
#**********************WARNING*******************#

# newdata must be a *named* list: with list(age_grid), predict() cannot find 'age'
# in newdata and instead picks up the full age vector (length 3000) from the attached
# Wage data, which is why a vector of length 3000 was generated here. The corrected
# call is:

#pred=predict(fit2,newdata=list(age=age_grid),se=T)
#lines(age_grid,pred$fit,col="red",lwd=2)

plot(age,wage,xlim=agelims,cex=0.5,col="darkgrey")
title("Smoothing spline")
fit=smooth.spline(age,wage,df=16)
fit2=smooth.spline(age,wage,cv=TRUE)
## Warning in smooth.spline(age, wage, cv = TRUE): cross-validation with non-
## unique 'x' values seems doubtful
fit2$df
## [1] 6.794596
lines(fit,col="red",lwd=2)
lines(fit2,col="blue",lwd=2)
legend("topright",legend=c("16 DF","6.8 DF"),col=c("red","blue"),lty=1,lwd=2,cex=0.8)

Loess or local regression. I would like to research the difference between local and segmented regression

plot(age,wage,xlim=agelims,cex=0.5,col="darkgrey")
title("Local Regression")
fit=loess(wage~age,span = 0.2,data=Wage)
fit2=loess(wage~age,span = 0.5,data=Wage)
lines(age_grid,predict(fit,data.frame(age=age_grid)),col="red",lwd=2)
lines(age_grid,predict(fit2,data.frame(age=age_grid)),col="blue",lwd=2)
legend("topright",legend=c("Span=0.2","Span=0.5"),col=c("red","blue"),lty=1,lwd=2,cex=0.8)

GAM

The gam package needs to be installed and loaded with library(gam); this is done at the top of this document.

gam1=lm(wage~ns(year,4)+ns(age,5)+education,data = Wage)
gam_m3=gam(wage~s(year,4)+s(age,5)+education,data = Wage)
par(mfrow=c(1,3))
plot(gam_m3,se=TRUE,col="blue")

plot.Gam(gam1,se=TRUE,col="red")

gam_m1=gam(wage~s(age,5)+education, data=Wage)
gam_m2=gam(wage~year+s(age,5)+education, data=Wage)
anova(gam_m1,gam_m2,gam_m3,test = "F")
## Analysis of Deviance Table
## 
## Model 1: wage ~ s(age, 5) + education
## Model 2: wage ~ year + s(age, 5) + education
## Model 3: wage ~ s(year, 4) + s(age, 5) + education
##   Resid. Df Resid. Dev Df Deviance       F    Pr(>F)    
## 1      2990    3711731                                  
## 2      2989    3693842  1  17889.2 14.4771 0.0001447 ***
## 3      2986    3689770  3   4071.1  1.0982 0.3485661    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(gam_m3)
## 
## Call: gam(formula = wage ~ s(year, 4) + s(age, 5) + education, data = Wage)
## Deviance Residuals:
##     Min      1Q  Median      3Q     Max 
## -119.43  -19.70   -3.33   14.17  213.48 
## 
## (Dispersion Parameter for gaussian family taken to be 1235.69)
## 
##     Null Deviance: 5222086 on 2999 degrees of freedom
## Residual Deviance: 3689770 on 2986 degrees of freedom
## AIC: 29887.75 
## 
## Number of Local Scoring Iterations: 2 
## 
## Anova for Parametric Effects
##              Df  Sum Sq Mean Sq F value    Pr(>F)    
## s(year, 4)    1   27162   27162  21.981 2.877e-06 ***
## s(age, 5)     1  195338  195338 158.081 < 2.2e-16 ***
## education     4 1069726  267432 216.423 < 2.2e-16 ***
## Residuals  2986 3689770    1236                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##             Npar Df Npar F  Pr(F)    
## (Intercept)                          
## s(year, 4)        3  1.086 0.3537    
## s(age, 5)         4 32.380 <2e-16 ***
## education                            
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
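
A sketch of the usual next step (preds_gam is a name introduced here; gam_m2 was preferred by the anova comparison above): predictions from a GAM go through the ordinary predict() interface.

preds_gam=predict(gam_m2,newdata=Wage)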

Decision Trees

attach(Carseats)
High=as.factor(ifelse(Sales<=8,"No","Yes")) # make High a factor so tree() treats this as classification
Carseats=data.frame(Carseats,High)

Fit a classification tree

tree_carseats=tree(High~.-Sales,Carseats)
summary(tree_carseats)
## 
## Classification tree:
## tree(formula = High ~ . - Sales, data = Carseats)
## Variables actually used in tree construction:
## [1] "ShelveLoc"   "Price"       "Income"      "CompPrice"   "Population" 
## [6] "Advertising" "Age"         "US"         
## Number of terminal nodes:  27 
## Residual mean deviance:  0.4575 = 170.7 / 373 
## Misclassification error rate: 0.09 = 36 / 400
plot(tree_carseats)
text(tree_carseats,pretty=0)

Estimating the test error by splitting Carseats into training and test sets

set.seed(2)
train=sample(1:nrow(Carseats),200)
Carseats_test=Carseats[-train,]
High_test=High[-train]
tree_carseats=tree(High~.-Sales,Carseats,subset = train)
tree_pred=predict(tree_carseats,Carseats_test,type = "class")
table(tree_pred,High_test)
##          High_test
## tree_pred No Yes
##       No  86  27
##       Yes 30  57

Prune the tree

set.seed(3)
cv_carseats=cv.tree(tree_carseats,FUN=prune.misclass)
names(cv_carseats)
## [1] "size"   "dev"    "k"      "method"
par(mfrow=c(1,2))
plot(cv_carseats$size,cv_carseats$dev,type="b")
plot(cv_carseats$k,cv_carseats$dev,type="b")

prune_carseats=prune.misclass(tree_carseats,best=9)
plot(prune_carseats)
text(prune_carseats,pretty = 0)

After finding the tree size with the lowest cross-validated error, we use the predict function to assess the accuracy of the pruned tree.

tree_pred=predict(prune_carseats,Carseats_test,type="class")
table(tree_pred,High_test)
##          High_test
## tree_pred No Yes
##       No  94  24
##       Yes 22  60
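
A small addition (not in the original output): the overall accuracy of the pruned tree, which from the table above works out to (94 + 60) / 200.

mean(tree_pred==High_test)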

Fitting Regression Trees

set.seed(1)
train=sample(1:nrow(Boston),nrow(Boston)/2)
tree_boston=tree(medv~.,Boston,subset = train)
summary(tree_boston)
## 
## Regression tree:
## tree(formula = medv ~ ., data = Boston, subset = train)
## Variables actually used in tree construction:
## [1] "lstat" "rm"    "dis"  
## Number of terminal nodes:  8 
## Residual mean deviance:  12.65 = 3099 / 245 
## Distribution of residuals:
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -14.10000  -2.04200  -0.05357   0.00000   1.96000  12.60000

Note: in the context of a regression tree, the deviance is simply the sum of squared errors for the tree.

plot(tree_boston)
text(tree_boston,pretty = 0)

Cross-Validation

cv_boston<-cv.tree(tree_boston)
plot(cv_boston$size,cv_boston$dev,type='b')

Prune tree

prune_boston<-prune.tree(tree_boston,best=5)
plot(prune_boston)
text(prune_boston,pretty=0)

Predictions on the test set

yhat=predict(tree_boston,newdata = Boston[-train,])
boston_test=Boston[-train,"medv"]
plot(yhat,boston_test)
abline(0,1)

mean((yhat-boston_test)^2)
## [1] 25.04559

Bagging and Random Forests

set.seed(1)
bag_boston=randomForest(medv~.,data = Boston,subset = train,mtry=13,importance=TRUE)

Bagging Tree prediction on test set

yhat_bag=predict(bag_boston,newdata=Boston[-train,])
plot(yhat_bag,boston_test)
abline(0,1)

mean((yhat_bag-boston_test)^2)
## [1] 13.50808

The book notes that randomForest() uses p/3 variables at each split for regression trees and sqrt(p) for classification trees by default; here we grow a random forest with mtry = 6.

set.seed(1)
rf_boston=randomForest(medv~.,data = Boston,subset = train,mtry=6,importance=TRUE)
importance(rf_boston)
##           %IncMSE IncNodePurity
## crim    12.132320     986.50338
## zn       1.955579      57.96945
## indus    9.069302     882.78261
## chas     2.210835      45.22941
## nox     11.104823    1044.33776
## rm      31.784033    6359.31971
## age     10.962684     516.82969
## dis     15.015236    1224.11605
## rad      4.118011      95.94586
## tax      8.587932     502.96719
## ptratio 12.503896     830.77523
## black    6.702609     341.30361
## lstat   30.695224    7505.73936
varImpPlot(rf_boston)

Boosting

set.seed(1)
boost_boston=gbm(medv~.,data=Boston[train,],distribution = "gaussian",n.trees = 5000,interaction.depth = 4)
summary(boost_boston)

##             var    rel.inf
## lstat     lstat 37.0661275
## rm           rm 25.3533123
## dis         dis 11.7903016
## crim       crim  8.0388750
## black     black  4.2531659
## nox         nox  3.5058570
## age         age  3.4868724
## ptratio ptratio  2.2500385
## indus     indus  1.7725070
## tax         tax  1.1836592
## chas       chas  0.7441319
## rad         rad  0.4274311
## zn           zn  0.1277206
par(mfrow=c(1,2))
plot(boost_boston,i="rm")

plot(boost_boston,i="lstat")

boost_boston=gbm(medv~.,data=Boston[train,],n.trees = 5000,interaction.depth = 4,shrinkage = 0.2,verbose = F)
## Distribution not specified, assuming gaussian ...
yhat_boost=predict(boost_boston,newdata=Boston[-train,],n.trees = 5000)
mean((yhat_boost-boston_test)^2)
## [1] 11.51109

Support Vector Machines

set.seed(1)
x=matrix(rnorm(20*2),ncol=2)
y=c(rep(-1,10),rep(1,10))
x[y==1,]=x[y==1,]+1
plot(x,col=(3-y))

dat<-data.frame(x=x,y=as.factor(y))
svmfit=svm(y~.,data=dat,kernel="linear",cost=10,scale=FALSE)
plot(svmfit,dat)

The plot method for svm objects draws the (linear) decision boundary, shades the two predicted regions, and marks the support vectors as crosses and the remaining observations as circles.
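
A small sketch (not in the original output): the row indices of the support vectors can be listed directly via the index component of the fitted object.

svmfit$index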

summary(svmfit)
## 
## Call:
## svm(formula = y ~ ., data = dat, kernel = "linear", cost = 10, 
##     scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  10 
##       gamma:  0.5 
## 
## Number of Support Vectors:  7
## 
##  ( 4 3 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  -1 1
svmfit<-svm(y~.,data=dat,kernel="linear",cost=0.01,scale=FALSE)
plot(svmfit,dat)

Cross-validation: the tune() function performs ten-fold cross-validation over a supplied range of parameter values.

set.seed(1)
tune_out<-tune(svm,y~.,data=dat,kernel="linear",ranges=list(cost=c(0.001,0.01,0.1,1,5,10,100)))
summary(tune_out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##   0.1
## 
## - best performance: 0.1 
## 
## - Detailed performance results:
##    cost error dispersion
## 1 1e-03  0.70  0.4216370
## 2 1e-02  0.70  0.4216370
## 3 1e-01  0.10  0.2108185
## 4 1e+00  0.15  0.2415229
## 5 5e+00  0.15  0.2415229
## 6 1e+01  0.15  0.2415229
## 7 1e+02  0.15  0.2415229
bestmod=tune_out$best.model

Generating the test data

xtest=matrix(rnorm(20*2),ncol=2)
ytest=sample(c(-1,1),20,rep=TRUE)
xtest[ytest==1,]=xtest[ytest==1,]+1
testdat=data.frame(x=xtest,y=as.factor(ytest))

Predictions on test data

ypred=predict(bestmod,testdat)
table(predict=ypred,truth=testdat$y)
##        truth
## predict -1  1
##      -1 11  1
##      1   0  8
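
For comparison, the lab repeats the prediction with the much smaller cost of 0.01 (object names here are ours; the confusion matrix is not reproduced):

svmfit_small=svm(y~.,data=dat,kernel="linear",cost=0.01,scale=FALSE)
ypred_small=predict(svmfit_small,testdat)
table(predict=ypred_small,truth=testdat$y)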

We now move the two classes further apart so that they are nearly linearly separable.

x[y==1,]=x[y==1,]+0.5
plot(x,col=(y+5)/2,pch=19)

dat=data.frame(x=x,y=as.factor(y))
svmfit=svm(y~.,data=dat,kernel="linear",cost=1e5)
summary(svmfit)
## 
## Call:
## svm(formula = y ~ ., data = dat, kernel = "linear", cost = 1e+05)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1e+05 
##       gamma:  0.5 
## 
## Number of Support Vectors:  3
## 
##  ( 1 2 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  -1 1
plot(svmfit,dat)

With a smaller cost we allow more margin violations (misclassified or borderline observations), which gives a wider margin and more support vectors; the very large cost above produced a narrow margin with only three support vectors.

svmfit=svm(y~.,data=dat,kernel="linear",cost=1)
summary(svmfit)
## 
## Call:
## svm(formula = y ~ ., data = dat, kernel = "linear", cost = 1)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
##       gamma:  0.5 
## 
## Number of Support Vectors:  7
## 
##  ( 4 3 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  -1 1
plot(svmfit,dat)

set.seed(1)
x=matrix(rnorm(200*2),ncol=2)
x[1:100,]=x[1:100]+2
x[101:150,]=x[101:150]-2
y=c(rep(1,150),rep(2,50))
dat=data.frame(x=x,y=as.factor(y))
plot(x,col=y)

train=sample(200,100)
svmfit=svm(y~.,data=dat[train,],kernel="radial",gamma=1,cost=1)
plot(svmfit,dat[train,])

summary(svmfit)
## 
## Call:
## svm(formula = y ~ ., data = dat[train, ], kernel = "radial", 
##     gamma = 1, cost = 1)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  1 
## 
## Number of Support Vectors:  41
## 
##  ( 20 21 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  1 2

Increasing the value of the cost parameter

svmfit<-svm(y~.,data=dat[train,],kernel="radial",gamma=1,cost=1e5)
plot(svmfit,dat[train,])

Tuning over cost and gamma

set.seed(1)

tune_out=tune(svm,y~.,data=dat[train,],kernel="radial",ranges = list(cost=c(0.1,1,10,100,1000),gamma=c(0.5,1,2,3,4)))

summary(tune_out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##  1000     2
## 
## - best performance: 0.05 
## 
## - Detailed performance results:
##     cost gamma error dispersion
## 1  1e-01   0.5  0.27 0.11595018
## 2  1e+00   0.5  0.11 0.05676462
## 3  1e+01   0.5  0.13 0.04830459
## 4  1e+02   0.5  0.12 0.12292726
## 5  1e+03   0.5  0.09 0.07378648
## 6  1e-01   1.0  0.27 0.11595018
## 7  1e+00   1.0  0.13 0.04830459
## 8  1e+01   1.0  0.15 0.10801234
## 9  1e+02   1.0  0.10 0.09428090
## 10 1e+03   1.0  0.09 0.05676462
## 11 1e-01   2.0  0.27 0.11595018
## 12 1e+00   2.0  0.16 0.06992059
## 13 1e+01   2.0  0.16 0.09660918
## 14 1e+02   2.0  0.12 0.07888106
## 15 1e+03   2.0  0.05 0.05270463
## 16 1e-01   3.0  0.27 0.11595018
## 17 1e+00   3.0  0.16 0.06992059
## 18 1e+01   3.0  0.14 0.06992059
## 19 1e+02   3.0  0.12 0.07888106
## 20 1e+03   3.0  0.07 0.06749486
## 21 1e-01   4.0  0.27 0.11595018
## 22 1e+00   4.0  0.16 0.09660918
## 23 1e+01   4.0  0.14 0.06992059
## 24 1e+02   4.0  0.10 0.08164966
## 25 1e+03   4.0  0.10 0.08164966
table(true=dat[-train,"y"],pred=predict(tune_out$best.model,newdata=dat[-train,]))
##     pred
## true  1  2
##    1 55 22
##    2 19  4

ROC Curves

# Helper to plot an ROC curve with the ROCR package: build a prediction
# object from the fitted decision values and the true labels, then plot
# the true positive rate against the false positive rate.
rocplot=function(pred,truth,...){
  predob=prediction(pred,truth)
  perf=performance(predob,"tpr","fpr")
  plot(perf,...)
}
svmfit_opt=svm(y~.,data = dat[train,],kernel="radial",gamma=2,cost=1,decision.values=T)
fitted=attributes(predict(svmfit_opt,dat[train,],decision.values=TRUE))$decision.values

par(mfrow=c(1,2))
rocplot(fitted,dat[train,"y"],main="Training Data")


svmfit_flex=svm(y~.,data=dat[train,],kernel="radial",gamma=50,cost=1,decision.values=T)
fitted=attributes(predict(svmfit_flex,dat[train,],decision.values=TRUE))$decision.values
rocplot(fitted,dat[train,"y"],add=T,col="red")

Predictions on the test set

par(mfrow=c(1,2))

fitted=attributes(predict(svmfit_opt,dat[-train,],decision.values=TRUE))$decision.values
rocplot(fitted,dat[-train,"y"],main="Test Data")


fitted=attributes(predict(svmfit_flex,dat[-train,],decision.values=TRUE))$decision.values
rocplot(fitted,dat[-train,"y"],add=T,col="red")

SVM with multiple classes

When the response has more than two levels, svm() uses the one-versus-one approach.

Generating the data

set.seed(1)
x=rbind(x,matrix(rnorm(50*2),ncol=2))
y=c(y,rep(0,50))

x[y==0,2]=x[y==0,2]+2
dat=data.frame(x=x,y=as.factor(y))
par(mfrow=c(1,1))
plot(x,col=(y+1))

svmfit=svm(y~.,data=dat,kernel="radial",cost=10,gamma=1)
plot(svmfit,dat)

Application to the Gene expression data

names(Khan)
## [1] "xtrain" "xtest"  "ytrain" "ytest"

Principal Component Analysis

states=row.names(USArrests)
states
##  [1] "Alabama"        "Alaska"         "Arizona"        "Arkansas"      
##  [5] "California"     "Colorado"       "Connecticut"    "Delaware"      
##  [9] "Florida"        "Georgia"        "Hawaii"         "Idaho"         
## [13] "Illinois"       "Indiana"        "Iowa"           "Kansas"        
## [17] "Kentucky"       "Louisiana"      "Maine"          "Maryland"      
## [21] "Massachusetts"  "Michigan"       "Minnesota"      "Mississippi"   
## [25] "Missouri"       "Montana"        "Nebraska"       "Nevada"        
## [29] "New Hampshire"  "New Jersey"     "New Mexico"     "New York"      
## [33] "North Carolina" "North Dakota"   "Ohio"           "Oklahoma"      
## [37] "Oregon"         "Pennsylvania"   "Rhode Island"   "South Carolina"
## [41] "South Dakota"   "Tennessee"      "Texas"          "Utah"          
## [45] "Vermont"        "Virginia"       "Washington"     "West Virginia" 
## [49] "Wisconsin"      "Wyoming"
names(USArrests)
## [1] "Murder"   "Assault"  "UrbanPop" "Rape"
apply(USArrests, 2, mean)
##   Murder  Assault UrbanPop     Rape 
##    7.788  170.760   65.540   21.232
pr_out<-prcomp(USArrests,scale. = TRUE)
names(pr_out)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
pr_out$center
##   Murder  Assault UrbanPop     Rape 
##    7.788  170.760   65.540   21.232
pr_out$scale
##    Murder   Assault  UrbanPop      Rape 
##  4.355510 83.337661 14.474763  9.366385
pr_out$rotation
##                 PC1        PC2        PC3         PC4
## Murder   -0.5358995  0.4181809 -0.3412327  0.64922780
## Assault  -0.5831836  0.1879856 -0.2681484 -0.74340748
## UrbanPop -0.2781909 -0.8728062 -0.3780158  0.13387773
## Rape     -0.5434321 -0.1673186  0.8177779  0.08902432
pr_out$x
##                        PC1         PC2         PC3          PC4
## Alabama        -0.97566045  1.12200121 -0.43980366  0.154696581
## Alaska         -1.93053788  1.06242692  2.01950027 -0.434175454
## Arizona        -1.74544285 -0.73845954  0.05423025 -0.826264240
## Arkansas        0.13999894  1.10854226  0.11342217 -0.180973554
## California     -2.49861285 -1.52742672  0.59254100 -0.338559240
## Colorado       -1.49934074 -0.97762966  1.08400162  0.001450164
## Connecticut     1.34499236 -1.07798362 -0.63679250 -0.117278736
## Delaware       -0.04722981 -0.32208890 -0.71141032 -0.873113315
## Florida        -2.98275967  0.03883425 -0.57103206 -0.095317042
## Georgia        -1.62280742  1.26608838 -0.33901818  1.065974459
## Hawaii          0.90348448 -1.55467609  0.05027151  0.893733198
## Idaho           1.62331903  0.20885253  0.25719021 -0.494087852
## Illinois       -1.36505197 -0.67498834 -0.67068647 -0.120794916
## Indiana         0.50038122 -0.15003926  0.22576277  0.420397595
## Iowa            2.23099579 -0.10300828  0.16291036  0.017379470
## Kansas          0.78887206 -0.26744941  0.02529648  0.204421034
## Kentucky        0.74331256  0.94880748 -0.02808429  0.663817237
## Louisiana      -1.54909076  0.86230011 -0.77560598  0.450157791
## Maine           2.37274014  0.37260865 -0.06502225 -0.327138529
## Maryland       -1.74564663  0.42335704 -0.15566968 -0.553450589
## Massachusetts   0.48128007 -1.45967706 -0.60337172 -0.177793902
## Michigan       -2.08725025 -0.15383500  0.38100046  0.101343128
## Minnesota       1.67566951 -0.62590670  0.15153200  0.066640316
## Mississippi    -0.98647919  2.36973712 -0.73336290  0.213342049
## Missouri       -0.68978426 -0.26070794  0.37365033  0.223554811
## Montana         1.17353751  0.53147851  0.24440796  0.122498555
## Nebraska        1.25291625 -0.19200440  0.17380930  0.015733156
## Nevada         -2.84550542 -0.76780502  1.15168793  0.311354436
## New Hampshire   2.35995585 -0.01790055  0.03648498 -0.032804291
## New Jersey     -0.17974128 -1.43493745 -0.75677041  0.240936580
## New Mexico     -1.96012351  0.14141308  0.18184598 -0.336121113
## New York       -1.66566662 -0.81491072 -0.63661186 -0.013348844
## North Carolina -1.11208808  2.20561081 -0.85489245 -0.944789648
## North Dakota    2.96215223  0.59309738  0.29824930 -0.251434626
## Ohio            0.22369436 -0.73477837 -0.03082616  0.469152817
## Oklahoma        0.30864928 -0.28496113 -0.01515592  0.010228476
## Oregon         -0.05852787 -0.53596999  0.93038718 -0.235390872
## Pennsylvania    0.87948680 -0.56536050 -0.39660218  0.355452378
## Rhode Island    0.85509072 -1.47698328 -1.35617705 -0.607402746
## South Carolina -1.30744986  1.91397297 -0.29751723 -0.130145378
## South Dakota    1.96779669  0.81506822  0.38538073 -0.108470512
## Tennessee      -0.98969377  0.85160534  0.18619262  0.646302674
## Texas          -1.34151838 -0.40833518 -0.48712332  0.636731051
## Utah            0.54503180 -1.45671524  0.29077592 -0.081486749
## Vermont         2.77325613  1.38819435  0.83280797 -0.143433697
## Virginia        0.09536670  0.19772785  0.01159482  0.209246429
## Washington      0.21472339 -0.96037394  0.61859067 -0.218628161
## West Virginia   2.08739306  1.41052627  0.10372163  0.130583080
## Wisconsin       2.05881199 -0.60512507 -0.13746933  0.182253407
## Wyoming         0.62310061  0.31778662 -0.23824049 -0.164976866

The x matrix contains the principal component scores, i.e. each observation's coordinates along the principal components.

biplot(pr_out,scale=0)
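
The signs of the principal components are arbitrary; the book flips them so that the biplot matches its figure. A sketch (note that it modifies pr_out in place):

pr_out$rotation=-pr_out$rotation
pr_out$x=-pr_out$x
biplot(pr_out,scale=0)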

We now calculate the proportion of variance explained by each principal component.

pr_out$sdev
## [1] 1.5748783 0.9948694 0.5971291 0.4164494
pr_var=pr_out$sdev^2
pr_var
## [1] 2.4802416 0.9897652 0.3565632 0.1734301
pve=pr_var/sum(pr_var)
plot(pve,xlab="Principal Component",ylab="Proportion of Variance Explained",ylim=c(0,1),type="b")
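
A companion plot of the cumulative proportion of variance explained can be drawn in the same way:

plot(cumsum(pve),xlab="Principal Component",ylab="Cumulative Proportion of Variance Explained",ylim=c(0,1),type="b")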

K-Means clustering

Generating the data

set.seed(2)
x=matrix(rnorm(50*2),ncol=2)
x[1:25,1]=x[1:25,1]+3
x[1:25,2]=x[1:25,2]-4

K-means clustering with K=2

km_out=kmeans(x,2,nstart=20)
plot(x,col=(km_out$cluster+1),main="K-Means Clustering with K=2",pch=20,cex=2)

We generated the data, so we know there are two groups. Let us see what would have happened if we had started with K = 3.

set.seed(4)
km_out=kmeans(x,3,nstart=20)
km_out
## K-means clustering with 3 clusters of sizes 10, 23, 17
## 
## Cluster means:
##         [,1]        [,2]
## 1  2.3001545 -2.69622023
## 2 -0.3820397 -0.08740753
## 3  3.7789567 -4.56200798
## 
## Clustering vector:
##  [1] 3 1 3 1 3 3 3 1 3 1 3 1 3 1 3 1 3 3 3 3 3 1 3 3 3 2 2 2 2 2 2 2 2 2 2
## [36] 2 2 2 2 2 2 2 2 1 2 1 2 2 2 2
## 
## Within cluster sum of squares by cluster:
## [1] 19.56137 52.67700 25.74089
##  (between_SS / total_SS =  79.3 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
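
The book also stresses using a large nstart: with nstart = 1 the algorithm can settle in a poorer local optimum, which shows up as a larger total within-cluster sum of squares (values not reproduced here):

set.seed(3)
km_out=kmeans(x,3,nstart=1)
km_out$tot.withinss
km_out=kmeans(x,3,nstart=20)
km_out$tot.withinss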

Hierarchical clustering

hc_average<-hclust(dist(x),method = 'average')
hc_single<-hclust(dist(x),method = 'single')
hc_complete<-hclust(dist(x),method = 'complete')
par(mfrow=c(1,3))
plot(hc_complete,cex=0.9)
plot(hc_average,cex=0.9)
plot(hc_single,cex=0.9)

Labelling

cutree(hc_complete,2)
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2
## [36] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
cutree(hc_average,2)
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 1 2 2
## [36] 2 2 2 2 2 2 2 2 1 2 1 2 2 2 2
cutree(hc_single,2)
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
cutree(hc_single,4)
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3
## [36] 3 3 3 3 3 3 4 3 3 3 3 3 3 3 3
xsc=scale(x)
plot(hclust(dist(xsc)))

Besides Euclidean distance, we can also use a correlation-based distance.

x=matrix(rnorm(30*3),ncol=3)
dd=as.dist(1-cor(t(x)))
plot(hclust(dd,method='complete'),main="Complete Linkage with Correlation-Based Distance",xlab="",ylab="")

NCI60 Data

nci_labs=NCI60$labs
nci_data=NCI60$data

The data has 64 rows (cell lines) and 6,830 columns (gene expression measurements), so p is far larger than n.
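
The dimensions can be confirmed directly:

dim(nci_data)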

PCA on the NCI60 data

pr_out=prcomp(nci_data,scale=TRUE)
Cols=function(vec)
{
  cols=rainbow(length(unique(vec)))
  return(cols[as.numeric(as.factor(vec))])
}

Plotting the first two principal components

par(mfrow=c(1,2))

plot(pr_out$x[,1:2],col=Cols(nci_labs),pch=19,xlab="Z1",ylab="Z2")
plot(pr_out$x[,c(1,3)],col=Cols(nci_labs),pch=19,xlab="Z1",ylab="Z3")

As the book notes, this indicates that cell lines from the same cancer type tend to have fairly similar gene expression levels.

summary(pr_out)
## Importance of components:
##                            PC1      PC2      PC3      PC4      PC5
## Standard deviation     27.8535 21.48136 19.82046 17.03256 15.97181
## Proportion of Variance  0.1136  0.06756  0.05752  0.04248  0.03735
## Cumulative Proportion   0.1136  0.18115  0.23867  0.28115  0.31850
##                             PC6      PC7      PC8      PC9     PC10
## Standard deviation     15.72108 14.47145 13.54427 13.14400 12.73860
## Proportion of Variance  0.03619  0.03066  0.02686  0.02529  0.02376
## Cumulative Proportion   0.35468  0.38534  0.41220  0.43750  0.46126
##                            PC11     PC12     PC13     PC14     PC15
## Standard deviation     12.68672 12.15769 11.83019 11.62554 11.43779
## Proportion of Variance  0.02357  0.02164  0.02049  0.01979  0.01915
## Cumulative Proportion   0.48482  0.50646  0.52695  0.54674  0.56590
##                            PC16     PC17     PC18     PC19    PC20
## Standard deviation     11.00051 10.65666 10.48880 10.43518 10.3219
## Proportion of Variance  0.01772  0.01663  0.01611  0.01594  0.0156
## Cumulative Proportion   0.58361  0.60024  0.61635  0.63229  0.6479
##                            PC21    PC22    PC23    PC24    PC25    PC26
## Standard deviation     10.14608 10.0544 9.90265 9.64766 9.50764 9.33253
## Proportion of Variance  0.01507  0.0148 0.01436 0.01363 0.01324 0.01275
## Cumulative Proportion   0.66296  0.6778 0.69212 0.70575 0.71899 0.73174
##                           PC27   PC28    PC29    PC30    PC31    PC32
## Standard deviation     9.27320 9.0900 8.98117 8.75003 8.59962 8.44738
## Proportion of Variance 0.01259 0.0121 0.01181 0.01121 0.01083 0.01045
## Cumulative Proportion  0.74433 0.7564 0.76824 0.77945 0.79027 0.80072
##                           PC33    PC34    PC35    PC36    PC37    PC38
## Standard deviation     8.37305 8.21579 8.15731 7.97465 7.90446 7.82127
## Proportion of Variance 0.01026 0.00988 0.00974 0.00931 0.00915 0.00896
## Cumulative Proportion  0.81099 0.82087 0.83061 0.83992 0.84907 0.85803
##                           PC39    PC40    PC41   PC42    PC43   PC44
## Standard deviation     7.72156 7.58603 7.45619 7.3444 7.10449 7.0131
## Proportion of Variance 0.00873 0.00843 0.00814 0.0079 0.00739 0.0072
## Cumulative Proportion  0.86676 0.87518 0.88332 0.8912 0.89861 0.9058
##                           PC45   PC46    PC47    PC48    PC49    PC50
## Standard deviation     6.95839 6.8663 6.80744 6.64763 6.61607 6.40793
## Proportion of Variance 0.00709 0.0069 0.00678 0.00647 0.00641 0.00601
## Cumulative Proportion  0.91290 0.9198 0.92659 0.93306 0.93947 0.94548
##                           PC51    PC52    PC53    PC54    PC55    PC56
## Standard deviation     6.21984 6.20326 6.06706 5.91805 5.91233 5.73539
## Proportion of Variance 0.00566 0.00563 0.00539 0.00513 0.00512 0.00482
## Cumulative Proportion  0.95114 0.95678 0.96216 0.96729 0.97241 0.97723
##                           PC57   PC58    PC59    PC60    PC61    PC62
## Standard deviation     5.47261 5.2921 5.02117 4.68398 4.17567 4.08212
## Proportion of Variance 0.00438 0.0041 0.00369 0.00321 0.00255 0.00244
## Cumulative Proportion  0.98161 0.9857 0.98940 0.99262 0.99517 0.99761
##                           PC63      PC64
## Standard deviation     4.04124 2.148e-14
## Proportion of Variance 0.00239 0.000e+00
## Cumulative Proportion  1.00000 1.000e+00
plot(pr_out)

Scree plot: pve below stands for the proportion of variance explained by each component.

pve=100*pr_out$sdev^2/sum(pr_out$sdev^2)
par(mfrow=c(1,2))

plot(pve,type="o",ylab="PVE",xlab="Principal Component",col="blue")
plot(cumsum(pve),type="o",ylab="Cumulative PVE",xlab="Principal Component",col="brown3")

Hierarchical clustering

sd_data=scale(nci_data)
par(mfrow=c(1,3))
data_dist=dist(sd_data)

plot(hclust(data_dist),labels = nci_labs,main = "Complete Linkage",xlab = "",sub="",ylab="")
plot(hclust(data_dist,method="average"),labels = nci_labs,main = "Average Linkage",xlab = "",sub="",ylab="")
plot(hclust(data_dist,method="single"),labels = nci_labs,main = "Single Linkage",xlab = "",sub="",ylab="")

hc_out=hclust(dist(sd_data))
hc_clusters=cutree(hc_out,4)

table(hc_clusters,nci_labs)
##            nci_labs
## hc_clusters BREAST CNS COLON K562A-repro K562B-repro LEUKEMIA MCF7A-repro
##           1      2   3     2           0           0        0           0
##           2      3   2     0           0           0        0           0
##           3      0   0     0           1           1        6           0
##           4      2   0     5           0           0        0           1
##            nci_labs
## hc_clusters MCF7D-repro MELANOMA NSCLC OVARIAN PROSTATE RENAL UNKNOWN
##           1           0        8     8       6        2     8       1
##           2           0        0     1       0        0     1       0
##           3           0        0     0       0        0     0       0
##           4           1        0     0       0        0     0       0
par(mfrow=c(1,1))
plot(hc_out,labels = nci_labs)
abline(h=139,col="red")

At height 139 the dendrogram is cut into four clusters. The book specifically mentions that performing K-means clustering with K = 4 is not the same as cutting the dendrogram at our chosen height; the resulting clusters can differ.

set.seed(2)
km_out=kmeans(sd_data,4,nstart=20)
km_clusters=km_out$cluster
table(km_clusters,hc_clusters)
##            hc_clusters
## km_clusters  1  2  3  4
##           1 11  0  0  9
##           2  0  0  8  0
##           3  9  0  0  0
##           4 20  7  0  0

Another important point from the book: we can denoise the data by clustering on only the first few principal component score vectors.

hc_out=hclust(dist(pr_out$x[,1:5]))
plot(hc_out,labels=nci_labs,main="Hier. Clust. on First Five score vectors")

table(cutree(hc_out,4),nci_labs)
##    nci_labs
##     BREAST CNS COLON K562A-repro K562B-repro LEUKEMIA MCF7A-repro
##   1      0   2     7           0           0        2           0
##   2      5   3     0           0           0        0           0
##   3      0   0     0           1           1        4           0
##   4      2   0     0           0           0        0           1
##    nci_labs
##     MCF7D-repro MELANOMA NSCLC OVARIAN PROSTATE RENAL UNKNOWN
##   1           0        1     8       5        2     7       0
##   2           0        7     1       1        0     2       1
##   3           0        0     0       0        0     0       0
##   4           1        0     0       0        0     0       0