Simple Linear Regression

library("ISLR")
library("car")
library("MASS")
library("leaps")
library("glmnet")
library("pls")
library("splines")
library("gam")
library("akima")
library("tree")
library("randomForest")
library("gbm")
library("e1071")
library("ROCR")

lm_fit<-lm(medv~lstat,data=Boston)
summary(lm_fit)
## 
## Call:
## lm(formula = medv ~ lstat, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.168  -3.990  -1.318   2.034  24.500 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 34.55384    0.56263   61.41   <2e-16 ***
## lstat       -0.95005    0.03873  -24.53   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared:  0.5441, Adjusted R-squared:  0.5432 
## F-statistic: 601.6 on 1 and 504 DF,  p-value: < 2.2e-16
attach(Boston)
plot(lstat,medv)
abline(lm_fit)

par(mfrow=c(2,2))
plot(lm_fit)

Multiple Linear Regression

lm_fit<-lm(medv~lstat+age,data=Boston)
summary(lm_fit)
## 
## Call:
## lm(formula = medv ~ lstat + age, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.981  -3.978  -1.283   1.968  23.158 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 33.22276    0.73085  45.458  < 2e-16 ***
## lstat       -1.03207    0.04819 -21.416  < 2e-16 ***
## age          0.03454    0.01223   2.826  0.00491 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.173 on 503 degrees of freedom
## Multiple R-squared:  0.5513, Adjusted R-squared:  0.5495 
## F-statistic:   309 on 2 and 503 DF,  p-value: < 2.2e-16
lm_fit<-lm(medv~.,data=Boston)
summary(lm_fit)
## 
## Call:
## lm(formula = medv ~ ., data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.595  -2.730  -0.518   1.777  26.199 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.646e+01  5.103e+00   7.144 3.28e-12 ***
## crim        -1.080e-01  3.286e-02  -3.287 0.001087 ** 
## zn           4.642e-02  1.373e-02   3.382 0.000778 ***
## indus        2.056e-02  6.150e-02   0.334 0.738288    
## chas         2.687e+00  8.616e-01   3.118 0.001925 ** 
## nox         -1.777e+01  3.820e+00  -4.651 4.25e-06 ***
## rm           3.810e+00  4.179e-01   9.116  < 2e-16 ***
## age          6.922e-04  1.321e-02   0.052 0.958229    
## dis         -1.476e+00  1.995e-01  -7.398 6.01e-13 ***
## rad          3.060e-01  6.635e-02   4.613 5.07e-06 ***
## tax         -1.233e-02  3.760e-03  -3.280 0.001112 ** 
## ptratio     -9.527e-01  1.308e-01  -7.283 1.31e-12 ***
## black        9.312e-03  2.686e-03   3.467 0.000573 ***
## lstat       -5.248e-01  5.072e-02 -10.347  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.745 on 492 degrees of freedom
## Multiple R-squared:  0.7406, Adjusted R-squared:  0.7338 
## F-statistic: 108.1 on 13 and 492 DF,  p-value: < 2.2e-16
vif(lm_fit)
##     crim       zn    indus     chas      nox       rm      age      dis 
## 1.792192 2.298758 3.991596 1.073995 4.393720 1.933744 3.100826 3.955945 
##      rad      tax  ptratio    black    lstat 
## 7.484496 9.008554 1.799084 1.348521 2.941491

Non-Linear Transformations

lm_fit1<-lm(medv~lstat+I(lstat^2),data=Boston)
summary(lm_fit1)
plot(lm_fit1)
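
A sketch of the usual follow-up (lm_fit0 is a throwaway name for the plain linear fit, since lm_fit has been overwritten above): anova() tests whether the quadratic term improves on the simple linear model.

lm_fit0<-lm(medv~lstat,data=Boston)
anova(lm_fit0,lm_fit1)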

names(Carseats)
##  [1] "Sales"       "CompPrice"   "Income"      "Advertising" "Population" 
##  [6] "Price"       "ShelveLoc"   "Age"         "Education"   "Urban"      
## [11] "US"
lm_fit=lm(Sales~.+Income:Advertising+Price:Age,data=Carseats)
summary(lm_fit)
## 
## Call:
## lm(formula = Sales ~ . + Income:Advertising + Price:Age, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.9208 -0.7503  0.0177  0.6754  3.3413 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         6.5755654  1.0087470   6.519 2.22e-10 ***
## CompPrice           0.0929371  0.0041183  22.567  < 2e-16 ***
## Income              0.0108940  0.0026044   4.183 3.57e-05 ***
## Advertising         0.0702462  0.0226091   3.107 0.002030 ** 
## Population          0.0001592  0.0003679   0.433 0.665330    
## Price              -0.1008064  0.0074399 -13.549  < 2e-16 ***
## ShelveLocGood       4.8486762  0.1528378  31.724  < 2e-16 ***
## ShelveLocMedium     1.9532620  0.1257682  15.531  < 2e-16 ***
## Age                -0.0579466  0.0159506  -3.633 0.000318 ***
## Education          -0.0208525  0.0196131  -1.063 0.288361    
## UrbanYes            0.1401597  0.1124019   1.247 0.213171    
## USYes              -0.1575571  0.1489234  -1.058 0.290729    
## Income:Advertising  0.0007510  0.0002784   2.698 0.007290 ** 
## Price:Age           0.0001068  0.0001333   0.801 0.423812    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.011 on 386 degrees of freedom
## Multiple R-squared:  0.8761, Adjusted R-squared:  0.8719 
## F-statistic:   210 on 13 and 386 DF,  p-value: < 2.2e-16
names(Smarket)
## [1] "Year"      "Lag1"      "Lag2"      "Lag3"      "Lag4"      "Lag5"     
## [7] "Volume"    "Today"     "Direction"
cor(Smarket[,-9])
##              Year         Lag1         Lag2         Lag3         Lag4
## Year   1.00000000  0.029699649  0.030596422  0.033194581  0.035688718
## Lag1   0.02969965  1.000000000 -0.026294328 -0.010803402 -0.002985911
## Lag2   0.03059642 -0.026294328  1.000000000 -0.025896670 -0.010853533
## Lag3   0.03319458 -0.010803402 -0.025896670  1.000000000 -0.024051036
## Lag4   0.03568872 -0.002985911 -0.010853533 -0.024051036  1.000000000
## Lag5   0.02978799 -0.005674606 -0.003557949 -0.018808338 -0.027083641
## Volume 0.53900647  0.040909908 -0.043383215 -0.041823686 -0.048414246
## Today  0.03009523 -0.026155045 -0.010250033 -0.002447647 -0.006899527
##                Lag5      Volume        Today
## Year    0.029787995  0.53900647  0.030095229
## Lag1   -0.005674606  0.04090991 -0.026155045
## Lag2   -0.003557949 -0.04338321 -0.010250033
## Lag3   -0.018808338 -0.04182369 -0.002447647
## Lag4   -0.027083641 -0.04841425 -0.006899527
## Lag5    1.000000000 -0.02200231 -0.034860083
## Volume -0.022002315  1.00000000  0.014591823
## Today  -0.034860083  0.01459182  1.000000000

Logistic Regression

Fitting a logistic regression model to predict the direction of the market (Up or Down) from the lag returns and trading volume.

attach(Smarket)
glm_fit<-glm(Direction~.-Year-Direction-Today,data = Smarket,family = "binomial")
summary(glm_fit)
## 
## Call:
## glm(formula = Direction ~ . - Year - Direction - Today, family = "binomial", 
##     data = Smarket)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.446  -1.203   1.065   1.145   1.326  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.126000   0.240736  -0.523    0.601
## Lag1        -0.073074   0.050167  -1.457    0.145
## Lag2        -0.042301   0.050086  -0.845    0.398
## Lag3         0.011085   0.049939   0.222    0.824
## Lag4         0.009359   0.049974   0.187    0.851
## Lag5         0.010313   0.049511   0.208    0.835
## Volume       0.135441   0.158360   0.855    0.392
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1731.2  on 1249  degrees of freedom
## Residual deviance: 1727.6  on 1243  degrees of freedom
## AIC: 1741.6
## 
## Number of Fisher Scoring iterations: 3
glm_probs<-predict(glm_fit,type="response")
glm_probs[1:10]
##         1         2         3         4         5         6         7 
## 0.5070841 0.4814679 0.4811388 0.5152224 0.5107812 0.5069565 0.4926509 
##         8         9        10 
## 0.5092292 0.5176135 0.4888378
contrasts(Direction)
##      Up
## Down  0
## Up    1
glm_pred<-rep("Down",1250)
glm_pred[glm_probs>0.5]="Up"
sum(glm_pred==Direction)
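
A minimal sketch (not part of the original output): tabulating the training-set predictions against the observed direction gives the full confusion matrix behind the count above.

table(glm_pred,Direction)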

Linear Discriminant Analysis

Using only the observations before 2005 for training; 2005 is held out as a test set.

train<-(Year<2005)
Smarket.2005<-Smarket[!train,]
Direction.2005<-Direction[!train]

lda_fit<-lda(Direction~Lag1+Lag2,data=Smarket,subset = train)
lda_fit
## Call:
## lda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
## 
## Prior probabilities of groups:
##     Down       Up 
## 0.491984 0.508016 
## 
## Group means:
##             Lag1        Lag2
## Down  0.04279022  0.03389409
## Up   -0.03954635 -0.03132544
## 
## Coefficients of linear discriminants:
##             LD1
## Lag1 -0.6420190
## Lag2 -0.5135293
lda_pred<-predict(lda_fit,Smarket.2005)
names(lda_pred)
## [1] "class"     "posterior" "x"
lda_class<-lda_pred$class
table(lda_class,Direction.2005)
##          Direction.2005
## lda_class Down  Up
##      Down   35  35
##      Up     76 106
head(lda_pred$posterior)
##           Down        Up
## 999  0.4901792 0.5098208
## 1000 0.4792185 0.5207815
## 1001 0.4668185 0.5331815
## 1002 0.4740011 0.5259989
## 1003 0.4927877 0.5072123
## 1004 0.4938562 0.5061438
lda_pred$posterior[1:20,1]
##       999      1000      1001      1002      1003      1004      1005 
## 0.4901792 0.4792185 0.4668185 0.4740011 0.4927877 0.4938562 0.4951016 
##      1006      1007      1008      1009      1010      1011      1012 
## 0.4872861 0.4907013 0.4844026 0.4906963 0.5119988 0.4895152 0.4706761 
##      1013      1014      1015      1016      1017      1018 
## 0.4744593 0.4799583 0.4935775 0.5030894 0.4978806 0.4886331
lda_class[1:20]
##  [1] Up   Up   Up   Up   Up   Up   Up   Up   Up   Up   Up   Down Up   Up  
## [15] Up   Up   Up   Down Up   Up  
## Levels: Down Up

Quadratic Discriminant Analysis

qda_fit<-qda(Direction~Lag1+Lag2,data = Smarket,subset = train)
qda_fit
## Call:
## qda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
## 
## Prior probabilities of groups:
##     Down       Up 
## 0.491984 0.508016 
## 
## Group means:
##             Lag1        Lag2
## Down  0.04279022  0.03389409
## Up   -0.03954635 -0.03132544
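
A short sketch following the same pattern as for LDA above (qda_class is a name introduced here): predict on the 2005 observations and compare against the true direction.

qda_class<-predict(qda_fit,Smarket.2005)$class
table(qda_class,Direction.2005)
mean(qda_class==Direction.2005)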

KNN

library("class")
train_X=cbind(Lag1,Lag2)[train,]
test_X=cbind(Lag1,Lag2)[!train,]
train_Direction=Direction[train]
set.seed(1)
knn_pred<-knn(train_X,test_X,train_Direction,k=3)
table(knn_pred,Direction.2005)
##         Direction.2005
## knn_pred Down Up
##     Down   48 55
##     Up     63 86
attach(Caravan)
standardized_X=scale(Caravan[,-86])

test=1:1000
train_x=standardized_X[-test,]
test_X=standardized_X[test,]

train_Y=Purchase[-test]
test_Y=Purchase[test]
set.seed(1)

knn_pred=knn(train_x,test_X,train_Y,k=1)
mean(test_Y!=knn_pred)
## [1] 0.118
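
The raw error rate is somewhat flattering here, because only a small fraction of customers purchase insurance, so always predicting "No" would do nearly as well; a quick sketch for comparison.

mean(test_Y!="No")
table(knn_pred,test_Y)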

Sampling and Bootstrapping Methods

set.seed(1)
train=sample(392,196)
attach(Auto)
lm_fit<-lm(mpg~horsepower,data=Auto,subset = train)
mean((mpg- predict(lm_fit,Auto))[-train]^2)
## [1] 26.14142
library("boot")
## 
## Attaching package: 'boot'
## The following object is masked from 'package:car':
## 
##     logit
glm_fit<-glm(mpg~horsepower,data = Auto)
cv.err=cv.glm(Auto,glm_fit)
cv.err$K
## [1] 392
cv.error=rep(0,5)

for(i in 1:5){
  glm_fit<-glm(mpg~poly(horsepower,i),data = Auto)
  cv.error[i]=cv.glm(Auto,glm_fit)$delta[1]
}

cv.error
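
A related sketch (cv_error_10 is a name introduced here and the seed is arbitrary): passing K = 10 to cv.glm() gives 10-fold cross-validation, which is far cheaper than LOOCV and typically yields similar error estimates.

set.seed(17)
cv_error_10=rep(0,5)
for(i in 1:5){
  glm_fit<-glm(mpg~poly(horsepower,i),data = Auto)
  cv_error_10[i]=cv.glm(Auto,glm_fit,K=10)$delta[1]
}
cv_error_10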

Bootstrap

alpha_fn=function(data,index){
  X=data$X[index]
  Y=data$Y[index]
  return((var(Y)-cov(X,Y))/(var(X)+var(Y)-2*cov(X,Y)))
}
alpha_fn(Portfolio,1:100)
## [1] 0.5758321
boot(Portfolio,alpha_fn,R=1000)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Portfolio, statistic = alpha_fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##      original     bias    std. error
## t1* 0.5758321 0.00107223  0.08860155
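
The same machinery can estimate the standard errors of regression coefficients; a short sketch using the Auto data attached earlier (boot_fn is a name introduced here, not part of the original code).

boot_fn<-function(data,index){
  coef(lm(mpg~horsepower,data=data,subset=index))
}
boot(Auto,boot_fn,R=1000)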

Best Subset Selection Methods

Hitters=na.omit(Hitters)
regfit_full=regsubsets(Salary~.,Hitters)
summary(regfit_full)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., Hitters)
## 19 Variables  (and intercept)
##            Forced in Forced out
## AtBat          FALSE      FALSE
## Hits           FALSE      FALSE
## HmRun          FALSE      FALSE
## Runs           FALSE      FALSE
## RBI            FALSE      FALSE
## Walks          FALSE      FALSE
## Years          FALSE      FALSE
## CAtBat         FALSE      FALSE
## CHits          FALSE      FALSE
## CHmRun         FALSE      FALSE
## CRuns          FALSE      FALSE
## CRBI           FALSE      FALSE
## CWalks         FALSE      FALSE
## LeagueN        FALSE      FALSE
## DivisionW      FALSE      FALSE
## PutOuts        FALSE      FALSE
## Assists        FALSE      FALSE
## Errors         FALSE      FALSE
## NewLeagueN     FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
##          AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns
## 1  ( 1 ) " "   " "  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 2  ( 1 ) " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 3  ( 1 ) " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 4  ( 1 ) " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 5  ( 1 ) "*"   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 6  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    " "  
## 7  ( 1 ) " "   "*"  " "   " "  " " "*"   " "   "*"    "*"   "*"    " "  
## 8  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   " "    " "   "*"    "*"  
##          CRBI CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1  ( 1 ) "*"  " "    " "     " "       " "     " "     " "    " "       
## 2  ( 1 ) "*"  " "    " "     " "       " "     " "     " "    " "       
## 3  ( 1 ) "*"  " "    " "     " "       "*"     " "     " "    " "       
## 4  ( 1 ) "*"  " "    " "     "*"       "*"     " "     " "    " "       
## 5  ( 1 ) "*"  " "    " "     "*"       "*"     " "     " "    " "       
## 6  ( 1 ) "*"  " "    " "     "*"       "*"     " "     " "    " "       
## 7  ( 1 ) " "  " "    " "     "*"       "*"     " "     " "    " "       
## 8  ( 1 ) " "  "*"    " "     "*"       "*"     " "     " "    " "

An asterisk indicates that a variable is included in the best model of the corresponding size. By default regsubsets() reports the best model of each size up to eight variables; in the table above, Hits appears in every best model from size two onward, while CRBI and PutOuts each appear in six of the eight models.

regfit_full<-regsubsets(Salary~.,data = Hitters,nvmax = 19)
reg_summary=summary(regfit_full)

reg_summary$rsq
##  [1] 0.3214501 0.4252237 0.4514294 0.4754067 0.4908036 0.5087146 0.5141227
##  [8] 0.5285569 0.5346124 0.5404950 0.5426153 0.5436302 0.5444570 0.5452164
## [15] 0.5454692 0.5457656 0.5459518 0.5460945 0.5461159
par(mfrow=c(2,2))

plot(reg_summary$rss,xlab="Number of variables",ylab="RSS",type="l")
plot(reg_summary$adjr2,xlab="Number of variables",ylab="Adjusted Rsq",type="l")

which.max(reg_summary$adjr2)
## [1] 11
plot(regfit_full,scale="adjr2")
points(11,reg_summary$adjr2[11],col="red",cex=2,pch=20)

plot(regfit_full,scale="r2")

Forward and Backward Stepwise selection

regfit_fwd<-regsubsets(Salary~.,data = Hitters,nvmax = 19,method = "forward")
summary(regfit_fwd)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters, nvmax = 19, method = "forward")
## 19 Variables  (and intercept)
##            Forced in Forced out
## AtBat          FALSE      FALSE
## Hits           FALSE      FALSE
## HmRun          FALSE      FALSE
## Runs           FALSE      FALSE
## RBI            FALSE      FALSE
## Walks          FALSE      FALSE
## Years          FALSE      FALSE
## CAtBat         FALSE      FALSE
## CHits          FALSE      FALSE
## CHmRun         FALSE      FALSE
## CRuns          FALSE      FALSE
## CRBI           FALSE      FALSE
## CWalks         FALSE      FALSE
## LeagueN        FALSE      FALSE
## DivisionW      FALSE      FALSE
## PutOuts        FALSE      FALSE
## Assists        FALSE      FALSE
## Errors         FALSE      FALSE
## NewLeagueN     FALSE      FALSE
## 1 subsets of each size up to 19
## Selection Algorithm: forward
##           AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns
## 1  ( 1 )  " "   " "  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 2  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 3  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 4  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 5  ( 1 )  "*"   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 6  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    " "  
## 7  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    " "  
## 8  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    "*"  
## 9  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"  
## 10  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"  
## 11  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"  
## 12  ( 1 ) "*"   "*"  " "   "*"  " " "*"   " "   "*"    " "   " "    "*"  
## 13  ( 1 ) "*"   "*"  " "   "*"  " " "*"   " "   "*"    " "   " "    "*"  
## 14  ( 1 ) "*"   "*"  "*"   "*"  " " "*"   " "   "*"    " "   " "    "*"  
## 15  ( 1 ) "*"   "*"  "*"   "*"  " " "*"   " "   "*"    "*"   " "    "*"  
## 16  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   " "   "*"    "*"   " "    "*"  
## 17  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   " "   "*"    "*"   " "    "*"  
## 18  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   "*"   "*"    "*"   " "    "*"  
## 19  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   "*"   "*"    "*"   "*"    "*"  
##           CRBI CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1  ( 1 )  "*"  " "    " "     " "       " "     " "     " "    " "       
## 2  ( 1 )  "*"  " "    " "     " "       " "     " "     " "    " "       
## 3  ( 1 )  "*"  " "    " "     " "       "*"     " "     " "    " "       
## 4  ( 1 )  "*"  " "    " "     "*"       "*"     " "     " "    " "       
## 5  ( 1 )  "*"  " "    " "     "*"       "*"     " "     " "    " "       
## 6  ( 1 )  "*"  " "    " "     "*"       "*"     " "     " "    " "       
## 7  ( 1 )  "*"  "*"    " "     "*"       "*"     " "     " "    " "       
## 8  ( 1 )  "*"  "*"    " "     "*"       "*"     " "     " "    " "       
## 9  ( 1 )  "*"  "*"    " "     "*"       "*"     " "     " "    " "       
## 10  ( 1 ) "*"  "*"    " "     "*"       "*"     "*"     " "    " "       
## 11  ( 1 ) "*"  "*"    "*"     "*"       "*"     "*"     " "    " "       
## 12  ( 1 ) "*"  "*"    "*"     "*"       "*"     "*"     " "    " "       
## 13  ( 1 ) "*"  "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 14  ( 1 ) "*"  "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 15  ( 1 ) "*"  "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 16  ( 1 ) "*"  "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 17  ( 1 ) "*"  "*"    "*"     "*"       "*"     "*"     "*"    "*"       
## 18  ( 1 ) "*"  "*"    "*"     "*"       "*"     "*"     "*"    "*"       
## 19  ( 1 ) "*"  "*"    "*"     "*"       "*"     "*"     "*"    "*"
set.seed(1)
train=sample(c(TRUE,FALSE),nrow(Hitters),rep=TRUE)
test=(!train)

regfit_best=regsubsets(Salary~.,data=Hitters[train,],nvmax = 19)
test_mat=model.matrix(Salary~.,data = Hitters[test,])

val_errors=rep(NA,19)
for(i in 1:19){
  coefi=coef(regfit_best,id=i)
  pred_test=test_mat[,names(coefi)] %*% coefi
  val_errors[i]=mean((Hitters$Salary[test]-pred_test)^2)
  
  
}

val_errors
##  [1] 220968.0 169157.1 178518.2 163426.1 168418.1 171270.6 162377.1
##  [8] 157909.3 154055.7 148162.1 151156.4 151742.5 152214.5 157358.7
## [15] 158541.4 158743.3 159972.7 159859.8 160105.6

Since there is no predict() method for regsubsets objects, we write our own:

predict_regsubsets<-function(object,newdata,id,...){
  # rebuild the design matrix from the formula stored in the regsubsets call
  form=as.formula(object$call[[2]])
  mat=model.matrix(form,newdata)
  # keep only the columns used by the model of size `id` and multiply by its coefficients
  coefi=coef(object,id=id)
  xvars=names(coefi)
  mat[,xvars]%*%coefi
}
regfit_best=regsubsets(Salary~.,data=Hitters,nvmax=19)
coef(regfit_best,10)
##  (Intercept)        AtBat         Hits        Walks       CAtBat 
##  162.5354420   -2.1686501    6.9180175    5.7732246   -0.1300798 
##        CRuns         CRBI       CWalks    DivisionW      PutOuts 
##    1.4082490    0.7743122   -0.8308264 -112.3800575    0.2973726 
##      Assists 
##    0.2831680
k=10
set.seed(1)
folds=sample(1:k,nrow(Hitters),replace = TRUE)
cv_errors=matrix(NA,k,19,dimnames=list(NULL,paste(1:19)))


for(j in 1:k){
  best_fit=regsubsets(Salary~.,data = Hitters[folds!=j,],nvmax=19)
  
  for(i in 1:19){
    pred=predict_regsubsets(best_fit,Hitters[folds==j,],id=i)
    cv_errors[j,i]=mean((Hitters$Salary[folds==j]-pred)^2)
  }
}
mean_cv_errors=apply(cv_errors, 2, mean)
plot(mean_cv_errors,type="b")
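
A sketch of the usual final step (best_size and reg_best are names introduced here): refit best subset selection on the full data set and inspect the coefficients of the model size chosen by cross-validation.

best_size=which.min(mean_cv_errors)
reg_best=regsubsets(Salary~.,data=Hitters,nvmax=19)
coef(reg_best,best_size)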

Ridge Regression and Lasso

x=model.matrix(Salary~.,Hitters)[,-1]
y=Hitters$Salary

model.matrix() converts qualitative (factor) variables into numeric dummy variables; this matters because glmnet() only accepts a numeric matrix of predictors.
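
A quick illustration (a sketch, not part of the original lab): the two-level factors League and Division each become a single 0/1 dummy column in the design matrix.

head(model.matrix(~League+Division,data=Hitters))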

Ridge Regression

grid=10^seq(10,-2,length=100)
ridge_mod=glmnet(x,y,alpha=0,lambda = grid)

ridge_mod$lambda[50]
## [1] 11497.57
coef(ridge_mod)[,50]
##   (Intercept)         AtBat          Hits         HmRun          Runs 
## 407.356050200   0.036957182   0.138180344   0.524629976   0.230701523 
##           RBI         Walks         Years        CAtBat         CHits 
##   0.239841459   0.289618741   1.107702929   0.003131815   0.011653637 
##        CHmRun         CRuns          CRBI        CWalks       LeagueN 
##   0.087545670   0.023379882   0.024138320   0.025015421   0.085028114 
##     DivisionW       PutOuts       Assists        Errors    NewLeagueN 
##  -6.215440973   0.016482577   0.002612988  -0.020502690   0.301433531

Calculating the l2 norm of the coefficients (excluding the intercept):

sqrt(sum(coef(ridge_mod)[-1,50]^2))
## [1] 6.360612

The larger the value of lambda, the more the coefficients are shrunk towards zero and the smaller their l2 norm. Let us calculate it for a smaller value of lambda (a larger index in the grid):

sqrt(sum(coef(ridge_mod)[-1,60]^2))
## [1] 57.11001
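
A quick sketch (not in the original output; cf and l2_norm are names introduced here): computing the l2 norm at every lambda in the grid shows the coefficients shrinking as lambda grows.

cf=coef(ridge_mod)  # 20 x 100 matrix: one column of coefficients per lambda
l2_norm=sapply(seq_along(ridge_mod$lambda),function(j) sqrt(sum(cf[-1,j]^2)))
plot(log10(ridge_mod$lambda),l2_norm,type="l",xlab="log10(lambda)",ylab="l2 norm of coefficients")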

We can also obtain the coefficients for a new value of lambda with the predict() function by setting type = "coefficients":

predict(ridge_mod,type = "coefficients",s=50)[1:20,]
##   (Intercept)         AtBat          Hits         HmRun          Runs 
##  4.876610e+01 -3.580999e-01  1.969359e+00 -1.278248e+00  1.145892e+00 
##           RBI         Walks         Years        CAtBat         CHits 
##  8.038292e-01  2.716186e+00 -6.218319e+00  5.447837e-03  1.064895e-01 
##        CHmRun         CRuns          CRBI        CWalks       LeagueN 
##  6.244860e-01  2.214985e-01  2.186914e-01 -1.500245e-01  4.592589e+01 
##     DivisionW       PutOuts       Assists        Errors    NewLeagueN 
## -1.182011e+02  2.502322e-01  1.215665e-01 -3.278600e+00 -9.496680e+00
set.seed(1)
train=sample(1:nrow(x),nrow(x)/2)
test=-train
y_test=y[test]

Fitting a ridge regression model on the training set and computing the test MSE at lambda = 4; for comparison, the second value is the MSE of simply predicting the training-set mean.

ridge_mod=glmnet(x[train,],y[train],alpha=0,lambda = grid,thresh = 1e-12)
ridge_pred=predict(ridge_mod,s=4,newx = x[test,])
mean((ridge_pred-y_test)^2)
## [1] 101036.8
mean((mean(y[train])-y_test)^2)
## [1] 193253.1
#ridge_pred=predict(ridge_mod,s=0,newx = x[test,],exact = T)

#mean((ridge_pred-y_test)^2)

#lm(y~x,subset = train)

# Note: in recent versions of glmnet, predict() with exact = TRUE also requires the
# original data to be resupplied (e.g. x = x[train,], y = y[train]), which is why
# the calls below do not run as written.

# predict(object = ridge_mod,s=0,exact = T,type="coefficients")

# predict.glmnet(ridge_mod,s=0,exact = T,type="coefficients")[1:20,]

Using cross-validation to find the best value of lambda

set.seed(1)
cv_out=cv.glmnet(x[train,],y[train],alpha=0)

plot(cv_out)

bestlam=cv_out$lambda.min
bestlam
## [1] 211.7416

The best lambda value is approximately 212.

ridge_pred=predict(ridge_mod,s=bestlam,newx=x[test,])
mean((ridge_pred-y_test)^2)
## [1] 96015.51

Lasso Model

lasso_mod=glmnet(x[train,],y[train],alpha=1,lambda=grid)
plot(lasso_mod)

set.seed(1)
cv_out=cv.glmnet(x[train,],y[train],alpha=1)
plot(cv_out)

bestlam=cv_out$lambda.min
lasso_pred=predict(lasso_mod,s=bestlam,newx=x[test,])
mean((lasso_pred-y_test)^2)
## [1] 100743.4

The test MSE is comparable to that of ridge regression, but whereas ridge only shrinks the coefficients towards zero, the lasso sets some of them exactly to zero.

out=glmnet(x,y,alpha=1,lambda = grid)
lasso_coef=predict(out,type="coefficients",s=bestlam)[1:20,]

lasso_coef
##  (Intercept)        AtBat         Hits        HmRun         Runs 
##   18.5394844    0.0000000    1.8735390    0.0000000    0.0000000 
##          RBI        Walks        Years       CAtBat        CHits 
##    0.0000000    2.2178444    0.0000000    0.0000000    0.0000000 
##       CHmRun        CRuns         CRBI       CWalks      LeagueN 
##    0.0000000    0.2071252    0.4130132    0.0000000    3.2666677 
##    DivisionW      PutOuts      Assists       Errors   NewLeagueN 
## -103.4845458    0.2204284    0.0000000    0.0000000    0.0000000

As we can see, most of the coefficients are set exactly to zero, which makes the lasso model easier to interpret.
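
A quick check (a sketch, not part of the original output): counting how many of the 19 predictors the lasso zeroes out at the chosen lambda.

sum(lasso_coef[-1]==0)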

PCR and PLS

set.seed(2)
pcr_fit<-pcr(Salary~.,data=Hitters,scale=TRUE,validation="CV")
summary(pcr_fit)
## Data:    X dimension: 263 19 
##  Y dimension: 263 1
## Fit method: svdpc
## Number of components considered: 19
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV             452    348.9    352.2    353.5    352.8    350.1    349.1
## adjCV          452    348.7    351.8    352.9    352.1    349.3    348.0
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV       349.6    350.9    352.9     353.8     355.0     356.2     363.5
## adjCV    348.5    349.8    351.6     352.3     353.4     354.5     361.6
##        14 comps  15 comps  16 comps  17 comps  18 comps  19 comps
## CV        355.2     357.4     347.6     350.1     349.2     352.6
## adjCV     352.8     355.2     345.5     347.6     346.7     349.8
## 
## TRAINING: % variance explained
##         1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
## X         38.31    60.16    70.84    79.03    84.29    88.63    92.26
## Salary    40.63    41.58    42.17    43.22    44.90    46.48    46.69
##         8 comps  9 comps  10 comps  11 comps  12 comps  13 comps  14 comps
## X         94.96    96.28     97.26     97.98     98.65     99.15     99.47
## Salary    46.75    46.86     47.76     47.82     47.85     48.10     50.40
##         15 comps  16 comps  17 comps  18 comps  19 comps
## X          99.75     99.89     99.97     99.99    100.00
## Salary     50.55     53.01     53.85     54.61     54.61
validationplot(pcr_fit,val.type = "MSEP")

set.seed(1)

pcr_fit<-pcr(Salary~.,data=Hitters,subset=train,scale=TRUE,validation="CV")
validationplot(pcr_fit,val.type = "MSEP")
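
A sketch of the follow-up step (pcr_pred is a name introduced here; seven components is one reasonable choice given the validation plot): compute the test MSE of the training-set PCR fit.

pcr_pred=predict(pcr_fit,x[test,],ncomp=7)
mean((pcr_pred-y_test)^2)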

pcr_fit=pcr(y~x,scale=TRUE,ncomp=7)
summary(pcr_fit)
## Data:    X dimension: 263 19 
##  Y dimension: 263 1
## Fit method: svdpc
## Number of components considered: 7
## TRAINING: % variance explained
##    1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
## X    38.31    60.16    70.84    79.03    84.29    88.63    92.26
## y    40.63    41.58    42.17    43.22    44.90    46.48    46.69

Partial Least Squares

set.seed(1)
pls_fit=plsr(Salary~.,data=Hitters,subset=train,scale=TRUE,validation="CV")
summary(pls_fit)
## Data:    X dimension: 131 19 
##  Y dimension: 131 1
## Fit method: kernelpls
## Number of components considered: 19
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV           464.6    394.2    391.5    393.1    395.0    415.0    424.0
## adjCV        464.6    393.4    390.2    391.1    392.9    411.5    418.8
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV       424.5    415.8    404.6     407.1     412.0     414.4     410.3
## adjCV    418.9    411.4    400.7     402.2     407.2     409.3     405.6
##        14 comps  15 comps  16 comps  17 comps  18 comps  19 comps
## CV        406.2     408.6     410.5     408.8     407.8     410.2
## adjCV     401.8     403.9     405.6     404.1     403.2     405.5
## 
## TRAINING: % variance explained
##         1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
## X         38.12    53.46    66.05    74.49    79.33    84.56    87.09
## Salary    33.58    38.96    41.57    42.43    44.04    45.59    47.05
##         8 comps  9 comps  10 comps  11 comps  12 comps  13 comps  14 comps
## X         90.74    92.55     93.94     97.23     97.88     98.35     98.85
## Salary    47.53    48.42     49.68     50.04     50.54     50.78     50.92
##         15 comps  16 comps  17 comps  18 comps  19 comps
## X          99.11     99.43     99.78     99.99    100.00
## Salary     51.04     51.11     51.15     51.16     51.18
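
A corresponding sketch for PLS (pls_pred is a name introduced here; the cross-validated error above is lowest at around two components): compute the test MSE of the training-set PLS fit.

pls_pred=predict(pls_fit,x[test,],ncomp=2)
mean((pls_pred-y_test)^2)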

Non-Linear Modelling

attach(Wage)
## The following object is masked from Auto:
## 
##     year
## The following object is masked from Boston:
## 
##     age
fit=lm(wage~poly(age,4),data=Wage)
coef(summary(fit))
##                 Estimate Std. Error    t value     Pr(>|t|)
## (Intercept)    111.70361  0.7287409 153.283015 0.000000e+00
## poly(age, 4)1  447.06785 39.9147851  11.200558 1.484604e-28
## poly(age, 4)2 -478.31581 39.9147851 -11.983424 2.355831e-32
## poly(age, 4)3  125.52169 39.9147851   3.144742 1.678622e-03
## poly(age, 4)4  -77.91118 39.9147851  -1.951938 5.103865e-02
fit=lm(wage~poly(age,4,raw=T),data=Wage)
coef(summary(fit))
##                             Estimate   Std. Error   t value     Pr(>|t|)
## (Intercept)            -1.841542e+02 6.004038e+01 -3.067172 0.0021802539
## poly(age, 4, raw = T)1  2.124552e+01 5.886748e+00  3.609042 0.0003123618
## poly(age, 4, raw = T)2 -5.638593e-01 2.061083e-01 -2.735743 0.0062606446
## poly(age, 4, raw = T)3  6.810688e-03 3.065931e-03  2.221409 0.0263977518
## poly(age, 4, raw = T)4 -3.203830e-05 1.641359e-05 -1.951938 0.0510386498

Important: wrap raw polynomial terms in I() inside a formula, because ^ has a special meaning in formulas. For example, age^2 should be written as I(age^2).
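
A minimal illustration of this point (fit_a and fit_b are throwaway names): wrapping age^2 in I() and using a raw polynomial basis produce the same fitted model.

fit_a=lm(wage~age+I(age^2),data=Wage)
fit_b=lm(wage~poly(age,2,raw=TRUE),data=Wage)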

agelims=range(age)
age_grid=seq(from=agelims[1],to=agelims[2])
preds=predict(fit,newdata = list(age=age_grid),se=TRUE)
se_bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
par(mfrow=c(1,2),mar=c(4.5,4.5,1,1),oma=c(0,0,4,0))
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Degree - 4 Polynomial", outer = T)
lines(age_grid,preds$fit,lwd=2,col="blue")
matlines(age_grid,se_bands,lwd=1,col="blue",lty=3)

How to determine what degree of polynomial to use?

The anova() function works well here: it compares the nested polynomial fits with an F-test.

The following code is slightly different from the book: I wrote a small function rather than typing five nearly identical lines of code.

polynomial_R<-function(degree_i){
  # fit a polynomial of the given degree and assign it to fit_<degree> in the caller
  assign(paste0("fit_",degree_i),lm(wage~poly(age,degree_i)),envir = parent.frame())
}

Either I can call the function 5 times or I can run a for loop

for (i in 1:5){
polynomial_R(i)
    }
anova(fit_1,fit_2,fit_3,fit_4,fit_5)
## Analysis of Variance Table
## 
## Model 1: wage ~ poly(age, degree_i)
## Model 2: wage ~ poly(age, degree_i)
## Model 3: wage ~ poly(age, degree_i)
## Model 4: wage ~ poly(age, degree_i)
## Model 5: wage ~ poly(age, degree_i)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8888  0.001679 ** 
## 4   2995 4771604  1      6070   3.8098  0.051046 .  
## 5   2994 4770322  1      1283   0.8050  0.369682    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fit<-glm(I(wage>250)~poly(age,4),data=Wage,family = binomial)
preds=predict(fit,newdata = list(age=age_grid),se=T)
pfit=exp(preds$fit)/(1+exp(preds$fit))
se_bands_logit=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
se_bands=exp(se_bands_logit)/(1+exp(se_bands_logit))


preds=predict(fit,newdata=list(age=age_grid),type = "response",se=T)
plot(age,I(wage>250),xlim=agelims,type = "n",ylim=c(0,.2))
points(jitter(age),I((wage>250)/5),cex=0.5,pch="|",col="darkgrey")
lines(age_grid,pfit,lwd=2,col="blue")
matlines(age_grid,se_bands,lwd=1,col="blue",lty=3)

table(cut(age,4))
## 
## (17.9,33.5]   (33.5,49]   (49,64.5] (64.5,80.1] 
##         750        1399         779          72
fit=lm(wage~cut(age,4),data=Wage)
coef(summary(fit))
##                         Estimate Std. Error   t value     Pr(>|t|)
## (Intercept)            94.158392   1.476069 63.789970 0.000000e+00
## cut(age, 4)(33.5,49]   24.053491   1.829431 13.148074 1.982315e-38
## cut(age, 4)(49,64.5]   23.664559   2.067958 11.443444 1.040750e-29
## cut(age, 4)(64.5,80.1]  7.640592   4.987424  1.531972 1.256350e-01

Splines

fit=lm(wage~bs(age,knots = c(25,40,60)),data=Wage)
pred=predict(fit,newdata = list(age=age_grid),se=T)
{plot(age,wage,col="gray")
lines(age_grid,pred$fit,lwd=2)
lines(age_grid ,pred$fit +2* pred$se ,lty ="dashed")
lines(age_grid ,pred$fit -2* pred$se ,lty ="dashed")
}

Fitting a natural spline

fit2=lm(wage~ns(age,df=4),data=Wage)

summary(fit2)
## 
## Call:
## lm(formula = wage ~ ns(age, df = 4), data = Wage)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -98.737 -24.477  -5.083  15.371 204.874 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        58.556      4.235  13.827   <2e-16 ***
## ns(age, df = 4)1   60.462      4.190  14.430   <2e-16 ***
## ns(age, df = 4)2   41.963      4.372   9.597   <2e-16 ***
## ns(age, df = 4)3   97.020     10.386   9.341   <2e-16 ***
## ns(age, df = 4)4    9.773      8.657   1.129    0.259    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39.92 on 2995 degrees of freedom
## Multiple R-squared:  0.08598,    Adjusted R-squared:  0.08476 
## F-statistic: 70.43 on 4 and 2995 DF,  p-value: < 2.2e-16
#**********************WARNING*******************#

# newdata must be a *named* list: with list(age_grid), predict() cannot find 'age'
# in newdata and instead picks up the full age vector (length 3000) from the attached
# Wage data, which is why a vector of length 3000 was generated here. The corrected
# call is:

#pred=predict(fit2,newdata=list(age=age_grid),se=T)
#lines(age_grid,pred$fit,col="red",lwd=2)

plot(age,wage,xlim=agelims,cex=0.5,col="darkgrey")
title("Smoothing spline")
fit=smooth.spline(age,wage,df=16)
fit2=smooth.spline(age,wage,cv=TRUE)
## Warning in smooth.spline(age, wage, cv = TRUE): cross-validation with non-
## unique 'x' values seems doubtful
fit2$df
## [1] 6.794596
lines(fit,col="red",lwd=2)
lines(fit2,col="blue",lwd=2)
legend("topright",legend=c("16 DF","6.8 DF"),col=c("red","blue"),lty=1,lwd=2,cex=0.8)

Loess or local regression. I would like to research the difference between local and segmented regression

plot(age,wage,xlim=agelims,cex=0.5,col="darkgrey")
title("Local Regression")
fit=loess(wage~age,span = 0.2,data=Wage)
fit2=loess(wage~age,span = 0.5,data=Wage)
lines(age_grid,predict(fit,data.frame(age=age_grid)),col="red",lwd=2)
lines(age_grid,predict(fit2,data.frame(age=age_grid)),col="blue",lwd=2)
legend("topright",legend=c("Span=0.2","Span=0.5"),col=c("red","blue"),lty=1,lwd=2,cex=0.8)

GAM

The gam package needs to be installed and loaded with library(gam); this is done at the top of this document.

gam1=lm(wage~ns(year,4)+ns(age,5)+education,data = Wage)
gam_m3=gam(wage~s(year,4)+s(age,5)+education,data = Wage)
par(mfrow=c(1,3))
plot(gam_m3,se=TRUE,col="blue")

plot.Gam(gam1,se=TRUE,col="red")

gam_m1=gam(wage~s(age,5)+education, data=Wage)
gam_m2=gam(wage~year+s(age,5)+education, data=Wage)
anova(gam_m1,gam_m2,gam_m3,test = "F")
## Analysis of Deviance Table
## 
## Model 1: wage ~ s(age, 5) + education
## Model 2: wage ~ year + s(age, 5) + education
## Model 3: wage ~ s(year, 4) + s(age, 5) + education
##   Resid. Df Resid. Dev Df Deviance       F    Pr(>F)    
## 1      2990    3711731                                  
## 2      2989    3693842  1  17889.2 14.4771 0.0001447 ***
## 3      2986    3689770  3   4071.1  1.0982 0.3485661    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(gam_m3)
## 
## Call: gam(formula = wage ~ s(year, 4) + s(age, 5) + education, data = Wage)
## Deviance Residuals:
##     Min      1Q  Median      3Q     Max 
## -119.43  -19.70   -3.33   14.17  213.48 
## 
## (Dispersion Parameter for gaussian family taken to be 1235.69)
## 
##     Null Deviance: 5222086 on 2999 degrees of freedom
## Residual Deviance: 3689770 on 2986 degrees of freedom
## AIC: 29887.75 
## 
## Number of Local Scoring Iterations: 2 
## 
## Anova for Parametric Effects
##              Df  Sum Sq Mean Sq F value    Pr(>F)    
## s(year, 4)    1   27162   27162  21.981 2.877e-06 ***
## s(age, 5)     1  195338  195338 158.081 < 2.2e-16 ***
## education     4 1069726  267432 216.423 < 2.2e-16 ***
## Residuals  2986 3689770    1236                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##             Npar Df Npar F  Pr(F)    
## (Intercept)                          
## s(year, 4)        3  1.086 0.3537    
## s(age, 5)         4 32.380 <2e-16 ***
## education                            
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
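
A sketch of the usual next step (preds_gam is a name introduced here; gam_m2 was preferred by the anova comparison above): predictions from a GAM go through the ordinary predict() interface.

preds_gam=predict(gam_m2,newdata=Wage)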

Decision Trees

attach(Carseats)
High=as.factor(ifelse(Sales<=8,"No","Yes")) # make High a factor so tree() treats this as classification
Carseats=data.frame(Carseats,High)

Fit a classification tree

tree_carseats=tree(High~.-Sales,Carseats)
summary(tree_carseats)
## 
## Classification tree:
## tree(formula = High ~ . - Sales, data = Carseats)
## Variables actually used in tree construction:
## [1] "ShelveLoc"   "Price"       "Income"      "CompPrice"   "Population" 
## [6] "Advertising" "Age"         "US"         
## Number of terminal nodes:  27 
## Residual mean deviance:  0.4575 = 170.7 / 373 
## Misclassification error rate: 0.09 = 36 / 400
plot(tree_carseats)
text(tree_carseats,pretty=0)

Estimating the test error by splitting Carseats into training and test sets

set.seed(2)
train=sample(1:nrow(Carseats),200)
Carseats_test=Carseats[-train,]
High_test=High[-train]
tree_carseats=tree(High~.-Sales,Carseats,subset = train)
tree_pred=predict(tree_carseats,Carseats_test,type = "class")
table(tree_pred,High_test)
##          High_test
## tree_pred No Yes
##       No  86  27
##       Yes 30  57

Prune the tree

set.seed(3)
cv_carseats=cv.tree(tree_carseats,FUN=prune.misclass)
names(cv_carseats)
## [1] "size"   "dev"    "k"      "method"
par(mfrow=c(1,2))
plot(cv_carseats$size,cv_carseats$dev,type="b")
plot(cv_carseats$k,cv_carseats$dev,type="b")

prune_carseats=prune.misclass(tree_carseats,best=9)
plot(prune_carseats)
text(prune_carseats,pretty = 0)

After finding the tree size with the lowest cross-validated error, we use the predict function to assess the accuracy of the pruned tree.

tree_pred=predict(prune_carseats,Carseats_test,type="class")
table(tree_pred,High_test)
##          High_test
## tree_pred No Yes
##       No  94  24
##       Yes 22  60
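
A small addition (not in the original output): the overall accuracy of the pruned tree, which from the table above works out to (94 + 60) / 200.

mean(tree_pred==High_test)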

Fitting Regression Trees

set.seed(1)
train=sample(1:nrow(Boston),nrow(Boston)/2)
tree_boston=tree(medv~.,Boston,subset = train)
summary(tree_boston)
## 
## Regression tree:
## tree(formula = medv ~ ., data = Boston, subset = train)
## Variables actually used in tree construction:
## [1] "lstat" "rm"    "dis"  
## Number of terminal nodes:  8 
## Residual mean deviance:  12.65 = 3099 / 245 
## Distribution of residuals:
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -14.10000  -2.04200  -0.05357   0.00000   1.96000  12.60000

Note: in the context of a regression tree, the deviance is simply the sum of squared errors for the tree.

plot(tree_boston)
text(tree_boston,pretty = 0)

Cross-Validation

cv_boston<-cv.tree(tree_boston)
plot(cv_boston$size,cv_boston$dev,type='b')

Prune tree

prune_boston<-prune.tree(tree_boston,best=5)
plot(prune_boston)
text(prune_boston,pretty=0)

Predictions on the test set

yhat=predict(tree_boston,newdata = Boston[-train,])
boston_test=Boston[-train,"medv"]
plot(yhat,boston_test)
abline(0,1)

mean((yhat-boston_test)^2)
## [1] 25.04559

Bagging and Random Forests

set.seed(1)
bag_boston=randomForest(medv~.,data = Boston,subset = train,mtry=13,importance=TRUE)

Bagging Tree prediction on test set

yhat_bag=predict(bag_boston,newdata=Boston[-train,])
plot(yhat_bag,boston_test)
abline(0,1)

mean((yhat_bag-boston_test)^2)
## [1] 13.50808

The book notes that randomForest() uses p/3 variables at each split for regression trees and sqrt(p) for classification trees by default; here we grow a random forest with mtry = 6.

set.seed(1)
rf_boston=randomForest(medv~.,data = Boston,subset = train,mtry=6,importance=TRUE)
importance(rf_boston)
##           %IncMSE IncNodePurity
## crim    12.132320     986.50338
## zn       1.955579      57.96945
## indus    9.069302     882.78261
## chas     2.210835      45.22941
## nox     11.104823    1044.33776
## rm      31.784033    6359.31971
## age     10.962684     516.82969
## dis     15.015236    1224.11605
## rad      4.118011      95.94586
## tax      8.587932     502.96719
## ptratio 12.503896     830.77523
## black    6.702609     341.30361
## lstat   30.695224    7505.73936
varImpPlot(rf_boston)

Boosting

set.seed(1)
boost_boston=gbm(medv~.,data=Boston[train,],distribution = "gaussian",n.trees = 5000,interaction.depth = 4)
summary(boost_boston)

##             var    rel.inf
## lstat     lstat 37.0661275
## rm           rm 25.3533123
## dis         dis 11.7903016
## crim       crim  8.0388750
## black     black  4.2531659
## nox         nox  3.5058570
## age         age  3.4868724
## ptratio ptratio  2.2500385
## indus     indus  1.7725070
## tax         tax  1.1836592
## chas       chas  0.7441319
## rad         rad  0.4274311
## zn           zn  0.1277206
par(mfrow=c(1,2))
plot(boost_boston,i="rm")

plot(boost_boston,i="lstat")

boost_boston=gbm(medv~.,data=Boston[train,],n.trees = 5000,interaction.depth = 4,shrinkage = 0.2,verbose = F)
## Distribution not specified, assuming gaussian ...
yhat_boost=predict(boost_boston,newdata=Boston[-train,],n.trees = 5000)
mean((yhat_boost-boston_test)^2)
## [1] 11.51109

Support Vector Machines

set.seed(1)
x=matrix(rnorm(20*2),ncol=2)
y=c(rep(-1,10),rep(1,10))
x[y==1,]=x[y==1,]+1
plot(x,col=(3-y))

dat<-data.frame(x=x,y=as.factor(y))
svmfit=svm(y~.,data=dat,kernel="linear",cost=10,scale=FALSE)
plot(svmfit,dat)

The plot method for svm objects draws the (linear) decision boundary, shades the two predicted regions, and marks the support vectors as crosses and the remaining observations as circles.
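
A small sketch (not in the original output): the row indices of the support vectors can be listed directly via the index component of the fitted object.

svmfit$index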

summary(svmfit)
## 
## Call:
## svm(formula = y ~ ., data = dat, kernel = "linear", cost = 10, 
##     scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  10 
##       gamma:  0.5 
## 
## Number of Support Vectors:  7
## 
##  ( 4 3 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  -1 1
svmfit<-svm(y~.,data=dat,kernel="linear",cost=0.01,scale=FALSE)
plot(svmfit,dat)

Cross-validation: the tune() function performs ten-fold cross-validation over a supplied range of parameter values.

set.seed(1)
tune_out<-tune(svm,y~.,data=dat,kernel="linear",ranges=list(cost=c(0.001,0.01,0.1,1,5,10,100)))
summary(tune_out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##   0.1
## 
## - best performance: 0.1 
## 
## - Detailed performance results:
##    cost error dispersion
## 1 1e-03  0.70  0.4216370
## 2 1e-02  0.70  0.4216370
## 3 1e-01  0.10  0.2108185
## 4 1e+00  0.15  0.2415229
## 5 5e+00  0.15  0.2415229
## 6 1e+01  0.15  0.2415229
## 7 1e+02  0.15  0.2415229
bestmod=tune_out$best.model

Generating the test data

xtest=matrix(rnorm(20*2),ncol=2)
ytest=sample(c(-1,1),20,rep=TRUE)
xtest[ytest==1,]=xtest[ytest==1,]+1
testdat=data.frame(x=xtest,y=as.factor(ytest))

Predictions on test data

ypred=predict(bestmod,testdat)
table(predict=ypred,truth=testdat$y)
##        truth
## predict -1  1
##      -1 11  1
##      1   0  8
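
For comparison, the lab repeats the prediction with the much smaller cost of 0.01 (object names here are ours; the confusion matrix is not reproduced):

svmfit_small=svm(y~.,data=dat,kernel="linear",cost=0.01,scale=FALSE)
ypred_small=predict(svmfit_small,testdat)
table(predict=ypred_small,truth=testdat$y)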

We now move the two classes further apart so that they are nearly linearly separable.

x[y==1,]=x[y==1,]+0.5
plot(x,col=(y+5)/2,pch=19)

dat=data.frame(x=x,y=as.factor(y))
svmfit=svm(y~.,data=dat,kernel="linear",cost=1e5)
summary(svmfit)
## 
## Call:
## svm(formula = y ~ ., data = dat, kernel = "linear", cost = 1e+05)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1e+05 
##       gamma:  0.5 
## 
## Number of Support Vectors:  3
## 
##  ( 1 2 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  -1 1
plot(svmfit,dat)

With a smaller cost we allow more margin violations (misclassified or borderline observations), which gives a wider margin and more support vectors; the very large cost above produced a narrow margin with only three support vectors.

svmfit=svm(y~.,data=dat,kernel="linear",cost=1)
summary(svmfit)
## 
## Call:
## svm(formula = y ~ ., data = dat, kernel = "linear", cost = 1)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
##       gamma:  0.5 
## 
## Number of Support Vectors:  7
## 
##  ( 4 3 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  -1 1
plot(svmfit,dat)

set.seed(1)
x=matrix(rnorm(200*2),ncol=2)
x[1:100,]=x[1:100]+2
x[101:150,]=x[101:150]-2
y=c(rep(1,150),rep(2,50))
dat=data.frame(x=x,y=as.factor(y))
plot(x,col=y)

train=sample(200,100)
svmfit=svm(y~.,data=dat[train,],kernel="radial",gamma=1,cost=1)
plot(svmfit,dat[train,])

summary(svmfit)
## 
## Call:
## svm(formula = y ~ ., data = dat[train, ], kernel = "radial", 
##     gamma = 1, cost = 1)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  1 
## 
## Number of Support Vectors:  41
## 
##  ( 20 21 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  1 2

Increasing the value of the cost parameter

svmfit<-svm(y~.,data=dat[train,],kernel="radial",gamma=1,cost=1e5)
plot(svmfit,dat[train,])

Tuning over cost and gamma

set.seed(1)

tune_out=tune(svm,y~.,data=dat[train,],kernel="radial",ranges = list(cost=c(0.1,1,10,100,1000),gamma=c(0.5,1,2,3,4)))

summary(tune_out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##  1000     2
## 
## - best performance: 0.05 
## 
## - Detailed performance results:
##     cost gamma error dispersion
## 1  1e-01   0.5  0.27 0.11595018
## 2  1e+00   0.5  0.11 0.05676462
## 3  1e+01   0.5  0.13 0.04830459
## 4  1e+02   0.5  0.12 0.12292726
## 5  1e+03   0.5  0.09 0.07378648
## 6  1e-01   1.0  0.27 0.11595018
## 7  1e+00   1.0  0.13 0.04830459
## 8  1e+01   1.0  0.15 0.10801234
## 9  1e+02   1.0  0.10 0.09428090
## 10 1e+03   1.0  0.09 0.05676462
## 11 1e-01   2.0  0.27 0.11595018
## 12 1e+00   2.0  0.16 0.06992059
## 13 1e+01   2.0  0.16 0.09660918
## 14 1e+02   2.0  0.12 0.07888106
## 15 1e+03   2.0  0.05 0.05270463
## 16 1e-01   3.0  0.27 0.11595018
## 17 1e+00   3.0  0.16 0.06992059
## 18 1e+01   3.0  0.14 0.06992059
## 19 1e+02   3.0  0.12 0.07888106
## 20 1e+03   3.0  0.07 0.06749486
## 21 1e-01   4.0  0.27 0.11595018
## 22 1e+00   4.0  0.16 0.09660918
## 23 1e+01   4.0  0.14 0.06992059
## 24 1e+02   4.0  0.10 0.08164966
## 25 1e+03   4.0  0.10 0.08164966
table(true=dat[-train,"y"],pred=predict(tune_out$best.model,newdata=dat[-train,]))
##     pred
## true  1  2
##    1 55 22
##    2 19  4

ROC Curves

# Helper to plot an ROC curve with the ROCR package: build a prediction
# object from the fitted decision values and the true labels, then plot
# the true positive rate against the false positive rate.
rocplot=function(pred,truth,...){
  predob=prediction(pred,truth)
  perf=performance(predob,"tpr","fpr")
  plot(perf,...)
}
svmfit_opt=svm(y~.,data = dat[train,],kernel="radial",gamma=2,cost=1,decision.values=T)
fitted=attributes(predict(svmfit_opt,dat[train,],decision.values=TRUE))$decision.values

par(mfrow=c(1,2))
rocplot(fitted,dat[train,"y"],main="Training Data")


svmfit_flex=svm(y~.,data=dat[train,],kernel="radial",gamma=50,cost=1,decision.values=T)
fitted=attributes(predict(svmfit_flex,dat[train,],decision.values=TRUE))$decision.values
rocplot(fitted,dat[train,"y"],add=T,col="red")

Predictions on the test set

par(mfrow=c(1,2))

fitted=attributes(predict(svmfit_opt,dat[-train,],decision.values=TRUE))$decision.values
rocplot(fitted,dat[-train,"y"],main="Test Data")


fitted=attributes(predict(svmfit_flex,dat[-train,],decision.values=TRUE))$decision.values
rocplot(fitted,dat[-train,"y"],add=T,col="red")

SVM with multiple classes

When the response has more than two levels, svm() uses the one-versus-one approach.

Generating the data

set.seed(1)
x=rbind(x,matrix(rnorm(50*2),ncol=2))
y=c(y,rep(0,50))

x[y==0,2]=x[y==0,2]+2
dat=data.frame(x=x,y=as.factor(y))
par(mfrow=c(1,1))
plot(x,col=(y+1))

svmfit=svm(y~.,data=dat,kernel="radial",cost=10,gamma=1)
plot(svmfit,dat)

Application to the Gene expression data

names(Khan)
## [1] "xtrain" "xtest"  "ytrain" "ytest"

Principal Component Analysis

states=row.names(USArrests)
states
##  [1] "Alabama"        "Alaska"         "Arizona"        "Arkansas"      
##  [5] "California"     "Colorado"       "Connecticut"    "Delaware"      
##  [9] "Florida"        "Georgia"        "Hawaii"         "Idaho"         
## [13] "Illinois"       "Indiana"        "Iowa"           "Kansas"        
## [17] "Kentucky"       "Louisiana"      "Maine"          "Maryland"      
## [21] "Massachusetts"  "Michigan"       "Minnesota"      "Mississippi"   
## [25] "Missouri"       "Montana"        "Nebraska"       "Nevada"        
## [29] "New Hampshire"  "New Jersey"     "New Mexico"     "New York"      
## [33] "North Carolina" "North Dakota"   "Ohio"           "Oklahoma"      
## [37] "Oregon"         "Pennsylvania"   "Rhode Island"   "South Carolina"
## [41] "South Dakota"   "Tennessee"      "Texas"          "Utah"          
## [45] "Vermont"        "Virginia"       "Washington"     "West Virginia" 
## [49] "Wisconsin"      "Wyoming"
names(USArrests)
## [1] "Murder"   "Assault"  "UrbanPop" "Rape"
apply(USArrests, 2, mean)
##   Murder  Assault UrbanPop     Rape 
##    7.788  170.760   65.540   21.232
pr_out<-prcomp(USArrests,scale. = TRUE)
names(pr_out)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
pr_out$center
##   Murder  Assault UrbanPop     Rape 
##    7.788  170.760   65.540   21.232
pr_out$scale
##    Murder   Assault  UrbanPop      Rape 
##  4.355510 83.337661 14.474763  9.366385
pr_out$rotation
##                 PC1        PC2        PC3         PC4
## Murder   -0.5358995  0.4181809 -0.3412327  0.64922780
## Assault  -0.5831836  0.1879856 -0.2681484 -0.74340748
## UrbanPop -0.2781909 -0.8728062 -0.3780158  0.13387773
## Rape     -0.5434321 -0.1673186  0.8177779  0.08902432
pr_out$x
##                        PC1         PC2         PC3          PC4
## Alabama        -0.97566045  1.12200121 -0.43980366  0.154696581
## Alaska         -1.93053788  1.06242692  2.01950027 -0.434175454
## Arizona        -1.74544285 -0.73845954  0.05423025 -0.826264240
## Arkansas        0.13999894  1.10854226  0.11342217 -0.180973554
## California     -2.49861285 -1.52742672  0.59254100 -0.338559240
## Colorado       -1.49934074 -0.97762966  1.08400162  0.001450164
## Connecticut     1.34499236 -1.07798362 -0.63679250 -0.117278736
## Delaware       -0.04722981 -0.32208890 -0.71141032 -0.873113315
## Florida        -2.98275967  0.03883425 -0.57103206 -0.095317042
## Georgia        -1.62280742  1.26608838 -0.33901818  1.065974459
## Hawaii          0.90348448 -1.55467609  0.05027151  0.893733198
## Idaho           1.62331903  0.20885253  0.25719021 -0.494087852
## Illinois       -1.36505197 -0.67498834 -0.67068647 -0.120794916
## Indiana         0.50038122 -0.15003926  0.22576277  0.420397595
## Iowa            2.23099579 -0.10300828  0.16291036  0.017379470
## Kansas          0.78887206 -0.26744941  0.02529648  0.204421034
## Kentucky        0.74331256  0.94880748 -0.02808429  0.663817237
## Louisiana      -1.54909076  0.86230011 -0.77560598  0.450157791
## Maine           2.37274014  0.37260865 -0.06502225 -0.327138529
## Maryland       -1.74564663  0.42335704 -0.15566968 -0.553450589
## Massachusetts   0.48128007 -1.45967706 -0.60337172 -0.177793902
## Michigan       -2.08725025 -0.15383500  0.38100046  0.101343128
## Minnesota       1.67566951 -0.62590670  0.15153200  0.066640316
## Mississippi    -0.98647919  2.36973712 -0.73336290  0.213342049
## Missouri       -0.68978426 -0.26070794  0.37365033  0.223554811
## Montana         1.17353751  0.53147851  0.24440796  0.122498555
## Nebraska        1.25291625 -0.19200440  0.17380930  0.015733156
## Nevada         -2.84550542 -0.76780502  1.15168793  0.311354436
## New Hampshire   2.35995585 -0.01790055  0.03648498 -0.032804291
## New Jersey     -0.17974128 -1.43493745 -0.75677041  0.240936580
## New Mexico     -1.96012351  0.14141308  0.18184598 -0.336121113
## New York       -1.66566662 -0.81491072 -0.63661186 -0.013348844
## North Carolina -1.11208808  2.20561081 -0.85489245 -0.944789648
## North Dakota    2.96215223  0.59309738  0.29824930 -0.251434626
## Ohio            0.22369436 -0.73477837 -0.03082616  0.469152817
## Oklahoma        0.30864928 -0.28496113 -0.01515592  0.010228476
## Oregon         -0.05852787 -0.53596999  0.93038718 -0.235390872
## Pennsylvania    0.87948680 -0.56536050 -0.39660218  0.355452378
## Rhode Island    0.85509072 -1.47698328 -1.35617705 -0.607402746
## South Carolina -1.30744986  1.91397297 -0.29751723 -0.130145378
## South Dakota    1.96779669  0.81506822  0.38538073 -0.108470512
## Tennessee      -0.98969377  0.85160534  0.18619262  0.646302674
## Texas          -1.34151838 -0.40833518 -0.48712332  0.636731051
## Utah            0.54503180 -1.45671524  0.29077592 -0.081486749
## Vermont         2.77325613  1.38819435  0.83280797 -0.143433697
## Virginia        0.09536670  0.19772785  0.01159482  0.209246429
## Washington      0.21472339 -0.96037394  0.61859067 -0.218628161
## West Virginia   2.08739306  1.41052627  0.10372163  0.130583080
## Wisconsin       2.05881199 -0.60512507 -0.13746933  0.182253407
## Wyoming         0.62310061  0.31778662 -0.23824049 -0.164976866

The x matrix contains the principal component scores, i.e. each observation's coordinates along the principal components.

biplot(pr_out,scale=0)
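
The signs of the principal components are arbitrary; the book flips them so that the biplot matches its figure. A sketch (note that it modifies pr_out in place):

pr_out$rotation=-pr_out$rotation
pr_out$x=-pr_out$x
biplot(pr_out,scale=0)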

We now calculate the proportion of variance explained by each principal component.

pr_out$sdev
## [1] 1.5748783 0.9948694 0.5971291 0.4164494
pr_var=pr_out$sdev^2
pr_var
## [1] 2.4802416 0.9897652 0.3565632 0.1734301
pve=pr_var/sum(pr_var)
plot(pve,xlab="Principal Component",ylab="Proportion of Variance Explained",ylim=c(0,1),type="b")
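
A companion plot of the cumulative proportion of variance explained can be drawn in the same way:

plot(cumsum(pve),xlab="Principal Component",ylab="Cumulative Proportion of Variance Explained",ylim=c(0,1),type="b")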

K-Means clustering

Generating the data

set.seed(2)
x=matrix(rnorm(50*2),ncol=2)
x[1:25,1]=x[1:25,1]+3
x[1:25,2]=x[1:25,2]-4

K-means clustering with K=2

km_out=kmeans(x,2,nstart=20)
plot(x,col=(km_out$cluster+1),main="K-Means Clustering with K=2",pch=20,cex=2)

We generated the data, so we know there are two groups. Let us see what would have happened if we had started with K = 3.

set.seed(4)
km_out=kmeans(x,3,nstart=20)
km_out
## K-means clustering with 3 clusters of sizes 10, 23, 17
## 
## Cluster means:
##         [,1]        [,2]
## 1  2.3001545 -2.69622023
## 2 -0.3820397 -0.08740753
## 3  3.7789567 -4.56200798
## 
## Clustering vector:
##  [1] 3 1 3 1 3 3 3 1 3 1 3 1 3 1 3 1 3 3 3 3 3 1 3 3 3 2 2 2 2 2 2 2 2 2 2
## [36] 2 2 2 2 2 2 2 2 1 2 1 2 2 2 2
## 
## Within cluster sum of squares by cluster:
## [1] 19.56137 52.67700 25.74089
##  (between_SS / total_SS =  79.3 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
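
The book also stresses using a large nstart: with nstart = 1 the algorithm can settle in a poorer local optimum, which shows up as a larger total within-cluster sum of squares (values not reproduced here):

set.seed(3)
km_out=kmeans(x,3,nstart=1)
km_out$tot.withinss
km_out=kmeans(x,3,nstart=20)
km_out$tot.withinss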

Hierarchical clustering

hc_average<-hclust(dist(x),method = 'average')
hc_single<-hclust(dist(x),method = 'single')
hc_complete<-hclust(dist(x),method = 'complete')
par(mfrow=c(1,3))
plot(hc_complete,cex=0.9)
plot(hc_average,cex=0.9)
plot(hc_single,cex=0.9)

Labelling

cutree(hc_complete,2)
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2
## [36] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
cutree(hc_average,2)
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 1 2 2
## [36] 2 2 2 2 2 2 2 2 1 2 1 2 2 2 2
cutree(hc_single,2)
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
cutree(hc_single,4)
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3
## [36] 3 3 3 3 3 3 4 3 3 3 3 3 3 3 3
xsc=scale(x)
plot(hclust(dist(xsc)))

Besides Euclidean distance, we can also use a correlation-based distance.

x=matrix(rnorm(30*3),ncol=3)
dd=as.dist(1-cor(t(x)))
plot(hclust(dd,method='complete'),main="Complete Linkage with Correlation-Based Distance",xlab="",ylab="")

NCI60 Data

nci_labs=NCI60$labs
nci_data=NCI60$data

The data has 64 rows (cell lines) and 6,830 columns (gene expression measurements), so p is far larger than n.
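
The dimensions can be confirmed directly:

dim(nci_data)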

PCA on the NCI60 data

pr_out=prcomp(nci_data,scale=TRUE)
Cols=function(vec)
{
  cols=rainbow(length(unique(vec)))
  return(cols[as.numeric(as.factor(vec))])
}

Plotting the first two principal components

par(mfrow=c(1,2))

plot(pr_out$x[,1:2],col=Cols(nci_labs),pch=19,xlab="Z1",ylab="Z2")
plot(pr_out$x[,c(1,3)],col=Cols(nci_labs),pch=19,xlab="Z1",ylab="Z3")

As the book notes, this indicates that cell lines from the same cancer type tend to have fairly similar gene expression levels.

summary(pr_out)
## Importance of components:
##                            PC1      PC2      PC3      PC4      PC5
## Standard deviation     27.8535 21.48136 19.82046 17.03256 15.97181
## Proportion of Variance  0.1136  0.06756  0.05752  0.04248  0.03735
## Cumulative Proportion   0.1136  0.18115  0.23867  0.28115  0.31850
##                             PC6      PC7      PC8      PC9     PC10
## Standard deviation     15.72108 14.47145 13.54427 13.14400 12.73860
## Proportion of Variance  0.03619  0.03066  0.02686  0.02529  0.02376
## Cumulative Proportion   0.35468  0.38534  0.41220  0.43750  0.46126
##                            PC11     PC12     PC13     PC14     PC15
## Standard deviation     12.68672 12.15769 11.83019 11.62554 11.43779
## Proportion of Variance  0.02357  0.02164  0.02049  0.01979  0.01915
## Cumulative Proportion   0.48482  0.50646  0.52695  0.54674  0.56590
##                            PC16     PC17     PC18     PC19    PC20
## Standard deviation     11.00051 10.65666 10.48880 10.43518 10.3219
## Proportion of Variance  0.01772  0.01663  0.01611  0.01594  0.0156
## Cumulative Proportion   0.58361  0.60024  0.61635  0.63229  0.6479
##                            PC21    PC22    PC23    PC24    PC25    PC26
## Standard deviation     10.14608 10.0544 9.90265 9.64766 9.50764 9.33253
## Proportion of Variance  0.01507  0.0148 0.01436 0.01363 0.01324 0.01275
## Cumulative Proportion   0.66296  0.6778 0.69212 0.70575 0.71899 0.73174
##                           PC27   PC28    PC29    PC30    PC31    PC32
## Standard deviation     9.27320 9.0900 8.98117 8.75003 8.59962 8.44738
## Proportion of Variance 0.01259 0.0121 0.01181 0.01121 0.01083 0.01045
## Cumulative Proportion  0.74433 0.7564 0.76824 0.77945 0.79027 0.80072
##                           PC33    PC34    PC35    PC36    PC37    PC38
## Standard deviation     8.37305 8.21579 8.15731 7.97465 7.90446 7.82127
## Proportion of Variance 0.01026 0.00988 0.00974 0.00931 0.00915 0.00896
## Cumulative Proportion  0.81099 0.82087 0.83061 0.83992 0.84907 0.85803
##                           PC39    PC40    PC41   PC42    PC43   PC44
## Standard deviation     7.72156 7.58603 7.45619 7.3444 7.10449 7.0131
## Proportion of Variance 0.00873 0.00843 0.00814 0.0079 0.00739 0.0072
## Cumulative Proportion  0.86676 0.87518 0.88332 0.8912 0.89861 0.9058
##                           PC45   PC46    PC47    PC48    PC49    PC50
## Standard deviation     6.95839 6.8663 6.80744 6.64763 6.61607 6.40793
## Proportion of Variance 0.00709 0.0069 0.00678 0.00647 0.00641 0.00601
## Cumulative Proportion  0.91290 0.9198 0.92659 0.93306 0.93947 0.94548
##                           PC51    PC52    PC53    PC54    PC55    PC56
## Standard deviation     6.21984 6.20326 6.06706 5.91805 5.91233 5.73539
## Proportion of Variance 0.00566 0.00563 0.00539 0.00513 0.00512 0.00482
## Cumulative Proportion  0.95114 0.95678 0.96216 0.96729 0.97241 0.97723
##                           PC57   PC58    PC59    PC60    PC61    PC62
## Standard deviation     5.47261 5.2921 5.02117 4.68398 4.17567 4.08212
## Proportion of Variance 0.00438 0.0041 0.00369 0.00321 0.00255 0.00244
## Cumulative Proportion  0.98161 0.9857 0.98940 0.99262 0.99517 0.99761
##                           PC63      PC64
## Standard deviation     4.04124 2.148e-14
## Proportion of Variance 0.00239 0.000e+00
## Cumulative Proportion  1.00000 1.000e+00
plot(pr_out)

Scree plot: pve below stands for the proportion of variance explained by each component.

pve=100*pr_out$sdev^2/sum(pr_out$sdev^2)
par(mfrow=c(1,2))

plot(pve,type="o",ylab="PVE",xlab="Principal Component",col="blue")
plot(cumsum(pve),type="o",ylab="Cumulative PVE",xlab="Principal Component",col="brown3")

Hierarchical clustering

sd_data=scale(nci_data)
par(mfrow=c(1,3))
data_dist=dist(sd_data)

plot(hclust(data_dist),labels = nci_labs,main = "Complete Linkage",xlab = "",sub="",ylab="")
plot(hclust(data_dist,method="average"),labels = nci_labs,main = "Average Linkage",xlab = "",sub="",ylab="")
plot(hclust(data_dist,method="single"),labels = nci_labs,main = "Single Linkage",xlab = "",sub="",ylab="")

hc_out=hclust(dist(sd_data))
hc_clusters=cutree(hc_out,4)

table(hc_clusters,nci_labs)
##            nci_labs
## hc_clusters BREAST CNS COLON K562A-repro K562B-repro LEUKEMIA MCF7A-repro
##           1      2   3     2           0           0        0           0
##           2      3   2     0           0           0        0           0
##           3      0   0     0           1           1        6           0
##           4      2   0     5           0           0        0           1
##            nci_labs
## hc_clusters MCF7D-repro MELANOMA NSCLC OVARIAN PROSTATE RENAL UNKNOWN
##           1           0        8     8       6        2     8       1
##           2           0        0     1       0        0     1       0
##           3           0        0     0       0        0     0       0
##           4           1        0     0       0        0     0       0
par(mfrow=c(1,1))
plot(hc_out,labels = nci_labs)
abline(h=139,col="red")

At height 139 the dendrogram is cut into four clusters. The book specifically mentions that performing K-means clustering with K = 4 is not the same as cutting the dendrogram at our chosen height; the resulting clusters can differ.

set.seed(2)
km_out=kmeans(sd_data,4,nstart=20)
km_clusters=km_out$cluster
table(km_clusters,hc_clusters)
##            hc_clusters
## km_clusters  1  2  3  4
##           1 11  0  0  9
##           2  0  0  8  0
##           3  9  0  0  0
##           4 20  7  0  0

Another important point from the book: we can denoise the data by clustering on only the first few principal component score vectors.

hc_out=hclust(dist(pr_out$x[,1:5]))
plot(hc_out,labels=nci_labs,main="Hier. Clust. on First Five score vectors")

table(cutree(hc_out,4),nci_labs)
##    nci_labs
##     BREAST CNS COLON K562A-repro K562B-repro LEUKEMIA MCF7A-repro
##   1      0   2     7           0           0        2           0
##   2      5   3     0           0           0        0           0
##   3      0   0     0           1           1        4           0
##   4      2   0     0           0           0        0           1
##    nci_labs
##     MCF7D-repro MELANOMA NSCLC OVARIAN PROSTATE RENAL UNKNOWN
##   1           0        1     8       5        2     7       0
##   2           0        7     1       1        0     2       1
##   3           0        0     0       0        0     0       0
##   4           1        0     0       0        0     0       0