library("ISLR")
library("car")
library("MASS")
library("leaps")
library("glmnet")
library("pls")
library("splines")
library("gam")
library("akima")
library("tree")
library("randomForest")
library("gbm")
library("e1071")
library("ROCR")
lm_fit<-lm(medv~lstat,data=Boston)
summary(lm_fit)
##
## Call:
## lm(formula = medv ~ lstat, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.168 -3.990 -1.318 2.034 24.500
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.55384 0.56263 61.41 <2e-16 ***
## lstat -0.95005 0.03873 -24.53 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared: 0.5441, Adjusted R-squared: 0.5432
## F-statistic: 601.6 on 1 and 504 DF, p-value: < 2.2e-16
attach(Boston)
plot(lstat,medv)
abline(lm_fit)
par(mfrow=c(2,2))
plot(lm_fit)
lm_fit<-lm(medv~lstat+age,data=Boston)
summary(lm_fit)
##
## Call:
## lm(formula = medv ~ lstat + age, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.981 -3.978 -1.283 1.968 23.158
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.22276 0.73085 45.458 < 2e-16 ***
## lstat -1.03207 0.04819 -21.416 < 2e-16 ***
## age 0.03454 0.01223 2.826 0.00491 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.173 on 503 degrees of freedom
## Multiple R-squared: 0.5513, Adjusted R-squared: 0.5495
## F-statistic: 309 on 2 and 503 DF, p-value: < 2.2e-16
lm_fit<-lm(medv~.,data=Boston)
summary(lm_fit)
##
## Call:
## lm(formula = medv ~ ., data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.595 -2.730 -0.518 1.777 26.199
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.646e+01 5.103e+00 7.144 3.28e-12 ***
## crim -1.080e-01 3.286e-02 -3.287 0.001087 **
## zn 4.642e-02 1.373e-02 3.382 0.000778 ***
## indus 2.056e-02 6.150e-02 0.334 0.738288
## chas 2.687e+00 8.616e-01 3.118 0.001925 **
## nox -1.777e+01 3.820e+00 -4.651 4.25e-06 ***
## rm 3.810e+00 4.179e-01 9.116 < 2e-16 ***
## age 6.922e-04 1.321e-02 0.052 0.958229
## dis -1.476e+00 1.995e-01 -7.398 6.01e-13 ***
## rad 3.060e-01 6.635e-02 4.613 5.07e-06 ***
## tax -1.233e-02 3.760e-03 -3.280 0.001112 **
## ptratio -9.527e-01 1.308e-01 -7.283 1.31e-12 ***
## black 9.312e-03 2.686e-03 3.467 0.000573 ***
## lstat -5.248e-01 5.072e-02 -10.347 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.745 on 492 degrees of freedom
## Multiple R-squared: 0.7406, Adjusted R-squared: 0.7338
## F-statistic: 108.1 on 13 and 492 DF, p-value: < 2.2e-16
vif(lm_fit)
## crim zn indus chas nox rm age dis
## 1.792192 2.298758 3.991596 1.073995 4.393720 1.933744 3.100826 3.955945
## rad tax ptratio black lstat
## 7.484496 9.008554 1.799084 1.348521 2.941491
lm_fit1<-lm(medv~lstat+I(lstat^2),data=Boston)
summary(lm_fit1)
plot(lm_fit1)
names(Carseats)
## [1] "Sales" "CompPrice" "Income" "Advertising" "Population"
## [6] "Price" "ShelveLoc" "Age" "Education" "Urban"
## [11] "US"
lm_fit=lm(Sales~.+Income:Advertising+Price:Age,data=Carseats)
summary(lm_fit)
##
## Call:
## lm(formula = Sales ~ . + Income:Advertising + Price:Age, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.9208 -0.7503 0.0177 0.6754 3.3413
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.5755654 1.0087470 6.519 2.22e-10 ***
## CompPrice 0.0929371 0.0041183 22.567 < 2e-16 ***
## Income 0.0108940 0.0026044 4.183 3.57e-05 ***
## Advertising 0.0702462 0.0226091 3.107 0.002030 **
## Population 0.0001592 0.0003679 0.433 0.665330
## Price -0.1008064 0.0074399 -13.549 < 2e-16 ***
## ShelveLocGood 4.8486762 0.1528378 31.724 < 2e-16 ***
## ShelveLocMedium 1.9532620 0.1257682 15.531 < 2e-16 ***
## Age -0.0579466 0.0159506 -3.633 0.000318 ***
## Education -0.0208525 0.0196131 -1.063 0.288361
## UrbanYes 0.1401597 0.1124019 1.247 0.213171
## USYes -0.1575571 0.1489234 -1.058 0.290729
## Income:Advertising 0.0007510 0.0002784 2.698 0.007290 **
## Price:Age 0.0001068 0.0001333 0.801 0.423812
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.011 on 386 degrees of freedom
## Multiple R-squared: 0.8761, Adjusted R-squared: 0.8719
## F-statistic: 210 on 13 and 386 DF, p-value: < 2.2e-16
names(Smarket)
## [1] "Year" "Lag1" "Lag2" "Lag3" "Lag4" "Lag5"
## [7] "Volume" "Today" "Direction"
cor(Smarket[,-9])
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 0.029699649 0.030596422 0.033194581 0.035688718
## Lag1 0.02969965 1.000000000 -0.026294328 -0.010803402 -0.002985911
## Lag2 0.03059642 -0.026294328 1.000000000 -0.025896670 -0.010853533
## Lag3 0.03319458 -0.010803402 -0.025896670 1.000000000 -0.024051036
## Lag4 0.03568872 -0.002985911 -0.010853533 -0.024051036 1.000000000
## Lag5 0.02978799 -0.005674606 -0.003557949 -0.018808338 -0.027083641
## Volume 0.53900647 0.040909908 -0.043383215 -0.041823686 -0.048414246
## Today 0.03009523 -0.026155045 -0.010250033 -0.002447647 -0.006899527
## Lag5 Volume Today
## Year 0.029787995 0.53900647 0.030095229
## Lag1 -0.005674606 0.04090991 -0.026155045
## Lag2 -0.003557949 -0.04338321 -0.010250033
## Lag3 -0.018808338 -0.04182369 -0.002447647
## Lag4 -0.027083641 -0.04841425 -0.006899527
## Lag5 1.000000000 -0.02200231 -0.034860083
## Volume -0.022002315 1.00000000 0.014591823
## Today -0.034860083 0.01459182 1.000000000
Fitting a logistic regression model to predict the direction of the market
attach(Smarket)
glm_fit<-glm(Direction~.-Year-Direction-Today,data = Smarket,family = "binomial")
summary(glm_fit)
##
## Call:
## glm(formula = Direction ~ . - Year - Direction - Today, family = "binomial",
## data = Smarket)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.446 -1.203 1.065 1.145 1.326
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.126000 0.240736 -0.523 0.601
## Lag1 -0.073074 0.050167 -1.457 0.145
## Lag2 -0.042301 0.050086 -0.845 0.398
## Lag3 0.011085 0.049939 0.222 0.824
## Lag4 0.009359 0.049974 0.187 0.851
## Lag5 0.010313 0.049511 0.208 0.835
## Volume 0.135441 0.158360 0.855 0.392
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1731.2 on 1249 degrees of freedom
## Residual deviance: 1727.6 on 1243 degrees of freedom
## AIC: 1741.6
##
## Number of Fisher Scoring iterations: 3
glm_probs<-predict(glm_fit,type="response")
glm_probs[1:10]
## 1 2 3 4 5 6 7
## 0.5070841 0.4814679 0.4811388 0.5152224 0.5107812 0.5069565 0.4926509
## 8 9 10
## 0.5092292 0.5176135 0.4888378
contrasts(Direction)
## Up
## Down 0
## Up 1
glm_pred<-rep("Down",1250)
glm_pred[glm_probs>0.5]="Up"
sum(glm_pred==Direction)
## [1] 652
Using only observations before 2005 as the training set
train<-(Year<2005)
Smarket.2005<-Smarket[!train,]
Direction.2005<-Direction[!train]
lda_fit<-lda(Direction~Lag1+Lag2,data=Smarket,subset = train)
lda_fit
## Call:
## lda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.491984 0.508016
##
## Group means:
## Lag1 Lag2
## Down 0.04279022 0.03389409
## Up -0.03954635 -0.03132544
##
## Coefficients of linear discriminants:
## LD1
## Lag1 -0.6420190
## Lag2 -0.5135293
lda_pred<-predict(lda_fit,Smarket.2005)
names(lda_pred)
## [1] "class" "posterior" "x"
lda_class<-lda_pred$class
table(lda_class,Direction.2005)
## Direction.2005
## lda_class Down Up
## Down 35 35
## Up 76 106
lda_pred$posterior
## Down Up
## 999 0.4901792 0.5098208
## 1000 0.4792185 0.5207815
## 1001 0.4668185 0.5331815
## 1002 0.4740011 0.5259989
## 1003 0.4927877 0.5072123
## 1004 0.4938562 0.5061438
## 1005 0.4951016 0.5048984
## 1006 0.4872861 0.5127139
## 1007 0.4907013 0.5092987
## 1008 0.4844026 0.5155974
## 1009 0.4906963 0.5093037
## 1010 0.5119988 0.4880012
## 1011 0.4895152 0.5104848
## 1012 0.4706761 0.5293239
## 1013 0.4744593 0.5255407
## 1014 0.4799583 0.5200417
## 1015 0.4935775 0.5064225
## 1016 0.5030894 0.4969106
## 1017 0.4978806 0.5021194
## 1018 0.4886331 0.5113669
## 1019 0.5006568 0.4993432
## 1020 0.5108735 0.4891265
## 1021 0.5039925 0.4960075
## 1022 0.4916335 0.5083665
## 1023 0.5041772 0.4958228
## 1024 0.5026751 0.4973249
## 1025 0.4914043 0.5085957
## 1026 0.4805964 0.5194036
## 1027 0.4882718 0.5117282
## 1028 0.5062187 0.4937813
## 1029 0.5005996 0.4994004
## 1030 0.4972965 0.5027035
## 1031 0.4958546 0.5041454
## 1032 0.4811777 0.5188223
## 1033 0.4841417 0.5158583
## 1034 0.4726388 0.5273612
## 1035 0.4836418 0.5163582
## 1036 0.5091007 0.4908993
## 1037 0.5135941 0.4864059
## 1038 0.4933839 0.5066161
## 1039 0.4926856 0.5073144
## 1040 0.4978472 0.5021528
## 1041 0.4920914 0.5079086
## 1042 0.5056346 0.4943654
## 1043 0.5062288 0.4937712
## 1044 0.4881894 0.5118106
## 1045 0.4725293 0.5274707
## 1046 0.4832339 0.5167661
## 1047 0.4835086 0.5164914
## 1048 0.4913334 0.5086666
## 1049 0.4877566 0.5122434
## 1050 0.4724386 0.5275614
## 1051 0.4854877 0.5145123
## 1052 0.4932911 0.5067089
## 1053 0.4845973 0.5154027
## 1054 0.4723718 0.5276282
## 1055 0.4816170 0.5183830
## 1056 0.4914067 0.5085933
## 1057 0.4942755 0.5057245
## 1058 0.4841232 0.5158768
## 1059 0.5026064 0.4973936
## 1060 0.5062557 0.4937443
## 1061 0.4821800 0.5178200
## 1062 0.4885263 0.5114737
## 1063 0.5011825 0.4988175
## 1064 0.5000595 0.4999405
## 1065 0.5027377 0.4972623
## 1066 0.4870086 0.5129914
## 1067 0.4827213 0.5172787
## 1068 0.4996501 0.5003499
## 1069 0.4818079 0.5181921
## 1070 0.4651057 0.5348943
## 1071 0.4577867 0.5422133
## 1072 0.4775004 0.5224996
## 1073 0.5034250 0.4965750
## 1074 0.4801664 0.5198336
## 1075 0.5046171 0.4953829
## 1076 0.5044752 0.4955248
## 1077 0.4964663 0.5035337
## 1078 0.4892965 0.5107035
## 1079 0.4876236 0.5123764
## 1080 0.4805625 0.5194375
## 1081 0.4958518 0.5041482
## 1082 0.5115212 0.4884788
## 1083 0.4958572 0.5041428
## 1084 0.5082871 0.4917129
## 1085 0.5022091 0.4977909
## 1086 0.4875892 0.5124108
## 1087 0.4995948 0.5004052
## 1088 0.4841917 0.5158083
## 1089 0.4858843 0.5141157
## 1090 0.4826969 0.5173031
## 1091 0.4745012 0.5254988
## 1092 0.5008540 0.4991460
## 1093 0.5127766 0.4872234
## 1094 0.5135472 0.4864528
## 1095 0.5095127 0.4904873
## 1096 0.4950201 0.5049799
## 1097 0.4956088 0.5043912
## 1098 0.4964643 0.5035357
## 1099 0.4874363 0.5125637
## 1100 0.4970339 0.5029661
## 1101 0.5003751 0.4996249
## 1102 0.4846137 0.5153863
## 1103 0.4976914 0.5023086
## 1104 0.5043081 0.4956919
## 1105 0.4843366 0.5156634
## 1106 0.4860664 0.5139336
## 1107 0.4930417 0.5069583
## 1108 0.4887219 0.5112781
## 1109 0.4968147 0.5031853
## 1110 0.4944989 0.5055011
## 1111 0.4924743 0.5075257
## 1112 0.4980141 0.5019859
## 1113 0.4978727 0.5021273
## 1114 0.4994390 0.5005610
## 1115 0.5028317 0.4971683
## 1116 0.4964503 0.5035497
## 1117 0.4883202 0.5116798
## 1118 0.4899801 0.5100199
## 1119 0.4771957 0.5228043
## 1120 0.4694030 0.5305970
## 1121 0.4824692 0.5175308
## 1122 0.5037943 0.4962057
## 1123 0.5000974 0.4999026
## 1124 0.4805303 0.5194697
## 1125 0.4876953 0.5123047
## 1126 0.5070782 0.4929218
## 1127 0.4901776 0.5098224
## 1128 0.4860999 0.5139001
## 1129 0.5108497 0.4891503
## 1130 0.5135547 0.4864453
## 1131 0.5020218 0.4979782
## 1132 0.4956830 0.5043170
## 1133 0.4965536 0.5034464
## 1134 0.4964590 0.5035410
## 1135 0.4855719 0.5144281
## 1136 0.4951439 0.5048561
## 1137 0.5060048 0.4939952
## 1138 0.4880643 0.5119357
## 1139 0.4921175 0.5078825
## 1140 0.4927195 0.5072805
## 1141 0.4901661 0.5098339
## 1142 0.5001986 0.4998014
## 1143 0.5047746 0.4952254
## 1144 0.4875267 0.5124733
## 1145 0.4847648 0.5152352
## 1146 0.5028405 0.4971595
## 1147 0.5008435 0.4991565
## 1148 0.4825591 0.5174409
## 1149 0.4732124 0.5267876
## 1150 0.4797731 0.5202269
## 1151 0.4983172 0.5016828
## 1152 0.4968824 0.5031176
## 1153 0.4997031 0.5002969
## 1154 0.4914721 0.5085279
## 1155 0.4892300 0.5107700
## 1156 0.4787694 0.5212306
## 1157 0.4799234 0.5200766
## 1158 0.4913818 0.5086182
## 1159 0.4916287 0.5083713
## 1160 0.4948795 0.5051205
## 1161 0.4890900 0.5109100
## 1162 0.4790944 0.5209056
## 1163 0.4878531 0.5121469
## 1164 0.4861838 0.5138162
## 1165 0.4935558 0.5064442
## 1166 0.4941329 0.5058671
## 1167 0.5020762 0.4979238
## 1168 0.5043051 0.4956949
## 1169 0.4890430 0.5109570
## 1170 0.5062006 0.4937994
## 1171 0.5092767 0.4907233
## 1172 0.4893670 0.5106330
## 1173 0.4987776 0.5012224
## 1174 0.4997456 0.5002544
## 1175 0.4806852 0.5193148
## 1176 0.4790536 0.5209464
## 1177 0.4889496 0.5110504
## 1178 0.5039466 0.4960534
## 1179 0.4934174 0.5065826
## 1180 0.4748985 0.5251015
## 1181 0.4706261 0.5293739
## 1182 0.4868978 0.5131022
## 1183 0.4967554 0.5032446
## 1184 0.4929449 0.5070551
## 1185 0.4922853 0.5077147
## 1186 0.4933690 0.5066310
## 1187 0.5053601 0.4946399
## 1188 0.5030552 0.4969448
## 1189 0.4905837 0.5094163
## 1190 0.4762390 0.5237610
## 1191 0.4603392 0.5396608
## 1192 0.4697932 0.5302068
## 1193 0.4925300 0.5074700
## 1194 0.4861143 0.5138857
## 1195 0.4811376 0.5188624
## 1196 0.4812474 0.5187526
## 1197 0.4842383 0.5157617
## 1198 0.5026218 0.4973782
## 1199 0.5052312 0.4947688
## 1200 0.4813184 0.5186816
## 1201 0.5015397 0.4984603
## 1202 0.4877161 0.5122839
## 1203 0.4774171 0.5225829
## 1204 0.5168827 0.4831173
## 1205 0.5072640 0.4927360
## 1206 0.4833515 0.5166485
## 1207 0.4726701 0.5273299
## 1208 0.5032667 0.4967333
## 1209 0.5202350 0.4797650
## 1210 0.4950279 0.5049721
## 1211 0.5018767 0.4981233
## 1212 0.5089142 0.4910858
## 1213 0.4968911 0.5031089
## 1214 0.4951595 0.5048405
## 1215 0.4895942 0.5104058
## 1216 0.4904653 0.5095347
## 1217 0.5055318 0.4944682
## 1218 0.5055416 0.4944584
## 1219 0.4942470 0.5057530
## 1220 0.4857495 0.5142505
## 1221 0.4901606 0.5098394
## 1222 0.5069730 0.4930270
## 1223 0.5084764 0.4915236
## 1224 0.5041288 0.4958712
## 1225 0.5048299 0.4951701
## 1226 0.5023879 0.4976121
## 1227 0.4986903 0.5013097
## 1228 0.4824758 0.5175242
## 1229 0.4825469 0.5174531
## 1230 0.4831600 0.5168400
## 1231 0.5017497 0.4982503
## 1232 0.5058708 0.4941292
## 1233 0.4890321 0.5109679
## 1234 0.4911052 0.5088948
## 1235 0.4864250 0.5135750
## 1236 0.4847062 0.5152938
## 1237 0.4944890 0.5055110
## 1238 0.4962261 0.5037739
## 1239 0.5005702 0.4994298
## 1240 0.5039068 0.4960932
## 1241 0.4946376 0.5053624
## 1242 0.4864366 0.5135634
## 1243 0.4807022 0.5192978
## 1244 0.4851439 0.5148561
## 1245 0.4951734 0.5048266
## 1246 0.5005893 0.4994107
## 1247 0.4972210 0.5027790
## 1248 0.4791988 0.5208012
## 1249 0.4831673 0.5168327
## 1250 0.4892591 0.5107409
lda_pred$posterior[1:20,1]
## 999 1000 1001 1002 1003 1004 1005
## 0.4901792 0.4792185 0.4668185 0.4740011 0.4927877 0.4938562 0.4951016
## 1006 1007 1008 1009 1010 1011 1012
## 0.4872861 0.4907013 0.4844026 0.4906963 0.5119988 0.4895152 0.4706761
## 1013 1014 1015 1016 1017 1018
## 0.4744593 0.4799583 0.4935775 0.5030894 0.4978806 0.4886331
lda_class[1:20]
## [1] Up Up Up Up Up Up Up Up Up Up Up Down Up Up
## [15] Up Up Up Down Up Up
## Levels: Down Up
qda_fit<-qda(Direction~Lag1+Lag2,data = Smarket,subset = train)
qda_fit
## Call:
## qda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.491984 0.508016
##
## Group means:
## Lag1 Lag2
## Down 0.04279022 0.03389409
## Up -0.03954635 -0.03132544
library("class")
train_X=cbind(Lag1,Lag2)[train,]
test_X=cbind(Lag1,Lag2)[!train,]
train_Direction=Direction[train]
set.seed(1)
knn_pred<-knn(train_X,test_X,train_Direction,k=3)
table(knn_pred,Direction.2005)
## Direction.2005
## knn_pred Down Up
## Down 48 55
## Up 63 86
attach(Caravan)
standardized_X=scale(Caravan[,-86])
test=1:1000
train_x=standardized_X[-test,]
test_X=standardized_X[test,]
train_Y=Purchase[-test]
test_Y=Purchase[test]
set.seed(1)
knn_pred=knn(train_x,test_X,train_Y,k=1)
mean(test_Y!=knn_pred)
## [1] 0.118
set.seed(1)
train=sample(392,196)
attach(Auto)
lm_fit<-lm(mpg~horsepower,data=Auto,subset = train)
mean((mpg- predict(lm_fit,Auto))[-train]^2)
## [1] 26.14142
library("boot")
##
## Attaching package: 'boot'
## The following object is masked from 'package:car':
##
## logit
glm_fit<-glm(mpg~horsepower,data = Auto)
cv.err=cv.glm(Auto,glm_fit)
cv.err$K
## [1] 392
cv.error=rep(0,5)
for(i in 1:5){
glm_fit<-glm(mpg~poly(horsepower,i),data = Auto)
cv.error[i]=cv.glm(Auto,glm_fit)$delta[1]
}
cv.error
## [1] 24.23151 19.24821 19.33498 19.42443 19.03321
alpha_fn=function(data,index){
X=data$X[index]
Y=data$Y[index]
return((var(Y)-cov(X,Y))/(var(X)+var(Y)-2*cov(X,Y)))
}
alpha_fn(Portfolio,1:100)
## [1] 0.5758321
boot(Portfolio,alpha_fn,R=1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Portfolio, statistic = alpha_fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 0.5758321 0.00107223 0.08860155
Hitters=na.omit(Hitters)
regfit_full=regsubsets(Salary~.,Hitters)
summary(regfit_full)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., Hitters)
## 19 Variables (and intercept)
## Forced in Forced out
## AtBat FALSE FALSE
## Hits FALSE FALSE
## HmRun FALSE FALSE
## Runs FALSE FALSE
## RBI FALSE FALSE
## Walks FALSE FALSE
## Years FALSE FALSE
## CAtBat FALSE FALSE
## CHits FALSE FALSE
## CHmRun FALSE FALSE
## CRuns FALSE FALSE
## CRBI FALSE FALSE
## CWalks FALSE FALSE
## LeagueN FALSE FALSE
## DivisionW FALSE FALSE
## PutOuts FALSE FALSE
## Assists FALSE FALSE
## Errors FALSE FALSE
## NewLeagueN FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " "
## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " "
## 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " "
## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " "
## 7 ( 1 ) " " "*" " " " " " " "*" " " "*" "*" "*" " "
## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " "*" "*"
## CRBI CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1 ( 1 ) "*" " " " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " " " " "
## 3 ( 1 ) "*" " " " " " " "*" " " " " " "
## 4 ( 1 ) "*" " " " " "*" "*" " " " " " "
## 5 ( 1 ) "*" " " " " "*" "*" " " " " " "
## 6 ( 1 ) "*" " " " " "*" "*" " " " " " "
## 7 ( 1 ) " " " " " " "*" "*" " " " " " "
## 8 ( 1 ) " " "*" " " "*" "*" " " " " " "
An asterisk indicates that a variable is included in the best model of the corresponding size. In this run, Hits appears in the best models of sizes two through eight, while CRBI and PutOuts each appear in six of the eight models.
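To count those appearances programmatically, one option (a small sketch using the which matrix stored in the summary object) is:
# Each column of `which` is TRUE when that variable is in the best model of a given size
colSums(summary(regfit_full)$which)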
regfit_full<-regsubsets(Salary~.,data = Hitters,nvmax = 19)
reg_summary=summary(regfit_full)
reg_summary$rsq
## [1] 0.3214501 0.4252237 0.4514294 0.4754067 0.4908036 0.5087146 0.5141227
## [8] 0.5285569 0.5346124 0.5404950 0.5426153 0.5436302 0.5444570 0.5452164
## [15] 0.5454692 0.5457656 0.5459518 0.5460945 0.5461159
par(mfrow=c(2,2))
plot(reg_summary$rss,xlab="Number of variables",ylab="RSS",type="l")
plot(reg_summary$adjr2,xlab="Number of variables",ylab="Adjusted Rsq",type="l")
which.max(reg_summary$adjr2)
## [1] 11
points(11,reg_summary$adjr2[11],col="red",cex=2,pch=20)
plot(regfit_full,scale="adjr2")
plot(regfit_full,scale="r2")
Forward and Backward Stepwise selection
regfit_fwd<-regsubsets(Salary~.,data = Hitters,nvmax = 19,method = "forward")
summary(regfit_fwd)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters, nvmax = 19, method = "forward")
## 19 Variables (and intercept)
## Forced in Forced out
## AtBat FALSE FALSE
## Hits FALSE FALSE
## HmRun FALSE FALSE
## Runs FALSE FALSE
## RBI FALSE FALSE
## Walks FALSE FALSE
## Years FALSE FALSE
## CAtBat FALSE FALSE
## CHits FALSE FALSE
## CHmRun FALSE FALSE
## CRuns FALSE FALSE
## CRBI FALSE FALSE
## CWalks FALSE FALSE
## LeagueN FALSE FALSE
## DivisionW FALSE FALSE
## PutOuts FALSE FALSE
## Assists FALSE FALSE
## Errors FALSE FALSE
## NewLeagueN FALSE FALSE
## 1 subsets of each size up to 19
## Selection Algorithm: forward
## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " "
## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " "
## 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " "
## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " "
## 7 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " "
## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*"
## 9 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*"
## 10 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*"
## 11 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*"
## 12 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*"
## 13 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*"
## 14 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" " " " " "*"
## 15 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" "*" " " "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## CRBI CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1 ( 1 ) "*" " " " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " " " " "
## 3 ( 1 ) "*" " " " " " " "*" " " " " " "
## 4 ( 1 ) "*" " " " " "*" "*" " " " " " "
## 5 ( 1 ) "*" " " " " "*" "*" " " " " " "
## 6 ( 1 ) "*" " " " " "*" "*" " " " " " "
## 7 ( 1 ) "*" "*" " " "*" "*" " " " " " "
## 8 ( 1 ) "*" "*" " " "*" "*" " " " " " "
## 9 ( 1 ) "*" "*" " " "*" "*" " " " " " "
## 10 ( 1 ) "*" "*" " " "*" "*" "*" " " " "
## 11 ( 1 ) "*" "*" "*" "*" "*" "*" " " " "
## 12 ( 1 ) "*" "*" "*" "*" "*" "*" " " " "
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" "*" " "
## 14 ( 1 ) "*" "*" "*" "*" "*" "*" "*" " "
## 15 ( 1 ) "*" "*" "*" "*" "*" "*" "*" " "
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" "*" " "
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*"
set.seed(1)
train=sample(c(TRUE,FALSE),nrow(Hitters),rep=TRUE)
test=(!train)
regfit_best=regsubsets(Salary~.,data=Hitters[train,],nvmax = 19)
test_mat=model.matrix(Salary~.,data = Hitters[test,])
val_errors=rep(NA,19)
for(i in 1:19){
coefi=coef(regfit_best,id=i)
pred_test=test_mat[,names(coefi)] %*% coefi
val_errors[i]=mean((Hitters$Salary[test]-pred_test)^2)
}
val_errors
## [1] 220968.0 169157.1 178518.2 163426.1 168418.1 171270.6 162377.1
## [8] 157909.3 154055.7 148162.1 151156.4 151742.5 152214.5 157358.7
## [15] 158541.4 158743.3 159972.7 159859.8 160105.6
Since there is no predict() method for regsubsets objects, we write our own:
predict_regsubsets<-function(object,newdata,id,...){
form=as.formula(object$call[[2]])
mat=model.matrix(form,newdata)
coefi=coef(object,id=id)
xvars=names(coefi)
mat[,xvars]%*% coefi
}
regfit_best=regsubsets(Salary~.,data=Hitters,nvmax=19)
coef(regfit_best,10)
## (Intercept) AtBat Hits Walks CAtBat
## 162.5354420 -2.1686501 6.9180175 5.7732246 -0.1300798
## CRuns CRBI CWalks DivisionW PutOuts
## 1.4082490 0.7743122 -0.8308264 -112.3800575 0.2973726
## Assists
## 0.2831680
k=10
set.seed(1)
folds=sample(1:k,nrow(Hitters),replace = TRUE)
cv_errors=matrix(NA,k,19,dimnames=list(NULL,paste(1:19)))
for(j in 1:k){
best_fit=regsubsets(Salary~.,data = Hitters[folds!=j,],nvmax=19)
for(i in 1:19){
pred=predict_regsubsets(best_fit,Hitters[folds==j,],id=i)
cv_errors[j,i]=mean((Hitters$Salary[folds==j]-pred)^2)
}
}
mean_cv_errors=apply(cv_errors, 2, mean)
plot(mean_cv_errors,type="b")
x=model.matrix(Salary~.,Hitters)[,-1]
y=Hitters$Salary
model.matrix() converts categorical variables into numeric 0/1 dummy variables.
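For example, the factor columns League, Division and NewLeague in Hitters become the dummy columns LeagueN, DivisionW and NewLeagueN (a quick sketch, assuming those are the column names model.matrix produced):
head(x[, c("LeagueN", "DivisionW", "NewLeagueN")])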
Ridge Regression
grid=10^seq(10,-2,length=100)
ridge_mod=glmnet(x,y,alpha=0,lambda = grid)
ridge_mod$lambda[50]
## [1] 11497.57
coef(ridge_mod)[,50]
## (Intercept) AtBat Hits HmRun Runs
## 407.356050200 0.036957182 0.138180344 0.524629976 0.230701523
## RBI Walks Years CAtBat CHits
## 0.239841459 0.289618741 1.107702929 0.003131815 0.011653637
## CHmRun CRuns CRBI CWalks LeagueN
## 0.087545670 0.023379882 0.024138320 0.025015421 0.085028114
## DivisionW PutOuts Assists Errors NewLeagueN
## -6.215440973 0.016482577 0.002612988 -0.020502690 0.301433531
Calculate the l2 norm of the coefficients (excluding the intercept)
sqrt(sum(coef(ridge_mod)[-1,50]^2))
## [1] 6.360612
The larger the value of lambda, the more the coefficients are shrunk toward zero and hence the smaller the l2 norm. Let us calculate it for a smaller value of lambda:
sqrt(sum(coef(ridge_mod)[-1,60]^2))
## [1] 57.11001
We can also use the predict() function; we just have to set type="coefficients"
predict(ridge_mod,type = "coefficients",s=50)[1:20,]
## (Intercept) AtBat Hits HmRun Runs
## 4.876610e+01 -3.580999e-01 1.969359e+00 -1.278248e+00 1.145892e+00
## RBI Walks Years CAtBat CHits
## 8.038292e-01 2.716186e+00 -6.218319e+00 5.447837e-03 1.064895e-01
## CHmRun CRuns CRBI CWalks LeagueN
## 6.244860e-01 2.214985e-01 2.186914e-01 -1.500245e-01 4.592589e+01
## DivisionW PutOuts Assists Errors NewLeagueN
## -1.182011e+02 2.502322e-01 1.215665e-01 -3.278600e+00 -9.496680e+00
set.seed(1)
train=sample(1:nrow(x),nrow(x)/2)
test=-train
y_test=y[test]
Fitting a ridge regression model
ridge_mod=glmnet(x[train,],y[train],alpha=0,lambda = grid,thresh = 1e-12)
ridge_pred=predict(ridge_mod,s=4,newx = x[test,])
mean((ridge_pred-y_test)^2)
## [1] 101036.8
mean((mean(y[train])-y_test)^2)
## [1] 193253.1
#ridge_pred=predict(ridge_mod,s=0,newx = x[test,],exact = T)
#mean((ridge_pred-y_test)^2)
#lm(y~x,subset = train)
#the following commands are not running
# predict(object = ridge_mod,s=0,exact = T,type="coefficients")
# predict.glmnet(ridge_mod,s=0,exact = T,type="coefficients")[1:20,]
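In more recent versions of glmnet, exact = TRUE refits the model and therefore needs the original data passed back in, so a call along the following lines should work (a sketch under that assumption):
# Re-supply the training data so glmnet can refit exactly at s = 0
predict(ridge_mod, s = 0, exact = TRUE, type = "coefficients",
        x = x[train,], y = y[train])[1:20,]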
Using cross-validation to find the best value of lambda
set.seed(1)
cv_out=cv.glmnet(x[train,],y[train],alpha=0)
plot(cv_out)
bestlam=cv_out$lambda.min
bestlam
## [1] 211.7416
The best value of lambda is approximately 212
ridge_pred=predict(ridge_mod,s=bestlam,newx=x[test,])
mean((ridge_pred-y_test)^2)
## [1] 96015.51
lasso_mod=glmnet(x[train,],y[train],alpha=1,lambda=grid)
plot(lasso_mod)
set.seed(1)
cv_out=cv.glmnet(x[train,],y[train],alpha=1)
plot(cv_out)
bestlam=cv_out$lambda.min
lasso_pred=predict(lasso_mod,s=bestlam,newx=x[test,])
mean((lasso_pred-y_test)^2)
## [1] 100743.4
The test MSE is comparable to that of ridge regression, but unlike ridge, which only shrinks the coefficients, the lasso sets some of them exactly to zero
out=glmnet(x,y,alpha=1,lambda = grid)
lasso_coef=predict(out,type="coefficients",s=bestlam)[1:20,]
lasso_coef
## (Intercept) AtBat Hits HmRun Runs
## 18.5394844 0.0000000 1.8735390 0.0000000 0.0000000
## RBI Walks Years CAtBat CHits
## 0.0000000 2.2178444 0.0000000 0.0000000 0.0000000
## CHmRun CRuns CRBI CWalks LeagueN
## 0.0000000 0.2071252 0.4130132 0.0000000 3.2666677
## DivisionW PutOuts Assists Errors NewLeagueN
## -103.4845458 0.2204284 0.0000000 0.0000000 0.0000000
Most of the coefficients are set exactly to zero, so the lasso model is easier to interpret
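To list only the variables the lasso retains, we can drop the zero entries:
lasso_coef[lasso_coef != 0]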
set.seed(2)
pcr_fit<-pcr(Salary~.,data=Hitters,scale=TRUE,validation="CV")
summary(pcr_fit)
## Data: X dimension: 263 19
## Y dimension: 263 1
## Fit method: svdpc
## Number of components considered: 19
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 452 348.9 352.2 353.5 352.8 350.1 349.1
## adjCV 452 348.7 351.8 352.9 352.1 349.3 348.0
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 349.6 350.9 352.9 353.8 355.0 356.2 363.5
## adjCV 348.5 349.8 351.6 352.3 353.4 354.5 361.6
## 14 comps 15 comps 16 comps 17 comps 18 comps 19 comps
## CV 355.2 357.4 347.6 350.1 349.2 352.6
## adjCV 352.8 355.2 345.5 347.6 346.7 349.8
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 38.31 60.16 70.84 79.03 84.29 88.63 92.26
## Salary 40.63 41.58 42.17 43.22 44.90 46.48 46.69
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps
## X 94.96 96.28 97.26 97.98 98.65 99.15 99.47
## Salary 46.75 46.86 47.76 47.82 47.85 48.10 50.40
## 15 comps 16 comps 17 comps 18 comps 19 comps
## X 99.75 99.89 99.97 99.99 100.00
## Salary 50.55 53.01 53.85 54.61 54.61
validationplot(pcr_fit,val.type = "MSEP")
set.seed(1)
pcr_fit<-pcr(Salary~.,data=Hitters,subset=train,scale=TRUE,validation="CV")
validationplot(pcr_fit,val.type = "MSEP")
pcr_fit=pcr(y~x,scale=TRUE,ncomp=7)
summary(pcr_fit)
## Data: X dimension: 263 19
## Y dimension: 263 1
## Fit method: svdpc
## Number of components considered: 7
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 38.31 60.16 70.84 79.03 84.29 88.63 92.26
## y 40.63 41.58 42.17 43.22 44.90 46.48 46.69
set.seed(1)
pls_fit=plsr(Salary~.,data=Hitters,subset=train,scale=TRUE,validation="CV")
summary(pls_fit)
## Data: X dimension: 131 19
## Y dimension: 131 1
## Fit method: kernelpls
## Number of components considered: 19
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 464.6 394.2 391.5 393.1 395.0 415.0 424.0
## adjCV 464.6 393.4 390.2 391.1 392.9 411.5 418.8
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 424.5 415.8 404.6 407.1 412.0 414.4 410.3
## adjCV 418.9 411.4 400.7 402.2 407.2 409.3 405.6
## 14 comps 15 comps 16 comps 17 comps 18 comps 19 comps
## CV 406.2 408.6 410.5 408.8 407.8 410.2
## adjCV 401.8 403.9 405.6 404.1 403.2 405.5
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 38.12 53.46 66.05 74.49 79.33 84.56 87.09
## Salary 33.58 38.96 41.57 42.43 44.04 45.59 47.05
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps
## X 90.74 92.55 93.94 97.23 97.88 98.35 98.85
## Salary 47.53 48.42 49.68 50.04 50.54 50.78 50.92
## 15 comps 16 comps 17 comps 18 comps 19 comps
## X 99.11 99.43 99.78 99.99 100.00
## Salary 51.04 51.11 51.15 51.16 51.18
attach(Wage)
## The following object is masked from Auto:
##
## year
## The following object is masked from Boston:
##
## age
fit=lm(wage~poly(age,4),data=Wage)
coef(summary(fit))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 111.70361 0.7287409 153.283015 0.000000e+00
## poly(age, 4)1 447.06785 39.9147851 11.200558 1.484604e-28
## poly(age, 4)2 -478.31581 39.9147851 -11.983424 2.355831e-32
## poly(age, 4)3 125.52169 39.9147851 3.144742 1.678622e-03
## poly(age, 4)4 -77.91118 39.9147851 -1.951938 5.103865e-02
fit=lm(wage~poly(age,4,raw=T),data=Wage)
coef(summary(fit))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.841542e+02 6.004038e+01 -3.067172 0.0021802539
## poly(age, 4, raw = T)1 2.124552e+01 5.886748e+00 3.609042 0.0003123618
## poly(age, 4, raw = T)2 -5.638593e-01 2.061083e-01 -2.735743 0.0062606446
## poly(age, 4, raw = T)3 6.810688e-03 3.065931e-03 2.221409 0.0263977518
## poly(age, 4, raw = T)4 -3.203830e-05 1.641359e-05 -1.951938 0.0510386498
Important: use the I() wrapper to enter polynomial terms literally in a formula; for example, age^2 must be written as I(age^2)
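For instance, a degree-4 fit with explicit powers could be written as follows (a sketch; fit_raw is just an illustrative name):
fit_raw = lm(wage ~ age + I(age^2) + I(age^3) + I(age^4), data = Wage)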
agelims=range(age)
age_grid=seq(from=agelims[1],to=agelims[2])
preds=predict(fit,newdata = list(age=age_grid),se=TRUE)
se_bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
par(mfrow=c(1,2),mar=c(4.5,4.5,1,1),oma=c(0,0,4,0))
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Degree - 4 Polynomial", outer = T)
lines(age_grid,preds$fit,lwd=2,col="blue")
matlines(age_grid,se_bands,lwd=1,col="blue",lty=3)
The anova() function makes comparing these nested fits simple.
The following code is slightly different from the book: a helper function is used rather than typing five nearly identical lines.
polynomial_R<-function(degree_i){
  # Fit wage ~ poly(age, degree_i) and store the result as `fit_ <degree_i>`
  # in the caller's environment (note the space introduced by paste())
  assign(paste("fit_",degree_i),lm(wage~poly(age,degree_i)),envir = parent.frame())
}
We can either call the function five times or run a for loop:
for (i in 1:5){
polynomial_R(i)
}
anova(`fit_ 1`,`fit_ 2`,`fit_ 3`,`fit_ 4`,`fit_ 5`)
## Analysis of Variance Table
##
## Model 1: wage ~ poly(age, degree_i)
## Model 2: wage ~ poly(age, degree_i)
## Model 3: wage ~ poly(age, degree_i)
## Model 4: wage ~ poly(age, degree_i)
## Model 5: wage ~ poly(age, degree_i)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
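Because poly() generates orthogonal polynomials, these F-statistics are simply the squares of the t-statistics from a single higher-degree orthogonal fit; a quick check (refitting explicitly, since the object fit above used raw polynomials):
fit_5 = lm(wage ~ poly(age, 5), data = Wage)
coef(summary(fit_5))[, "t value"]^2   # squares match the F column above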
fit<-glm(I(wage>250)~poly(age,4),data=Wage,family = binomial)
preds=predict(fit,newdata = list(age=age_grid),se=T)
pfit=exp(preds$fit)/(1+exp(preds$fit))
se_bands_logit=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
se_bands=exp(se_bands_logit)/(1+exp(se_bands_logit))
preds=predict(fit,newdata=list(age=age_grid),type = "response",se=T)
plot(age,I(wage>250),xlim=agelims,type = "n",ylim=c(0,.2))
points(jitter(age),I((wage>250)/5),cex=0.5,pch="|",col="darkgrey")
lines(age_grid,pfit,lwd=2,col="blue")
matlines(age_grid,se_bands,lwd=1,col="blue",lty=3)
table(cut(age,4))
##
## (17.9,33.5] (33.5,49] (49,64.5] (64.5,80.1]
## 750 1399 779 72
fit=lm(wage~cut(age,4),data=Wage)
coef(summary(fit))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 94.158392 1.476069 63.789970 0.000000e+00
## cut(age, 4)(33.5,49] 24.053491 1.829431 13.148074 1.982315e-38
## cut(age, 4)(49,64.5] 23.664559 2.067958 11.443444 1.040750e-29
## cut(age, 4)(64.5,80.1] 7.640592 4.987424 1.531972 1.256350e-01
fit=lm(wage~bs(age,knots = c(25,40,60)),data=Wage)
pred=predict(fit,newdata = list(age=age_grid),se=T)
{plot(age,wage,col="gray")
lines(age_grid,pred$fit,lwd=2)
lines(age_grid ,pred$fit +2* pred$se ,lty ="dashed")
lines(age_grid ,pred$fit -2* pred$se ,lty ="dashed")
}
Fitting a natural spline
fit2=lm(wage~ns(age,df=4),data=Wage)
summary(fit2)
##
## Call:
## lm(formula = wage ~ ns(age, df = 4), data = Wage)
##
## Residuals:
## Min 1Q Median 3Q Max
## -98.737 -24.477 -5.083 15.371 204.874
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 58.556 4.235 13.827 <2e-16 ***
## ns(age, df = 4)1 60.462 4.190 14.430 <2e-16 ***
## ns(age, df = 4)2 41.963 4.372 9.597 <2e-16 ***
## ns(age, df = 4)3 97.020 10.386 9.341 <2e-16 ***
## ns(age, df = 4)4 9.773 8.657 1.129 0.259
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39.92 on 2995 degrees of freedom
## Multiple R-squared: 0.08598, Adjusted R-squared: 0.08476
## F-statistic: 70.43 on 4 and 2995 DF, p-value: < 2.2e-16
pred=predict(fit2,newdata=list(age=age_grid),se=T)
lines(age_grid,pred$fit,col="red",lwd=2)
Note that the element of newdata must be named (age=age_grid); with an unnamed list, predict() cannot find age in newdata and falls back to the full training data, which is why a vector of length 3000 was being returned.
plot(age,wage,xlim=agelims,cex=0.5,col="darkgrey")
title("Smoothing spline")
fit=smooth.spline(age,wage,df=16)
fit2=smooth.spline(age,wage,cv=TRUE)
## Warning in smooth.spline(age, wage, cv = TRUE): cross-validation with non-
## unique 'x' values seems doubtful
fit2$df
## [1] 6.794596
lines(fit,col="red",lwd=2)
lines(fit2,col="blue",lwd=2)
legend("topright",legend=c("16 DF","6.8 DF"),col=c("red","blue"),lty=1,lwd=2,cex=0.8)
Loess or local regression. I would like to research the difference between local and segmented regression
plot(age,wage,xlim=agelims,cex=0.5,col="darkgrey")
title("Local Regression")
fit=loess(wage~age,span = 0.2,data=Wage)
fit2=loess(wage~age,span = 0.5,data=Wage)
lines(age_grid,predict(fit,data.frame(age=age_grid)),col="red",lwd=2)
lines(age_grid,predict(fit2,data.frame(age=age_grid)),col="blue",lwd=2)
legend("topright",legend=c("Span=0.2","Span=0.5"),col=c("red","blue"),lty=1,lwd=2,cex=0.8)
The gam package must be installed; it is loaded with library("gam") at the top of this document
gam1=lm(wage~ns(year,4)+ns(age,5)+education,data = Wage)
gam_m3=gam(wage~s(year,4)+s(age,5)+education,data = Wage)
par(mfrow=c(1,3))
plot(gam_m3,se=TRUE,col="blue")
plot.Gam(gam1,se=TRUE,col="red")
gam_m1=gam(wage~s(age,5)+education, data=Wage)
gam_m2=gam(wage~year+s(age,5)+education, data=Wage)
anova(gam_m1,gam_m2,gam_m3,test = "F")
## Analysis of Deviance Table
##
## Model 1: wage ~ s(age, 5) + education
## Model 2: wage ~ year + s(age, 5) + education
## Model 3: wage ~ s(year, 4) + s(age, 5) + education
## Resid. Df Resid. Dev Df Deviance F Pr(>F)
## 1 2990 3711731
## 2 2989 3693842 1 17889.2 14.4771 0.0001447 ***
## 3 2986 3689770 3 4071.1 1.0982 0.3485661
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(gam_m3)
##
## Call: gam(formula = wage ~ s(year, 4) + s(age, 5) + education, data = Wage)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -119.43 -19.70 -3.33 14.17 213.48
##
## (Dispersion Parameter for gaussian family taken to be 1235.69)
##
## Null Deviance: 5222086 on 2999 degrees of freedom
## Residual Deviance: 3689770 on 2986 degrees of freedom
## AIC: 29887.75
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## s(year, 4) 1 27162 27162 21.981 2.877e-06 ***
## s(age, 5) 1 195338 195338 158.081 < 2.2e-16 ***
## education 4 1069726 267432 216.423 < 2.2e-16 ***
## Residuals 2986 3689770 1236
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## s(year, 4) 3 1.086 0.3537
## s(age, 5) 4 32.380 <2e-16 ***
## education
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
attach(Carseats)
High=ifelse(Sales<=8,"No","Yes")
Carseats=data.frame(Carseats,High)
Fit a classification tree
tree_carseats=tree(High~.-Sales,Carseats)
summary(tree_carseats)
##
## Classification tree:
## tree(formula = High ~ . - Sales, data = Carseats)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Income" "CompPrice" "Population"
## [6] "Advertising" "Age" "US"
## Number of terminal nodes: 27
## Residual mean deviance: 0.4575 = 170.7 / 373
## Misclassification error rate: 0.09 = 36 / 400
plot(tree_carseats)
text(tree_carseats,pretty=0)
Compute the test error on the dataset
set.seed(2)
train=sample(1:nrow(Carseats),200)
Carseats_test=Carseats[-train,]
High_test=High[-train]
tree_carseats=tree(High~.-Sales,Carseats,subset = train)
tree_pred=predict(tree_carseats,Carseats_test,type = "class")
table(tree_pred,High_test)
## High_test
## tree_pred No Yes
## No 86 27
## Yes 30 57
Prune the tree
set.seed(3)
cv_carseats=cv.tree(tree_carseats,FUN=prune.misclass)
names(cv_carseats)
## [1] "size" "dev" "k" "method"
par(mfrow=c(1,2))
plot(cv_carseats$size,cv_carseats$dev,type="b")
plot(cv_carseats$k,cv_carseats$dev,type="b")
prune_carseats=prune.misclass(tree_carseats,best=9)
plot(prune_carseats)
text(prune_carseats,pretty = 0)
After finding the tree with the best size, we use the predict function to assess its accuracy on the test set
tree_pred=predict(prune_carseats,Carseats_test,type="class")
table(tree_pred,High_test)
## High_test
## tree_pred No Yes
## No 94 24
## Yes 22 60
set.seed(1)
train=sample(1:nrow(Boston),nrow(Boston)/2)
tree_boston=tree(medv~.,Boston,subset = train)
summary(tree_boston)
##
## Regression tree:
## tree(formula = medv ~ ., data = Boston, subset = train)
## Variables actually used in tree construction:
## [1] "lstat" "rm" "dis"
## Number of terminal nodes: 8
## Residual mean deviance: 12.65 = 3099 / 245
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -14.10000 -2.04200 -0.05357 0.00000 1.96000 12.60000
Note: in the context of a regression tree, the deviance is simply the sum of squared errors for the tree
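A quick check: the training RSS of the fitted tree should reproduce the reported total deviance (about 3099):
sum((predict(tree_boston) - Boston$medv[train])^2)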
plot(tree_boston)
text(tree_boston,pretty = 0)
Cross-Validation
cv_boston<-cv.tree(tree_boston)
plot(cv_boston$size,cv_boston$dev,type='b')
Prune tree
prune_boston<-prune.tree(tree_boston,best=5)
plot(prune_boston)
text(prune_boston,pretty=0)
Predictions on the test set
yhat=predict(tree_boston,newdata = Boston[-train,])
boston_test=Boston[-train,"medv"]
plot(yhat,boston_test)
abline(0,1)
mean((yhat-boston_test)^2)
## [1] 25.04559
set.seed(1)
bag_boston=randomForest(medv~.,data = Boston,subset = train,mtry=13,importance=TRUE)
Bagging Tree prediction on test set
yhat_bag=predict(bag_boston,newdata=Boston[-train,])
plot(yhat_bag,boston_test)
abline(0,1)
mean((yhat_bag-boston_test)^2)
## [1] 13.50808
The book notes that randomForest() uses p/3 variables by default when building regression trees and sqrt(p) variables for classification trees
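For the Boston data, with 13 predictors, those defaults work out as follows (a small sketch of the rule, not a randomForest call):
max(floor(13/3), 1)   # default mtry for a regression forest: 4
floor(sqrt(13))       # default mtry for a classification forest: 3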
set.seed(1)
rf_boston=randomForest(medv~.,data = Boston,subset = train,mtry=6,importance=TRUE)
importance(rf_boston)
## %IncMSE IncNodePurity
## crim 12.132320 986.50338
## zn 1.955579 57.96945
## indus 9.069302 882.78261
## chas 2.210835 45.22941
## nox 11.104823 1044.33776
## rm 31.784033 6359.31971
## age 10.962684 516.82969
## dis 15.015236 1224.11605
## rad 4.118011 95.94586
## tax 8.587932 502.96719
## ptratio 12.503896 830.77523
## black 6.702609 341.30361
## lstat 30.695224 7505.73936
varImpPlot(rf_boston)
set.seed(1)
boost_boston=gbm(medv~.,data=Boston[train,],distribution = "gaussian",n.trees = 5000,interaction.depth = 4)
summary(boost_boston)
## var rel.inf
## lstat lstat 37.0661275
## rm rm 25.3533123
## dis dis 11.7903016
## crim crim 8.0388750
## black black 4.2531659
## nox nox 3.5058570
## age age 3.4868724
## ptratio ptratio 2.2500385
## indus indus 1.7725070
## tax tax 1.1836592
## chas chas 0.7441319
## rad rad 0.4274311
## zn zn 0.1277206
par(mfrow=c(1,2))
plot(boost_boston,i="rm")
plot(boost_boston,i="lstat")
boost_boston=gbm(medv~.,data=Boston[train,],n.trees = 5000,interaction.depth = 4,shrinkage = 0.2,verbose = F)
## Distribution not specified, assuming gaussian ...
yhat_boost=predict(boost_boston,newdata=Boston[-train,],n.trees = 5000)
mean((yhat_boost-boston_test)^2)
## [1] 11.51109
set.seed(1)
x=matrix(rnorm(20*2),ncol=2)
y=c(rep(-1,10),rep(1,10))
x[y==1,]=x[y==1,]+1
plot(x,col=(3-y))
dat<-data.frame(x=x,y=as.factor(y))
svmfit=svm(y~.,data=dat,kernel="linear",cost=10,scale=FALSE)
plot(svmfit,dat)
The plot method for svm objects draws the decision boundary, shades the two class regions, and marks the support vectors (crosses) and the remaining observations (circles)
summary(svmfit)
##
## Call:
## svm(formula = y ~ ., data = dat, kernel = "linear", cost = 10,
## scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 10
## gamma: 0.5
##
## Number of Support Vectors: 7
##
## ( 4 3 )
##
##
## Number of Classes: 2
##
## Levels:
## -1 1
svmfit<-svm(y~.,data=dat,kernel="linear",cost=0.01,scale=FALSE)
plot(svmfit,dat)
Cross-validation: the tune() function performs ten-fold cross-validation over the supplied range of cost values
set.seed(1)
tune_out<-tune(svm,y~.,data=dat,kernel="linear",ranges=list(cost=c(0.001,0.01,0.1,1,5,10,100)))
summary(tune_out)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 0.1
##
## - best performance: 0.1
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-03 0.70 0.4216370
## 2 1e-02 0.70 0.4216370
## 3 1e-01 0.10 0.2108185
## 4 1e+00 0.15 0.2415229
## 5 5e+00 0.15 0.2415229
## 6 1e+01 0.15 0.2415229
## 7 1e+02 0.15 0.2415229
bestmod=tune_out$best.model
Generating the test data
xtest=matrix(rnorm(20*2),ncol=2)
ytest=sample(c(-1,1),20,rep=TRUE)
xtest[ytest==1,]=xtest[ytest==1,]+1
testdat=data.frame(x=xtest,y=as.factor(ytest))
Predictions on test data
ypred=predict(bestmod,testdat)
table(predict=ypred,truth=testdat$y)
## truth
## predict -1 1
## -1 11 1
## 1 0 8
Again separating the two classes
x[y==1,]=x[y==1,]+0.5
plot(x,col=(y+5)/2,pch=19)
dat=data.frame(x=x,y=as.factor(y))
svmfit=svm(y~.,data=dat,kernel="linear",cost=1e5)
summary(svmfit)
##
## Call:
## svm(formula = y ~ ., data = dat, kernel = "linear", cost = 1e+05)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1e+05
## gamma: 0.5
##
## Number of Support Vectors: 3
##
## ( 1 2 )
##
##
## Number of Classes: 2
##
## Levels:
## -1 1
plot(svmfit,dat)
The smaller the cost, the more margin violations we allow, and hence the wider the margin and the more support vectors; the very large cost above produced a narrow margin with only three support vectors
svmfit=svm(y~.,data=dat,kernel="linear",cost=1)
summary(svmfit)
##
## Call:
## svm(formula = y ~ ., data = dat, kernel = "linear", cost = 1)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
## gamma: 0.5
##
## Number of Support Vectors: 7
##
## ( 4 3 )
##
##
## Number of Classes: 2
##
## Levels:
## -1 1
plot(svmfit,dat)
set.seed(1)
x=matrix(rnorm(200*2),ncol=2)
x[1:100,]=x[1:100,]+2
x[101:150,]=x[101:150,]-2
y=c(rep(1,150),rep(2,50))
dat=data.frame(x=x,y=as.factor(y))
plot(x,col=y)
train=sample(200,100)
svmfit=svm(y~.,data=dat[train,],kernel="radial",gamma=1,cost=1)
plot(svmfit,dat[train,])
summary(svmfit)
##
## Call:
## svm(formula = y ~ ., data = dat[train, ], kernel = "radial",
## gamma = 1, cost = 1)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
## gamma: 1
##
## Number of Support Vectors: 41
##
## ( 20 21 )
##
##
## Number of Classes: 2
##
## Levels:
## 1 2
Increasing the value of cost term
svmfit<-svm(y~.,data=dat[train,],kernel="radial",gamma=1,cost=1e5)
plot(svmfit,dat[train,])
Tuning over a grid of cost and gamma values
set.seed(1)
tune_out=tune(svm,y~.,data=dat[train,],kernel="radial",ranges = list(cost=c(0.1,1,10,100,1000),gamma=c(0.5,1,2,3,4)))
summary(tune_out)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 1000 2
##
## - best performance: 0.05
##
## - Detailed performance results:
## cost gamma error dispersion
## 1 1e-01 0.5 0.27 0.11595018
## 2 1e+00 0.5 0.11 0.05676462
## 3 1e+01 0.5 0.13 0.04830459
## 4 1e+02 0.5 0.12 0.12292726
## 5 1e+03 0.5 0.09 0.07378648
## 6 1e-01 1.0 0.27 0.11595018
## 7 1e+00 1.0 0.13 0.04830459
## 8 1e+01 1.0 0.15 0.10801234
## 9 1e+02 1.0 0.10 0.09428090
## 10 1e+03 1.0 0.09 0.05676462
## 11 1e-01 2.0 0.27 0.11595018
## 12 1e+00 2.0 0.16 0.06992059
## 13 1e+01 2.0 0.16 0.09660918
## 14 1e+02 2.0 0.12 0.07888106
## 15 1e+03 2.0 0.05 0.05270463
## 16 1e-01 3.0 0.27 0.11595018
## 17 1e+00 3.0 0.16 0.06992059
## 18 1e+01 3.0 0.14 0.06992059
## 19 1e+02 3.0 0.12 0.07888106
## 20 1e+03 3.0 0.07 0.06749486
## 21 1e-01 4.0 0.27 0.11595018
## 22 1e+00 4.0 0.16 0.09660918
## 23 1e+01 4.0 0.14 0.06992059
## 24 1e+02 4.0 0.10 0.08164966
## 25 1e+03 4.0 0.10 0.08164966
table(true=dat[-train,"y"],pred=predict(tune_out$best.model,newdata=dat[-train,]))
## pred
## true 1 2
## 1 55 22
## 2 19 4
rocplot=function(pred,truth,...)
{
predob=prediction(pred,truth)
perf=performance(predob,"tpr","fpr")
plot(perf,...)
}
svmfit_opt=svm(y~.,data = dat[train,],kernel="radial",gamma=2,cost=1,decision.values=T)
fitted=attributes(predict(svmfit_opt,dat[train,],decision.values=TRUE))$decision.values
par(mfrow=c(1,2))
rocplot(fitted,dat[train,"y"],main="Training Data")
svmfit_flex=svm(y~.,data=dat[train,],kernel="radial",gamma=50,cost=1,decision.values=T)
fitted=attributes(predict(svmfit_flex,dat[train,],decision.values=TRUE))$decision.values
rocplot(fitted,dat[train,"y"],add=T,col="red")
Predictions on the test set
par(mfrow=c(1,2))
fitted=attributes(predict(svmfit_opt,dat[-train,],decision.values=TRUE))$decision.values
rocplot(fitted,dat[-train,"y"],main="Test Data")
fitted=attributes(predict(svmfit_flex,dat[-train,],decision.values=TRUE))$decision.values
rocplot(fitted,dat[-train,"y"],add=T,col="red")
svm() handles more than two classes using the one-versus-one approach
Generating the data
set.seed(1)
x=rbind(x,matrix(rnorm(50*2),ncol=2))
y=c(y,rep(0,50))
x[y==0,2]=x[y==0,2]+2
dat=data.frame(x=x,y=as.factor(y))
par(mfrow=c(1,1))
plot(x,col=(y+1))
svmfit=svm(y~.,data=dat,kernel="radial",cost=10,gamma=1)
plot(svmfit,dat)
Application to the Gene expression data
names(Khan)
## [1] "xtrain" "xtest" "ytrain" "ytest"
states=row.names(USArrests)
states
## [1] "Alabama" "Alaska" "Arizona" "Arkansas"
## [5] "California" "Colorado" "Connecticut" "Delaware"
## [9] "Florida" "Georgia" "Hawaii" "Idaho"
## [13] "Illinois" "Indiana" "Iowa" "Kansas"
## [17] "Kentucky" "Louisiana" "Maine" "Maryland"
## [21] "Massachusetts" "Michigan" "Minnesota" "Mississippi"
## [25] "Missouri" "Montana" "Nebraska" "Nevada"
## [29] "New Hampshire" "New Jersey" "New Mexico" "New York"
## [33] "North Carolina" "North Dakota" "Ohio" "Oklahoma"
## [37] "Oregon" "Pennsylvania" "Rhode Island" "South Carolina"
## [41] "South Dakota" "Tennessee" "Texas" "Utah"
## [45] "Vermont" "Virginia" "Washington" "West Virginia"
## [49] "Wisconsin" "Wyoming"
names(USArrests)
## [1] "Murder" "Assault" "UrbanPop" "Rape"
apply(USArrests, 2, mean)
## Murder Assault UrbanPop Rape
## 7.788 170.760 65.540 21.232
pr_out<-prcomp(USArrests,scale. = TRUE)
names(pr_out)
## [1] "sdev" "rotation" "center" "scale" "x"
pr_out$center
## Murder Assault UrbanPop Rape
## 7.788 170.760 65.540 21.232
pr_out$scale
## Murder Assault UrbanPop Rape
## 4.355510 83.337661 14.474763 9.366385
pr_out$rotation
## PC1 PC2 PC3 PC4
## Murder -0.5358995 0.4181809 -0.3412327 0.64922780
## Assault -0.5831836 0.1879856 -0.2681484 -0.74340748
## UrbanPop -0.2781909 -0.8728062 -0.3780158 0.13387773
## Rape -0.5434321 -0.1673186 0.8177779 0.08902432
pr_out$x
## PC1 PC2 PC3 PC4
## Alabama -0.97566045 1.12200121 -0.43980366 0.154696581
## Alaska -1.93053788 1.06242692 2.01950027 -0.434175454
## Arizona -1.74544285 -0.73845954 0.05423025 -0.826264240
## Arkansas 0.13999894 1.10854226 0.11342217 -0.180973554
## California -2.49861285 -1.52742672 0.59254100 -0.338559240
## Colorado -1.49934074 -0.97762966 1.08400162 0.001450164
## Connecticut 1.34499236 -1.07798362 -0.63679250 -0.117278736
## Delaware -0.04722981 -0.32208890 -0.71141032 -0.873113315
## Florida -2.98275967 0.03883425 -0.57103206 -0.095317042
## Georgia -1.62280742 1.26608838 -0.33901818 1.065974459
## Hawaii 0.90348448 -1.55467609 0.05027151 0.893733198
## Idaho 1.62331903 0.20885253 0.25719021 -0.494087852
## Illinois -1.36505197 -0.67498834 -0.67068647 -0.120794916
## Indiana 0.50038122 -0.15003926 0.22576277 0.420397595
## Iowa 2.23099579 -0.10300828 0.16291036 0.017379470
## Kansas 0.78887206 -0.26744941 0.02529648 0.204421034
## Kentucky 0.74331256 0.94880748 -0.02808429 0.663817237
## Louisiana -1.54909076 0.86230011 -0.77560598 0.450157791
## Maine 2.37274014 0.37260865 -0.06502225 -0.327138529
## Maryland -1.74564663 0.42335704 -0.15566968 -0.553450589
## Massachusetts 0.48128007 -1.45967706 -0.60337172 -0.177793902
## Michigan -2.08725025 -0.15383500 0.38100046 0.101343128
## Minnesota 1.67566951 -0.62590670 0.15153200 0.066640316
## Mississippi -0.98647919 2.36973712 -0.73336290 0.213342049
## Missouri -0.68978426 -0.26070794 0.37365033 0.223554811
## Montana 1.17353751 0.53147851 0.24440796 0.122498555
## Nebraska 1.25291625 -0.19200440 0.17380930 0.015733156
## Nevada -2.84550542 -0.76780502 1.15168793 0.311354436
## New Hampshire 2.35995585 -0.01790055 0.03648498 -0.032804291
## New Jersey -0.17974128 -1.43493745 -0.75677041 0.240936580
## New Mexico -1.96012351 0.14141308 0.18184598 -0.336121113
## New York -1.66566662 -0.81491072 -0.63661186 -0.013348844
## North Carolina -1.11208808 2.20561081 -0.85489245 -0.944789648
## North Dakota 2.96215223 0.59309738 0.29824930 -0.251434626
## Ohio 0.22369436 -0.73477837 -0.03082616 0.469152817
## Oklahoma 0.30864928 -0.28496113 -0.01515592 0.010228476
## Oregon -0.05852787 -0.53596999 0.93038718 -0.235390872
## Pennsylvania 0.87948680 -0.56536050 -0.39660218 0.355452378
## Rhode Island 0.85509072 -1.47698328 -1.35617705 -0.607402746
## South Carolina -1.30744986 1.91397297 -0.29751723 -0.130145378
## South Dakota 1.96779669 0.81506822 0.38538073 -0.108470512
## Tennessee -0.98969377 0.85160534 0.18619262 0.646302674
## Texas -1.34151838 -0.40833518 -0.48712332 0.636731051
## Utah 0.54503180 -1.45671524 0.29077592 -0.081486749
## Vermont 2.77325613 1.38819435 0.83280797 -0.143433697
## Virginia 0.09536670 0.19772785 0.01159482 0.209246429
## Washington 0.21472339 -0.96037394 0.61859067 -0.218628161
## West Virginia 2.08739306 1.41052627 0.10372163 0.130583080
## Wisconsin 2.05881199 -0.60512507 -0.13746933 0.182253407
## Wyoming 0.62310061 0.31778662 -0.23824049 -0.164976866
The x matrix contains the principal component scores for each state; the loadings themselves are stored in the rotation matrix.
biplot(pr_out,scale=0)
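The loadings are only defined up to a sign flip, so the mirror-image biplot is equally valid. A small sketch (pr_out2 is my own copy, so the original fit is left untouched):
# flipping the signs of the scores and loadings gives the mirror-image biplot
pr_out2=pr_out
pr_out2$rotation=-pr_out2$rotation
pr_out2$x=-pr_out2$x
biplot(pr_out2,scale=0)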
Next we calculate the proportion of variance explained by each principal component
pr_out$sdev
## [1] 1.5748783 0.9948694 0.5971291 0.4164494
pr_var=pr_out$sdev^2
pr_var
## [1] 2.4802416 0.9897652 0.3565632 0.1734301
pve=pr_var/sum(pr_var)
plot(pve,xlab="Principal Component",ylab="Proportion of Variance Explained",ylim=c(0,1),type="b")
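A cumulative version of the same plot is often easier to read:
# cumulative proportion of variance explained
plot(cumsum(pve),xlab="Principal Component",ylab="Cumulative Proportion of Variance Explained",ylim=c(0,1),type="b")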
Generating the data
set.seed(2)
x=matrix(rnorm(50*2),ncol=2)
x[1:25,1]=x[1:25,1]+3
x[1:25,2]=x[1:25,2]-4
K-means clustering with K=2
km_out=kmeans(x,2,nstart=20)
plot(x,col=(km_out$cluster+1),main="K-Means Clustering with K=2",pch=20,cex=2)
We generated the data, so we know there are two groups. Let us see what would have happened if we had started with K=3 groups
set.seed(4)
km_out=kmeans(x,3,nstart=20)
km_out
## K-means clustering with 3 clusters of sizes 10, 23, 17
##
## Cluster means:
## [,1] [,2]
## 1 2.3001545 -2.69622023
## 2 -0.3820397 -0.08740753
## 3 3.7789567 -4.56200798
##
## Clustering vector:
## [1] 3 1 3 1 3 3 3 1 3 1 3 1 3 1 3 1 3 3 3 3 3 1 3 3 3 2 2 2 2 2 2 2 2 2 2
## [36] 2 2 2 2 2 2 2 2 1 2 1 2 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 19.56137 52.67700 25.74089
## (between_SS / total_SS = 79.3 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
Hierarchical clustering
hc_average<-hclust(dist(x),method = 'average')
hc_single<-hclust(dist(x),method = 'single')
hc_complete<-hclust(dist(x),method = 'complete')
par(mfrow=c(1,3))
plot(hc_complete,cex=0.9)
plot(hc_average,cex=0.9)
plot(hc_single,cex=0.9)
Cutting the dendrograms to obtain cluster labels
cutree(hc_complete,2)
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2
## [36] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
cutree(hc_average,2)
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 1 2 2
## [36] 2 2 2 2 2 2 2 2 1 2 1 2 2 2 2
cutree(hc_single,2)
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
cutree(hc_single,4)
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3
## [36] 3 3 3 3 3 3 4 3 3 3 3 3 3 3 3
xsc=scale(x)
plot(hclust(dist(xsc)))
Besides Euclidean distance, we can also cluster using a correlation-based distance
x=matrix(rnorm(30*3),ncol=3)
dd=as.dist(1-cor(t(x)))
plot(hclust(dd,method='complete'),main="Complete Linkage with Correlation-Based Distance",xlab="",ylab="")
nci_labs=NCI60$labs
nci_data=NCI60$data
The data has 64 rows (cell lines) and 6,830 columns (gene expression measurements), so the number of features far exceeds the number of observations: this is a p >> n problem
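A quick check of the dimensions and the cancer-type labels:
dim(nci_data)
table(nci_labs)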
PCA on the NCI60 data
pr_out=prcomp(nci_data,scale=TRUE)
Cols=function(vec)
{
cols=rainbow(length(unique(vec)))
return(cols[as.numeric(as.factor(vec))])
}
Plotting the observations on the first few principal components: Z1 versus Z2, and Z1 versus Z3
par(mfrow=c(1,2))
plot(pr_out$x[,1:2],col=Cols(nci_labs),pch=19,xlab="Z1",ylab="Z2")
plot(pr_out$x[,c(1,3)],col=Cols(nci_labs),pch=19,xlab="Z1",ylab="Z3")
Cell lines from the same cancer type tend to have fairly similar gene expression levels.
summary(pr_out)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 27.8535 21.48136 19.82046 17.03256 15.97181
## Proportion of Variance 0.1136 0.06756 0.05752 0.04248 0.03735
## Cumulative Proportion 0.1136 0.18115 0.23867 0.28115 0.31850
## PC6 PC7 PC8 PC9 PC10
## Standard deviation 15.72108 14.47145 13.54427 13.14400 12.73860
## Proportion of Variance 0.03619 0.03066 0.02686 0.02529 0.02376
## Cumulative Proportion 0.35468 0.38534 0.41220 0.43750 0.46126
## PC11 PC12 PC13 PC14 PC15
## Standard deviation 12.68672 12.15769 11.83019 11.62554 11.43779
## Proportion of Variance 0.02357 0.02164 0.02049 0.01979 0.01915
## Cumulative Proportion 0.48482 0.50646 0.52695 0.54674 0.56590
## PC16 PC17 PC18 PC19 PC20
## Standard deviation 11.00051 10.65666 10.48880 10.43518 10.3219
## Proportion of Variance 0.01772 0.01663 0.01611 0.01594 0.0156
## Cumulative Proportion 0.58361 0.60024 0.61635 0.63229 0.6479
## PC21 PC22 PC23 PC24 PC25 PC26
## Standard deviation 10.14608 10.0544 9.90265 9.64766 9.50764 9.33253
## Proportion of Variance 0.01507 0.0148 0.01436 0.01363 0.01324 0.01275
## Cumulative Proportion 0.66296 0.6778 0.69212 0.70575 0.71899 0.73174
## PC27 PC28 PC29 PC30 PC31 PC32
## Standard deviation 9.27320 9.0900 8.98117 8.75003 8.59962 8.44738
## Proportion of Variance 0.01259 0.0121 0.01181 0.01121 0.01083 0.01045
## Cumulative Proportion 0.74433 0.7564 0.76824 0.77945 0.79027 0.80072
## PC33 PC34 PC35 PC36 PC37 PC38
## Standard deviation 8.37305 8.21579 8.15731 7.97465 7.90446 7.82127
## Proportion of Variance 0.01026 0.00988 0.00974 0.00931 0.00915 0.00896
## Cumulative Proportion 0.81099 0.82087 0.83061 0.83992 0.84907 0.85803
## PC39 PC40 PC41 PC42 PC43 PC44
## Standard deviation 7.72156 7.58603 7.45619 7.3444 7.10449 7.0131
## Proportion of Variance 0.00873 0.00843 0.00814 0.0079 0.00739 0.0072
## Cumulative Proportion 0.86676 0.87518 0.88332 0.8912 0.89861 0.9058
## PC45 PC46 PC47 PC48 PC49 PC50
## Standard deviation 6.95839 6.8663 6.80744 6.64763 6.61607 6.40793
## Proportion of Variance 0.00709 0.0069 0.00678 0.00647 0.00641 0.00601
## Cumulative Proportion 0.91290 0.9198 0.92659 0.93306 0.93947 0.94548
## PC51 PC52 PC53 PC54 PC55 PC56
## Standard deviation 6.21984 6.20326 6.06706 5.91805 5.91233 5.73539
## Proportion of Variance 0.00566 0.00563 0.00539 0.00513 0.00512 0.00482
## Cumulative Proportion 0.95114 0.95678 0.96216 0.96729 0.97241 0.97723
## PC57 PC58 PC59 PC60 PC61 PC62
## Standard deviation 5.47261 5.2921 5.02117 4.68398 4.17567 4.08212
## Proportion of Variance 0.00438 0.0041 0.00369 0.00321 0.00255 0.00244
## Cumulative Proportion 0.98161 0.9857 0.98940 0.99262 0.99517 0.99761
## PC63 PC64
## Standard deviation 4.04124 2.148e-14
## Proportion of Variance 0.00239 0.000e+00
## Cumulative Proportion 1.00000 1.000e+00
plot(pr_out)
Scree plot: pve stands for the proportion of variance explained by each principal component
pve=100*pr_out$sdev^2/sum(pr_out$sdev^2)
par(mfrow=c(1,2))
plot(pve,type="o",ylab="PVE",xlab="Principal Component",col="blue")
plot(cumsum(pve),type="o",ylab="Cumulative PVE",xlab="Principal Component",col="brown3")
sd_data=scale(nci_data)
par(mfrow=c(1,3))
data_dist=dist(sd_data)
plot(hclust(data_dist),labels = nci_labs,main = "Complete Linkage",xlab = "",sub="",ylab="")
plot(hclust(data_dist,method="average"),labels = nci_labs,main = "Average Linkage",xlab = "",sub="",ylab="")
plot(hclust(data_dist,method="single"),labels = nci_labs,main = "Single Linkage",xlab = "",sub="",ylab="")
hc_out=hclust(dist(sd_data))
hc_clusters=cutree(hc_out,4)
table(hc_clusters,nci_labs)
## nci_labs
## hc_clusters BREAST CNS COLON K562A-repro K562B-repro LEUKEMIA MCF7A-repro
## 1 2 3 2 0 0 0 0
## 2 3 2 0 0 0 0 0
## 3 0 0 0 1 1 6 0
## 4 2 0 5 0 0 0 1
## nci_labs
## hc_clusters MCF7D-repro MELANOMA NSCLC OVARIAN PROSTATE RENAL UNKNOWN
## 1 0 8 8 6 2 8 1
## 2 0 0 1 0 0 1 0
## 3 0 0 0 0 0 0 0
## 4 1 0 0 0 0 0 0
par(mfrow=c(1,1))
plot(hc_out,labels = nci_labs)
abline(h=139,col="red")
Cutting the dendrogram at height 139 gives four clusters. The book specifically mentions that performing K-means clustering with K=4 is not the same as cutting a dendrogram at a chosen height; the resulting clusters can differ.
set.seed(2)
km_out=kmeans(sd_data,4,nstart=20)
km_clusters=km_out$cluster
table(km_clusters,hc_clusters)
## hc_clusters
## km_clusters 1 2 3 4
## 1 11 0 0 9
## 2 0 0 8 0
## 3 9 0 0 0
## 4 20 7 0 0
Another important point from the book is that we can denoise the data by clustering on only the first few principal component score vectors
hc_out=hclust(dist(pr_out$x[,1:5]))
plot(hc_out,labels=nci_labs,main="Hier. Clust. on First Five score vectors")
table(cutree(hc_out,4),nci_labs)
## nci_labs
## BREAST CNS COLON K562A-repro K562B-repro LEUKEMIA MCF7A-repro
## 1 0 2 7 0 0 2 0
## 2 5 3 0 0 0 0 0
## 3 0 0 0 1 1 4 0
## 4 2 0 0 0 0 0 1
## nci_labs
## MCF7D-repro MELANOMA NSCLC OVARIAN PROSTATE RENAL UNKNOWN
## 1 0 1 8 5 2 7 0
## 2 0 7 1 1 0 2 1
## 3 0 0 0 0 0 0 0
## 4 1 0 0 0 0 0 0
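By the same logic we could run K-means on the first few score vectors instead of the full expression matrix. A minimal sketch (km_pc is my own name), compared against the hierarchical clusters obtained above:
# hedged sketch: K-means on the first five principal component score vectors
set.seed(2)
km_pc=kmeans(pr_out$x[,1:5],4,nstart=20)
table(km_pc$cluster,cutree(hc_out,4))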