Robbie Beane
set.seed(1)
v0 <- rnorm(n=50, 0, 1.2)
v1 <- rnorm(n=50, 0, 0.4)
df1 <- data.frame(x1 = v0 + v1 + 5, x2 = 0.8*v0 - v1 + 4)
mean1 = mean(df1$x1)
mean2 = mean(df1$x2)
plot(df1, xlim=c(2,8), ylim=c(1,7), pch=21, bg='cornflowerblue', col='black', cex=1.2)
points(mean1, mean2, pch=21, bg='orange', col='black', cex=2)## PC1 PC2
## x1 0.7835622 0.6213133
## x2 0.6213133 -0.7835622
pc1 <- pca1$rotation[,1]
pc2 <- pca1$rotation[,2]
plot(df1, xlim=c(2,8), ylim=c(1,7), pch=21, bg='cornflowerblue', col='black', cex=1.2)
Arrows(mean1, mean2, mean1 + pc1[1], mean2 + pc1[2], lwd=3, arr.type='triangle')
Arrows(mean1, mean2, mean1 + pc2[1], mean2 + pc2[2], lwd=3, arr.type='triangle')
points(mean1, mean2, pch=21, bg='orange', col='black', cex=2)## PC1 PC2
## [1,] -1.09883375 0.1626135
## [2,] 0.08051408 -0.4104127
## [3,] -1.42397846 0.1319798
## [4,] 2.21625250 -0.7105089
## [5,] 0.43739205 0.7378345
## [6,] -1.29429260 1.0530729
## [7,] 0.56324011 -0.2748628
## [8,] 0.90486898 -0.6569212
## [9,] 0.75982131 0.2510644
## [10,] -0.64004302 -0.1391293
pc1 <- pca1$rotation[,1]
pc2 <- pca1$rotation[,2]
par(mfrow=c(1,2))
plot(df1, xlim=c(2,8), ylim=c(1,7), pch=21, bg='cornflowerblue', col='black', cex=1.2,
main="Original Features")
Arrows(mean1, mean2, mean1 + pc1[1], mean2 + pc1[2], lwd=5, arr.type='triangle')
Arrows(mean1, mean2, mean1 + pc1[1], mean2 + pc1[2], lwd=2, arr.type='triangle', col='plum')
Arrows(mean1, mean2, mean1 + pc2[1], mean2 + pc2[2], lwd=5, arr.type='triangle')
Arrows(mean1, mean2, mean1 + pc2[1], mean2 + pc2[2], lwd=2, arr.type='triangle', col='cyan')
points(mean1, mean2, pch=21, bg='orange', col='black', cex=2)
plot(pca1$x, xlim=c(-4,4), ylim=c(-4,4), pch=21, bg='cornflowerblue', col='black', cex=1.2,
xlab='z1', ylab='z2', main='Transformed Features')
Arrows(0, 0, 1, 0, lwd=5, arr.type='triangle')
Arrows(0, 0, 1, 0, lwd=2, arr.type='triangle', col='plum')
Arrows(0, 0, 0, 1, lwd=5, arr.type='triangle')
Arrows(0, 0, 0, 1, lwd=2, arr.type='triangle', col='cyan')
points(0, 0, pch=21, bg='orange', col='black', cex=2)std <- pca1$sdev
var <- pca1$sdev^2
var_ex <- var / sum(var)
cat('Std Dev of New Features: ', round(std, 5),
'\nVariances of New Features: ', round(var, 5),
'\nProp of Variance Explained:', round(var_ex, 5), sep=' ')## Std Dev of New Features: 1.27672 0.54468
## Variances of New Features: 1.63002 0.29667
## Prop of Variance Explained: 0.84602 0.15398
wc <- read.table("data/wine_cultivars.txt", header = TRUE, sep="\t")
wc$Cultivar <- factor(wc$Cultivar)
summary(wc)## Cultivar Alcohol Malic_Acid Ash Alcalinity
## 1:59 Min. :11.03 Min. :0.740 Min. :1.360 Min. :10.60
## 2:71 1st Qu.:12.36 1st Qu.:1.603 1st Qu.:2.210 1st Qu.:17.20
## 3:48 Median :13.05 Median :1.865 Median :2.360 Median :19.50
## Mean :13.00 Mean :2.336 Mean :2.367 Mean :19.49
## 3rd Qu.:13.68 3rd Qu.:3.083 3rd Qu.:2.558 3rd Qu.:21.50
## Max. :14.83 Max. :5.800 Max. :3.230 Max. :30.00
## Magnesium Total_Phenols Flavanoids NonFlavanoid_Phenols
## Min. : 70.00 Min. :0.980 Min. :0.340 Min. :0.1300
## 1st Qu.: 88.00 1st Qu.:1.742 1st Qu.:1.205 1st Qu.:0.2700
## Median : 98.00 Median :2.355 Median :2.135 Median :0.3400
## Mean : 99.74 Mean :2.295 Mean :2.029 Mean :0.3619
## 3rd Qu.:107.00 3rd Qu.:2.800 3rd Qu.:2.875 3rd Qu.:0.4375
## Max. :162.00 Max. :3.880 Max. :5.080 Max. :0.6600
## Proanthocyanins Color Hue OD280_OD315
## Min. :0.410 Min. : 1.280 Min. :0.4800 Min. :1.270
## 1st Qu.:1.250 1st Qu.: 3.220 1st Qu.:0.7825 1st Qu.:1.938
## Median :1.555 Median : 4.690 Median :0.9650 Median :2.780
## Mean :1.591 Mean : 5.058 Mean :0.9574 Mean :2.612
## 3rd Qu.:1.950 3rd Qu.: 6.200 3rd Qu.:1.1200 3rd Qu.:3.170
## Max. :3.580 Max. :13.000 Max. :1.7100 Max. :4.000
## Proline
## Min. : 278.0
## 1st Qu.: 500.5
## Median : 673.5
## Mean : 746.9
## 3rd Qu.: 985.0
## Max. :1680.0
## PC1 PC2 PC3 PC4
## Alcohol -0.144329395 0.483651548 -0.20738262 0.01785630
## Malic_Acid 0.245187580 0.224930935 0.08901289 -0.53689028
## Ash 0.002051061 0.316068814 0.62622390 0.21417556
## Alcalinity 0.239320405 -0.010590502 0.61208035 -0.06085941
## Magnesium -0.141992042 0.299634003 0.13075693 0.35179658
## Total_Phenols -0.394660845 0.065039512 0.14617896 -0.19806835
## Flavanoids -0.422934297 -0.003359812 0.15068190 -0.15229479
## NonFlavanoid_Phenols 0.298533103 0.028779488 0.17036816 0.20330102
## Proanthocyanins -0.313429488 0.039301722 0.14945431 -0.39905653
## Color 0.088616705 0.529995672 -0.13730621 -0.06592568
## Hue -0.296714564 -0.279235148 0.08522192 0.42777141
## OD280_OD315 -0.376167411 -0.164496193 0.16600459 -0.18412074
## Proline -0.286752227 0.364902832 -0.12674592 0.23207086
## PC5 PC6 PC7 PC8
## Alcohol -0.26566365 0.21353865 -0.05639636 0.39613926
## Malic_Acid 0.03521363 0.53681385 0.42052391 0.06582674
## Ash -0.14302547 0.15447466 -0.14917061 -0.17026002
## Alcalinity 0.06610294 -0.10082451 -0.28696914 0.42797018
## Magnesium 0.72704851 0.03814394 0.32288330 -0.15636143
## Total_Phenols -0.14931841 -0.08412230 -0.02792498 -0.40593409
## Flavanoids -0.10902584 -0.01892002 -0.06068521 -0.18724536
## NonFlavanoid_Phenols -0.50070298 -0.25859401 0.59544729 -0.23328465
## Proanthocyanins 0.13685982 -0.53379539 0.37213935 0.36822675
## Color -0.07643678 -0.41864414 -0.22771214 -0.03379692
## Hue -0.17361452 0.10598274 0.23207564 0.43662362
## OD280_OD315 -0.10116099 0.26585107 -0.04476370 -0.07810789
## Proline -0.15786880 0.11972557 0.07680450 0.12002267
## PC9 PC10 PC11 PC12
## Alcohol -0.50861912 0.21160473 0.22591696 -0.26628645
## Malic_Acid 0.07528304 -0.30907994 -0.07648554 0.12169604
## Ash 0.30769445 -0.02712539 0.49869142 -0.04962237
## Alcalinity -0.20044931 0.05279942 -0.47931378 -0.05574287
## Magnesium -0.27140257 0.06787022 -0.07128891 0.06222011
## Total_Phenols -0.28603452 -0.32013135 -0.30434119 -0.30388245
## Flavanoids -0.04957849 -0.16315051 0.02569409 -0.04289883
## NonFlavanoid_Phenols -0.19550132 0.21553507 -0.11689586 0.04235219
## Proanthocyanins 0.20914487 0.13418390 0.23736257 -0.09555303
## Color -0.05621752 -0.29077518 -0.03183880 0.60422163
## Hue -0.08582839 -0.52239889 0.04821201 0.25921400
## OD280_OD315 -0.13722690 0.52370587 -0.04642330 0.60095872
## Proline 0.57578611 0.16211600 -0.53926983 -0.07940162
## PC13
## Alcohol 0.01496997
## Malic_Acid 0.02596375
## Ash -0.14121803
## Alcalinity 0.09168285
## Magnesium 0.05677422
## Total_Phenols -0.46390791
## Flavanoids 0.83225706
## NonFlavanoid_Phenols 0.11403985
## Proanthocyanins -0.11691707
## Color -0.01199280
## Hue -0.08988884
## OD280_OD315 -0.15671813
## Proline 0.01444734
## [1] 0.361988481 0.192074903 0.111236305 0.070690302 0.065632937
## [6] 0.049358233 0.042386793 0.026807489 0.022221534 0.019300191
## [11] 0.017368357 0.012982326 0.007952149
## [1] 0.3619885 0.5540634 0.6652997 0.7359900 0.8016229 0.8509812 0.8933680
## [8] 0.9201754 0.9423970 0.9616972 0.9790655 0.9920479 1.0000000
plot(1:13, cum_ev, cex=1.4, pch=21, col='black', bg='steelblue',
xlab='Number of Principal Components', ylab='Cumulative Variance Explained')
abline(v=8, lty=3, lwd=2, col='steelblue')
abline(h=cum_ev[8], lty=3, lwd=2, col='steelblue')plot(wc_pca$x[,1:2], pch=21, col='black', cex=1.2,
bg=c('salmon', 'steelblue', 'gold2')[wc$Cultivar],
main='First two Transformed Features')## diagnosis radius_mean texture_mean perimeter_mean
## B:357 Min. : 6.981 Min. : 9.71 Min. : 43.79
## M:212 1st Qu.:11.700 1st Qu.:16.17 1st Qu.: 75.17
## Median :13.370 Median :18.84 Median : 86.24
## Mean :14.127 Mean :19.29 Mean : 91.97
## 3rd Qu.:15.780 3rd Qu.:21.80 3rd Qu.:104.10
## Max. :28.110 Max. :39.28 Max. :188.50
## area_mean smoothness_mean compactness_mean concavity_mean
## Min. : 143.5 Min. :0.05263 Min. :0.01938 Min. :0.00000
## 1st Qu.: 420.3 1st Qu.:0.08637 1st Qu.:0.06492 1st Qu.:0.02956
## Median : 551.1 Median :0.09587 Median :0.09263 Median :0.06154
## Mean : 654.9 Mean :0.09636 Mean :0.10434 Mean :0.08880
## 3rd Qu.: 782.7 3rd Qu.:0.10530 3rd Qu.:0.13040 3rd Qu.:0.13070
## Max. :2501.0 Max. :0.16340 Max. :0.34540 Max. :0.42680
## concave.points_mean symmetry_mean fractal_dimension_mean
## Min. :0.00000 Min. :0.1060 Min. :0.04996
## 1st Qu.:0.02031 1st Qu.:0.1619 1st Qu.:0.05770
## Median :0.03350 Median :0.1792 Median :0.06154
## Mean :0.04892 Mean :0.1812 Mean :0.06280
## 3rd Qu.:0.07400 3rd Qu.:0.1957 3rd Qu.:0.06612
## Max. :0.20120 Max. :0.3040 Max. :0.09744
## radius_se texture_se perimeter_se area_se
## Min. :0.1115 Min. :0.3602 Min. : 0.757 Min. : 6.802
## 1st Qu.:0.2324 1st Qu.:0.8339 1st Qu.: 1.606 1st Qu.: 17.850
## Median :0.3242 Median :1.1080 Median : 2.287 Median : 24.530
## Mean :0.4052 Mean :1.2169 Mean : 2.866 Mean : 40.337
## 3rd Qu.:0.4789 3rd Qu.:1.4740 3rd Qu.: 3.357 3rd Qu.: 45.190
## Max. :2.8730 Max. :4.8850 Max. :21.980 Max. :542.200
## smoothness_se compactness_se concavity_se
## Min. :0.001713 Min. :0.002252 Min. :0.00000
## 1st Qu.:0.005169 1st Qu.:0.013080 1st Qu.:0.01509
## Median :0.006380 Median :0.020450 Median :0.02589
## Mean :0.007041 Mean :0.025478 Mean :0.03189
## 3rd Qu.:0.008146 3rd Qu.:0.032450 3rd Qu.:0.04205
## Max. :0.031130 Max. :0.135400 Max. :0.39600
## concave.points_se symmetry_se fractal_dimension_se
## Min. :0.000000 Min. :0.007882 Min. :0.0008948
## 1st Qu.:0.007638 1st Qu.:0.015160 1st Qu.:0.0022480
## Median :0.010930 Median :0.018730 Median :0.0031870
## Mean :0.011796 Mean :0.020542 Mean :0.0037949
## 3rd Qu.:0.014710 3rd Qu.:0.023480 3rd Qu.:0.0045580
## Max. :0.052790 Max. :0.078950 Max. :0.0298400
## radius_worst texture_worst perimeter_worst area_worst
## Min. : 7.93 Min. :12.02 Min. : 50.41 Min. : 185.2
## 1st Qu.:13.01 1st Qu.:21.08 1st Qu.: 84.11 1st Qu.: 515.3
## Median :14.97 Median :25.41 Median : 97.66 Median : 686.5
## Mean :16.27 Mean :25.68 Mean :107.26 Mean : 880.6
## 3rd Qu.:18.79 3rd Qu.:29.72 3rd Qu.:125.40 3rd Qu.:1084.0
## Max. :36.04 Max. :49.54 Max. :251.20 Max. :4254.0
## smoothness_worst compactness_worst concavity_worst concave.points_worst
## Min. :0.07117 Min. :0.02729 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.11660 1st Qu.:0.14720 1st Qu.:0.1145 1st Qu.:0.06493
## Median :0.13130 Median :0.21190 Median :0.2267 Median :0.09993
## Mean :0.13237 Mean :0.25427 Mean :0.2722 Mean :0.11461
## 3rd Qu.:0.14600 3rd Qu.:0.33910 3rd Qu.:0.3829 3rd Qu.:0.16140
## Max. :0.22260 Max. :1.05800 Max. :1.2520 Max. :0.29100
## symmetry_worst fractal_dimension_worst
## Min. :0.1565 Min. :0.05504
## 1st Qu.:0.2504 1st Qu.:0.07146
## Median :0.2822 Median :0.08004
## Mean :0.2901 Mean :0.08395
## 3rd Qu.:0.3179 3rd Qu.:0.09208
## Max. :0.6638 Max. :0.20750
bc_pca <- prcomp(scale(bc[,-1]))
ev <- bc_pca$sdev**2 / sum(bc_pca$sdev**2)
cum_ev <- cumsum(ev)
plot(1:30, cum_ev, cex=1.4, pch=21, col='black', bg='steelblue',
xlab='Number of Principal Components', ylab='Cumulative Variance Explained')
abline(v=7, lty=3, lwd=2, col='steelblue')
abline(h=cum_ev[7], lty=3, lwd=2, col='steelblue')plot(bc_pca$x[,1:2], pch=21, col='black', cex=1,
bg=c('salmon', 'steelblue')[bc$diagnosis],
main='First Two Transformed Features')set.seed(1)
bc_mod <- train(bc_pca$x[,1:10], bc$diagnosis, method='glmnet', metric='Accuracy',
trControl = trainControl(method='cv', number=10),
tuneGrid = expand.grid(alpha=seq(0, 1, by=0.2),
lambda=seq(-0.2, 0.2, length=50)))
best_ix = which.max(bc_mod$results$Accuracy)
bc_mod$results[best_ix, ]## alpha lambda Accuracy Kappa AccuracySD KappaSD
## 276 1 0.004081633 0.9755283 0.9467832 0.02051406 0.04560119
## 11 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -0.5090167
## PC1 -1.8296826
## PC2 0.9360084
## PC3 -0.3481095
## PC4 -0.4372119
## PC5 0.6865393
## PC6 -0.1271122
## PC7 .
## PC8 -0.4762884
## PC9 0.7597011
## PC10 -0.1502363
ss <- read.table('data/student_success_data.csv', header=TRUE, sep=',')
ss <- na.omit(ss)
ss <- ss[(ss$G3 <= 20) & (ss$G3 >= 0), ]
ss$Passed <- factor(ifelse(ss$G3 < 10, 0, 1))
ss$G1 <- NULL
ss$G2 <- NULL
ss$G3 <- NULL
ss$absences <- NULL
summary(ss)## school sex age address famsize Pstatus
## GP :462 F:297 Min. :15.00 R:155 GT3:393 A: 45
## MHS:101 M:266 1st Qu.:16.00 U:408 LE3:170 T:518
## Median :16.00
## Mean :16.61
## 3rd Qu.:18.00
## Max. :22.00
## Medu Fedu Mjob Fjob
## Min. :1.000 Min. :1.000 at_home : 66 at_home : 31
## 1st Qu.:2.000 1st Qu.:2.000 health : 39 health : 25
## Median :3.000 Median :3.000 other :184 other :275
## Mean :2.874 Mean :2.686 services:182 services:173
## 3rd Qu.:4.000 3rd Qu.:4.000 teacher : 92 teacher : 59
## Max. :4.000 Max. :4.000
## reason guardian traveltime studytime
## course :215 father:129 Min. :1.000 Min. :1.000
## home :158 mother:398 1st Qu.:1.000 1st Qu.:1.000
## other : 48 other : 36 Median :1.000 Median :2.000
## reputation:142 Mean :1.481 Mean :1.986
## 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :4.000 Max. :4.000
## failures schoolsup famsup paid activities nursery
## Min. :0.0000 no :507 no :254 no :332 no :262 no :102
## 1st Qu.:0.0000 yes: 56 yes:309 yes:231 yes:301 yes:461
## Median :0.0000
## Mean :0.2842
## 3rd Qu.:0.0000
## Max. :3.0000
## higher internet romantic famrel freetime
## no : 22 no : 93 no :341 Min. :1.000 Min. :1.000
## yes:541 yes:470 yes:222 1st Qu.:4.000 1st Qu.:3.000
## Median :4.000 Median :3.000
## Mean :3.938 Mean :3.213
## 3rd Qu.:5.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000
## goout Dalc Walc health Passed
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.00 0:200
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:3.00 1:363
## Median :3.000 Median :1.000 Median :2.000 Median :4.00
## Mean :3.021 Mean :1.401 Mean :2.176 Mean :3.67
## 3rd Qu.:4.000 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.:5.00
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.00
## [1] 55
## [1] 55 55
## [1] 563 55
ev <- ss_pca$sdev**2 / sum(ss_pca$sdev**2)
cum_ev <- cumsum(ev)
plot(1:55, cum_ev, cex=1, pch=21, col='black', bg='steelblue',
xlab='Number of Principal Components', ylab='Cumulative Variance Explained')
abline(v=28, lty=3, lwd=2, col='steelblue')
abline(h=cum_ev[28], lty=3, lwd=2, col='steelblue')set.seed(1)
ss_mod <- train(ss_pca$x[,1:30], ss$Passed, method='glmnet', metric='Accuracy',
trControl = trainControl(method='cv', number=10),
tuneGrid = expand.grid(alpha=seq(0, 1, by=0.2),
lambda=seq(-0.5, 0.5, length=50)))
best_ix = which.max(ss_mod$results$Accuracy)
ss_mod$results[best_ix, ]## alpha lambda Accuracy Kappa AccuracySD KappaSD
## 76 0.2 0.01020408 0.7107456 0.3398844 0.05179869 0.1172835