Lesson 10.1 - Principal Component Analysis

Robbie Beane

Load Packages

library(caret)
library(shape)
library(ggplot2)

Example 1: Synthetic Example with Two Features

set.seed(1)
v0 <- rnorm(n=50, 0, 1.2)
v1 <- rnorm(n=50, 0, 0.4)
df1 <- data.frame(x1 = v0 + v1 + 5, x2 = 0.8*v0 - v1 + 4)

mean1 = mean(df1$x1)
mean2 = mean(df1$x2)

plot(df1, xlim=c(2,8), ylim=c(1,7), pch=21, bg='cornflowerblue', col='black', cex=1.2)
points(mean1, mean2, pch=21, bg='orange', col='black', cex=2)

Perform PCA

pca1 <- prcomp(df1)
pca1$rotation
##          PC1        PC2
## x1 0.7835622  0.6213133
## x2 0.6213133 -0.7835622

Perform PCA

pc1 <- pca1$rotation[,1]
pc2 <- pca1$rotation[,2]

plot(df1, xlim=c(2,8), ylim=c(1,7), pch=21, bg='cornflowerblue', col='black', cex=1.2)
Arrows(mean1, mean2, mean1 + pc1[1], mean2 + pc1[2], lwd=3, arr.type='triangle')
Arrows(mean1, mean2, mean1 + pc2[1], mean2 + pc2[2], lwd=3, arr.type='triangle')
points(mean1, mean2, pch=21, bg='orange', col='black', cex=2)

Transformed Features

pca1$x[1:10,]
##               PC1        PC2
##  [1,] -1.09883375  0.1626135
##  [2,]  0.08051408 -0.4104127
##  [3,] -1.42397846  0.1319798
##  [4,]  2.21625250 -0.7105089
##  [5,]  0.43739205  0.7378345
##  [6,] -1.29429260  1.0530729
##  [7,]  0.56324011 -0.2748628
##  [8,]  0.90486898 -0.6569212
##  [9,]  0.75982131  0.2510644
## [10,] -0.64004302 -0.1391293

Transformed Features

pc1 <- pca1$rotation[,1]
pc2 <- pca1$rotation[,2]

par(mfrow=c(1,2))

plot(df1, xlim=c(2,8), ylim=c(1,7), pch=21, bg='cornflowerblue', col='black', cex=1.2,
     main="Original Features")
Arrows(mean1, mean2, mean1 + pc1[1], mean2 + pc1[2], lwd=5, arr.type='triangle')
Arrows(mean1, mean2, mean1 + pc1[1], mean2 + pc1[2], lwd=2, arr.type='triangle', col='plum')
Arrows(mean1, mean2, mean1 + pc2[1], mean2 + pc2[2], lwd=5, arr.type='triangle')
Arrows(mean1, mean2, mean1 + pc2[1], mean2 + pc2[2], lwd=2, arr.type='triangle', col='cyan')
points(mean1, mean2, pch=21, bg='orange', col='black', cex=2)

plot(pca1$x, xlim=c(-4,4), ylim=c(-4,4), pch=21, bg='cornflowerblue', col='black', cex=1.2, 
     xlab='z1', ylab='z2', main='Transformed Features')
Arrows(0, 0, 1, 0, lwd=5, arr.type='triangle')
Arrows(0, 0, 1, 0, lwd=2, arr.type='triangle', col='plum')
Arrows(0, 0, 0, 1, lwd=5, arr.type='triangle')
Arrows(0, 0, 0, 1, lwd=2, arr.type='triangle', col='cyan')
points(0, 0, pch=21, bg='orange', col='black', cex=2)

par(mfrow=c(1,1))

Proportion of Variance Explained

std <- pca1$sdev
var <- pca1$sdev^2
var_ex <- var / sum(var)


cat('Std Dev of New Features:   ', round(std, 5), 
    '\nVariances of New Features: ', round(var, 5), 
    '\nProp of Variance Explained:', round(var_ex, 5), sep=' ')
## Std Dev of New Features:    1.27672 0.54468 
## Variances of New Features:  1.63002 0.29667 
## Prop of Variance Explained: 0.84602 0.15398

Example 2: Wine Cultivars Dataset

wc <- read.table("data/wine_cultivars.txt", header = TRUE, sep="\t")
wc$Cultivar <- factor(wc$Cultivar)
summary(wc)
##  Cultivar    Alcohol        Malic_Acid         Ash          Alcalinity   
##  1:59     Min.   :11.03   Min.   :0.740   Min.   :1.360   Min.   :10.60  
##  2:71     1st Qu.:12.36   1st Qu.:1.603   1st Qu.:2.210   1st Qu.:17.20  
##  3:48     Median :13.05   Median :1.865   Median :2.360   Median :19.50  
##           Mean   :13.00   Mean   :2.336   Mean   :2.367   Mean   :19.49  
##           3rd Qu.:13.68   3rd Qu.:3.083   3rd Qu.:2.558   3rd Qu.:21.50  
##           Max.   :14.83   Max.   :5.800   Max.   :3.230   Max.   :30.00  
##    Magnesium      Total_Phenols     Flavanoids    NonFlavanoid_Phenols
##  Min.   : 70.00   Min.   :0.980   Min.   :0.340   Min.   :0.1300      
##  1st Qu.: 88.00   1st Qu.:1.742   1st Qu.:1.205   1st Qu.:0.2700      
##  Median : 98.00   Median :2.355   Median :2.135   Median :0.3400      
##  Mean   : 99.74   Mean   :2.295   Mean   :2.029   Mean   :0.3619      
##  3rd Qu.:107.00   3rd Qu.:2.800   3rd Qu.:2.875   3rd Qu.:0.4375      
##  Max.   :162.00   Max.   :3.880   Max.   :5.080   Max.   :0.6600      
##  Proanthocyanins     Color             Hue          OD280_OD315   
##  Min.   :0.410   Min.   : 1.280   Min.   :0.4800   Min.   :1.270  
##  1st Qu.:1.250   1st Qu.: 3.220   1st Qu.:0.7825   1st Qu.:1.938  
##  Median :1.555   Median : 4.690   Median :0.9650   Median :2.780  
##  Mean   :1.591   Mean   : 5.058   Mean   :0.9574   Mean   :2.612  
##  3rd Qu.:1.950   3rd Qu.: 6.200   3rd Qu.:1.1200   3rd Qu.:3.170  
##  Max.   :3.580   Max.   :13.000   Max.   :1.7100   Max.   :4.000  
##     Proline      
##  Min.   : 278.0  
##  1st Qu.: 500.5  
##  Median : 673.5  
##  Mean   : 746.9  
##  3rd Qu.: 985.0  
##  Max.   :1680.0

Perform PCA

wc_pca <- prcomp(scale(wc[,-1]))
wc_pca$rotation
##                               PC1          PC2         PC3         PC4
## Alcohol              -0.144329395  0.483651548 -0.20738262  0.01785630
## Malic_Acid            0.245187580  0.224930935  0.08901289 -0.53689028
## Ash                   0.002051061  0.316068814  0.62622390  0.21417556
## Alcalinity            0.239320405 -0.010590502  0.61208035 -0.06085941
## Magnesium            -0.141992042  0.299634003  0.13075693  0.35179658
## Total_Phenols        -0.394660845  0.065039512  0.14617896 -0.19806835
## Flavanoids           -0.422934297 -0.003359812  0.15068190 -0.15229479
## NonFlavanoid_Phenols  0.298533103  0.028779488  0.17036816  0.20330102
## Proanthocyanins      -0.313429488  0.039301722  0.14945431 -0.39905653
## Color                 0.088616705  0.529995672 -0.13730621 -0.06592568
## Hue                  -0.296714564 -0.279235148  0.08522192  0.42777141
## OD280_OD315          -0.376167411 -0.164496193  0.16600459 -0.18412074
## Proline              -0.286752227  0.364902832 -0.12674592  0.23207086
##                              PC5         PC6         PC7         PC8
## Alcohol              -0.26566365  0.21353865 -0.05639636  0.39613926
## Malic_Acid            0.03521363  0.53681385  0.42052391  0.06582674
## Ash                  -0.14302547  0.15447466 -0.14917061 -0.17026002
## Alcalinity            0.06610294 -0.10082451 -0.28696914  0.42797018
## Magnesium             0.72704851  0.03814394  0.32288330 -0.15636143
## Total_Phenols        -0.14931841 -0.08412230 -0.02792498 -0.40593409
## Flavanoids           -0.10902584 -0.01892002 -0.06068521 -0.18724536
## NonFlavanoid_Phenols -0.50070298 -0.25859401  0.59544729 -0.23328465
## Proanthocyanins       0.13685982 -0.53379539  0.37213935  0.36822675
## Color                -0.07643678 -0.41864414 -0.22771214 -0.03379692
## Hue                  -0.17361452  0.10598274  0.23207564  0.43662362
## OD280_OD315          -0.10116099  0.26585107 -0.04476370 -0.07810789
## Proline              -0.15786880  0.11972557  0.07680450  0.12002267
##                              PC9        PC10        PC11        PC12
## Alcohol              -0.50861912  0.21160473  0.22591696 -0.26628645
## Malic_Acid            0.07528304 -0.30907994 -0.07648554  0.12169604
## Ash                   0.30769445 -0.02712539  0.49869142 -0.04962237
## Alcalinity           -0.20044931  0.05279942 -0.47931378 -0.05574287
## Magnesium            -0.27140257  0.06787022 -0.07128891  0.06222011
## Total_Phenols        -0.28603452 -0.32013135 -0.30434119 -0.30388245
## Flavanoids           -0.04957849 -0.16315051  0.02569409 -0.04289883
## NonFlavanoid_Phenols -0.19550132  0.21553507 -0.11689586  0.04235219
## Proanthocyanins       0.20914487  0.13418390  0.23736257 -0.09555303
## Color                -0.05621752 -0.29077518 -0.03183880  0.60422163
## Hue                  -0.08582839 -0.52239889  0.04821201  0.25921400
## OD280_OD315          -0.13722690  0.52370587 -0.04642330  0.60095872
## Proline               0.57578611  0.16211600 -0.53926983 -0.07940162
##                             PC13
## Alcohol               0.01496997
## Malic_Acid            0.02596375
## Ash                  -0.14121803
## Alcalinity            0.09168285
## Magnesium             0.05677422
## Total_Phenols        -0.46390791
## Flavanoids            0.83225706
## NonFlavanoid_Phenols  0.11403985
## Proanthocyanins      -0.11691707
## Color                -0.01199280
## Hue                  -0.08988884
## OD280_OD315          -0.15671813
## Proline               0.01444734

Explained Variance

ev <- wc_pca$sdev**2 / sum(wc_pca$sdev**2)
ev
##  [1] 0.361988481 0.192074903 0.111236305 0.070690302 0.065632937
##  [6] 0.049358233 0.042386793 0.026807489 0.022221534 0.019300191
## [11] 0.017368357 0.012982326 0.007952149
cum_ev <- cumsum(ev)
cum_ev
##  [1] 0.3619885 0.5540634 0.6652997 0.7359900 0.8016229 0.8509812 0.8933680
##  [8] 0.9201754 0.9423970 0.9616972 0.9790655 0.9920479 1.0000000
plot(1:13, cum_ev, cex=1.4, pch=21, col='black', bg='steelblue',
     xlab='Number of Principal Components', ylab='Cumulative Variance Explained')
abline(v=8, lty=3, lwd=2, col='steelblue')
abline(h=cum_ev[8], lty=3, lwd=2, col='steelblue')

Plotting First two Transformed Features

plot(wc_pca$x[,1:2], pch=21,  col='black', cex=1.2, 
     bg=c('salmon', 'steelblue', 'gold2')[wc$Cultivar],
     main='First two Transformed Features')

Example 3: Wisconsin Breast Cancer Dataset

bc <- read.table("data/breast_cancer.csv", header=TRUE, sep=",")
bc$id <- NULL
summary(bc)
##  diagnosis  radius_mean      texture_mean   perimeter_mean  
##  B:357     Min.   : 6.981   Min.   : 9.71   Min.   : 43.79  
##  M:212     1st Qu.:11.700   1st Qu.:16.17   1st Qu.: 75.17  
##            Median :13.370   Median :18.84   Median : 86.24  
##            Mean   :14.127   Mean   :19.29   Mean   : 91.97  
##            3rd Qu.:15.780   3rd Qu.:21.80   3rd Qu.:104.10  
##            Max.   :28.110   Max.   :39.28   Max.   :188.50  
##    area_mean      smoothness_mean   compactness_mean  concavity_mean   
##  Min.   : 143.5   Min.   :0.05263   Min.   :0.01938   Min.   :0.00000  
##  1st Qu.: 420.3   1st Qu.:0.08637   1st Qu.:0.06492   1st Qu.:0.02956  
##  Median : 551.1   Median :0.09587   Median :0.09263   Median :0.06154  
##  Mean   : 654.9   Mean   :0.09636   Mean   :0.10434   Mean   :0.08880  
##  3rd Qu.: 782.7   3rd Qu.:0.10530   3rd Qu.:0.13040   3rd Qu.:0.13070  
##  Max.   :2501.0   Max.   :0.16340   Max.   :0.34540   Max.   :0.42680  
##  concave.points_mean symmetry_mean    fractal_dimension_mean
##  Min.   :0.00000     Min.   :0.1060   Min.   :0.04996       
##  1st Qu.:0.02031     1st Qu.:0.1619   1st Qu.:0.05770       
##  Median :0.03350     Median :0.1792   Median :0.06154       
##  Mean   :0.04892     Mean   :0.1812   Mean   :0.06280       
##  3rd Qu.:0.07400     3rd Qu.:0.1957   3rd Qu.:0.06612       
##  Max.   :0.20120     Max.   :0.3040   Max.   :0.09744       
##    radius_se        texture_se      perimeter_se       area_se       
##  Min.   :0.1115   Min.   :0.3602   Min.   : 0.757   Min.   :  6.802  
##  1st Qu.:0.2324   1st Qu.:0.8339   1st Qu.: 1.606   1st Qu.: 17.850  
##  Median :0.3242   Median :1.1080   Median : 2.287   Median : 24.530  
##  Mean   :0.4052   Mean   :1.2169   Mean   : 2.866   Mean   : 40.337  
##  3rd Qu.:0.4789   3rd Qu.:1.4740   3rd Qu.: 3.357   3rd Qu.: 45.190  
##  Max.   :2.8730   Max.   :4.8850   Max.   :21.980   Max.   :542.200  
##  smoothness_se      compactness_se      concavity_se    
##  Min.   :0.001713   Min.   :0.002252   Min.   :0.00000  
##  1st Qu.:0.005169   1st Qu.:0.013080   1st Qu.:0.01509  
##  Median :0.006380   Median :0.020450   Median :0.02589  
##  Mean   :0.007041   Mean   :0.025478   Mean   :0.03189  
##  3rd Qu.:0.008146   3rd Qu.:0.032450   3rd Qu.:0.04205  
##  Max.   :0.031130   Max.   :0.135400   Max.   :0.39600  
##  concave.points_se   symmetry_se       fractal_dimension_se
##  Min.   :0.000000   Min.   :0.007882   Min.   :0.0008948   
##  1st Qu.:0.007638   1st Qu.:0.015160   1st Qu.:0.0022480   
##  Median :0.010930   Median :0.018730   Median :0.0031870   
##  Mean   :0.011796   Mean   :0.020542   Mean   :0.0037949   
##  3rd Qu.:0.014710   3rd Qu.:0.023480   3rd Qu.:0.0045580   
##  Max.   :0.052790   Max.   :0.078950   Max.   :0.0298400   
##   radius_worst   texture_worst   perimeter_worst    area_worst    
##  Min.   : 7.93   Min.   :12.02   Min.   : 50.41   Min.   : 185.2  
##  1st Qu.:13.01   1st Qu.:21.08   1st Qu.: 84.11   1st Qu.: 515.3  
##  Median :14.97   Median :25.41   Median : 97.66   Median : 686.5  
##  Mean   :16.27   Mean   :25.68   Mean   :107.26   Mean   : 880.6  
##  3rd Qu.:18.79   3rd Qu.:29.72   3rd Qu.:125.40   3rd Qu.:1084.0  
##  Max.   :36.04   Max.   :49.54   Max.   :251.20   Max.   :4254.0  
##  smoothness_worst  compactness_worst concavity_worst  concave.points_worst
##  Min.   :0.07117   Min.   :0.02729   Min.   :0.0000   Min.   :0.00000     
##  1st Qu.:0.11660   1st Qu.:0.14720   1st Qu.:0.1145   1st Qu.:0.06493     
##  Median :0.13130   Median :0.21190   Median :0.2267   Median :0.09993     
##  Mean   :0.13237   Mean   :0.25427   Mean   :0.2722   Mean   :0.11461     
##  3rd Qu.:0.14600   3rd Qu.:0.33910   3rd Qu.:0.3829   3rd Qu.:0.16140     
##  Max.   :0.22260   Max.   :1.05800   Max.   :1.2520   Max.   :0.29100     
##  symmetry_worst   fractal_dimension_worst
##  Min.   :0.1565   Min.   :0.05504        
##  1st Qu.:0.2504   1st Qu.:0.07146        
##  Median :0.2822   Median :0.08004        
##  Mean   :0.2901   Mean   :0.08395        
##  3rd Qu.:0.3179   3rd Qu.:0.09208        
##  Max.   :0.6638   Max.   :0.20750

Perform PCA

bc_pca <- prcomp(scale(bc[,-1]))

ev <- bc_pca$sdev**2 / sum(bc_pca$sdev**2)

cum_ev <- cumsum(ev)

plot(1:30, cum_ev, cex=1.4, pch=21, col='black', bg='steelblue',
     xlab='Number of Principal Components', ylab='Cumulative Variance Explained')

abline(v=7, lty=3, lwd=2, col='steelblue')
abline(h=cum_ev[7], lty=3, lwd=2, col='steelblue')

Plot First two Transformed Features

plot(bc_pca$x[,1:2], pch=21,  col='black', cex=1, 
     bg=c('salmon', 'steelblue')[bc$diagnosis],
     main='First Two Transformed Features')

Create Elasticnet Model

set.seed(1)
bc_mod <- train(bc_pca$x[,1:10], bc$diagnosis, method='glmnet', metric='Accuracy', 
                trControl = trainControl(method='cv', number=10),
                tuneGrid = expand.grid(alpha=seq(0, 1, by=0.2),
                                       lambda=seq(-0.2, 0.2, length=50))) 

best_ix = which.max(bc_mod$results$Accuracy)
bc_mod$results[best_ix, ]
##     alpha      lambda  Accuracy     Kappa AccuracySD    KappaSD
## 276     1 0.004081633 0.9755283 0.9467832 0.02051406 0.04560119

Create Elasticnet Model

plot(bc_mod, pch='')

Create Elasticnet Model

coef(bc_mod$finalModel, 
     bc_mod$finalModel$lambdaOpt)
## 11 x 1 sparse Matrix of class "dgCMatrix"
##                      1
## (Intercept) -0.5090167
## PC1         -1.8296826
## PC2          0.9360084
## PC3         -0.3481095
## PC4         -0.4372119
## PC5          0.6865393
## PC6         -0.1271122
## PC7          .        
## PC8         -0.4762884
## PC9          0.7597011
## PC10        -0.1502363

Example 4: Student Success Data

ss <- read.table('data/student_success_data.csv', header=TRUE, sep=',')
ss <- na.omit(ss)
ss <- ss[(ss$G3 <= 20) & (ss$G3 >= 0), ]
ss$Passed <- factor(ifelse(ss$G3 < 10, 0, 1))
ss$G1 <- NULL
ss$G2 <- NULL
ss$G3 <- NULL
ss$absences <- NULL
summary(ss)
##  school    sex          age        address famsize   Pstatus
##  GP :462   F:297   Min.   :15.00   R:155   GT3:393   A: 45  
##  MHS:101   M:266   1st Qu.:16.00   U:408   LE3:170   T:518  
##                    Median :16.00                            
##                    Mean   :16.61                            
##                    3rd Qu.:18.00                            
##                    Max.   :22.00                            
##       Medu            Fedu             Mjob           Fjob    
##  Min.   :1.000   Min.   :1.000   at_home : 66   at_home : 31  
##  1st Qu.:2.000   1st Qu.:2.000   health  : 39   health  : 25  
##  Median :3.000   Median :3.000   other   :184   other   :275  
##  Mean   :2.874   Mean   :2.686   services:182   services:173  
##  3rd Qu.:4.000   3rd Qu.:4.000   teacher : 92   teacher : 59  
##  Max.   :4.000   Max.   :4.000                                
##         reason      guardian     traveltime      studytime    
##  course    :215   father:129   Min.   :1.000   Min.   :1.000  
##  home      :158   mother:398   1st Qu.:1.000   1st Qu.:1.000  
##  other     : 48   other : 36   Median :1.000   Median :2.000  
##  reputation:142                Mean   :1.481   Mean   :1.986  
##                                3rd Qu.:2.000   3rd Qu.:2.000  
##                                Max.   :4.000   Max.   :4.000  
##     failures      schoolsup famsup     paid     activities nursery  
##  Min.   :0.0000   no :507   no :254   no :332   no :262    no :102  
##  1st Qu.:0.0000   yes: 56   yes:309   yes:231   yes:301    yes:461  
##  Median :0.0000                                                     
##  Mean   :0.2842                                                     
##  3rd Qu.:0.0000                                                     
##  Max.   :3.0000                                                     
##  higher    internet  romantic      famrel         freetime    
##  no : 22   no : 93   no :341   Min.   :1.000   Min.   :1.000  
##  yes:541   yes:470   yes:222   1st Qu.:4.000   1st Qu.:3.000  
##                                Median :4.000   Median :3.000  
##                                Mean   :3.938   Mean   :3.213  
##                                3rd Qu.:5.000   3rd Qu.:4.000  
##                                Max.   :5.000   Max.   :5.000  
##      goout            Dalc            Walc           health     Passed 
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.00   0:200  
##  1st Qu.:2.000   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:3.00   1:363  
##  Median :3.000   Median :1.000   Median :2.000   Median :4.00          
##  Mean   :3.021   Mean   :1.401   Mean   :2.176   Mean   :3.67          
##  3rd Qu.:4.000   3rd Qu.:2.000   3rd Qu.:3.000   3rd Qu.:5.00          
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.00

Feature Encoding

encoder <- dummyVars("~ .", ss[,1:29], fullRank = FALSE)
enc <- predict(encoder, ss)
ncol(enc)
## [1] 55

Perform PCA

ss_pca <- prcomp(enc, center=TRUE, scale=TRUE)
dim(ss_pca$rotation)
## [1] 55 55
dim(ss_pca$x)
## [1] 563  55

Plot First Two Transformed Features

plot(ss_pca$x[,1:2], pch=21, bg = c('salmon', 'steelblue')[ss$Passed], col='black')

Explained Variance

ev <- ss_pca$sdev**2 / sum(ss_pca$sdev**2)
cum_ev <- cumsum(ev)

plot(1:55, cum_ev, cex=1, pch=21, col='black', bg='steelblue',
     xlab='Number of Principal Components', ylab='Cumulative Variance Explained')
abline(v=28, lty=3, lwd=2, col='steelblue')
abline(h=cum_ev[28], lty=3, lwd=2, col='steelblue')

set.seed(1)
ss_mod <- train(ss_pca$x[,1:30], ss$Passed, method='glmnet', metric='Accuracy', 
                trControl = trainControl(method='cv', number=10),
                tuneGrid = expand.grid(alpha=seq(0, 1, by=0.2),
                                       lambda=seq(-0.5, 0.5, length=50))) 
best_ix = which.max(ss_mod$results$Accuracy)
ss_mod$results[best_ix, ]
##    alpha     lambda  Accuracy     Kappa AccuracySD   KappaSD
## 76   0.2 0.01020408 0.7107456 0.3398844 0.05179869 0.1172835
plot(ss_mod, pch='')