Correlated predictors

library(caret); library(kernlab); data(spam)
inTrain <- createDataPartition(y=spam$type,
                              p=0.75, list=FALSE)
training <- spam[inTrain,]
testing <- spam[-inTrain,]

M <- abs(cor(training[,-58]))
diag(M) <- 0
which(M > 0.8,arr.ind=T)
##        row col
## num415  34  32
## direct  40  32
## num857  32  34
## direct  40  34
## num857  32  40
## num415  34  40
str(training)
## 'data.frame':    3451 obs. of  58 variables:
##  $ make             : num  0 0.21 0 0 0 0 0.15 0 0 0 ...
##  $ address          : num  0.64 0.28 0 0 0 0 0 0 0 0.69 ...
##  $ all              : num  0.64 0.5 0 0 0 0 0.46 0 0.25 0.34 ...
##  $ num3d            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ our              : num  0.32 0.14 0.63 0.63 1.85 1.92 0.61 0 0.38 0.34 ...
##  $ over             : num  0 0.28 0 0 0 0 0 0 0.25 0 ...
##  $ remove           : num  0 0.21 0.31 0.31 0 0 0.3 0.96 0.25 0 ...
##  $ internet         : num  0 0.07 0.63 0.63 1.85 0 0 0 0 0 ...
##  $ order            : num  0 0 0.31 0.31 0 0 0.92 0 0 0 ...
##  $ mail             : num  0 0.94 0.63 0.63 0 0.64 0.76 1.92 0 0 ...
##  $ receive          : num  0 0.21 0.31 0.31 0 0.96 0.76 0.96 0.12 0 ...
##  $ will             : num  0.64 0.79 0.31 0.31 0 1.28 0.92 0 0.12 0.69 ...
##  $ people           : num  0 0.65 0.31 0.31 0 0 0 0 0.12 0 ...
##  $ report           : num  0 0.21 0 0 0 0 0 0 0 0 ...
##  $ addresses        : num  0 0.14 0 0 0 0 0 0 0 0 ...
##  $ free             : num  0.32 0.14 0.31 0.31 0 0.96 0 0 0 0.34 ...
##  $ business         : num  0 0.07 0 0 0 0 0 0 0 0 ...
##  $ email            : num  1.29 0.28 0 0 0 0.32 0.15 0.96 0 1.39 ...
##  $ you              : num  1.93 3.47 3.18 3.18 0 3.85 1.23 3.84 1.16 2.09 ...
##  $ credit           : num  0 0 0 0 0 0 3.53 0 0 0 ...
##  $ your             : num  0.96 1.59 0.31 0.31 0 0.64 2 0.96 0.77 1.04 ...
##  $ font             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ num000           : num  0 0.43 0 0 0 0 0 0 0 0 ...
##  $ money            : num  0 0.43 0 0 0 0 0.15 0 0 0 ...
##  $ hp               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ hpl              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ george           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ num650           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ lab              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ labs             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ telnet           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ num857           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ data             : num  0 0 0 0 0 0 0.15 0 0 0 ...
##  $ num415           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ num85            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ technology       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ num1999          : num  0 0.07 0 0 0 0 0 0 0 0 ...
##  $ parts            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ pm               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ direct           : num  0 0 0 0 0 0 0 0.96 0 0 ...
##  $ cs               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ meeting          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ original         : num  0 0 0 0 0 0 0.3 0 0 0 ...
##  $ project          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ re               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ edu              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ table            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ conference       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ charSemicolon    : num  0 0 0 0 0 0 0 0 0.022 0 ...
##  $ charRoundbracket : num  0 0.132 0.137 0.135 0.223 0.054 0.271 0 0.044 0.056 ...
##  $ charSquarebracket: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ charExclamation  : num  0.778 0.372 0.137 0.135 0 0.164 0.181 0.462 0.663 0.786 ...
##  $ charDollar       : num  0 0.18 0 0 0 0.054 0.203 0 0 0 ...
##  $ charHash         : num  0 0.048 0 0 0 0 0.022 0 0 0 ...
##  $ capitalAve       : num  3.76 5.11 3.54 3.54 3 ...
##  $ capitalLong      : num  61 101 40 40 15 4 445 6 11 61 ...
##  $ capitalTotal     : num  278 1028 191 191 54 ...
##  $ type             : Factor w/ 2 levels "nonspam","spam": 2 2 2 2 2 2 2 2 2 2 ...
dt <- training[, 58]
dt2 <- training[, -58]
length(dt2)
## [1] 57
num415 <- training$num415
num857 <- training$num857
unique(num415)
##  [1] 0.00 0.30 0.17 0.38 0.12 0.07 0.19 1.35 0.14 2.04 0.63 0.58 4.54 0.27
## [15] 0.76 0.51 0.52 0.39 0.28 0.35 0.90 1.11 3.57 0.66 1.31 0.34 0.68 0.54
## [29] 1.38 0.73 0.50 0.99 0.48 1.08 0.22 0.24 0.15 0.61 2.27 0.87 0.45 0.82
## [43] 0.53 0.37 1.28 0.75 0.93 0.29 0.11 4.76 0.91 2.77 1.42 0.64 0.13 0.32
## [57] 0.49 0.85 0.65 1.01 4.34 0.62 3.12 1.56 2.63 1.72 0.59 0.55 0.60 1.00
## [71] 1.33 1.63 0.70 1.07 0.09 0.78 0.97 0.86 0.26 0.43 2.56 3.84 1.20 2.14
## [85] 4.00 4.16 0.46 0.20 2.32 0.88 0.33 2.22 0.23 0.80 0.77
unique(num857)
##  [1] 0.00 0.47 2.04 0.63 0.58 4.54 0.27 0.76 0.51 0.52 0.39 0.28 0.35 1.11
## [15] 3.57 0.66 1.31 0.34 0.68 0.54 1.38 0.17 0.73 0.50 0.99 0.48 1.08 0.22
## [29] 0.24 0.15 0.61 2.27 0.87 0.45 0.82 0.53 0.37 1.28 0.75 0.93 4.76 0.91
## [43] 2.77 1.42 0.64 0.13 0.32 0.49 0.85 0.65 1.01 4.34 0.62 3.12 0.29 1.56
## [57] 2.63 1.72 0.59 0.55 0.60 1.00 1.33 1.63 0.70 1.07 0.09 0.78 0.97 0.86
## [71] 0.26 0.43 2.56 3.84 1.20 2.14 4.00 4.16 0.90 0.19 0.46 0.20 2.32 0.88
## [85] 0.33 2.22 0.23 0.80 0.77 0.31
hist(num415)

hist(num857)

Correlated predictors

names(spam)[c(34,32)]
## [1] "num415" "num857"
plot(spam[,34],spam[,32])

# We could rotate the plot

\[ X = 0.71 \times {\rm num 415} + 0.71 \times {\rm num857}\]

\[ Y = 0.71 \times {\rm num 415} - 0.71 \times {\rm num857}\]

X <- 0.71*training$num415 + 0.71*training$num857
Y <- 0.71*training$num415 - 0.71*training$num857
plot(X,Y)

# Principal components in R - prcomp

smallSpam <- spam[,c(34,32)]
prComp <- prcomp(smallSpam)
plot(prComp$x[,1],prComp$x[,2])

Principal components in R - prcomp

prComp$rotation
##              PC1        PC2
## num415 0.7080625  0.7061498
## num857 0.7061498 -0.7080625

PCA on SPAM data

typeColor <- ((spam$type=="spam")*1 + 1)
prComp <- prcomp(log10(spam[,-58]+1))
plot(prComp$x[,1],prComp$x[,2],col=typeColor,xlab="PC1",ylab="PC2")

# PCA with caret

preProc <- preProcess(log10(spam[,-58]+1),method="pca",pcaComp=2)
spamPC <- predict(preProc,log10(spam[,-58]+1))
plot(spamPC[,1],spamPC[,2],col=typeColor)

Preprocessing with PCA

#preProc <- preProcess(log10(training[,-58]+1),method="pca",pcaComp=2)
#trainPC <- predict(preProc,log10(training[,-58]+1))
#modelFit <- train(training$type ~ .,method="glm",data=trainPC)