library(caret); library(kernlab); data(spam)
inTrain <- createDataPartition(y=spam$type,
p=0.75, list=FALSE)
training <- spam[inTrain,]
testing <- spam[-inTrain,]
M <- abs(cor(training[,-58]))
diag(M) <- 0
which(M > 0.8,arr.ind=T)
## row col
## num415 34 32
## direct 40 32
## num857 32 34
## direct 40 34
## num857 32 40
## num415 34 40
str(training)
## 'data.frame': 3451 obs. of 58 variables:
## $ make : num 0 0.21 0 0 0 0 0.15 0 0 0 ...
## $ address : num 0.64 0.28 0 0 0 0 0 0 0 0.69 ...
## $ all : num 0.64 0.5 0 0 0 0 0.46 0 0.25 0.34 ...
## $ num3d : num 0 0 0 0 0 0 0 0 0 0 ...
## $ our : num 0.32 0.14 0.63 0.63 1.85 1.92 0.61 0 0.38 0.34 ...
## $ over : num 0 0.28 0 0 0 0 0 0 0.25 0 ...
## $ remove : num 0 0.21 0.31 0.31 0 0 0.3 0.96 0.25 0 ...
## $ internet : num 0 0.07 0.63 0.63 1.85 0 0 0 0 0 ...
## $ order : num 0 0 0.31 0.31 0 0 0.92 0 0 0 ...
## $ mail : num 0 0.94 0.63 0.63 0 0.64 0.76 1.92 0 0 ...
## $ receive : num 0 0.21 0.31 0.31 0 0.96 0.76 0.96 0.12 0 ...
## $ will : num 0.64 0.79 0.31 0.31 0 1.28 0.92 0 0.12 0.69 ...
## $ people : num 0 0.65 0.31 0.31 0 0 0 0 0.12 0 ...
## $ report : num 0 0.21 0 0 0 0 0 0 0 0 ...
## $ addresses : num 0 0.14 0 0 0 0 0 0 0 0 ...
## $ free : num 0.32 0.14 0.31 0.31 0 0.96 0 0 0 0.34 ...
## $ business : num 0 0.07 0 0 0 0 0 0 0 0 ...
## $ email : num 1.29 0.28 0 0 0 0.32 0.15 0.96 0 1.39 ...
## $ you : num 1.93 3.47 3.18 3.18 0 3.85 1.23 3.84 1.16 2.09 ...
## $ credit : num 0 0 0 0 0 0 3.53 0 0 0 ...
## $ your : num 0.96 1.59 0.31 0.31 0 0.64 2 0.96 0.77 1.04 ...
## $ font : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num000 : num 0 0.43 0 0 0 0 0 0 0 0 ...
## $ money : num 0 0.43 0 0 0 0 0.15 0 0 0 ...
## $ hp : num 0 0 0 0 0 0 0 0 0 0 ...
## $ hpl : num 0 0 0 0 0 0 0 0 0 0 ...
## $ george : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num650 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ lab : num 0 0 0 0 0 0 0 0 0 0 ...
## $ labs : num 0 0 0 0 0 0 0 0 0 0 ...
## $ telnet : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num857 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ data : num 0 0 0 0 0 0 0.15 0 0 0 ...
## $ num415 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num85 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ technology : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num1999 : num 0 0.07 0 0 0 0 0 0 0 0 ...
## $ parts : num 0 0 0 0 0 0 0 0 0 0 ...
## $ pm : num 0 0 0 0 0 0 0 0 0 0 ...
## $ direct : num 0 0 0 0 0 0 0 0.96 0 0 ...
## $ cs : num 0 0 0 0 0 0 0 0 0 0 ...
## $ meeting : num 0 0 0 0 0 0 0 0 0 0 ...
## $ original : num 0 0 0 0 0 0 0.3 0 0 0 ...
## $ project : num 0 0 0 0 0 0 0 0 0 0 ...
## $ re : num 0 0 0 0 0 0 0 0 0 0 ...
## $ edu : num 0 0 0 0 0 0 0 0 0 0 ...
## $ table : num 0 0 0 0 0 0 0 0 0 0 ...
## $ conference : num 0 0 0 0 0 0 0 0 0 0 ...
## $ charSemicolon : num 0 0 0 0 0 0 0 0 0.022 0 ...
## $ charRoundbracket : num 0 0.132 0.137 0.135 0.223 0.054 0.271 0 0.044 0.056 ...
## $ charSquarebracket: num 0 0 0 0 0 0 0 0 0 0 ...
## $ charExclamation : num 0.778 0.372 0.137 0.135 0 0.164 0.181 0.462 0.663 0.786 ...
## $ charDollar : num 0 0.18 0 0 0 0.054 0.203 0 0 0 ...
## $ charHash : num 0 0.048 0 0 0 0 0.022 0 0 0 ...
## $ capitalAve : num 3.76 5.11 3.54 3.54 3 ...
## $ capitalLong : num 61 101 40 40 15 4 445 6 11 61 ...
## $ capitalTotal : num 278 1028 191 191 54 ...
## $ type : Factor w/ 2 levels "nonspam","spam": 2 2 2 2 2 2 2 2 2 2 ...
dt <- training[, 58]
dt2 <- training[, -58]
length(dt2)
## [1] 57
num415 <- training$num415
num857 <- training$num857
unique(num415)
## [1] 0.00 0.30 0.17 0.38 0.12 0.07 0.19 1.35 0.14 2.04 0.63 0.58 4.54 0.27
## [15] 0.76 0.51 0.52 0.39 0.28 0.35 0.90 1.11 3.57 0.66 1.31 0.34 0.68 0.54
## [29] 1.38 0.73 0.50 0.99 0.48 1.08 0.22 0.24 0.15 0.61 2.27 0.87 0.45 0.82
## [43] 0.53 0.37 1.28 0.75 0.93 0.29 0.11 4.76 0.91 2.77 1.42 0.64 0.13 0.32
## [57] 0.49 0.85 0.65 1.01 4.34 0.62 3.12 1.56 2.63 1.72 0.59 0.55 0.60 1.00
## [71] 1.33 1.63 0.70 1.07 0.09 0.78 0.97 0.86 0.26 0.43 2.56 3.84 1.20 2.14
## [85] 4.00 4.16 0.46 0.20 2.32 0.88 0.33 2.22 0.23 0.80 0.77
unique(num857)
## [1] 0.00 0.47 2.04 0.63 0.58 4.54 0.27 0.76 0.51 0.52 0.39 0.28 0.35 1.11
## [15] 3.57 0.66 1.31 0.34 0.68 0.54 1.38 0.17 0.73 0.50 0.99 0.48 1.08 0.22
## [29] 0.24 0.15 0.61 2.27 0.87 0.45 0.82 0.53 0.37 1.28 0.75 0.93 4.76 0.91
## [43] 2.77 1.42 0.64 0.13 0.32 0.49 0.85 0.65 1.01 4.34 0.62 3.12 0.29 1.56
## [57] 2.63 1.72 0.59 0.55 0.60 1.00 1.33 1.63 0.70 1.07 0.09 0.78 0.97 0.86
## [71] 0.26 0.43 2.56 3.84 1.20 2.14 4.00 4.16 0.90 0.19 0.46 0.20 2.32 0.88
## [85] 0.33 2.22 0.23 0.80 0.77 0.31
hist(num415)
hist(num857)
names(spam)[c(34,32)]
## [1] "num415" "num857"
plot(spam[,34],spam[,32])
# We could rotate the plot
\[ X = 0.71 \times {\rm num 415} + 0.71 \times {\rm num857}\]
\[ Y = 0.71 \times {\rm num 415} - 0.71 \times {\rm num857}\]
X <- 0.71*training$num415 + 0.71*training$num857
Y <- 0.71*training$num415 - 0.71*training$num857
plot(X,Y)
# Principal components in R - prcomp
smallSpam <- spam[,c(34,32)]
prComp <- prcomp(smallSpam)
plot(prComp$x[,1],prComp$x[,2])
prComp$rotation
## PC1 PC2
## num415 0.7080625 0.7061498
## num857 0.7061498 -0.7080625
typeColor <- ((spam$type=="spam")*1 + 1)
prComp <- prcomp(log10(spam[,-58]+1))
plot(prComp$x[,1],prComp$x[,2],col=typeColor,xlab="PC1",ylab="PC2")
# PCA with caret
preProc <- preProcess(log10(spam[,-58]+1),method="pca",pcaComp=2)
spamPC <- predict(preProc,log10(spam[,-58]+1))
plot(spamPC[,1],spamPC[,2],col=typeColor)
#preProc <- preProcess(log10(training[,-58]+1),method="pca",pcaComp=2)
#trainPC <- predict(preProc,log10(training[,-58]+1))
#modelFit <- train(training$type ~ .,method="glm",data=trainPC)