library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(kernlab)
# Load the spam data set shipped with kernlab.
data(spam)
# Seed the RNG before splitting: createDataPartition() picks rows at random,
# so without a seed every run produces a different training/testing split and
# different numbers in everything that follows.
set.seed(32343)
# Put 75% of the rows (partitioned on the outcome spam$type) into the training
# set; list = FALSE returns the row indices as a matrix usable for indexing.
inTrain <- createDataPartition(y = spam$type, p = 0.75, list = FALSE)
training <- spam[inTrain, ]
# Every row not selected for training goes to the test set.
testing <- spam[-inTrain, ]
# Histogram of the average capital run length in the training set.
hist(training[["capitalAve"]], main = "", xlab = "ave.capital run length")
# Reason 1: the histogram is hard to read because almost all values pile up
# in the same narrow region of the axis.
# The summary statistics below show the same thing: the standard deviation is
# several times larger than the mean.
mean(training[["capitalAve"]])
## [1] 4.459112
sd(training[["capitalAve"]])
## [1] 18.36233
# Standardizing by hand - training and test sets
trainCapAve <- training[["capitalAve"]]
testCapAve <- testing[["capitalAve"]]
# Training set: subtract the training mean and divide by the training sd.
trainCapAves <- (trainCapAve - mean(trainCapAve)) / sd(trainCapAve)
# Test set: reuse the TRAINING mean and sd (not the test set's own), so the
# result is only approximately mean 0 / sd 1.
testCapAves <- (testCapAve - mean(trainCapAve)) / sd(trainCapAve)
# Standardized training values: mean ~0, sd 1.
mean(trainCapAves)
## [1] 1.975077e-17
sd(trainCapAves)
## [1] 1
# Standardized test values: close to, but not exactly, mean 0 / sd 1.
mean(testCapAves)
## [1] 0.1595793
sd(testCapAves)
## [1] 2.988663
# The same standardization done with caret. Column 58 is the outcome, so it is
# dropped before the predictors are handed to preProcess().
# The "center" and "scale" methods do what the manual calculation above did.
preObj <- preProcess(training[, -58], method = c("center", "scale"))
# Note: preObj holds the fitted preprocessing (it is not a formula and not the
# transformed data); predict() applies it to a data set.
trainCapAves <- predict(preObj, newdata = training[, -58])[["capitalAve"]]
mean(trainCapAves)
## [1] 1.975077e-17
sd(trainCapAves)
## [1] 1
# Apply the preprocessing fitted on the training set to the test set.
testCapAveS <- predict(preObj, newdata = testing[, -58])[["capitalAve"]]
mean(testCapAveS)
## [1] 0.1595793
sd(testCapAveS)
## [1] 2.988663
# Fix the RNG so the resampling done inside train() is repeatable.
set.seed(32343)
# Fit a glm for type on all other columns, letting train() center and scale
# the predictors itself through its preProcess argument.
modelFit <- train(
  type ~ .,
  data = training,
  preProcess = c("center", "scale"),
  method = "glm"
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Print the fitted model summary (pre-processing, resampling scheme and the
# resampled accuracy / kappa shown below).
modelFit
## Generalized Linear Model
##
## 3451 samples
## 57 predictor
## 2 classes: 'nonspam', 'spam'
##
## Pre-processing: centered, scaled
## Resampling: Bootstrapped (25 reps)
##
## Summary of sample sizes: 3451, 3451, 3451, 3451, 3451, 3451, ...
##
## Resampling results
##
## Accuracy Kappa Accuracy SD Kappa SD
## 0.9207059 0.833597 0.01611926 0.03085186
##
##
# Fit a Box-Cox transformation on the predictors (outcome column 58 dropped)
# and apply it to the training set, keeping the transformed capitalAve.
preObj <- preProcess(training[, -58], method = c("BoxCox"))
trainCapAveS <- predict(preObj, training[, -58])$capitalAve
# Draw the histogram and the normal Q-Q plot side by side. par() returns the
# previous settings; they are restored afterwards so the two-panel layout does
# not leak into plots drawn later in the session.
oldPar <- par(mfrow = c(1, 2))
hist(trainCapAveS)
qqnorm(trainCapAveS)
par(oldPar)
## Approach 5 - Standardizing - Imputing data
set.seed(13343)
# Make some values NA.
# Draw one Bernoulli(0.05) value per training row, so roughly 5% of the rows
# (not half) are flagged TRUE.
selectNA <- as.logical(rbinom(nrow(training), size = 1, prob = 0.05))
# capAve is a copy of capitalAve with the flagged entries replaced by NA.
training$capAve <- replace(training$capitalAve, selectNA, NA)
# Impute and standardize.
# Fit k-nearest-neighbours imputation on every column except the outcome (58).
# NOTE(review): the untouched capitalAve column is still among the columns
# passed in, so it may influence which neighbours are chosen - confirm that
# this is intended for the comparison below.
preObj <- preProcess(training[, -58], method = "knnImpute")
library(RANN)
# Applying preObj returns the data with the missing capAve values filled in.
capAve <- predict(preObj, training[, -58])[["capAve"]]
# Standardize the true values so they are on a comparable scale.
capAveTruth <- training[["capitalAve"]]
capAveTruth <- (capAveTruth - mean(capAveTruth)) / sd(capAveTruth)
# Difference between the imputed/standardized values and the truth, all rows.
quantile(capAve - capAveTruth)
## 0% 25% 50% 75% 100%
## -9.060521342 0.002171090 0.002809970 0.004206612 0.766566765
# Only the rows that were set to NA and then imputed.
quantile((capAve - capAveTruth)[selectNA])
## 0% 25% 50% 75% 100%
## -9.060521342 -0.025726747 0.009938512 0.040944374 0.766566765
# Only the rows that were never set to NA.
quantile((capAve - capAveTruth)[!selectNA])
## 0% 25% 50% 75% 100%
## 0.001652908 0.002197408 0.002793635 0.004092267 0.606047546