Preprocess

library(caret)

## Loading required package: lattice
## Loading required package: ggplot2

library(kernlab)
# Load the spam data (shipped with kernlab) and keep 75% of the rows for
# training; the remaining rows form the test set.
# NOTE: no seed is set before the split, so the values printed below change
# from run to run.
data(spam)
inTrain <- createDataPartition(spam[["type"]], p = 0.75, list = FALSE)
training <- spam[inTrain, ]
testing <- spam[-inTrain, ]
hist(
  training[["capitalAve"]],
  main = "",
  xlab = "ave.capital run length"
)

# Reason 1: the histogram is not informative, because almost all of the
# values are packed into the same small region of the x-axis.

mean(training[["capitalAve"]])

## [1] 4.459112

sd(training[["capitalAve"]])

## [1] 18.36233

Reason 2: the standard deviation (18.36) is very large, much larger than the mean (4.46)

So we need to preprocess the data by standardizing it.

# Standardizing - training set
# Subtract the mean and divide by the standard deviation, so the result has
# mean ~0 (up to floating-point error) and standard deviation 1.
trainCapAve <- training[["capitalAve"]]
trainCapAves <- (trainCapAve - mean(trainCapAve)) / sd(trainCapAve)
mean(trainCapAves)

## [1] 1.975077e-17

sd(trainCapAves)

## [1] 1

Standardizing- Test set

# Standardize the test set with the mean and standard deviation of the
# TRAINING set (trainCapAve), not with its own statistics. The result is
# therefore only approximately centred and scaled, as the output shows.
testCapAve <- testing[["capitalAve"]]
testCapAves <- (testCapAve - mean(trainCapAve)) / sd(trainCapAve)
mean(testCapAves)

## [1] 0.1595793

sd(testCapAves)

## [1] 2.988663

Approach 2-Standardizing-PreProcessFunction(Training)

# Estimate the centring/scaling parameters from the training predictors.
# The outcome column "type" (the 58th column of the data set) is excluded by
# name rather than by position, because it is not a predictor.
# method = c("center", "scale") performs the same standardization as the
# manual (x - mean) / sd computation.
# Note: preObj is a "preProcess" object that stores the estimated parameters
# (it is not a formula); predict() applies it to a data set.
preObj <- preProcess(
  training[, names(training) != "type"],
  method = c("center", "scale")
)
# Use predict() to apply the standardization, then extract capitalAve.
trainCapAves <- predict(
  preObj,
  training[, names(training) != "type"]
)$capitalAve
mean(trainCapAves)

## [1] 1.975077e-17

sd(trainCapAves)

## [1] 1

Approach 2-Standardizing-PreProcessFunction(Testing)

# Apply the preProcess object that was fitted on the training set to the test
# predictors. The outcome column "type" is excluded by name rather than by
# the hard-coded position 58.
testCapAveS <- predict(
  preObj,
  testing[, names(testing) != "type"]
)$capitalAve
mean(testCapAveS)

## [1] 0.1595793

sd(testCapAveS)

## [1] 2.988663

Approach 3-Standardizing-preprocess argument

# Let train() centre and scale the predictors itself through its preProcess
# argument while fitting a generalized linear model. The seed is fixed before
# the call because train() resamples the training data.
set.seed(32343)
modelFit <- train(
  type ~ .,
  data = training,
  method = "glm",
  preProcess = c("center", "scale")
)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

# Show the summary of the fitted model (pre-processing and resampling results).
print(modelFit)

## Generalized Linear Model 
## 
## 3451 samples
##   57 predictor
##    2 classes: 'nonspam', 'spam' 
## 
## Pre-processing: centered, scaled 
## Resampling: Bootstrapped (25 reps) 
## 
## Summary of sample sizes: 3451, 3451, 3451, 3451, 3451, 3451, ... 
## 
## Resampling results
## 
##   Accuracy   Kappa     Accuracy SD  Kappa SD  
##   0.9207059  0.833597  0.01611926   0.03085186
## 
##

Approach 4-Standardizing-Box-Cox transforms

A Box-Cox transformation can make the data look more like a normal distribution, which makes it easier to find out the features

# Estimate a Box-Cox transformation on the training predictors (the outcome
# column "type" is excluded by name) and apply it to the same data.
preObj <- preProcess(
  training[, names(training) != "type"],
  method = c("BoxCox")
)
trainCapAveS <- predict(
  preObj,
  training[, names(training) != "type"]
)$capitalAve
# Draw the histogram and the normal Q-Q plot side by side. par() returns the
# previous settings, which are restored afterwards so that later plots are not
# forced into the two-panel layout.
oldPar <- par(mfrow = c(1, 2))
hist(trainCapAveS)
qqnorm(trainCapAveS)
par(oldPar)

##Approach 5-Standardizing-Imputing Data

# The k-nearest-neighbour imputation below relies on the RANN package.
library(RANN)
set.seed(13343)
# Make some values NA: work on a copy so the true values stay available.
training$capAve <- training$capitalAve
# Draw one Bernoulli(0.05) value per row; roughly 5% of the rows (not half of
# them) are flagged TRUE.
selectNA <- rbinom(nrow(training), size = 1, prob = 0.05) == 1
# Replace the flagged values with NA.
training$capAve[selectNA] <- NA

# Impute and standardize
# Use k-nearest-neighbours imputation to fill in the missing values; caret
# also centres and scales the predictors as part of this method. The outcome
# column "type" is excluded by name rather than by the position 58.
preObj <- preProcess(
  training[, names(training) != "type"],
  method = "knnImpute"
)
capAve <- predict(preObj, training[, names(training) != "type"])$capAve
# capAve now holds the standardized values, with the NAs replaced by imputed
# values.

# Standardize true values
capAveTruth <- training$capitalAve
capAveTruth <- (capAveTruth - mean(capAveTruth)) / sd(capAveTruth)
# Difference between the imputed/standardized values and the standardized
# true values, over all rows.
quantile(capAve - capAveTruth)

##           0%          25%          50%          75%         100% 
## -9.060521342  0.002171090  0.002809970  0.004206612  0.766566765

# Only the rows whose value was set to NA and then imputed.
quantile((capAve - capAveTruth)[selectNA])

##           0%          25%          50%          75%         100% 
## -9.060521342 -0.025726747  0.009938512  0.040944374  0.766566765

# Only the rows whose value was not set to NA.
quantile((capAve - capAveTruth)[!selectNA])

##          0%         25%         50%         75%        100% 
## 0.001652908 0.002197408 0.002793635 0.004092267 0.606047546