# use caret and the swiss data
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(datasets)
# Inspect the available columns of the swiss dataset.
colnames(swiss)
## [1] "Fertility" "Agriculture" "Examination"
## [4] "Education" "Catholic" "Infant.Mortality"
# Split on the outcome (Fertility): 75% of rows go to training, the remainder
# to testing. list = FALSE returns the row indices as a matrix rather than a
# list, so they can be used directly for row subsetting. Spell out FALSE: the
# F alias is an ordinary variable that can be reassigned.
part <- createDataPartition(y = swiss$Fertility, p = 0.75, list = FALSE)
training <- swiss[part, ]
testing <- swiss[-part, ]
Training with preprocessing
# Fit a GLM on all predictors, centering and scaling them inside train().
# The model type is chosen with `method`; the previous `model = "glm"` was not
# a train() argument, so train() fell back to its default (random forest),
# which is why an earlier run loaded the randomForest package.
fit <- train(
  Fertility ~ .,
  data = training,
  preProcess = c("center", "scale"),
  method = "glm"
)
## Loading required package: randomForest
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
Preprocess separately - center and scale
#"Except For Education"
# Logical mask: TRUE for every column except Education.
ex_ed <- names(training) != "Education"
# Index with the mask itself. Negating a logical vector (-ex_ed) coerces it to
# 0 / -1, which drops only the first column (Fertility) and keeps Education.
obj <- preProcess(training[, ex_ed], method = c("center", "scale"))
# Apply the learned centering/scaling to the same set of columns.
predict(obj, training[, ex_ed])
## Agriculture Examination Education Catholic Infant.Mortality
## Aigle 0.5357926 0.52952 0.02834 -0.82415 -1.21620
## Lausanne -1.2918131 1.11428 1.51214 -0.73919 -0.01976
## La Vallee -1.4719995 1.69903 0.77024 -0.97492 -3.05937
## Orbe 0.1968704 0.41257 -0.52809 -0.92640 -1.60424
## Vevey -0.9743417 0.99733 0.67750 -0.58890 0.20659
## Neuchatel -1.3690358 2.16683 1.88309 -0.62534 0.88565
## V. De Geneve -2.0726211 2.40073 3.83058 -0.02371 -0.73116
## Rive Droite -0.1248912 -0.05523 1.60488 0.16777 -0.66649
## Rive Gauche -0.9357303 0.64648 1.60488 0.35474 -0.31079
## Aubonne 0.7717511 -0.28913 -0.43535 -0.97208 -0.37546
## Avenches 0.4800206 0.29562 0.02834 -0.92096 0.78865
## Echallens 0.9905489 0.17867 -0.89904 -0.45304 0.30360
## Morges 0.4414092 0.64648 -0.15714 -0.90202 -0.73116
## Moudon 0.2397719 -0.28913 -0.80630 -0.91883 0.69164
## Yverdon -0.0004767 -0.17218 -0.34261 -0.88143 0.72397
## Entremont 1.5182379 -1.10778 -0.52809 1.33341 -0.14911
## Boudry -0.4766838 1.11428 0.02834 -0.89279 0.01258
## La Chauxdfnd -1.7937611 1.46513 -0.06440 -0.69942 0.07725
## Neuveville -0.2578859 0.06172 0.30655 -0.90368 0.10958
## Porrentruy -0.6096786 -0.87388 -0.43535 1.11779 2.04976
## Grandson -0.6654506 0.06172 -0.34261 -0.94770 -0.08443
## Oron 0.9304868 -0.52303 -0.99178 -0.96900 0.23893
## Payerne 0.3684766 -0.28913 -0.34261 -0.90202 1.14434
## Conthey 1.5611394 -1.57558 -0.89904 1.33412 -1.66891
## Herens 1.7241653 -1.34168 -0.89904 1.34098 -0.63415
## Martigwy 1.2307975 -0.52303 -0.52809 1.31637 -0.27845
## Val de Ruz -0.5110050 -0.17218 -0.43535 -0.90818 -0.08443
## Courtelary -1.3947768 -0.17218 0.02834 -0.79007 0.62696
## Delemont -0.1892435 -1.22473 -0.24988 0.98218 0.62696
## Franches-Mnt -0.4209118 -1.34168 -0.62083 1.18477 -0.01976
## Glane 0.7846215 -0.28913 -0.34261 1.27377 1.50004
## Gruyere 0.1625492 -0.52303 -0.43535 1.28584 0.23893
## Sarine -0.1849533 -0.05523 0.12107 1.13696 1.33836
## Veveyse 0.6430464 -0.28913 -0.52809 1.30808 1.37070
## Monthey 0.6602070 -1.10778 -0.80630 1.29885 -0.01976
## Sierre 1.5053674 -1.57558 -0.80630 1.32820 -1.28087
Preprocess separately - Imputing
# Randomly flag about half of the training rows (Bernoulli draw, p = 0.5).
brokenIndex <- rbinom(nrow(training), size = 1, prob = 0.5) == 1
# Copy the outcome and blank out the flagged rows to simulate missing data.
training$FertilityBroken <- training$Fertility
training[brokenIndex, "FertilityBroken"] <- NA
#Except For Broken Columns
ex_brk <- names(training) != "FertilityBroken"
# Columns used for imputation: everything except the intact Fertility copy.
# FertilityBroken must be included here; it is the only column containing NAs,
# so excluding it (as ex_brk does) leaves knnImpute with nothing to fill in.
impute_cols <- names(training) != "Fertility"
fixed <- preProcess(training[, impute_cols], method = "knnImpute")
head(training$FertilityBroken)
## [1] NA NA 54.3 NA 58.3 64.4
# knnImpute also centers and scales, so the imputed values are standardized.
head(predict(fixed, training[, impute_cols])$FertilityBroken)
## [1] -0.4554 -1.0841 -1.1889 -0.9569 -0.8895 -0.4329