week2_preproc

#use caret and the swiss data
library(caret)

## Loading required package: lattice
## Loading required package: ggplot2

library(datasets)
# List the variables available in the swiss data set.
names(swiss)

## [1] "Fertility"        "Agriculture"      "Examination"     
## [4] "Education"        "Catholic"         "Infant.Mortality"

# Hold out a test set: createDataPartition() picks about 75% of the rows,
# balanced on the Fertility outcome, for training.
# `list = FALSE` is spelled out because `F` is an ordinary variable that can
# be reassigned; with FALSE the row indices come back in a form that can
# index the data frame directly.
part <- createDataPartition(y = swiss$Fertility, p = 0.75, list = FALSE)
training <- swiss[part, ]  # rows selected for model fitting
testing <- swiss[-part, ]  # all remaining rows

Training with preprocessing

# Fit a GLM for Fertility on all other columns, centering and scaling the
# predictors inside train().
# train() selects the learner through `method`; the earlier `model = "glm"`
# is not that argument, so train() silently used its default (random forest).
fit <- train(
  Fertility ~ .,
  data = training,
  preProcess = c("center", "scale"),
  method = "glm"
)

## Loading required package: randomForest
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.

Preprocess separately - center and scale

#"Except For Education"
# Logical mask: TRUE for every column except Education.
ex_ed <- names(training) != "Education"
# Index with the mask itself. Negating it (`-ex_ed`) coerces TRUE/FALSE to
# -1/0, which drops only the first column (Fertility) and keeps Education.
obj <- preProcess(training[, ex_ed], method = c("center", "scale"))
# Apply the learned centering and scaling to the same columns.
predict(obj, training[, ex_ed])

##              Agriculture Examination Education Catholic Infant.Mortality
## Aigle          0.5357926     0.52952   0.02834 -0.82415         -1.21620
## Lausanne      -1.2918131     1.11428   1.51214 -0.73919         -0.01976
## La Vallee     -1.4719995     1.69903   0.77024 -0.97492         -3.05937
## Orbe           0.1968704     0.41257  -0.52809 -0.92640         -1.60424
## Vevey         -0.9743417     0.99733   0.67750 -0.58890          0.20659
## Neuchatel     -1.3690358     2.16683   1.88309 -0.62534          0.88565
## V. De Geneve  -2.0726211     2.40073   3.83058 -0.02371         -0.73116
## Rive Droite   -0.1248912    -0.05523   1.60488  0.16777         -0.66649
## Rive Gauche   -0.9357303     0.64648   1.60488  0.35474         -0.31079
## Aubonne        0.7717511    -0.28913  -0.43535 -0.97208         -0.37546
## Avenches       0.4800206     0.29562   0.02834 -0.92096          0.78865
## Echallens      0.9905489     0.17867  -0.89904 -0.45304          0.30360
## Morges         0.4414092     0.64648  -0.15714 -0.90202         -0.73116
## Moudon         0.2397719    -0.28913  -0.80630 -0.91883          0.69164
## Yverdon       -0.0004767    -0.17218  -0.34261 -0.88143          0.72397
## Entremont      1.5182379    -1.10778  -0.52809  1.33341         -0.14911
## Boudry        -0.4766838     1.11428   0.02834 -0.89279          0.01258
## La Chauxdfnd  -1.7937611     1.46513  -0.06440 -0.69942          0.07725
## Neuveville    -0.2578859     0.06172   0.30655 -0.90368          0.10958
## Porrentruy    -0.6096786    -0.87388  -0.43535  1.11779          2.04976
## Grandson      -0.6654506     0.06172  -0.34261 -0.94770         -0.08443
## Oron           0.9304868    -0.52303  -0.99178 -0.96900          0.23893
## Payerne        0.3684766    -0.28913  -0.34261 -0.90202          1.14434
## Conthey        1.5611394    -1.57558  -0.89904  1.33412         -1.66891
## Herens         1.7241653    -1.34168  -0.89904  1.34098         -0.63415
## Martigwy       1.2307975    -0.52303  -0.52809  1.31637         -0.27845
## Val de Ruz    -0.5110050    -0.17218  -0.43535 -0.90818         -0.08443
## Courtelary    -1.3947768    -0.17218   0.02834 -0.79007          0.62696
## Delemont      -0.1892435    -1.22473  -0.24988  0.98218          0.62696
## Franches-Mnt  -0.4209118    -1.34168  -0.62083  1.18477         -0.01976
## Glane          0.7846215    -0.28913  -0.34261  1.27377          1.50004
## Gruyere        0.1625492    -0.52303  -0.43535  1.28584          0.23893
## Sarine        -0.1849533    -0.05523   0.12107  1.13696          1.33836
## Veveyse        0.6430464    -0.28913  -0.52809  1.30808          1.37070
## Monthey        0.6602070    -1.10778  -0.80630  1.29885         -0.01976
## Sierre         1.5053674    -1.57558  -0.80630  1.32820         -1.28087

Preprocess separately - Imputing

# Blank out Fertility in a copy of the column (each row independently with
# probability 0.5) so that there are missing values to impute.
brokenIndex <- rbinom(nrow(training), size = 1, prob = 0.5) == 1
training$FertilityBroken <- training$Fertility
training[brokenIndex, "FertilityBroken"] <- NA

# Every column except the intact Fertility. The column holding the NAs must be
# part of the data given to preProcess(), otherwise nothing gets imputed; the
# intact copy is left out so it cannot be used to find the neighbours.
ex_orig <- names(training) != "Fertility"

fixed <- preProcess(training[, ex_orig], method = "knnImpute")

head(training$FertilityBroken)

## [1]   NA   NA 54.3   NA 58.3 64.4

# knnImpute also centers and scales, so the filled-in values are on the
# standardized scale, not in the original units.
head(predict(fixed, training[, ex_orig])$FertilityBroken)

## [1] -0.4554 -1.0841 -1.1889 -0.9569 -0.8895 -0.4329

week2_preproc

Kiichi Takeuchi

October 17, 2014

Training with preprocessing

Preprocess separately - center and scale

Preprocess separately - Imputing