# Set the knitr chunk option echo = TRUE as the default for all chunks, so the
# source code of each chunk is shown alongside its output.
knitr::opts_chunk$set(echo = TRUE)
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
The following code scales the iris data:
# Scale the four numeric iris measurements: caret's "scale" method divides each
# column by its standard deviation (no centering is applied).
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
data(iris)
# Summarise all four measurement columns that are about to be transformed, so
# the "before" summary covers the same variables as the "after" summary.
summary(iris[, 1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
# Estimate the scaling parameters from the data.
preprocessParams <- preProcess(iris[, 1:4], method = c("scale"))
print(preprocessParams)
## Created from 150 samples and 4 variables
##
## Pre-processing:
## - ignored (0)
## - scaled (4)
# Apply the estimated parameters to the same columns.
transformed <- predict(preprocessParams, iris[, 1:4])
summary(transformed)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :5.193 Min. : 4.589 Min. :0.5665 Min. :0.1312
## 1st Qu.:6.159 1st Qu.: 6.424 1st Qu.:0.9064 1st Qu.:0.3936
## Median :7.004 Median : 6.883 Median :2.4642 Median :1.7055
## Mean :7.057 Mean : 7.014 Mean :2.1288 Mean :1.5734
## 3rd Qu.:7.729 3rd Qu.: 7.571 3rd Qu.:2.8890 3rd Qu.:2.3615
## Max. :9.540 Max. :10.095 Max. :3.9087 Max. :3.2798
The following is the code for centering the iris data:
# Center the four numeric iris measurements (subtract each column's mean).
data(iris)
summary(iris[, 1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
# Estimate the centering parameters from the measurement columns.
preprocessParams <- preProcess(x = iris[, 1:4], method = "center")
print(preprocessParams)
## Created from 150 samples and 4 variables
##
## Pre-processing:
## - centered (4)
## - ignored (0)
# Apply the centering; every column mean becomes 0 in the summary below.
transformed <- predict(preprocessParams, newdata = iris[, 1:4])
summary(transformed)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :-1.54333 Min. :-1.05733 Min. :-2.758 Min. :-1.0993
## 1st Qu.:-0.74333 1st Qu.:-0.25733 1st Qu.:-2.158 1st Qu.:-0.8993
## Median :-0.04333 Median :-0.05733 Median : 0.592 Median : 0.1007
## Mean : 0.00000 Mean : 0.00000 Mean : 0.000 Mean : 0.0000
## 3rd Qu.: 0.55667 3rd Qu.: 0.24267 3rd Qu.: 1.342 3rd Qu.: 0.6007
## Max. : 2.05667 Max. : 1.34267 Max. : 3.142 Max. : 1.3007
The following R code standardizes the iris data
# Standardize the four numeric iris measurements: center each column on its
# mean and scale it by its standard deviation.
data(iris)
summary(iris[, 1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
# Both methods must be passed together as one character vector. Writing
# method = "center", "scale" hands "scale" to the next positional argument
# instead of `method`, so only centering would be applied.
preprocessParam <- preProcess(iris[, 1:4], method = c("center", "scale"))
print(preprocessParam)
## Created from 150 samples and 4 variables
##
## Pre-processing:
## - centered (4)
## - ignored (0)
## - scaled (4)
# Apply the standardization; each column now has mean 0 and unit variance.
transformed <- predict(preprocessParam, iris[, 1:4])
summary(transformed)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :-1.86378 Min. :-2.4258 Min. :-1.5623 Min. :-1.4422
## 1st Qu.:-0.89767 1st Qu.:-0.5904 1st Qu.:-1.2225 1st Qu.:-1.1799
## Median :-0.05233 Median :-0.1315 Median : 0.3354 Median : 0.1321
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.67225 3rd Qu.: 0.5567 3rd Qu.: 0.7602 3rd Qu.: 0.7880
## Max. : 2.48370 Max. : 3.0805 Max. : 1.7799 Max. : 1.7064
The following code normalizes the iris data
# Normalize the four numeric iris measurements by re-scaling each column to
# the [0, 1] interval with caret's "range" method.
data(iris)
summary(iris[, 1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
# Estimate the per-column minimum and maximum used for re-scaling.
preprocessParams <- preProcess(x = iris[, 1:4], method = "range")
print(preprocessParams)
## Created from 150 samples and 4 variables
##
## Pre-processing:
## - ignored (0)
## - re-scaling to [0, 1] (4)
# Apply the re-scaling; every column now spans exactly 0 to 1.
transformed <- predict(preprocessParams, newdata = iris[, 1:4])
summary(transformed)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.2222 1st Qu.:0.3333 1st Qu.:0.1017 1st Qu.:0.08333
## Median :0.4167 Median :0.4167 Median :0.5678 Median :0.50000
## Mean :0.4287 Mean :0.4406 Mean :0.4675 Mean :0.45806
## 3rd Qu.:0.5833 3rd Qu.:0.5417 3rd Qu.:0.6949 3rd Qu.:0.70833
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
The following code applies a Box-Cox transform to the PimaIndiansDiabetes dataset:
# Box-Cox transform of the pedigree and age columns (columns 7 and 8) of the
# PimaIndiansDiabetes data shipped with mlbench.
library(mlbench)
data(PimaIndiansDiabetes)
summary(PimaIndiansDiabetes[, c("pedigree", "age")])
## pedigree age
## Min. :0.0780 Min. :21.00
## 1st Qu.:0.2437 1st Qu.:24.00
## Median :0.3725 Median :29.00
## Mean :0.4719 Mean :33.24
## 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :2.4200 Max. :81.00
# Estimate one Box-Cox lambda per column.
preprocessParams <- preProcess(
  x = PimaIndiansDiabetes[, c("pedigree", "age")],
  method = "BoxCox"
)
print(preprocessParams)
## Created from 768 samples and 2 variables
##
## Pre-processing:
## - Box-Cox transformation (2)
## - ignored (0)
##
## Lambda estimates for Box-Cox transformation:
## -0.1, -1.1
# Apply the estimated transformation to the same two columns.
transformed <- predict(
  preprocessParams,
  newdata = PimaIndiansDiabetes[, c("pedigree", "age")]
)
summary(transformed)
## pedigree age
## Min. :-2.5510 Min. :0.8772
## 1st Qu.:-1.4116 1st Qu.:0.8815
## Median :-0.9875 Median :0.8867
## Mean :-0.9599 Mean :0.8874
## 3rd Qu.:-0.4680 3rd Qu.:0.8938
## Max. : 0.8838 Max. :0.9019
The following code applies a Yeo-Johnson transform to the PimaIndiansDiabetes dataset:
# Yeo-Johnson transform of the pedigree and age columns (columns 7 and 8) of
# the PimaIndiansDiabetes data.
data(PimaIndiansDiabetes)
summary(PimaIndiansDiabetes[, c("pedigree", "age")])
## pedigree age
## Min. :0.0780 Min. :21.00
## 1st Qu.:0.2437 1st Qu.:24.00
## Median :0.3725 Median :29.00
## Mean :0.4719 Mean :33.24
## 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :2.4200 Max. :81.00
# Estimate one Yeo-Johnson lambda per column.
preprocessParams <- preProcess(
  x = PimaIndiansDiabetes[, c("pedigree", "age")],
  method = "YeoJohnson"
)
print(preprocessParams)
## Created from 768 samples and 2 variables
##
## Pre-processing:
## - ignored (0)
## - Yeo-Johnson transformation (2)
##
## Lambda estimates for Yeo-Johnson transformation:
## -2.25, -1.15
# Apply the estimated transformation to the same two columns.
transformed <- predict(
  preprocessParams,
  newdata = PimaIndiansDiabetes[, c("pedigree", "age")]
)
summary(transformed)
## pedigree age
## Min. :0.0691 Min. :0.8450
## 1st Qu.:0.1724 1st Qu.:0.8484
## Median :0.2265 Median :0.8524
## Mean :0.2317 Mean :0.8530
## 3rd Qu.:0.2956 3rd Qu.:0.8580
## Max. :0.4164 Max. :0.8644
# Principal component analysis of the iris data: center and scale the numeric
# columns, then project them onto principal components. The whole data frame
# is passed in; the non-numeric Species column is reported as ignored and is
# carried through unchanged.
data(iris)
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
# Estimate centering, scaling and PCA parameters in one step.
preprocessParams <- preProcess(
  x = iris,
  method = c("center", "scale", "pca")
)
print(preprocessParams)
## Created from 150 samples and 5 variables
##
## Pre-processing:
## - centered (4)
## - ignored (1)
## - principal component signal extraction (4)
## - scaled (4)
##
## PCA needed 2 components to capture 95 percent of the variance
# Replace the four measurements with the retained principal components.
transformed <- predict(preprocessParams, newdata = iris)
summary(transformed)
## Species PC1 PC2
## setosa :50 Min. :-2.7651 Min. :-2.67732
## versicolor:50 1st Qu.:-2.0957 1st Qu.:-0.59205
## virginica :50 Median : 0.4169 Median :-0.01744
## Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 1.3385 3rd Qu.: 0.59649
## Max. : 3.2996 Max. : 2.64521
# Independent component analysis of the eight predictor columns of the
# PimaIndiansDiabetes data: center and scale, then extract five components.
# NOTE(review): the component values recorded below presumably vary between
# runs (ICA starts from a random initialisation) -- confirm, and set a seed
# beforehand if reproducible output matters.
data(PimaIndiansDiabetes)
summary(PimaIndiansDiabetes[, 1:8])
## pregnant glucose pressure triceps
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## insulin mass pedigree age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
# Estimate centering, scaling and ICA parameters; n.comp fixes the number of
# independent components to extract.
preprocessParams <- preProcess(
  x = PimaIndiansDiabetes[, 1:8],
  method = c("center", "scale", "ica"),
  n.comp = 5
)
print(preprocessParams)
## Created from 768 samples and 8 variables
##
## Pre-processing:
## - centered (8)
## - independent component signal extraction (8)
## - ignored (0)
## - scaled (8)
##
## ICA used 5 components
# Replace the eight predictors with the five independent components.
transformed <- predict(preprocessParams, newdata = PimaIndiansDiabetes[, 1:8])
summary(transformed)
## ICA1 ICA2 ICA3 ICA4
## Min. :-1.5773 Min. :-2.37945 Min. :-3.2214 Min. :-3.0696
## 1st Qu.:-0.6826 1st Qu.:-0.73881 1st Qu.:-0.6497 1st Qu.:-0.7709
## Median :-0.2595 Median : 0.07479 Median :-0.1379 Median : 0.2781
## Mean : 0.0000 Mean : 0.00000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.4280 3rd Qu.: 0.72086 3rd Qu.: 0.4694 3rd Qu.: 0.8412
## Max. : 6.0203 Max. : 2.93421 Max. : 5.5386 Max. : 1.4155
## ICA5
## Min. :-4.89637
## 1st Qu.:-0.48309
## Median : 0.02395
## Mean : 0.00000
## 3rd Qu.: 0.59431
## Max. : 4.17248