Scale Data
# install packages
# install.packages('caret')
# load packages
library(caret)
# load the dataset
data(iris)
# summarize data
summary(iris[,1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
# calculate the pre-process parameters from the dataset
preprocessParams <- preProcess(iris[,1:4], method= c('scale'))
# summarize transform parameters
print(preprocessParams)
## Created from 150 samples and 4 variables
##
## Pre-processing:
## - ignored (0)
## - scaled (4)
# transform the dataset using the parameters
transformed <- predict(preprocessParams, iris[,1:4])
summary(transformed)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :5.193 Min. : 4.589 Min. :0.5665 Min. :0.1312
## 1st Qu.:6.159 1st Qu.: 6.424 1st Qu.:0.9064 1st Qu.:0.3936
## Median :7.004 Median : 6.883 Median :2.4642 Median :1.7055
## Mean :7.057 Mean : 7.014 Mean :2.1288 Mean :1.5734
## 3rd Qu.:7.729 3rd Qu.: 7.571 3rd Qu.:2.8890 3rd Qu.:2.3615
## Max. :9.540 Max. :10.095 Max. :3.9087 Max. :3.2798
Center Data
# load packages
library(caret)
# load the dataset
data(iris)
# summarize data
summary(iris[,1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
# calculate the pre-process parameters from the dataset
preprocessParams <- preProcess(iris[,1:4], method= c('center'))
# summarize transform parameters
print(preprocessParams)
## Created from 150 samples and 4 variables
##
## Pre-processing:
## - centered (4)
## - ignored (0)
# transform the dataset using the parameters
transformed <- predict(preprocessParams, iris[,1:4])
summary(transformed)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :-1.54333 Min. :-1.05733 Min. :-2.758 Min. :-1.0993
## 1st Qu.:-0.74333 1st Qu.:-0.25733 1st Qu.:-2.158 1st Qu.:-0.8993
## Median :-0.04333 Median :-0.05733 Median : 0.592 Median : 0.1007
## Mean : 0.00000 Mean : 0.00000 Mean : 0.000 Mean : 0.0000
## 3rd Qu.: 0.55667 3rd Qu.: 0.24267 3rd Qu.: 1.342 3rd Qu.: 0.6007
## Max. : 2.05667 Max. : 1.34267 Max. : 3.142 Max. : 1.3007
Standardize Data
# install packages
# install.packages('caret')
# load packages
library(caret)
# load the dataset
data(iris)
# summarize data
summary(iris[,1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
# calculate the pre-process parameters from the dataset
preprocessParams <- preProcess(iris[,1:4], method= c('center','scale'))
# summarize transform parameters
print(preprocessParams)
## Created from 150 samples and 4 variables
##
## Pre-processing:
## - centered (4)
## - ignored (0)
## - scaled (4)
# transform the dataset using the parameters
transformed <- predict(preprocessParams, iris[,1:4])
summary(transformed)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :-1.86378 Min. :-2.4258 Min. :-1.5623 Min. :-1.4422
## 1st Qu.:-0.89767 1st Qu.:-0.5904 1st Qu.:-1.2225 1st Qu.:-1.1799
## Median :-0.05233 Median :-0.1315 Median : 0.3354 Median : 0.1321
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.67225 3rd Qu.: 0.5567 3rd Qu.: 0.7602 3rd Qu.: 0.7880
## Max. : 2.48370 Max. : 3.0805 Max. : 1.7799 Max. : 1.7064
Normalize Data
# install packages
# install.packages('caret')
# load packages
library(caret)
# load the dataset
data(iris)
# summarize data
summary(iris[,1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
# calculate the pre-process parameters from the dataset
preprocessParams <- preProcess(iris[,1:4], method= c('range'))
# summarize transofrm parameters
print(preprocessParams)
## Created from 150 samples and 4 variables
##
## Pre-processing:
## - ignored (0)
## - re-scaling to [0, 1] (4)
# transform the dataset using the parameters
transformed <- predict(preprocessParams, iris[,1:4])
summary(transformed)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.2222 1st Qu.:0.3333 1st Qu.:0.1017 1st Qu.:0.08333
## Median :0.4167 Median :0.4167 Median :0.5678 Median :0.50000
## Mean :0.4287 Mean :0.4406 Mean :0.4675 Mean :0.45806
## 3rd Qu.:0.5833 3rd Qu.:0.5417 3rd Qu.:0.6949 3rd Qu.:0.70833
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
More about IRIS dataset
data(iris)
print('Below is the size and structure of the data and type of each variable:\n')
## [1] "Below is the size and structure of the data and type of each variable:\n"
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
cat("Number of missing values: ", sum(is.na(iris)),'\n')
## Number of missing values: 0
print('Below is the summary of key statistics of the data:\n')
## [1] "Below is the summary of key statistics of the data:\n"
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
# Sketch histograms for 4 columns
par(mfrow = c(2, 2))
for (i in 1:4){
hist(iris[,i], main = 'Histogram', xlab = colnames(iris)[i], ylab = "Frequency")
}

More about Pima Indians Diabetes dataset
data(PimaIndiansDiabetes)
print('Below is the size and structure of the data and type of each variable:\n')
## [1] "Below is the size and structure of the data and type of each variable:\n"
str(PimaIndiansDiabetes)
## 'data.frame': 768 obs. of 9 variables:
## $ pregnant: num 6 1 8 1 0 5 3 10 2 8 ...
## $ glucose : num 148 85 183 89 137 116 78 115 197 125 ...
## $ pressure: num 72 66 64 66 40 74 50 0 70 96 ...
## $ triceps : num 35 29 0 23 35 0 32 0 45 0 ...
## $ insulin : num 0 0 0 94 168 0 88 0 543 0 ...
## $ mass : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ pedigree: num 0.627 0.351 0.672 0.167 2.288 ...
## $ age : num 50 31 32 21 33 30 26 29 53 54 ...
## $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...
cat("Number of missing values: ", sum(is.na(PimaIndiansDiabetes)),'\n')
## Number of missing values: 0
print('Below is the summary of key statistics of the data:\n')
## [1] "Below is the summary of key statistics of the data:\n"
summary(PimaIndiansDiabetes)
## pregnant glucose pressure triceps
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## insulin mass pedigree age diabetes
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00 neg:500
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00 pos:268
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
# Sketch histograms for 9 columns
par(mfrow = c(3, 3))
PimaIndiansDiabetescopy <- PimaIndiansDiabetes
PimaIndiansDiabetescopy[,9] <- as.numeric(PimaIndiansDiabetescopy[,9], levels= c('neg','pos'))-1
for (i in 1:9){
hist(PimaIndiansDiabetescopy[,i], main = 'Histogram', xlab = colnames(PimaIndiansDiabetes)[i], ylab = "Frequency")
}

print('1: diabetic, 0: non diabetic')
## [1] "1: diabetic, 0: non diabetic"