## Lakeshia Legette Jones
## Assignment 2a
## Code from Section 9.3: Scale Data
## Scaling divides every value by the standard deviation of its column:
## scaled_val = x / sd(x)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
data(iris)
dim(iris)
## [1] 150 5
# Summarize the four numeric measurement columns (min, quartiles, median,
# mean, max). Column 5 (Species) is left out.
summary(iris[, 1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
# Estimate the scaling parameters from the dataset. Only the numeric columns
# are passed in, so preProcess() has nothing to ignore ("ignored (0)" below).
preprocess_params <- preProcess(iris[, 1:4], method = "scale")
# Print a summary of the estimated pre-processing steps
print(preprocess_params)
## Created from 150 samples and 4 variables
##
## Pre-processing:
## - ignored (0)
## - scaled (4)
# Transform the data using the estimated parameters
transformed <- predict(preprocess_params, iris[, 1:4])
# Summarize the transformed dataset
summary(transformed)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :5.193 Min. : 4.589 Min. :0.5665 Min. :0.1312
## 1st Qu.:6.159 1st Qu.: 6.424 1st Qu.:0.9064 1st Qu.:0.3936
## Median :7.004 Median : 6.883 Median :2.4642 Median :1.7055
## Mean :7.057 Mean : 7.014 Mean :2.1288 Mean :1.5734
## 3rd Qu.:7.729 3rd Qu.: 7.571 3rd Qu.:2.8890 3rd Qu.:2.3615
## Max. :9.540 Max. :10.095 Max. :3.9087 Max. :3.2798
# Check the result: every column should now have a standard deviation of 1.
# vapply() visits the data frame column by column; apply() would first coerce
# the whole data frame to a matrix.
vapply(transformed, sd, numeric(1))
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 1 1 1
## Code from Section 9.4: Centering
# Centering subtracts the column mean from every value:
# transform_val = x - mean(x)
library(caret)
data(iris)
summary(iris[, 1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
# Estimate the centering parameters from the dataset
preprocess_params <- preProcess(iris[, 1:4], method = "center")
# Print a summary of the estimated pre-processing steps
print(preprocess_params)
## Created from 150 samples and 4 variables
##
## Pre-processing:
## - centered (4)
## - ignored (0)
# Transform the data using the estimated parameters
transformed <- predict(preprocess_params, iris[, 1:4])
# Summarize the transformed dataset
summary(transformed)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :-1.54333 Min. :-1.05733 Min. :-2.758 Min. :-1.0993
## 1st Qu.:-0.74333 1st Qu.:-0.25733 1st Qu.:-2.158 1st Qu.:-0.8993
## Median :-0.04333 Median :-0.05733 Median : 0.592 Median : 0.1007
## Mean : 0.00000 Mean : 0.00000 Mean : 0.000 Mean : 0.0000
## 3rd Qu.: 0.55667 3rd Qu.: 0.24267 3rd Qu.: 1.342 3rd Qu.: 0.6007
## Max. : 2.05667 Max. : 1.34267 Max. : 3.142 Max. : 1.3007
# Check the result: every column mean should be 0 up to floating-point
# rounding error. vapply() visits the data frame column by column; apply()
# would first coerce the whole data frame to a matrix.
vapply(transformed, mean, numeric(1))
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## -4.315067e-16 -2.391605e-17 -1.095420e-16 -3.545312e-16
## Code from Section 9.5: Standardize Data
# Standardizing combines the "center" and "scale" methods:
# transform_val = (x - mean(x)) / sd(x)
# Each column ends up with a mean of 0 and a standard deviation of 1,
# i.e. the values become z-scores.
library(caret)
data(iris)
summary(iris[, 1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
# Estimate the centering and scaling parameters from the dataset
preprocess_params <- preProcess(iris[, 1:4], method = c("center", "scale"))
# Print a summary of the estimated pre-processing steps
print(preprocess_params)
## Created from 150 samples and 4 variables
##
## Pre-processing:
## - centered (4)
## - ignored (0)
## - scaled (4)
# Transform the data using the estimated parameters
transformed <- predict(preprocess_params, iris[, 1:4])
# Summarize the transformed dataset
summary(transformed)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :-1.86378 Min. :-2.4258 Min. :-1.5623 Min. :-1.4422
## 1st Qu.:-0.89767 1st Qu.:-0.5904 1st Qu.:-1.2225 1st Qu.:-1.1799
## Median :-0.05233 Median :-0.1315 Median : 0.3354 Median : 0.1321
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.67225 3rd Qu.: 0.5567 3rd Qu.: 0.7602 3rd Qu.: 0.7880
## Max. : 2.48370 Max. : 3.0805 Max. : 1.7799 Max. : 1.7064
# Check the results: column means should be 0 (up to rounding error) and
# column standard deviations should be 1. vapply() visits the data frame
# column by column; apply() would first coerce it to a matrix.
vapply(transformed, mean, numeric(1))
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## -1.556718e-15 7.096175e-16 1.361874e-16 -2.782959e-16
vapply(transformed, sd, numeric(1))
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 1 1 1
## Code from Section 9.6: Normalize Data
## Normalization rescales the data values into the range [0, 1]:
## x_normalized = (x - min(x)) / (max(x) - min(x))
## For a general target range [a, b]:
## x_normalized = a + (x - min(x)) * (b - a) / (max(x) - min(x))
##
library(caret)
data(iris)
summary(iris[, 1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
# Estimate the range parameters from the dataset
preprocess_params <- preProcess(iris[, 1:4], method = "range")
# Print a summary of the estimated pre-processing steps
print(preprocess_params)
## Created from 150 samples and 4 variables
##
## Pre-processing:
## - ignored (0)
## - re-scaling to [0, 1] (4)
# Transform the data using the estimated parameters
transformed <- predict(preprocess_params, iris[, 1:4])
# Summarize the transformed dataset
summary(transformed)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.2222 1st Qu.:0.3333 1st Qu.:0.1017 1st Qu.:0.08333
## Median :0.4167 Median :0.4167 Median :0.5678 Median :0.50000
## Mean :0.4287 Mean :0.4406 Mean :0.4675 Mean :0.45806
## 3rd Qu.:0.5833 3rd Qu.:0.5417 3rd Qu.:0.6949 3rd Qu.:0.70833
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
# Check the results: every column should now span exactly 0 to 1.
# vapply() visits the data frame column by column; apply() would first coerce
# the whole data frame to a matrix.
vapply(transformed, min, numeric(1))
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 0 0 0 0
vapply(transformed, max, numeric(1))
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 1 1 1
## Code from Section 9.7: Box-Cox Transform
## A transform that reduces skew so that an attribute looks more Gaussian.
## It assumes all values are positive.
library(mlbench) # gives access to the benchmark datasets
library(caret)
data(PimaIndiansDiabetes)
dim(PimaIndiansDiabetes)
## [1] 768 9
# Pick out pedigree and age (columns 7 and 8) by name and summarize them
pedigree_age <- PimaIndiansDiabetes[, c("pedigree", "age")]
summary(pedigree_age)
## pedigree age
## Min. :0.0780 Min. :21.00
## 1st Qu.:0.2437 1st Qu.:24.00
## Median :0.3725 Median :29.00
## Mean :0.4719 Mean :33.24
## 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :2.4200 Max. :81.00
# Estimate the Box-Cox parameters (one lambda per column) from the dataset
preprocess_params <- preProcess(pedigree_age, method = "BoxCox")
# Print a summary of the estimated pre-processing steps
print(preprocess_params)
## Created from 768 samples and 2 variables
##
## Pre-processing:
## - Box-Cox transformation (2)
## - ignored (0)
##
## Lambda estimates for Box-Cox transformation:
## -0.1, -1.1
# Transform the data using the estimated parameters
transformed <- predict(preprocess_params, pedigree_age)
# Summarize the transformed pedigree and age columns
summary(transformed)
## pedigree age
## Min. :-2.5510 Min. :0.8772
## 1st Qu.:-1.4116 1st Qu.:0.8815
## Median :-0.9875 Median :0.8867
## Mean :-0.9599 Mean :0.8874
## 3rd Qu.:-0.4680 3rd Qu.:0.8938
## Max. : 0.8838 Max. :0.9019
## Code from Section 9.8: Yeo-Johnson Transform
## Another power transform that, like Box-Cox, makes an attribute more Gaussian.
## Unlike Box-Cox, it also supports zero and negative values.
library(mlbench)
library(caret)
data(PimaIndiansDiabetes)
# Pick out pedigree and age (columns 7 and 8) by name and summarize them
pedigree_age <- PimaIndiansDiabetes[, c("pedigree", "age")]
summary(pedigree_age)
## pedigree age
## Min. :0.0780 Min. :21.00
## 1st Qu.:0.2437 1st Qu.:24.00
## Median :0.3725 Median :29.00
## Mean :0.4719 Mean :33.24
## 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :2.4200 Max. :81.00
# Estimate the Yeo-Johnson parameters (one lambda per column) from the dataset
preprocess_params <- preProcess(pedigree_age, method = "YeoJohnson")
# Print a summary of the estimated pre-processing steps
print(preprocess_params)
## Created from 768 samples and 2 variables
##
## Pre-processing:
## - ignored (0)
## - Yeo-Johnson transformation (2)
##
## Lambda estimates for Yeo-Johnson transformation:
## -2.25, -1.15
# Transform the data using the estimated parameters
transformed <- predict(preprocess_params, pedigree_age)
# Summarize the transformed pedigree and age columns
summary(transformed)
## pedigree age
## Min. :0.0691 Min. :0.8450
## 1st Qu.:0.1724 1st Qu.:0.8484
## Median :0.2265 Median :0.8524
## Mean :0.2317 Mean :0.8530
## 3rd Qu.:0.2956 3rd Qu.:0.8580
## Max. :0.4164 Max. :0.8644
## Code from Section 9.9: Principal Component Analysis (PCA) Transformation
## PCA transforms the data to return only the principal components, a technique
## from multivariate statistics and linear algebra. The transform keeps those components
## above the variance threshold (default = 0.95) or the number of components can be
## specified (pcaComp). The result is attributes that are uncorrelated, useful for
## algorithms like linear and generalized linear regression.
# preProcess() comes from caret, so this section loads caret. mlbench is not
# needed here because nothing in this section uses it.
library(caret)
data(iris)
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
# Estimate the centering, scaling and PCA parameters from the four numeric
# columns of the dataset
preprocess_params <- preProcess(
  iris[, 1:4],
  method = c("center", "scale", "pca")
)
# Print a summary of the estimated pre-processing steps
print(preprocess_params)
## Created from 150 samples and 4 variables
##
## Pre-processing:
## - centered (4)
## - ignored (0)
## - principal component signal extraction (4)
## - scaled (4)
##
## PCA needed 2 components to capture 95 percent of the variance
# Transform the data using the estimated parameters
transformed <- predict(preprocess_params, iris[, 1:4])
# Summarize the transformed dataset (one column per retained component)
summary(transformed)
## PC1 PC2
## Min. :-2.7651 Min. :-2.67732
## 1st Qu.:-2.0957 1st Qu.:-0.59205
## Median : 0.4169 Median :-0.01744
## Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 1.3385 3rd Qu.: 0.59649
## Max. : 3.2996 Max. : 2.64521
# Check the results: standard deviation of each retained component.
# vapply() visits the data frame column by column; apply() would first coerce
# the whole data frame to a matrix.
vapply(transformed, sd, numeric(1))
## PC1 PC2
## 1.7083611 0.9560494
## Code from Section 9.10: Independent Component Analysis (ICA) Transformation
## Transform the data to the independent components. Unlike PCA, ICA retains those components that
## are independent. You must specify the number of desired independent components with the
## n.comp argument. This transformation may be useful for algorithms such as Naive Bayes.
library(mlbench)
library(caret)
library(fastICA)
data(PimaIndiansDiabetes)
# Summarize the eight input attributes (columns 1 to 8)
summary(PimaIndiansDiabetes[, 1:8])
## pregnant glucose pressure triceps
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## insulin mass pedigree age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
# fastICA starts from a randomly generated un-mixing matrix, so fix the seed
# to get the same components on every run.
# NOTE(review): the output pasted below was captured from a run without a
# seed, so the sign and order of the ICA columns printed by this seeded run
# may differ from it.
set.seed(7)
# Estimate the centering, scaling and ICA parameters from the dataset
preprocess_params <- preProcess(
  PimaIndiansDiabetes[, 1:8],
  method = c("center", "scale", "ica"),
  n.comp = 5
)
# Print a summary of the estimated pre-processing steps
print(preprocess_params)
## Created from 768 samples and 8 variables
##
## Pre-processing:
## - centered (8)
## - independent component signal extraction (8)
## - ignored (0)
## - scaled (8)
##
## ICA used 5 components
# Transform the data using the estimated parameters
transformed <- predict(preprocess_params, PimaIndiansDiabetes[, 1:8])
# Summarize the transformed dataset (the five independent components)
summary(transformed)
## ICA1 ICA2 ICA3 ICA4
## Min. :-2.38147 Min. :-3.0651 Min. :-4.89714 Min. :-1.5756
## 1st Qu.:-0.73690 1st Qu.:-0.7689 1st Qu.:-0.48296 1st Qu.:-0.6827
## Median : 0.07421 Median : 0.2780 Median : 0.02382 Median :-0.2602
## Mean : 0.00000 Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.72058 3rd Qu.: 0.8444 3rd Qu.: 0.59374 3rd Qu.: 0.4283
## Max. : 2.92924 Max. : 1.4105 Max. : 4.17316 Max. : 6.0210
## ICA5
## Min. :-5.5350
## 1st Qu.:-0.4714
## Median : 0.1392
## Mean : 0.0000
## 3rd Qu.: 0.6490
## Max. : 3.2277
## What is standardization?
## Standardization is the use of both centering and scaling. Centering subtracts the mean from
## each value. Scaling divides the result by the standard deviation. So the net effect is the
## calculation of the z-score: each attribute is converted to a mean of 0 and a standard deviation of 1.
## What is normalization?
## Normalization takes the data values and scales them into the range of [0, 1].
## What are the pre-processing methods/techniques?
## Pre-processing techniques include data cleansing, data integration, data transformation, scaling, discretization and feature extraction. The focus of this lesson was data
## transformation.
## List transform methods. Which is the best?
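## There are three categories of transforms: basic, power, and linear transforms.
## Basic includes scaling, centering, standardization and normalization.
## Power includes Box-Cox and Yeo-Johnson.
## Linear includes Principal Component Analysis (PCA) and Independent Component Analysis (ICA).
## No single transform is best in general; the right choice depends on the data and on the algorithm being used. For example, Box-Cox assumes all values are positive, while Yeo-Johnson also supports zero and negative values.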
## What is scaling data?
## To scale means to divide each value by the standard deviation. It is useful for scaling data with a Gaussian distribution.
## Explain the center transform.
## Centering subtracts the mean of an attribute from each of its values, so the transformed attribute has a mean of 0.
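## Explain characteristics of Pima Indians Diabetes & Iris.
## Pima Indians Diabetes has 768 rows and 9 columns; the first 8 columns are the numeric attributes pregnant, glucose, pressure, triceps, insulin, mass, pedigree and age.
## Iris has 150 rows and 5 columns: four numeric measurements (Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) and Species, with 50 rows each of setosa, versicolor and virginica.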
## How many columns in Pima Indians Diabetes?
## There are 9 columns in the Pima Indians Diabetes.
## How many rows in Iris?
## There are 150 rows in the Iris dataset.
## Explain PCA & ICA.
## PCA is a dimensionality reduction and machine learning method used to simplify a large data set into a smaller set while still maintaining significant patterns and trends. This exercise resulted in 2 principal components being selected.
## ICA transforms the data into independent components. It retains those components that are independent. The number of desired independent components must be specified with the n.comp argument.
## Give your understanding of the pre-processing process.
## Pre-processing in this lesson focused on data transformation. These transformations are necessary to prepare data to get the best results from machine learning algorithms. The aim is to get the best accuracy from machine learning algorithms on the datasets.