Scale Data

The following code apply scale transform

# load packages
library(caret)

## Loading required package: ggplot2

## Loading required package: lattice

# load the dataset
data(iris)
# summarize data
summary(iris[,1:4])

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500

# calculate the pre-process parameters from the dataset
preprocessParams <- preProcess(iris[,1:4], method=c("scale"))
# summarize transform parameters
print(preprocessParams)

## Created from 150 samples and 4 variables
## 
## Pre-processing:
##   - ignored (0)
##   - scaled (4)

# transform the dataset using the parameters
transformed <- predict(preprocessParams, iris[,1:4])
# summarize the transformed dataset
summary(transformed)

##   Sepal.Length    Sepal.Width      Petal.Length     Petal.Width    
##  Min.   :5.193   Min.   : 4.589   Min.   :0.5665   Min.   :0.1312  
##  1st Qu.:6.159   1st Qu.: 6.424   1st Qu.:0.9064   1st Qu.:0.3936  
##  Median :7.004   Median : 6.883   Median :2.4642   Median :1.7055  
##  Mean   :7.057   Mean   : 7.014   Mean   :2.1288   Mean   :1.5734  
##  3rd Qu.:7.729   3rd Qu.: 7.571   3rd Qu.:2.8890   3rd Qu.:2.3615  
##  Max.   :9.540   Max.   :10.095   Max.   :3.9087   Max.   :3.2798

Center Data

The following algorithm applies the center data transform.

# load packages
library(caret)
# load the dataset
data(iris)
# summarize data
summary(iris[,1:4])

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500

# calculate the pre-process parameters from the dataset
preprocessParams <- preProcess(iris[,1:4], method=c("center"))
# summarize transform parameters
print(preprocessParams)

## Created from 150 samples and 4 variables
## 
## Pre-processing:
##   - centered (4)
##   - ignored (0)

# transform the dataset using the parameters
transformed <- predict(preprocessParams, iris[,1:4])
# summarize the transformed dataset
summary(transformed)

##   Sepal.Length       Sepal.Width        Petal.Length     Petal.Width     
##  Min.   :-1.54333   Min.   :-1.05733   Min.   :-2.758   Min.   :-1.0993  
##  1st Qu.:-0.74333   1st Qu.:-0.25733   1st Qu.:-2.158   1st Qu.:-0.8993  
##  Median :-0.04333   Median :-0.05733   Median : 0.592   Median : 0.1007  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.000   Mean   : 0.0000  
##  3rd Qu.: 0.55667   3rd Qu.: 0.24267   3rd Qu.: 1.342   3rd Qu.: 0.6007  
##  Max.   : 2.05667   Max.   : 1.34267   Max.   : 3.142   Max.   : 1.3007

Standardize Data

The following algorithm applies the standardize transform.

# load packages
library(caret)
# load the dataset
data(iris)
# summarize data
summary(iris[,1:4])

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500

# calculate the pre-process parameters from the dataset
preprocessParams <- preProcess(iris[,1:4], method=c("center", "scale"))
# summarize transform parameters
print(preprocessParams)

## Created from 150 samples and 4 variables
## 
## Pre-processing:
##   - centered (4)
##   - ignored (0)
##   - scaled (4)

# transform the dataset using the parameters
transformed <- predict(preprocessParams, iris[,1:4])
# summarize the transformed dataset
summary(transformed)

##   Sepal.Length       Sepal.Width       Petal.Length      Petal.Width     
##  Min.   :-1.86378   Min.   :-2.4258   Min.   :-1.5623   Min.   :-1.4422  
##  1st Qu.:-0.89767   1st Qu.:-0.5904   1st Qu.:-1.2225   1st Qu.:-1.1799  
##  Median :-0.05233   Median :-0.1315   Median : 0.3354   Median : 0.1321  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.67225   3rd Qu.: 0.5567   3rd Qu.: 0.7602   3rd Qu.: 0.7880  
##  Max.   : 2.48370   Max.   : 3.0805   Max.   : 1.7799   Max.   : 1.7064

Normalize Data

The following algorithm applies the normalize transform

# load packages
library(caret)
# load the dataset
data(iris)
# summarize data
summary(iris[,1:4])

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500

# calculate the pre-process parameters from the dataset
preprocessParams <- preProcess(iris[,1:4], method=c("range"))
# summarize transform parameters
print(preprocessParams)

## Created from 150 samples and 4 variables
## 
## Pre-processing:
##   - ignored (0)
##   - re-scaling to [0, 1] (4)

# transform the dataset using the parameters
transformed <- predict(preprocessParams, iris[,1:4])
# summarize the transformed dataset
summary(transformed)

##   Sepal.Length     Sepal.Width      Petal.Length     Petal.Width     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.2222   1st Qu.:0.3333   1st Qu.:0.1017   1st Qu.:0.08333  
##  Median :0.4167   Median :0.4167   Median :0.5678   Median :0.50000  
##  Mean   :0.4287   Mean   :0.4406   Mean   :0.4675   Mean   :0.45806  
##  3rd Qu.:0.5833   3rd Qu.:0.5417   3rd Qu.:0.6949   3rd Qu.:0.70833  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000

Box-Cox Transform

The following algorithm applies the Box-Cox transform to the data

# load packages
library(mlbench)
library(caret)
# load the dataset
data(PimaIndiansDiabetes)
# summarize pedigree and age
summary(PimaIndiansDiabetes[,7:8])

##     pedigree           age       
##  Min.   :0.0780   Min.   :21.00  
##  1st Qu.:0.2437   1st Qu.:24.00  
##  Median :0.3725   Median :29.00  
##  Mean   :0.4719   Mean   :33.24  
##  3rd Qu.:0.6262   3rd Qu.:41.00  
##  Max.   :2.4200   Max.   :81.00

# calculate the pre-process parameters from the dataset
preprocessParams <- preProcess(PimaIndiansDiabetes[,7:8], method=c("BoxCox"))
# summarize transform parameters
print(preprocessParams)

## Created from 768 samples and 2 variables
## 
## Pre-processing:
##   - Box-Cox transformation (2)
##   - ignored (0)
## 
## Lambda estimates for Box-Cox transformation:
## -0.1, -1.1

# transform the dataset using the parameters
transformed <- predict(preprocessParams, PimaIndiansDiabetes[,7:8])
# summarize the transformed dataset (note pedigree and age)
summary(transformed)

##     pedigree            age        
##  Min.   :-2.5510   Min.   :0.8772  
##  1st Qu.:-1.4116   1st Qu.:0.8815  
##  Median :-0.9875   Median :0.8867  
##  Mean   :-0.9599   Mean   :0.8874  
##  3rd Qu.:-0.4680   3rd Qu.:0.8938  
##  Max.   : 0.8838   Max.   :0.9019

Yeo-Johnson Transform

# load packages
library(mlbench)
library(caret)
# load the dataset
data(PimaIndiansDiabetes)
# summarize pedigree and age
summary(PimaIndiansDiabetes[,7:8])

##     pedigree           age       
##  Min.   :0.0780   Min.   :21.00  
##  1st Qu.:0.2437   1st Qu.:24.00  
##  Median :0.3725   Median :29.00  
##  Mean   :0.4719   Mean   :33.24  
##  3rd Qu.:0.6262   3rd Qu.:41.00  
##  Max.   :2.4200   Max.   :81.00

# calculate the pre-process parameters from the dataset
preprocessParams <- preProcess(PimaIndiansDiabetes[,7:8], method=c("YeoJohnson"))
# summarize transform parameters
print(preprocessParams)

## Created from 768 samples and 2 variables
## 
## Pre-processing:
##   - ignored (0)
##   - Yeo-Johnson transformation (2)
## 
## Lambda estimates for Yeo-Johnson transformation:
## -2.25, -1.15

# transform the dataset using the parameters
transformed <- predict(preprocessParams, PimaIndiansDiabetes[,7:8])
# summarize the transformed dataset (note pedigree and age)
summary(transformed)

##     pedigree           age        
##  Min.   :0.0691   Min.   :0.8450  
##  1st Qu.:0.1724   1st Qu.:0.8484  
##  Median :0.2265   Median :0.8524  
##  Mean   :0.2317   Mean   :0.8530  
##  3rd Qu.:0.2956   3rd Qu.:0.8580  
##  Max.   :0.4164   Max.   :0.8644

Principal component analysis

# load the packages
library(mlbench)
# load the dataset
data(iris)
# summarize dataset
summary(iris)

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
##

# calculate the pre-process parameters from the dataset
preprocessParams <- preProcess(iris, method=c("center", "scale", "pca"))
# summarize transform parameters
print(preprocessParams)

## Created from 150 samples and 5 variables
## 
## Pre-processing:
##   - centered (4)
##   - ignored (1)
##   - principal component signal extraction (4)
##   - scaled (4)
## 
## PCA needed 2 components to capture 95 percent of the variance

# transform the dataset using the parameters
transformed <- predict(preprocessParams, iris)
# summarize the transformed dataset
summary(transformed)

##        Species        PC1               PC2          
##  setosa    :50   Min.   :-2.7651   Min.   :-2.67732  
##  versicolor:50   1st Qu.:-2.0957   1st Qu.:-0.59205  
##  virginica :50   Median : 0.4169   Median :-0.01744  
##                  Mean   : 0.0000   Mean   : 0.00000  
##                  3rd Qu.: 1.3385   3rd Qu.: 0.59649  
##                  Max.   : 3.2996   Max.   : 2.64521

Independent component analysis transform

# load packages
library(mlbench)
library(caret)
# load the dataset
data(PimaIndiansDiabetes)
# summarize dataset
summary(PimaIndiansDiabetes[,1:8])

##     pregnant         glucose         pressure         triceps     
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     insulin           mass          pedigree           age       
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780   Min.   :21.00  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437   1st Qu.:24.00  
##  Median : 30.5   Median :32.00   Median :0.3725   Median :29.00  
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719   Mean   :33.24  
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262   3rd Qu.:41.00  
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200   Max.   :81.00

# calculate the pre-process parameters from the dataset
preprocessParams <- preProcess(PimaIndiansDiabetes[,1:8], method=c("center", "scale", "ica"), n.comp=5)
# summarize transform parameters
print(preprocessParams)

## Created from 768 samples and 8 variables
## 
## Pre-processing:
##   - centered (8)
##   - independent component signal extraction (8)
##   - ignored (0)
##   - scaled (8)
## 
## ICA used 5 components

# transform the dataset using the parameters
transformed <- predict(preprocessParams, PimaIndiansDiabetes[,1:8])
# summarize the transformed dataset
summary(transformed)

##       ICA1              ICA2              ICA3               ICA4         
##  Min.   :-3.0680   Min.   :-5.5393   Min.   :-2.93627   Min.   :-4.89541  
##  1st Qu.:-0.7687   1st Qu.:-0.4690   1st Qu.:-0.72169   1st Qu.:-0.48294  
##  Median : 0.2797   Median : 0.1379   Median :-0.07297   Median : 0.02362  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.00000  
##  3rd Qu.: 0.8422   3rd Qu.: 0.6496   3rd Qu.: 0.73750   3rd Qu.: 0.59407  
##  Max.   : 1.4134   Max.   : 3.2204   Max.   : 2.38132   Max.   : 4.17389  
##       ICA5        
##  Min.   :-6.0266  
##  1st Qu.:-0.4305  
##  Median : 0.2600  
##  Mean   : 0.0000  
##  3rd Qu.: 0.6842  
##  Max.   : 1.5788

Importin Pima indians diabetes

# load the dataset
data(PimaIndiansDiabetes)
summary(PimaIndiansDiabetes)

##     pregnant         glucose         pressure         triceps     
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     insulin           mass          pedigree           age        diabetes 
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780   Min.   :21.00   neg:500  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437   1st Qu.:24.00   pos:268  
##  Median : 30.5   Median :32.00   Median :0.3725   Median :29.00            
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719   Mean   :33.24            
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262   3rd Qu.:41.00            
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200   Max.   :81.00

colnames(PimaIndiansDiabetes)

## [1] "pregnant" "glucose"  "pressure" "triceps"  "insulin"  "mass"     "pedigree"
## [8] "age"      "diabetes"

Importing iris data set

# load the dataset
data(iris)
summary(iris)

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
##

nrow(iris)

## [1] 150

Obtain histogram of Pima indians diabetes dataset

data(PimaIndiansDiabetes)

# Plot histograms of each column
par(mfrow=c(3, 3))  # Sets up a 3x3 grid for plotting (adjust if needed)

# Loop through each column (1 to 8) and plot histogram
for (i in 1:8) {
  hist(PimaIndiansDiabetes[, i], main = colnames(PimaIndiansDiabetes)[i], xlab = colnames(PimaIndiansDiabetes)[i])
}

Obtain histogram of iris dataset

data(iris)

# Plot histograms of each column
par(mfrow=c(2, 2))  # Sets up a 2x2 grid for plotting

# Loop through each column (1 to 4) and plot histogram
for (i in 1:4) {
  hist(iris[, i], main = colnames(iris)[i], xlab = colnames(iris)[i])
}

HW2

Carlos Olivos

2024-06-29