Data Mining with R Assignment 2a

Scale Data

# One-time setup: uncomment to install caret if it is missing
# install.packages("caret")
# Attach caret for preProcess() and its predict() method
library(caret)
# Make the iris data frame available
data(iris)
# Descriptive statistics of the four numeric measurement columns
measurements <- iris[, 1:4]
summary(measurements)

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500

# Estimate the "scale" parameters from the numeric columns
preprocessParams <- preProcess(measurements, method = "scale")
# Report which transformations were set up
print(preprocessParams)

## Created from 150 samples and 4 variables
## 
## Pre-processing:
##   - ignored (0)
##   - scaled (4)

# Apply the estimated parameters to the same columns, then summarize
transformed <- predict(preprocessParams, newdata = measurements)
summary(transformed)

##   Sepal.Length    Sepal.Width      Petal.Length     Petal.Width    
##  Min.   :5.193   Min.   : 4.589   Min.   :0.5665   Min.   :0.1312  
##  1st Qu.:6.159   1st Qu.: 6.424   1st Qu.:0.9064   1st Qu.:0.3936  
##  Median :7.004   Median : 6.883   Median :2.4642   Median :1.7055  
##  Mean   :7.057   Mean   : 7.014   Mean   :2.1288   Mean   :1.5734  
##  3rd Qu.:7.729   3rd Qu.: 7.571   3rd Qu.:2.8890   3rd Qu.:2.3615  
##  Max.   :9.540   Max.   :10.095   Max.   :3.9087   Max.   :3.2798

Center Data

# Attach caret for preProcess() and its predict() method
library(caret)
# Make the iris data frame available
data(iris)
# Descriptive statistics of the four numeric measurement columns
measurements <- iris[, 1:4]
summary(measurements)

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500

# Estimate the "center" parameters from the numeric columns
preprocessParams <- preProcess(measurements, method = "center")
# Report which transformations were set up
print(preprocessParams)

## Created from 150 samples and 4 variables
## 
## Pre-processing:
##   - centered (4)
##   - ignored (0)

# Apply the estimated parameters; each column mean becomes 0 (see summary)
transformed <- predict(preprocessParams, newdata = measurements)
summary(transformed)

##   Sepal.Length       Sepal.Width        Petal.Length     Petal.Width     
##  Min.   :-1.54333   Min.   :-1.05733   Min.   :-2.758   Min.   :-1.0993  
##  1st Qu.:-0.74333   1st Qu.:-0.25733   1st Qu.:-2.158   1st Qu.:-0.8993  
##  Median :-0.04333   Median :-0.05733   Median : 0.592   Median : 0.1007  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.000   Mean   : 0.0000  
##  3rd Qu.: 0.55667   3rd Qu.: 0.24267   3rd Qu.: 1.342   3rd Qu.: 0.6007  
##  Max.   : 2.05667   Max.   : 1.34267   Max.   : 3.142   Max.   : 1.3007

Standardize Data

# One-time setup: uncomment to install caret if it is missing
# install.packages("caret")
# Attach caret for preProcess() and its predict() method
library(caret)
# Make the iris data frame available
data(iris)
# Descriptive statistics of the four numeric measurement columns
measurements <- iris[, 1:4]
summary(measurements)

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500

# Standardize = center and scale together; estimate both parameter sets
standardize_steps <- c("center", "scale")
preprocessParams <- preProcess(measurements, method = standardize_steps)
# Report which transformations were set up
print(preprocessParams)

## Created from 150 samples and 4 variables
## 
## Pre-processing:
##   - centered (4)
##   - ignored (0)
##   - scaled (4)

# Apply the estimated parameters to the same columns, then summarize
transformed <- predict(preprocessParams, newdata = measurements)
summary(transformed)

##   Sepal.Length       Sepal.Width       Petal.Length      Petal.Width     
##  Min.   :-1.86378   Min.   :-2.4258   Min.   :-1.5623   Min.   :-1.4422  
##  1st Qu.:-0.89767   1st Qu.:-0.5904   1st Qu.:-1.2225   1st Qu.:-1.1799  
##  Median :-0.05233   Median :-0.1315   Median : 0.3354   Median : 0.1321  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.67225   3rd Qu.: 0.5567   3rd Qu.: 0.7602   3rd Qu.: 0.7880  
##  Max.   : 2.48370   Max.   : 3.0805   Max.   : 1.7799   Max.   : 1.7064

Normalize Data

# One-time setup: uncomment to install caret if it is missing
# install.packages("caret")
# Attach caret for preProcess() and its predict() method
library(caret)
# Make the iris data frame available
data(iris)
# Descriptive statistics of the four numeric measurement columns
measurements <- iris[, 1:4]
summary(measurements)

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500

# Estimate the "range" parameters (re-scaling to [0, 1]) per column
preprocessParams <- preProcess(measurements, method = "range")
# Summarize the transform parameters
print(preprocessParams)

## Created from 150 samples and 4 variables
## 
## Pre-processing:
##   - ignored (0)
##   - re-scaling to [0, 1] (4)

# Apply the estimated parameters; every column now spans 0 to 1 (see summary)
transformed <- predict(preprocessParams, newdata = measurements)
summary(transformed)

##   Sepal.Length     Sepal.Width      Petal.Length     Petal.Width     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.2222   1st Qu.:0.3333   1st Qu.:0.1017   1st Qu.:0.08333  
##  Median :0.4167   Median :0.4167   Median :0.5678   Median :0.50000  
##  Mean   :0.4287   Mean   :0.4406   Mean   :0.4675   Mean   :0.45806  
##  3rd Qu.:0.5833   3rd Qu.:0.5417   3rd Qu.:0.6949   3rd Qu.:0.70833  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000

Box-Cox Transform

# One-time setup: uncomment to install mlbench if it is missing
# install.packages("mlbench")
# mlbench supplies the dataset; caret supplies preProcess()
library(mlbench)
library(caret)
# Make the PimaIndiansDiabetes data frame available
data(PimaIndiansDiabetes)
# Work on the pedigree and age columns (positions 7 and 8)
pedigree_age <- PimaIndiansDiabetes[, c("pedigree", "age")]
summary(pedigree_age)

##     pedigree           age       
##  Min.   :0.0780   Min.   :21.00  
##  1st Qu.:0.2437   1st Qu.:24.00  
##  Median :0.3725   Median :29.00  
##  Mean   :0.4719   Mean   :33.24  
##  3rd Qu.:0.6262   3rd Qu.:41.00  
##  Max.   :2.4200   Max.   :81.00

# Estimate a Box-Cox lambda for each of the two columns
preprocessParams <- preProcess(pedigree_age, method = "BoxCox")
# Report the transformations and the estimated lambdas
print(preprocessParams)

## Created from 768 samples and 2 variables
## 
## Pre-processing:
##   - Box-Cox transformation (2)
##   - ignored (0)
## 
## Lambda estimates for Box-Cox transformation:
## -0.1, -1.1

# Apply the Box-Cox transform to the same two columns
transformed <- predict(preprocessParams, newdata = pedigree_age)
# Summarize the transformed pedigree and age values
summary(transformed)

##     pedigree            age        
##  Min.   :-2.5510   Min.   :0.8772  
##  1st Qu.:-1.4116   1st Qu.:0.8815  
##  Median :-0.9875   Median :0.8867  
##  Mean   :-0.9599   Mean   :0.8874  
##  3rd Qu.:-0.4680   3rd Qu.:0.8938  
##  Max.   : 0.8838   Max.   :0.9019

Yeo-Johnson Transform

# One-time setup: uncomment to install mlbench if it is missing
# install.packages("mlbench")
# mlbench supplies the dataset; caret supplies preProcess()
library(mlbench)
library(caret)
# Make the PimaIndiansDiabetes data frame available
data(PimaIndiansDiabetes)
# Work on the pedigree and age columns (positions 7 and 8)
pedigree_age <- PimaIndiansDiabetes[, c("pedigree", "age")]
summary(pedigree_age)

##     pedigree           age       
##  Min.   :0.0780   Min.   :21.00  
##  1st Qu.:0.2437   1st Qu.:24.00  
##  Median :0.3725   Median :29.00  
##  Mean   :0.4719   Mean   :33.24  
##  3rd Qu.:0.6262   3rd Qu.:41.00  
##  Max.   :2.4200   Max.   :81.00

# Estimate a Yeo-Johnson lambda for each of the two columns
preprocessParams <- preProcess(pedigree_age, method = "YeoJohnson")
# Report the transformations and the estimated lambdas
print(preprocessParams)

## Created from 768 samples and 2 variables
## 
## Pre-processing:
##   - ignored (0)
##   - Yeo-Johnson transformation (2)
## 
## Lambda estimates for Yeo-Johnson transformation:
## -2.25, -1.15

# Apply the Yeo-Johnson transform to the same two columns
transformed <- predict(preprocessParams, newdata = pedigree_age)
# Summarize the transformed pedigree and age values
summary(transformed)

##     pedigree           age        
##  Min.   :0.0691   Min.   :0.8450  
##  1st Qu.:0.1724   1st Qu.:0.8484  
##  Median :0.2265   Median :0.8524  
##  Mean   :0.2317   Mean   :0.8530  
##  3rd Qu.:0.2956   3rd Qu.:0.8580  
##  Max.   :0.4164   Max.   :0.8644

Principal Component Analysis Transform

# One-time setup: uncomment to install caret if it is missing
# install.packages("caret")
# Load packages. caret provides preProcess(); it is attached here so this
# section runs on its own instead of relying on an earlier section.
# mlbench is not needed for iris but is kept from the original chunk.
library(mlbench)
library(caret)
# Load the dataset
data(iris)
# Summarize all five iris columns (four measurements plus Species)
summary(iris)

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
##

# Calculate the pre-process parameters from the dataset: center and scale
# the numeric columns, then extract principal components. The Species
# factor is passed in too and is reported as "ignored" in the output below.
preprocessParams <- preProcess(iris, method = c("center", "scale", "pca"))
# Summarize transform parameters
print(preprocessParams)

## Created from 150 samples and 5 variables
## 
## Pre-processing:
##   - centered (4)
##   - ignored (1)
##   - principal component signal extraction (4)
##   - scaled (4)
## 
## PCA needed 2 components to capture 95 percent of the variance

# Transform the dataset using the parameters
transformed <- predict(preprocessParams, iris)
# Summarize the transformed dataset (Species is kept; the four measurement
# columns are replaced by the principal components PC1 and PC2)
summary(transformed)

##        Species        PC1               PC2          
##  setosa    :50   Min.   :-2.7651   Min.   :-2.67732  
##  versicolor:50   1st Qu.:-2.0957   1st Qu.:-0.59205  
##  virginica :50   Median : 0.4169   Median :-0.01744  
##                  Mean   : 0.0000   Mean   : 0.00000  
##                  3rd Qu.: 1.3385   3rd Qu.: 0.59649  
##                  Max.   : 3.2996   Max.   : 2.64521

Independent Component Analysis Transform

# One-time setup: uncomment to install fastICA if it is missing
# install.packages("fastICA")
# mlbench supplies the dataset; caret supplies preProcess()
library(mlbench)
library(caret)
# Make the PimaIndiansDiabetes data frame available
data(PimaIndiansDiabetes)
# Summarize the eight predictor columns (everything except diabetes)
predictors <- PimaIndiansDiabetes[, 1:8]
summary(predictors)

##     pregnant         glucose         pressure         triceps     
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     insulin           mass          pedigree           age       
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780   Min.   :21.00  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437   1st Qu.:24.00  
##  Median : 30.5   Median :32.00   Median :0.3725   Median :29.00  
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719   Mean   :33.24  
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262   3rd Qu.:41.00  
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200   Max.   :81.00

# Center and scale the predictors, then extract 5 independent components
# NOTE(review): the ICA fit presumably starts from a random initialisation,
# so reruns may not reproduce the numbers below; consider set.seed() - confirm
ica_steps <- c("center", "scale", "ica")
preprocessParams <- preProcess(predictors, method = ica_steps, n.comp = 5)
# Report which transformations were set up
print(preprocessParams)

## Created from 768 samples and 8 variables
## 
## Pre-processing:
##   - centered (8)
##   - independent component signal extraction (8)
##   - ignored (0)
##   - scaled (8)
## 
## ICA used 5 components

# Replace the eight predictors by the five independent components
transformed <- predict(preprocessParams, newdata = predictors)
# Summarize the resulting components
summary(transformed)

##       ICA1               ICA2               ICA3              ICA4        
##  Min.   :-2.37900   Min.   :-4.89677   Min.   :-3.0694   Min.   :-5.5363  
##  1st Qu.:-0.73931   1st Qu.:-0.48270   1st Qu.:-0.7709   1st Qu.:-0.4705  
##  Median : 0.07492   Median : 0.02384   Median : 0.2781   Median : 0.1384  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.72043   3rd Qu.: 0.59357   3rd Qu.: 0.8410   3rd Qu.: 0.6490  
##  Max.   : 2.93056   Max.   : 4.17266   Max.   : 1.4152   Max.   : 3.2250  
##       ICA5        
##  Min.   :-1.5773  
##  1st Qu.:-0.6822  
##  Median :-0.2595  
##  Mean   : 0.0000  
##  3rd Qu.: 0.4273  
##  Max.   : 6.0196

More about IRIS dataset

# Explore iris: structure, missing-value count, summary statistics and one
# histogram per numeric column.
data(iris)
# cat() turns the trailing "\n" into a real newline; print() would echo it
# literally inside a quoted, [1]-prefixed string.
cat("Below is the size and structure of the data and type of each variable:\n")

## Below is the size and structure of the data and type of each variable:

str(iris)

## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

# Count NA cells across the whole data frame
cat("Number of missing values: ", sum(is.na(iris)), "\n")

## Number of missing values:  0

cat("Below is the summary of key statistics of the data:\n")

## Below is the summary of key statistics of the data:

summary(iris)

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
##

# Sketch histograms for the 4 numeric columns on a 2 x 2 grid.
# par() returns the previous settings, which are restored afterwards so
# later plots are not forced onto this grid.
old_par <- par(mfrow = c(2, 2))
for (i in seq_len(4)) {
  hist(
    iris[, i],
    main = "Histogram",
    xlab = colnames(iris)[i],
    ylab = "Frequency"
  )
}
par(old_par)

More about Pima Indians Diabetes dataset

# Explore PimaIndiansDiabetes: structure, missing-value count, summary
# statistics and one histogram per column.
# mlbench provides the dataset; attach it here so this section does not
# depend on an earlier section having loaded it.
library(mlbench)
data(PimaIndiansDiabetes)
# cat() turns the trailing "\n" into a real newline; print() would echo it
# literally inside a quoted, [1]-prefixed string.
cat("Below is the size and structure of the data and type of each variable:\n")

## Below is the size and structure of the data and type of each variable:

str(PimaIndiansDiabetes)

## 'data.frame':    768 obs. of  9 variables:
##  $ pregnant: num  6 1 8 1 0 5 3 10 2 8 ...
##  $ glucose : num  148 85 183 89 137 116 78 115 197 125 ...
##  $ pressure: num  72 66 64 66 40 74 50 0 70 96 ...
##  $ triceps : num  35 29 0 23 35 0 32 0 45 0 ...
##  $ insulin : num  0 0 0 94 168 0 88 0 543 0 ...
##  $ mass    : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ pedigree: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ age     : num  50 31 32 21 33 30 26 29 53 54 ...
##  $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...

# Count NA cells across the whole data frame.
# NOTE(review): no NA is found, yet the summary below shows minimum values
# of 0 for glucose, pressure, triceps, insulin and mass. These may be
# unrecorded measurements coded as 0 (mlbench also ships
# PimaIndiansDiabetes2) - confirm before relying on this count.
cat("Number of missing values: ", sum(is.na(PimaIndiansDiabetes)), "\n")

## Number of missing values:  0

cat("Below is the summary of key statistics of the data:\n")

## Below is the summary of key statistics of the data:

summary(PimaIndiansDiabetes)

##     pregnant         glucose         pressure         triceps     
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     insulin           mass          pedigree           age        diabetes 
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780   Min.   :21.00   neg:500  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437   1st Qu.:24.00   pos:268  
##  Median : 30.5   Median :32.00   Median :0.3725   Median :29.00            
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719   Mean   :33.24            
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262   3rd Qu.:41.00            
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200   Max.   :81.00

# Sketch histograms for the 9 columns on a 3 x 3 grid.
# par() returns the previous settings, which are restored afterwards.
old_par <- par(mfrow = c(3, 3))
PimaIndiansDiabetescopy <- PimaIndiansDiabetes
# Encode the diabetes factor as 1 for "pos" and 0 for "neg". The comparison
# states the mapping explicitly; the earlier as.numeric(x, levels = ...) call
# ignored its levels argument and relied on factor level order.
PimaIndiansDiabetescopy[, 9] <- as.numeric(PimaIndiansDiabetescopy[, 9] == "pos")

for (i in seq_len(9)) {
  hist(
    PimaIndiansDiabetescopy[, i],
    main = "Histogram",
    xlab = colnames(PimaIndiansDiabetes)[i],
    ylab = "Frequency"
  )
}
par(old_par)

print('1: diabetic, 0: non diabetic')

## [1] "1: diabetic, 0: non diabetic"

Data Mining with R Assignment 2a

Soudeh Khoubrouy

2024-06-29

Scale Data

Center Data

Standardize Data

Normalize Data

Box-Cox Transform

Yeo-Johnson Transform

Principal Component Analysis Transform

Independent Component Analysis Transform

More about IRIS dataset

More about Pima Indians Diabetes dataset