Iris Dataset

Load Iris file

library(stringr)
# Path to the raw Iris CSV; it is read without a header row.
filename <- "C:/Users/Gabi/Documents/MVC/AI4OPT/R/iris.csv"
#columnnames <-"C:/Users/Gabi/Documents/MVC/AI4OPT/irisnames.csv"
dataset <- read.csv(file = filename, header = FALSE)

# Label all five columns in one step: four measurements, then the class label.
names(dataset)[1:5] <- c(
  "Sepal.Length",
  "Sepal.Width",
  "Petal.Length",
  "Petal.Width",
  "Class"
)

# Preview the first rows of the data frame.
head(dataset)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width       Class
## 1          5.1         3.5          1.4         0.2 Iris-setosa
## 2          4.9         3.0          1.4         0.2 Iris-setosa
## 3          4.7         3.2          1.3         0.2 Iris-setosa
## 4          4.6         3.1          1.5         0.2 Iris-setosa
## 5          5.0         3.6          1.4         0.2 Iris-setosa
## 6          5.4         3.9          1.7         0.4 Iris-setosa

Dimensions of Iris Dataset

# Number of rows and columns in the data frame.
dim(dataset)
## [1] 150   5

Summary: The Iris dataset has 150 rows and 5 columns.

Data Types in Iris Dataset

# Class (data type) of every column, reported per column name.
sapply(dataset, class)
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width        Class 
##    "numeric"    "numeric"    "numeric"    "numeric"  "character"

Summary: Iris dataset has 5 columns. Columns 1-4 have numeric data types and column 5 has character data types.

Summary of columns

# Per-column summary: quartiles and mean for numeric columns, length/class for character columns.
summary(dataset)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.054   Mean   :3.759   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##     Class          
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

Histogram Visualization

# One histogram per numeric attribute, drawn side by side in a 1 x 4 grid.
par(mfrow = c(1, 4))
for (i in seq_len(4)) {
  # The `dataset[, i]` expression is kept as-is: hist() builds its default
  # x-axis label from the expression it is given.
  hist(dataset[, i], main = names(dataset)[i])
}

Density Plots

library(lattice)
# Kernel density estimate of each numeric attribute, one panel per attribute.
par(mfrow = c(1, 4))
for (i in seq_len(4)) {
  plot(density(dataset[, i]), main = names(dataset)[i])
}

Box and Whisker Plots

# Box-and-whisker plot of each numeric attribute in a 1 x 4 grid.
par(mfrow = c(1, 4))
for (i in seq_len(4)) {
  boxplot(dataset[, i], main = names(dataset)[i])
}

Bar Plots

# Bar plot of the value frequencies of each numeric attribute.
par(mfrow = c(1, 4))
for (i in seq_len(4)) {
  # Panel title is the column name; bar heights are how often each value occurs.
  name <- names(dataset)[i]
  counts <- table(dataset[, i])
  barplot(counts, main = name)
}

Multivariate Visualization

Correlation Plot

library(corrplot)
## corrplot 0.92 loaded
# Pairwise correlations between the four numeric attributes,
# drawn as a grid of circles.
correlations <- cor(dataset[, 1:4])
corrplot(correlations, method = "circle")

Summary: - Blue represents a positive correlation and red a negative correlation. - The larger the dot, the stronger the correlation.

Density Plots By Class

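  • Note: featurePlot() only draws density plots when y is a factor. The Class column is read from the CSV as character, and with a character y the call displays no graph and only outputs NULL, so y must be converted with factor() for the plots to appear.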
library(caret)
## Loading required package: ggplot2
# Split the data into the four numeric predictors and the class label.
x <- dataset[, 1:4]
# featurePlot() only produces "density" plots when y is a factor. Class is
# read from the CSV as character, which made the call return NULL instead of
# a plot, so convert it explicitly.
y <- factor(dataset[, 5])
# Let every panel use its own x and y axis range.
scales <- list(x = list(relation = "free"), y = list(relation = "free"))
featurePlot(x = x, y = y, plot = "density", scales = scales)

Box and Whisker Plots By Class

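  • Note: featurePlot() only draws box plots when y is a factor. The Class column is read from the CSV as character, and with a character y the call displays no graph and only outputs NULL, so y must be converted with factor() for the plots to appear.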
# Split the data into the four numeric predictors and the class label.
x <- dataset[, 1:4]
# featurePlot() only produces "box" plots when y is a factor. Class is read
# from the CSV as character, which made the call return NULL instead of a
# plot, so convert it explicitly.
y <- factor(dataset[, 5])
featurePlot(x = x, y = y, plot = "box")

Scale Data

  • Takes each value and divides it by the standard deviation of the specific column
    • example: find the standard deviation of the sepal length then take each value of sepal length column and divide it by the standard deviation
# Summarize the raw numeric attributes before scaling.
summary(dataset[, 1:4])
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.054   Mean   :3.759   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500
# Calculate the pre-process (scaling) parameters from the dataset.
preprocessParams <- preProcess(dataset[, 1:4], method = c("scale"))

# Sanity check: print the standard deviation of Sepal.Length so the scaled
# values can be verified by hand. The label previously said "petal length"
# although the value computed is for Sepal.Length.
cat("The standard deviation of iris sepal length is: ", "\n")
## The standard deviation of iris sepal length is:
sd(dataset$Sepal.Length)
## [1] 0.8280661
# Summarize the transform parameters.
print(preprocessParams)
## Created from 150 samples and 4 variables
## 
## Pre-processing:
##   - ignored (0)
##   - scaled (4)
# Transform the dataset using the parameters.
transformed <- predict(preprocessParams, dataset[, 1:4])

# Summarize the transformed dataset.
summary(transformed)
##   Sepal.Length    Sepal.Width      Petal.Length     Petal.Width    
##  Min.   :5.193   Min.   : 4.613   Min.   :0.5668   Min.   :0.1310  
##  1st Qu.:6.159   1st Qu.: 6.458   1st Qu.:0.9068   1st Qu.:0.3931  
##  Median :7.004   Median : 6.919   Median :2.4654   Median :1.7034  
##  Mean   :7.057   Mean   : 7.043   Mean   :2.1303   Mean   :1.5707  
##  3rd Qu.:7.729   3rd Qu.: 7.611   3rd Qu.:2.8905   3rd Qu.:2.3586  
##  Max.   :9.540   Max.   :10.148   Max.   :3.9106   Max.   :3.2759

Center Data

  • Subtracts the mean of a column from every value in that column
    • For example: take a value from the sepal length column and subtract the mean of the sepal length column from it
# Calculate the pre-process (centering) parameters from the dataset.
preprocessParams <- preProcess(dataset[, 1:4], method = c("center"))


# Sanity check: print the mean of Sepal.Length so the centered values can be
# verified by hand. The label previously said "petal length" although the
# value computed is for Sepal.Length.
cat("The mean of iris sepal length is: ", "\n")
## The mean of iris sepal length is:
mean(dataset$Sepal.Length)
## [1] 5.843333
# Summarize the transform parameters.
print(preprocessParams)
## Created from 150 samples and 4 variables
## 
## Pre-processing:
##   - centered (4)
##   - ignored (0)
# Transform the dataset using the parameters.
transformed <- predict(preprocessParams, dataset[, 1:4])

# Summarize the transformed dataset.
summary(transformed)
##   Sepal.Length       Sepal.Width      Petal.Length      Petal.Width     
##  Min.   :-1.54333   Min.   :-1.054   Min.   :-2.7587   Min.   :-1.0987  
##  1st Qu.:-0.74333   1st Qu.:-0.254   1st Qu.:-2.1587   1st Qu.:-0.8987  
##  Median :-0.04333   Median :-0.054   Median : 0.5913   Median : 0.1013  
##  Mean   : 0.00000   Mean   : 0.000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.55667   3rd Qu.: 0.246   3rd Qu.: 1.3413   3rd Qu.: 0.6013  
##  Max.   : 2.05667   Max.   : 1.346   Max.   : 3.1413   Max.   : 1.3013

Now we combine Scale and Center data to Standardize Data

  • Calculate the pre-process parameters from the dataset

    • First subtract the mean from each value then divide each value by standard deviation.
    # Estimate centering and scaling parameters together.
    preprocessParams <- preProcess(
      dataset[, 1:4],
      method = c("center", "scale")
    )
    print(preprocessParams)
    ## Created from 150 samples and 4 variables
    ## 
    ## Pre-processing:
    ##   - centered (4)
    ##   - ignored (0)
    ##   - scaled (4)
    # Apply the estimated parameters to the four numeric attributes.
    transformed <- predict(preprocessParams, dataset[, 1:4])

    # Inspect the standardized values.
    summary(transformed)
    ##   Sepal.Length       Sepal.Width       Petal.Length      Petal.Width     
    ##  Min.   :-1.86378   Min.   :-2.4308   Min.   :-1.5635   Min.   :-1.4396  
    ##  1st Qu.:-0.89767   1st Qu.:-0.5858   1st Qu.:-1.2234   1st Qu.:-1.1776  
    ##  Median :-0.05233   Median :-0.1245   Median : 0.3351   Median : 0.1328  
    ##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
    ##  3rd Qu.: 0.67225   3rd Qu.: 0.5674   3rd Qu.: 0.7602   3rd Qu.: 0.7880  
    ##  Max.   : 2.48370   Max.   : 3.1043   Max.   : 1.7804   Max.   : 1.7052

Normalize Data

  • Normalizing rescales the values into the range [0, 1]
# Estimate the range re-scaling parameters from the numeric attributes.
preprocessParams <- preProcess(dataset[, 1:4], method = "range")

# Show what the fitted transform will do.
print(preprocessParams)
## Created from 150 samples and 4 variables
## 
## Pre-processing:
##   - ignored (0)
##   - re-scaling to [0, 1] (4)
# Apply the fitted transform to the same four columns.
transformed <- predict(preprocessParams, dataset[, 1:4])

# Inspect the normalized values.
summary(transformed)
##   Sepal.Length     Sepal.Width      Petal.Length     Petal.Width     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.2222   1st Qu.:0.3333   1st Qu.:0.1017   1st Qu.:0.08333  
##  Median :0.4167   Median :0.4167   Median :0.5678   Median :0.50000  
##  Mean   :0.4287   Mean   :0.4392   Mean   :0.4676   Mean   :0.45778  
##  3rd Qu.:0.5833   3rd Qu.:0.5417   3rd Qu.:0.6949   3rd Qu.:0.70833  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000

Box-Cox Transform

  • Takes distribution that is skewed and makes it more Gaussian by reducing the skewness
  • Assumes all values are positive
# Raw numeric attributes, for comparison with the transformed values below.
summary(dataset[, 1:4])
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.054   Mean   :3.759   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500
# Estimate the Box-Cox parameters from the numeric attributes.
preprocessParams <- preProcess(dataset[, 1:4], method = "BoxCox")

# Show the fitted transform, including the lambda estimates.
print(preprocessParams)
## Created from 150 samples and 4 variables
## 
## Pre-processing:
##   - Box-Cox transformation (4)
##   - ignored (0)
## 
## Lambda estimates for Box-Cox transformation:
## -0.1, 0.3, 0.9, 0.6
# Apply the fitted transform to the same four columns.
transformed <- predict(preprocessParams, dataset[, 1:4])

# Inspect the transformed values.
summary(transformed)
##   Sepal.Length    Sepal.Width      Petal.Length    Petal.Width      
##  Min.   :1.459   Min.   :0.7705   Min.   :1.000   Min.   :-1.24802  
##  1st Qu.:1.629   1st Qu.:1.2064   1st Qu.:1.600   1st Qu.:-0.85734  
##  Median :1.758   Median :1.3013   Median :4.350   Median : 0.28414  
##  Mean   :1.755   Mean   :1.3164   Mean   :3.759   Mean   : 0.06972  
##  3rd Qu.:1.856   3rd Qu.:1.4357   3rd Qu.:5.100   3rd Qu.: 0.70477  
##  Max.   :2.067   Max.   :1.8656   Max.   :6.900   Max.   : 1.22144

Yeo-Johnson Transform

  • A power transform similar to Box-Cox
  • Supports raw values that are zero or negative
# Estimate the Yeo-Johnson parameters from the numeric attributes.
preprocessParams <- preProcess(dataset[, 1:4], method = "YeoJohnson")

# Show the fitted transform, including the lambda estimates.
print(preprocessParams)
## Created from 150 samples and 4 variables
## 
## Pre-processing:
##   - ignored (0)
##   - Yeo-Johnson transformation (4)
## 
## Lambda estimates for Yeo-Johnson transformation:
## -0.32, 0.01, 1.09, 0.84
# Apply the fitted transform to the same four columns.
transformed <- predict(preprocessParams, dataset[, 1:4])

# Inspect the transformed values.
summary(transformed)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width     
##  Min.   :1.291   Min.   :1.106   Min.   :1.037   Min.   :0.09925  
##  1st Qu.:1.372   1st Qu.:1.345   1st Qu.:1.685   1st Qu.:0.29369  
##  Median :1.431   Median :1.397   Median :4.806   Median :1.20833  
##  Mean   :1.430   Mean   :1.405   Mean   :4.156   Mean   :1.09980  
##  3rd Qu.:1.476   3rd Qu.:1.471   3rd Qu.:5.688   3rd Qu.:1.64059  
##  Max.   :1.571   Max.   :1.703   Max.   :7.844   Max.   :2.22620

Principal Component Analysis Transform (PCA Transform)

  • Technique for multivariate statistics and linear algebra
  • Transform keeps components above the variance threshold (default=0.95) or number of components specified
  • Result gives attributes that are uncorrelated, useful for algorithms like linear and generalized linear regression
# Calculate the pre-process parameters from the dataset.
# Fit on the four numeric attributes only, matching the data passed to
# predict() below and every other transform in this report (previously the
# whole data frame, including the character Class column, was passed and that
# column was reported as ignored).
preprocessParams <- preProcess(dataset[, 1:4], method = c("center", "scale", "pca"))

# Summarize the transform parameters.
print(preprocessParams)
## Created from 150 samples and 4 variables
## 
## Pre-processing:
##   - centered (4)
##   - ignored (0)
##   - principal component signal extraction (4)
##   - scaled (4)
## 
## PCA needed 2 components to capture 95 percent of the variance
# Transform the dataset using the parameters.
transformed <- predict(preprocessParams, dataset[, 1:4])

# Summarize the transformed dataset.
summary(transformed)
##       PC1               PC2           
##  Min.   :-2.7649   Min.   :-2.713281  
##  1st Qu.:-2.1146   1st Qu.:-0.585859  
##  Median : 0.4132   Median :-0.008406  
##  Mean   : 0.0000   Mean   : 0.000000  
##  3rd Qu.: 1.3381   3rd Qu.: 0.592676  
##  Max.   : 3.2981   Max.   : 2.649188

Independent Component Analysis Transform (ICA Transform)

  • Transforms data into independent components
  • Must specify the desired independent components
    • argument: n.comp
  • Useful in Naive Bayes algorithms
# Calculate the pre-process parameters from the dataset: center, scale, then
# extract n.comp = 4 independent components.
# NOTE(review): the ICA fit presumably starts from a random initialization, so
# the component values may differ between runs unless a seed is set - confirm.
preprocessParams <- preProcess(dataset[,1:4], method = c("center", "scale", "ica"), n.comp=4)

# Summarize the transform parameters.
print(preprocessParams)
## Created from 150 samples and 4 variables
## 
## Pre-processing:
##   - centered (4)
##   - independent component signal extraction (4)
##   - ignored (0)
##   - scaled (4)
## 
## ICA used 4 components
# Transform the dataset using the parameters.
transformed <- predict(preprocessParams, dataset[,1:4])
# Summarize the transformed dataset.
summary(transformed)
##       ICA1                ICA2               ICA3               ICA4        
##  Min.   :-2.960146   Min.   :-2.96809   Min.   :-3.05643   Min.   :-1.6101  
##  1st Qu.:-0.553140   1st Qu.:-0.83489   1st Qu.:-0.60140   1st Qu.:-0.8671  
##  Median :-0.001057   Median :-0.04526   Median :-0.07553   Median :-0.2994  
##  Mean   : 0.000000   Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.530382   3rd Qu.: 0.68669   3rd Qu.: 0.57927   3rd Qu.: 1.2553  
##  Max.   : 2.953922   Max.   : 2.75278   Max.   : 3.02152   Max.   : 1.6658