library(stringr)
# Path to the raw iris CSV. It is read with header = FALSE, so the columns
# are named explicitly after loading.
filename <- "C:/Users/Gabi/Documents/MVC/AI4OPT/R/iris.csv"
#columnnames <-"C:/Users/Gabi/Documents/MVC/AI4OPT/irisnames.csv"
dataset <- read.csv(file = filename, header = FALSE)
# Name the first five columns in a single assignment.
names(dataset)[1:5] <- c(
  "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Class"
)
# Preview the first rows of the data frame.
head(dataset)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Class
## 1 5.1 3.5 1.4 0.2 Iris-setosa
## 2 4.9 3.0 1.4 0.2 Iris-setosa
## 3 4.7 3.2 1.3 0.2 Iris-setosa
## 4 4.6 3.1 1.5 0.2 Iris-setosa
## 5 5.0 3.6 1.4 0.2 Iris-setosa
## 6 5.4 3.9 1.7 0.4 Iris-setosa
# Dimensions of the data frame: number of rows, then number of columns.
dim(dataset)
## [1] 150 5
# Summary: The Iris dataset has 150 rows and 5 columns.
# Report the class of every column as a named character vector.
# vapply() pins the result type: a column with several classes is collapsed
# into one "a/b" label instead of silently turning the result into a list,
# which is what sapply() would do.
vapply(
  dataset,
  function(column) paste(class(column), collapse = "/"),
  character(1)
)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Class
## "numeric" "numeric" "numeric" "numeric" "character"
# Summary: The Iris dataset has 5 columns. Columns 1-4 are numeric and column 5 is of type character.
# Per-column summary of the whole data frame (quartiles and mean for the
# numeric columns, length/class/mode for the character column).
summary(dataset)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.054 Mean :3.759 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Class
## Length:150
## Class :character
## Mode :character
##
##
##
# Histograms of the four numeric attributes, side by side in a 1x4 grid.
# par() returns the settings it replaces; they are restored afterwards so the
# grid layout does not leak into later base-graphics plots.
old_par <- par(mfrow = c(1, 4))
for (i in seq_len(4)) {
  # Label the x-axis with the attribute name; the default label would be the
  # literal expression text "dataset[, i]".
  hist(dataset[, i], main = names(dataset)[i], xlab = names(dataset)[i])
}
par(old_par)
library(lattice)
# Kernel density plots of the four numeric attributes in a 1x4 grid.
# The previous par() settings are saved and restored so the grid layout does
# not leak into later base-graphics plots.
old_par <- par(mfrow = c(1, 4))
for (i in seq_len(4)) {
  plot(density(dataset[, i]), main = names(dataset)[i])
}
par(old_par)
# Box-and-whisker plots of the four numeric attributes in a 1x4 grid.
# The previous par() settings are saved and restored so the grid layout does
# not leak into later base-graphics plots.
old_par <- par(mfrow = c(1, 4))
for (i in seq_len(4)) {
  boxplot(dataset[, i], main = names(dataset)[i])
}
par(old_par)
# Bar plots of the value frequencies of the four numeric attributes.
# The previous par() settings are saved and restored; without this the 1x4
# layout stays active and the next base-graphics plot is drawn into a single
# narrow panel.
old_par <- par(mfrow = c(1, 4))
for (i in seq_len(4)) {
  # table() counts how often each distinct value occurs in the column.
  counts <- table(dataset[, i])
  barplot(counts, main = names(dataset)[i])
}
par(old_par)
library(corrplot)
## corrplot 0.92 loaded
# Pairwise correlations between the four numeric attributes, drawn as a
# matrix of circles.
correlations <- cor(dataset[, seq_len(4)])
corrplot(corr = correlations, method = "circle")
# Summary:
# - Blue represents positive correlation and red represents negative correlation.
# - The larger the dot, the stronger the correlation.
library(caret)
## Loading required package: ggplot2
# Per-class density plots of each attribute.
x <- dataset[, 1:4]
# featurePlot() only draws "density" panels when y is a factor. The Class
# column is read as character, and with a character y the call matched no
# plot type and returned NULL instead of a plot.
y <- factor(dataset[, 5])
# Let every panel pick its own x and y axis ranges.
scales <- list(x = list(relation = "free"), y = list(relation = "free"))
featurePlot(x = x, y = y, plot = "density", scales = scales)
# Per-class box-and-whisker plots of each attribute.
x <- dataset[, 1:4]
# featurePlot() only draws "box" panels when y is a factor. The Class column
# is read as character, and with a character y the call matched no plot type
# and returned NULL instead of a plot.
y <- factor(dataset[, 5])
featurePlot(x = x, y = y, plot = "box")
# Summarize the four numeric columns before any transformation is applied.
summary(dataset[,1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.054 Mean :3.759 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
# Estimate the "scale" parameters from the four numeric columns.
preprocessParams <- preProcess(dataset[, 1:4], method = "scale")
# Sanity check: print the standard deviation of Sepal.Length so the scaled
# values can be verified by hand. The message previously said "petal length"
# although the value printed below is for Sepal.Length.
cat("The standard deviation of iris sepal length is: ", "\n")
## The standard deviation of iris sepal length is:
sd(dataset$Sepal.Length)
## [1] 0.8280661
# Summarize the transform parameters.
print(preprocessParams)
## Created from 150 samples and 4 variables
##
## Pre-processing:
## - ignored (0)
## - scaled (4)
# Apply the fitted "scale" parameters to the four numeric columns.
transformed <- predict(preprocessParams, newdata = dataset[, 1:4])
# Summarize the scaled columns.
summary(transformed)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :5.193 Min. : 4.613 Min. :0.5668 Min. :0.1310
## 1st Qu.:6.159 1st Qu.: 6.458 1st Qu.:0.9068 1st Qu.:0.3931
## Median :7.004 Median : 6.919 Median :2.4654 Median :1.7034
## Mean :7.057 Mean : 7.043 Mean :2.1303 Mean :1.5707
## 3rd Qu.:7.729 3rd Qu.: 7.611 3rd Qu.:2.8905 3rd Qu.:2.3586
## Max. :9.540 Max. :10.148 Max. :3.9106 Max. :3.2759
# Estimate the "center" parameters from the four numeric columns.
preprocessParams <- preProcess(dataset[, 1:4], method = "center")
# Sanity check: print the mean of Sepal.Length so the centered values can be
# verified by hand. The message previously said "petal length" although the
# value printed below is for Sepal.Length.
cat("The mean of iris sepal length is: ", "\n")
## The mean of iris sepal length is:
mean(dataset$Sepal.Length)
## [1] 5.843333
# Summarize the transform parameters.
print(preprocessParams)
## Created from 150 samples and 4 variables
##
## Pre-processing:
## - centered (4)
## - ignored (0)
# Apply the fitted "center" parameters to the four numeric columns.
transformed <- predict(preprocessParams, newdata = dataset[, 1:4])
# Summarize the centered columns.
summary(transformed)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :-1.54333 Min. :-1.054 Min. :-2.7587 Min. :-1.0987
## 1st Qu.:-0.74333 1st Qu.:-0.254 1st Qu.:-2.1587 1st Qu.:-0.8987
## Median :-0.04333 Median :-0.054 Median : 0.5913 Median : 0.1013
## Mean : 0.00000 Mean : 0.000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.55667 3rd Qu.: 0.246 3rd Qu.: 1.3413 3rd Qu.: 0.6013
## Max. : 2.05667 Max. : 1.346 Max. : 3.1413 Max. : 1.3013
# Calculate the pre-process parameters from the dataset
# Fit combined centering and scaling parameters on the four numeric columns,
# then show which transformations were estimated.
preprocessParams <- preProcess(
  x = dataset[, 1:4],
  method = c("center", "scale")
)
print(preprocessParams)
## Created from 150 samples and 4 variables
##
## Pre-processing:
## - centered (4)
## - ignored (0)
## - scaled (4)
# Apply the fitted center-and-scale parameters to the four numeric columns.
transformed <- predict(preprocessParams, newdata = dataset[, 1:4])
# Summarize the standardized columns.
summary(transformed)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :-1.86378 Min. :-2.4308 Min. :-1.5635 Min. :-1.4396
## 1st Qu.:-0.89767 1st Qu.:-0.5858 1st Qu.:-1.2234 1st Qu.:-1.1776
## Median :-0.05233 Median :-0.1245 Median : 0.3351 Median : 0.1328
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.67225 3rd Qu.: 0.5674 3rd Qu.: 0.7602 3rd Qu.: 0.7880
## Max. : 2.48370 Max. : 3.1043 Max. : 1.7804 Max. : 1.7052
# Calculate the pre-process parameters from the dataset
# Fit "range" (re-scaling to [0, 1]) parameters on the four numeric columns.
preprocessParams <- preProcess(x = dataset[, 1:4], method = "range")
# Show which transformations were estimated.
print(preprocessParams)
## Created from 150 samples and 4 variables
##
## Pre-processing:
## - ignored (0)
## - re-scaling to [0, 1] (4)
# Apply the fitted "range" parameters to the four numeric columns.
transformed <- predict(preprocessParams, newdata = dataset[, 1:4])
# Summarize the re-scaled columns.
summary(transformed)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.2222 1st Qu.:0.3333 1st Qu.:0.1017 1st Qu.:0.08333
## Median :0.4167 Median :0.4167 Median :0.5678 Median :0.50000
## Mean :0.4287 Mean :0.4392 Mean :0.4676 Mean :0.45778
## 3rd Qu.:0.5833 3rd Qu.:0.5417 3rd Qu.:0.6949 3rd Qu.:0.70833
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
# Summary of the original (untransformed) numeric columns.
summary(dataset[,1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.054 Mean :3.759 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
# Fit Box-Cox transformation parameters on the four numeric columns.
preprocessParams <- preProcess(x = dataset[, 1:4], method = "BoxCox")
# Show which transformations were estimated, including the lambda estimates.
print(preprocessParams)
## Created from 150 samples and 4 variables
##
## Pre-processing:
## - Box-Cox transformation (4)
## - ignored (0)
##
## Lambda estimates for Box-Cox transformation:
## -0.1, 0.3, 0.9, 0.6
# Apply the fitted Box-Cox parameters to the four numeric columns.
transformed <- predict(preprocessParams, newdata = dataset[, 1:4])
# Summarize the Box-Cox transformed columns.
summary(transformed)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :1.459 Min. :0.7705 Min. :1.000 Min. :-1.24802
## 1st Qu.:1.629 1st Qu.:1.2064 1st Qu.:1.600 1st Qu.:-0.85734
## Median :1.758 Median :1.3013 Median :4.350 Median : 0.28414
## Mean :1.755 Mean :1.3164 Mean :3.759 Mean : 0.06972
## 3rd Qu.:1.856 3rd Qu.:1.4357 3rd Qu.:5.100 3rd Qu.: 0.70477
## Max. :2.067 Max. :1.8656 Max. :6.900 Max. : 1.22144
# Fit Yeo-Johnson transformation parameters on the four numeric columns.
preprocessParams <- preProcess(x = dataset[, 1:4], method = "YeoJohnson")
# Show which transformations were estimated, including the lambda estimates.
print(preprocessParams)
## Created from 150 samples and 4 variables
##
## Pre-processing:
## - ignored (0)
## - Yeo-Johnson transformation (4)
##
## Lambda estimates for Yeo-Johnson transformation:
## -0.32, 0.01, 1.09, 0.84
# Apply the fitted Yeo-Johnson parameters to the four numeric columns.
transformed <- predict(preprocessParams, newdata = dataset[, 1:4])
# Summarize the Yeo-Johnson transformed columns.
summary(transformed)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :1.291 Min. :1.106 Min. :1.037 Min. :0.09925
## 1st Qu.:1.372 1st Qu.:1.345 1st Qu.:1.685 1st Qu.:0.29369
## Median :1.431 Median :1.397 Median :4.806 Median :1.20833
## Mean :1.430 Mean :1.405 Mean :4.156 Mean :1.09980
## 3rd Qu.:1.476 3rd Qu.:1.471 3rd Qu.:5.688 3rd Qu.:1.64059
## Max. :1.571 Max. :1.703 Max. :7.844 Max. :2.22620
# Fit centering, scaling and PCA parameters.
# Unlike the other sections, the whole data frame is passed here; the printed
# summary reports the non-numeric Class column as ignored.
preprocessParams <- preProcess(
  x = dataset,
  method = c("center", "scale", "pca")
)
# Show which transformations were estimated and how many components PCA kept.
print(preprocessParams)
## Created from 150 samples and 5 variables
##
## Pre-processing:
## - centered (4)
## - ignored (1)
## - principal component signal extraction (4)
## - scaled (4)
##
## PCA needed 2 components to capture 95 percent of the variance
# Project the four numeric columns onto the fitted principal components.
transformed <- predict(preprocessParams, newdata = dataset[, 1:4])
# Summarize the principal-component scores.
summary(transformed)
## PC1 PC2
## Min. :-2.7649 Min. :-2.713281
## 1st Qu.:-2.1146 1st Qu.:-0.585859
## Median : 0.4132 Median :-0.008406
## Mean : 0.0000 Mean : 0.000000
## 3rd Qu.: 1.3381 3rd Qu.: 0.592676
## Max. : 3.2981 Max. : 2.649188
# Fit centering, scaling and ICA parameters on the four numeric columns,
# requesting four independent components via n.comp.
# NOTE(review): ICA fitting presumably starts from a random initialization,
# so results may differ between runs unless a seed is set - confirm.
preprocessParams <- preProcess(
  x = dataset[, 1:4],
  method = c("center", "scale", "ica"),
  n.comp = 4
)
# Show which transformations were estimated.
print(preprocessParams)
## Created from 150 samples and 4 variables
##
## Pre-processing:
## - centered (4)
## - independent component signal extraction (4)
## - ignored (0)
## - scaled (4)
##
## ICA used 4 components
# Apply the fitted ICA parameters to the four numeric columns.
transformed <- predict(preprocessParams, newdata = dataset[, 1:4])
# Summarize the independent-component scores.
summary(transformed)
## ICA1 ICA2 ICA3 ICA4
## Min. :-2.960146 Min. :-2.96809 Min. :-3.05643 Min. :-1.6101
## 1st Qu.:-0.553140 1st Qu.:-0.83489 1st Qu.:-0.60140 1st Qu.:-0.8671
## Median :-0.001057 Median :-0.04526 Median :-0.07553 Median :-0.2994
## Mean : 0.000000 Mean : 0.00000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.530382 3rd Qu.: 0.68669 3rd Qu.: 0.57927 3rd Qu.: 1.2553
## Max. : 2.953922 Max. : 2.75278 Max. : 3.02152 Max. : 1.6658