Iris Flowers Dataset

Read iris dataset

# Import the iris measurements from the local CSV export, then preview the
# first six rows.
iris <- read.csv(file = "D:/AI4OPT/iris.csv")
head(x = iris, n = 6L)

##   sepallength sepalwidth petallength petalwidth       class
## 1         5.1        3.5         1.4        0.2 Iris-setosa
## 2         4.9        3.0         1.4        0.2 Iris-setosa
## 3         4.7        3.2         1.3        0.2 Iris-setosa
## 4         4.6        3.1         1.5        0.2 Iris-setosa
## 5         5.0        3.6         1.4        0.2 Iris-setosa
## 6         5.4        3.9         1.7        0.4 Iris-setosa

# Preview the last six rows.
tail(x = iris, n = 6L)

##     sepallength sepalwidth petallength petalwidth          class
## 145         6.7        3.3         5.7        2.5 Iris-virginica
## 146         6.7        3.0         5.2        2.3 Iris-virginica
## 147         6.3        2.5         5.0        1.9 Iris-virginica
## 148         6.5        3.0         5.2        2.0 Iris-virginica
## 149         6.2        3.4         5.4        2.3 Iris-virginica
## 150         5.9        3.0         5.1        1.8 Iris-virginica

# Per-column summary of the loaded data frame.
summary(object = iris)

##   sepallength      sepalwidth     petallength      petalwidth   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.054   Mean   :3.759   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##     class          
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
##

# Number of rows and columns.
c(nrow(iris), ncol(iris))

## [1] 150   5

A random sample of the rows is helpful because the data contain three different species. We also check the type of each attribute in the iris dataset for further analysis.

# Show ten rows drawn at random (without replacement).
iris[sample.int(nrow(iris), size = 10L), ]

##     sepallength sepalwidth petallength petalwidth          class
## 49          5.3        3.7         1.5        0.2    Iris-setosa
## 145         6.7        3.3         5.7        2.5 Iris-virginica
## 116         6.4        3.2         5.3        2.3 Iris-virginica
## 39          4.4        3.0         1.3        0.2    Iris-setosa
## 35          4.9        3.1         1.5        0.1    Iris-setosa
## 113         6.8        3.0         5.5        2.1 Iris-virginica
## 8           5.0        3.4         1.5        0.2    Iris-setosa
## 130         7.2        3.0         5.8        1.6 Iris-virginica
## 126         7.2        3.2         6.0        1.8 Iris-virginica
## 128         6.1        3.0         4.9        1.8 Iris-virginica

# Load R's built-in iris data set. This replaces the data frame read from the
# CSV earlier, so from here on the columns are Sepal.Length, Sepal.Width,
# Petal.Length, Petal.Width and Species.
data("iris")

# Report the class of every column. vapply() enforces exactly one string per
# column, so the result is always a named character vector.
vapply(iris, class, character(1))

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##    "numeric"    "numeric"    "numeric"    "numeric"     "factor"

Instances that belong to each class label

library(mlbench)
data("iris")

# The built-in iris data frame stores the label in `Species`; it has no
# `class` column, so `iris$class` is NULL and yields an empty table.
y <- iris$Species

# Absolute count and percentage share of each species.
cbind(freq = table(y), percentage = prop.table(table(y)) * 100)

##            freq percentage
## setosa       50   33.33333
## versicolor   50   33.33333
## virginica    50   33.33333

summary(iris)

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
##

# Summary of the label factor: one count per species.
summary(y)

##     setosa versicolor  virginica 
##         50         50         50

Study the individual attributes for more information using histograms

data("iris")

# Lay the four histograms out side by side. par() returns the previous
# settings, which are restored afterwards so the 1x4 layout does not leak
# into later plots.
old_par <- par(mfrow = c(1, 4))

for (i in 1:4) {
  hist(iris[, i], main = names(iris)[i])
}

par(old_par)

#### Try density plot for another look at each variable distribution

data(iris)

# One kernel density estimate per measurement column, drawn side by side.
# The previous graphics settings are saved and restored so the 1x4 layout
# does not leak into later plots.
old_par <- par(mfrow = c(1, 4))

for (i in 1:4) {
  plot(density(iris[, i]), main = names(iris)[i])
}

par(old_par)

#### Since the dataset is multivariate, a correlation plot of the attributes may show the relationships between them more clearly

data(iris)

# Pairwise correlations of the four numeric measurement columns.
correlations <- cor(iris[, 1:4])

# Namespace-qualified so the call works even when corrplot has not been
# attached with library(corrplot).
corrplot::corrplot(correlations, method = "circle")

#### Show a scatter plot matrix

# Scatter plot matrix over every column of the built-in data set.
data(iris)
pairs(x = iris)

#### Check scatter plot matrix by Class

# Scatter plot matrix with points colored by species.
data(iris)
pairs(
  Species ~ .,
  data = iris,
  col = iris[["Species"]]
)

library(ggplot2)
# Petal width against petal length; species sets both color and point shape.
ggplot(iris, aes(x = Petal.Length, y = Petal.Width)) +
  geom_point(mapping = aes(color = Species, shape = Species))

#### See if a box plot reveals more information

# Sepal length per species: box plot with the jittered observations on top.
ggplot(iris, aes(x = Species, y = Sepal.Length, color = Species)) +
  geom_boxplot() +
  geom_jitter(position = position_jitter(width = 0.2))

#### Check Density Plot Length by Species

# Overlapping, semi-transparent petal length densities, one per species.
ggplot(iris, aes(x = Petal.Length, fill = Species)) +
  geom_density(alpha = 0.3)

#### Check for similar density plots in subgroups using facets

# Same densities, split into one panel per species, stacked in three rows.
ggplot(iris, aes(x = Petal.Length, fill = Species)) +
  geom_density(alpha = 0.3) +
  facet_wrap(facets = ~Species, nrow = 3)

#### Convert to a matrix for Clustering

# Matrix of the four measurement columns.
mat1 <- as.matrix(
  iris[, c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")]
)

# Pairwise distances between rows, then the dendrogram of the clustering.
disMa <- dist(x = mat1)
plot(hclust(d = disMa))

#### More detailed clustering: the rows and columns are reorganized based on hierarchical clustering, and the values in the matrix are coded by colors. You can directly visualize millions of numbers in one plot. The hierarchical trees also show the similarity among rows and columns.

# Heat map with values scaled per column; the row side bar picks one of
# three rainbow colors from each row's species code.
heatmap(
  x = mat1,
  scale = "column",
  RowSideColors = rainbow(3)[as.integer(iris$Species)]
)

#### Using pheatmap to improve plot and provide scaling and standardization

library(pheatmap)

## Warning: package 'pheatmap' was built under R version 4.2.1

#### Convert to a matrix
# Measurement columns as a matrix, carrying over the data frame's row names.
mat1 <- as.matrix(iris[, 1:4])
rownames(mat1) <- rownames(iris)

pheatmap(
  mat1,
  scale = "column",
  clustering_method = "average",            # average linkage
  annotation_row = iris[, 5, drop = FALSE], # the 5th column as color bar
  show_rownames = FALSE
)

#### Principal Component Analysis Transform

# Principal component analysis of the four measurement columns. The argument
# is spelled `scale.` (with the trailing dot) as prcomp() documents it,
# instead of relying on partial matching of `scale`.
pcatrans <- prcomp(iris[, 1:4], scale. = TRUE)
print(pcatrans)

## Standard deviations (1, .., p=4):
## [1] 1.7083611 0.9560494 0.3830886 0.1439265
## 
## Rotation (n x k) = (4 x 4):
##                     PC1         PC2        PC3        PC4
## Sepal.Length  0.5210659 -0.37741762  0.7195664  0.2612863
## Sepal.Width  -0.2693474 -0.92329566 -0.2443818 -0.1235096
## Petal.Length  0.5804131 -0.02449161 -0.1421264 -0.8014492
## Petal.Width   0.5648565 -0.06694199 -0.6342727  0.5235971

Plot the variance of each principal component

# Bar chart of the variance carried by each principal component.
screeplot(pcatrans)

# Compact listing of the components stored in the prcomp result.
str(object = pcatrans)

## List of 5
##  $ sdev    : num [1:4] 1.708 0.956 0.383 0.144
##  $ rotation: num [1:4, 1:4] 0.521 -0.269 0.58 0.565 -0.377 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
##   .. ..$ : chr [1:4] "PC1" "PC2" "PC3" "PC4"
##  $ center  : Named num [1:4] 5.84 3.06 3.76 1.2
##   ..- attr(*, "names")= chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
##  $ scale   : Named num [1:4] 0.828 0.436 1.765 0.762
##   ..- attr(*, "names")= chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
##  $ x       : num [1:150, 1:4] -2.26 -2.07 -2.36 -2.29 -2.38 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:4] "PC1" "PC2" "PC3" "PC4"
##  - attr(*, "class")= chr "prcomp"

New coordinate values for each of the 150 samples

# First six rows of the score matrix.
head(pcatrans[["x"]], n = 6L)

##            PC1        PC2         PC3          PC4
## [1,] -2.257141 -0.4784238  0.12727962  0.024087508
## [2,] -2.074013  0.6718827  0.23382552  0.102662845
## [3,] -2.356335  0.3407664 -0.04405390  0.028282305
## [4,] -2.291707  0.5953999 -0.09098530 -0.065735340
## [5,] -2.381863 -0.6446757 -0.01568565 -0.035802870
## [6,] -2.068701 -1.4842053 -0.02687825  0.006586116

Change the column names to get the data ready for plotting

#### Extract the first two columns and convert them to a data frame
# Scores on the first two components plus the species label, in one data
# frame with columns PC1, PC2 and Species.
pcaData <- data.frame(
  pcatrans$x[, c("PC1", "PC2")],
  Species = iris$Species
)

library(ggplot2)

#### define plot area and add data points

# First two principal components; species sets color and point shape.
ggplot(pcaData, aes(x = PC1, y = PC2, color = Species, shape = Species)) +
  geom_point(size = 2)

#### Add plot details: compute the percentage of variance explained by each component

# Share of variance (whole percent) carried by PC1 and PC2.
percentVar <- round(
  100 * summary(pcatrans)$importance["Proportion of Variance", c("PC1", "PC2")]
)

# Same scatter as before, now with axis labels that include the variance
# share, a title, and a square plotting area.
ggplot(pcaData, aes(x = PC1, y = PC2, color = Species, shape = Species)) +
  geom_point(size = 2) +
  labs(
    x = paste0("PC1: ", percentVar[1], "% variance"),
    y = paste0("PC2: ", percentVar[2], "% variance"),
    title = "Principal component analysis (PCA)"
  ) +
  theme(aspect.ratio = 1)