#### Load the iris data from a local CSV file and take a first look at it
# NOTE(review): the path is machine-specific; confirm it before running.
iris <- read.csv(file = "D:/AI4OPT/iris.csv")
# First six rows
head(iris, n = 6L)
## sepallength sepalwidth petallength petalwidth class
## 1 5.1 3.5 1.4 0.2 Iris-setosa
## 2 4.9 3.0 1.4 0.2 Iris-setosa
## 3 4.7 3.2 1.3 0.2 Iris-setosa
## 4 4.6 3.1 1.5 0.2 Iris-setosa
## 5 5.0 3.6 1.4 0.2 Iris-setosa
## 6 5.4 3.9 1.7 0.4 Iris-setosa
# Last six rows
tail(iris, n = 6L)
## sepallength sepalwidth petallength petalwidth class
## 145 6.7 3.3 5.7 2.5 Iris-virginica
## 146 6.7 3.0 5.2 2.3 Iris-virginica
## 147 6.3 2.5 5.0 1.9 Iris-virginica
## 148 6.5 3.0 5.2 2.0 Iris-virginica
## 149 6.2 3.4 5.4 2.3 Iris-virginica
## 150 5.9 3.0 5.1 1.8 Iris-virginica
# Per-column summary statistics
summary(iris)
## sepallength sepalwidth petallength petalwidth
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.054 Mean :3.759 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## class
## Length:150
## Class :character
## Mode :character
##
##
##
# Number of rows and columns
dim(iris)
## [1] 150 5
# Ten rows drawn at random; no seed is set, so the selection differs per run
iris[sample(nrow(iris), size = 10), ]
## sepallength sepalwidth petallength petalwidth class
## 49 5.3 3.7 1.5 0.2 Iris-setosa
## 145 6.7 3.3 5.7 2.5 Iris-virginica
## 116 6.4 3.2 5.3 2.3 Iris-virginica
## 39 4.4 3.0 1.3 0.2 Iris-setosa
## 35 4.9 3.1 1.5 0.1 Iris-setosa
## 113 6.8 3.0 5.5 2.1 Iris-virginica
## 8 5.0 3.4 1.5 0.2 Iris-setosa
## 130 7.2 3.0 5.8 1.6 Iris-virginica
## 126 7.2 3.2 6.0 1.8 Iris-virginica
## 128 6.1 3.0 4.9 1.8 Iris-virginica
#### Column classes of the built-in iris dataset
# data() loads the built-in dataset into the workspace under the name `iris`,
# replacing the data frame read from the CSV file.
data("iris")
vapply(iris, class, FUN.VALUE = character(1))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## "numeric" "numeric" "numeric" "numeric" "factor"
library(mlbench)
#### Class distribution of the built-in iris dataset
# The built-in dataset names its label column `Species`; `class` is the column
# name used by the CSV file, so `iris$class` is NULL here.
data("iris")
y <- iris$Species
# Count each species once and reuse the table for both columns
class_counts <- table(y)
cbind(freq = class_counts, percentage = prop.table(class_counts) * 100)
## freq percentage
## setosa 50 33.33333
## versicolor 50 33.33333
## virginica 50 33.33333
# Per-column summary statistics
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
# Summary of the label factor: one count per species
summary(y)
## setosa versicolor virginica
## 50 50 50
#### Histogram of each of the four numeric columns, side by side
data("iris")
# par() returns the previous settings; keep them so the 1 x 4 layout can be
# undone afterwards instead of leaking into later base-graphics plots.
old_par <- par(mfrow = c(1, 4))
for (i in seq_len(4)) {
  hist(iris[, i], main = names(iris)[i])
}
par(old_par)
#### Try density plots for another look at each variable's distribution
data("iris")
# par() returns the previous settings; keep them so the 1 x 4 layout can be
# undone afterwards instead of leaking into later base-graphics plots.
old_par <- par(mfrow = c(1, 4))
for (i in seq_len(4)) {
  plot(density(iris[, i]), main = names(iris)[i])
}
par(old_par)
#### Since the dataset is multivariate, a correlation plot of the
#### attributes may show the relationships between them more clearly
# corrplot() comes from the corrplot package, which has to be attached first
library(corrplot)
data("iris")
# Pairwise correlations of the four numeric columns (cor() defaults to Pearson)
correlations <- cor(iris[, 1:4])
# Draw the correlation matrix with one circle per cell
corrplot(correlations, method = "circle")
#### Scatter plot matrix of every column against every other column
data("iris")
pairs(iris)
#### Scatter plot matrix again, with the points coloured by species
data("iris")
pairs(Species ~ ., data = iris, col = iris$Species)
library(ggplot2)
#### Petal length against petal width; colour and point shape mark the species
ggplot(iris, aes(x = Petal.Length, y = Petal.Width)) +
  geom_point(aes(color = Species, shape = Species))
#### Box plot of sepal length per species with the jittered points on top
ggplot(iris, aes(x = Species, y = Sepal.Length, color = Species)) +
  geom_boxplot() +
  geom_jitter(position = position_jitter(width = 0.2))
#### Overlaid density curves of petal length, one fill colour per species
ggplot(iris, aes(x = Petal.Length, fill = Species)) +
  geom_density(alpha = 0.3)
#### The same density curves, split into one facet per species
ggplot(iris, aes(x = Petal.Length, fill = Species)) +
  geom_density(alpha = 0.3) +
  facet_wrap(~ Species, nrow = 3)
#### Hierarchical clustering of the four numeric columns
# Columns 1 to 4 as a matrix
mat1 <- as.matrix(iris[, seq_len(4)])
# Pairwise distances between the rows of the matrix
disMa <- dist(mat1)
# Cluster on those distances and draw the resulting dendrogram
plot(hclust(d = disMa))
#### More detailed clustering: the rows and columns are reorganized based
#### on hierarchical clustering, and the values in the matrix are coded by
#### colors. You can directly visualize millions of numbers in one plot. The
#### hierarchical trees also show the similarity among rows and columns.
# Heat map of the matrix, scaled per column; the side bar gives every row one
# of three rainbow colours, picked by the integer code of its species.
heatmap(
  x = mat1,
  scale = "column",
  RowSideColors = rainbow(3)[as.integer(iris$Species)]
)
#### Using pheatmap to improve the plot and provide scaling and
#### standardization
library(pheatmap)
## Warning: package 'pheatmap' was built under R version 4.2.1
#### Columns 1 to 4 as a matrix
mat1 <- as.matrix(iris[, seq_len(4)])
#### Give the matrix the same row names as the data frame
# NOTE(review): pheatmap is documented to match annotation rows to matrix rows
# by row name - confirm against the installed version.
rownames(mat1) <- rownames(iris)
pheatmap(
  mat1,
  scale = "column",
  # average linkage
  clustering_method = "average",
  # the Species column (column 5) as the row color bar
  annotation_row = iris["Species"],
  show_rownames = FALSE
)
#### Principal component analysis of the four numeric columns
# The argument is spelled `scale.` in prcomp(); writing it in full avoids
# relying on partial argument matching. TRUE scales every column to unit
# variance before the components are computed.
pcatrans <- prcomp(iris[, 1:4], scale. = TRUE)
print(pcatrans)
## Standard deviations (1, .., p=4):
## [1] 1.7083611 0.9560494 0.3830886 0.1439265
##
## Rotation (n x k) = (4 x 4):
## PC1 PC2 PC3 PC4
## Sepal.Length 0.5210659 -0.37741762 0.7195664 0.2612863
## Sepal.Width -0.2693474 -0.92329566 -0.2443818 -0.1235096
## Petal.Length 0.5804131 -0.02449161 -0.1421264 -0.8014492
## Petal.Width 0.5648565 -0.06694199 -0.6342727 0.5235971
# Plot of the variance of each component
plot(pcatrans)
#### Show the structure of the fitted object
str(pcatrans)
## List of 5
## $ sdev : num [1:4] 1.708 0.956 0.383 0.144
## $ rotation: num [1:4, 1:4] 0.521 -0.269 0.58 0.565 -0.377 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## .. ..$ : chr [1:4] "PC1" "PC2" "PC3" "PC4"
## $ center : Named num [1:4] 5.84 3.06 3.76 1.2
## ..- attr(*, "names")= chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## $ scale : Named num [1:4] 0.828 0.436 1.765 0.762
## ..- attr(*, "names")= chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## $ x : num [1:150, 1:4] -2.26 -2.07 -2.36 -2.29 -2.38 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : NULL
## .. ..$ : chr [1:4] "PC1" "PC2" "PC3" "PC4"
## - attr(*, "class")= chr "prcomp"
# First rows of the component scores (one row per observation)
head(pcatrans$x)
## PC1 PC2 PC3 PC4
## [1,] -2.257141 -0.4784238 0.12727962 0.024087508
## [2,] -2.074013 0.6718827 0.23382552 0.102662845
## [3,] -2.356335 0.3407664 -0.04405390 0.028282305
## [4,] -2.291707 0.5953999 -0.09098530 -0.065735340
## [5,] -2.381863 -0.6446757 -0.01568565 -0.035802870
## [6,] -2.068701 -1.4842053 -0.02687825 0.006586116
#### Scores on the first two components plus the species label, as one
#### data frame with the columns PC1, PC2 and Species
pcaData <- data.frame(
  pcatrans$x[, c("PC1", "PC2")],
  Species = iris$Species
)
library(ggplot2)
#### Scatter plot of the first two components; colour and shape mark species
ggplot(pcaData, aes(PC1, PC2, color = Species, shape = Species)) +
  geom_point(size = 2)
#### Percentage of variance explained by PC1 and PC2, rounded to whole numbers
variance_share <- summary(pcatrans)$importance["Proportion of Variance", ]
percentVar <- round(100 * variance_share[c("PC1", "PC2")])
#### Same scatter plot with axis labels, a title and a square plotting area
ggplot(pcaData) +
  aes(PC1, PC2, color = Species, shape = Species) +
  geom_point(size = 2) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  ggtitle("Principal component analysis (PCA)") +
  theme(aspect.ratio = 1)