The Iris dataset contains four features (the length and width of the sepals and petals) for 50 samples from each of three Iris species (Setosa, Virginica, and Versicolor), 150 samples in total. The dataset was introduced by the British statistician and biologist Ronald Fisher in 1936.
In this project, I will first explore, transform, and visualize the Iris dataset. Second, I will run a hypothesis test using a t-test (H0: Setosa and Versicolor have the same average petal length) and an ANOVA (H0: Setosa, Virginica, and Versicolor have the same average petal length). Third, I will apply a k-means clustering model to group the samples and compare the resulting clusters with the Iris species.
#Install package
install.packages('dplyr')
trying URL 'https://cran.rstudio.com/bin/macosx/big-sur-arm64/contrib/4.2/dplyr_1.1.2.tgz'
Content type 'application/x-gzip' length 1590090 bytes (1.5 MB)
==================================================
downloaded 1.5 MB
The downloaded binary packages are in
/var/folders/yl/fk25qksj205djhgmm8ksmfv40000gn/T//RtmpbwJpCa/downloaded_packages
install.packages('ggplot2')
trying URL 'https://cran.rstudio.com/bin/macosx/big-sur-arm64/contrib/4.2/ggplot2_3.4.2.tgz'
Content type 'application/x-gzip' length 4299292 bytes (4.1 MB)
==================================================
downloaded 4.1 MB
The downloaded binary packages are in
/var/folders/yl/fk25qksj205djhgmm8ksmfv40000gn/T//RtmpbwJpCa/downloaded_packages
install.packages('GGally')
trying URL 'https://cran.rstudio.com/bin/macosx/big-sur-arm64/contrib/4.2/GGally_2.1.2.tgz'
Content type 'application/x-gzip' length 1636372 bytes (1.6 MB)
==================================================
downloaded 1.6 MB
The downloaded binary packages are in
/var/folders/yl/fk25qksj205djhgmm8ksmfv40000gn/T//RtmpbwJpCa/downloaded_packages
install.packages('cluster')
trying URL 'https://cran.rstudio.com/bin/macosx/big-sur-arm64/contrib/4.2/cluster_2.1.4.tgz'
Content type 'application/x-gzip' length 610729 bytes (596 KB)
==================================================
downloaded 596 KB
The downloaded binary packages are in
/var/folders/yl/fk25qksj205djhgmm8ksmfv40000gn/T//RtmpbwJpCa/downloaded_packages
# reshape2 is attached later with library(reshape2) and provides melt(), but it
# was missing from the install list, so a fresh machine would fail on that call.
install.packages('reshape2')
install.packages('fpc')
trying URL 'https://cran.rstudio.com/bin/macosx/big-sur-arm64/contrib/4.2/fpc_2.2-10.tgz'
Content type 'application/x-gzip' length 838841 bytes (819 KB)
==================================================
downloaded 819 KB
The downloaded binary packages are in
/var/folders/yl/fk25qksj205djhgmm8ksmfv40000gn/T//RtmpbwJpCa/downloaded_packages
library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
library(reshape2)
library(ggplot2)
library(GGally)
Registered S3 method overwritten by 'GGally':
method from
+.gg ggplot2
library(cluster)
library(fpc)
# Explore the iris dataset
# Print the whole data frame, then load the built-in copy into the workspace.
print(iris)
data("iris")
# Preview the top of the table: 6 rows (the head() default), then 10 rows.
head(iris, n = 6L)
head(iris, n = 10L)
NA
#STEP 1: EXPLORE DATA ----------------------------
#1.1 summary(): per-column summary statistics for the numeric columns and
#    the number of rows per level for the species factor
summary(iris)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100 setosa :50
1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300 versicolor:50
Median :5.800 Median :3.000 Median :4.350 Median :1.300 virginica :50
Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
#1.2 names(): show the column names
names(iris)
[1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"
# Lower-case the column names; every later step refers to the columns by
# these lower-case names (e.g. iris$sepal.length, iris$species).
names(iris) <- tolower(names(iris))
#1.3 dim(): number of rows and columns
dim(iris)
[1] 150 5
#1.4 class(): the class of the whole object
class(iris)
[1] "data.frame"
#1.5 typeof() and str(): storage type and compact structure of a column
typeof(iris$sepal.length)
[1] "double"
str(iris$sepal.length)
num [1:150] 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
# For the species column typeof() reports "integer" while class() reports
# "factor": str() below shows the integer codes (1 1 1 ...) behind the labels.
typeof(iris$species)
[1] "integer"
class(iris$species)
[1] "factor"
str(iris$species)
Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
#STEP 2: TRANSFORM DATA --------------------------
#2.1 Row subsets built with logical masks
virginica <- iris[with(iris, species == "virginica"), ]
virginica2 <- iris[with(iris, species == "virginica" & sepal.length > 6), ]
head(virginica)
#2.2 select(): keep only the two sepal columns
selected <- iris %>% select(sepal.length, sepal.width)
head(selected)
#2.3 mutate(): add a length/width ratio and a flag for "length more than
#    twice the width"
newcol <- iris %>%
  mutate(longer = sepal.length / sepal.width) %>%
  mutate(longer.2x = sepal.length > 2 * sepal.width)
tail(newcol)
#2.4 arrange(): sort ascending by sepal width, then re-sort descending
newcol <- newcol %>% arrange(sepal.width)
newcol <- newcol %>% arrange(desc(sepal.width))
#2.5 melt(): unpivot the table (wide -> long), keeping species as the id
#    column and naming the measurement column "size"
iris.melt <- melt(iris, id.vars = "species", variable.name = "size")
head(iris.melt)
#STEP 3: VISUALIZE DATA -----------------------
Warning message:
In do_once((if (is_R_CMD_check()) stop else warning)("The function xfun::isFALSE() will be deprecated in the future. Please ", :
The function xfun::isFALSE() will be deprecated in the future. Please consider using base::isFALSE(x) or identical(x, FALSE) instead.
#3.1 hist(): histograms with base graphics
# Default histogram of sepal length.
hist(iris$sepal.length)
# Same histogram with a custom fill colour, title and axis labels.
hist(iris$sepal.length,
     col = 'light blue',
     main = 'Histogram',
     xlab = 'Sepal.Length',
     ylab = 'Frequency')
# Overlay of sepal length (red) and petal length (green).
# Drawing the second histogram with add = TRUE reuses the axes of the first
# one, so axes fitted to sepal length alone cut off the short petal lengths.
# Both histograms are therefore computed first (plot = FALSE) and the axis
# limits are chosen to cover the breaks and counts of both.
sepal.hist <- hist(iris$sepal.length, breaks = 20, plot = FALSE)
petal.hist <- hist(iris$petal.length, breaks = 30, plot = FALSE)
plot(sepal.hist,
     col = 'red',
     main = 'Histogram',
     xlab = 'Size',
     xlim = range(sepal.hist$breaks, petal.hist$breaks),
     ylim = c(0, max(sepal.hist$counts, petal.hist$counts)))
plot(petal.hist, col = 'green', add = TRUE)
legend('topright',
       c('Sepal Length', 'Petal Length'),
       fill = c('red', 'green'))
#3.2 ggplot2: histograms drawn from the long-format table
# One panel: the measurements overlaid as semi-transparent bars.
ggplot(data = iris.melt, mapping = aes(x = value, fill = size)) +
  geom_histogram(position = "identity", alpha = 0.6, color = "#e9ecef")
# Same layers, split into one facet per measurement.
ggplot(data = iris.melt, mapping = aes(x = value, fill = size)) +
  geom_histogram(position = "identity", alpha = 0.6, color = "#e9ecef") +
  facet_wrap(~ size)
#3.3 boxplot(): box-and-whisker plots
# Sepal length by species (base graphics, formula interface).
boxplot(
  sepal.length ~ species,
  data = iris,
  col = "light blue",
  border = "black",
  main = "Sepal Length by Species",
  xlab = "Species",
  ylab = "Sepal Length"
)
# Every measurement side by side, read from the long-format table.
boxplot(
  value ~ size,
  data = iris.melt,
  col = "light blue",
  border = "black",
  main = "Compare different size",
  xlab = "Size",
  ylab = "Value"
)
# ggplot2 version with the individual observations jittered over the boxes.
ggplot(data = iris.melt, mapping = aes(x = size, y = value, fill = size)) +
  geom_boxplot() +
  geom_jitter(size = 0.4, alpha = 0.9, color = "black")
#3.4 plot(): scatter plots
# Plot the whole data frame, then only the four measurement columns.
plot(iris)
plot(iris[, 1:4])
# Single scatter plot: sepal width on x, sepal length on y.
plot(
  iris$sepal.width, iris$sepal.length,
  pch = 19,
  col = "steelblue",
  main = "Scatterplot",
  xlab = "Sepal Width",
  ylab = "Sepal Length"
)
# Pairs plot of the measurements, coloured by the fifth column (species), with
# a wide outer margin on the right-hand side.
pairs(iris[, 1:4], col = iris[[5]], oma = c(4, 4, 6, 12))
# NOTE(review): xpd = TRUE is set here but no base-graphics drawing follows in
# this section; presumably a legend() call in the right margin was intended.
# The setting is also never restored. Confirm.
par(xpd = TRUE)
# ggplot2 scatter plot coloured by species.
ggplot(data = iris, mapping = aes(x = sepal.length, y = sepal.width, color = species)) +
  geom_point(size = 5)
# GGally pairs plot of the four measurements, coloured by species.
ggpairs(
  iris,
  mapping = aes(color = species, alpha = 0.5),
  columns = 1:4
)
H0: Setosa and Versicolor have the same average petal length
H1: Setosa and Versicolor do not have the same average petal length
#STEP 4: HYPOTHESIS TESTING WITH T-test ---------------
# Split out the two species being compared.
setosa <- subset(iris, species == "setosa")
versicolor <- subset(iris, species == "versicolor")
# Two-sample t-test on petal length (setosa as x, versicolor as y).
t.test(setosa$petal.length, versicolor$petal.length)
Welch Two Sample t-test
data: setosa$petal.length and versicolor$petal.length
t = -39.493, df = 62.14, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-2.939618 -2.656382
sample estimates:
mean of x mean of y
1.462 4.260
H0: Setosa, Virginica, and Versicolor have the same average petal length
H1: At least one of Setosa, Virginica, and Versicolor has a different average petal length
#STEP 5: ANALYSIS WITH ANOVA -----------------------------
# One-way ANOVA: petal length explained by species.
petal.length.aov <- aov(petal.length ~ species, data = iris)
# Print the ANOVA table (degrees of freedom, sums of squares, F value, p-value).
summary(petal.length.aov)
Df Sum Sq Mean Sq F value Pr(>F)
species 2 437.1 218.55 1180 <2e-16 ***
Residuals 147 27.2 0.19
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Post-hoc Tukey HSD on the fitted ANOVA: pairwise differences in mean petal
# length between species, with 95% family-wise confidence intervals.
TukeyHSD(petal.length.aov)
Tukey multiple comparisons of means
95% family-wise confidence level
Fit: aov(formula = petal.length ~ species, data = iris)
$species
diff lwr upr p adj
versicolor-setosa 2.798 2.59422 3.00178 0
virginica-setosa 4.090 3.88622 4.29378 0
virginica-versicolor 1.292 1.08822 1.49578 0
#STEP 6: CATEGORIZE SPECIES WITH K-MEANS CLUSTERING -------------
# Work on a copy without the species label so the clustering only sees the
# four measurement columns.
iris.test <- iris
iris.test$species <- NULL
head(iris.test)
# kmeans() chooses its starting centres at random. Fix the seed so the result
# is reproducible, and try several random starts (nstart) so that a single
# unlucky start cannot leave the algorithm in a poor local optimum.
set.seed(123)
kmeans.result <- kmeans(iris.test, centers = 3, nstart = 25)
# Cross-tabulate the true species against the assigned cluster. Cluster ids
# are arbitrary labels, so the column order can differ from a previous run.
table(iris$species, kmeans.result$cluster)
1 2 3
setosa 0 0 50
versicolor 2 48 0
virginica 36 14 0
# Sepal length against sepal width, coloured by assigned cluster.
plot(iris.test[, c("sepal.length", "sepal.width")], col = kmeans.result$cluster)
# Cluster visualisations from the fpc (plotcluster) and cluster (clusplot)
# packages, both drawn from the measurement columns and the cluster vector.
plotcluster(iris.test, kmeans.result$cluster)
clusplot(iris.test, kmeans.result$cluster, shade = TRUE, color = TRUE)