# THE SPARK FOUNDATION: Predict the optimum number of clusters
# and represent it visually using unsupervised ML in R
# TASK-2
# NAME: VIGNESH.A
# Importing required libraries
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.4
## -- Attaching packages ------------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.5
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## Warning: package 'ggplot2' was built under R version 4.0.4
## Warning: package 'tidyr' was built under R version 4.0.4
## Warning: package 'dplyr' was built under R version 4.0.4
## Warning: package 'stringr' was built under R version 4.0.4
## -- Conflicts --------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(cluster)
## Warning: package 'cluster' was built under R version 4.0.4
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.0.4
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(ggplot2)
# Read the dataset.
# Expects "Iris.csv" to be reachable from the current working directory.
# NOTE(review): the plots and the k-means step further down operate on the
# built-in `iris` data frame, not on this `data` object -- confirm which of
# the two is the intended input.
data <- read.csv("Iris.csv")
# View() opens a spreadsheet-style viewer; that is only useful (and only
# reliably available) in an interactive session, so skip it in batch runs.
if (interactive()) {
  View(data)
}
# Exploring the dataset: first rows, then a column-by-column overview.
head(data)
## Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
## 1 1 5.1 3.5 1.4 0.2 Iris-setosa
## 2 2 4.9 3.0 1.4 0.2 Iris-setosa
## 3 3 4.7 3.2 1.3 0.2 Iris-setosa
## 4 4 4.6 3.1 1.5 0.2 Iris-setosa
## 5 5 5.0 3.6 1.4 0.2 Iris-setosa
## 6 6 5.4 3.9 1.7 0.4 Iris-setosa
glimpse(data)
## Rows: 150
## Columns: 6
## $ Id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16...
## $ SepalLengthCm <dbl> 5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.6, 5.0, 4.4, 4.9, 5.4...
## $ SepalWidthCm <dbl> 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7...
## $ PetalLengthCm <dbl> 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5, 1.5...
## $ PetalWidthCm <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1, 0.2...
## $ Species <chr> "Iris-setosa", "Iris-setosa", "Iris-setosa", "Iris-se...
# Scatter plots of the built-in `iris` data; points are coloured and shaped
# by species.

# 1) Sepal length vs. sepal width
# `scatter` keeps only the base plot (data + axis mapping). The point layer
# and the labels are added in the following expression, which is displayed
# through top-level auto-printing.
scatter <- ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Sepal.Width)
)
scatter +
  geom_point(mapping = aes(color = Species, shape = Species)) +
  labs(x = "Sepal Length", y = "Sepal Width", title = "Sepal Length-Width")

# 2) Petal length vs. petal width
# Same construction as above, stored as `scatter1`.
scatter1 <- ggplot(
  data = iris,
  mapping = aes(x = Petal.Length, y = Petal.Width)
)
scatter1 +
  geom_point(mapping = aes(color = Species, shape = Species)) +
  labs(x = "Petal Length", y = "Petal Width", title = "Petal Length-Width")

# Box plots of each measurement by species (built-in `iris` data).
# Every plot fills the boxes by species and overlays the per-species mean as
# a point via stat_summary(). `box`, `box1`, `box2` and `box3` hold only the
# base plot (data + axis mapping); the layers are added when it is displayed.

# 1) Sepal length by species
box <- ggplot(
  data = iris,
  mapping = aes(x = Species, y = Sepal.Length)
)
box +
  geom_boxplot(mapping = aes(fill = Species)) +
  stat_summary(fun = mean, geom = "point") +
  labs(y = "Sepal Length", title = "Iris Boxplot")

# 2) Sepal width by species
box1 <- ggplot(
  data = iris,
  mapping = aes(x = Species, y = Sepal.Width)
)
box1 +
  geom_boxplot(mapping = aes(fill = Species)) +
  stat_summary(fun = mean, geom = "point") +
  labs(y = "Sepal Width", title = "Iris Boxplot")

# 3) Petal length by species
box2 <- ggplot(
  data = iris,
  mapping = aes(x = Species, y = Petal.Length)
)
box2 +
  geom_boxplot(mapping = aes(fill = Species)) +
  stat_summary(fun = mean, geom = "point") +
  labs(y = "Petal Length", title = "Iris Boxplot")

# 4) Petal width by species
box3 <- ggplot(
  data = iris,
  mapping = aes(x = Species, y = Petal.Width)
)
box3 +
  geom_boxplot(mapping = aes(fill = Species)) +
  stat_summary(fun = mean, geom = "point") +
  labs(y = "Petal Width", title = "Iris Boxplot")

# K-means clustering on the two petal measurements of the built-in `iris`
# data. The seed is set immediately before the call so that the random
# starts are reproducible from run to run.
# NOTE(review): the number of clusters is fixed at 3 here; nothing in this
# section derives it from the data (e.g. via an elbow plot).
set.seed(20)
irisCluster <- kmeans(
  x = iris[, c("Petal.Length", "Petal.Width")],
  centers = 3,
  nstart = 20
)
# Auto-print the fitted object (sizes, centres, assignments, sums of squares).
irisCluster
## K-means clustering with 3 clusters of sizes 52, 48, 50
##
## Cluster means:
## Petal.Length Petal.Width
## 1 4.269231 1.342308
## 2 5.595833 2.037500
## 3 1.462000 0.246000
##
## Clustering vector:
## [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [38] 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [75] 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 1 2 2 2 2
## [112] 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2
## [149] 2 2
##
## Within cluster sum of squares by cluster:
## [1] 13.05769 16.29167 2.02200
## (between_SS / total_SS = 94.3 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Plotting the clusters
# Convert the cluster labels to a factor so ggplot2 uses a discrete colour
# scale (one colour per cluster) rather than a continuous gradient.
irisCluster$cluster <- as.factor(irisCluster$cluster)
# Attach the labels to the observations as a real column instead of mapping
# `irisCluster$cluster` inside aes(): an external vector is matched to rows
# purely by position, which breaks silently if the plot data is reordered,
# subset or faceted, and it also yields an awkward legend title.
iris_clustered <- cbind(iris, cluster = irisCluster$cluster)
ggplot(
  iris_clustered,
  aes(x = Petal.Length, y = Petal.Width, color = cluster)
) +
  geom_point()
