#     THE SPARK FOUNDATION: Predict the optimum number of clusters
#                  and represent it visually using unsupervised ML in R
# TASK-2
#NAME: VIGNESH.A



#importing required libraries
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.4
## -- Attaching packages ------------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.5
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## Warning: package 'ggplot2' was built under R version 4.0.4
## Warning: package 'tidyr' was built under R version 4.0.4
## Warning: package 'dplyr' was built under R version 4.0.4
## Warning: package 'stringr' was built under R version 4.0.4
## -- Conflicts --------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(cluster)
## Warning: package 'cluster' was built under R version 4.0.4
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.0.4
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(ggplot2)

# Read the dataset from a CSV file in the current working directory.
data <- read.csv("Iris.csv")
# View() opens a spreadsheet-style data viewer, which is only useful (and
# only reliably available) in an interactive session. Skip it when the
# script is run in batch mode, e.g. via Rscript.
if (interactive()) {
  View(data)
}

# Explore the dataset: first rows, then a column-wise summary of types.
head(data)
##   Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm     Species
## 1  1           5.1          3.5           1.4          0.2 Iris-setosa
## 2  2           4.9          3.0           1.4          0.2 Iris-setosa
## 3  3           4.7          3.2           1.3          0.2 Iris-setosa
## 4  4           4.6          3.1           1.5          0.2 Iris-setosa
## 5  5           5.0          3.6           1.4          0.2 Iris-setosa
## 6  6           5.4          3.9           1.7          0.4 Iris-setosa
glimpse(data)
## Rows: 150
## Columns: 6
## $ Id            <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16...
## $ SepalLengthCm <dbl> 5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.6, 5.0, 4.4, 4.9, 5.4...
## $ SepalWidthCm  <dbl> 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7...
## $ PetalLengthCm <dbl> 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5, 1.5...
## $ PetalWidthCm  <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1, 0.2...
## $ Species       <chr> "Iris-setosa", "Iris-setosa", "Iris-setosa", "Iris-se...
# Scatter plots
# NOTE(review): these plots read the built-in `iris` data frame (columns
# Sepal.Length, Petal.Width, ...), not the `data` object loaded from
# "Iris.csv" earlier in the script -- confirm this is intended.

# 1) Sepal length vs. sepal width; colour and point shape both encode species.
scatter <- ggplot(iris, aes(Sepal.Length, Sepal.Width))
scatter +
  geom_point(aes(colour = Species, shape = Species)) +
  labs(x = "Sepal Length", y = "Sepal Width", title = "Sepal Length-Width")

# 2) Petal length vs. petal width, with the same species encoding.
scatter1 <- ggplot(iris, aes(Petal.Length, Petal.Width))
scatter1 +
  geom_point(aes(colour = Species, shape = Species)) +
  labs(x = "Petal Length", y = "Petal Width", title = "Petal Length-Width")

# Box plots: one per measurement, grouped and filled by species.
# Every plot stacks the same layers on a different base plot, so the shared
# layers are built once here. Adding a list to a ggplot adds each element
# in order.
#   y_label: text for the y axis.
species_box_layers <- function(y_label) {
  list(
    geom_boxplot(aes(fill = Species)),
    ylab(y_label),
    ggtitle("Iris Boxplot"),
    # Overlay the per-species mean as a point on top of each box.
    stat_summary(fun = mean, geom = "point")
  )
}

# 1) Sepal length by species
box <- ggplot(iris, aes(Species, Sepal.Length))
box + species_box_layers("Sepal Length")

# 2) Sepal width by species
box1 <- ggplot(iris, aes(Species, Sepal.Width))
box1 + species_box_layers("Sepal Width")

# 3) Petal length by species
box2 <- ggplot(iris, aes(Species, Petal.Length))
box2 + species_box_layers("Petal Length")

# 4) Petal width by species
box3 <- ggplot(iris, aes(Species, Petal.Width))
box3 + species_box_layers("Petal Width")

# K-means clustering on the two petal measurements.
# Fix the RNG seed so the random starting centres (and therefore the cluster
# numbering shown below) are reproducible.
set.seed(20)
# Fit 3 clusters; nstart = 20 tries 20 random initialisations and keeps the
# best one.
irisCluster <- kmeans(
  x = iris[, c("Petal.Length", "Petal.Width")],
  centers = 3,
  nstart = 20
)
irisCluster
## K-means clustering with 3 clusters of sizes 52, 48, 50
## 
## Cluster means:
##   Petal.Length Petal.Width
## 1     4.269231    1.342308
## 2     5.595833    2.037500
## 3     1.462000    0.246000
## 
## Clustering vector:
##   [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [38] 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [75] 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 1 2 2 2 2
## [112] 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2
## [149] 2 2
## 
## Within cluster sum of squares by cluster:
## [1] 13.05769 16.29167  2.02200
##  (between_SS / total_SS =  94.3 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Plot the clusters in petal-length / petal-width space.
# Store the labels as a factor so ggplot2 maps them to a discrete colour
# scale instead of a continuous gradient.
irisCluster$cluster <- as.factor(irisCluster$cluster)
# Attach the labels to the plotting data as a column rather than writing
# `irisCluster$cluster` inside aes(): aesthetics should refer to columns of
# the plot data, and this also gives the legend a readable title.
ggplot(
  cbind(iris, cluster = irisCluster$cluster),
  aes(Petal.Length, Petal.Width, colour = cluster)
) +
  geom_point() +
  labs(colour = "Cluster")