# Load the built-in iris data set and preview its first rows.
data("iris")
head(iris)

# Pull out the three measurements used as axes of the 3-D scatter plot.
sep.l <- iris[["Sepal.Length"]]
sep.w <- iris[["Sepal.Width"]]
pet.l <- iris[["Petal.Length"]]

# 3-D marker plot of sepal length, sepal width and petal length,
# coloured by species.
plot_ly(
  x = sep.l,
  y = sep.w,
  z = pet.l,
  type = "scatter3d",
  mode = "markers",
  color = iris$Species
)

# Pairwise plot matrix of every column, coloured by species.
ggpairs(
  data = iris,
  mapping = ggplot2::aes(colour = Species, alpha = 0.04)
)
The histograms of Petal.Length and Petal.Width clearly separate out the Setosa species with very high confidence.
However, the Versicolor and Virginica species overlap. If we look at the scatterplots of Sepal.Length vs Petal.Length and Petal.Width vs Petal.Length, we can distinctly see a separator that can be drawn between the groups of species.
It looks like we can use just Petal.Width and Petal.Length as parameters and come up with a good model. An SVM seems to be a very good model for this type of data.
First, we randomly split the data into a training set and a testing set.
# Seed the random number generator so the same rows are drawn on every run
# (this only affects the sampling below, not the model itself).
# NOTE: this must be a call; `set.seed = 10` would merely create a variable
# named `set.seed` and leave the generator unseeded.
set.seed(10)
# Row indices of the data set
index <- seq_len(nrow(iris))
# Randomly draw one quarter of the rows (rounded down) as test data
test.index <- sample(index, size = length(index) %/% 4)
# Everything that was not drawn is used for training
train <- iris[-test.index, ]
test <- iris[test.index, ]
# Fit an SVM with a linear kernel that predicts Species from all other
# columns of the training set.
svm.model.lin <- svm(
  Species ~ .,
  data = train,
  kernel = 'linear'
)
Predictions on the Training Data Set
# Confusion matrix of the linear SVM on the rows it was trained on.
table(
  Prediction = predict(object = svm.model.lin, newdata = train),
  Truth = train[["Species"]]
)
## Truth
## Prediction setosa versicolor virginica
## setosa 35 0 0
## versicolor 0 41 0
## virginica 0 0 37
# Plot the linear SVM in the Petal.Length / Petal.Width plane.
# The two sepal dimensions are held constant through `slice`.
plot(
  svm.model.lin,
  data = iris,
  formula = Petal.Width ~ Petal.Length,
  slice = list(Sepal.Width = 3, Sepal.Length = 4)
)
# Fit an SVM with a radial-basis kernel that predicts Species from all other
# columns of the training set, using fixed cost and gamma values.
# The kernel name is spelled out in full rather than relying on the
# abbreviation being partially matched.
svm.model.rad <- svm(
  Species ~ .,
  data = train,
  kernel = "radial",
  cost = 15,
  gamma = 0.15
)
# Plot the radial SVM in the Petal.Length / Petal.Width plane.
# The two sepal dimensions are held constant through `slice`.
plot(
  svm.model.rad,
  data = iris,
  formula = Petal.Width ~ Petal.Length,
  slice = list(Sepal.Width = 3, Sepal.Length = 4)
)
# Confusion matrix of the radial SVM on the rows it was trained on.
table(
  Prediction = predict(object = svm.model.rad, newdata = train),
  Truth = train[["Species"]]
)
## Truth
## Prediction setosa versicolor virginica
## setosa 35 0 0
## versicolor 0 40 0
## virginica 0 1 37
# Confusion matrix of the linear SVM on the held-out test rows.
table(
  Prediction = predict(object = svm.model.lin, newdata = test),
  Truth = test[["Species"]]
)
## Truth
## Prediction setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 7 1
## virginica 0 2 12
# Confusion matrix of the radial SVM on the held-out test rows.
table(
  Prediction = predict(object = svm.model.rad, newdata = test),
  Truth = test[["Species"]]
)
## Truth
## Prediction setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 7 3
## virginica 0 2 10
# Scatter plot of petal length against petal width, coloured by species.
ggplot(
  data = iris,
  mapping = aes(x = Petal.Length, y = Petal.Width, colour = Species)
) +
  geom_point()
# Seed the generator so the random k-means starts are repeatable.
set.seed(20)
# Cluster the two petal measurements into 3 groups, keeping the best of
# 20 random starts.
irisCluster <- kmeans(
  x = iris[, c("Petal.Length", "Petal.Width")],
  centers = 3,
  nstart = 20
)
# Show the fitted clustering.
irisCluster
## K-means clustering with 3 clusters of sizes 52, 48, 50
##
## Cluster means:
## Petal.Length Petal.Width
## 1 4.269231 1.342308
## 2 5.595833 2.037500
## 3 1.462000 0.246000
##
## Clustering vector:
## [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [38] 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [75] 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 1 2 2 2 2
## [112] 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2
## [149] 2 2
##
## Within cluster sum of squares by cluster:
## [1] 13.05769 16.29167 2.02200
## (between_SS / total_SS = 94.3 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Store the cluster assignment as a factor so it is treated as a discrete
# grouping rather than a number.
irisCluster[["cluster"]] <- as.factor(irisCluster[["cluster"]])

# Same petal scatter plot as before, now coloured by assigned cluster.
ggplot(
  data = iris,
  mapping = aes(x = Petal.Length, y = Petal.Width, colour = irisCluster$cluster)
) +
  geom_point()

# Cross-tabulate the assigned clusters (rows) against the species (columns).
table(irisCluster[["cluster"]], iris[["Species"]])
##
## setosa versicolor virginica
## 1 0 48 4
## 2 0 2 46
## 3 50 0 0
# Name every k-means cluster after the species that is most common in it.
# k-means cluster numbers are arbitrary and depend on the random starts, so
# the mapping is derived from the cross-tabulation instead of being hard-coded.
cluster.counts <- table(irisCluster$cluster, iris$Species)
cluster.species <- colnames(cluster.counts)[apply(cluster.counts, 1, which.max)]
names(cluster.species) <- rownames(cluster.counts)
cluster.name <- factor(
  unname(cluster.species[as.character(irisCluster$cluster)]),
  levels = levels(iris$Species)
)

# Rows whose cluster-derived label disagrees with the recorded species.
misclustered <- as.character(iris$Species) != as.character(cluster.name)

# Petal scatter plot coloured by species; the disagreeing rows are overlaid
# with a large, translucent red marker.
ggplot() +
  geom_point(data = iris, aes(Petal.Length, Petal.Width, color = Species)) +
  geom_point(
    data = iris[misclustered, ],
    aes(Petal.Length, Petal.Width),
    color = "red",
    alpha = 0.2,
    size = 4
  )

# List the observations whose cluster label and species disagree.
iris[misclustered, ]