# This project uses the famous iris data set: a small data set of flower measurements that can be used to predict the species of an iris flower.
# Load the ISLR library (note: the iris data set itself ships with base R's datasets package). Then, I will check the head of the iris data frame.
library(ISLR)
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
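# As an additional quick look (my own addition, not required for the analysis),
# summary() shows the per-column distributions of the iris data:
summary(iris)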
# Standardize the Data
# Let's go ahead and standardize this data set, even though it's not strictly necessary here: the iris features are already on similar scales.
# I will use scale() to standardize the feature columns of the iris data set and assign the standardized data to a new variable (standardized.feature).
standardized.feature<-scale(iris[1:4])
# Verify that the scaling worked by checking the variance of the new columns.
var(standardized.feature[,1])
## [1] 1
var(standardized.feature[,2])
## [1] 1
The variances of the first and second columns are both exactly 1, confirming that the scaling worked.
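# As a complementary check (a quick sketch of my own), scale() also centers each
# column, so the column means should all be approximately zero:
round(colMeans(standardized.feature), 10)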
# Let's go ahead and join the standardized data with the response/target/label column (the column with the species names).
final.data<-cbind(standardized.feature,iris[5])
head(final.data)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1   -0.8976739  1.01560199    -1.335752   -1.311052  setosa
## 2   -1.1392005 -0.13153881    -1.335752   -1.311052  setosa
## 3   -1.3807271  0.32731751    -1.392399   -1.311052  setosa
## 4   -1.5014904  0.09788935    -1.279104   -1.311052  setosa
## 5   -1.0184372  1.24503015    -1.335752   -1.311052  setosa
## 6   -0.5353840  1.93331463    -1.165809   -1.048667  setosa
**Train and Test Splits: I will use the caTools library to split the standardized data into train and test sets, using a 70/30 split.**
library(caTools)
set.seed(101)
sample<-sample.split(final.data$Species, SplitRatio = 0.70)
train<-subset(final.data,sample==TRUE)
test<-subset(final.data, sample==FALSE)
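# A quick sanity check (my own addition): sample.split stratifies on the Species
# vector, so the class proportions in the train and test sets should roughly match.
prop.table(table(train$Species))
prop.table(table(test$Species))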
# I will go ahead and build a KNN model,
# then use the knn function from the class library to predict the Species of the test set, starting with k=1.
library(class)
predicted.species<-knn(train[1:4],test[1:4],train$Species, k=1)
predicted.species
## [1] setosa setosa setosa setosa setosa setosa
## [7] setosa setosa setosa setosa setosa setosa
## [13] setosa setosa setosa versicolor versicolor versicolor
## [19] versicolor versicolor virginica versicolor versicolor versicolor
## [25] versicolor versicolor virginica versicolor versicolor versicolor
## [31] virginica virginica virginica virginica virginica virginica
## [37] virginica virginica virginica virginica virginica virginica
## [43] virginica virginica virginica
## Levels: setosa versicolor virginica
# Check the misclassification rate.
mean(test$Species!=predicted.species)
## [1] 0.04444444
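# To see where the few errors occur (an optional check of my own), a confusion
# matrix cross-tabulates the predicted species against the actual species:
table(Predicted=predicted.species, Actual=test$Species)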
# The misclassification error is higher than I expected. Let me see whether I can select a K value that brings the error down.
# Choosing a K Value
# Although the data set is quite small for really getting a feel for a good K value, I will go ahead and create a plot of the error (misclassification) rate for k values ranging from 1 to 10.
predicted.species<-NULL
error.rate<-NULL
# Fit a KNN model for each k from 1 to 10 and record the test misclassification rate
for(i in 1:10){
  set.seed(101)
  predicted.species<-knn(train[1:4],test[1:4],train$Species, k=i)
  error.rate[i]<-mean(test$Species!=predicted.species)
}
library(ggplot2)
k.values<-1:10
error.df<-data.frame(error.rate,k.values)
pl<-ggplot(error.df,aes(x=k.values,y=error.rate)) + geom_point()
pl + geom_line(lty="dotted",color='blue')
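# Rather than eyeballing the plot, we can also read off the best k values
# programmatically (a small sketch of my own):
k.values[error.rate==min(error.rate)]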
**In order to minimize the error rate, I should select a K value between 2 and 6.**
# K-means Clustering
# Now I will take an unsupervised learning approach to the iris data set.
pl<-ggplot(iris, aes(Petal.Length, Petal.Width, color=Species))
pl+geom_point(size=4)
# Well, it looks like the setosa species is already clearly separated, so K-means clustering should have an easier time clustering setosa than the other two species.
set.seed(101)
iris.cluster<-kmeans(iris[,1:4],3,nstart = 20)
iris.cluster
## K-means clustering with 3 clusters of sizes 62, 50, 38
##
## Cluster means:
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     5.901613    2.748387     4.393548    1.433871
## 2     5.006000    3.428000     1.462000    0.246000
## 3     6.850000    3.073684     5.742105    2.071053
##
## Clustering vector:
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [36] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [71] 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 3 3 3
## [106] 3 1 3 3 3 3 3 3 1 1 3 3 3 3 1 3 1 3 1 3 3 1 1 3 3 3 3 3 1 3 3 3 3 1 3
## [141] 3 3 1 3 3 3 1 3 3 1
##
## Within cluster sum of squares by cluster:
## [1] 39.82097 15.15100 23.87947
## (between_SS / total_SS = 88.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
library(cluster)
clusplot(iris,iris.cluster$cluster,color=TRUE, labels=0, lines=0)
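# An aside (my own sketch): when passed the full iris data frame, clusplot()
# coerces the Species factor to numeric codes, so the label column itself can
# influence the 2-D projection. A variant using only the four numeric feature
# columns keeps the plot purely measurement-based:
clusplot(iris[,1:4], iris.cluster$cluster, color=TRUE, shade=TRUE, labels=0, lines=0)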