Project 1 k-NN_Prashant

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Example embedded chunk: summarize the built-in `cars` dataset.
summary(cars)

##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

We will implement k-NN to identify flower species based on sepal and petal length and width.

# Print the current working directory; "iris.csv" is read relative to it.
getwd()

## [1] "C:/Users/prasnaya/Desktop/Personal/python/Data_Science/DS With R-Saharan/Projects"

# Load the iris measurements from the working directory.
# stringsAsFactors = TRUE keeps Species a factor on R >= 4.0 as well, where
# read.csv() no longer converts strings to factors by default.
data <- read.csv("iris.csv", stringsAsFactors = TRUE)
# Inspect the column types and the first few values of each column.
str(data)

## 'data.frame':    150 obs. of  6 variables:
##  $ Id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ SepalLengthCm: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ SepalWidthCm : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ PetalLengthCm: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ PetalWidthCm : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species      : Factor w/ 3 levels "Iris-setosa",..: 1 1 1 1 1 1 1 1 1 1 ...

The data is ordered by species (all rows of one species are grouped together), so we use random numbers to reshuffle the rows of the iris data.

# Fix the RNG seed so the shuffle below is reproducible.
set.seed(1000)
# Draw one uniform random number per row, then reorder the rows by those draws.
rand_run <- runif(n = nrow(data))
data <- data[order(rand_run), ]


# Show the first six rows to check that the rows are now in shuffled order.
head(data)

head(data) shows the first six rows of the reshuffled data. To get to know the min & max of the sepal and petal measurements, run summary(data).

Data visualization with help of ggplot - scatterplot

#install.packages("ggplot2")
library(ggplot2)

# Sepal length vs. sepal width; colour and point shape both encode the species.
scatter <- ggplot(
  data = data,
  mapping = aes(x = SepalLengthCm, y = SepalWidthCm)
)
scatter +
  geom_point(mapping = aes(color = Species, shape = Species)) +
  labs(x = "Sepal Length", y = "Sepal Width", title = "Sepal Length-Width")

# Petal length vs. petal width, drawn the same way.
scatter <- ggplot(
  data = data,
  mapping = aes(x = PetalLengthCm, y = PetalWidthCm)
)
scatter +
  geom_point(mapping = aes(color = Species, shape = Species)) +
  labs(x = "Petal Length", y = "Petal Width", title = "Petal Length-Width")

# reshape2 provides melt().
library(reshape2)
# Reshape to long format: Id and Species stay as identifier columns, and the
# four measurement columns are stacked into `variable` / `value` pairs.
data1 <- melt(data, id.vars = c("Id", "Species"))

# Print the long-format data.
data1

# Bar chart of the measurement values per species, one dodged bar per
# measurement type, with a fixed colour and readable legend label for each.
bar1 <- ggplot(
  data = data1,
  mapping = aes(x = Species, y = value, fill = variable)
)
bar1 +
  geom_col(position = "dodge") +
  scale_fill_manual(
    name = "Measurements",
    values = c("orange", "blue", "darkgreen", "purple"),
    breaks = c("SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"),
    labels = c("Sepal Length", "Sepal Width", "Petal Length", "Petal Width")
  )

Based on the summary of the data set and the plots above, we can easily see that sepal length, sepal width, petal length and petal width are not on the same scale.

We can use a function to normalize the values

Create a normalize function

# Min-max rescaling: subtract the smallest value of x and divide by the
# spread (max - min), so the smallest value maps to 0 and the largest to 1.
# min() and max() are taken over all of x at once.
normalize_fun <- function(x) {
  lowest <- min(x)
  highest <- max(x)
  (x - lowest) / (highest - lowest)
}

Normalizing the iris data

# Rescale each measurement column (columns 2 to 5 of data) to [0, 1] on its
# own. Passing the whole data frame to normalize_fun() in one call would take
# min() and max() over all four columns together, which leaves the features
# on different scales and lets the larger ones dominate the k-NN distance.
normalized_data <- data[, 2:5]
normalized_data[] <- lapply(normalized_data, normalize_fun)

# Each column should now report Min. 0 and Max. 1.
summary(normalized_data)

##  SepalLengthCm     SepalWidthCm    PetalLengthCm     PetalWidthCm    
##  Min.   :0.5385   Min.   :0.2436   Min.   :0.1154   Min.   :0.00000  
##  1st Qu.:0.6410   1st Qu.:0.3462   1st Qu.:0.1923   1st Qu.:0.02564  
##  Median :0.7308   Median :0.3718   Median :0.5449   Median :0.15385  
##  Mean   :0.7363   Mean   :0.3787   Mean   :0.4691   Mean   :0.14085  
##  3rd Qu.:0.8077   3rd Qu.:0.4103   3rd Qu.:0.6410   3rd Qu.:0.21795  
##  Max.   :1.0000   Max.   :0.5513   Max.   :0.8718   Max.   :0.30769

# Print the entire normalized data frame for inspection.
normalized_data

# Load the class package, which provides knn(). library() stops with an error
# if the package is not installed, so no additional require() call is needed.
library(class)

Creating training & testing dataset

Training rows: 1:130

Testing rows: 131:150

# The first 130 rows of the shuffled data train the model; the remaining rows
# (131:150 for the 150-row iris data) are held out for testing.
n_train <- 130L
stopifnot(nrow(data) > n_train)
train_rows <- seq_len(n_train)
test_rows <- seq(n_train + 1L, nrow(data))

# Feature sets using all four normalized measurements.
data_training <- normalized_data[train_rows, ]
data_testing <- normalized_data[test_rows, ]

# Feature sets using only the first two columns (sepal length and width).
data_training1 <- normalized_data[train_rows, 1:2]
data_testing1 <- normalized_data[test_rows, 1:2]

# Species labels (column 6 of data) for the same rows; both feature sets
# share these targets, so they are assigned once.
data_training_target <- data[train_rows, 6]
data_testing_target <- data[test_rows, 6]

K value: the square root of the total number of records (about 12.25), rounded up to 13, which is the k used below.

# Starting point for k: the square root of the number of records.
sqrt(nrow(data))

## [1] 12.24745

Running the KNN Model

# k-NN on the two sepal features only, using the 13 nearest neighbours.
m1_2_feature <- knn(
  train = data_training1,
  test = data_testing1,
  cl = data_training_target,
  k = 13
)

# Confusion matrix: actual species in rows, predicted species in columns.
table(data_testing_target, m1_2_feature)

##                    m1_2_feature
## data_testing_target Iris-setosa Iris-versicolor Iris-virginica
##     Iris-setosa               3               0              0
##     Iris-versicolor           0               3              4
##     Iris-virginica            0               2              8

# k-NN on all four features, using the 13 nearest neighbours.
m1_4_feature <- knn(
  train = data_training,
  test = data_testing,
  cl = data_training_target,
  k = 13
)
# Confusion matrix: actual species in rows, predicted species in columns.
table(data_testing_target, m1_4_feature)

##                    m1_4_feature
## data_testing_target Iris-setosa Iris-versicolor Iris-virginica
##     Iris-setosa               3               0              0
##     Iris-versicolor           0               7              0
##     Iris-virginica            0               1              9

Project 1 k-NN_Prashant_Nayak

Prashant Nayak

September 29, 2019