library(knitr)
library(class)
library(tidyverse)
library(GGally)
library(ggplot2)

head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
# Minor changes to the data set: rename the attribute columns and recode the
# Species levels for nicer labels in the plots below
iris2 <- iris %>%
  rename(`Sepal length`=Sepal.Length,
         `Sepal width`=Sepal.Width,
         `Petal length`=Petal.Length,
         `Petal width`=Petal.Width) %>%
  mutate(Species=fct_recode(Species, "Setosa"="setosa",
                            "Versicolor"="versicolor",
                            "Virginica"="virginica"))



# Density plot of each attribute, coloured by species
iris2 %>%
  gather(Attributes, value, 1:4) %>%
  ggplot(aes(x=value, fill=Species)) +
  geom_density(colour="black", alpha=0.5) +
  facet_wrap(~Attributes, scales="free_x") +
  labs(x="Values", y="Density",
       title="Iris data set",
       subtitle="Density plot for each attribute") +
  theme_bw() +
  theme(legend.position="bottom",
        legend.title=element_blank())

# Violin plot for each attribute
iris2 %>%
  gather(Attributes, value, 1:4) %>%
  ggplot(aes(x=reorder(Attributes, value, FUN=median), y=value, fill=Attributes)) +
  geom_violin(show.legend=FALSE) +
  geom_boxplot(width=0.05, fill="white") +
  labs(title="Iris data set",
       subtitle="Violin plot for each attribute") +
  theme_bw() +
  theme(axis.title.y=element_blank(),
        axis.title.x=element_blank())

# Boxplot for each attribute
iris2 %>%
  gather(Attributes, value, 1:4) %>%
  ggplot(aes(x=reorder(Attributes, value, FUN=median), y=value, fill=Attributes)) +
  geom_boxplot(show.legend=FALSE) +
  labs(title="Iris data set",
       subtitle="Boxplot for each attribute") +
  theme_bw() +
  theme(axis.title.y=element_blank(),
        axis.title.x=element_blank())

# Scatter plot and correlations
ggpairs(cbind(iris2, Cluster=as.factor(iris2$Species)),
        columns=1:4, aes(colour=Cluster, alpha=0.5),
        lower=list(continuous="points"),
        axisLabels="none", switch="both") +
  theme_bw() 
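
# The same pairwise relationships summarised as a plain correlation matrix,
# an optional numeric companion to the ggpairs() panels above:
round(cor(iris2[, 1:4]), 2)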

# Standardization (z-score normalization) of all columns except Species
dataNorm <- iris
dataNorm[, -5] <- scale(iris[, -5])
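
# Optional sanity check: scale() centres each column to mean 0 and rescales
# it to standard deviation 1, so these summaries should be ~0 and ~1.
colMeans(dataNorm[, -5])
apply(dataNorm[, -5], 2, sd)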

# Reproducible results
set.seed(1234)

# Split into roughly 70% train and 30% test
ind <- sample(2, nrow(dataNorm), replace=TRUE, prob=c(0.7, 0.3))
trainData <- dataNorm[ind==1,]
testData <- dataNorm[ind==2,]
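
# Note that sample(2, ..., prob=c(0.7, 0.3)) gives only an approximate 70/30
# split (here 112 training and 38 test rows). An exact split could be drawn
# from row indices instead, e.g. as sketched below; it is left commented out
# so the results that follow are unchanged.
# idx <- sample(nrow(dataNorm), size=round(0.7*nrow(dataNorm)))
# trainData <- dataNorm[idx, ]
# testData  <- dataNorm[-idx, ]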

# Execution of k-NN with k=1
KnnTestPrediction_k1 <- knn(trainData[,-5], testData[,-5],
                            trainData$Species, k=1, prob=TRUE)

# Execution of k-NN with k=2
KnnTestPrediction_k2 <- knn(trainData[,-5], testData[,-5],
                            trainData$Species, k=2, prob=TRUE)

# Execution of k-NN with k=3
KnnTestPrediction_k3 <- knn(trainData[,-5], testData[,-5],
                            trainData$Species, k=3, prob=TRUE)

# Execution of k-NN with k=4
KnnTestPrediction_k4 <- knn(trainData[,-5], testData[,-5],
                            trainData$Species, k=4, prob=TRUE)
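
# With prob=TRUE, knn() returns the proportion of neighbour votes for the
# winning class as the "prob" attribute of the predicted factor; note that
# class ties (possible for even k such as k=2) are broken at random.
# A quick look at the vote proportions for k=3:
head(attr(KnnTestPrediction_k3, "prob"))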

# Confusion matrix of KnnTestPrediction_k1
table(testData$Species, KnnTestPrediction_k1)
##             KnnTestPrediction_k1
##              setosa versicolor virginica
##   setosa         10          0         0
##   versicolor      0         12         0
##   virginica       0          2        14
# Classification accuracy of KnnTestPrediction_k1
sum(KnnTestPrediction_k1==testData$Species)/length(testData$Species)*100
## [1] 94.73684
# Confusion matrix of KnnTestPrediction_k2
table(testData$Species, KnnTestPrediction_k2)
##             KnnTestPrediction_k2
##              setosa versicolor virginica
##   setosa         10          0         0
##   versicolor      0         12         0
##   virginica       0          3        13
# Classification accuracy of KnnTestPrediction_k2
sum(KnnTestPrediction_k2==testData$Species)/length(testData$Species)*100
## [1] 92.10526
# Confusion matrix of KnnTestPrediction_k3
table(testData$Species, KnnTestPrediction_k3)
##             KnnTestPrediction_k3
##              setosa versicolor virginica
##   setosa         10          0         0
##   versicolor      0         12         0
##   virginica       0          2        14
# Classification accuracy of KnnTestPrediction_k3
sum(KnnTestPrediction_k3==testData$Species)/length(testData$Species)*100
## [1] 94.73684
# Confusion matrix of KnnTestPrediction_k4
table(testData$Species, KnnTestPrediction_k4)
##             KnnTestPrediction_k4
##              setosa versicolor virginica
##   setosa         10          0         0
##   versicolor      0         12         0
##   virginica       0          2        14
# Classification accuracy of KnnTestPrediction_k4
sum(KnnTestPrediction_k4==testData$Species)/length(testData$Species)*100
## [1] 94.73684
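
# The accuracy calculation above is repeated verbatim for each k; a small
# helper function keeps it in one place (a convenience sketch using the same
# formula as above):
knnAccuracy <- function(prediction, truth) {
  mean(prediction == truth) * 100
}
knnAccuracy(KnnTestPrediction_k1, testData$Species)  # identical to the 94.73684 computed above
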
# Containers for the predictions and accuracies
KnnTestPrediction <- list()
accuracy <- numeric()

# From k=1 to k=100...
for(k in 1:100){
  
  # KnnTestPrediction for each k
  KnnTestPrediction[[k]] <- knn(trainData[,-5], testData[,-5], trainData$Species, k, prob=TRUE)
  
  # Accuracy for each k   
  accuracy[k] <- sum(KnnTestPrediction[[k]]==testData$Species)/length(testData$Species)*100
  
}

# Accuracy vs Choice of k
plot(accuracy, type="b", col="dodgerblue", cex=1, pch=20,
     xlab="k, number of neighbors", ylab="Classification accuracy", 
     main="Accuracy vs Neighbors")

# Add lines indicating k with best accuracy
abline(v=which(accuracy==max(accuracy)), col="darkorange", lwd=1.5)

# Add line for max accuracy seen
abline(h=max(accuracy), col="grey", lty=2)

# Add line for min accuracy seen 
abline(h=min(accuracy), col="grey", lty=2)
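
# Report the best-performing values of k numerically as well; which k values
# reach the maximum depends on the random split and the tie-breaking above.
max(accuracy)
which(accuracy == max(accuracy))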