Major Project

# Most popular package for using KNN models and m-means is the standard library class:
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(GGally)

## Warning: package 'GGally' was built under R version 4.3.2

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

library(ggplot2)
library(plyr)

## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## 
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## 
## The following object is masked from 'package:purrr':
## 
##     compact

require(gridExtra)

## Loading required package: gridExtra
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine

library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(class)

# Importing the dataset
data = iris

# will check the summary of the iris data set
summary(data)

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
##

# Histogram for each species
data %>%
  gather(Attributes, Value, 1:4) %>%
  ggplot(aes(x=Value, fill=Attributes)) +
  geom_histogram(colour="black") +
  facet_wrap(~Species) +
  theme_bw() +
  labs(x="Values", y="Frequency",
       title="Iris data set",
       subtitle="Histogram for each species") +
  theme(legend.title=element_blank(),
        legend.position="bottom")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Density plot for each species
data %>%
  gather(Attributes, value, 1:4) %>%
  ggplot(aes(x=value, fill=Species)) +
  geom_density(colour="black", alpha=0.5) +
  facet_wrap(~Attributes, scales="free_x") +
  labs(x="Values", y="Density",
       title="Iris data set",
       subtitle="Density plot for each attribute") +
  theme_bw() +
  theme(legend.position="bottom",
        legend.title=element_blank())

# check the species data with pie chart
table(data$Species)

## 
##     setosa versicolor  virginica 
##         50         50         50

pie(table(data$Species), main = "Pie Chart of the Iris data set Species", 
    col = c("orange1", "chocolate", "coral"), radius = 1)

# Scatter plot and correlations
ggpairs(cbind(data, Cluster=as.factor(data$Species)),
        columns=1:4, aes(colour=Cluster, alpha=0.5),
        lower=list(continuous="points"),
        axisLabels="none", switch="both") +
        theme_bw()

# Will check the stats of "Sepal.Length" with respect to Species
aggregate(Sepal.Length  ~ Species, data, summary)

##      Species Sepal.Length.Min. Sepal.Length.1st Qu. Sepal.Length.Median
## 1     setosa             4.300                4.800               5.000
## 2 versicolor             4.900                5.600               5.900
## 3  virginica             4.900                6.225               6.500
##   Sepal.Length.Mean Sepal.Length.3rd Qu. Sepal.Length.Max.
## 1             5.006                5.200             5.800
## 2             5.936                6.300             7.000
## 3             6.588                6.900             7.900

# will see the data distribution using box plot
par(mfrow=c(2,2))
boxplot(Sepal.Length  ~ Species, data, main = "Sepal Length wrt Species", col = "lightpink3")
boxplot(Sepal.Width   ~ Species, data, main = "Sepal Width wrt Species", col = "antiquewhite1")
boxplot(Petal.Length  ~ Species, data, main = "Petal Length wrt Species", col = "lightskyblue4")
boxplot(Petal.Width  ~ Species, data, main = "Petal Width wrt Species", col = "orange1")

row_labels = data[,5]

# Encoding the target feature as factor
data$Species <- as.numeric(data$Species)


# Scale the data since we will be using distance formulas on the data
# and we want to reduce complexity and computation when computing 
# especially when our datasets are huge!
data[,c("Sepal.Length","Sepal.Width","Petal.Length","Petal.Width")] <- scale(
  data[,c("Sepal.Length","Sepal.Width","Petal.Length","Petal.Width")])

# Split into test and train 80/20
set.seed(123)

size <- floor(0.8 *  nrow(data))


train_ind <- sample(seq_len(nrow(data)), size = size)

train_labels <- data[train_ind, 5]

data_train <- data[train_ind,1:4]
data_test <- data[-train_ind,1:4]

data_test_labels <- row_labels[-train_ind]
# Fit KNN Model
predictions <- knn(train = data_train,
                   test = data_test,
                   cl = train_labels,
                   k= 11)

# Notice that I am only getting 2 dimensions 
plot_predictions <- data.frame(
  data_test$Sepal.Length,
  data_test$Sepal.Width,
  data_test$Petal.Length,
  data_test$Petal.Width,
  predicted = predictions)

colnames(plot_predictions) <- c("Sepal.Length",
                                "Sepal.Width",
                                "Petal.Length",
                                "Petal.Width",
                                'predicted')

# Visualize the KNN algorithm results.
p1 <- ggplot(plot_predictions, aes(Petal.Length, Petal.Width, color = predicted, fill = predicted)) + 
  geom_point(size = 5) + 
  geom_text(aes(label=data_test_labels),hjust=1, vjust=2) +
  ggtitle("Predicted relationship between Petal Length and Width") +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(legend.position = "none")


p2 <- ggplot(plot_predictions, aes(Sepal.Length, Sepal.Width, color = predicted, fill = predicted)) + 
  geom_point(size = 5) + 
  geom_text(aes(label=data_test_labels),hjust=1, vjust=2) +
  ggtitle("Predicted relationship between Sepal Length and Sepal") +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(legend.position = "none")

grid.arrange(p1, p2, ncol=2)

# iris.labels
iris.labels = iris$Species
table(iris.labels)

## iris.labels
##     setosa versicolor  virginica 
##         50         50         50

iris_data <- iris[1:4]

# Scale data
iris_data_scale <- scale(iris_data)
# Distance
iris_data <- dist(iris_data_scale)

# Calculate how many clusters you need
# Within Sum Squares
fviz_nbclust(iris_data_scale, kmeans, method = "wss")+
  labs(subtitle="Elbow Method")

# Kmeans
km.out <- kmeans(iris_data_scale, centers=3,nstart=100)
print(km.out)

## K-means clustering with 3 clusters of sizes 53, 47, 50
## 
## Cluster means:
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1  -0.05005221 -0.88042696    0.3465767   0.2805873
## 2   1.13217737  0.08812645    0.9928284   1.0141287
## 3  -1.01119138  0.85041372   -1.3006301  -1.2507035
## 
## Clustering vector:
##   [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [38] 3 3 3 3 3 3 3 3 3 3 3 3 3 2 2 2 1 1 1 2 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 1
##  [75] 1 2 2 2 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2 2 1 2 2 2 2
## [112] 2 2 1 1 2 2 2 2 1 2 1 2 1 2 2 1 2 2 2 2 2 2 1 1 2 2 2 1 2 2 2 1 2 2 2 1 2
## [149] 2 1
## 
## Within cluster sum of squares by cluster:
## [1] 44.08754 47.45019 47.35062
##  (between_SS / total_SS =  76.7 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

# Visualize the clustering algorithm results.
km.clusters<-km.out$cluster
rownames(iris_data_scale)<-paste(iris$Species, 1:dim(iris)[1], sep = "_")
fviz_cluster(list(data=iris_data_scale, cluster = km.clusters))

table(km.clusters, iris$Species)

##            
## km.clusters setosa versicolor virginica
##           1      0         39        14
##           2      0         11        36
##           3     50          0         0

Major Project

Kinley phuntsho

2023-11-07