Data Set

The dataset contain images of Animals, Fruits and Vegetables.The task is to classify and cluster these images based on visual features.The dataset contain main 3 feature.

  1. Animals - This folder contain images of domestic and wild animals.
  2. Fruits - This folder contain images of various fruits.
  3. vegetables - This folder contain images of various vegetables.

importing libraries

suppressPackageStartupMessages({
  suppressWarnings({
    library(magick)
    library(ggplot2)
    library(imager)
    library(dplyr)
    library(caret)
    
    library(hopkins)
    library(clustertend)
    library(cluster)
    library(factoextra)
    library(dbscan)
    
    library(neuralnet)
    library(tidyverse)
    library(GGally)
  })
})

Getting Dataset

Each image is resized into 28*28 pixels and convert into gray scale for uniformality. And having 10 % favorite image (my favorite animal is Dog).Favorite animal is labeled as 1 and others as 0.Contain a total of 325 image and contain 786 (784 is pixel value , favorite animal, category (animal, fruits, vegetables)).

animal_path <- "D:/alliance/3rd sem/machine learning_2/DATASET/img_DS_1/img_DS/Animal"
fruit_path <- "D:/alliance/3rd sem/machine learning_2/DATASET/img_DS_1/img_DS/Fruits"
vegitable_path <- "D:/alliance/3rd sem/machine learning_2/DATASET/img_DS_1/img_DS/vegetable"



img_path <- list(animal_path, fruit_path, vegitable_path)
img_path
## [[1]]
## [1] "D:/alliance/3rd sem/machine learning_2/DATASET/img_DS_1/img_DS/Animal"
## 
## [[2]]
## [1] "D:/alliance/3rd sem/machine learning_2/DATASET/img_DS_1/img_DS/Fruits"
## 
## [[3]]
## [1] "D:/alliance/3rd sem/machine learning_2/DATASET/img_DS_1/img_DS/vegetable"
df <- data.frame(matrix = numeric(), favorite = integer(), category = character(), stringsAsFactors = FALSE)

preprocessing

Each image is processed as follows,

  1. Image Rescaling - the image is resized into 28x28 pixel using magick library.
  2. Gray scaling - image is converted into grayscale image.
  3. data reshaping - each grayscale image is reshaped into 784 1d vector
  4. scaling : all pixel is standardized using scale().
for (paths in img_path){
  image_file <- list.files(paths, recursive = TRUE, full.names = TRUE)
  
  for (image in image_file){
    img <- image_read(image)
    img <- image_resize(img, "28x28!")
    
    img_gray <- image_convert(img, colorspace="gray")
    
    img_data <- image_data(img_gray)
    
    img_matrix <- as.numeric(as.matrix(img_data))
    
    if(grepl("dog \\([0-9]+\\)",image)){
      label <- 1
    }else{
      label <- 0
    }
    
    category <- basename(paths)
    
    df <- rbind(df, c(img_matrix, favorite = label, category = category))
    
  }
}


print(dim(df))
## [1] 325 786
str(df)
## 'data.frame':    325 obs. of  786 variables:
##  $ X.123.   : chr  "123" "103" "109" "145" ...
##  $ X.131.   : chr  "131" "133" "107" "160" ...
##  $ X.135.   : chr  "135" "136" "105" "185" ...
##  $ X.137.   : chr  "137" "126" "107" "207" ...
##  $ X.139.   : chr  "139" "127" "121" "201" ...
##  $ X.146.   : chr  "146" "154" "119" "202" ...
##  $ X.143.   : chr  "143" "152" "106" "197" ...
##  $ X.139..1 : chr  "139" "142" "114" "197" ...
##  $ X.142.   : chr  "142" "155" "131" "163" ...
##  $ X.136.   : chr  "136" "169" "131" "147" ...
##  $ X.139..2 : chr  "139" "169" "129" "171" ...
##  $ X.147.   : chr  "147" "166" "116" "180" ...
##  $ X.145.   : chr  "145" "181" "111" "168" ...
##  $ X.167.   : chr  "167" "193" "116" "168" ...
##  $ X.170.   : chr  "170" "175" "116" "166" ...
##  $ X.167..1 : chr  "167" "164" "114" "149" ...
##  $ X.173.   : chr  "173" "179" "112" "147" ...
##  $ X.169.   : chr  "169" "181" "109" "157" ...
##  $ X.174.   : chr  "174" "185" "120" "167" ...
##  $ X.167..2 : chr  "167" "177" "133" "161" ...
##  $ X.165.   : chr  "165" "169" "117" "159" ...
##  $ X.170..1 : chr  "170" "161" "113" "162" ...
##  $ X.187.   : chr  "187" "151" "114" "163" ...
##  $ X.184.   : chr  "184" "158" "121" "177" ...
##  $ X.169..1 : chr  "169" "140" "130" "180" ...
##  $ X.158.   : chr  "158" "120" "117" "177" ...
##  $ X.156.   : chr  "156" "139" "100" "171" ...
##  $ X.155.   : chr  "155" "137" "97" "173" ...
##  $ X.107.   : chr  "107" "117" "115" "164" ...
##  $ X.121.   : chr  "121" "128" "109" "177" ...
##  $ X.137..1 : chr  "137" "125" "110" "202" ...
##  $ X.144.   : chr  "144" "122" "118" "218" ...
##  $ X.159.   : chr  "159" "148" "136" "207" ...
##  $ X.151.   : chr  "151" "177" "121" "212" ...
##  $ X.139..3 : chr  "139" "152" "108" "229" ...
##  $ X.138.   : chr  "138" "139" "124" "237" ...
##  $ X.134.   : chr  "134" "146" "128" "165" ...
##  $ X.128.   : chr  "128" "158" "139" "157" ...
##  $ X.135..1 : chr  "135" "148" "133" "176" ...
##  $ X.152.   : chr  "152" "137" "121" "176" ...
##  $ X.150.   : chr  "150" "164" "108" "175" ...
##  $ X.169..2 : chr  "169" "180" "103" "177" ...
##  $ X.175.   : chr  "175" "188" "112" "179" ...
##  $ X.179.   : chr  "179" "167" "117" "160" ...
##  $ X.184..1 : chr  "184" "171" "119" "135" ...
##  $ X.177.   : chr  "177" "184" "113" "122" ...
##  $ X.184..2 : chr  "184" "189" "121" "135" ...
##  $ X.187..1 : chr  "187" "174" "136" "167" ...
##  $ X.183.   : chr  "183" "157" "116" "182" ...
##  $ X.187..2 : chr  "187" "163" "113" "197" ...
##  $ X.192.   : chr  "192" "155" "119" "209" ...
##  $ X.188.   : chr  "188" "156" "130" "229" ...
##  $ X.174..1 : chr  "174" "149" "145" "228" ...
##  $ X.169..3 : chr  "169" "129" "139" "226" ...
##  $ X.168.   : chr  "168" "139" "120" "230" ...
##  $ X.167..3 : chr  "167" "137" "103" "230" ...
##  $ X.113.   : chr  "113" "127" "109" "161" ...
##  $ X.123..1 : chr  "123" "121" "107" "171" ...
##  $ X.133.   : chr  "133" "136" "105" "186" ...
##  $ X.145..1 : chr  "145" "131" "122" "178" ...
##  $ X.168..1 : chr  "168" "150" "134" "169" ...
##  $ X.153.   : chr  "153" "165" "114" "142" ...
##  $ X.134..1 : chr  "134" "165" "118" "170" ...
##  $ X.133..1 : chr  "133" "157" "121" "205" ...
##  $ X.129.   : chr  "129" "157" "106" "169" ...
##  $ X.123..2 : chr  "123" "151" "117" "176" ...
##  $ X.141.   : chr  "141" "137" "114" "179" ...
##  $ X.153..1 : chr  "153" "134" "104" "187" ...
##  $ X.153..2 : chr  "153" "156" "117" "188" ...
##  $ X.168..2 : chr  "168" "163" "123" "192" ...
##  $ X.181.   : chr  "181" "184" "117" "191" ...
##  $ X.184..3 : chr  "184" "179" "116" "162" ...
##  $ X.184..4 : chr  "184" "173" "106" "121" ...
##  $ X.179..1 : chr  "179" "185" "113" "92" ...
##  $ X.187..3 : chr  "187" "173" "112" "128" ...
##  $ X.185.   : chr  "185" "180" "133" "182" ...
##  $ X.183..1 : chr  "183" "151" "143" "166" ...
##  $ X.199.   : chr  "199" "127" "120" "219" ...
##  $ X.199..1 : chr  "199" "124" "118" "221" ...
##  $ X.194.   : chr  "194" "114" "131" "229" ...
##  $ X.189.   : chr  "189" "130" "114" "228" ...
##  $ X.181..1 : chr  "181" "133" "117" "223" ...
##  $ X.178.   : chr  "178" "127" "120" "221" ...
##  $ X.172.   : chr  "172" "127" "108" "219" ...
##  $ X.136..1 : chr  "136" "138" "101" "170" ...
##  $ X.129..1 : chr  "129" "146" "107" "167" ...
##  $ X.136..2 : chr  "136" "138" "108" "156" ...
##  $ X.153..3 : chr  "153" "148" "119" "145" ...
##  $ X.165..1 : chr  "165" "141" "120" "164" ...
##  $ X.162.   : chr  "162" "149" "109" "148" ...
##  $ X.149.   : chr  "149" "162" "119" "98" ...
##  $ X.156..1 : chr  "156" "148" "128" "135" ...
##  $ X.150..1 : chr  "150" "161" "126" "183" ...
##  $ X.143..1 : chr  "143" "142" "125" "174" ...
##  $ X.165..2 : chr  "165" "143" "117" "152" ...
##  $ X.167..4 : chr  "167" "151" "130" "183" ...
##  $ X.166.   : chr  "166" "145" "141" "190" ...
##  $ X.175..1 : chr  "175" "163" "134" "187" ...
##  $ X.177..1 : chr  "177" "185" "125" "176" ...
##   [list output truncated]
colnames(df) <- c(paste0("pixel", 1:784), "favorite", "category")

df$favorite <- as.factor(df$favorite)
df$category <- as.factor(df$category)
df$category <- as.numeric(df$category)

Partitioning the data

set.seed(45)
train_index <- createDataPartition(df$favorite, p = .7, list=FALSE)
train_data <- df[train_index, ]
test_data <- df[-train_index, ]

creating a sample Logistic Regression Model

#model <- glm(favorite~., data = train_data, family = binomial)

#summary(model)

because of higher dimention it is hard to train so we use pca to reduce the dimentionality.

PCA

train_data[, !names(train_data) %in% c("favorite", "category")] <- lapply(train_data[, !names(train_data) %in% c("favorite", "category")], as.numeric)
test_data[, !names(train_data) %in% c("favorite", "category")] <- lapply(test_data[, !names(train_data) %in% c("favorite", "category")], as.numeric)


scale_train <- scale(train_data[, !names(train_data) %in% c("favorite", "category")])
scale_test <- scale(test_data[, !names(train_data) %in% c("favorite", "category")])

pca_tr <- prcomp(scale_train, center = FALSE, scale = FALSE)

#sample pca result
pca_tr$x[1:5,1:15]
##            PC1        PC2        PC3        PC4        PC5        PC6       PC7
## 2  -5.49499882  4.2137970 -9.0999637  1.4246030 -0.6610506  0.1498807 -1.090098
## 3   0.06259991  9.9669329  1.4200319  0.8183340 -1.1492677 -0.2524310  2.250656
## 4   2.38621311  0.4521827 -0.8269685 -0.1265222  0.6714670  0.2489085  1.266212
## 5  -5.00227507  4.5019779 -8.7360584  1.4291610 -0.7803301  0.1734669 -1.034691
## 6 -18.35184733 -0.6523587  0.5689522  7.3296221  1.4681680  0.2182316  2.805686
##          PC8        PC9       PC10        PC11      PC12       PC13       PC14
## 2  0.2384838 -0.8628841  1.4436313 -0.10345712 0.1013756 -1.1539528  0.3266641
## 3 -1.6541231 -1.2715212  0.3707211 -3.44942689 1.1149501 -1.1122315  0.3916325
## 4 -0.1715415 -2.8568604  0.6348533 -0.29784121 0.5338896  0.4580468  0.3767122
## 5  0.2914949 -0.8001614  1.3778328 -0.09016435 0.1078102 -1.0785787  0.3151480
## 6  3.8281287  1.0478799 -0.6700217 -0.57399371 1.3592313 -1.5420958 -2.4105858
##         PC15
## 2 -0.5782168
## 3  1.7371740
## 4 -1.6365730
## 5 -0.5438888
## 6 -0.8557334
train_df <- data.frame(pca_tr$x[,1:50], favorite = train_data$favorite)
train_df$category <- train_data$category

test_df <- as.data.frame(predict(pca_tr, newdata = scale_test)[, 1:50])
test_df$favorite <- test_data$favorite
test_df$category <- test_data$category

dim(train_df)
## [1] 228  52
dim(test_df)
## [1] 97 52

part A

Logistic Regrission Model

model <- glm(favorite~., data = train_df, family = binomial)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model)
## 
## Call:
## glm(formula = favorite ~ ., family = binomial, data = train_df)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept)    29.9717 78213.7762   0.000    1.000
## PC1             1.5955  1465.0678   0.001    0.999
## PC2             0.9147  5057.6389   0.000    1.000
## PC3            -0.8162  5660.6583   0.000    1.000
## PC4             2.5604  5282.4072   0.000    1.000
## PC5             0.3466  4432.0143   0.000    1.000
## PC6            -3.0644  6635.5827   0.000    1.000
## PC7            -0.1666  6753.3479   0.000    1.000
## PC8             0.0980 10161.6468   0.000    1.000
## PC9            -0.8527  6419.8291   0.000    1.000
## PC10           -3.6420 14422.5592   0.000    1.000
## PC11            2.8790  9159.9165   0.000    1.000
## PC12           -3.0246 15921.2475   0.000    1.000
## PC13            7.9035 10347.2079   0.001    0.999
## PC14            2.6336 17480.0666   0.000    1.000
## PC15            3.8821 11173.5586   0.000    1.000
## PC16            1.4842 17104.9828   0.000    1.000
## PC17           -2.3405 14062.5917   0.000    1.000
## PC18            1.6280 10938.2085   0.000    1.000
## PC19            9.0613 18515.1289   0.000    1.000
## PC20          -11.4049 10355.9018  -0.001    0.999
## PC21           -4.0665 12791.2083   0.000    1.000
## PC22           -0.1198 21282.9384   0.000    1.000
## PC23           -7.2489 12685.9469  -0.001    1.000
## PC24            5.4674 16291.3743   0.000    1.000
## PC25            0.6129 19780.4054   0.000    1.000
## PC26           -1.2440 12825.5521   0.000    1.000
## PC27            5.1228 12043.7652   0.000    1.000
## PC28           -2.7165 16298.6711   0.000    1.000
## PC29           -8.5042 11553.3708  -0.001    0.999
## PC30           -7.5231 19323.7645   0.000    1.000
## PC31            1.7186 14905.3893   0.000    1.000
## PC32           12.1632 14930.3652   0.001    0.999
## PC33            4.7950 35677.5566   0.000    1.000
## PC34           12.6892 16694.5793   0.001    0.999
## PC35            5.8766 23895.1123   0.000    1.000
## PC36           -6.7885 17594.4142   0.000    1.000
## PC37            7.7477 20334.8434   0.000    1.000
## PC38            5.9656 14029.4113   0.000    1.000
## PC39           -1.2887 35022.3576   0.000    1.000
## PC40           -3.6231 21427.8016   0.000    1.000
## PC41           -4.2559 32695.1357   0.000    1.000
## PC42            5.8827 25508.3381   0.000    1.000
## PC43           -0.2859 18743.3619   0.000    1.000
## PC44           11.7459 24603.6469   0.000    1.000
## PC45            3.0038 22086.9724   0.000    1.000
## PC46           -4.5530 27117.9812   0.000    1.000
## PC47            1.9184 20499.6500   0.000    1.000
## PC48           14.1599 16513.4860   0.001    0.999
## PC49           -5.6806 25612.5983   0.000    1.000
## PC50            6.0938 15349.6183   0.000    1.000
## category      -67.0967 45259.3985  -0.001    0.999
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1.5344e+02  on 227  degrees of freedom
## Residual deviance: 1.8989e-08  on 176  degrees of freedom
## AIC: 104
## 
## Number of Fisher Scoring iterations: 25

Accuracy

y_predict <- predict(model, newdata = test_df, type = "response")
predict_label <- ifelse(y_predict > 0.5, 1 , 0)

table(predict = predict_label, actual = test_df$favorite)
##        actual
## predict  0  1
##       0 87 10
accuracy <- mean(predict_label == test_data$favorite)
accuracy
## [1] 0.8969072
data.frame(predict_label, test_df$favorite)
##     predict_label test_df.favorite
## 1               0                0
## 7               0                0
## 8               0                0
## 15              0                0
## 16              0                0
## 17              0                0
## 20              0                0
## 21              0                0
## 23              0                0
## 24              0                0
## 25              0                0
## 26              0                0
## 29              0                0
## 32              0                0
## 33              0                1
## 34              0                1
## 37              0                1
## 40              0                1
## 42              0                1
## 48              0                1
## 53              0                1
## 59              0                1
## 60              0                1
## 66              0                1
## 67              0                0
## 70              0                0
## 72              0                0
## 80              0                0
## 82              0                0
## 86              0                0
## 87              0                0
## 103             0                0
## 105             0                0
## 108             0                0
## 110             0                0
## 111             0                0
## 116             0                0
## 120             0                0
## 125             0                0
## 126             0                0
## 132             0                0
## 134             0                0
## 137             0                0
## 138             0                0
## 139             0                0
## 141             0                0
## 142             0                0
## 143             0                0
## 144             0                0
## 145             0                0
## 149             0                0
## 150             0                0
## 151             0                0
## 152             0                0
## 157             0                0
## 158             0                0
## 165             0                0
## 167             0                0
## 175             0                0
## 177             0                0
## 178             0                0
## 184             0                0
## 186             0                0
## 190             0                0
## 192             0                0
## 193             0                0
## 196             0                0
## 198             0                0
## 199             0                0
## 200             0                0
## 201             0                0
## 208             0                0
## 209             0                0
## 217             0                0
## 219             0                0
## 220             0                0
## 221             0                0
## 222             0                0
## 223             0                0
## 228             0                0
## 233             0                0
## 236             0                0
## 237             0                0
## 243             0                0
## 246             0                0
## 249             0                0
## 254             0                0
## 256             0                0
## 263             0                0
## 269             0                0
## 270             0                0
## 280             0                0
## 281             0                0
## 290             0                0
## 291             0                0
## 295             0                0
## 325             0                0

Since the Favorite animal constitues only 10% of the dataset, this introduce significant class inbalance. This imbalance means that the model may endup biased towards predicting the non favorite class.

Part B

Clustering

df[, !names(df) %in% c("favorite", "category")] <- lapply(df[, !names(df) %in% c("favorite", "category")], as.numeric)
scaled_df <-scale(df[, !names(df) %in% c("favorite", "category")])

dim(scaled_df)
## [1] 325 784
#clusterable ???
set.seed(123)
hopkinsts <- hopkins(scaled_df, n=nrow(scaled_df)-1)
## Warning: Package `clustertend` is deprecated.  Use package `hopkins` instead.
print(hopkinsts)
## $H
## [1] 0.3868863

The Hopkins statistic is used to assess the cluster tendency of a dataset, i.e., whether the data contains meaningful clusters or if the data points are randomly distributed. The value of the Hopkins statistic ranges from 0 to 1:

A value close to 1 indicates that the dataset is highly clustered and not random. A value close to 0.5 suggests that the data is uniformly distributed or random and does not exhibit cluster structure. A value close to 0 would indicate anti-clustering or regular spacing between points.

Hopkins value is less that .5 which indicate this dataset is not good for clustering.

Anyway we will try doing clustering.

k-means Cluster

km <- kmeans(scaled_df, centers = 3, nstart = 25)

fviz_cluster(km,data = scaled_df, geom = "point")

km$cluster
##   [1] 2 2 2 2 2 1 1 2 2 1 1 2 2 1 2 2 1 1 1 2 1 1 1 1 1 2 1 2 2 1 1 2 2 1 2 2 1
##  [38] 1 3 2 2 2 2 2 3 2 3 2 2 2 1 3 2 2 2 2 3 2 1 1 2 2 2 2 2 2 2 1 2 2 2 1 1 1
##  [75] 2 2 1 2 2 1 1 1 1 1 2 2 1 1 2 1 2 2 2 2 1 2 1 1 2 1 2 1 1 1 2 1 1 1 2 2 1
## [112] 2 2 2 1 2 2 1 1 1 2 2 1 1 2 1 2 2 2 1 2 1 1 2 1 1 2 1 1 1 2 2 3 2 2 2 3 1
## [149] 3 1 1 1 1 3 1 1 1 1 1 2 2 1 2 1 2 3 2 3 2 2 1 2 2 3 3 2 3 3 3 3 3 1 2 3 1
## [186] 1 3 1 2 1 2 2 3 1 2 3 3 1 3 1 2 2 1 2 1 3 3 3 2 3 3 3 2 3 2 2 3 3 1 1 1 3
## [223] 3 3 3 2 3 3 3 1 1 3 3 1 2 1 1 1 1 2 1 2 2 2 2 3 3 3 2 1 1 3 2 3 3 1 2 3 2
## [260] 3 2 3 1 3 2 2 1 3 3 3 3 2 3 3 3 3 3 2 2 3 3 3 2 3 3 2 2 1 3 1 3 3 1 1 1 2
## [297] 3 3 3 1 3 3 2 1 3 2 2 3 3 2 2 1 3 2 3 3 1 1 3 1 1 3 3 3 2
df$category <- as.factor(df$category)
df$category <- as.numeric(df$category)
df$category
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [186] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [223] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [260] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [297] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
table(km$cluster, df$category)
##    
##      1  2  3
##   1 73 19 20
##   2 83 20 26
##   3  9 33 42

using pca

pca_tr <- prcomp(scaled_df, center = FALSE, scale = FALSE)

scaled_df_ <- data.frame(pca_tr$x[,1:50])

hopkinsts <- hopkins(scaled_df_, n=nrow(scaled_df_)-1)
## Warning: Package `clustertend` is deprecated.  Use package `hopkins` instead.
print(hopkinsts)
## $H
## [1] 0.3293306
km <- kmeans(scaled_df_, centers = 3, nstart = 25)

fviz_cluster(km,data = scaled_df_, geom = "point")

table(km$cluster, df$category)
##    
##      1  2  3
##   1  9 33 42
##   2 73 19 20
##   3 83 20 26

we tried to Cluster using category (animals, fruits and vegetable), but the result suggest that clusters are not seperated according to categories, and shows some degree of overlaping across the categories. cluster 1 has a majority of 3rd item. but still contain 9 animals, 33 fruits and 42 vegetables together. cluster 2 has a majority of animals.

Hierarchical Clustering

hc1 <- hclust(dist(scaled_df), method = "ward.D2")
plot(hc1)

clusters <- cutree(hc1, k = 3)

table(clusters, df$category)
##         
## clusters   1   2   3
##        1  38  24  28
##        2 124  29  32
##        3   3  19  28

cluster 1 almost equally contain all category cluster 2 has majority of category animals. cluster 3 has majority of 3rd category vegetables

DBSCAN

kNNdistplot(x = scaled_df, k = 5)
abline(h = 28, col="black")

knndistplot shows that the eps(radius = 30) if we reduce it we will get a lot of outliers. no proper cluster.

dbs <- dbscan(scaled_df, eps = 30, minPts = 3)
table(dbs$cluster, df$category)
##    
##       1   2   3
##   0   1   0   3
##   1 164  72  85
fviz_cluster(dbs, data=scaled_df,geom = "point")

we used dbscan to cluster, cluster 0 represents the noise points. Mainly 1 cluster is there which contain almost all the image.

Part c

ANN

ANN with 1 hiddenlayer and 5 Neurons

netlayer <- c(5)
net.network <-neuralnet(favorite ~ . , train_df, hidden = netlayer, threshold =  0.01)
plot(net.network)

nn.predict <- predict(net.network, test_df[, !names(test_df) %in% 'favorite'])
predicted_class <- ifelse(nn.predict[,1] >.5, 1, 0)
confusion_matrix <- table(predicted = predicted_class, actual = test_df$favorite)
accuracy <-sum(diag(confusion_matrix)) / sum(confusion_matrix)
accuracy
## [1] 0.8969072

ANN with 1 hiddenlayer and 10 Neurons

netlayer <- c(10)
net.network <-neuralnet(favorite ~ . , train_df, hidden = netlayer, threshold =  0.01)
plot(net.network)

nn.predict <- predict(net.network, test_df[, !names(test_df) %in% 'favorite'])
predicted_class <- ifelse(nn.predict[,1] >.5, 1, 0)
confusion_matrix <- table(predicted = predicted_class, actual = test_df$favorite)
accuracy <-sum(diag(confusion_matrix)) / sum(confusion_matrix)
accuracy
## [1] 0.8969072

ANN with 1 hiddenlayer and 20 Neurons

netlayer <- c(20)
net.network <-neuralnet(favorite ~ . , train_df, hidden = netlayer, threshold =  0.01)

nn.predict <- predict(net.network, test_df[, !names(test_df) %in% 'favorite'])
predicted_class <- ifelse(nn.predict[,1] >.5, 1, 0)
confusion_matrix <- table(predicted = predicted_class, actual = test_df$favorite)
accuracy <-sum(diag(confusion_matrix)) / sum(confusion_matrix)
accuracy
## [1] 0.8969072

comparison

Both the logistic Regression and ANN model with different configuration achived accuracy of 89.69%.

netlayer <- c(10,20,5)
net.network <-neuralnet(favorite ~ . , train_df, hidden = netlayer, threshold =  0.01)

nn.predict <- predict(net.network, test_df[, !names(test_df) %in% 'favorite'])
predicted_class <- ifelse(nn.predict[,1] >.5, 1, 0)
confusion_matrix <- table(predicted = predicted_class, actual = test_df$favorite)
accuracy <-sum(diag(confusion_matrix)) / sum(confusion_matrix)
accuracy
## [1] 0.8969072