Preprocessing and EDA
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'lubridate' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
image_data <- read.csv('C:\\Alliance University\\SEM 3\\ML2\\Puneetha_Submission\\grayscale_image_data.csv')
# Check for any missing values
sum(is.na(image_data))
## [1] 0
# Convert 'label' column to a factor
image_data$label <- as.factor(image_data$label)
# Scale the columns V1 to V10000
preprocess_params <- preProcess(image_data[, 1:10000], method = c("center", "scale"))
image_data_scaled <- predict(preprocess_params, image_data[, 1:10000])
image_data <- cbind(image_data_scaled, label = image_data$label)
#Summary statistics
selected_columns <- image_data[, c(1:5, ncol(image_data))]
summary(selected_columns)
## V1 V2 V3 V4
## Min. :-2.2703 Min. :-2.3135 Min. :-2.3054 Min. :-2.3364
## 1st Qu.:-0.9093 1st Qu.:-0.8668 1st Qu.:-0.8252 1st Qu.:-0.8797
## Median : 0.4768 Median : 0.4276 Median : 0.5032 Median : 0.4237
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.8888 3rd Qu.: 0.8971 3rd Qu.: 0.8953 3rd Qu.: 0.8965
## Max. : 0.9138 Max. : 0.9225 Max. : 0.9207 Max. : 0.9220
## V5 label
## Min. :-2.3110 avocado :116
## 1st Qu.:-0.8400 banana :142
## Median : 0.4155 bitter_gourd:139
## Mean : 0.0000 grape :144
## 3rd Qu.: 0.8974
## Max. : 0.9228
# Plot label distribution
ggplot(image_data, aes(x = label)) +
geom_bar() +
theme_minimal() +
labs(title = "Distribution of Labels", x = "Label", y = "Count")
The bar plot suggests that labels like “grape” and “bitter gourd” have
more instances than others.
#Visualize a grayscale image
visualize_image <- function(image_row) {
image_matrix <- matrix(as.numeric(image_row), nrow = 100, ncol = 100, byrow = TRUE)
image(1:100, 1:100, t(image_matrix)[, nrow(image_matrix):1], col = gray.colors(256), axes = FALSE)
}
par(mfrow = c(2, 2)) # 2x2 grid of images
for (i in 1:4) {
visualize_image(image_data[i, 1:10000])
title(main = image_data$label[i])
}
# Correlation Matrix
subset_data <- image_data[, 1:50]
corrplot::corrplot(cor(subset_data), method = "color", type = "upper", tl.cex = 0.6)
PCA
library(tidyverse)
library(caret)
image_data$label <- as.factor(image_data$label)
preprocess_params <- preProcess(image_data[, 1:10000], method = c("center", "scale"))
image_data_scaled <- predict(preprocess_params, image_data[, 1:10000])
pca_result <- prcomp(image_data_scaled, center = TRUE, scale. = TRUE)
plot(pca_result, type = "l", main = "Scree Plot - Variance Explained by PCs")
explained_variance <- cumsum(pca_result$sdev^2 / sum(pca_result$sdev^2))
n_components <- which(explained_variance > 0.90)[1]
cat("Number of components for 90% variance explained:", n_components, "\n")
## Number of components for 90% variance explained: 82
pca_data <- as.data.frame(pca_result$x[, 1:n_components])
pca_data$label <- image_data$label
PCA was applied to reduce the dimensionality of the data, keeping 82
components to explain 90% of the variance. This step reduces the
computational complexity of the data, making it more manageable for
further analysis.
A scree plot was used to visualize the variance explained by each
principal component, helping to decide how many components to
retain.
Logistic Regression
library(tidyverse)
library(caret)
library(nnet)
library(MLmetrics)
## Warning: package 'MLmetrics' was built under R version 4.3.3
##
## Attaching package: 'MLmetrics'
## The following objects are masked from 'package:caret':
##
## MAE, RMSE
## The following object is masked from 'package:base':
##
## Recall
# The 'label' column is the target, and the rest are the features
# Split training and testing sets
set.seed(123) # For reproducibility
trainIndex <- createDataPartition(pca_data$label, p = 0.7, list = FALSE)
train_data <- pca_data[trainIndex, ]
test_data <- pca_data[-trainIndex, ]
# Logistic regression model
logistic_model <- multinom(label ~ ., data = train_data)
## # weights: 336 (249 variable)
## initial value 528.178152
## iter 10 value 330.530315
## iter 20 value 288.811501
## iter 30 value 284.083106
## iter 40 value 280.197457
## iter 50 value 269.437346
## iter 60 value 251.048568
## iter 70 value 244.313666
## iter 80 value 237.552415
## iter 90 value 224.899849
## iter 100 value 220.146428
## final value 220.146428
## stopped after 100 iterations
predictions <- predict(logistic_model, newdata = test_data)
confusionMatrix(predictions, test_data$label)
## Confusion Matrix and Statistics
##
## Reference
## Prediction avocado banana bitter_gourd grape
## avocado 19 9 10 4
## banana 6 27 3 4
## bitter_gourd 4 2 20 8
## grape 5 4 8 27
##
## Overall Statistics
##
## Accuracy : 0.5812
## 95% CI : (0.5008, 0.6587)
## No Information Rate : 0.2688
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.4418
##
## Mcnemar's Test P-Value : 0.7463
##
## Statistics by Class:
##
## Class: avocado Class: banana Class: bitter_gourd
## Sensitivity 0.5588 0.6429 0.4878
## Specificity 0.8175 0.8898 0.8824
## Pos Pred Value 0.4524 0.6750 0.5882
## Neg Pred Value 0.8729 0.8750 0.8333
## Prevalence 0.2125 0.2625 0.2562
## Detection Rate 0.1187 0.1688 0.1250
## Detection Prevalence 0.2625 0.2500 0.2125
## Balanced Accuracy 0.6881 0.7663 0.6851
## Class: grape
## Sensitivity 0.6279
## Specificity 0.8547
## Pos Pred Value 0.6136
## Neg Pred Value 0.8621
## Prevalence 0.2687
## Detection Rate 0.1688
## Detection Prevalence 0.2750
## Balanced Accuracy 0.7413
accuracy <- mean(predictions == test_data$label)
cat("Accuracy:", accuracy, "\n")
## Accuracy: 0.58125
precision <- Precision(test_data$label, predictions)
recall <- Recall(test_data$label, predictions)
f1 <- F1_Score(test_data$label, predictions)
cat("Precision:", precision, "\n")
## Precision: 0.452381
cat("Recall:", recall, "\n")
## Recall: 0.5588235
cat("F1-Score:", f1, "\n")
## F1-Score: 0.5
A logistic regression model was applied to the PCA-transformed data for multi-class classification. The model achieved an accuracy of around 54%, with moderate precision and recall values for different classes. The confusion matrix shows where the model performed well and where it struggled, such as classifying “banana” with high accuracy but “bitter_gourd” with lower performance.
Clustering
Clusterability
if (!require(hopkins)) {
install.packages("hopkins")
}
## Loading required package: hopkins
## Warning: package 'hopkins' was built under R version 4.3.3
library(hopkins)
hopkins_stat_value <- hopkins(pca_data[, -ncol(pca_data)])
cat("Hopkins Statistic:", hopkins_stat_value, "\n")
## Hopkins Statistic: 1
K-Means
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.3.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
## Warning: package 'cluster' was built under R version 4.3.3
data <- read.csv('C:\\Alliance University\\SEM 3\\ML2\\Puneetha_Submission\\grayscale_image_data.csv')
data_features <- data[, -ncol(data)] # All columns except the label
# Elbow method
set.seed(123)
fviz_nbclust(data_features, kmeans, method = "wss") +
labs(title = "Elbow Plot", x = "Number of Clusters (k)", y = "Total Within-Cluster Sum of Squares")
# Perform K-Means clustering with optimal k (based on the elbow plot)
set.seed(123)
optimal_k <- 2
kmeans_model <- kmeans(data_features, centers = optimal_k, nstart = 25)
data$cluster <- as.factor(kmeans_model$cluster)
# Silhouette Score
silhouette_score <- silhouette(kmeans_model$cluster, dist(data_features))
fviz_silhouette(silhouette_score) +
labs(title = "Silhouette Plot")
## cluster size ave.sil.width
## 1 1 309 0.24
## 2 2 232 0.31
cat("Average Silhouette Width:", mean(silhouette_score[, 3]), "\n")
## Average Silhouette Width: 0.2708388
Hopkins statistic was calculated, which returned a value of 1, indicating that the data is not clusterable. Despite this, K-Means clustering was performed, and the resulting silhouette score was low (~0.2), suggesting poor clustering quality. K-Means clustering with k=4, but given the low silhouette score, these clusters are not well-separated.
Neural Networks
library(caret)
library(neuralnet)
## Warning: package 'neuralnet' was built under R version 4.3.3
##
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
##
## compute
library(ggplot2)
fit_and_evaluate_nn <- function(neurons) {
pca_data <- as.data.frame(pca_data)
pca_data[] <- lapply(pca_data, function(x) {
if (is.factor(x)) {
return(as.numeric(as.character(x)))
} else {
return(x)
}
})
# Split data into training and testing sets
set.seed(123)
train_index <- createDataPartition(pca_data[, ncol(pca_data)], p = 0.8, list = FALSE)
train_data <- pca_data[train_index, ]
test_data <- pca_data[-train_index, ]
# Scale the predictor variables (excluding the target variable)
train_data_scaled <- as.data.frame(scale(train_data[, -ncol(train_data)])) # Scale features only
train_data_scaled$label <- train_data[, ncol(train_data)] # Add label back
set.seed(1)
nn_model <- neuralnet(label ~ ., data = train_data_scaled, hidden = neurons, linear.output = FALSE)
print(class(nn_model))
plot(nn_model)
test_data_scaled <- as.data.frame(scale(test_data[, -ncol(test_data)])) # Scale features only
test_data_scaled$label <- test_data[, ncol(test_data)] # Add label back
nn_predictions <- neuralnet::compute(nn_model, test_data_scaled[, -ncol(test_data_scaled)])$net.result
print("NN Predictions (raw):")
print(nn_predictions)
predicted_labels <- max.col(nn_predictions) # Get the index of the max probability for each prediction
predicted_labels <- factor(predicted_labels, levels = 1:length(levels(test_data$label))) # Adjust the factor levels
predicted_labels <- factor(predicted_labels, levels = levels(test_data_scaled$label))
cat("Predicted Levels:", levels(predicted_labels), "\n")
cat("Actual Levels:", levels(test_data_scaled$label), "\n")
confusion_matrix <- confusionMatrix(predicted_labels, test_data_scaled$label)
print(confusion_matrix)
performance_df <- data.frame(
Actual = test_data_scaled$label,
Predicted = predicted_labels
)
ggplot(performance_df, aes(x = Actual, fill = as.factor(Predicted))) +
geom_bar(position = "dodge") +
labs(title = paste("Actual vs Predicted with", neurons, "Neurons"),
x = "Actual Label",
y = "Count",
fill = "Predicted Label") +
theme_minimal()
}
Constructed and visualized neural networks with 5, 10, and 20 neurons
in a single hidden layer. These models, however, returned low accuracy
(~25.93%), suggesting that the neural networks struggled with the given
data.
The confusion matrices show that the models tend to predict only one
class, indicating a significant performance issue.
Logistic Regression typically provides a straightforward
interpretation of results and is less prone to overfitting, especially
with smaller datasets. Neural Networks, while flexible and powerful, may
struggle to generalize well on limited data, especially when they have a
larger number of parameters relative to the number of training
samples.
Logistic Regression is a simpler model that fits well on datasets with
linear relationships between the features and the target. In cases where
the underlying relationship is linear or nearly linear, Logistic
Regression can be very effective.
In conclusion, while Neural Networks are powerful models capable of
capturing complex relationships in data, they require larger datasets
and careful tuning to avoid overfitting and ensure generalizability.
Logistic Regression, with its simplicity and interpretability, can
outperform more complex models in scenarios with smaller datasets or
when the relationships in the data are primarily linear.