#RakshithVijayR_2023MDTS07ALA019_Submitted to Dr. K A Venkatesh_MachineLearning2_3rd SEM_AllianceUniversity_17 Sep 2024

1) Small Writeup on the Dataset

The Human Motion Primitives (HMP) Dataset contains accelerometer data that captures everyday human activities. Each data file records three-dimensional accelerometer readings (X, Y, and Z axes) for one recording of a specific activity, such as walking, climbing stairs, brushing teeth, or drinking from a glass. The dataset is often used for human activity recognition tasks and is valuable in motion analysis, health monitoring, and wearable device research.

Load required libraries

library(dplyr)       # Data manipulation
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)     # Data visualization
library(gridExtra)   # For arranging plots
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(reshape2)    # For reshaping data for correlation matrix
library(cluster)     # Clustering and distance calculations
library(factoextra)  # For enhanced silhouette plots
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

Suppress warnings globally

# Set the global "warn" option to -1 for everything that runs after this line.
# NOTE(review): per the R options() documentation a negative "warn" value makes
# R ignore all warnings; that would include the warning() calls load_data()
# uses to report skipped files -- confirm this is intended.
options(warn = -1)

Function to handle different column numbers and irregular lines

#' Read one accelerometer recording and keep its first three columns.
#'
#' @param file Path to a space-separated text file without a header row.
#' @return A data frame with columns X, Y, Z (the first three columns of the
#'   file) and `label` (the file's base name). Returns NULL, after raising a
#'   single warning, when the file cannot be read or has fewer than three
#'   columns.
load_data <- function(file) {
  file_name <- basename(file)

  data <- tryCatch(
    # fill = TRUE pads short or irregular lines with NA instead of failing.
    read.table(file, header = FALSE, sep = " ", fill = TRUE, strip.white = TRUE),
    error = function(e) {
      # Report the underlying reason so a skipped file can be diagnosed.
      warning(paste("Error in reading file:", file_name, "- skipping:", conditionMessage(e)))
      NULL
    }
  )

  # A failed read was already reported above; return now instead of emitting a
  # second warning that would print an empty column count for a NULL result.
  if (is.null(data)) {
    return(NULL)
  }

  if (ncol(data) < 3) {
    warning(paste("Skipping file:", file_name, "- unexpected number of columns:", ncol(data)))
    return(NULL)
  }

  data <- data[, 1:3]  # Only the first 3 columns are used
  colnames(data) <- c("X", "Y", "Z")
  data$label <- file_name  # Tag every row with the file it came from
  data
}

Load data from all folders

# Activity sub-folders of the dataset directory, in the order they are loaded.
# NOTE(review): each *_MODEL folder is listed next to its plain counterpart;
# verify the two do not hold copies of the same recordings, otherwise those
# rows are counted twice downstream.
folders <- c(
  "Liedown_bed",
  "Use_telephone",
  "Eat_soup",
  "Eat_meat",
  "Descend_stairs",
  "Comb_hair",
  "Brush_teeth",
  "Walk_MODEL",
  "Walk",
  "Standup_chair_MODEL",
  "Standup_chair",
  "Sitdown_chair_MODEL",
  "Sitdown_chair",
  "Pour_water_MODEL",
  "Pour_water",
  "Getup_bed_MODEL",
  "Getup_bed",
  "Drink_glass_MODEL",
  "Drink_glass",
  "Climb_stairs_MODEL",
  "Climb_stairs"
)

# Empty container; one data frame per folder is stored here by name.
data_list <- list()

Iterate through each folder and load files

# Root directory that holds one sub-folder per activity.
hmp_base_dir <- "C:/Users/raksh/Downloads/HMP_Dataset"

# Read every recording in every activity folder and store the stacked rows in
# data_list under the folder's name.
for (folder in folders) {
  folder_path <- file.path(hmp_base_dir, folder)

  # list.files() interprets `pattern` as a regular expression, not a shell
  # glob, so anchor on a literal ".txt" suffix rather than using "*.txt".
  file_list <- list.files(path = folder_path, pattern = "\\.txt$", full.names = TRUE)

  # A missing or empty folder would otherwise contribute zero rows unnoticed.
  if (length(file_list) == 0) {
    warning(paste("No .txt files found in folder:", folder_path))
  }

  folder_data <- bind_rows(lapply(file_list, load_data))
  data_list[[folder]] <- folder_data
}

Combine all data into one data frame

# Stack the per-folder data frames held in data_list into one data frame.
hmp_data <- data_list %>%
  bind_rows()

Check for and handle NA values

# Report (but do not modify) rows where any of the three axis values is NA.
if (anyNA(hmp_data$X) || anyNA(hmp_data$Y) || anyNA(hmp_data$Z)) {
  print("Data contains NA values. Inspecting rows with NA values...")
  problematic_rows <- filter(hmp_data, is.na(X) | is.na(Y) | is.na(Z))
  # Show only the first few offending rows.
  print(head(problematic_rows))
}

Convert X, Y, Z columns to numeric (handling NAs)

# Coerce each axis column to numeric, then replace any NA in a column with
# that column's mean (computed over its non-NA values).
hmp_data <- mutate(
  hmp_data,
  across(c(X, Y, Z), function(axis_values) {
    axis_values <- as.numeric(axis_values)
    replace(axis_values, is.na(axis_values), mean(axis_values, na.rm = TRUE))
  })
)

Normalizing the data (min-max scaling)

#' Min-max scale a numeric vector onto [0, 1].
#'
#' @param x Numeric vector. NA values are ignored when finding the minimum and
#'   maximum and remain NA in the result.
#' @return A numeric vector the same length as `x`. When the range of `x` is
#'   zero (all non-NA values equal) or cannot be computed (NaN, e.g. all
#'   values infinite), every element is NA_real_.
normalize <- function(x) {
  bounds <- range(x, na.rm = TRUE)
  span <- bounds[2] - bounds[1]

  # Exact comparison is intended: max - min is 0 only when they are equal.
  # is.na() also catches a NaN span, which would otherwise make `if` fail.
  if (is.na(span) || span == 0) {
    # NA_real_ keeps the result numeric; a bare NA would be logical.
    return(rep(NA_real_, length(x)))
  }

  (x - bounds[1]) / span
}

# Min-max scale the three axis columns; all other columns are carried over.
hmp_data_norm <- mutate(hmp_data, across(c(X, Y, Z), normalize))

2) Exploratory Data Analysis (EDA)

2.1. Summary Statistics

# Per-column summary of the un-normalized data.
hmp_data %>%
  summary()
##        X               Y               Z           label          
##  Min.   : 0.00   Min.   : 0.00   Min.   : 0.0   Length:479289     
##  1st Qu.:13.00   1st Qu.:35.00   1st Qu.:35.0   Class :character  
##  Median :25.00   Median :38.00   Median :42.0   Mode  :character  
##  Mean   :24.58   Mean   :38.18   Mean   :41.9                     
##  3rd Qu.:34.00   3rd Qu.:42.00   3rd Qu.:50.0                     
##  Max.   :63.00   Max.   :63.00   Max.   :63.0

2.2. Distributions of X, Y, Z values

# Build a 50-bin histogram of one axis column of hmp_data.
#   axis        - column name ("X", "Y" or "Z"), also used as the x-axis label
#   fill_colour - bar fill colour
#   plot_title  - plot title
axis_histogram <- function(axis, fill_colour, plot_title) {
  ggplot(hmp_data, aes(x = .data[[axis]])) +
    geom_histogram(bins = 50, fill = fill_colour, alpha = 0.7) +
    labs(title = plot_title, x = axis, y = "Count")
}

p1 <- axis_histogram("X", "blue", "Distribution of X values")
p2 <- axis_histogram("Y", "green", "Distribution of Y values")
p3 <- axis_histogram("Z", "red", "Distribution of Z values")

# Show the three histograms side by side.
grid.arrange(p1, p2, p3, ncol = 3)

# 2.3. Boxplots to check for outliers in X, Y, Z values

# All three boxplots draw from the same data, so share one base plot and map
# the axis column inside each boxplot layer.
boxplot_base <- ggplot(hmp_data)

boxplot_X <- boxplot_base +
  geom_boxplot(aes(y = X), fill = "blue") +
  labs(title = "Boxplot of X values", y = "X")

boxplot_Y <- boxplot_base +
  geom_boxplot(aes(y = Y), fill = "green") +
  labs(title = "Boxplot of Y values", y = "Y")

boxplot_Z <- boxplot_base +
  geom_boxplot(aes(y = Z), fill = "red") +
  labs(title = "Boxplot of Z values", y = "Z")

# Show the three boxplots side by side.
grid.arrange(boxplot_X, boxplot_Y, boxplot_Z, ncol = 3)

# 2.4. Correlation Analysis between X, Y, Z axes

# Pairwise correlations between the three axes, using only rows in which all
# three values are present.
axis_columns <- hmp_data[, c("X", "Y", "Z")]
cor_matrix <- cor(axis_columns, use = "complete.obs")
print(cor_matrix)
##            X          Y          Z
## X  1.0000000 -0.0278245  0.6131834
## Y -0.0278245  1.0000000 -0.1580859
## Z  0.6131834 -0.1580859  1.0000000

2.5. Heatmap of Correlation Matrix

# Reshape the correlation matrix to long form (Var1, Var2, value) so each
# cell becomes one tile, then colour tiles on a blue-white-red scale centred
# at zero.
cor_long <- melt(cor_matrix)

ggplot(cor_long, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0) +
  labs(title = "Correlation Heatmap", x = "Axis", y = "Axis", fill = "Correlation") +
  theme_minimal()

# 3) K-means Clustering
# WSS (within-cluster sum of squares) calculation to find the optimal number of clusters

# Total within-cluster sum of squares for k = 1..15 on the normalized axes.
# An entry is NA when k-means fails for that k.

# kmeans() picks random starting centres; seed the generator so the elbow
# curve is the same on every run.
set.seed(123)

# vapply() guarantees a numeric vector of length 15 whatever the outcomes.
wss <- vapply(1:15, function(k) {
  kmeans_result <- tryCatch(
    kmeans(hmp_data_norm[, c("X", "Y", "Z")], centers = k, nstart = 20),
    error = function(e) {
      # Keep the reason so a failed k can be diagnosed.
      warning(paste("K-means failed for k =", k, "-", conditionMessage(e)))
      NULL
    }
  )

  if (is.null(kmeans_result)) {
    NA_real_
  } else {
    kmeans_result$tot.withinss
  }
}, numeric(1))

Remove NA values in WSS for plotting

# Keep only the k values whose k-means run produced a WSS, and the matching
# WSS entries, so the two vectors stay aligned for plotting.
valid_indices <- !is.na(wss)
k_values <- seq_len(15)[valid_indices]
wss <- wss[valid_indices]

Plot the elbow curve to determine the optimal number of clusters

# Draw the elbow curve (WSS against k); warn instead when nothing is left to
# plot.
if (length(wss) == 0 || length(k_values) == 0) {
  warning("No valid WSS values for plotting. Check the data and k-means results.")
} else {
  plot(
    x = k_values,
    y = wss,
    type = "b",
    pch = 19,
    frame = FALSE,
    xlab = "Number of clusters (K)",
    ylab = "Total within-cluster sum of squares"
  )
}

# Perform K-means clustering with the optimal number of clusters (assuming k = 4 here)

# Number of clusters used for the final fit (chosen by hand, not computed).
optimal_k <- 4

# kmeans() picks random starting centres; seed the generator so the cluster
# assignments are the same on every run.
set.seed(123)
kmeans_result <- kmeans(hmp_data_norm[, c("X", "Y", "Z")], centers = optimal_k, nstart = 20)

Add cluster assignments to the data frame

# Store each row's k-means cluster id as a factor column.
hmp_data_norm[["cluster"]] <- as.factor(kmeans_result$cluster)

Plotting clusters in 2D space

# Scatter of normalized X against Y, coloured by k-means cluster.
cluster_plot <- ggplot(hmp_data_norm, aes(x = X, y = Y, color = cluster)) +
  geom_point() +
  labs(x = "X Axis", y = "Y Axis") +
  scale_color_discrete(name = "Cluster") +
  ggtitle("Clusters in 2D Space")

print(cluster_plot)

# Randomly sample 10% of the data; the distance matrix and hierarchical clustering below are computed on this sample only

# Draw a 10% random row sample (without replacement) of the normalized data.
set.seed(123)  # Fixed seed so the same rows are drawn on every run
total_rows <- nrow(hmp_data_norm)
sample_size <- floor(0.1 * total_rows)
sample_indices <- sample.int(total_rows, size = sample_size)
sampled_data <- hmp_data_norm[sample_indices, ]

Compute the distance matrix on the sampled data

# Pairwise Euclidean distances between the sampled rows, on X, Y and Z only.
# NOTE(review): dist() stores n * (n - 1) / 2 values; if the sample is 10% of
# the roughly 479k rows shown by summary(), that is on the order of 1e9
# doubles -- confirm this fits in memory.
dist_matrix <- dist(sampled_data[c("X", "Y", "Z")], method = "euclidean")

Perform hierarchical clustering on the sampled data

# Complete-linkage agglomerative clustering of the sampled rows, followed by
# its dendrogram.
hclust_result <- hclust(d = dist_matrix, method = "complete")
plot(x = hclust_result, main = "Dendrogram of Hierarchical Clustering (Sampled Data)")

# Cut the dendrogram to create clusters

# Cut the tree into optimal_k groups and store each sampled row's group as a
# factor column.
hclust_clusters <- cutree(tree = hclust_result, k = optimal_k)
sampled_data[["hclust_cluster"]] <- as.factor(hclust_clusters)

Plotting hierarchical clusters in 2D space

# Scatter of normalized X against Y for the sampled rows, coloured by
# hierarchical cluster.
hclust_plot <- ggplot(sampled_data, aes(x = X, y = Y, color = hclust_cluster)) +
  geom_point() +
  labs(x = "X Axis", y = "Y Axis") +
  scale_color_discrete(name = "Cluster") +
  ggtitle("Hierarchical Clustering in 2D Space (Sampled Data)")

print(hclust_plot)

# Set the "warn" option back to 0, R's default warning behavior.
# NOTE(review): this assumes the option was 0 before the script ran; the
# earlier value is not saved anywhere, so a different prior setting is lost.

options(warn = 0)