#RakshithVijayR_2023MDTS07ALA019_Submitted to Dr. K A Venkatesh_MachineLearning2_3rd SEM_AllianceUniversity_17 Sep 2024
# The Human Motion Primitives (HMP) Dataset contains accelerometer data that
# captures various human motion activities. Each data file records
# three-dimensional accelerometer readings (X, Y, and Z axes) for one specific
# activity, such as walking, climbing stairs, brushing teeth, or drinking from
# a glass. The dataset is often used for human activity recognition tasks and
# is valuable in motion analysis, health monitoring, and wearable device
# research.
library(dplyr) # Data manipulation
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2) # Data visualization
library(gridExtra) # For arranging plots
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(reshape2) # For reshaping data for correlation matrix
library(cluster) # Clustering and distance calculations
library(factoextra) # For enhanced silhouette plots
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Discard every warning raised from here on (warn = -1). This also hides the
# per-file warnings emitted by load_data(); the setting is reset to 0 at the
# end of the script.
options(warn = -1)
# Read one accelerometer recording into a labelled data frame.
#
# file: path to a space-separated text file without a header row.
#
# Returns a data frame holding the first three columns of the file as X, Y
# and Z plus a `label` column containing the file's base name. Returns NULL
# when the file cannot be read or has fewer than three columns; exactly one
# warning explains each NULL.
load_data <- function(file) {
  data <- tryCatch(
    read.table(file, header = FALSE, sep = " ", fill = TRUE, strip.white = TRUE),
    error = function(e) {
      # Keep the underlying cause so a skipped file can be diagnosed.
      warning(
        "Error in reading file: ", basename(file), " - skipping (",
        conditionMessage(e), ")",
        call. = FALSE
      )
      NULL
    }
  )

  # A failed read was already reported above; stop here so it is not reported
  # a second time as a column-count problem.
  if (is.null(data)) {
    return(NULL)
  }

  if (ncol(data) < 3) {
    warning(
      paste("Skipping file:", basename(file), "- unexpected number of columns:", ncol(data)),
      call. = FALSE
    )
    return(NULL)
  }

  data <- data[, 1:3] # Only take the first 3 columns
  colnames(data) <- c("X", "Y", "Z")
  data$label <- basename(file) # The label is the file name, not the folder
  data
}
# Activity folders to load; every .txt file inside a folder is one recording.
folders <- c(
  "Liedown_bed", "Use_telephone", "Eat_soup", "Eat_meat", "Descend_stairs",
  "Comb_hair", "Brush_teeth", "Walk_MODEL", "Walk", "Standup_chair_MODEL",
  "Standup_chair", "Sitdown_chair_MODEL", "Sitdown_chair", "Pour_water_MODEL",
  "Pour_water", "Getup_bed_MODEL", "Getup_bed", "Drink_glass_MODEL",
  "Drink_glass", "Climb_stairs_MODEL", "Climb_stairs"
)

# Root directory of the unpacked dataset. Set the HMP_DATA_DIR environment
# variable to run the script on another machine; the original location is the
# fallback so existing runs behave as before.
hmp_root <- Sys.getenv(
  "HMP_DATA_DIR",
  unset = "C:/Users/raksh/Downloads/HMP_Dataset"
)

# One combined data frame per activity folder, named after the folder.
data_list <- lapply(folders, function(folder) {
  folder_path <- file.path(hmp_root, folder)
  # `pattern` is a regular expression, not a glob: match a literal ".txt"
  # suffix only.
  file_list <- list.files(
    path = folder_path, pattern = "\\.txt$", full.names = TRUE
  )
  bind_rows(lapply(file_list, load_data))
})
names(data_list) <- folders

hmp_data <- bind_rows(data_list)

# Fail loudly when nothing was loaded (wrong root directory, missing folders)
# instead of letting later steps break on an empty data frame.
if (nrow(hmp_data) == 0) {
  stop(
    "No accelerometer data loaded from '", hmp_root,
    "'. Check that the directory exists and contains the activity folders.",
    call. = FALSE
  )
}
# Coerce the three axes to numeric BEFORE looking for missing values, so that
# entries which fail to parse (and therefore become NA) are reported together
# with values that were already missing.
hmp_numeric <- hmp_data %>%
  mutate(across(c(X, Y, Z), ~ as.numeric(.)))

na_rows <- is.na(hmp_numeric$X) | is.na(hmp_numeric$Y) | is.na(hmp_numeric$Z)
if (any(na_rows)) {
  print("Data contains NA values. Inspecting rows with NA values...")
  # Show the rows as they were read, so unparseable raw values stay visible.
  problematic_rows <- hmp_data[na_rows, ]
  print(head(problematic_rows))
}

# Replace each missing reading with the mean of its own axis.
hmp_data <- hmp_numeric %>%
  mutate(across(c(X, Y, Z), ~ replace(., is.na(.), mean(., na.rm = TRUE))))
# Min-max rescale a numeric vector to the interval [0, 1].
#
# x: numeric vector. NA entries are ignored when finding the range and remain
#    NA in the result.
#
# Returns a numeric vector of the same length as x. When x has no non-missing
# values, or all non-missing values are equal (zero range), every element of
# the result is NA_real_ so the return type is always numeric.
normalize <- function(x) {
  # Nothing to scale; also avoids calling range() on an empty set of values.
  if (all(is.na(x))) {
    return(rep(NA_real_, length(x)))
  }

  bounds <- range(x, na.rm = TRUE)
  span <- bounds[2] - bounds[1]
  if (span == 0) {
    return(rep(NA_real_, length(x))) # Avoid division by zero
  }

  (x - bounds[1]) / span
}
# Build a min-max scaled copy of the data (X, Y and Z passed through
# normalize()); hmp_data itself keeps the unscaled readings.
hmp_data_norm <- mutate(hmp_data, across(c(X, Y, Z), normalize))

# Summary statistics of the unscaled data.
summary(hmp_data)
## X Y Z label
## Min. : 0.00 Min. : 0.00 Min. : 0.0 Length:479289
## 1st Qu.:13.00 1st Qu.:35.00 1st Qu.:35.0 Class :character
## Median :25.00 Median :38.00 Median :42.0 Mode :character
## Mean :24.58 Mean :38.18 Mean :41.9
## 3rd Qu.:34.00 3rd Qu.:42.00 3rd Qu.:50.0
## Max. :63.00 Max. :63.00 Max. :63.0
# Histogram (50 bins) of one axis of hmp_data.
#   axis        - name of the column to plot, also used as the x-axis label
#   fill_colour - bar fill colour
#   plot_title  - title shown above the panel
axis_histogram <- function(axis, fill_colour, plot_title) {
  ggplot(hmp_data, aes(x = .data[[axis]])) +
    geom_histogram(bins = 50, fill = fill_colour, alpha = 0.7) +
    labs(title = plot_title, x = axis, y = "Count")
}

p1 <- axis_histogram("X", "blue", "Distribution of X values")
p2 <- axis_histogram("Y", "green", "Distribution of Y values")
p3 <- axis_histogram("Z", "red", "Distribution of Z values")

# Show the three distributions side by side.
grid.arrange(p1, p2, p3, ncol = 3)
# 2.3. Boxplots to check for outliers in X, Y, Z values
# Per-axis styling: fill colour and panel title.
boxplot_spec <- list(
  X = list(fill = "blue", title = "Boxplot of X values"),
  Y = list(fill = "green", title = "Boxplot of Y values"),
  Z = list(fill = "red", title = "Boxplot of Z values")
)

# One boxplot per axis, built from the spec above and keyed by axis name.
boxplot_panels <- lapply(names(boxplot_spec), function(axis) {
  spec <- boxplot_spec[[axis]]
  ggplot(hmp_data, aes(y = .data[[axis]])) +
    geom_boxplot(fill = spec$fill) +
    labs(title = spec$title, y = axis)
})
names(boxplot_panels) <- names(boxplot_spec)

boxplot_X <- boxplot_panels[["X"]]
boxplot_Y <- boxplot_panels[["Y"]]
boxplot_Z <- boxplot_panels[["Z"]]

# Show the three boxplots side by side.
grid.arrange(boxplot_X, boxplot_Y, boxplot_Z, ncol = 3)
# 2.4. Correlation Analysis between X, Y, Z axes
# Pairwise correlations, computed on rows with no missing axis value.
axis_columns <- c("X", "Y", "Z")
cor_matrix <- cor(hmp_data[, axis_columns], use = "complete.obs")
print(cor_matrix)
## X Y Z
## X 1.0000000 -0.0278245 0.6131834
## Y -0.0278245 1.0000000 -0.1580859
## Z 0.6131834 -0.1580859 1.0000000
# Reshape the matrix to long form (Var1, Var2, value): one row per cell, which
# is what geom_tile() needs. The colour scale is centred on zero.
cor_long <- melt(cor_matrix)
ggplot(cor_long, aes(Var1, Var2, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0) +
  labs(
    title = "Correlation Heatmap",
    x = "Axis",
    y = "Axis",
    fill = "Correlation"
  ) +
  theme_minimal()
## 3) K-means Clustering
# WSS (Within Sum of Squares) calculation to find the optimal number of
# clusters
k_candidates <- 1:15

# k-means starts from randomly chosen centres; fix the seed so the elbow
# curve is the same on every run.
set.seed(123)

# Total within-cluster sum of squares for each candidate k; NA when k-means
# fails for that k. vapply() guarantees a numeric vector of length 15.
wss <- vapply(k_candidates, function(k) {
  tryCatch(
    kmeans(
      hmp_data_norm[, c("X", "Y", "Z")],
      centers = k, nstart = 20
    )$tot.withinss,
    error = function(e) {
      warning(
        "K-means failed for k = ", k, ": ", conditionMessage(e),
        call. = FALSE
      )
      NA_real_
    }
  )
}, numeric(1))

# Keep only the k values for which clustering succeeded.
valid_indices <- !is.na(wss)
wss <- wss[valid_indices]
k_values <- k_candidates[valid_indices]

# Elbow plot: look for the k after which WSS stops dropping sharply.
if (length(wss) > 0) {
  plot(k_values, wss,
    type = "b", pch = 19, frame = FALSE,
    xlab = "Number of clusters (K)",
    ylab = "Total within-cluster sum of squares"
  )
} else {
  warning("No valid WSS values for plotting. Check the data and k-means results.")
}
# Perform K-means clustering with the optimal number of clusters
# (assuming k = 4 here)
optimal_k <- 4

# k-means starts from randomly chosen centres; fix the seed so cluster
# assignments are the same on every run.
set.seed(123)
kmeans_result <- kmeans(
  hmp_data_norm[, c("X", "Y", "Z")],
  centers = optimal_k, nstart = 20
)

# Store the assignment as a factor so ggplot uses a discrete colour scale.
hmp_data_norm$cluster <- as.factor(kmeans_result$cluster)

# Clusters projected onto the (scaled) X-Y plane; the Z axis is not shown.
ggplot(hmp_data_norm, aes(x = X, y = Y, color = cluster)) +
  geom_point() +
  ggtitle("Clusters in 2D Space") +
  scale_color_discrete(name = "Cluster") +
  labs(x = "X Axis", y = "Y Axis")
# Randomly sample 10% of the data for clustering
set.seed(123) # For reproducibility
sample_size <- floor(0.1 * nrow(hmp_data_norm))
sample_indices <- sample(seq_len(nrow(hmp_data_norm)), size = sample_size)
sampled_data <- hmp_data_norm[sample_indices, ]

# Compute the distance matrix
# dist() keeps one distance per pair of rows (n * (n - 1) / 2 values), so its
# memory use grows quadratically with the sample size.
# NOTE(review): with the ~479k rows shown in the summary output, a 10% sample
# is ~48k rows, i.e. roughly 1.1e9 distances - confirm this fits in memory or
# lower the sampling fraction.
dist_matrix <- dist(sampled_data[, c("X", "Y", "Z")])

# Complete-linkage agglomerative clustering on the sampled points.
hclust_result <- hclust(dist_matrix, method = "complete")
plot(hclust_result, main = "Dendrogram of Hierarchical Clustering (Sampled Data)")

# Cut the dendrogram to create clusters
hclust_clusters <- cutree(hclust_result, k = optimal_k)
sampled_data$hclust_cluster <- as.factor(hclust_clusters)

# Hierarchical clusters projected onto the (scaled) X-Y plane.
ggplot(sampled_data, aes(x = X, y = Y, color = hclust_cluster)) +
  geom_point() +
  ggtitle("Hierarchical Clustering in 2D Space (Sampled Data)") +
  scale_color_discrete(name = "Cluster") +
  labs(x = "X Axis", y = "Y Axis")

# Restore default warning behavior
options(warn = 0)