Dimensionality Reduction Analysis for RHMCD-20 Dataset

Introduction

This part of the project applies dimensionality reduction techniques to the RHMCD-20 dataset. The aim is to reduce the number of dimensions while retaining as much of the key information as possible, enabling clearer visualization and interpretation of the underlying patterns. Principal Component Analysis (PCA) is used to identify the variables that contribute most to the variation in the data, while t-distributed Stochastic Neighbor Embedding (t-SNE) is used to visualize local structure and uncover hidden relationships among variables such as stress, coping struggles, and work-related factors. This analysis lays the foundation for further insights into mental health and enhances our understanding of its complex dynamics.

Load Libraries and Dataset

# Set a CRAN mirror up front so install.packages() has a repository to use
# without prompting for one.
options(repos = c(CRAN = "https://cloud.r-project.org"))

# Install only those required packages that are not installed yet; packages
# that are already present are left untouched (nothing is reinstalled).
# Note: "dplyr" and "ggplot2" are also attached by "tidyverse" itself.
required_packages <- c("tidyverse", "Rtsne", "FactoMineR", "factoextra", "dplyr", "ggplot2")
new_packages <- required_packages[!(required_packages %in% installed.packages()[, "Package"])]
if (length(new_packages)) install.packages(new_packages, dependencies = TRUE)

# Load necessary libraries.
# library() returns the character vector of attached packages, and this
# lapply() result is not wrapped in invisible(), so one such vector is printed
# per entry of required_packages.
lapply(required_packages, library, character.only = TRUE)

## Warning: package 'tidyverse' was built under R version 4.4.2

## Warning: package 'ggplot2' was built under R version 4.4.2

## Warning: package 'readr' was built under R version 4.4.2

## Warning: package 'dplyr' was built under R version 4.4.2

## Warning: package 'forcats' was built under R version 4.4.2

## Warning: package 'lubridate' was built under R version 4.4.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

## Warning: package 'Rtsne' was built under R version 4.4.2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "Rtsne"     "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "FactoMineR" "Rtsne"      "lubridate"  "forcats"    "stringr"   
##  [6] "dplyr"      "purrr"      "readr"      "tidyr"      "tibble"    
## [11] "ggplot2"    "tidyverse"  "stats"      "graphics"   "grDevices" 
## [16] "utils"      "datasets"   "methods"    "base"      
## 
## [[4]]
##  [1] "factoextra" "FactoMineR" "Rtsne"      "lubridate"  "forcats"   
##  [6] "stringr"    "dplyr"      "purrr"      "readr"      "tidyr"     
## [11] "tibble"     "ggplot2"    "tidyverse"  "stats"      "graphics"  
## [16] "grDevices"  "utils"      "datasets"   "methods"    "base"      
## 
## [[5]]
##  [1] "factoextra" "FactoMineR" "Rtsne"      "lubridate"  "forcats"   
##  [6] "stringr"    "dplyr"      "purrr"      "readr"      "tidyr"     
## [11] "tibble"     "ggplot2"    "tidyverse"  "stats"      "graphics"  
## [16] "grDevices"  "utils"      "datasets"   "methods"    "base"      
## 
## [[6]]
##  [1] "factoextra" "FactoMineR" "Rtsne"      "lubridate"  "forcats"   
##  [6] "stringr"    "dplyr"      "purrr"      "readr"      "tidyr"     
## [11] "tibble"     "ggplot2"    "tidyverse"  "stats"      "graphics"  
## [16] "grDevices"  "utils"      "datasets"   "methods"    "base"

# Load the dataset into a data frame named `data`.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# script only runs where that exact file exists — consider a project-relative
# path (e.g. built with file.path()) before sharing.
data <- read.csv("C:/Users/MAGWALI/Downloads/mental_health_finaldata_1 (1).csv")

# Display the first few rows
head(data)

##        Age Gender Occupation       Days_Indoors Growing_Stress
## 1    20-25 Female  Corporate          1-14 days            Yes
## 2 30-Above   Male     Others         31-60 days            Yes
## 3 30-Above Female    Student   Go out Every day             No
## 4    25-30   Male     Others          1-14 days            Yes
## 5    16-20 Female    Student More than 2 months            Yes
## 6    25-30   Male  Housewife More than 2 months             No
##   Quarantine_Frustrations Changes_Habits Mental_Health_History Weight_Change
## 1                     Yes             No                   Yes           Yes
## 2                     Yes          Maybe                    No            No
## 3                      No            Yes                    No            No
## 4                      No          Maybe                    No         Maybe
## 5                     Yes            Yes                    No           Yes
## 6                     Yes            Yes                   Yes           Yes
##   Mood_Swings Coping_Struggles Work_Interest Social_Weakness
## 1      Medium               No            No             Yes
## 2        High               No            No             Yes
## 3      Medium              Yes         Maybe              No
## 4      Medium               No         Maybe             Yes
## 5      Medium              Yes         Maybe              No
## 6      Medium               No         Maybe           Maybe

# Check the structure and dimensions of the dataset
str(data)

## 'data.frame':    824 obs. of  13 variables:
##  $ Age                    : chr  "20-25" "30-Above" "30-Above" "25-30" ...
##  $ Gender                 : chr  "Female" "Male" "Female" "Male" ...
##  $ Occupation             : chr  "Corporate" "Others" "Student" "Others" ...
##  $ Days_Indoors           : chr  "1-14 days" "31-60 days" "Go out Every day" "1-14 days" ...
##  $ Growing_Stress         : chr  "Yes" "Yes" "No" "Yes" ...
##  $ Quarantine_Frustrations: chr  "Yes" "Yes" "No" "No" ...
##  $ Changes_Habits         : chr  "No" "Maybe" "Yes" "Maybe" ...
##  $ Mental_Health_History  : chr  "Yes" "No" "No" "No" ...
##  $ Weight_Change          : chr  "Yes" "No" "No" "Maybe" ...
##  $ Mood_Swings            : chr  "Medium" "High" "Medium" "Medium" ...
##  $ Coping_Struggles       : chr  "No" "No" "Yes" "No" ...
##  $ Work_Interest          : chr  "No" "No" "Maybe" "Maybe" ...
##  $ Social_Weakness        : chr  "Yes" "Yes" "No" "Yes" ...

dim(data)

## [1] 824  13

Data Preprocessing

Handle Missing Values

# Check for missing values
cat("Missing values per column:\n")

## Missing values per column:

print(colSums(is.na(data)))

##                     Age                  Gender              Occupation 
##                       0                       0                       0 
##            Days_Indoors          Growing_Stress Quarantine_Frustrations 
##                       0                       0                       0 
##          Changes_Habits   Mental_Health_History           Weight_Change 
##                       0                       0                       0 
##             Mood_Swings        Coping_Struggles           Work_Interest 
##                       0                       0                       0 
##         Social_Weakness 
##                       0

# Impute missing values (if any) with median for numeric and mode for categorical
# Return the most frequent non-missing value of `x`.
#
# Missing values are dropped before counting. Otherwise NA would compete as a
# candidate value, and a column whose most common entry is NA would be
# "imputed" with NA, i.e. not imputed at all.
#
# x: an atomic vector or factor.
# Returns a length-1 value of the same type as `x`; ties are resolved in
# favour of the value that appears first. If `x` has no observed values
# (empty or all NA) the result is NA.
impute_mode <- function(x) {
  observed <- x[!is.na(x)]
  ux <- unique(observed)
  # match() maps every observation to its position in `ux`, tabulate() counts
  # those positions, and which.max() picks the first position with the top count.
  ux[which.max(tabulate(match(observed, ux)))]
}

# Walk the columns one at a time and overwrite their NA entries in place:
# numeric columns receive the column median (computed without the NAs), every
# other column receives the value reported by impute_mode().
for (col in names(data)) {
  column_values <- data[[col]]
  fill_value <- if (is.numeric(column_values)) {
    median(column_values, na.rm = TRUE)
  } else {
    impute_mode(column_values)
  }
  column_values[is.na(column_values)] <- fill_value
  data[[col]] <- column_values
}

# Confirm no missing values remain
cat("Missing values after imputation:\n")

## Missing values after imputation:

print(colSums(is.na(data)))

##                     Age                  Gender              Occupation 
##                       0                       0                       0 
##            Days_Indoors          Growing_Stress Quarantine_Frustrations 
##                       0                       0                       0 
##          Changes_Habits   Mental_Health_History           Weight_Change 
##                       0                       0                       0 
##             Mood_Swings        Coping_Struggles           Work_Interest 
##                       0                       0                       0 
##         Social_Weakness 
##                       0

Encode Categorical Variables

# Convert categorical variables to factors: every character column of `data`
# becomes a factor; columns of other types are left unchanged.
data <- data %>% mutate(across(where(is.character), as.factor))

# Encode factors to numeric
cat("Encoding categorical variables as numeric.\n")

## Encoding categorical variables as numeric.

# Replace each factor by its integer level code (stored as numeric).
# NOTE(review): the levels come from as.factor() on character data, so the
# codes follow sorted label order rather than any ordinal meaning — in the
# output below Mood_Swings "High" is 1 while "Medium" is 3, and "No" is 2 while
# "Yes" is 3. Distance/variance-based methods will treat these codes as
# magnitudes; confirm this is intended, or set explicit level orders.
data_encoded <- data %>% mutate(across(where(is.factor), as.numeric))

# Check structure and ensure no rows are dropped
str(data_encoded)

## 'data.frame':    824 obs. of  13 variables:
##  $ Age                    : num  2 4 4 3 1 3 1 3 4 2 ...
##  $ Gender                 : num  1 2 1 2 1 2 1 1 2 2 ...
##  $ Occupation             : num  2 4 5 4 5 3 1 5 4 2 ...
##  $ Days_Indoors           : num  1 3 4 1 5 5 4 1 4 4 ...
##  $ Growing_Stress         : num  3 3 2 3 3 2 3 3 3 1 ...
##  $ Quarantine_Frustrations: num  3 3 2 2 3 3 3 2 3 1 ...
##  $ Changes_Habits         : num  2 1 3 1 3 3 1 1 3 3 ...
##  $ Mental_Health_History  : num  3 2 2 2 2 3 2 1 2 3 ...
##  $ Weight_Change          : num  3 2 2 1 3 3 3 1 3 3 ...
##  $ Mood_Swings            : num  3 1 3 3 3 3 2 1 3 2 ...
##  $ Coping_Struggles       : num  1 1 2 1 2 1 1 1 2 1 ...
##  $ Work_Interest          : num  2 2 1 1 1 1 1 2 1 1 ...
##  $ Social_Weakness        : num  3 3 2 3 2 1 1 3 1 2 ...

dim(data_encoded)

## [1] 824  13

Normalize Numeric Features

# Normalize numeric columns
# Min-max rescale a numeric vector to the interval [0, 1].
#
# x: a numeric vector.
# Returns a numeric vector of the same length as `x`:
#   * missing values are ignored when finding the range and stay NA in the
#     result (previously a single NA turned the whole result into NA);
#   * a constant vector is mapped to all zeros instead of NaN (0 / 0);
#   * an empty or all-NA vector is returned unchanged.
normalize <- function(x) {
  if (all(is.na(x))) {
    return(x)
  }
  rng <- range(x, na.rm = TRUE)
  span <- rng[2L] - rng[1L]
  if (isTRUE(span == 0)) {
    # Zero range: multiplying the centred values by 0 yields 0 for every
    # observed entry while keeping NA entries (and any names) intact.
    return((x - rng[1L]) * 0)
  }
  (x - rng[1L]) / span
}
# Apply normalize() to every numeric column of data_encoded (all 13 columns are
# numeric after encoding), leaving the row count unchanged.
# NOTE(review): the inputs are integer level codes, so this only rescales those
# codes to [0, 1]; confirm min-max scaling (rather than standardization) is
# what the downstream PCA / t-SNE steps expect.
data_normalized <- data_encoded %>% mutate(across(where(is.numeric), normalize))

# Check structure and dimensions after normalization
cat("Dataset dimensions after normalization:\n")

## Dataset dimensions after normalization:

dim(data_normalized)

## [1] 824  13

head(data_normalized)

##         Age Gender Occupation Days_Indoors Growing_Stress
## 1 0.3333333      0       0.25         0.00            1.0
## 2 1.0000000      1       0.75         0.50            1.0
## 3 1.0000000      0       1.00         0.75            0.5
## 4 0.6666667      1       0.75         0.00            1.0
## 5 0.0000000      0       1.00         1.00            1.0
## 6 0.6666667      1       0.50         1.00            0.5
##   Quarantine_Frustrations Changes_Habits Mental_Health_History Weight_Change
## 1                     1.0            0.5                   1.0           1.0
## 2                     1.0            0.0                   0.5           0.5
## 3                     0.5            1.0                   0.5           0.5
## 4                     0.5            0.0                   0.5           0.0
## 5                     1.0            1.0                   0.5           1.0
## 6                     1.0            1.0                   1.0           1.0
##   Mood_Swings Coping_Struggles Work_Interest Social_Weakness
## 1           1                0           0.5             1.0
## 2           0                0           0.5             1.0
## 3           1                1           0.0             0.5
## 4           1                0           0.0             1.0
## 5           1                1           0.0             0.5
## 6           1                0           0.0             0.0