Final Project Code

library(tidyr) library(dplyr) library(ggplot2) library(tidyverse)

mental_res <- readr::read_csv("C:/Users/jamir/Downloads/mental_resilience_dataset (1).csv")

## Rows: 1800 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): Resilience_Level
## dbl (10): Stress_Management, Emotional_Regulation, Focus_Level, Adaptability...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

mh_life <- readr::read_csv("C:/Users/jamir/Downloads/Mental_Health_Lifestyle_Dataset (1).csv")

## Rows: 3000 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): Country, Gender, Exercise Level, Diet Type, Stress Level, Mental He...
## dbl (6): Age, Sleep Hours, Work Hours per Week, Screen Time per Day (Hours),...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

#-------Mental Health Resilience Dataset------------------------------------

#Cleaning, checking for Missing Values

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

dep_var <- select(mental_res, Stress_Management, Sleep_Quality, Training_Hours, Adaptability, Motivation_Score, Focus_Level, Coach_Feedback, Resilience_Level)
anyNA(dep_var)

## [1] FALSE

#there are no missing values in this dataset for the variables we are using

#Visualizations

library(ggplot2)


#Change Categorical variables to numeric
men_res <- mental_res %>%
  mutate(Resilience_score = case_when(
    Resilience_Level == 'Low' ~ 1, 
    Resilience_Level == 'Medium' ~ 2, 
    Resilience_Level == 'High' ~ 3))

#Linear Regression #Model 1: Training Hours, Coach Feedback, Focus Level and Adaptability on Motivation

#Visualization and Steps for Model 1 Regression Analysis and Plot 2 with Summary

library(dplyr)
library(tidyr)

## Warning: package 'tidyr' was built under R version 4.4.3

library(ggplot2)


set.seed(42) #ensuring my results can be reproduced starting at 42
N <- 1000 


#Step 1: Create a simulated dataset of 1,000 observations where Motivation_Score is calculated based on Training_Hours, Coach_Feedback, Focus_Level, and Adaptability.
mental_res <- data.frame(
  Training_Hours = sample(5:100, N, replace = TRUE),
  Coach_Feedback = rnorm(N, mean = 70, sd = 15),
  Focus_Level = runif(N, min = 0.5, max = 1.0),
  Adaptability = rnorm(N, mean = 50, sd = 10)
)

mental_res$Motivation_Score <- (
  0.3 * mental_res$Training_Hours +
  0.5 * mental_res$Coach_Feedback +
  50 * mental_res$Focus_Level +
  0.1 * mental_res$Adaptability +
  rnorm(N, 0, 10) # Add random noise
)


mental_res$Motivation_Score[mental_res$Motivation_Score < 0] <- 0 # Cut the score to a realistic range (0-100)
mental_res$Motivation_Score[mental_res$Motivation_Score > 100] <- 100

#Step 2: Fit the Model, and run a Multiple Linear Regression that tries to learn the true formula created in Step 1.

model1 <- lm(Motivation_Score ~ Training_Hours + Coach_Feedback + Focus_Level + Adaptability, data = mental_res)

#Step 3: Generate Predicted Values aka "Expected Values" using the fitted model to predict a Motivation Score for each row in our dataset.
mental_res$Expected_Value <- predict(model1, newdata = mental_res)

#Step 4: Sort the predicted scores (Expected_Value) from lowest to highest, then divide them into 20 equal-sized groups (5th percentiles).
NUM_BINS <- 20 

mental_res <- mental_res %>%
  mutate(Percentile_Group = ntile(Expected_Value, NUM_BINS)) %>%
    mutate(Percentile_Rank = Percentile_Group * (100 / NUM_BINS))


#Step 5: Group by the percentile bin and calculate the mean of the actual and predicted scores
summary_df <- mental_res %>%
  group_by(Percentile_Group) %>%
  summarize(
    Avg_Actual_Score = mean(Motivation_Score),
    Avg_Expected_Value = mean(Expected_Value),
    Percentile_Midpoint = mean(Percentile_Rank),
    .groups = 'drop') #ungroup the data

#Step 6: Reshape the data from wide to long format to make the plotting easier 
plot_data <- summary_df %>%
  select(Percentile_Midpoint, Avg_Actual_Score, Avg_Expected_Value) %>%
  #Simplify Avg_Actual_Score and Avg_Expected_Value into 'Score' and a 'Type' columns indicating whether it's Actual or Expected
  pivot_longer(
    cols = starts_with("Avg"), 
    names_to = "Score_Type", 
    values_to = "Score"
  )

#Visualization

Plot_2 <-plot_data %>%
  ggplot(aes(x = Percentile_Midpoint, y = Score, color = Score_Type, group = Score_Type)) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 3) +
  labs(
    title = "Model Performance: Average Motivation \nScore by Predicted Expected Value Percentile",
    x = "Percentile Rank of Expected Value (%)",
    y = "Average Score",
    color = "Score Type") +
  scale_color_manual(
    values = c("Avg_Actual_Score" = "#007acc", "Avg_Expected_Value" = "#cc4c00"),
    labels = c("Avg_Actual_Score" = "Average Actual Motivation Score", 
               "Avg_Expected_Value" = "Average Predicted Expected Value")) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
    legend.position = "bottom",
    legend.title = element_blank()) +
  scale_x_continuous(breaks = seq(0, 100, by = 100 / NUM_BINS))
ggsave("TH_CF_FL_AD_Motivation.png", plot = Plot_2 )

## Saving 7 x 5 in image

Plot_2

summary(model1)

## 
## Call:
## lm(formula = Motivation_Score ~ Training_Hours + Coach_Feedback + 
##     Focus_Level + Adaptability, data = mental_res)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25.6822  -4.9146   0.4549   5.5317  20.2792 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    26.111965   2.199074  11.874   <2e-16 ***
## Training_Hours  0.202416   0.009266  21.845   <2e-16 ***
## Coach_Feedback  0.321332   0.016868  19.050   <2e-16 ***
## Focus_Level    35.926410   1.743655  20.604   <2e-16 ***
## Adaptability    0.054948   0.024960   2.201   0.0279 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.035 on 995 degrees of freedom
## Multiple R-squared:  0.5616, Adjusted R-squared:  0.5598 
## F-statistic: 318.7 on 4 and 995 DF,  p-value: < 2.2e-16

#Model 2: Training Hours, Stress Management and Sleep Quality on Resilience (Resilience Level was converted into a numeric variable called Resilience Score where Low, Medium and High are now 1, 2 and 3)

#Visualization and Steps for Model 2 and Plot 3 with Summary

library(dplyr)
library(ggplot2)
library(tidyr)

set.seed(42)
N <- 500 

#Step 1: Generate the data
men_res <- data.frame(
  Training_Hours = runif(N, 10, 100),
  Stress_Management = runif(N, 0, 100), 
  Sleep_Quality = runif(N, 1, 10))      


resilience_factors <- (0.1 * men_res$Training_Hours + 0.3 * men_res$Stress_Management + 5 * men_res$Sleep_Quality) / 15
men_res$Resilience_Level <- cut(resilience_factors, 
                                breaks = c(-Inf, 3.5, 6.5, Inf), 
                                labels = c("Low", "Medium", "High"))

#Change Categorical variables to numeric
men_res <- men_res %>%
  mutate(Resilience_score = case_when(
    Resilience_Level == 'Low' ~ 1, 
    Resilience_Level == 'Medium' ~ 2, 
    Resilience_Level == 'High' ~ 3
  ))

#Step 2: Fit the Model
model2 <- lm(Resilience_score ~ Training_Hours + Stress_Management + Sleep_Quality, data = men_res)


#Compute means for the continuous predictors
mean_training <- mean(men_res$Training_Hours)
mean_stress <- mean(men_res$Stress_Management)
mean_sleep <- mean(men_res$Sleep_Quality)

#Define the ranges for each predictor
range_training <- seq(min(men_res$Training_Hours), max(men_res$Training_Hours), length.out = 100)
range_stress <- seq(min(men_res$Stress_Management), max(men_res$Stress_Management), length.out = 100)
range_sleep <- seq(min(men_res$Sleep_Quality), max(men_res$Sleep_Quality), length.out = 100)

#Step 3: Create three dataframes for prediction
# A. Vary Training_Hours, hold other variables at their mean
plot_training <- data.frame(
  Training_Hours = range_training,
  Stress_Management = mean_stress,
  Sleep_Quality = mean_sleep
)

# B. Vary Stress_Management, hold other variables at their mean
plot_stress <- data.frame(
  Training_Hours = mean_training,
  Stress_Management = range_stress,
  Sleep_Quality = mean_sleep
)

# C. Vary Sleep_Quality, hold other variables at their mean
plot_sleep <- data.frame(
  Training_Hours = mean_training,
  Stress_Management = mean_stress,
  Sleep_Quality = range_sleep
)

# Combine and predict all scenarios
plot_data <- bind_rows(
  mutate(plot_training, Predictor = "Training_Hours"),
  mutate(plot_stress, Predictor = "Stress_Management"),
  mutate(plot_sleep, Predictor = "Sleep_Quality")
)

#Step 4: Predictions from model2
plot_data$Predicted_Resilience <- predict(model2, newdata = plot_data)

#Step 5: Reshape data to long format 
plot_data_long <- plot_data %>%
  pivot_longer(
    cols = c(Training_Hours, Stress_Management, Sleep_Quality),
    names_to = "X_Variable",
    values_to = "X_Value"
  ) %>%
  #Filter out the constant variables, keeping only the one that varies for the X-axis
  filter(X_Variable == Predictor)

#Visualization
Plot_3 <- ggplot(plot_data_long, aes(x = X_Value, y = Predicted_Resilience)) +
  geom_line(color = "#0072B2", size = 1.2) +
  geom_smooth(method = "lm", se = TRUE, color = "#D55E00", linetype = "dashed", size = 0.5) +
  facet_wrap(~ Predictor, scales = "free_x") +
  scale_y_continuous(breaks = 1:3, limits = c(0.8, 3.2)) +
  labs(
    title = "Partial Effects of Predictors on \nPredicted Resilience Score (Model 2)",
    subtitle = "Other variables held constant at their mean.",
    x = "Predictor Value",
    y = "Predicted Resilience Score"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    strip.background = element_rect(fill = "#EFEFEF", color = "grey")
  )

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

ggsave("TH_SM_SQ.png", plot = Plot_3)

## Saving 7 x 5 in image
## `geom_smooth()` using formula = 'y ~ x'

Plot_3

## `geom_smooth()` using formula = 'y ~ x'

summary(model2)

## 
## Call:
## lm(formula = Resilience_score ~ Training_Hours + Stress_Management + 
##     Sleep_Quality, data = men_res)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.52363 -0.25127  0.00504  0.24724  0.53348 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.2298100  0.0469045   4.900 1.30e-06 ***
## Training_Hours    0.0023412  0.0005011   4.672 3.85e-06 ***
## Stress_Management 0.0065750  0.0004558  14.426  < 2e-16 ***
## Sleep_Quality     0.1270471  0.0050735  25.041  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2948 on 496 degrees of freedom
## Multiple R-squared:  0.6268, Adjusted R-squared:  0.6246 
## F-statistic: 277.7 on 3 and 496 DF,  p-value: < 2.2e-16

#——-Mental Health and Lifestyle Dataset————————————

#Cleaning-checking for NAs

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ lubridate 1.9.3     ✔ stringr   1.5.1
## ✔ purrr     1.0.2     ✔ tibble    3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

#Filter the original dataframe (mh_life) to create a new one (mh_life_female)
mh_life_female <- mh_life %>%
  filter(Gender == "Female")


library(dplyr)
mh_life

## # A tibble: 3,000 × 12
##    Country     Age Gender `Exercise Level` `Diet Type` `Sleep Hours`
##    <chr>     <dbl> <chr>  <chr>            <chr>               <dbl>
##  1 Brazil       48 Male   Low              Vegetarian            6.3
##  2 Australia    31 Male   Moderate         Vegan                 4.9
##  3 Japan        37 Female Low              Vegetarian            7.2
##  4 Brazil       35 Male   Low              Vegan                 7.2
##  5 Germany      46 Male   Low              Balanced              7.3
##  6 Japan        23 Other  Moderate         Balanced              2.7
##  7 Japan        49 Male   Moderate         Junk Food             6.6
##  8 Brazil       46 Other  Low              Vegetarian            6.3
##  9 India        60 Male   High             Vegetarian            4.7
## 10 Germany      19 Female Moderate         Vegan                 3.3
## # ℹ 2,990 more rows
## # ℹ 6 more variables: `Stress Level` <chr>, `Mental Health Condition` <chr>,
## #   `Work Hours per Week` <dbl>, `Screen Time per Day (Hours)` <dbl>,
## #   `Social Interaction Score` <dbl>, `Happiness Score` <dbl>

dep_var2 <- select(mh_life, `Gender`, `Exercise Level`, `Stress Level`, `Happiness Score`)
anyNA(dep_var2)

## [1] FALSE

#Changing categorial variables to numeric

library(dplyr)
dep_var2_scored <- dep_var2 %>%
  mutate(Exercise_score = case_when(
    `Exercise Level` == 'Low' ~ 1, 
    `Exercise Level` == 'Moderate' ~ 2, 
    `Exercise Level` == 'High' ~ 3
  )) %>% 
  mutate(Stress_score = case_when(
    `Stress Level` == 'Low' ~ 1, 
    `Stress Level` == 'Moderate' ~ 2, 
    `Stress Level` == 'High' ~ 3
  ))

#Visualization for Mental Health & Lifestyle Dataset #Exercise and Stress on Happiness

Plot_4 <- ggplot(dep_var2_scored, aes(x = factor(Stress_score), y = `Happiness Score`, fill = factor(Exercise_score))) +
  geom_boxplot(position = position_dodge(width = 0.8), width = 0.6) +
  labs(
    title = "Happiness Distribution by Stress and Exercise Levels", 
    x = "Stress Score (1=Low, 2=Moderate, 3=High)", 
    y = "Happiness Score",
    fill = "Exercise Score"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5)
  )
ggsave("Happy_Stress_Exercise.png", plot = Plot_4)

## Saving 7 x 5 in image

Plot_4

#Filter the original dataframe (dep_var2_scored) to create a new one (dep_var2_scored_female)
dep_var2_scored_female <- dep_var2_scored %>%
  filter(Gender == "Female")

Plot_5 <-ggplot(dep_var2_scored_female, aes(x = factor(Stress_score), y = `Happiness Score`, fill = factor(Exercise_score))) +
  geom_boxplot(position = position_dodge(width = 0.8), width = 0.6) +
  labs(
    title = "Women's Happiness Distribution \nby Stress and Exercise Levels", 
    x = "Stress Score (1=Low, 2=Moderate, 3=High)", 
    y = "Happiness Score",
    fill = "Exercise Score"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5)
  )
ggsave("Women_Happy_Stress_Exercise.png", plot = Plot_5)

## Saving 7 x 5 in image

Plot_5

#Bivariate analyses for Mental Health Lifestyle Dataset

#Sleep on Happiness–Pearson Correlation

#Calculate Pearson R Correlation
correlation_result <- cor.test(mh_life$`Sleep Hours`, mh_life$`Happiness Score`, method = "pearson")
print("Pearson Correlation Test:")

## [1] "Pearson Correlation Test:"

print(correlation_result)

## 
##  Pearson's product-moment correlation
## 
## data:  mh_life$`Sleep Hours` and mh_life$`Happiness Score`
## t = 0.95224, df = 2998, p-value = 0.3411
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.01840940  0.05314198
## sample estimates:
##        cor 
## 0.01738855

# Create Scatter Plot with Regression Line (ggplot2)
Plot_6 <-ggplot(mh_life, aes(x = `Sleep Hours`, y = `Happiness Score`)) +
  geom_point(alpha = 0.5, color = "#0072B2") + 
  geom_smooth(method = "lm", se = TRUE, color = "#D55E00") + # Add linear model (lm) regression line
  labs(
    title = "Relationship between Sleep Hours and Happiness Score",
    subtitle = paste("Pearson r =", round(correlation_result$estimate, 3), ", p-value =", format.pval(correlation_result$p.value, digits=3)),
    x = "Average Sleep Hours Recorded",
    y = "Perceived Happiness Score"
  ) +
  theme_minimal(base_size = 14) +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))

ggsave("Sleep_Happiness.png", plot = Plot_6)

## Saving 7 x 5 in image
## `geom_smooth()` using formula = 'y ~ x'

Plot_6

## `geom_smooth()` using formula = 'y ~ x'

#Happiness on Exercise

summary_stats <- mh_life %>%
  group_by(`Exercise Level`) %>%
  summarise(
    N = n(),
    Mean_Happiness = mean(`Happiness Score`, na.rm = TRUE),
    Median_Happiness = median(`Happiness Score`, na.rm = TRUE),
    SD_Happiness = sd(`Happiness Score`, na.rm = TRUE)
  ) %>%
  arrange(desc(Median_Happiness))

print("Summary Statistics for Happiness Score by Exercise Level:")

## [1] "Summary Statistics for Happiness Score by Exercise Level:"

print(summary_stats)

## # A tibble: 3 × 5
##   `Exercise Level`     N Mean_Happiness Median_Happiness SD_Happiness
##   <chr>            <int>          <dbl>            <dbl>        <dbl>
## 1 High               969           5.55              5.7         2.61
## 2 Moderate           998           5.36              5.4         2.54
## 3 Low               1033           5.29              5.2         2.51

Plot_7 <- ggplot(mh_life, aes(x = `Exercise Level`, y = `Happiness Score`, fill = `Exercise Level`)) +
  geom_boxplot(alpha = 0.7) +
  labs(
    title = "Distribution of Happiness Score by Exercise Level",
    x = "Exercise Level",
    y = "Happiness Score"
  ) +
  theme_minimal(base_size = 14) +
  theme(plot.title = element_text(face = "bold", hjust = 0.5)) +
  coord_flip() # Flip coordinates for better readability 

ggsave("Happiness_Exercise.png", plot = Plot_7)

## Saving 7 x 5 in image

Plot_7

#Womens Exercise and Happiness

summary_stats_w <- mh_life_female %>%
  group_by(`Exercise Level`) %>%
  summarise(
    N = n(),
    Mean_Happiness = mean(`Happiness Score`, na.rm = TRUE),
    Median_Happiness = median(`Happiness Score`, na.rm = TRUE),
    SD_Happiness = sd(`Happiness Score`, na.rm = TRUE)
  ) %>%
  arrange(desc(Median_Happiness))

print("Summary Statistics for Happiness Score by Exercise Level:")

## [1] "Summary Statistics for Happiness Score by Exercise Level:"

print(summary_stats_w)

## # A tibble: 3 × 5
##   `Exercise Level`     N Mean_Happiness Median_Happiness SD_Happiness
##   <chr>            <int>          <dbl>            <dbl>        <dbl>
## 1 Moderate           345           5.27              5.5         2.58
## 2 High               327           5.45              5.3         2.63
## 3 Low                352           5.16              5.2         2.61

Plot_8 <- ggplot(mh_life_female, aes(x = `Exercise Level`, y = `Happiness Score`, fill = `Exercise Level`)) +
  geom_boxplot(alpha = 0.7) +
  labs(
    title = "Women's Distribution of Happiness Score by Exercise Level",
    x = "Exercise Level",
    y = "Happiness Score"
  ) +
  theme_minimal(base_size = 14) +
  theme(plot.title = element_text(face = "bold", hjust = 0.5)) +
  coord_flip() # Flip coordinates for better readability  

ggsave("Women_Happiness_Exercise.png", plot = Plot_8)

## Saving 7 x 5 in image

Plot_8

#Happiness and Diet Score

summary_stats_d <- mh_life %>%
  group_by(`Diet Type`) %>%
  summarise(
    N = n(),
    Mean_Happiness = mean(`Happiness Score`, na.rm = TRUE),
    Median_Happiness = median(`Happiness Score`, na.rm = TRUE),
    SD_Happiness = sd(`Happiness Score`, na.rm = TRUE)
  ) %>%
  arrange(desc(Median_Happiness))

print("Summary Statistics for Happiness Score by Diet Type:")

## [1] "Summary Statistics for Happiness Score by Diet Type:"

print(summary_stats_d)

## # A tibble: 5 × 5
##   `Diet Type`     N Mean_Happiness Median_Happiness SD_Happiness
##   <chr>       <int>          <dbl>            <dbl>        <dbl>
## 1 Vegetarian    592           5.66              5.9         2.58
## 2 Junk Food     637           5.44              5.3         2.55
## 3 Keto          573           5.34              5.3         2.56
## 4 Balanced      625           5.25              5.2         2.51
## 5 Vegan         573           5.29              5.2         2.57

Plot_9 <- ggplot(mh_life, aes(x = `Diet Type`, y = `Happiness Score`, fill = `Diet Type`)) +
  geom_boxplot(alpha = 0.7) +
  labs(
    title = "Distribution of Happiness Score by Diet Type",
    x = "Diet Type",
    y = "Happiness Score"
  ) +
  theme_minimal(base_size = 14) +
  theme(plot.title = element_text(face = "bold", hjust = 0.5)) +
  coord_flip() # Flip coordinates for better readability

ggsave("Happiness_Diet.png", plot = Plot_9)

## Saving 7 x 5 in image

Plot_9

#Women’s diet type on happiness

summary_stats_wd <- mh_life_female %>%
  group_by(`Diet Type`) %>%
  summarise(
    N = n(),
    Mean_Happiness = mean(`Happiness Score`, na.rm = TRUE),
    Median_Happiness = median(`Happiness Score`, na.rm = TRUE),
    SD_Happiness = sd(`Happiness Score`, na.rm = TRUE)
  ) %>%
  arrange(desc(Median_Happiness))

print("Summary Statistics for Happiness Score by Diet Type:")

## [1] "Summary Statistics for Happiness Score by Diet Type:"

print(summary_stats_wd)

## # A tibble: 5 × 5
##   `Diet Type`     N Mean_Happiness Median_Happiness SD_Happiness
##   <chr>       <int>          <dbl>            <dbl>        <dbl>
## 1 Vegetarian    194           5.57             5.95         2.62
## 2 Keto          193           5.47             5.9          2.69
## 3 Junk Food     219           5.45             5.3          2.57
## 4 Balanced      230           4.98             5            2.59
## 5 Vegan         188           5.00             5            2.53

Plot_10 <- ggplot(mh_life_female, aes(x = `Diet Type`, y = `Happiness Score`, fill = `Diet Type`)) +
  geom_boxplot(alpha = 0.7) +
  labs(
    title = "Women's Distribution of Happiness \nScore by Diet Type",
    x = "Diet Type",
    y = "Happiness Score"
  ) +
  theme_minimal(base_size = 14) +
  theme(plot.title = element_text(face = "bold", hjust = 0.5)) +
  coord_flip() # Flip coordinates for better readability 

ggsave("Women_Happiness_Diet.png", plot = Plot_10)

## Saving 7 x 5 in image

Plot_10

#Enhanced Plot for Happiness vs. Diet, faceted by Exercise Level

#Ensure Exercise Level is a factor and ordered as (Low, Moderate, High)
mh_life_female <- mh_life %>%
  mutate(`Exercise Level` = factor(`Exercise Level`, levels = c("Low", "Moderate", "High")))


Plot_11 <-ggplot(mh_life_female, aes(x = `Diet Type`, y = `Happiness Score`, fill = `Diet Type`)) +
  geom_boxplot(alpha = 0.7) +
  facet_wrap(~ `Exercise Level`, ncol = 3) + 
  # This creates three separate plots (one for each Exercise Level)
  # ncol=3 make sure we have 3 columns
  labs(
    title = "Happiness Score by Diet Type, Conditioned on Exercise Level",
    subtitle = "Faceted analysis shows the diet-happiness relationship \n changes with exercise habits.",
    x = "Diet Type",
    y = "Happiness Score"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    axis.text.x = element_text(angle = 45, hjust = 1), # Angle x-labels for better fit
    legend.position = "none" # Remove redundant legend since Diet Type is on x-axis
  )

ggsave("Women_Happiness_Diet_Exercise.png", plot = Plot_11)

## Saving 7 x 5 in image

Plot_11

#Multiple Linear Regression for Happiness and Mental Health Lifestyle Dataset Variables

library(dplyr) # For data manipulation


#Ensure column names are easy to work with by cleaning spaces
mh_life_clean <- mh_life %>%
  rename_all(~gsub(" ", "_", .)) %>%
  rename_all(~gsub("\\(|\\)", "", .))


#Filter the dataset for the specific subgroup: Females with High Exercise Level
mh_subset <- mh_life_clean %>%
  filter(Gender == "Female", Exercise_Level == "High") %>%
  #Select the continuous response variable (Happiness_Score) and relevant predictors
  select(Happiness_Score, Diet_Type, Sleep_Hours, Stress_Level, Mental_Health_Condition,
         Work_Hours_per_Week, Screen_Time_per_Day_Hours, Social_Interaction_Score) %>%
  na.omit() # Remove any rows with missing values

print(paste("Analysis Subgroup Size (Females, High Exercise Level):", nrow(mh_subset)))

## [1] "Analysis Subgroup Size (Females, High Exercise Level): 327"

if (nrow(mh_subset) < 20) {
  stop("The subset size is too small to build a reliable model.")
}


#For better comparison of impact, we scale the numeric variables
#(z-score standardization) so a one-unit change in each represents one standard deviation.
numeric_vars <- c("Sleep_Hours", "Work_Hours_per_Week", "Screen_Time_per_Day_Hours", "Social_Interaction_Score")
mh_subset[numeric_vars] <- scale(mh_subset[numeric_vars])


#Fit the Model
model_linear <- lm(Happiness_Score ~ ., data = mh_subset)

print("--- Multiple Linear Regression Model Summary (Females, High Exercise) ---")

## [1] "--- Multiple Linear Regression Model Summary (Females, High Exercise) ---"

summary_model <- summary(model_linear)
print(summary_model)

## 
## Call:
## lm(formula = Happiness_Score ~ ., data = mh_subset)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.9910 -2.3018 -0.0053  2.4090  4.7482 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        5.44818    0.45989  11.847   <2e-16 ***
## Diet_TypeJunk Food                 0.32421    0.45312   0.715   0.4748    
## Diet_TypeKeto                      0.48217    0.46558   1.036   0.3012    
## Diet_TypeVegan                    -0.54609    0.45162  -1.209   0.2275    
## Diet_TypeVegetarian                0.16102    0.44935   0.358   0.7203    
## Sleep_Hours                       -0.29240    0.14960  -1.955   0.0515 .  
## Stress_LevelLow                   -0.05076    0.35725  -0.142   0.8871    
## Stress_LevelModerate              -0.23005    0.36640  -0.628   0.5306    
## Mental_Health_ConditionBipolar     0.01360    0.45035   0.030   0.9759    
## Mental_Health_ConditionDepression  0.25342    0.47677   0.532   0.5954    
## Mental_Health_ConditionNone       -0.08872    0.47274  -0.188   0.8513    
## Mental_Health_ConditionPTSD       -0.08188    0.45180  -0.181   0.8563    
## Work_Hours_per_Week                0.03495    0.14977   0.233   0.8156    
## Screen_Time_per_Day_Hours          0.19438    0.14875   1.307   0.1922    
## Social_Interaction_Score          -0.20343    0.14827  -1.372   0.1711    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.637 on 312 degrees of freedom
## Multiple R-squared:  0.0387, Adjusted R-squared:  -0.004439 
## F-statistic: 0.8971 on 14 and 312 DF,  p-value: 0.5624

#Determine Impact based on Coefficients and P-values

#Pull out the coefficient table
ctable <- as.data.frame(summary_model$coefficients)
names(ctable) <- c("Estimate", "Std.Error", "t_value", "P_Value")

#Filter out the Intercept and sort by P-Value to identify significant predictors
impact_summary <- ctable %>%
  filter(row.names(ctable) != "(Intercept)") %>%
  select(Estimate, P_Value) %>%
  arrange(P_Value)

print("--- Impact Summary: Estimated Coefficients & P-Values (Sorted by Significance) ---")

## [1] "--- Impact Summary: Estimated Coefficients & P-Values (Sorted by Significance) ---"

print(impact_summary)

##                                      Estimate    P_Value
## Sleep_Hours                       -0.29240429 0.05152443
## Social_Interaction_Score          -0.20342645 0.17105749
## Screen_Time_per_Day_Hours          0.19438420 0.19224629
## Diet_TypeVegan                    -0.54608734 0.22751463
## Diet_TypeKeto                      0.48216621 0.30118257
## Diet_TypeJunk Food                 0.32420696 0.47483784
## Stress_LevelModerate              -0.23005085 0.53055287
## Mental_Health_ConditionDepression  0.25341530 0.59543078
## Diet_TypeVegetarian                0.16101903 0.72033002
## Work_Hours_per_Week                0.03495172 0.81563091
## Mental_Health_ConditionNone       -0.08871525 0.85126332
## Mental_Health_ConditionPTSD       -0.08187800 0.85630644
## Stress_LevelLow                   -0.05076408 0.88709573
## Mental_Health_ConditionBipolar     0.01359944 0.97592884

#Visualization for MLR

library(ggplot2)
library(dplyr)
library(broom) #Helps extract model components
library(forcats) #For working with factor levels

#Re-run the model
model_linear <- lm(Happiness_Score ~ ., data = mh_subset)


plot_data <- tidy(model_linear, conf.int = TRUE) %>%
  filter(term != "(Intercept)") %>% # Exclude the Intercept
  #Create a factor for the terms, ordered by their estimate magnitude
  mutate(term = fct_reorder(term, estimate))

Plot_12 <- coefficient_plot <- ggplot(plot_data, aes(x = estimate, y = term)) +
  
  #Highlight the significance based on the confidence interval crossing zero
  #Color-coded to differentiate significant vs. non-significant terms
  geom_errorbarh(
    aes(xmin = conf.low, xmax = conf.high, color = as.factor(conf.low > 0 | conf.high < 0)),
    height = 0.2, 
    linewidth = 1.2
  ) +
  
  #Plot the point estimate (the coefficient)
  geom_point(aes(color = as.factor(conf.low > 0 | conf.high < 0)), size = 3) +
  
  #Add the vertical zero line (the reference point)
  geom_vline(xintercept = 0, linetype = "dashed", color = "grey50") +
  
  scale_color_manual(
    name = "Significance (95% CI)",
    values = c("TRUE" = "#0072B2", "FALSE" = "#D55E00"), # Blue for Significant, Orange for Non-Significant
    labels = c("TRUE" = "Significant", "FALSE" = "Non-Significant")
  ) +
  labs(
    title = "Impact of Scaled Predictors \non Happiness Score",
    subtitle = "Multiple Linear Regression \nfor Females with High Exercise Level \n(95% Confidence Intervals)",
    x = "Coefficient Estimate (Impact on Happiness Score)",
    y = "Predictor Variable"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    legend.position = "bottom"
  )

print(coefficient_plot)

ggsave("MLR_happiness_predictors.png", plot = Plot_12)

## Saving 7 x 5 in image

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

interactive_coef_plot <- ggplotly(coefficient_plot)

print(interactive_coef_plot)

Final Project Code

Jacinda Evans & Jamir Jones Gatling

2025-12-01