library(tidyr) library(dplyr) library(ggplot2) library(tidyverse)
mental_res <- readr::read_csv("C:/Users/jamir/Downloads/mental_resilience_dataset (1).csv")
## Rows: 1800 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Resilience_Level
## dbl (10): Stress_Management, Emotional_Regulation, Focus_Level, Adaptability...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
mh_life <- readr::read_csv("C:/Users/jamir/Downloads/Mental_Health_Lifestyle_Dataset (1).csv")
## Rows: 3000 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): Country, Gender, Exercise Level, Diet Type, Stress Level, Mental He...
## dbl (6): Age, Sleep Hours, Work Hours per Week, Screen Time per Day (Hours),...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#-------Mental Health Resilience Dataset------------------------------------
#Cleaning, checking for Missing Values
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
dep_var <- select(mental_res, Stress_Management, Sleep_Quality, Training_Hours, Adaptability, Motivation_Score, Focus_Level, Coach_Feedback, Resilience_Level)
anyNA(dep_var)
## [1] FALSE
#there are no missing values in this dataset for the variables we are using
#Visualizations
library(ggplot2)
#Change Categorical variables to numeric
men_res <- mental_res %>%
mutate(Resilience_score = case_when(
Resilience_Level == 'Low' ~ 1,
Resilience_Level == 'Medium' ~ 2,
Resilience_Level == 'High' ~ 3))
#Linear Regression #Model 1: Training Hours, Coach Feedback, Focus Level and Adaptability on Motivation
#Visualization and Steps for Model 1 Regression Analysis and Plot 2 with Summary
library(dplyr)
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.4.3
library(ggplot2)
set.seed(42) #ensuring my results can be reproduced starting at 42
N <- 1000
#Step 1: Create a simulated dataset of 1,000 observations where Motivation_Score is calculated based on Training_Hours, Coach_Feedback, Focus_Level, and Adaptability.
mental_res <- data.frame(
Training_Hours = sample(5:100, N, replace = TRUE),
Coach_Feedback = rnorm(N, mean = 70, sd = 15),
Focus_Level = runif(N, min = 0.5, max = 1.0),
Adaptability = rnorm(N, mean = 50, sd = 10)
)
mental_res$Motivation_Score <- (
0.3 * mental_res$Training_Hours +
0.5 * mental_res$Coach_Feedback +
50 * mental_res$Focus_Level +
0.1 * mental_res$Adaptability +
rnorm(N, 0, 10) # Add random noise
)
mental_res$Motivation_Score[mental_res$Motivation_Score < 0] <- 0 # Cut the score to a realistic range (0-100)
mental_res$Motivation_Score[mental_res$Motivation_Score > 100] <- 100
#Step 2: Fit the Model, and run a Multiple Linear Regression that tries to learn the true formula created in Step 1.
model1 <- lm(Motivation_Score ~ Training_Hours + Coach_Feedback + Focus_Level + Adaptability, data = mental_res)
#Step 3: Generate Predicted Values aka "Expected Values" using the fitted model to predict a Motivation Score for each row in our dataset.
mental_res$Expected_Value <- predict(model1, newdata = mental_res)
#Step 4: Sort the predicted scores (Expected_Value) from lowest to highest, then divide them into 20 equal-sized groups (5th percentiles).
NUM_BINS <- 20
mental_res <- mental_res %>%
mutate(Percentile_Group = ntile(Expected_Value, NUM_BINS)) %>%
mutate(Percentile_Rank = Percentile_Group * (100 / NUM_BINS))
#Step 5: Group by the percentile bin and calculate the mean of the actual and predicted scores
summary_df <- mental_res %>%
group_by(Percentile_Group) %>%
summarize(
Avg_Actual_Score = mean(Motivation_Score),
Avg_Expected_Value = mean(Expected_Value),
Percentile_Midpoint = mean(Percentile_Rank),
.groups = 'drop') #ungroup the data
#Step 6: Reshape the data from wide to long format to make the plotting easier
plot_data <- summary_df %>%
select(Percentile_Midpoint, Avg_Actual_Score, Avg_Expected_Value) %>%
#Simplify Avg_Actual_Score and Avg_Expected_Value into 'Score' and a 'Type' columns indicating whether it's Actual or Expected
pivot_longer(
cols = starts_with("Avg"),
names_to = "Score_Type",
values_to = "Score"
)
#Visualization
Plot_2 <-plot_data %>%
ggplot(aes(x = Percentile_Midpoint, y = Score, color = Score_Type, group = Score_Type)) +
geom_line(linewidth = 1.2) +
geom_point(size = 3) +
labs(
title = "Model Performance: Average Motivation \nScore by Predicted Expected Value Percentile",
x = "Percentile Rank of Expected Value (%)",
y = "Average Score",
color = "Score Type") +
scale_color_manual(
values = c("Avg_Actual_Score" = "#007acc", "Avg_Expected_Value" = "#cc4c00"),
labels = c("Avg_Actual_Score" = "Average Actual Motivation Score",
"Avg_Expected_Value" = "Average Predicted Expected Value")) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
legend.position = "bottom",
legend.title = element_blank()) +
scale_x_continuous(breaks = seq(0, 100, by = 100 / NUM_BINS))
ggsave("TH_CF_FL_AD_Motivation.png", plot = Plot_2 )
## Saving 7 x 5 in image
Plot_2
summary(model1)
##
## Call:
## lm(formula = Motivation_Score ~ Training_Hours + Coach_Feedback +
## Focus_Level + Adaptability, data = mental_res)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.6822 -4.9146 0.4549 5.5317 20.2792
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 26.111965 2.199074 11.874 <2e-16 ***
## Training_Hours 0.202416 0.009266 21.845 <2e-16 ***
## Coach_Feedback 0.321332 0.016868 19.050 <2e-16 ***
## Focus_Level 35.926410 1.743655 20.604 <2e-16 ***
## Adaptability 0.054948 0.024960 2.201 0.0279 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.035 on 995 degrees of freedom
## Multiple R-squared: 0.5616, Adjusted R-squared: 0.5598
## F-statistic: 318.7 on 4 and 995 DF, p-value: < 2.2e-16
#Model 2: Training Hours, Stress Management and Sleep Quality on Resilience (Resilience Level was converted into a numeric variable called Resilience Score where Low, Medium and High are now 1, 2 and 3)
#Visualization and Steps for Model 2 and Plot 3 with Summary
library(dplyr)
library(ggplot2)
library(tidyr)
set.seed(42)
N <- 500
#Step 1: Generate the data
men_res <- data.frame(
Training_Hours = runif(N, 10, 100),
Stress_Management = runif(N, 0, 100),
Sleep_Quality = runif(N, 1, 10))
resilience_factors <- (0.1 * men_res$Training_Hours + 0.3 * men_res$Stress_Management + 5 * men_res$Sleep_Quality) / 15
men_res$Resilience_Level <- cut(resilience_factors,
breaks = c(-Inf, 3.5, 6.5, Inf),
labels = c("Low", "Medium", "High"))
#Change Categorical variables to numeric
men_res <- men_res %>%
mutate(Resilience_score = case_when(
Resilience_Level == 'Low' ~ 1,
Resilience_Level == 'Medium' ~ 2,
Resilience_Level == 'High' ~ 3
))
#Step 2: Fit the Model
model2 <- lm(Resilience_score ~ Training_Hours + Stress_Management + Sleep_Quality, data = men_res)
#Compute means for the continuous predictors
mean_training <- mean(men_res$Training_Hours)
mean_stress <- mean(men_res$Stress_Management)
mean_sleep <- mean(men_res$Sleep_Quality)
#Define the ranges for each predictor
range_training <- seq(min(men_res$Training_Hours), max(men_res$Training_Hours), length.out = 100)
range_stress <- seq(min(men_res$Stress_Management), max(men_res$Stress_Management), length.out = 100)
range_sleep <- seq(min(men_res$Sleep_Quality), max(men_res$Sleep_Quality), length.out = 100)
#Step 3: Create three dataframes for prediction
# A. Vary Training_Hours, hold other variables at their mean
plot_training <- data.frame(
Training_Hours = range_training,
Stress_Management = mean_stress,
Sleep_Quality = mean_sleep
)
# B. Vary Stress_Management, hold other variables at their mean
plot_stress <- data.frame(
Training_Hours = mean_training,
Stress_Management = range_stress,
Sleep_Quality = mean_sleep
)
# C. Vary Sleep_Quality, hold other variables at their mean
plot_sleep <- data.frame(
Training_Hours = mean_training,
Stress_Management = mean_stress,
Sleep_Quality = range_sleep
)
# Combine and predict all scenarios
plot_data <- bind_rows(
mutate(plot_training, Predictor = "Training_Hours"),
mutate(plot_stress, Predictor = "Stress_Management"),
mutate(plot_sleep, Predictor = "Sleep_Quality")
)
#Step 4: Predictions from model2
plot_data$Predicted_Resilience <- predict(model2, newdata = plot_data)
#Step 5: Reshape data to long format
plot_data_long <- plot_data %>%
pivot_longer(
cols = c(Training_Hours, Stress_Management, Sleep_Quality),
names_to = "X_Variable",
values_to = "X_Value"
) %>%
#Filter out the constant variables, keeping only the one that varies for the X-axis
filter(X_Variable == Predictor)
#Visualization
Plot_3 <- ggplot(plot_data_long, aes(x = X_Value, y = Predicted_Resilience)) +
geom_line(color = "#0072B2", size = 1.2) +
geom_smooth(method = "lm", se = TRUE, color = "#D55E00", linetype = "dashed", size = 0.5) +
facet_wrap(~ Predictor, scales = "free_x") +
scale_y_continuous(breaks = 1:3, limits = c(0.8, 3.2)) +
labs(
title = "Partial Effects of Predictors on \nPredicted Resilience Score (Model 2)",
subtitle = "Other variables held constant at their mean.",
x = "Predictor Value",
y = "Predicted Resilience Score"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(face = "bold", hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
strip.background = element_rect(fill = "#EFEFEF", color = "grey")
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ggsave("TH_SM_SQ.png", plot = Plot_3)
## Saving 7 x 5 in image
## `geom_smooth()` using formula = 'y ~ x'
Plot_3
## `geom_smooth()` using formula = 'y ~ x'
summary(model2)
##
## Call:
## lm(formula = Resilience_score ~ Training_Hours + Stress_Management +
## Sleep_Quality, data = men_res)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.52363 -0.25127 0.00504 0.24724 0.53348
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.2298100 0.0469045 4.900 1.30e-06 ***
## Training_Hours 0.0023412 0.0005011 4.672 3.85e-06 ***
## Stress_Management 0.0065750 0.0004558 14.426 < 2e-16 ***
## Sleep_Quality 0.1270471 0.0050735 25.041 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2948 on 496 degrees of freedom
## Multiple R-squared: 0.6268, Adjusted R-squared: 0.6246
## F-statistic: 277.7 on 3 and 496 DF, p-value: < 2.2e-16
#——-Mental Health and Lifestyle Dataset————————————
#Cleaning-checking for NAs
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ lubridate 1.9.3 ✔ stringr 1.5.1
## ✔ purrr 1.0.2 ✔ tibble 3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#Filter the original dataframe (mh_life) to create a new one (mh_life_female)
mh_life_female <- mh_life %>%
filter(Gender == "Female")
library(dplyr)
mh_life
## # A tibble: 3,000 × 12
## Country Age Gender `Exercise Level` `Diet Type` `Sleep Hours`
## <chr> <dbl> <chr> <chr> <chr> <dbl>
## 1 Brazil 48 Male Low Vegetarian 6.3
## 2 Australia 31 Male Moderate Vegan 4.9
## 3 Japan 37 Female Low Vegetarian 7.2
## 4 Brazil 35 Male Low Vegan 7.2
## 5 Germany 46 Male Low Balanced 7.3
## 6 Japan 23 Other Moderate Balanced 2.7
## 7 Japan 49 Male Moderate Junk Food 6.6
## 8 Brazil 46 Other Low Vegetarian 6.3
## 9 India 60 Male High Vegetarian 4.7
## 10 Germany 19 Female Moderate Vegan 3.3
## # ℹ 2,990 more rows
## # ℹ 6 more variables: `Stress Level` <chr>, `Mental Health Condition` <chr>,
## # `Work Hours per Week` <dbl>, `Screen Time per Day (Hours)` <dbl>,
## # `Social Interaction Score` <dbl>, `Happiness Score` <dbl>
dep_var2 <- select(mh_life, `Gender`, `Exercise Level`, `Stress Level`, `Happiness Score`)
anyNA(dep_var2)
## [1] FALSE
#Changing categorial variables to numeric
library(dplyr)
dep_var2_scored <- dep_var2 %>%
mutate(Exercise_score = case_when(
`Exercise Level` == 'Low' ~ 1,
`Exercise Level` == 'Moderate' ~ 2,
`Exercise Level` == 'High' ~ 3
)) %>%
mutate(Stress_score = case_when(
`Stress Level` == 'Low' ~ 1,
`Stress Level` == 'Moderate' ~ 2,
`Stress Level` == 'High' ~ 3
))
#Visualization for Mental Health & Lifestyle Dataset #Exercise and Stress on Happiness
Plot_4 <- ggplot(dep_var2_scored, aes(x = factor(Stress_score), y = `Happiness Score`, fill = factor(Exercise_score))) +
geom_boxplot(position = position_dodge(width = 0.8), width = 0.6) +
labs(
title = "Happiness Distribution by Stress and Exercise Levels",
x = "Stress Score (1=Low, 2=Moderate, 3=High)",
y = "Happiness Score",
fill = "Exercise Score"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(face = "bold", hjust = 0.5)
)
ggsave("Happy_Stress_Exercise.png", plot = Plot_4)
## Saving 7 x 5 in image
Plot_4
#Filter the original dataframe (dep_var2_scored) to create a new one (dep_var2_scored_female)
dep_var2_scored_female <- dep_var2_scored %>%
filter(Gender == "Female")
Plot_5 <-ggplot(dep_var2_scored_female, aes(x = factor(Stress_score), y = `Happiness Score`, fill = factor(Exercise_score))) +
geom_boxplot(position = position_dodge(width = 0.8), width = 0.6) +
labs(
title = "Women's Happiness Distribution \nby Stress and Exercise Levels",
x = "Stress Score (1=Low, 2=Moderate, 3=High)",
y = "Happiness Score",
fill = "Exercise Score"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(face = "bold", hjust = 0.5)
)
ggsave("Women_Happy_Stress_Exercise.png", plot = Plot_5)
## Saving 7 x 5 in image
Plot_5
#Bivariate analyses for Mental Health Lifestyle Dataset
#Sleep on Happiness–Pearson Correlation
#Calculate Pearson R Correlation
correlation_result <- cor.test(mh_life$`Sleep Hours`, mh_life$`Happiness Score`, method = "pearson")
print("Pearson Correlation Test:")
## [1] "Pearson Correlation Test:"
print(correlation_result)
##
## Pearson's product-moment correlation
##
## data: mh_life$`Sleep Hours` and mh_life$`Happiness Score`
## t = 0.95224, df = 2998, p-value = 0.3411
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.01840940 0.05314198
## sample estimates:
## cor
## 0.01738855
# Create Scatter Plot with Regression Line (ggplot2)
Plot_6 <-ggplot(mh_life, aes(x = `Sleep Hours`, y = `Happiness Score`)) +
geom_point(alpha = 0.5, color = "#0072B2") +
geom_smooth(method = "lm", se = TRUE, color = "#D55E00") + # Add linear model (lm) regression line
labs(
title = "Relationship between Sleep Hours and Happiness Score",
subtitle = paste("Pearson r =", round(correlation_result$estimate, 3), ", p-value =", format.pval(correlation_result$p.value, digits=3)),
x = "Average Sleep Hours Recorded",
y = "Perceived Happiness Score"
) +
theme_minimal(base_size = 14) +
theme(plot.title = element_text(face = "bold", hjust = 0.5))
ggsave("Sleep_Happiness.png", plot = Plot_6)
## Saving 7 x 5 in image
## `geom_smooth()` using formula = 'y ~ x'
Plot_6
## `geom_smooth()` using formula = 'y ~ x'
#Happiness on Exercise
summary_stats <- mh_life %>%
group_by(`Exercise Level`) %>%
summarise(
N = n(),
Mean_Happiness = mean(`Happiness Score`, na.rm = TRUE),
Median_Happiness = median(`Happiness Score`, na.rm = TRUE),
SD_Happiness = sd(`Happiness Score`, na.rm = TRUE)
) %>%
arrange(desc(Median_Happiness))
print("Summary Statistics for Happiness Score by Exercise Level:")
## [1] "Summary Statistics for Happiness Score by Exercise Level:"
print(summary_stats)
## # A tibble: 3 × 5
## `Exercise Level` N Mean_Happiness Median_Happiness SD_Happiness
## <chr> <int> <dbl> <dbl> <dbl>
## 1 High 969 5.55 5.7 2.61
## 2 Moderate 998 5.36 5.4 2.54
## 3 Low 1033 5.29 5.2 2.51
Plot_7 <- ggplot(mh_life, aes(x = `Exercise Level`, y = `Happiness Score`, fill = `Exercise Level`)) +
geom_boxplot(alpha = 0.7) +
labs(
title = "Distribution of Happiness Score by Exercise Level",
x = "Exercise Level",
y = "Happiness Score"
) +
theme_minimal(base_size = 14) +
theme(plot.title = element_text(face = "bold", hjust = 0.5)) +
coord_flip() # Flip coordinates for better readability
ggsave("Happiness_Exercise.png", plot = Plot_7)
## Saving 7 x 5 in image
Plot_7
#Womens Exercise and Happiness
summary_stats_w <- mh_life_female %>%
group_by(`Exercise Level`) %>%
summarise(
N = n(),
Mean_Happiness = mean(`Happiness Score`, na.rm = TRUE),
Median_Happiness = median(`Happiness Score`, na.rm = TRUE),
SD_Happiness = sd(`Happiness Score`, na.rm = TRUE)
) %>%
arrange(desc(Median_Happiness))
print("Summary Statistics for Happiness Score by Exercise Level:")
## [1] "Summary Statistics for Happiness Score by Exercise Level:"
print(summary_stats_w)
## # A tibble: 3 × 5
## `Exercise Level` N Mean_Happiness Median_Happiness SD_Happiness
## <chr> <int> <dbl> <dbl> <dbl>
## 1 Moderate 345 5.27 5.5 2.58
## 2 High 327 5.45 5.3 2.63
## 3 Low 352 5.16 5.2 2.61
Plot_8 <- ggplot(mh_life_female, aes(x = `Exercise Level`, y = `Happiness Score`, fill = `Exercise Level`)) +
geom_boxplot(alpha = 0.7) +
labs(
title = "Women's Distribution of Happiness Score by Exercise Level",
x = "Exercise Level",
y = "Happiness Score"
) +
theme_minimal(base_size = 14) +
theme(plot.title = element_text(face = "bold", hjust = 0.5)) +
coord_flip() # Flip coordinates for better readability
ggsave("Women_Happiness_Exercise.png", plot = Plot_8)
## Saving 7 x 5 in image
Plot_8
#Happiness and Diet Score
summary_stats_d <- mh_life %>%
group_by(`Diet Type`) %>%
summarise(
N = n(),
Mean_Happiness = mean(`Happiness Score`, na.rm = TRUE),
Median_Happiness = median(`Happiness Score`, na.rm = TRUE),
SD_Happiness = sd(`Happiness Score`, na.rm = TRUE)
) %>%
arrange(desc(Median_Happiness))
print("Summary Statistics for Happiness Score by Diet Type:")
## [1] "Summary Statistics for Happiness Score by Diet Type:"
print(summary_stats_d)
## # A tibble: 5 × 5
## `Diet Type` N Mean_Happiness Median_Happiness SD_Happiness
## <chr> <int> <dbl> <dbl> <dbl>
## 1 Vegetarian 592 5.66 5.9 2.58
## 2 Junk Food 637 5.44 5.3 2.55
## 3 Keto 573 5.34 5.3 2.56
## 4 Balanced 625 5.25 5.2 2.51
## 5 Vegan 573 5.29 5.2 2.57
Plot_9 <- ggplot(mh_life, aes(x = `Diet Type`, y = `Happiness Score`, fill = `Diet Type`)) +
geom_boxplot(alpha = 0.7) +
labs(
title = "Distribution of Happiness Score by Diet Type",
x = "Diet Type",
y = "Happiness Score"
) +
theme_minimal(base_size = 14) +
theme(plot.title = element_text(face = "bold", hjust = 0.5)) +
coord_flip() # Flip coordinates for better readability
ggsave("Happiness_Diet.png", plot = Plot_9)
## Saving 7 x 5 in image
Plot_9
#Women’s diet type on happiness
summary_stats_wd <- mh_life_female %>%
group_by(`Diet Type`) %>%
summarise(
N = n(),
Mean_Happiness = mean(`Happiness Score`, na.rm = TRUE),
Median_Happiness = median(`Happiness Score`, na.rm = TRUE),
SD_Happiness = sd(`Happiness Score`, na.rm = TRUE)
) %>%
arrange(desc(Median_Happiness))
print("Summary Statistics for Happiness Score by Diet Type:")
## [1] "Summary Statistics for Happiness Score by Diet Type:"
print(summary_stats_wd)
## # A tibble: 5 × 5
## `Diet Type` N Mean_Happiness Median_Happiness SD_Happiness
## <chr> <int> <dbl> <dbl> <dbl>
## 1 Vegetarian 194 5.57 5.95 2.62
## 2 Keto 193 5.47 5.9 2.69
## 3 Junk Food 219 5.45 5.3 2.57
## 4 Balanced 230 4.98 5 2.59
## 5 Vegan 188 5.00 5 2.53
Plot_10 <- ggplot(mh_life_female, aes(x = `Diet Type`, y = `Happiness Score`, fill = `Diet Type`)) +
geom_boxplot(alpha = 0.7) +
labs(
title = "Women's Distribution of Happiness \nScore by Diet Type",
x = "Diet Type",
y = "Happiness Score"
) +
theme_minimal(base_size = 14) +
theme(plot.title = element_text(face = "bold", hjust = 0.5)) +
coord_flip() # Flip coordinates for better readability
ggsave("Women_Happiness_Diet.png", plot = Plot_10)
## Saving 7 x 5 in image
Plot_10
#Enhanced Plot for Happiness vs. Diet, faceted by Exercise Level
#Ensure Exercise Level is a factor and ordered as (Low, Moderate, High)
mh_life_female <- mh_life %>%
mutate(`Exercise Level` = factor(`Exercise Level`, levels = c("Low", "Moderate", "High")))
Plot_11 <-ggplot(mh_life_female, aes(x = `Diet Type`, y = `Happiness Score`, fill = `Diet Type`)) +
geom_boxplot(alpha = 0.7) +
facet_wrap(~ `Exercise Level`, ncol = 3) +
# This creates three separate plots (one for each Exercise Level)
# ncol=3 make sure we have 3 columns
labs(
title = "Happiness Score by Diet Type, Conditioned on Exercise Level",
subtitle = "Faceted analysis shows the diet-happiness relationship \n changes with exercise habits.",
x = "Diet Type",
y = "Happiness Score"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(face = "bold", hjust = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1), # Angle x-labels for better fit
legend.position = "none" # Remove redundant legend since Diet Type is on x-axis
)
ggsave("Women_Happiness_Diet_Exercise.png", plot = Plot_11)
## Saving 7 x 5 in image
Plot_11
#Multiple Linear Regression for Happiness and Mental Health Lifestyle Dataset Variables
library(dplyr) # For data manipulation
#Ensure column names are easy to work with by cleaning spaces
mh_life_clean <- mh_life %>%
rename_all(~gsub(" ", "_", .)) %>%
rename_all(~gsub("\\(|\\)", "", .))
#Filter the dataset for the specific subgroup: Females with High Exercise Level
mh_subset <- mh_life_clean %>%
filter(Gender == "Female", Exercise_Level == "High") %>%
#Select the continuous response variable (Happiness_Score) and relevant predictors
select(Happiness_Score, Diet_Type, Sleep_Hours, Stress_Level, Mental_Health_Condition,
Work_Hours_per_Week, Screen_Time_per_Day_Hours, Social_Interaction_Score) %>%
na.omit() # Remove any rows with missing values
print(paste("Analysis Subgroup Size (Females, High Exercise Level):", nrow(mh_subset)))
## [1] "Analysis Subgroup Size (Females, High Exercise Level): 327"
if (nrow(mh_subset) < 20) {
stop("The subset size is too small to build a reliable model.")
}
#For better comparison of impact, we scale the numeric variables
#(z-score standardization) so a one-unit change in each represents one standard deviation.
numeric_vars <- c("Sleep_Hours", "Work_Hours_per_Week", "Screen_Time_per_Day_Hours", "Social_Interaction_Score")
mh_subset[numeric_vars] <- scale(mh_subset[numeric_vars])
#Fit the Model
model_linear <- lm(Happiness_Score ~ ., data = mh_subset)
print("--- Multiple Linear Regression Model Summary (Females, High Exercise) ---")
## [1] "--- Multiple Linear Regression Model Summary (Females, High Exercise) ---"
summary_model <- summary(model_linear)
print(summary_model)
##
## Call:
## lm(formula = Happiness_Score ~ ., data = mh_subset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.9910 -2.3018 -0.0053 2.4090 4.7482
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.44818 0.45989 11.847 <2e-16 ***
## Diet_TypeJunk Food 0.32421 0.45312 0.715 0.4748
## Diet_TypeKeto 0.48217 0.46558 1.036 0.3012
## Diet_TypeVegan -0.54609 0.45162 -1.209 0.2275
## Diet_TypeVegetarian 0.16102 0.44935 0.358 0.7203
## Sleep_Hours -0.29240 0.14960 -1.955 0.0515 .
## Stress_LevelLow -0.05076 0.35725 -0.142 0.8871
## Stress_LevelModerate -0.23005 0.36640 -0.628 0.5306
## Mental_Health_ConditionBipolar 0.01360 0.45035 0.030 0.9759
## Mental_Health_ConditionDepression 0.25342 0.47677 0.532 0.5954
## Mental_Health_ConditionNone -0.08872 0.47274 -0.188 0.8513
## Mental_Health_ConditionPTSD -0.08188 0.45180 -0.181 0.8563
## Work_Hours_per_Week 0.03495 0.14977 0.233 0.8156
## Screen_Time_per_Day_Hours 0.19438 0.14875 1.307 0.1922
## Social_Interaction_Score -0.20343 0.14827 -1.372 0.1711
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.637 on 312 degrees of freedom
## Multiple R-squared: 0.0387, Adjusted R-squared: -0.004439
## F-statistic: 0.8971 on 14 and 312 DF, p-value: 0.5624
#Determine Impact based on Coefficients and P-values
#Pull out the coefficient table
ctable <- as.data.frame(summary_model$coefficients)
names(ctable) <- c("Estimate", "Std.Error", "t_value", "P_Value")
#Filter out the Intercept and sort by P-Value to identify significant predictors
impact_summary <- ctable %>%
filter(row.names(ctable) != "(Intercept)") %>%
select(Estimate, P_Value) %>%
arrange(P_Value)
print("--- Impact Summary: Estimated Coefficients & P-Values (Sorted by Significance) ---")
## [1] "--- Impact Summary: Estimated Coefficients & P-Values (Sorted by Significance) ---"
print(impact_summary)
## Estimate P_Value
## Sleep_Hours -0.29240429 0.05152443
## Social_Interaction_Score -0.20342645 0.17105749
## Screen_Time_per_Day_Hours 0.19438420 0.19224629
## Diet_TypeVegan -0.54608734 0.22751463
## Diet_TypeKeto 0.48216621 0.30118257
## Diet_TypeJunk Food 0.32420696 0.47483784
## Stress_LevelModerate -0.23005085 0.53055287
## Mental_Health_ConditionDepression 0.25341530 0.59543078
## Diet_TypeVegetarian 0.16101903 0.72033002
## Work_Hours_per_Week 0.03495172 0.81563091
## Mental_Health_ConditionNone -0.08871525 0.85126332
## Mental_Health_ConditionPTSD -0.08187800 0.85630644
## Stress_LevelLow -0.05076408 0.88709573
## Mental_Health_ConditionBipolar 0.01359944 0.97592884
#Visualization for MLR
library(ggplot2)
library(dplyr)
library(broom) #Helps extract model components
library(forcats) #For working with factor levels
#Re-run the model
model_linear <- lm(Happiness_Score ~ ., data = mh_subset)
plot_data <- tidy(model_linear, conf.int = TRUE) %>%
filter(term != "(Intercept)") %>% # Exclude the Intercept
#Create a factor for the terms, ordered by their estimate magnitude
mutate(term = fct_reorder(term, estimate))
Plot_12 <- coefficient_plot <- ggplot(plot_data, aes(x = estimate, y = term)) +
#Highlight the significance based on the confidence interval crossing zero
#Color-coded to differentiate significant vs. non-significant terms
geom_errorbarh(
aes(xmin = conf.low, xmax = conf.high, color = as.factor(conf.low > 0 | conf.high < 0)),
height = 0.2,
linewidth = 1.2
) +
#Plot the point estimate (the coefficient)
geom_point(aes(color = as.factor(conf.low > 0 | conf.high < 0)), size = 3) +
#Add the vertical zero line (the reference point)
geom_vline(xintercept = 0, linetype = "dashed", color = "grey50") +
scale_color_manual(
name = "Significance (95% CI)",
values = c("TRUE" = "#0072B2", "FALSE" = "#D55E00"), # Blue for Significant, Orange for Non-Significant
labels = c("TRUE" = "Significant", "FALSE" = "Non-Significant")
) +
labs(
title = "Impact of Scaled Predictors \non Happiness Score",
subtitle = "Multiple Linear Regression \nfor Females with High Exercise Level \n(95% Confidence Intervals)",
x = "Coefficient Estimate (Impact on Happiness Score)",
y = "Predictor Variable"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(face = "bold", hjust = 0.5),
legend.position = "bottom"
)
print(coefficient_plot)
ggsave("MLR_happiness_predictors.png", plot = Plot_12)
## Saving 7 x 5 in image
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
interactive_coef_plot <- ggplotly(coefficient_plot)
print(interactive_coef_plot)