Step 1: Install and Load Required Libraries
library(ggpubr)
## Loading required package: ggplot2
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(dplyr)
library(ggplot2)
theme_set(theme_pubr()) # Make ggpubr's publication-ready theme the default for every ggplot2 plot
Step 2: Import & Clean the Dataset
# Prompt for the survey workbook interactively and read it in
social_media_df <- read_excel(path = file.choose())
# Drop the columns that are not needed for this analysis
unused_cols <- c(1, 5:8, 10:18)
response_df <- social_media_df[-unused_cols]
Rename columns
colnames(response_df)
## [1] "1. What is your age?"
## [2] "2. Gender"
## [3] "3. Relationship Status"
## [4] "8. What is the average time you spend on social media every day?"
## [5] "18. How often do you feel depressed or down?"
## [6] "19. On a scale of 1 to 5, how frequently does your interest in daily activities fluctuate?"
## [7] "20. On a scale of 1 to 5, how often do you face issues regarding sleep?"
# Map each verbose survey question onto a short, analysis-friendly column name.
# Questions 18-20 are the three items that are later averaged into "Depression".
short_names <- c(
  "1. What is your age?" = "Age",
  "2. Gender" = "Gender",
  "3. Relationship Status" = "Relationship_Status",
  "8. What is the average time you spend on social media every day?" = "Avg_Time",
  "18. How often do you feel depressed or down?" = "Depression_Q1",
  "19. On a scale of 1 to 5, how frequently does your interest in daily activities fluctuate?" = "Depression_Q2",
  "20. On a scale of 1 to 5, how often do you face issues regarding sleep?" = "Depression_Q3"
)
# Only columns whose current name appears in the lookup are renamed
matched <- names(response_df) %in% names(short_names)
names(response_df)[matched] <- unname(short_names[names(response_df)[matched]])
colnames(response_df)
## [1] "Age" "Gender" "Relationship_Status"
## [4] "Avg_Time" "Depression_Q1" "Depression_Q2"
## [7] "Depression_Q3"
Focus on only male and female respondents
# Keep only respondents who answered exactly "Male" or "Female".
# A whitelist is used rather than pattern-matching each alternative answer:
# it does not depend on regex metacharacters (e.g. the "???" in one free-text
# response), cannot drop rows through an accidental substring match, and also
# excludes missing or unanticipated responses.
response_df <- response_df[response_df$Gender %in% c("Male", "Female"), ]
response_df
## # A tibble: 474 × 7
## Age Gender Relationship_Status Avg_Time Depression_Q1 Depression_Q2
## <dbl> <chr> <chr> <chr> <dbl> <dbl>
## 1 21 Male In a relationship Between 2 and 3… 5 4
## 2 21 Female Single More than 5 hou… 5 4
## 3 21 Female Single Between 3 and 4… 4 2
## 4 21 Female Single More than 5 hou… 4 3
## 5 21 Female Single Between 2 and 3… 4 4
## 6 22 Female Single Between 2 and 3… 3 2
## 7 21 Female Married Between 3 and 4… 5 5
## 8 21 Female In a relationship More than 5 hou… 5 5
## 9 21 Female In a relationship More than 5 hou… 5 5
## 10 20 Male Single Less than an Ho… 1 1
## # ℹ 464 more rows
## # ℹ 1 more variable: Depression_Q3 <dbl>
Step 3: Create Data Frame for Row-mean Scores in “Depression” and
Variables of Interest
# Depression score = row-wise mean of the three depression items
depression_items <- c("Depression_Q1", "Depression_Q2", "Depression_Q3")
Depression_df <- data.frame(
  Depression = rowMeans(as.data.frame(response_df[, depression_items]))
)
Make new data frame
# Assemble the analysis data frame, naming each column as it is created
mhealth_df <- data.frame(
  Age = response_df$Age,
  Depression = Depression_df$Depression,
  Gender = response_df$Gender,
  Relationship_Status = response_df$Relationship_Status
)
colnames(mhealth_df)
## [1] "Age" "Depression" "Gender"
## [4] "Relationship_Status"
Filter ages <25 & >= 25
# Split respondents into two exhaustive, non-overlapping age groups.
# `>= 25` (rather than `> 24`) keeps any non-integer age between 24 and 25
# from being counted in both groups.
mhealth1_df <- subset(mhealth_df, Age < 25)
mhealth2_df <- subset(mhealth_df, Age >= 25)
Step 4: Summarize Data and Data Visualization
Defining the variables
Variable | Definition
-------------------------|------------
1. Age | Respondent's age (grouped as under 25, and 25 or older)
2. Gender | Respondents who are Male or Female
3. Relationship_Status | Respondents who are Single, In a relationship, Married and Divorced
4. Depression | The mean score for questions pertaining to Depression
Questions aggregated defined as the variable “Depression”
18. How often do you feel depressed or down?
19. On a scale of 1 to 5, how frequently does your interest in daily activities fluctuate?
20. On a scale of 1 to 5, how often do you face issues regarding sleep?
Descriptive statistics
# Summary statistics (min, quartiles, median, mean, max) for Age, all respondents
summary(mhealth_df$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13.00 21.00 22.00 26.12 26.00 91.00
# Same summary for Depression, restricted to the under-25 subset (mhealth1_df)
summary(mhealth1_df$Depression)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.667 3.333 3.400 4.000 5.000
Interpretation: For the summary of age
(1) The youngest respondent is 13 years old, while the oldest is 91 years old.
(2) The median age is 22 years.
(3) The mean age is 26.12 years.
(4) For the summary of depression scores among respondents aged less than 25.
(5) The minimum depression score is 1 -> the lowest level of depression reported.
(6) The median depression score is approximately 3.33.
(7) The mean depression score is approximately 3.4.
Create cross-tabulation and proportion Table
# Counts of respondents for every Gender x Relationship_Status combination
cross_tab_gender_relationship <- table(
  mhealth_df[["Gender"]],
  mhealth_df[["Relationship_Status"]]
)
# Column-wise proportions: each relationship-status column sums to 1
prop_table_gender_relationship <- prop.table(
  cross_tab_gender_relationship,
  margin = 2
)
Print tables
print(cross_tab_gender_relationship)
##
## Divorced In a relationship Married Single
## Female 1 57 53 152
## Male 5 30 48 128
print(prop_table_gender_relationship)
##
## Divorced In a relationship Married Single
## Female 0.1666667 0.6551724 0.5247525 0.5428571
## Male 0.8333333 0.3448276 0.4752475 0.4571429
Create tables for gender and relationship status
# Frequency and share of respondents per gender
gender_table <- setNames(
  as.data.frame(table(mhealth_df$Gender)),
  c("Gender", "Frequency")
)
gender_table$Proportion <- with(gender_table, Frequency / sum(Frequency))
# Frequency and share of respondents per relationship status
relationship_table <- setNames(
  as.data.frame(table(mhealth_df$Relationship_Status)),
  c("Relationship_Status", "Frequency")
)
relationship_table$Proportion <- with(
  relationship_table,
  Frequency / sum(Frequency)
)
Display the tables
print(gender_table)
## Gender Frequency Proportion
## 1 Female 263 0.5548523
## 2 Male 211 0.4451477
print(relationship_table)
## Relationship_Status Frequency Proportion
## 1 Divorced 6 0.01265823
## 2 In a relationship 87 0.18354430
## 3 Married 101 0.21308017
## 4 Single 280 0.59071730
Interpretation: Gender Table
(1) Table shows the frequency and proportion of respondents by gender.
(2) 263 female respondents and 211 male respondents.
(3) The proportion indicates the proportion of each gender category relative to the total number of respondents. For example, approximately 55.49% of the respondents are female, while approximately 44.51% are male
(4) Relationship Status Table:The table presents the frequency and proportion of respondents by relationship status.
(5)Among the respondents:
(a) 6 respondents are divorced, making up approximately 1.27% of the total.
(b) 87 respondents are in a relationship, which is about 18.35% of the total.
(c) 101 respondents are married, accounting for approximately 21.31% of the total.
(d) 280 respondents are single, representing about 59.07% of the total.
Histogram of depression scores
# Distribution of the Depression scores, using bins one score point wide
ggplot(data = mhealth_df, mapping = aes(x = Depression)) +
  geom_histogram(binwidth = 1, color = "black", fill = "skyblue") +
  labs(
    title = "Histogram of Depression",
    x = "Depression Score",
    y = "Frequency"
  ) +
  theme_pubr()

Interpretation: Histogram of Depression Scores
The histogram shows the distribution of depression scores
The x-axis represents the depression score, while the y-axis represents the frequency (number of respondents)
The bars show how many respondents fall into each depression score range
The histogram's shape and spread can provide insights into the distribution of depression scores among the respondents, such as whether the scores are normally distributed, skewed, or exhibit other patterns
Boxplot of depression Scores by gender
# Depression scores split by gender, kept as plain numeric vectors
MD <- with(mhealth_df, Depression[Gender == "Male"])
FD <- with(mhealth_df, Depression[Gender == "Female"])
# Side-by-side boxplots: male first, then female
boxplot(
  MD, FD,
  main = "Depression Scores by Gender",
  ylab = "Depression Levels",
  names = c("Male", "Female"),
  col = c("#00AFBB", "pink")
)

Interpretation: Boxplot of Depression Scores by Gender
A low score of 1 generally indicates low intensity, and a high score of 5 typically indicates high intensity.
Feelings of Depression [Depression] - Question 18 / How often do you feel depressed or down?
Fluctuation of interest [Depression] - Question 19 / On a scale of 1 to 5, how frequently does your interest in daily activities fluctuate?
Sleep Issues [Depression] - Question 20 / On a scale of 1 to 5, how often do you face issues regarding sleep?
Females report higher depression scores than Males.
Boxplot of depression scores by relationship status
# Depression scores for each relationship status, one numeric vector per group
single <- with(mhealth_df, Depression[Relationship_Status == "Single"])
relationship <- with(
  mhealth_df,
  Depression[Relationship_Status == "In a relationship"]
)
married <- with(mhealth_df, Depression[Relationship_Status == "Married"])
divorced <- with(mhealth_df, Depression[Relationship_Status == "Divorced"])
# One box per group, drawn in the order the vectors are passed
boxplot(
  single, relationship, married, divorced,
  names = c("Single", "In a Relationship", "Married", "Divorced"),
  col = c("lightblue", "lightgreen", "lightcoral", "gray"),
  main = "Depression Scores by Relationship Status",
  xlab = "Relationship_Status",
  ylab = "Depression Scores"
)

Interpretation: Boxplot of Depression Scores by Relationship Status
We see that Single and in a relationship have the highest depression scores while divorced individuals show the lowest depression score.
The distribution of depression scores varies across different relationship statuses.
Single and divorced participants have a wider spread of depression scores, suggesting more variability in their mental health.
Those in a relationship or married tend to have less variability in their depression scores.
Step 6: Testing Assumptions
Test of independence: “Depression” and “Gender”
# Pearson's chi-squared test of independence between Depression and Gender.
# Depression is a numeric row mean, so each distinct score value is tabulated
# as its own category.
chisq_g <- chisq.test(x = mhealth_df$Depression, y = mhealth_df$Gender)
print(chisq_g)
##
## Pearson's Chi-squared test
##
## data: mhealth_df$Depression and mhealth_df$Gender
## X-squared = 13.006, df = 12, p-value = 0.3686
Interpretation: Since the p-value (0.37) > 0.05, we fail to reject the null hypothesis of independence. There is insufficient evidence to conclude that gender and depression are dependent on each other.
Testing to find out if “Depression” is normally distributed
# Test normality of the Depression scores using the Shapiro-Wilk test
# (H0: the sample comes from a normally distributed population)
shapiro.test(Depression_df$Depression)
##
## Shapiro-Wilk normality test
##
## data: Depression_df$Depression
## W = 0.97135, p-value = 5.246e-08
Interpretation: Since the p-value (5.246e-08) is less than the significance level (0.05), we reject the null hypothesis. This suggests that the 'Depression' scores are not normally distributed in the population.
Testing to find out if “Age” is normally distributed
# Test normality of Age using the Shapiro-Wilk test
# (H0: the sample comes from a normally distributed population)
shapiro.test(response_df$Age)
##
## Shapiro-Wilk normality test
##
## data: response_df$Age
## W = 0.7261, p-value < 2.2e-16
Interpretation: Since the p-value (< 2.2e-16) is less than the significance level (0.05), we reject the null hypothesis. This indicates that the 'Age' variable is not normally distributed in the population.
Explaining method choice:
Non-parametric tests are used instead of parametric tests because the depression scores are not normally distributed: the Mann-Whitney U test for the two-group comparisons (age group and gender) and the Kruskal-Wallis test for the comparison across relationship statuses.
Since Age and Depression are not normally distributed, we will use the non-parametric Mann-Whitney U test to determine if there is a significant difference in the median depression scores between the two age groups.
Step 7: Mann-Whitney U test for depression scores between
individuals under and over the age of 25
Parameters
M1 = the population median depression score for Age < 25
M2 = the population median depression score for Age ≥ 25
Hypothesis
H0: M1 = M2
Ha: M1 ≠ M2
Two groups based on their age: those under 25 and those 25 or
older.
# Depression scores for the two age groups. `>= 25` (rather than `> 24`) makes
# the groups mutually exclusive even if an age is not a whole number.
group_under_25 <- subset(mhealth_df, Age < 25)$Depression
group_over_25 <- subset(mhealth_df, Age >= 25)$Depression
# Mann-Whitney U (Wilcoxon rank-sum) test on two independent samples
wilcox.test(group_under_25, group_over_25,
paired = FALSE)
##
## Wilcoxon rank sum test with continuity correction
##
## data: group_under_25 and group_over_25
## W = 32082, p-value = 3.987e-08
## alternative hypothesis: true location shift is not equal to 0
Interpretation: Since the p-value (3.987e-08) < level of significance (0.05), we reject H0.
Conclusion: There is sufficient evidence to conclude that there is a significant difference in depression scores between individuals under 25 and those 25 and older who use social media.
Step 8: Mann-Whitney U Test for Depression Scores by Gender
Parameters
M3 = the population median depression score for Females
M4 = the population median depression score for Males
Hypothesis
H0: M3 = M4
Ha: M3 ≠ M4
Clean and extract data for specific needs.
# MD: Depression scores of the male respondents
MD <- mhealth_df$Depression[mhealth_df$Gender == "Male"]
# FD: Depression scores of the female respondents
FD <- mhealth_df$Depression[mhealth_df$Gender == "Female"]
# recalling both MD and FD to see results and double check work
MD
## [1] 4.666667 1.000000 4.666667 5.000000 3.000000 2.666667 2.000000 2.333333
## [9] 2.000000 2.333333 2.333333 4.333333 3.666667 4.333333 3.000000 2.666667
## [17] 2.333333 4.666667 2.666667 3.000000 2.000000 1.666667 3.000000 5.000000
## [25] 1.333333 2.000000 3.333333 3.000000 3.666667 3.333333 3.000000 4.000000
## [33] 3.666667 1.666667 5.000000 1.666667 3.000000 3.666667 4.333333 4.333333
## [41] 3.333333 3.666667 2.666667 2.333333 3.666667 3.000000 3.000000 3.000000
## [49] 2.000000 4.333333 3.333333 4.333333 2.666667 2.666667 3.000000 4.333333
## [57] 4.000000 2.666667 2.333333 2.666667 1.666667 4.333333 3.000000 3.333333
## [65] 2.000000 2.333333 2.666667 2.000000 2.333333 3.666667 3.333333 3.000000
## [73] 2.666667 3.333333 3.000000 2.333333 2.333333 1.666667 1.666667 3.333333
## [81] 5.000000 1.333333 3.333333 2.000000 2.000000 2.000000 4.666667 2.000000
## [89] 1.333333 1.333333 3.333333 3.666667 5.000000 2.333333 4.666667 2.666667
## [97] 1.000000 5.000000 1.000000 2.000000 1.000000 2.333333 4.333333 3.000000
## [105] 2.666667 4.333333 4.000000 3.666667 1.333333 3.000000 3.333333 3.666667
## [113] 2.666667 3.333333 3.333333 3.666667 3.333333 5.000000 5.000000 4.000000
## [121] 4.000000 3.333333 4.666667 2.333333 2.666667 2.666667 2.666667 3.333333
## [129] 5.000000 3.666667 3.000000 4.000000 5.000000 2.333333 3.666667 3.000000
## [137] 3.666667 4.666667 4.000000 2.666667 2.666667 2.666667 1.666667 2.333333
## [145] 3.333333 3.000000 3.333333 3.666667 1.000000 2.000000 4.000000 2.000000
## [153] 3.333333 2.666667 4.000000 2.666667 2.000000 5.000000 2.666667 3.666667
## [161] 3.000000 3.666667 3.666667 3.333333 2.666667 3.333333 3.333333 3.333333
## [169] 3.666667 5.000000 3.666667 3.000000 4.000000 1.000000 1.000000 5.000000
## [177] 3.000000 4.666667 4.333333 3.333333 2.000000 3.666667 2.666667 4.666667
## [185] 4.333333 2.666667 5.000000 4.333333 5.000000 3.000000 3.666667 3.000000
## [193] 2.333333 3.333333 3.000000 4.333333 1.000000 2.333333 1.333333 2.666667
## [201] 1.000000 3.333333 2.000000 2.333333 2.666667 2.333333 2.666667 1.000000
## [209] 3.666667 4.333333 2.333333
FD
## [1] 4.666667 3.666667 3.000000 3.000000 3.000000 4.333333 3.666667 3.666667
## [9] 3.000000 3.000000 3.000000 4.000000 3.000000 2.666667 3.000000 3.333333
## [17] 4.000000 2.666667 4.333333 3.000000 2.333333 2.666667 2.000000 3.666667
## [25] 2.666667 3.000000 1.000000 3.666667 4.333333 3.666667 3.000000 3.666667
## [33] 4.333333 4.666667 3.333333 4.333333 4.666667 3.333333 5.000000 4.666667
## [41] 3.333333 1.333333 4.666667 3.666667 2.333333 4.000000 4.333333 2.666667
## [49] 2.333333 3.666667 4.000000 1.000000 3.000000 3.000000 3.000000 3.000000
## [57] 3.666667 3.666667 3.333333 4.666667 1.000000 3.000000 4.000000 2.666667
## [65] 2.000000 4.000000 3.666667 4.000000 2.000000 1.333333 2.000000 1.000000
## [73] 1.333333 4.000000 2.666667 5.000000 3.000000 3.666667 2.333333 2.666667
## [81] 2.333333 4.000000 2.000000 3.333333 4.666667 4.000000 2.000000 3.666667
## [89] 2.333333 4.333333 1.000000 4.000000 4.333333 2.333333 4.333333 3.333333
## [97] 3.666667 2.333333 3.666667 4.000000 2.333333 2.666667 4.000000 4.000000
## [105] 1.666667 2.333333 2.333333 3.333333 2.000000 1.000000 3.666667 3.333333
## [113] 3.666667 2.333333 3.333333 3.000000 3.666667 3.000000 3.666667 3.000000
## [121] 2.000000 1.666667 1.000000 3.333333 4.333333 4.666667 3.333333 4.333333
## [129] 2.666667 3.333333 2.333333 2.333333 1.333333 4.333333 1.000000 4.333333
## [137] 2.666667 4.000000 5.000000 2.000000 3.000000 2.666667 2.666667 4.666667
## [145] 2.333333 2.333333 2.666667 2.666667 4.000000 2.666667 2.666667 5.000000
## [153] 3.333333 5.000000 3.000000 1.666667 4.000000 3.666667 4.333333 3.000000
## [161] 3.666667 3.000000 4.333333 5.000000 1.000000 1.333333 4.000000 3.666667
## [169] 4.333333 2.666667 4.000000 2.000000 3.000000 2.000000 4.666667 2.666667
## [177] 4.333333 3.333333 4.666667 5.000000 5.000000 4.666667 3.333333 4.333333
## [185] 3.000000 3.666667 3.000000 3.333333 4.666667 5.000000 5.000000 3.666667
## [193] 2.333333 4.333333 4.000000 3.333333 3.666667 3.333333 2.666667 2.666667
## [201] 4.000000 1.666667 4.000000 4.000000 4.000000 4.666667 3.333333 3.666667
## [209] 1.666667 4.666667 4.333333 2.000000 5.000000 4.666667 3.333333 4.333333
## [217] 3.333333 2.000000 1.666667 4.000000 5.000000 3.000000 5.000000 4.666667
## [225] 3.333333 3.333333 3.666667 2.333333 4.666667 3.666667 2.000000 4.333333
## [233] 3.000000 3.000000 3.666667 2.666667 4.333333 5.000000 4.000000 3.666667
## [241] 3.333333 1.666667 2.000000 3.666667 4.666667 4.333333 3.666667 2.000000
## [249] 3.333333 3.333333 1.333333 2.666667 2.333333 3.333333 4.000000 4.666667
## [257] 3.666667 3.333333 3.666667 2.333333 4.666667 3.000000 2.000000
Build Model
# Mann-Whitney U (Wilcoxon rank-sum) test comparing male (MD) and female (FD)
# depression scores. The result is stored in `results` and printed so that the
# p-value cited in the interpretation is visible in the report.
results <- wilcox.test(MD, FD)
print(results)
Interpretation:
Since, p-value 0.02 < 0.05, we reject the null hypothesis.
It is clear that there is a significant difference between the scores of females and Males. To see whether females or males reported higher depression, we can go back to the data visualization. We see that Females reported higher depression scores compared to Men and that the samples are in fact, different from each other.
Step 9: Kruskal-Wallis Test for Depression by Relationship
Status
Parameters
M5 = the population median depression score for Single
M6 = the population median depression score for In a Relationship
M7 = the population median depression score for Married
M8 = the population median depression score for Divorced
Hypothesis
H0: M5 = M6 = M7 = M8
Ha: At least one of the population medians M5, M6, M7, M8 differs from the others
Kruskal-Wallis test (nonparametric alternative to ANOVA)
# Kruskal-Wallis rank sum test of Depression across the Relationship_Status groups
kruskal.test(Depression~Relationship_Status, data = mhealth_df)
##
## Kruskal-Wallis rank sum test
##
## data: Depression by Relationship_Status
## Kruskal-Wallis chi-squared = 47.779, df = 3, p-value = 2.373e-10
# Pairwise Wilcoxon rank sum tests between relationship statuses,
# with Benjamini-Hochberg ("BH") adjustment for multiple comparisons
pairwise.wilcox.test(mhealth_df$Depression, mhealth_df$Relationship_Status,
p.adjust.method = "BH")
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: mhealth_df$Depression and mhealth_df$Relationship_Status
##
## Divorced In a relationship Married
## In a relationship 0.16 - -
## Married 0.71 1.4e-06 -
## Single 0.16 0.71 2.3e-10
##
## P value adjustment method: BH
Interpretation: a p-value of 2.373e-10 (< 0.05) indicates we can reject our null hypothesis and conclude that
not all population medians are equal
Step 10: Pairwise comparison
Pairwise comparison is necessary because there is a significant difference in the population median depression scores among individuals who are single, in a relationship, married, or divorced.
# Pairwise Wilcoxon rank sum tests between relationship statuses,
# with Benjamini-Hochberg ("BH") adjustment for multiple comparisons
pairwise.wilcox.test(mhealth_df$Depression, mhealth_df$Relationship_Status,
p.adjust.method = "BH")
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: mhealth_df$Depression and mhealth_df$Relationship_Status
##
## Divorced In a relationship Married
## In a relationship 0.16 - -
## Married 0.71 1.4e-06 -
## Single 0.16 0.71 2.3e-10
##
## P value adjustment method: BH
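Interpretation: with adjusted p-values of 1.4e-06 and 2.3e-10, we can conclude there is a significant difference in the population medians between the groups "In a relationship" and "Married", and between "Married" and "Single". No other pair of groups differs significantly at the 0.05 level.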