knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
library(tidyr)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(ggrepel)
library(maps)
##
## Attaching package: 'maps'
##
## The following object is masked from 'package:purrr':
##
## map
library(sf)
## Linking to GEOS 3.13.0, GDAL 3.8.5, PROJ 9.5.1; sf_use_s2() is TRUE
library(leaflet)
#load dataset, view data.
std_dep_full <- read.csv("student_depression_dataset.csv", stringsAsFactors = FALSE)
head(std_dep_full)
#too many variables, take a subset that I think may be important and are interesting to me.
std_dep <- std_dep_full[, c("Gender", "Age", "City", "Sleep.Duration", "Financial.Stress", "Depression")]
head(std_dep)
#change to Yes/No
std_dep$Depression <- factor(std_dep$Depression, levels = c(0, 1), labels = c("No", "Yes"))
#Throw out values >35. Now only analyzing effect on those younger than 35. More easy for visualization purposes, maybe not a good idea if I was conducting statistical analysis.
std_dep <- std_dep[std_dep$Age <= 35, ]
#throw out "other" category
std_dep <- std_dep[std_dep$Sleep.Duration != "Others", ]
#set in correct order
std_dep <- std_dep %>%
mutate(Sleep.Duration = factor(Sleep.Duration,
levels = c("'Less than 5 hours'", "'5-6 hours'", "'7-8 hours'", "'More than 8 hours'")))
## Remove rows where Financial.Stress is "?"
std_dep <- std_dep[std_dep$Financial.Stress != "?", ]
#change labels into something that is easily understandable by viewer.
std_dep <- std_dep %>%
mutate(Financial.Stress = as.numeric(Financial.Stress), #
Financial.Stress = factor(Financial.Stress,
levels = c(1, 2, 3, 4, 5),
labels = c("Very Low", "Low", "Moderate", "High", "Very High")))
# recalculate percentages by subgroups.
sleep_dep_pct <- std_dep %>%
group_by(Financial.Stress, Sleep.Duration, Depression) %>%
summarise(Count = n(), .groups = "drop") %>%
group_by(Financial.Stress, Sleep.Duration) %>%
mutate(Percent = Count / sum(Count) * 100)
#just make new dataframe where just depression yes is there.
heatmap_data <- sleep_dep_pct %>%
filter(Depression == "Yes")
# Create heatmap plot. Colorblind safe colors.
plot2 <- ggplot(heatmap_data, aes(x = Sleep.Duration, y = Financial.Stress, fill = Percent)) +
geom_tile() +
scale_fill_gradient(low = "yellow", high = "blue") +
labs(title = "Impact of financial worry and sleep on depression incidence rate",
x = "Sleep Duration",
y = "Financial Stress") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
plot2
#qualitative, colorblind friendly palette for sleep duration.
palette_SD <- c("'Less than 5 hours'" = "#E69F00", "'5-6 hours'" = "#F0E442", "'7-8 hours'" = "#56B4E9", "'More than 8 hours'" = "#009E73")
##group into equal age buckets.
std_dep <- std_dep %>%
mutate(Age.Group = cut(Age,
breaks = c(18, 23, 29, 35),
right = TRUE,
include.lowest = TRUE,
labels = c("18-23", "24-29", "30-35")))
##calculate percent depression per group.
sleep_dep_pct <- std_dep %>%
group_by(Financial.Stress, Age.Group, Sleep.Duration, Depression) %>%
summarise(Count = n(), .groups = "drop") %>%
group_by(Financial.Stress, Age.Group, Sleep.Duration) %>%
mutate(Percent = Count / sum(Count) * 100) %>%
ungroup()
##filter just yes then plot.
plot3 <- sleep_dep_pct %>%
filter(Depression == "Yes") %>%
ggplot(aes(x = Financial.Stress, y = Percent, fill = Sleep.Duration)) +
geom_col(position = position_dodge()) +
scale_y_continuous(
labels = scales::percent_format(scale = 1),
breaks = seq(0, 100, by = 10),
limits = c(0, 100)) +
scale_fill_manual(values = palette_SD) +
labs(title = "Predictors of depression incidence rate separated by age class",
x = "Financial Stress",
y = "Depression Incidence",
fill = "Sleep Duration") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
facet_wrap(~ Age.Group)
plot3
cities <- c("Visakhapatnam", "Bangalore", "Srinagar", "Varanasi", "Jaipur", "Pune", "Thane",
"Chennai", "Nagpur", "Nashik", "Vadodara", "Kalyan", "Rajkot", "Ahmedabad",
"Kolkata", "Mumbai", "Lucknow", "Indore", "Surat", "Ludhiana", "Bhopal", "Meerut",
"Agra", "Ghaziabad", "Hyderabad", "Vasai-Virar", "Kanpur", "Patna", "Faridabad", "Delhi")
std_dep_cities <- std_dep %>%
filter(City %in% cities)
city_coords <- tribble(~City, ~lat, ~lon,
"Visakhapatnam", 17.6868, 83.2185,
"Bangalore", 12.9716, 77.5946,
"Srinagar", 34.0837, 74.7973,
"Varanasi", 25.3176, 82.9739,
"Jaipur", 26.9124, 75.7873,
"Pune", 18.5204, 73.8567,
"Thane", 19.2183, 72.9781,
"Chennai", 13.0827, 80.2707,
"Nagpur", 21.1458, 79.0882,
"Nashik", 19.9975, 73.7898,
"Vadodara", 22.3072, 73.1812,
"Kalyan", 19.2437, 73.1355,
"Rajkot", 22.3039, 70.8022,
"Ahmedabad", 23.0225, 72.5714,
"Kolkata", 22.5726, 88.3639,
"Mumbai", 19.0760, 72.8777,
"Lucknow", 26.8467, 80.9462,
"Indore", 22.7196, 75.8577,
"Surat", 21.1702, 72.8311,
"Ludhiana", 30.9000, 75.8573,
"Bhopal", 23.2599, 77.4126,
"Meerut", 28.9845, 77.7064,
"Agra", 27.1767, 78.0081,
"Ghaziabad", 28.6692, 77.4538,
"Hyderabad", 17.3850, 78.4867,
"Vasai-Virar", 19.3919, 72.8397,
"Kanpur", 26.4499, 80.3319,
"Patna", 25.5941, 85.1376,
"Faridabad", 28.4089, 77.3178,
"Delhi", 28.6139, 77.2090)
#get proportions of variables of interest
std_dep_prop <- std_dep_cities %>%
group_by(City) %>%
summarise(n = n(),
depression_rate = mean(Depression == "Yes")) %>%
left_join(std_dep_cities %>%
group_by(City, Sleep.Duration) %>%
summarise(sleep_count = n(), .groups = "drop") %>%
group_by(City) %>%
mutate(sleep_pct = sleep_count / sum(sleep_count)) %>%
select(City, Sleep.Duration, sleep_pct) %>%
pivot_wider(names_from = Sleep.Duration, values_from = sleep_pct, names_prefix = "sleep_pct_"),
by = "City") %>%
left_join(std_dep_cities %>%
group_by(City, Financial.Stress) %>%
summarise(stress_count = n(), .groups = "drop") %>%
group_by(City) %>%
mutate(stress_pct = stress_count / sum(stress_count)) %>%
select(City, Financial.Stress, stress_pct) %>%
pivot_wider(names_from = Financial.Stress, values_from = stress_pct, names_prefix = "stress_pct_"),
by = "City")
#join with coordinates
std_dep_x_city <- std_dep_prop %>%
left_join(city_coords, by = "City")
# put together hover information for plotly plot
std_dep_x_city <- std_dep_x_city %>%
rowwise() %>%
mutate(hover_text = paste0("<b>", City, "</b><br>",
"N: ", n, "<br>",
"Depression Rate: ", round(depression_rate * 100, 1), "%<br>",
"<br><b>% of each sleep duration category</b><br>",
paste(names(across(starts_with("sleep_pct_"))),": ",
round(c_across(starts_with("sleep_pct_")) * 100, 1), "%",
collapse = "<br>"),
"<br><b>level of financial stress</b><br>",
paste(names(across(starts_with("stress_pct_"))),": ",
round(c_across(starts_with("stress_pct_")) * 100, 1), "%",
collapse = "<br>"))) %>%
ungroup()
plot_ly(data = std_dep_x_city,
type = 'scattergeo',
lat = ~lat,
lon = ~lon,
mode = 'markers',
width = 900,
height = 700,
marker = list(size = 30,
color = ~depression_rate,
colorscale = 'Viridis',
cmin = 0.5,
cmax = 0.7,
colorbar = list(title = "Student Depression Prevalence",
tickvals = seq(0.5, 0.7, by = 0.05),
ticktext = paste0(seq(50, 70, by = 5), "%"),
len = 0.4,
thickness = 10),
sizemode = 'diameter'),
text = ~hover_text,
hoverinfo = "text") %>%
layout(title = "Student Depression Prevalence of 30 Cities in India",
geo = list(scope = 'asia',
showland = TRUE,
landcolor = 'rgb(250, 240, 220)',
countrycolor = 'steelblue',
center = list(lat = 22, lon = 80),
lonaxis = list(range = c(68, 93)),
lataxis = list(range = c(7, 38))))