Load Packages

knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
library(tidyr)
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggrepel)
library(maps)
## 
## Attaching package: 'maps'
## 
## The following object is masked from 'package:purrr':
## 
##     map
library(sf)
## Linking to GEOS 3.13.0, GDAL 3.8.5, PROJ 9.5.1; sf_use_s2() is TRUE
library(leaflet)

Load Dataset

#load dataset, view data. 
std_dep_full <- read.csv("student_depression_dataset.csv", stringsAsFactors = FALSE)
head(std_dep_full)
#too many variables, take a subset that I think may be important and are interesting to me. 
std_dep <- std_dep_full[, c("Gender", "Age", "City", "Sleep.Duration", "Financial.Stress", "Depression")]
head(std_dep)

process/tidy data as in assignment #3

#change to Yes/No
std_dep$Depression <- factor(std_dep$Depression, levels = c(0, 1), labels = c("No", "Yes"))
#Throw out values >35. Now only analyzing effect on those younger than 35. More easy for visualization purposes, maybe not a good idea if I was conducting statistical analysis. 
std_dep <- std_dep[std_dep$Age <= 35, ]
#throw out "other" category
std_dep <- std_dep[std_dep$Sleep.Duration != "Others", ]
#set in correct order
std_dep <- std_dep %>%
  mutate(Sleep.Duration = factor(Sleep.Duration,
    levels = c("'Less than 5 hours'", "'5-6 hours'", "'7-8 hours'", "'More than 8 hours'")))
## Remove rows where Financial.Stress is "?"
std_dep <- std_dep[std_dep$Financial.Stress != "?", ]
#change labels into something that is easily understandable by viewer. 
std_dep <- std_dep %>%
  mutate(Financial.Stress = as.numeric(Financial.Stress),  #
    Financial.Stress = factor(Financial.Stress,
                              levels = c(1, 2, 3, 4, 5),
                              labels = c("Very Low", "Low", "Moderate", "High", "Very High")))

Visual #2 from assignment #3. Heatmap of Financial Stress and Sleep Duration.

# recalculate percentages by subgroups.
sleep_dep_pct <- std_dep %>%
  group_by(Financial.Stress, Sleep.Duration, Depression) %>%
  summarise(Count = n(), .groups = "drop") %>%
  group_by(Financial.Stress, Sleep.Duration) %>%
  mutate(Percent = Count / sum(Count) * 100)

#just make new dataframe where just depression yes is there. 
heatmap_data <- sleep_dep_pct %>%
  filter(Depression == "Yes")

# Create heatmap plot. Colorblind safe colors. 
plot2 <- ggplot(heatmap_data, aes(x = Sleep.Duration, y = Financial.Stress, fill = Percent)) +
  geom_tile() +
  scale_fill_gradient(low = "yellow", high = "blue") +
  labs(title = "Impact of financial worry and sleep on depression incidence rate",
    x = "Sleep Duration",
    y = "Financial Stress") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
plot2

Visual #3 from assignment #3: Financial Stress, Sleep Duration, and Age Histogram.

#qualitative, colorblind friendly palette for sleep duration. 
palette_SD <- c("'Less than 5 hours'" = "#E69F00", "'5-6 hours'" = "#F0E442", "'7-8 hours'" = "#56B4E9", "'More than 8 hours'" = "#009E73")
##group into equal age buckets. 
std_dep <- std_dep %>%
  mutate(Age.Group = cut(Age,
      breaks = c(18, 23, 29, 35),
      right = TRUE,
      include.lowest = TRUE,
      labels = c("18-23", "24-29", "30-35")))

##calculate percent depression per group. 
sleep_dep_pct <- std_dep %>%
  group_by(Financial.Stress, Age.Group, Sleep.Duration, Depression) %>%
  summarise(Count = n(), .groups = "drop") %>%
  group_by(Financial.Stress, Age.Group, Sleep.Duration) %>%
  mutate(Percent = Count / sum(Count) * 100) %>%
  ungroup()

##filter just yes then plot. 
plot3 <- sleep_dep_pct %>%
  filter(Depression == "Yes") %>%
  ggplot(aes(x = Financial.Stress, y = Percent, fill = Sleep.Duration)) +
  geom_col(position = position_dodge()) +
  scale_y_continuous(
    labels = scales::percent_format(scale = 1),
    breaks = seq(0, 100, by = 10),
    limits = c(0, 100)) +
    scale_fill_manual(values = palette_SD) +
  labs(title = "Predictors of depression incidence rate separated by age class",
    x = "Financial Stress",
    y = "Depression Incidence",
    fill = "Sleep Duration") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +  
  facet_wrap(~ Age.Group)
plot3

BELOW, NEW INTERACTIVE PLOT WITH PLOTLY.

cities subset based on having enough data, easily finding latitudes and longitudes.

cities <- c("Visakhapatnam", "Bangalore", "Srinagar", "Varanasi", "Jaipur", "Pune", "Thane", 
                  "Chennai", "Nagpur", "Nashik", "Vadodara", "Kalyan", "Rajkot", "Ahmedabad", 
                  "Kolkata", "Mumbai", "Lucknow", "Indore", "Surat", "Ludhiana", "Bhopal", "Meerut", 
                  "Agra", "Ghaziabad", "Hyderabad", "Vasai-Virar", "Kanpur", "Patna", "Faridabad", "Delhi")

std_dep_cities <- std_dep %>%
  filter(City %in% cities)

connect cities with coordinate data

city_coords <- tribble(~City, ~lat, ~lon,
  "Visakhapatnam", 17.6868, 83.2185,
  "Bangalore", 12.9716, 77.5946,
  "Srinagar", 34.0837, 74.7973,
  "Varanasi", 25.3176, 82.9739,
  "Jaipur", 26.9124, 75.7873,
  "Pune", 18.5204, 73.8567,
  "Thane", 19.2183, 72.9781,
  "Chennai", 13.0827, 80.2707,
  "Nagpur", 21.1458, 79.0882,
  "Nashik", 19.9975, 73.7898,
  "Vadodara", 22.3072, 73.1812,
  "Kalyan", 19.2437, 73.1355,
  "Rajkot", 22.3039, 70.8022,
  "Ahmedabad", 23.0225, 72.5714,
  "Kolkata", 22.5726, 88.3639,
  "Mumbai", 19.0760, 72.8777,
  "Lucknow", 26.8467, 80.9462,
  "Indore", 22.7196, 75.8577,
  "Surat", 21.1702, 72.8311,
  "Ludhiana", 30.9000, 75.8573,
  "Bhopal", 23.2599, 77.4126,
  "Meerut", 28.9845, 77.7064,
  "Agra", 27.1767, 78.0081,
  "Ghaziabad", 28.6692, 77.4538,
  "Hyderabad", 17.3850, 78.4867,
  "Vasai-Virar", 19.3919, 72.8397,
  "Kanpur", 26.4499, 80.3319,
  "Patna", 25.5941, 85.1376,
  "Faridabad", 28.4089, 77.3178,
  "Delhi", 28.6139, 77.2090)

summarize variables of interest, proportions of each categorical variable per city.

#get proportions of variables of interest
std_dep_prop <- std_dep_cities %>%
  group_by(City) %>%
  summarise(n = n(),
    depression_rate = mean(Depression == "Yes")) %>%
  left_join(std_dep_cities %>%
      group_by(City, Sleep.Duration) %>%
      summarise(sleep_count = n(), .groups = "drop") %>%
      group_by(City) %>%
      mutate(sleep_pct = sleep_count / sum(sleep_count)) %>%
      select(City, Sleep.Duration, sleep_pct) %>%
      pivot_wider(names_from = Sleep.Duration, values_from = sleep_pct, names_prefix = "sleep_pct_"),
    by = "City") %>%
  left_join(std_dep_cities %>%
      group_by(City, Financial.Stress) %>%
      summarise(stress_count = n(), .groups = "drop") %>%
      group_by(City) %>%
      mutate(stress_pct = stress_count / sum(stress_count)) %>%
      select(City, Financial.Stress, stress_pct) %>%
      pivot_wider(names_from = Financial.Stress, values_from = stress_pct, names_prefix = "stress_pct_"),
    by = "City")

#join with coordinates

std_dep_x_city <- std_dep_prop %>%
  left_join(city_coords, by = "City")

# put together hover information for plotly plot
std_dep_x_city <- std_dep_x_city %>%
  rowwise() %>%
  mutate(hover_text = paste0("<b>", City, "</b><br>",
      "N: ", n, "<br>",
      "Depression Rate: ", round(depression_rate * 100, 1), "%<br>",
      "<br><b>% of each sleep duration category</b><br>",
      paste(names(across(starts_with("sleep_pct_"))),": ",
        round(c_across(starts_with("sleep_pct_")) * 100, 1), "%",
        collapse = "<br>"),
      "<br><b>level of financial stress</b><br>",
      paste(names(across(starts_with("stress_pct_"))),": ",
        round(c_across(starts_with("stress_pct_")) * 100, 1), "%",
        collapse = "<br>"))) %>%
  ungroup()

Make Interactive Plot of Student Depression Rate Across Cities in India

plot_ly(data = std_dep_x_city,
  type = 'scattergeo',
  lat = ~lat,
  lon = ~lon,
  mode = 'markers',
  width = 900,     
  height = 700,  
  marker = list(size = 30,
    color = ~depression_rate,
    colorscale = 'Viridis',
    cmin = 0.5,
    cmax = 0.7,
    colorbar = list(title = "Student Depression Prevalence",
      tickvals = seq(0.5, 0.7, by = 0.05),
      ticktext = paste0(seq(50, 70, by = 5), "%"),
      len = 0.4,         
      thickness = 10),     
    sizemode = 'diameter'),
  text = ~hover_text,
  hoverinfo = "text") %>%
layout(title = "Student Depression Prevalence of 30 Cities in India",
  geo = list(scope = 'asia',
    showland = TRUE,
    landcolor = 'rgb(250, 240, 220)',
    countrycolor = 'steelblue',
    center = list(lat = 22, lon = 80),
    lonaxis = list(range = c(68, 93)),
    lataxis = list(range = c(7, 38))))