Load Packages

knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggplot2)
library(dplyr)
library(tidyr)
library(plotly)

## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

library(ggrepel)
library(maps)

## 
## Attaching package: 'maps'
## 
## The following object is masked from 'package:purrr':
## 
##     map

library(sf)

## Linking to GEOS 3.13.0, GDAL 3.8.5, PROJ 9.5.1; sf_use_s2() is TRUE

library(leaflet)

Load Dataset

#load dataset, view data. 
std_dep_full <- read.csv("student_depression_dataset.csv", stringsAsFactors = FALSE)
head(std_dep_full)

#too many variables, take a subset that I think may be important and are interesting to me. 
std_dep <- std_dep_full[, c("Gender", "Age", "City", "Sleep.Duration", "Financial.Stress", "Depression")]
head(std_dep)

process/tidy data as in assignment #3

#change to Yes/No
std_dep$Depression <- factor(std_dep$Depression, levels = c(0, 1), labels = c("No", "Yes"))
#Throw out values >35. Now only analyzing effect on those younger than 35. More easy for visualization purposes, maybe not a good idea if I was conducting statistical analysis. 
std_dep <- std_dep[std_dep$Age <= 35, ]
#throw out "other" category
std_dep <- std_dep[std_dep$Sleep.Duration != "Others", ]
#set in correct order
std_dep <- std_dep %>%
  mutate(Sleep.Duration = factor(Sleep.Duration,
    levels = c("'Less than 5 hours'", "'5-6 hours'", "'7-8 hours'", "'More than 8 hours'")))
## Remove rows where Financial.Stress is "?"
std_dep <- std_dep[std_dep$Financial.Stress != "?", ]
#change labels into something that is easily understandable by viewer. 
std_dep <- std_dep %>%
  mutate(Financial.Stress = as.numeric(Financial.Stress),  #
    Financial.Stress = factor(Financial.Stress,
                              levels = c(1, 2, 3, 4, 5),
                              labels = c("Very Low", "Low", "Moderate", "High", "Very High")))

Visual #2 from assignment #3. Heatmap of Financial Stress and Sleep Duration.

# recalculate percentages by subgroups.
sleep_dep_pct <- std_dep %>%
  group_by(Financial.Stress, Sleep.Duration, Depression) %>%
  summarise(Count = n(), .groups = "drop") %>%
  group_by(Financial.Stress, Sleep.Duration) %>%
  mutate(Percent = Count / sum(Count) * 100)

#just make new dataframe where just depression yes is there. 
heatmap_data <- sleep_dep_pct %>%
  filter(Depression == "Yes")

# Create heatmap plot. Colorblind safe colors. 
plot2 <- ggplot(heatmap_data, aes(x = Sleep.Duration, y = Financial.Stress, fill = Percent)) +
  geom_tile() +
  scale_fill_gradient(low = "yellow", high = "blue") +
  labs(title = "Impact of financial worry and sleep on depression incidence rate",
    x = "Sleep Duration",
    y = "Financial Stress") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
plot2

Visual #3 from assignment #3: Financial Stress, Sleep Duration, and Age Histogram.

#qualitative, colorblind friendly palette for sleep duration. 
palette_SD <- c("'Less than 5 hours'" = "#E69F00", "'5-6 hours'" = "#F0E442", "'7-8 hours'" = "#56B4E9", "'More than 8 hours'" = "#009E73")
##group into equal age buckets. 
std_dep <- std_dep %>%
  mutate(Age.Group = cut(Age,
      breaks = c(18, 23, 29, 35),
      right = TRUE,
      include.lowest = TRUE,
      labels = c("18-23", "24-29", "30-35")))

##calculate percent depression per group. 
sleep_dep_pct <- std_dep %>%
  group_by(Financial.Stress, Age.Group, Sleep.Duration, Depression) %>%
  summarise(Count = n(), .groups = "drop") %>%
  group_by(Financial.Stress, Age.Group, Sleep.Duration) %>%
  mutate(Percent = Count / sum(Count) * 100) %>%
  ungroup()

##filter just yes then plot. 
plot3 <- sleep_dep_pct %>%
  filter(Depression == "Yes") %>%
  ggplot(aes(x = Financial.Stress, y = Percent, fill = Sleep.Duration)) +
  geom_col(position = position_dodge()) +
  scale_y_continuous(
    labels = scales::percent_format(scale = 1),
    breaks = seq(0, 100, by = 10),
    limits = c(0, 100)) +
    scale_fill_manual(values = palette_SD) +
  labs(title = "Predictors of depression incidence rate separated by age class",
    x = "Financial Stress",
    y = "Depression Incidence",
    fill = "Sleep Duration") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +  
  facet_wrap(~ Age.Group)
plot3

BELOW, NEW INTERACTIVE PLOT WITH PLOTLY.

cities subset based on having enough data, easily finding latitudes and longitudes.

cities <- c("Visakhapatnam", "Bangalore", "Srinagar", "Varanasi", "Jaipur", "Pune", "Thane", 
                  "Chennai", "Nagpur", "Nashik", "Vadodara", "Kalyan", "Rajkot", "Ahmedabad", 
                  "Kolkata", "Mumbai", "Lucknow", "Indore", "Surat", "Ludhiana", "Bhopal", "Meerut", 
                  "Agra", "Ghaziabad", "Hyderabad", "Vasai-Virar", "Kanpur", "Patna", "Faridabad", "Delhi")

std_dep_cities <- std_dep %>%
  filter(City %in% cities)

connect cities with coordinate data

city_coords <- tribble(~City, ~lat, ~lon,
  "Visakhapatnam", 17.6868, 83.2185,
  "Bangalore", 12.9716, 77.5946,
  "Srinagar", 34.0837, 74.7973,
  "Varanasi", 25.3176, 82.9739,
  "Jaipur", 26.9124, 75.7873,
  "Pune", 18.5204, 73.8567,
  "Thane", 19.2183, 72.9781,
  "Chennai", 13.0827, 80.2707,
  "Nagpur", 21.1458, 79.0882,
  "Nashik", 19.9975, 73.7898,
  "Vadodara", 22.3072, 73.1812,
  "Kalyan", 19.2437, 73.1355,
  "Rajkot", 22.3039, 70.8022,
  "Ahmedabad", 23.0225, 72.5714,
  "Kolkata", 22.5726, 88.3639,
  "Mumbai", 19.0760, 72.8777,
  "Lucknow", 26.8467, 80.9462,
  "Indore", 22.7196, 75.8577,
  "Surat", 21.1702, 72.8311,
  "Ludhiana", 30.9000, 75.8573,
  "Bhopal", 23.2599, 77.4126,
  "Meerut", 28.9845, 77.7064,
  "Agra", 27.1767, 78.0081,
  "Ghaziabad", 28.6692, 77.4538,
  "Hyderabad", 17.3850, 78.4867,
  "Vasai-Virar", 19.3919, 72.8397,
  "Kanpur", 26.4499, 80.3319,
  "Patna", 25.5941, 85.1376,
  "Faridabad", 28.4089, 77.3178,
  "Delhi", 28.6139, 77.2090)

summarize variables of interest, proportions of each categorical variable per city.

#get proportions of variables of interest
std_dep_prop <- std_dep_cities %>%
  group_by(City) %>%
  summarise(n = n(),
    depression_rate = mean(Depression == "Yes")) %>%
  left_join(std_dep_cities %>%
      group_by(City, Sleep.Duration) %>%
      summarise(sleep_count = n(), .groups = "drop") %>%
      group_by(City) %>%
      mutate(sleep_pct = sleep_count / sum(sleep_count)) %>%
      select(City, Sleep.Duration, sleep_pct) %>%
      pivot_wider(names_from = Sleep.Duration, values_from = sleep_pct, names_prefix = "sleep_pct_"),
    by = "City") %>%
  left_join(std_dep_cities %>%
      group_by(City, Financial.Stress) %>%
      summarise(stress_count = n(), .groups = "drop") %>%
      group_by(City) %>%
      mutate(stress_pct = stress_count / sum(stress_count)) %>%
      select(City, Financial.Stress, stress_pct) %>%
      pivot_wider(names_from = Financial.Stress, values_from = stress_pct, names_prefix = "stress_pct_"),
    by = "City")

#join with coordinates

std_dep_x_city <- std_dep_prop %>%
  left_join(city_coords, by = "City")

# put together hover information for plotly plot
std_dep_x_city <- std_dep_x_city %>%
  rowwise() %>%
  mutate(hover_text = paste0("<b>", City, "</b><br>",
      "N: ", n, "<br>",
      "Depression Rate: ", round(depression_rate * 100, 1), "%<br>",
      "<br><b>% of each sleep duration category</b><br>",
      paste(names(across(starts_with("sleep_pct_"))),": ",
        round(c_across(starts_with("sleep_pct_")) * 100, 1), "%",
        collapse = "<br>"),
      "<br><b>level of financial stress</b><br>",
      paste(names(across(starts_with("stress_pct_"))),": ",
        round(c_across(starts_with("stress_pct_")) * 100, 1), "%",
        collapse = "<br>"))) %>%
  ungroup()

Make Interactive Plot of Student Depression Rate Across Cities in India

plot_ly(data = std_dep_x_city,
  type = 'scattergeo',
  lat = ~lat,
  lon = ~lon,
  mode = 'markers',
  width = 900,     
  height = 700,  
  marker = list(size = 30,
    color = ~depression_rate,
    colorscale = 'Viridis',
    cmin = 0.5,
    cmax = 0.7,
    colorbar = list(title = "Student Depression Prevalence",
      tickvals = seq(0.5, 0.7, by = 0.05),
      ticktext = paste0(seq(50, 70, by = 5), "%"),
      len = 0.4,         
      thickness = 10),     
    sizemode = 'diameter'),
  text = ~hover_text,
  hoverinfo = "text") %>%
layout(title = "Student Depression Prevalence of 30 Cities in India",
  geo = list(scope = 'asia',
    showland = TRUE,
    landcolor = 'rgb(250, 240, 220)',
    countrycolor = 'steelblue',
    center = list(lat = 22, lon = 80),
    lonaxis = list(range = c(68, 93)),
    lataxis = list(range = c(7, 38))))

ST 537: Assignment 4 Draft - Putting it Together

Daniel Farthing

20250602

Load Packages

Load Dataset

process/tidy data as in assignment #3

Visual #2 from assignment #3. Heatmap of Financial Stress and Sleep Duration.

Visual #3 from assignment #3: Financial Stress, Sleep Duration, and Age Histogram.

BELOW, NEW INTERACTIVE PLOT WITH PLOTLY.

cities subset based on having enough data, easily finding latitudes and longitudes.

connect cities with coordinate data

summarize variables of interest, proportions of each categorical variable per city.

Make Interactive Plot of Student Depression Rate Across Cities in India