---
title: "Mini Data Analysis Project"
author: "Sayana"
date: "2025-12-07"
output:
  html_document:
    theme: cosmo
    toc: true
    toc_float: true
params:
  title: default
---


Introduction

There is a common stereotype that computer science students are introverted, socially closed, and prone to depression. To investigate this, I collected survey responses from 100 computer science students. The dataset contains (among others) the following variables:

  • Age
  • Gender
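  • AcademicPerformance (self-rated category, e.g., "Average" or "Excellent")
  • TakingNoteInClass ("Yes" / "Sometimes" / "No")
  • DepressionStatus (self-reported categorical label: "Yes" / "Sometimes" / "No")
  • FaceChallangesToCompleteAcademicTask ("Yes" / "Sometimes" / "No"; the column name is spelled this way in the dataset)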
  • LikePresentation (measure correlated with extroversion)
  • SleepPerDayHours
  • NumberOfFriend
  • LikeNewThings (receptiveness to new experiences)

Research questions

  1. Can the number of friends (social connectedness) accurately predict the presence/level of depression among computer science students?
  2. How are sleep patterns and depression level correlated?
  3. Do preferences for presentations (as a proxy of extroversion) show patterns that relate to certain depression levels?

Hypothesis: The number of friends, liking presentations, and sleep duration will each emerge as a significant predictor of depression status among computer science students.

Load data and packages

# Packages this report depends on. The list covers every package that is
# attached with library() anywhere in the document (including patchwork,
# ggfortify and GGally, which are loaded further down), so a fresh machine
# does not fail halfway through the analysis.
pkgs <- c(
  "tidyverse", "broom", "knitr", "DT", "ggpubr", "nnet",
  "patchwork", "ggfortify", "GGally"
)

# Install only what is missing; requireNamespace() checks availability
# without attaching the package.
missing_pkgs <- pkgs[
  !vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
library(tidyverse)
library(broom)
library(knitr)
library(DT)
library(ggpubr)
library(nnet)

# Read the survey responses from the CSV in the working directory, keeping
# text columns as character vectors rather than factors.
data <- read.csv(file = "lightning.csv", stringsAsFactors = FALSE)

# Print a compact, column-by-column overview of what was loaded.
data %>% glimpse()
## Rows: 99
## Columns: 10
## $ Age                                  <int> 23, 23, 24, 20, 24, 23, 21, 21, 2…
## $ Gender                               <chr> "Male", "Male", "Male", "Female",…
## $ AcademicPerformance                  <chr> "Average", "Excellent", "Average"…
## $ TakingNoteInClass                    <chr> "No", "Sometimes", "No", "Yes", "…
## $ DepressionStatus                     <chr> "Sometimes", "Yes", "Sometimes", …
## $ FaceChallangesToCompleteAcademicTask <chr> "Yes", "No", "Sometimes", "Yes", …
## $ LikePresentation                     <chr> "Yes", "Yes", "No", "No", "Yes", …
## $ SleepPerDayHours                     <int> 12, 8, 8, 5, 5, 8, 8, 8, 8, 8, 4,…
## $ NumberOfFriend                       <int> NA, 80, 10, 15, 2, 12, 7, 6, 3, 4…
## $ LikeNewThings                        <chr> "Yes", "Yes", "Yes", "Yes", "Yes"…

Summary statistics (neat table)

Summary Statistics of Numeric Columns
Statistic Age SleepPerDayHours NumberOfFriend
Min 20.00 4.00 0.00
1st Qu. 21.00 5.00 3.00
Median 23.00 7.00 6.00
Mean 22.52 6.72 16.19
3rd Qu. 24.00 8.00 15.00
Max 25.00 12.00 100.00
NA’s 0.00 0.00 4.00

Visual exploration

Histograms and distributions

Scatter plots: Age vs NumberOfFriend and Age vs SleepPerDayHours

Linear models requested earlier

## ### Linear model: NumberOfFriend ~ Age
## # A tibble: 2 × 5
##   term        estimate std.error statistic p.value
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)   -22.7      37.5     -0.605   0.546
## 2 Age             1.73      1.66     1.04    0.301
## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>   <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1    0.0115      0.000855  25.4      1.08   0.301     1  -441.  888.  896.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

## ### Linear model: SleepPerDayHours ~ Age
## # A tibble: 2 × 5
##   term        estimate std.error statistic p.value
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)   8.09       2.55      3.18  0.00200
## 2 Age          -0.0611     0.113    -0.541 0.590  
## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>   <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1   0.00301      -0.00727  1.74     0.292   0.590     1  -195.  395.  403.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

Analysis for research questions

1) Number of friends predicting depression

## # weights:  9 (4 variable)
## initial  value 104.368167 
## final  value 99.357760 
## converged
## Multinomial regression (DepressionFactor ~ NumberOfFriend): coefficients shown below
## Call:
## multinom(formula = DepressionFactor ~ NumberOfFriend, data = data)
## 
## Coefficients:
##           (Intercept) NumberOfFriend
## Sometimes   0.9886886    -0.01927580
## Yes         0.7471444    -0.01341274
## 
## Std. Errors:
##           (Intercept) NumberOfFriend
## Sometimes   0.3344128    0.010476633
## Yes         0.3437784    0.009868549
## 
## Residual Deviance: 198.7155 
## AIC: 206.7155

2) Sleep patterns and depression correlation

3) Age x Number of friends

library(ggplot2)
library(broom)
library(dplyr)
library(patchwork)

# Regression diagnostics for the Age / NumberOfFriend linear model, laid out
# as a 2 x 2 panel: Residuals vs Fitted, Normal Q-Q, Scale-Location and
# Residuals vs Leverage. The whole block is skipped when either column is
# missing from `data`.
if (all(c("Age", "NumberOfFriend") %in% names(data))) {

  # Keep only rows where both variables are observed.
  clean_df <- data %>%
    select(Age, NumberOfFriend) %>%
    na.omit()

  # NOTE(review): Age is the response here, while the other models in this
  # report use NumberOfFriend as the response and Age as the predictor.
  # Confirm the intended direction before interpreting these diagnostics.
  model <- lm(Age ~ NumberOfFriend, data = clean_df)

  # One row per observation with .fitted, .resid, .hat (leverage) and
  # .std.resid (standardised residuals). Building this table explicitly with
  # broom::augment() avoids handing the lm object to ggplot(), which relies
  # on ggplot2's implicit fortify() conversion for model objects.
  diag_df <- broom::augment(model)

  p1 <- ggplot(diag_df, aes(.fitted, .resid)) +
    geom_point() +
    geom_smooth(method = "loess", se = FALSE, color = "blue") +
    labs(title = "Residuals vs Fitted", x = "Fitted values", y = "Residuals") +
    theme_minimal(base_size = 12)

  p2 <- ggplot(diag_df, aes(sample = .std.resid)) +
    stat_qq() +
    stat_qq_line() +
    labs(title = "Normal Q-Q", x = "Theoretical Quantiles", y = "Standardized residuals") +
    theme_minimal(base_size = 12)

  p3 <- ggplot(diag_df, aes(.fitted, sqrt(abs(.std.resid)))) +
    geom_point() +
    geom_smooth(method = "loess", se = FALSE, color = "blue") +
    labs(title = "Scale-Location",
         x = "Fitted values", y = "√|Standardized residuals|") +
    theme_minimal(base_size = 12)

  p4 <- ggplot(diag_df, aes(.hat, .std.resid)) +
    geom_point() +
    geom_smooth(method = "loess", se = FALSE, color = "blue") +
    labs(title = "Residuals vs Leverage",
         x = "Leverage", y = "Standardized Residuals") +
    theme_minimal(base_size = 12)

  # patchwork layout: p1 and p2 on the top row, p3 and p4 underneath.
  (p1 | p2) /
  (p3 | p4)

}

library(ggfortify)

# Quadratic regression of friend count on age, followed by diagnostic plots.
# Runs only when both columns are present in `data`.
if (all(c("Age", "NumberOfFriend") %in% names(data))) {

  # Restrict to the two modelling columns and drop incomplete rows.
  clean_df_poly <- na.omit(select(data, NumberOfFriend, Age))

  # poly(Age, 2) supplies the linear and quadratic terms for Age.
  poly_model <- lm(NumberOfFriend ~ poly(Age, 2), data = clean_df_poly)

  # Coefficient table and fit statistics.
  poly_model %>%
    summary() %>%
    print()

  # Diagnostic panels via autoplot() (ggfortify is attached just above),
  # restyled with the minimal theme.
  autoplot(poly_model, label.size = 3) + theme_minimal(base_size = 13)

}
## 
## Call:
## lm(formula = NumberOfFriend ~ poly(Age, 2), data = clean_df_poly)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -23.577 -14.710  -7.878   5.766  79.903 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     16.189      2.545   6.361 7.66e-09 ***
## poly(Age, 2)1   26.388     24.807   1.064   0.2902    
## poly(Age, 2)2  -57.652     24.807  -2.324   0.0223 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24.81 on 92 degrees of freedom
## Multiple R-squared:  0.0663, Adjusted R-squared:  0.046 
## F-statistic: 3.266 on 2 and 92 DF,  p-value: 0.04261

library(ggplot2)
library(dplyr)

# Distribution of NumberOfFriend within each DepressionStatus level, drawn as
# a violin with an overlaid boxplot and the individual observations.
# Runs only when both columns are present in `data`.
if (all(c("DepressionStatus", "NumberOfFriend") %in% names(data))) {

  depression_levels <- c("No", "Sometimes", "Yes")

  # factor() silently converts any label outside `levels` to NA, which would
  # drop those respondents from the plot without notice. Surface them instead.
  unexpected <- setdiff(
    unique(as.character(data$DepressionStatus)),
    c(depression_levels, NA)
  )
  if (length(unexpected) > 0) {
    warning(
      "DepressionStatus values outside No/Sometimes/Yes will become NA: ",
      paste(unexpected, collapse = ", "),
      call. = FALSE
    )
  }

  # This updates `data` itself (not a copy), so code further down that groups
  # by DepressionStatus sees the same No < Sometimes < Yes level order.
  data$DepressionStatus <- factor(
    data$DepressionStatus,
    levels = depression_levels
  )

  clean_df <- data %>%
    select(DepressionStatus, NumberOfFriend) %>%
    na.omit()

  ggplot(clean_df, aes(x = DepressionStatus, y = NumberOfFriend, fill = DepressionStatus)) +
    geom_violin(trim = FALSE, alpha = 0.5) +
    # Every observation is already drawn by the point layer below, so the
    # boxplot's own outlier markers are suppressed to avoid plotting the same
    # respondent twice.
    geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.8) +
    # Horizontal jitter only (height = 0) so the friend counts are shown at
    # their true values; the fixed seed makes the figure reproducible.
    geom_point(
      position = position_jitter(width = 0.15, height = 0, seed = 1),
      alpha = 0.5
    ) +
    scale_fill_brewer(palette = "Set2") +
    labs(
      title = "Number of Friends Across Depression Status Levels",
      x = "Depression Status",
      y = "Number of Friends"
    ) +
    theme_minimal(base_size = 14)

}

# Average friend count per depression status (missing counts ignored).
mean_df <- summarize(
  group_by(data, DepressionStatus),
  mean_friends = mean(NumberOfFriend, na.rm = TRUE)
)

# Lollipop chart: a stem from zero up to each group's mean, capped by a dot.
ggplot(data = mean_df, mapping = aes(x = DepressionStatus, y = mean_friends)) +
  geom_segment(
    mapping = aes(xend = DepressionStatus, y = 0, yend = mean_friends)
  ) +
  geom_point(color = "steelblue", size = 5) +
  theme_minimal() +
  labs(title = "Average Number of Friends by Depression Status")

library(GGally)

# Pairwise plot matrix of the three numeric survey variables.
num_small <- select(data, Age, NumberOfFriend, SleepPerDayHours)
ggpairs(num_small)

Conclusion

The initial hypothesis was that computer science students' number of friends, preference for presentations, and sleep hours can predict their depression status. Because depression status is self-reported by students as "Yes", "No", or "Sometimes", it was treated as the dependent variable and the other variables as independent variables; relationships between Age and the three potential predictors were also examined. Sleep patterns appear to show a roughly linear pattern with depression status, whereas the number of friends and liking presentations show nonlinear patterns in the plots.