---
title: "Mini Data Analysis Project"
author: "Sayana"
date: "2025-12-07"
output:
  html_document:
    theme: cosmo
    toc: true
    toc_float: true
params:
  title: default
---
There is a common stereotype that computer science students are introverted, socially closed, and prone to depression. To investigate this, I collected survey responses from 100 computer science students. The dataset contains (among others) the following variables:
- `Age`
- `Gender`
- `AcademicPerformance` (e.g., GPA or self-rated)
- `TakingNoteInClass` (yes/no or Likert)
- `DepressionStatus` (survey score or categorical label)
- `FaceChallangesToCompleteAcademicTask` (yes/no or Likert; the column name is spelled this way in the dataset)
- `LikePresentation` (measure correlated with extroversion)
- `SleepPerDayHours`
- `NumberOfFriend`
- `LikeNewThings` (receptiveness to new experiences)

**Hypothesis:** Number of friends, liking presentations, and sleep quality will emerge as definite predictors of depression level among computer science students.
# Setup ----
# Every package attached anywhere in this report is listed here so that the
# install guard below covers all of them, including the plotting helpers
# (patchwork, ggfortify, GGally) that later sections load with library().
pkgs <- c(
  "tidyverse", "broom", "knitr", "DT", "ggpubr", "nnet",
  "patchwork", "ggfortify", "GGally"
)

# install.packages() stops with "trying to use CRAN without setting a mirror"
# in a non-interactive session when no mirror is configured, so fall back to
# the CRAN cloud mirror only when the session has none.
cran_repos <- getOption("repos")
cran_url <- unname(cran_repos["CRAN"])
if (is.null(cran_repos) || is.na(cran_url) || cran_url == "@CRAN@") {
  cran_repos["CRAN"] <- "https://cloud.r-project.org"
}

# Install only the packages whose namespace cannot be loaded.
missing_pkgs <- pkgs[
  !vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs, repos = cran_repos)
}

library(tidyverse)
library(broom)
library(knitr)
library(DT)
library(ggpubr)
library(nnet)

# Read the survey responses; character columns stay as character so they can
# be converted to factors explicitly where an analysis needs it.
data <- read.csv("lightning.csv", stringsAsFactors = FALSE)
glimpse(data)
## Rows: 99
## Columns: 10
## $ Age <int> 23, 23, 24, 20, 24, 23, 21, 21, 2…
## $ Gender <chr> "Male", "Male", "Male", "Female",…
## $ AcademicPerformance <chr> "Average", "Excellent", "Average"…
## $ TakingNoteInClass <chr> "No", "Sometimes", "No", "Yes", "…
## $ DepressionStatus <chr> "Sometimes", "Yes", "Sometimes", …
## $ FaceChallangesToCompleteAcademicTask <chr> "Yes", "No", "Sometimes", "Yes", …
## $ LikePresentation <chr> "Yes", "Yes", "No", "No", "Yes", …
## $ SleepPerDayHours <int> 12, 8, 8, 5, 5, 8, 8, 8, 8, 8, 4,…
## $ NumberOfFriend <int> NA, 80, 10, 15, 2, 12, 7, 6, 3, 4…
## $ LikeNewThings <chr> "Yes", "Yes", "Yes", "Yes", "Yes"…
| Statistic | Age | SleepPerDayHours | NumberOfFriend |
|---|---|---|---|
| Min | 20.00 | 4.00 | 0.00 |
| 1st Qu. | 21.00 | 5.00 | 3.00 |
| Median | 23.00 | 7.00 | 6.00 |
| Mean | 22.52 | 6.72 | 16.19 |
| 3rd Qu. | 24.00 | 8.00 | 15.00 |
| Max | 25.00 | 12.00 | 100.00 |
| NA’s | 0.00 | 0.00 | 4.00 |
## ### Linear model: NumberOfFriend ~ Age
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -22.7 37.5 -0.605 0.546
## 2 Age 1.73 1.66 1.04 0.301
## # A tibble: 1 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0115 0.000855 25.4 1.08 0.301 1 -441. 888. 896.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
## ### Linear model: SleepPerDayHours ~ Age
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 8.09 2.55 3.18 0.00200
## 2 Age -0.0611 0.113 -0.541 0.590
## # A tibble: 1 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.00301 -0.00727 1.74 0.292 0.590 1 -195. 395. 403.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
## # weights: 9 (4 variable)
## initial value 104.368167
## final value 99.357760
## converged
## Multinomial regression (DepressionFactor ~ NumberOfFriend): coefficients shown below
## Call:
## multinom(formula = DepressionFactor ~ NumberOfFriend, data = data)
##
## Coefficients:
## (Intercept) NumberOfFriend
## Sometimes 0.9886886 -0.01927580
## Yes 0.7471444 -0.01341274
##
## Std. Errors:
## (Intercept) NumberOfFriend
## Sometimes 0.3344128 0.010476633
## Yes 0.3437784 0.009868549
##
## Residual Deviance: 198.7155
## AIC: 206.7155
library(ggplot2)
library(broom)
library(dplyr)
library(patchwork)
# Diagnostic plots for the simple linear model NumberOfFriend ~ Age ----
# Builds a 2 x 2 panel (residuals vs fitted, normal Q-Q, scale-location,
# residuals vs leverage). The block is skipped when either column is absent.
if (all(c("Age", "NumberOfFriend") %in% names(data))) {
  # Keep only the two model variables and drop rows with a missing value.
  clean_df <- data %>%
    select(Age, NumberOfFriend) %>%
    na.omit()

  # NumberOfFriend is the response and Age the predictor, so these diagnostics
  # describe the same model as the reported linear fit and the polynomial
  # follow-up in this report (previously the two variables were swapped).
  model <- lm(NumberOfFriend ~ Age, data = clean_df)

  # broom::augment() returns one row per observation with the columns
  # .fitted, .resid, .std.resid and .hat, so every panel reads from an
  # explicit data frame instead of relying on ggplot2 fortifying the lm object.
  model_diag <- broom::augment(model)

  # NOTE(review): Age is integer-valued, so fitted values and leverage fall on
  # only a few distinct x positions; the loess smoothers may emit
  # near-singularity warnings on such data -- confirm when knitting.
  p1 <- ggplot(model_diag, aes(.fitted, .resid)) +
    geom_point() +
    geom_smooth(method = "loess", se = FALSE, color = "blue") +
    labs(
      title = "Residuals vs Fitted",
      x = "Fitted values",
      y = "Residuals"
    ) +
    theme_minimal(base_size = 12)

  p2 <- ggplot(model_diag, aes(sample = .std.resid)) +
    stat_qq() +
    stat_qq_line() +
    labs(
      title = "Normal Q-Q",
      x = "Theoretical Quantiles",
      y = "Standardized residuals"
    ) +
    theme_minimal(base_size = 12)

  p3 <- ggplot(model_diag, aes(.fitted, sqrt(abs(.std.resid)))) +
    geom_point() +
    geom_smooth(method = "loess", se = FALSE, color = "blue") +
    labs(
      title = "Scale-Location",
      x = "Fitted values",
      y = "√|Standardized residuals|"
    ) +
    theme_minimal(base_size = 12)

  p4 <- ggplot(model_diag, aes(.hat, .std.resid)) +
    geom_point() +
    geom_smooth(method = "loess", se = FALSE, color = "blue") +
    labs(
      title = "Residuals vs Leverage",
      x = "Leverage",
      y = "Standardized Residuals"
    ) +
    theme_minimal(base_size = 12)

  # patchwork layout: p1 and p2 on the top row, p3 and p4 on the bottom row.
  (p1 | p2) /
    (p3 | p4)
}
library(ggfortify)
# Quadratic model of NumberOfFriend on Age ----
# Runs only when both columns are present in `data`; prints the model summary
# and then draws the diagnostic panel produced by autoplot().
if (!anyNA(match(c("Age", "NumberOfFriend"), names(data)))) {
  # Two-column frame with incomplete rows removed.
  clean_df_poly <- na.omit(select(data, NumberOfFriend, Age))

  # poly(Age, 2) supplies the linear and quadratic terms for Age.
  poly_model <- lm(
    formula = NumberOfFriend ~ poly(Age, 2),
    data = clean_df_poly
  )

  poly_model %>%
    summary() %>%
    print()

  autoplot(poly_model, label.size = 3) + theme_minimal(base_size = 13)
}
##
## Call:
## lm(formula = NumberOfFriend ~ poly(Age, 2), data = clean_df_poly)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.577 -14.710 -7.878 5.766 79.903
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.189 2.545 6.361 7.66e-09 ***
## poly(Age, 2)1 26.388 24.807 1.064 0.2902
## poly(Age, 2)2 -57.652 24.807 -2.324 0.0223 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.81 on 92 degrees of freedom
## Multiple R-squared: 0.0663, Adjusted R-squared: 0.046
## F-statistic: 3.266 on 2 and 92 DF, p-value: 0.04261
library(ggplot2)
library(dplyr)
# Distribution of NumberOfFriend within each DepressionStatus level ----
# Runs only when both columns are present in `data`.
if (!anyNA(match(c("DepressionStatus", "NumberOfFriend"), names(data)))) {
  # Recode the status in place as a factor with the level order
  # No < Sometimes < Yes; values outside these three levels become NA.
  data[["DepressionStatus"]] <- factor(
    data[["DepressionStatus"]],
    levels = c("No", "Sometimes", "Yes")
  )

  # Two-column frame with incomplete rows removed.
  clean_df <- na.omit(select(data, DepressionStatus, NumberOfFriend))

  # Layers, bottom to top: violin, narrow boxplot, jittered raw points.
  ggplot(
    clean_df,
    aes(x = DepressionStatus, y = NumberOfFriend, fill = DepressionStatus)
  ) +
    geom_violin(trim = FALSE, alpha = 0.5) +
    geom_boxplot(width = 0.2, outlier.color = "black", alpha = 0.8) +
    geom_jitter(width = 0.15, alpha = 0.5) +
    scale_fill_brewer(palette = "Set2") +
    ggtitle("Number of Friends Across Depression Status Levels") +
    xlab("Depression Status") +
    ylab("Number of Friends") +
    theme_minimal(base_size = 14)
}
# Lollipop chart of the mean number of friends per depression status ----
# One row per DepressionStatus group; missing friend counts are ignored when
# averaging.
mean_df <- summarise(
  group_by(data, DepressionStatus),
  mean_friends = mean(NumberOfFriend, na.rm = TRUE)
)

# Each stem runs from 0 up to the group mean and is capped with a point.
ggplot(mean_df, aes(x = DepressionStatus, y = mean_friends)) +
  geom_segment(
    aes(xend = DepressionStatus, y = 0, yend = mean_friends)
  ) +
  geom_point(size = 5, color = "steelblue") +
  ggtitle("Average Number of Friends by Depression Status") +
  theme_minimal()
library(GGally)
# Pairwise plot matrix for the three numeric survey variables ----
num_small <- select(data, Age, NumberOfFriend, SleepPerDayHours)
num_small %>%
  GGally::ggpairs()
## Conclusion
The original hypothesis was that computer science students' number of friends, preference for presentations, and sleep hours can predict their depression status. Because depression status is self-reported by students as "Yes", "No", or "Sometimes", relationships were examined between Age and the three potential predictors; depression status is the dependent variable, while the other variables are independent. Sleep patterns showed a roughly linear pattern with depression status, whereas number of friends and liking presentations showed nonlinear patterns in the plots.