# List of packages
packages <- c("tidyverse", "dplyr", "modelsummary", "ggplot2", "ggcorrplot", "DescTools", "car")
# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
## backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
##
## Revert to `kableExtra` for one session:
##
## options(modelsummary_factory_default = 'kableExtra')
## options(modelsummary_factory_latex = 'kableExtra')
## options(modelsummary_factory_html = 'kableExtra')
##
## Silence this message forever:
##
## config_modelsummary(startup_message = FALSE)
##
##
## Attaching package: 'DescTools'
##
##
## The following objects are masked from 'package:modelsummary':
##
## Format, Mean, Median, N, SD, Var
##
##
## Loading required package: carData
##
##
## Attaching package: 'car'
##
##
## The following object is masked from 'package:DescTools':
##
## Recode
##
##
## The following object is masked from 'package:dplyr':
##
## recode
##
##
## The following object is masked from 'package:purrr':
##
## some
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "modelsummary" "lubridate" "forcats" "stringr" "dplyr"
## [6] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [11] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [16] "datasets" "methods" "base"
##
## [[4]]
## [1] "modelsummary" "lubridate" "forcats" "stringr" "dplyr"
## [6] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [11] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [16] "datasets" "methods" "base"
##
## [[5]]
## [1] "ggcorrplot" "modelsummary" "lubridate" "forcats" "stringr"
## [6] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [11] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [16] "utils" "datasets" "methods" "base"
##
## [[6]]
## [1] "DescTools" "ggcorrplot" "modelsummary" "lubridate" "forcats"
## [6] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [11] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [16] "grDevices" "utils" "datasets" "methods" "base"
##
## [[7]]
## [1] "car" "carData" "DescTools" "ggcorrplot" "modelsummary"
## [6] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [11] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [16] "stats" "graphics" "grDevices" "utils" "datasets"
## [21] "methods" "base"
library(dplyr) # For data manipulation
library(ggplot2) # For visualization
library(ggcorrplot) # For correlation matrix visualization
library(DescTools) # Descriptive stats tools used for binning
library(car) # for multicollinearity check
aggdata3 <- read.csv("aggdata3.csv")
longdata3 <- read.csv("longdata3.csv")
#Data Preparation
str(aggdata3)
## 'data.frame': 209 obs. of 9 variables:
## $ uni : int 1 2 3 4 5 6 7 8 9 10 ...
## $ sentiment : num 0.0278 0.0335 0.1259 0.0344 0.053 ...
## $ t_count : num 0.0154 0.0343 0.0337 0 0.0602 0.0208 0.0208 0.0467 0 0.0428 ...
## $ cost_of_living : int 111 112 108 84 169 92 166 85 166 166 ...
## $ STUFACR : int 16 19 20 15 19 15 10 19 11 11 ...
## $ UGDS : int 18467 64778 25234 13466 15554 15134 7395 14398 17668 17668 ...
## $ first_date : chr "2024-11-04 16:27:03+00:00" "2024-11-04 05:02:23+00:00" "2024-11-04 07:42:30+00:00" "2024-11-04 18:37:46+00:00" ...
## $ last_date : chr "2024-12-03 19:04:34+00:00" "2024-12-03 02:36:19+00:00" "2024-12-04 00:34:33+00:00" "2024-12-03 08:08:59+00:00" ...
## $ cost.of.living.manual.input: chr "" "" "" "" ...
str(longdata3)
## 'data.frame': 36191 obs. of 8 variables:
## $ uni : int 1 1 1 1 1 1 1 1 1 1 ...
## $ psample : int 1 2 3 4 5 6 7 8 9 10 ...
## $ date : chr "2024-12-03T19:04:34+00:00" "2024-12-03T00:27:12+00:00" "2024-12-02T20:34:41+00:00" "2024-12-02T04:38:15+00:00" ...
## $ sentiment : num 0 0 -0.71 0 0 ...
## $ t_count : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cost_of_living: int 111 111 111 111 111 111 111 111 111 111 ...
## $ STUFACR : int 16 16 16 16 16 16 16 16 16 16 ...
## $ UGDS : int 18467 18467 18467 18467 18467 18467 18467 18467 18467 18467 ...
cormatrix <- aggdata3 %>%
select(sentiment, UGDS, STUFACR, cost_of_living, t_count) %>%
cor()
print(cormatrix)
## sentiment UGDS STUFACR cost_of_living
## sentiment 1.0000000000 -0.074795032 -0.11022246 -0.0002232583
## UGDS -0.0747950323 1.000000000 0.52984434 -0.0012899660
## STUFACR -0.1102224615 0.529844342 1.00000000 -0.0468878649
## cost_of_living -0.0002232583 -0.001289966 -0.04688786 1.0000000000
## t_count -0.1904260718 0.152438833 0.14644232 -0.0637577942
## t_count
## sentiment -0.19042607
## UGDS 0.15243883
## STUFACR 0.14644232
## cost_of_living -0.06375779
## t_count 1.00000000
ggcorrplot(cormatrix, method = "square")
INTERPRETATION: Average Sentiment Correlation with UGDS: -0.0748 Weak negative correlation. Correlation with STUFACR: -0.1102 Weak negative correlation. Correlation with t_count: -0.1904 Weak negative correlation. Correlation with cost_of_living: NA
UGDS correlation with STUFACR: 0.5298 Moderate positive correlation. This makes sense; larger numbers of students tend to be associated with higher student-faculty ratios.
anova_longdata3 <- aov(sentiment ~ as.factor(uni), data = longdata3)
summary(anova_longdata3)
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(uni) 208 21.7 0.10420 1.985 1.61e-15 ***
## Residuals 35982 1889.3 0.05251
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova_eta3 <- summary(anova_longdata3)
ss_between3 <- anova_eta3[[1]]$`Sum Sq`[1] # Between-group sum of squares
ss_total3 <- sum(anova_eta3[[1]]$`Sum Sq`) # Total sum of squares
eta_squared3 <- ss_between3 / ss_total3
cat("Eta-squared:", eta_squared3, "\n")
## Eta-squared: 0.01134182
F value: 1.985*** There are statistically significant differences in sentiment between universities (p<0.001) The effect size (eta-squared value of 0.0113) is small. There are meaningful differences between university groups. This will be examined with institutional characteristics.
# Density plots
ggplot(aggdata3, aes(x = sentiment)) +
geom_density(color = "red", fill = "red", alpha = 0.5) +
labs(
title = "Density Plot of Average Sentiment",
x = "Average Sentiment",
y = "Density"
) +
theme_minimal()
ggplot(longdata3, aes(x = sentiment)) +
geom_density(color = "red", fill = "red", alpha = 0.5) +
labs(
title = "Density Plot of Sentiment",
x = "Sentiment",
y = "Density"
) +
theme_minimal()
longdata density plot: Most sentiment samples are scored by VADER as
neutral with many values hovering around 0.0. There seems to be slightly
more positively scored samples than negatively scored samples across all
sentiment observations. aggdata density plot: most average sentiment
samples are neutral/slightly positive.
#regression model to predict sentiment based on institutional characteristics
model_main3 <- lm(sentiment ~ UGDS + STUFACR + cost_of_living + t_count, data = aggdata3)
summary(model_main3)
##
## Call:
## lm(formula = sentiment ~ UGDS + STUFACR + cost_of_living + t_count,
## data = aggdata3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.254079 -0.020922 -0.003855 0.020100 0.117290
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.868e-02 1.364e-02 4.301 2.64e-05 ***
## UGDS -1.174e-08 2.583e-07 -0.045 0.964
## STUFACR -5.907e-04 5.784e-04 -1.021 0.308
## cost_of_living -1.803e-05 7.990e-05 -0.226 0.822
## t_count -2.829e-01 1.102e-01 -2.567 0.011 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0424 on 204 degrees of freedom
## Multiple R-squared: 0.04344, Adjusted R-squared: 0.02469
## F-statistic: 2.316 on 4 and 204 DF, p-value: 0.05854
# VIF
vif_values <- vif(model_main3)
print(vif_values)
## UGDS STUFACR cost_of_living t_count
## 1.403002 1.402360 1.006641 1.033876
# Diagnostic Plots
par(mfrow = c(1, 1))
plot(model_main3)
VIF has no multicollinearity. investigating outliers 48,77,99,108.
# Refit model without any extreme outliers
aggdataminus4 <- aggdata3[-c(48, 77, 99, 108), ]
modelminus4 <- lm(sentiment ~ UGDS + STUFACR + cost_of_living + t_count, data = aggdataminus4)
summary(modelminus4)
##
## Call:
## lm(formula = sentiment ~ UGDS + STUFACR + cost_of_living + t_count,
## data = aggdataminus4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.113826 -0.020712 -0.005104 0.018684 0.103043
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.091e-02 1.185e-02 4.295 2.72e-05 ***
## UGDS -1.861e-07 2.166e-07 -0.859 0.39123
## STUFACR -1.907e-04 4.963e-04 -0.384 0.70126
## cost_of_living 2.236e-05 7.230e-05 0.309 0.75748
## t_count -2.510e-01 9.328e-02 -2.691 0.00773 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03538 on 200 degrees of freedom
## Multiple R-squared: 0.04938, Adjusted R-squared: 0.03037
## F-statistic: 2.597 on 4 and 200 DF, p-value: 0.03752
now checking for cooks distance.
cooksD3 <- cooks.distance(model_main3)
influential_points3 <- which(cooksD3 > 4 / length(cooksD3)) # Rule of thumb
influential_points3
## 15 48 77 99 107 108 118 148 155 203
## 15 48 77 99 107 108 118 148 155 203
# Refit model without any outliers where cooksd>4/n
cleaned_data3 <- aggdata3[-c(15, 48, 77, 99, 107, 108, 118, 148, 155, 203), ]
model_cleaned3 <- lm(sentiment ~ UGDS + STUFACR + cost_of_living + t_count, data = cleaned_data3)
summary(model_cleaned3)
##
## Call:
## lm(formula = sentiment ~ UGDS + STUFACR + cost_of_living + t_count,
## data = cleaned_data3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.083104 -0.019638 -0.004246 0.018451 0.106909
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.123e-02 1.119e-02 5.470 1.37e-07 ***
## UGDS -1.893e-07 2.191e-07 -0.864 0.38870
## STUFACR -1.564e-04 4.607e-04 -0.339 0.73466
## cost_of_living -7.035e-05 6.919e-05 -1.017 0.31047
## t_count -2.789e-01 8.827e-02 -3.160 0.00183 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03241 on 194 degrees of freedom
## Multiple R-squared: 0.06451, Adjusted R-squared: 0.04522
## F-statistic: 3.344 on 4 and 194 DF, p-value: 0.01127
# VIF
vif_valuesabc <- vif(model_cleaned3)
print(vif_valuesabc)
## UGDS STUFACR cost_of_living t_count
## 1.268545 1.266870 1.002874 1.018437
# Diagnostic Plots
par(mfrow = c(1, 1))
plot(model_cleaned3)
I will use and interpret this model because it has the highest
predictive accuracy, improvement in model fit and continued robustness
of t_count and nonsignificance of the other variables.
INTERPRETATION: t_count is significant (p<0.01) where the greater the t_count in a given subreddit, the lower the average sentiment. UGDS, STUFACR, and cost of living are not significant.
Model fit: Multiple R-squared: 0.06451, Adjusted R-squared: 0.04522 Model is statistically significant (p<0.05), but only 4-6%% of the variance is explained by the model.
No multicollinearity, VIF values < 5. Diagnostic plots look good.
# Visualize Relationships
plot_STUFACR3 <- ggplot(aggdata3, aes(x = STUFACR, y = sentiment)) +
geom_point() +
geom_smooth(method = "lm", color = "purple") +
labs(title = "Sentiment vs Student-Faculty Ratio", x = "Student-Faculty Ratio", y = "Sentiment") +
theme_minimal()
print(plot_STUFACR3)
## `geom_smooth()` using formula = 'y ~ x'
plot_cost_of_living3 <- ggplot(aggdata3, aes(x = cost_of_living, y = sentiment)) +
geom_point() +
geom_smooth(method = "lm", color = "purple") +
labs(title = "Sentiment vs Cost of Living", x = "Cost of Living", y = "Sentiment") +
theme_minimal()
print(plot_cost_of_living3)
## `geom_smooth()` using formula = 'y ~ x'
plot_UGDS3 <- ggplot(aggdata3, aes(x = UGDS, y = sentiment)) +
geom_point() +
geom_smooth(method = "lm", color = "purple") +
labs(title = "Sentiment vs Institution Size", x = "Institution Size", y = "Sentiment") +
theme_minimal()
print(plot_UGDS3)
## `geom_smooth()` using formula = 'y ~ x'
plot_t_count3 <- ggplot(aggdata3, aes(x = t_count, y = sentiment)) +
geom_point() +
geom_smooth(method = "lm", color = "purple") +
labs(title = "Sentiment vs Academic Stress Word Count", x = "t_count", y = "sentiment") +
theme_minimal()
print(plot_t_count3)
## `geom_smooth()` using formula = 'y ~ x'