# List of packages
packages <- c("tidyverse", "dplyr", "modelsummary", "ggplot2", "ggcorrplot", "DescTools", "car")

# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
##   backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
## 
## Revert to `kableExtra` for one session:
## 
##   options(modelsummary_factory_default = 'kableExtra')
##   options(modelsummary_factory_latex = 'kableExtra')
##   options(modelsummary_factory_html = 'kableExtra')
## 
## Silence this message forever:
## 
##   config_modelsummary(startup_message = FALSE)
## 
## 
## Attaching package: 'DescTools'
## 
## 
## The following objects are masked from 'package:modelsummary':
## 
##     Format, Mean, Median, N, SD, Var
## 
## 
## Loading required package: carData
## 
## 
## Attaching package: 'car'
## 
## 
## The following object is masked from 'package:DescTools':
## 
##     Recode
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## 
## The following object is masked from 'package:purrr':
## 
##     some
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
##  [6] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [11] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [16] "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
##  [6] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [11] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [16] "datasets"     "methods"      "base"        
## 
## [[5]]
##  [1] "ggcorrplot"   "modelsummary" "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[6]]
##  [1] "DescTools"    "ggcorrplot"   "modelsummary" "lubridate"    "forcats"     
##  [6] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [11] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [16] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[7]]
##  [1] "car"          "carData"      "DescTools"    "ggcorrplot"   "modelsummary"
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"
library(dplyr)  # For data manipulation
library(ggplot2)  # For visualization
library(ggcorrplot) # For correlation matrix visualization 
library(DescTools) # Descriptive stats tools used for binning
library(car) # for multicollinearity check

aggdata3 <- read.csv("aggdata3.csv")
longdata3 <- read.csv("longdata3.csv")
 
#Data Preparation
str(aggdata3)
## 'data.frame':    209 obs. of  9 variables:
##  $ uni                        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ sentiment                  : num  0.0278 0.0335 0.1259 0.0344 0.053 ...
##  $ t_count                    : num  0.0154 0.0343 0.0337 0 0.0602 0.0208 0.0208 0.0467 0 0.0428 ...
##  $ cost_of_living             : int  111 112 108 84 169 92 166 85 166 166 ...
##  $ STUFACR                    : int  16 19 20 15 19 15 10 19 11 11 ...
##  $ UGDS                       : int  18467 64778 25234 13466 15554 15134 7395 14398 17668 17668 ...
##  $ first_date                 : chr  "2024-11-04 16:27:03+00:00" "2024-11-04 05:02:23+00:00" "2024-11-04 07:42:30+00:00" "2024-11-04 18:37:46+00:00" ...
##  $ last_date                  : chr  "2024-12-03 19:04:34+00:00" "2024-12-03 02:36:19+00:00" "2024-12-04 00:34:33+00:00" "2024-12-03 08:08:59+00:00" ...
##  $ cost.of.living.manual.input: chr  "" "" "" "" ...
str(longdata3)
## 'data.frame':    36191 obs. of  8 variables:
##  $ uni           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ psample       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ date          : chr  "2024-12-03T19:04:34+00:00" "2024-12-03T00:27:12+00:00" "2024-12-02T20:34:41+00:00" "2024-12-02T04:38:15+00:00" ...
##  $ sentiment     : num  0 0 -0.71 0 0 ...
##  $ t_count       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ cost_of_living: int  111 111 111 111 111 111 111 111 111 111 ...
##  $ STUFACR       : int  16 16 16 16 16 16 16 16 16 16 ...
##  $ UGDS          : int  18467 18467 18467 18467 18467 18467 18467 18467 18467 18467 ...
cormatrix <- aggdata3 %>%
  select(sentiment, UGDS, STUFACR, cost_of_living, t_count) %>%
  cor()
print(cormatrix)
##                    sentiment         UGDS     STUFACR cost_of_living
## sentiment       1.0000000000 -0.074795032 -0.11022246  -0.0002232583
## UGDS           -0.0747950323  1.000000000  0.52984434  -0.0012899660
## STUFACR        -0.1102224615  0.529844342  1.00000000  -0.0468878649
## cost_of_living -0.0002232583 -0.001289966 -0.04688786   1.0000000000
## t_count        -0.1904260718  0.152438833  0.14644232  -0.0637577942
##                    t_count
## sentiment      -0.19042607
## UGDS            0.15243883
## STUFACR         0.14644232
## cost_of_living -0.06375779
## t_count         1.00000000
ggcorrplot(cormatrix, method = "square")

INTERPRETATION: Average Sentiment Correlation with UGDS: -0.0748 Weak negative correlation. Correlation with STUFACR: -0.1102 Weak negative correlation. Correlation with t_count: -0.1904 Weak negative correlation. Correlation with cost_of_living: NA

UGDS correlation with STUFACR: 0.5298 Moderate positive correlation. This makes sense; larger numbers of students tend to be associated with higher student-faculty ratios.

anova_longdata3 <- aov(sentiment ~ as.factor(uni), data = longdata3)
summary(anova_longdata3)
##                   Df Sum Sq Mean Sq F value   Pr(>F)    
## as.factor(uni)   208   21.7 0.10420   1.985 1.61e-15 ***
## Residuals      35982 1889.3 0.05251                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova_eta3 <- summary(anova_longdata3)
ss_between3 <- anova_eta3[[1]]$`Sum Sq`[1] # Between-group sum of squares
ss_total3 <- sum(anova_eta3[[1]]$`Sum Sq`) # Total sum of squares
eta_squared3 <- ss_between3 / ss_total3

cat("Eta-squared:", eta_squared3, "\n")
## Eta-squared: 0.01134182

F value: 1.985*** There are statistically significant differences in sentiment between universities (p<0.001) The effect size (eta-squared value of 0.0113) is small. There are meaningful differences between university groups. This will be examined with institutional characteristics.

# Density plots
ggplot(aggdata3, aes(x = sentiment)) +
  geom_density(color = "red", fill = "red", alpha = 0.5) +
  labs(
    title = "Density Plot of Average Sentiment",
    x = "Average Sentiment",
    y = "Density"
  ) +
  theme_minimal() 

ggplot(longdata3, aes(x = sentiment)) +
  geom_density(color = "red", fill = "red", alpha = 0.5) +
  labs(
    title = "Density Plot of Sentiment",
    x = "Sentiment",
    y = "Density"
  ) +
  theme_minimal()

longdata density plot: Most sentiment samples are scored by VADER as neutral with many values hovering around 0.0. There seems to be slightly more positively scored samples than negatively scored samples across all sentiment observations. aggdata density plot: most average sentiment samples are neutral/slightly positive.

#regression model to predict sentiment based on institutional characteristics
model_main3 <- lm(sentiment ~ UGDS + STUFACR + cost_of_living + t_count, data = aggdata3)
summary(model_main3)
## 
## Call:
## lm(formula = sentiment ~ UGDS + STUFACR + cost_of_living + t_count, 
##     data = aggdata3)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.254079 -0.020922 -0.003855  0.020100  0.117290 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     5.868e-02  1.364e-02   4.301 2.64e-05 ***
## UGDS           -1.174e-08  2.583e-07  -0.045    0.964    
## STUFACR        -5.907e-04  5.784e-04  -1.021    0.308    
## cost_of_living -1.803e-05  7.990e-05  -0.226    0.822    
## t_count        -2.829e-01  1.102e-01  -2.567    0.011 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0424 on 204 degrees of freedom
## Multiple R-squared:  0.04344,    Adjusted R-squared:  0.02469 
## F-statistic: 2.316 on 4 and 204 DF,  p-value: 0.05854
# VIF
vif_values <- vif(model_main3)
print(vif_values)
##           UGDS        STUFACR cost_of_living        t_count 
##       1.403002       1.402360       1.006641       1.033876
# Diagnostic Plots 
par(mfrow = c(1, 1))
plot(model_main3)

VIF has no multicollinearity. investigating outliers 48,77,99,108.

# Refit model without any extreme outliers
aggdataminus4 <- aggdata3[-c(48, 77, 99, 108), ]
modelminus4 <- lm(sentiment ~ UGDS + STUFACR + cost_of_living + t_count, data = aggdataminus4)
summary(modelminus4)
## 
## Call:
## lm(formula = sentiment ~ UGDS + STUFACR + cost_of_living + t_count, 
##     data = aggdataminus4)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.113826 -0.020712 -0.005104  0.018684  0.103043 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     5.091e-02  1.185e-02   4.295 2.72e-05 ***
## UGDS           -1.861e-07  2.166e-07  -0.859  0.39123    
## STUFACR        -1.907e-04  4.963e-04  -0.384  0.70126    
## cost_of_living  2.236e-05  7.230e-05   0.309  0.75748    
## t_count        -2.510e-01  9.328e-02  -2.691  0.00773 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03538 on 200 degrees of freedom
## Multiple R-squared:  0.04938,    Adjusted R-squared:  0.03037 
## F-statistic: 2.597 on 4 and 200 DF,  p-value: 0.03752

now checking for cooks distance.

cooksD3 <- cooks.distance(model_main3)
influential_points3 <- which(cooksD3 > 4 / length(cooksD3)) # Rule of thumb
influential_points3
##  15  48  77  99 107 108 118 148 155 203 
##  15  48  77  99 107 108 118 148 155 203
# Refit model without any outliers where cooksd>4/n
cleaned_data3 <- aggdata3[-c(15, 48, 77, 99, 107, 108, 118, 148, 155, 203), ]
model_cleaned3 <- lm(sentiment ~ UGDS + STUFACR + cost_of_living + t_count, data = cleaned_data3)
summary(model_cleaned3)
## 
## Call:
## lm(formula = sentiment ~ UGDS + STUFACR + cost_of_living + t_count, 
##     data = cleaned_data3)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.083104 -0.019638 -0.004246  0.018451  0.106909 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     6.123e-02  1.119e-02   5.470 1.37e-07 ***
## UGDS           -1.893e-07  2.191e-07  -0.864  0.38870    
## STUFACR        -1.564e-04  4.607e-04  -0.339  0.73466    
## cost_of_living -7.035e-05  6.919e-05  -1.017  0.31047    
## t_count        -2.789e-01  8.827e-02  -3.160  0.00183 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03241 on 194 degrees of freedom
## Multiple R-squared:  0.06451,    Adjusted R-squared:  0.04522 
## F-statistic: 3.344 on 4 and 194 DF,  p-value: 0.01127
# VIF
vif_valuesabc <- vif(model_cleaned3)
print(vif_valuesabc)
##           UGDS        STUFACR cost_of_living        t_count 
##       1.268545       1.266870       1.002874       1.018437
# Diagnostic Plots 
par(mfrow = c(1, 1))
plot(model_cleaned3)

I will use and interpret this model because it has the highest predictive accuracy, improvement in model fit and continued robustness of t_count and nonsignificance of the other variables.

INTERPRETATION: t_count is significant (p<0.01) where the greater the t_count in a given subreddit, the lower the average sentiment. UGDS, STUFACR, and cost of living are not significant.

Model fit: Multiple R-squared: 0.06451, Adjusted R-squared: 0.04522 Model is statistically significant (p<0.05), but only 4-6%% of the variance is explained by the model.

No multicollinearity, VIF values < 5. Diagnostic plots look good.

# Visualize Relationships
plot_STUFACR3 <- ggplot(aggdata3, aes(x = STUFACR, y = sentiment)) +
  geom_point() +
  geom_smooth(method = "lm", color = "purple") +
  labs(title = "Sentiment vs Student-Faculty Ratio", x = "Student-Faculty Ratio", y = "Sentiment") +
  theme_minimal()
print(plot_STUFACR3)
## `geom_smooth()` using formula = 'y ~ x'

plot_cost_of_living3 <- ggplot(aggdata3, aes(x = cost_of_living, y = sentiment)) +
  geom_point() +
  geom_smooth(method = "lm", color = "purple") +
  labs(title = "Sentiment vs Cost of Living", x = "Cost of Living", y = "Sentiment") +
  theme_minimal()

print(plot_cost_of_living3)
## `geom_smooth()` using formula = 'y ~ x'

plot_UGDS3 <- ggplot(aggdata3, aes(x = UGDS, y = sentiment)) +
  geom_point() +
  geom_smooth(method = "lm", color = "purple") +
  labs(title = "Sentiment vs Institution Size", x = "Institution Size", y = "Sentiment") +
  theme_minimal()

print(plot_UGDS3)
## `geom_smooth()` using formula = 'y ~ x'

plot_t_count3 <- ggplot(aggdata3, aes(x = t_count, y = sentiment)) +
  geom_point() +
  geom_smooth(method = "lm", color = "purple") +
  labs(title = "Sentiment vs Academic Stress Word Count", x = "t_count", y = "sentiment") +
  theme_minimal()

print(plot_t_count3)
## `geom_smooth()` using formula = 'y ~ x'