# List of packages
packages <- c("tidyverse", "dplyr", "modelsummary", "readxl", "lubridate", "ggplot2", "lme4", "ggcorrplot", "DescTools", "car")

# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
##   backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
## 
## Revert to `kableExtra` for one session:
## 
##   options(modelsummary_factory_default = 'kableExtra')
##   options(modelsummary_factory_latex = 'kableExtra')
##   options(modelsummary_factory_html = 'kableExtra')
## 
## Silence this message forever:
## 
##   config_modelsummary(startup_message = FALSE)
## 
## Loading required package: Matrix
## 
## 
## Attaching package: 'Matrix'
## 
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## 
## 
## Attaching package: 'DescTools'
## 
## 
## The following objects are masked from 'package:modelsummary':
## 
##     Format, Mean, Median, N, SD, Var
## 
## 
## Loading required package: carData
## 
## 
## Attaching package: 'car'
## 
## 
## The following object is masked from 'package:DescTools':
## 
##     Recode
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## 
## The following object is masked from 'package:purrr':
## 
##     some
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
##  [6] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [11] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [16] "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "readxl"       "modelsummary" "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[5]]
##  [1] "readxl"       "modelsummary" "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[6]]
##  [1] "readxl"       "modelsummary" "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[7]]
##  [1] "lme4"         "Matrix"       "readxl"       "modelsummary" "lubridate"   
##  [6] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [11] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [16] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [21] "base"        
## 
## [[8]]
##  [1] "ggcorrplot"   "lme4"         "Matrix"       "readxl"       "modelsummary"
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[9]]
##  [1] "DescTools"    "ggcorrplot"   "lme4"         "Matrix"       "readxl"      
##  [6] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
## [11] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [16] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [21] "datasets"     "methods"      "base"        
## 
## [[10]]
##  [1] "car"          "carData"      "DescTools"    "ggcorrplot"   "lme4"        
##  [6] "Matrix"       "readxl"       "modelsummary" "lubridate"    "forcats"     
## [11] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [16] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [21] "grDevices"    "utils"        "datasets"     "methods"      "base"
library(dplyr)  # For data manipulation
library(ggplot2)  # For visualization
library(ggcorrplot) # For correlation matrix visualization 
library(DescTools) # Descriptive stats tools used for binning
library(car) # for multicollinearity check

data <- read.csv("unistats1.csv")
longdata <- read.csv("dataforanalysis1.csv")

#Data Preparation
str(data)
## 'data.frame':    25 obs. of  7 variables:
##  $ uni          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ avg_sentiment: num  -0.007797 0.005153 0.000978 0.018655 0.04324 ...
##  $ size         : int  56643 45445 26793 39049 24596 39920 61890 66901 87995 58009 ...
##  $ sfratio      : num  19.8 34.2 10.5 17.2 10 11.3 7.4 10.5 14.6 18.7 ...
##  $ livingcost   : int  112 89 113 159 201 89 169 94 121 105 ...
##  $ ranking      : int  200 538 16 600 3 49 36 135 105 89 ...
##  $ t_count      : num  10.6 1.03 16.27 2.99 3.96 ...
summary(data$avg_sentiment)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -0.007797  0.018365  0.027047  0.031049  0.043506  0.126037
range(data$avg_sentiment)
## [1] -0.007796728  0.126037500
correlation_matrix <- data %>%
  select(avg_sentiment, size, sfratio, livingcost, ranking, t_count) %>%
  cor()
print(correlation_matrix)
##               avg_sentiment        size     sfratio livingcost     ranking
## avg_sentiment    1.00000000  0.22428669 -0.05716477  0.2512813 -0.23243767
## size             0.22428669  1.00000000  0.30093259 -0.4739626  0.08710757
## sfratio         -0.05716477  0.30093259  1.00000000 -0.4782641  0.68548897
## livingcost       0.25128135 -0.47396257 -0.47826410  1.0000000 -0.28258098
## ranking         -0.23243767  0.08710757  0.68548897 -0.2825810  1.00000000
## t_count         -0.47608596 -0.15655121 -0.17677272 -0.2352761 -0.32118164
##                  t_count
## avg_sentiment -0.4760860
## size          -0.1565512
## sfratio       -0.1767727
## livingcost    -0.2352761
## ranking       -0.3211816
## t_count        1.0000000
ggcorrplot(correlation_matrix, method = "square")

INTERPRETATION: Average sentiment: Positively correlated with size (0.224) and livingcost (0.251): Larger institutions and higher living costs are slightly associated with higher average sentiment. Negatively correlated with ranking (-0.232) and t_count (-0.476): Higher-ranked institutions and those with higher academic stress words (t_count) tend to have lower average sentiment.

anova_longdata <- aov(sentiment ~ as.factor(uni), data = longdata)
summary(anova_longdata)
##                   Df Sum Sq Mean Sq F value   Pr(>F)    
## as.factor(uni)    24    5.9 0.24777   4.147 4.04e-11 ***
## Residuals      13756  821.9 0.05975                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova_eta <- summary(anova_longdata)
ss_between <- anova_eta[[1]]$`Sum Sq`[1] # Between-group sum of squares
ss_total <- sum(anova_eta[[1]]$`Sum Sq`) # Total sum of squares
eta_squared <- ss_between / ss_total

# Print eta-squared
cat("Eta-squared:", eta_squared, "\n")
## Eta-squared: 0.00718316
anova_data <- aov(sentiment ~ as.factor(uni), data = longdata)
blueberry <- summary(anova_data)
print(blueberry)
##                   Df Sum Sq Mean Sq F value   Pr(>F)    
## as.factor(uni)    24    5.9 0.24777   4.147 4.04e-11 ***
## Residuals      13756  821.9 0.05975                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova_eta2 <- summary(anova_data)
ss_between2 <- anova_eta2[[1]]$`Sum Sq`[1] # Between-group sum of squares
ss_total2 <- sum(anova_eta2[[1]]$`Sum Sq`) # Total sum of squares
eta_squared2 <- ss_between2 / ss_total2

# Print eta-squared
cat("Eta-squared:", eta_squared, "\n")
## Eta-squared: 0.00718316

F value 4.147*** Sentiment differs significantly across universities (p<0.001) eta-squared: 0.00718316 (very small effect) This is compelling evidence that there are meaningful differences between university groups. This will be examined with institutional characteristics.

# Density plots
ggplot(data, aes(x = avg_sentiment)) +
  geom_density(color = "blue", fill = "blue", alpha = 0.5) +
  labs(
    title = "Density Plot of avg_sentiment",
    x = "avg_sentiment",
    y = "Density"
  ) +
  theme_minimal() 

density_plot2 <- ggplot(longdata, aes(x = sentiment)) +
  geom_density(fill = "green", alpha = 0.5) +
  labs(title = "Density Plot of Dependent Variable", x = "Dependent Variable", y = "Density") +
  theme_minimal()

print(density_plot2)

longdata density plot: most sentiment samples are scored by VADER as neutral with many values hovering around 0.0. There seems to be slightly more positively scored samples than negatively scored samples across all sentiment observations. aggdata density plot: most average sentiment samples are neutral/slightly positive.

#regression model to predict avg_sentiment based on institutional characteristics
model_main <- lm(avg_sentiment ~ size + sfratio + livingcost + ranking + t_count, data = data)
summary(model_main)
## 
## Call:
## lm(formula = avg_sentiment ~ size + sfratio + livingcost + ranking + 
##     t_count, data = data)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.027096 -0.012114 -0.003291  0.004386  0.072253 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  1.490e-02  4.130e-02   0.361   0.7222  
## size         3.738e-07  3.827e-07   0.977   0.3410  
## sfratio      8.605e-04  9.092e-04   0.946   0.3558  
## livingcost   1.073e-04  1.340e-04   0.800   0.4334  
## ranking     -8.655e-05  4.071e-05  -2.126   0.0468 *
## t_count     -2.908e-03  1.138e-03  -2.554   0.0194 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02344 on 19 degrees of freedom
## Multiple R-squared:  0.4549, Adjusted R-squared:  0.3115 
## F-statistic: 3.172 on 5 and 19 DF,  p-value: 0.03007
vif_values <- vif(model_main)
print(vif_values)
##       size    sfratio livingcost    ranking    t_count 
##   1.533861   2.336337   1.979676   2.196959   1.485234

INTERPRETATION: Ranking is significant (p<0.05) where the higher the ranking (aka the lower ranked the university) the lower the sentiment. t-count is significant (p<0.05) where higher counts of academic stress words were associated with lower sentiment.

Model fit: R-squared: 0.4549, adjusted R-squared: 0.3115 Moderately good fit, lower adjusted R-squared due to the nonsignificance of some predictors. The model is statistically significant at p<0.05.

Check for multicollinearity (VIF): Correlation matrix suggested some strong correlations between institutional characteristics, but VIF values are all < 5.

# Visualize Relationships
plot_sfratio <- ggplot(data, aes(x = sfratio, y = avg_sentiment)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue") +
  labs(title = "Sentiment vs Student-Faculty Ratio", x = "Student-Faculty Ratio", y = "Sentiment") +
  theme_minimal()

print(plot_sfratio)
## `geom_smooth()` using formula = 'y ~ x'

plot_tcount <- ggplot(data, aes(x = t_count, y = avg_sentiment)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue") +
  labs(title = "Sentiment vs Academic Stress Word Count", x = "ASWC", y = "Sentiment") +
  theme_minimal()

print(plot_tcount)
## `geom_smooth()` using formula = 'y ~ x'

plot_livingcost <- ggplot(data, aes(x = livingcost, y = avg_sentiment)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue") +
  labs(title = "Sentiment vs Living Cost", x = "Living Cost", y = "Sentiment") +
  theme_minimal()

print(plot_livingcost)
## `geom_smooth()` using formula = 'y ~ x'

plot_ranking <- ggplot(data, aes(x = ranking, y = avg_sentiment)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue") +
  labs(title = "Sentiment vs Ranking", x = "Ranking", y = "Sentiment") +
  theme_minimal()

print(plot_ranking)
## `geom_smooth()` using formula = 'y ~ x'

plot_size <- ggplot(data, aes(x = size, y = avg_sentiment)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue") +
  labs(title = "Sentiment vs Size", x = "Size", y = "Sentiment") +
  theme_minimal()

print(plot_size)
## `geom_smooth()` using formula = 'y ~ x'

Visualizations of each relationship. Grey area is standard error.

# Diagnostic Plots 
par(mfrow = c(2, 2))
plot(model_main)

No major issues. Uni 13 and 23 seem to be outliers, but nothing has a Cooks distance > 4/n.