# List of packages
packages <- c("tidyverse", "dplyr", "modelsummary", "ggplot2", "ggcorrplot", "DescTools", "car")

# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
##   backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
## 
## Revert to `kableExtra` for one session:
## 
##   options(modelsummary_factory_default = 'kableExtra')
##   options(modelsummary_factory_latex = 'kableExtra')
##   options(modelsummary_factory_html = 'kableExtra')
## 
## Silence this message forever:
## 
##   config_modelsummary(startup_message = FALSE)
## 
## 
## Attaching package: 'DescTools'
## 
## 
## The following objects are masked from 'package:modelsummary':
## 
##     Format, Mean, Median, N, SD, Var
## 
## 
## Loading required package: carData
## 
## 
## Attaching package: 'car'
## 
## 
## The following object is masked from 'package:DescTools':
## 
##     Recode
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## 
## The following object is masked from 'package:purrr':
## 
##     some
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
##  [6] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [11] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [16] "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
##  [6] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [11] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [16] "datasets"     "methods"      "base"        
## 
## [[5]]
##  [1] "ggcorrplot"   "modelsummary" "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[6]]
##  [1] "DescTools"    "ggcorrplot"   "modelsummary" "lubridate"    "forcats"     
##  [6] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [11] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [16] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[7]]
##  [1] "car"          "carData"      "DescTools"    "ggcorrplot"   "modelsummary"
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"
library(dplyr)  # For data manipulation
library(ggplot2)  # For visualization
library(ggcorrplot) # For correlation matrix visualization 
library(DescTools) # Descriptive stats tools used for binning
library(car) # for multicollinearity check

data2 <- read.csv("data2.csv")
 
#Data Preparation
str(data2)
## 'data.frame':    59 obs. of  6 variables:
##  $ uni           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ sentiment     : num  0.1184 0.0131 0.0605 0.0786 -0.0048 ...
##  $ t_count       : num  1.23 3.8 3.28 8.33 4.67 2.88 1.46 2.76 1.05 4.17 ...
##  $ cost_of_living: int  108 166 125 111 171 139 170 157 132 95 ...
##  $ STUFACR       : int  20 11 29 6 20 25 24 22 24 5 ...
##  $ UGDS          : int  25234 17668 24521 7222 20976 25080 35249 32780 28544 7005 ...
summary(data2$cost_of_living)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    88.0   102.5   116.0   136.9   168.0   325.0
correlation_matrix <- data2 %>%
  select(sentiment, UGDS, STUFACR, cost_of_living) %>%
  cor()
print(correlation_matrix)
##                 sentiment       UGDS     STUFACR cost_of_living
## sentiment       1.0000000 -0.1613191 -0.22356741    -0.36975696
## UGDS           -0.1613191  1.0000000  0.75329954    -0.14628950
## STUFACR        -0.2235674  0.7532995  1.00000000    -0.06400615
## cost_of_living -0.3697570 -0.1462895 -0.06400615     1.00000000
ggcorrplot(correlation_matrix, method = "square")

INTERPRETATION: Average sentiment with UGDS (-0.161). Larger institutions are weakly negatively correlated with lower sentiment. Average sentiment with STUFACR (-0.224). Higher student-to-faculty ratios are weakly negatively correlated with lower sentiment scores. Average sentiment with cost_of_living (-0.370). Universities in areas with higher cost of living are moderately negatively correlated with lower sentiment scores. Larger institutions (UGDS) are strongly positively associated with higher student faculty ratio (0.753). This suggests potential multicolliearity, which will be checked when running the regression model.

An ANOVA cannot be run due to the loss of longdata. See limitations for details.

# Density plots
ggplot(data2, aes(x = sentiment)) +
  geom_density(color = "blue", fill = "blue", alpha = 0.5) +
  labs(
    title = "Density Plot of Average Sentiment",
    x = "sentiment",
    y = "Density"
  ) +
  theme_minimal() 

density plot: most average sentiment samples are neutral/slightly positive.

#regression model to predict sentiment based on institutional characteristics
model_main2 <- lm(sentiment ~ UGDS + STUFACR + cost_of_living, data = data2)
summary(model_main2)
## 
## Call:
## lm(formula = sentiment ~ UGDS + STUFACR + cost_of_living, data = data2)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.042509 -0.012011 -0.003407  0.009630  0.085112 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     7.080e-02  1.156e-02   6.127    1e-07 ***
## UGDS           -1.445e-07  3.688e-07  -0.392  0.69667    
## STUFACR        -6.377e-04  6.043e-04  -1.055  0.29590    
## cost_of_living -1.955e-04  6.081e-05  -3.214  0.00219 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02065 on 55 degrees of freedom
## Multiple R-squared:  0.2003, Adjusted R-squared:  0.1567 
## F-statistic: 4.593 on 3 and 55 DF,  p-value: 0.006119
# VIF
vif_values <- vif(model_main2)
print(vif_values)
##           UGDS        STUFACR cost_of_living 
##       2.364727       2.323640       1.027046

INTERPRETATION: Cost of living is significant (p<0.01) where the higher the cost of living, the lower the average sentiment. UGDS and STUFACR are not significant.

Model fit: Multiple R-squared: 0.2003, Adjusted R-squared: 0.1567 Model is statistically significant, but only 20% of the variance (adj. 16%) is explained by the model.

Check for multicollinearity (VIF): Correlation matrix suggested some strong correlations between institutional characteristics, but VIF values are all < 5. UGDS and STUFACR aren’t significant anyway.

# Visualize Relationships
plot_STUFACR <- ggplot(data2, aes(x = STUFACR, y = sentiment)) +
  geom_point() +
  geom_smooth(method = "lm", color = "green") +
  labs(title = "Sentiment vs Student-Faculty Ratio", x = "Student-Faculty Ratio", y = "Sentiment") +
  theme_minimal()
print(plot_STUFACR)
## `geom_smooth()` using formula = 'y ~ x'

plot_cost_of_living <- ggplot(data2, aes(x = cost_of_living, y = sentiment)) +
  geom_point() +
  geom_smooth(method = "lm", color = "green") +
  labs(title = "Sentiment vs Cost of Living", x = "Cost of Living", y = "Sentiment") +
  theme_minimal()

print(plot_cost_of_living)
## `geom_smooth()` using formula = 'y ~ x'

plot_UGDS <- ggplot(data2, aes(x = UGDS, y = sentiment)) +
  geom_point() +
  geom_smooth(method = "lm", color = "green") +
  labs(title = "Sentiment vs Institution Size", x = "Institution Size", y = "Sentiment") +
  theme_minimal()

print(plot_UGDS)
## `geom_smooth()` using formula = 'y ~ x'

# Diagnostic Plots 
par(mfrow = c(2,2))
plot(model_main2)

detected outliers, cleaning data for a second model

cooksD <- cooks.distance(model_main2)
influential_points <- which(cooksD > (4 / nrow(data2)))
print(influential_points)
##  1  3 18 
##  1  3 18

points 1, 3, 18 heavily influence the model, i’ll try removing them

# Refit model without the outliers
cleaned_data2 <- data2[-c( 1, 3, 18), ]
modelcleantest <- lm(sentiment ~ UGDS + STUFACR + cost_of_living + t_count, data = cleaned_data2)
summary(modelcleantest)
## 
## Call:
## lm(formula = sentiment ~ UGDS + STUFACR + cost_of_living + t_count, 
##     data = cleaned_data2)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.041051 -0.009805 -0.001833  0.008956  0.032636 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     6.500e-02  9.162e-03   7.095 3.85e-09 ***
## UGDS            2.079e-07  2.838e-07   0.733 0.467172    
## STUFACR        -1.263e-03  4.786e-04  -2.639 0.011008 *  
## cost_of_living -1.821e-04  4.620e-05  -3.941 0.000247 ***
## t_count         8.700e-04  1.256e-03   0.693 0.491490    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01506 on 51 degrees of freedom
## Multiple R-squared:  0.3618, Adjusted R-squared:  0.3117 
## F-statistic: 7.228 on 4 and 51 DF,  p-value: 0.0001087
# VIF
vif_values <- vif(model_main2)
print(vif_values)
##           UGDS        STUFACR cost_of_living 
##       2.364727       2.323640       1.027046

INTERPRETATION: After removing three outlier data points (uni 1, 3, 18): Cost of living is highly significant (p<0.001) where the higher the cost of living is associated with lower average sentiment. STUFACR becomes significant (p<0.05), where higher student-faculty ratios is associated with lower average sentiment.

Cleaned model: Multiple R-squared: 0.3558, Adjusted R-squared: 0.3186 Old model: Multiple R-squared: 0.2003, Adjusted R-squared: 0.1567

Model fit increased around 15% after removing three observations with drastic influence and STUFACR is significant. The fact that removing only 3 observations out of 59 improves adjusted by 15% means these points are disproportionately influencing the model. These points are flagged as Cook’s Distance > 4/n. This cleaned model is more generalizable.

Check for multicollinearity (VIF): Correlation matrix suggested some strong correlations between institutional characteristics, but VIF values are all < 5.