# List of packages
packages <- c("tidyverse", "dplyr", "modelsummary", "readxl", "lubridate", "ggplot2", "lme4", "ggcorrplot", "DescTools", "car")
# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
## backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
##
## Revert to `kableExtra` for one session:
##
## options(modelsummary_factory_default = 'kableExtra')
## options(modelsummary_factory_latex = 'kableExtra')
## options(modelsummary_factory_html = 'kableExtra')
##
## Silence this message forever:
##
## config_modelsummary(startup_message = FALSE)
##
## Loading required package: Matrix
##
##
## Attaching package: 'Matrix'
##
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
##
##
## Attaching package: 'DescTools'
##
##
## The following objects are masked from 'package:modelsummary':
##
## Format, Mean, Median, N, SD, Var
##
##
## Loading required package: carData
##
##
## Attaching package: 'car'
##
##
## The following object is masked from 'package:DescTools':
##
## Recode
##
##
## The following object is masked from 'package:dplyr':
##
## recode
##
##
## The following object is masked from 'package:purrr':
##
## some
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "modelsummary" "lubridate" "forcats" "stringr" "dplyr"
## [6] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [11] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [16] "datasets" "methods" "base"
##
## [[4]]
## [1] "readxl" "modelsummary" "lubridate" "forcats" "stringr"
## [6] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [11] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [16] "utils" "datasets" "methods" "base"
##
## [[5]]
## [1] "readxl" "modelsummary" "lubridate" "forcats" "stringr"
## [6] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [11] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [16] "utils" "datasets" "methods" "base"
##
## [[6]]
## [1] "readxl" "modelsummary" "lubridate" "forcats" "stringr"
## [6] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [11] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [16] "utils" "datasets" "methods" "base"
##
## [[7]]
## [1] "lme4" "Matrix" "readxl" "modelsummary" "lubridate"
## [6] "forcats" "stringr" "dplyr" "purrr" "readr"
## [11] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [16] "graphics" "grDevices" "utils" "datasets" "methods"
## [21] "base"
##
## [[8]]
## [1] "ggcorrplot" "lme4" "Matrix" "readxl" "modelsummary"
## [6] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [11] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [16] "stats" "graphics" "grDevices" "utils" "datasets"
## [21] "methods" "base"
##
## [[9]]
## [1] "DescTools" "ggcorrplot" "lme4" "Matrix" "readxl"
## [6] "modelsummary" "lubridate" "forcats" "stringr" "dplyr"
## [11] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [16] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [21] "datasets" "methods" "base"
##
## [[10]]
## [1] "car" "carData" "DescTools" "ggcorrplot" "lme4"
## [6] "Matrix" "readxl" "modelsummary" "lubridate" "forcats"
## [11] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [16] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [21] "grDevices" "utils" "datasets" "methods" "base"
library(dplyr) # For data manipulation
library(ggplot2) # For visualization
library(ggcorrplot) # For correlation matrix visualization
library(DescTools) # Descriptive stats tools used for binning
library(car) # for multicollinearity check
data <- read.csv("unistats1.csv")
longdata <- read.csv("dataforanalysis1.csv")
#Data Preparation
str(data)
## 'data.frame': 25 obs. of 7 variables:
## $ uni : int 1 2 3 4 5 6 7 8 9 10 ...
## $ avg_sentiment: num -0.007797 0.005153 0.000978 0.018655 0.04324 ...
## $ size : int 56643 45445 26793 39049 24596 39920 61890 66901 87995 58009 ...
## $ sfratio : num 19.8 34.2 10.5 17.2 10 11.3 7.4 10.5 14.6 18.7 ...
## $ livingcost : int 112 89 113 159 201 89 169 94 121 105 ...
## $ ranking : int 200 538 16 600 3 49 36 135 105 89 ...
## $ t_count : num 10.6 1.03 16.27 2.99 3.96 ...
summary(data$avg_sentiment)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.007797 0.018365 0.027047 0.031049 0.043506 0.126037
range(data$avg_sentiment)
## [1] -0.007796728 0.126037500
correlation_matrix <- data %>%
select(avg_sentiment, size, sfratio, livingcost, ranking, t_count) %>%
cor()
print(correlation_matrix)
## avg_sentiment size sfratio livingcost ranking
## avg_sentiment 1.00000000 0.22428669 -0.05716477 0.2512813 -0.23243767
## size 0.22428669 1.00000000 0.30093259 -0.4739626 0.08710757
## sfratio -0.05716477 0.30093259 1.00000000 -0.4782641 0.68548897
## livingcost 0.25128135 -0.47396257 -0.47826410 1.0000000 -0.28258098
## ranking -0.23243767 0.08710757 0.68548897 -0.2825810 1.00000000
## t_count -0.47608596 -0.15655121 -0.17677272 -0.2352761 -0.32118164
## t_count
## avg_sentiment -0.4760860
## size -0.1565512
## sfratio -0.1767727
## livingcost -0.2352761
## ranking -0.3211816
## t_count 1.0000000
ggcorrplot(correlation_matrix, method = "square")
INTERPRETATION: Average sentiment: Positively correlated with size (0.224) and livingcost (0.251): Larger institutions and higher living costs are slightly associated with higher average sentiment. Negatively correlated with ranking (-0.232) and t_count (-0.476): Higher-ranked institutions and those with higher academic stress words (t_count) tend to have lower average sentiment.
anova_longdata <- aov(sentiment ~ as.factor(uni), data = longdata)
summary(anova_longdata)
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(uni) 24 5.9 0.24777 4.147 4.04e-11 ***
## Residuals 13756 821.9 0.05975
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova_eta <- summary(anova_longdata)
ss_between <- anova_eta[[1]]$`Sum Sq`[1] # Between-group sum of squares
ss_total <- sum(anova_eta[[1]]$`Sum Sq`) # Total sum of squares
eta_squared <- ss_between / ss_total
# Print eta-squared
cat("Eta-squared:", eta_squared, "\n")
## Eta-squared: 0.00718316
anova_data <- aov(sentiment ~ as.factor(uni), data = longdata)
blueberry <- summary(anova_data)
print(blueberry)
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(uni) 24 5.9 0.24777 4.147 4.04e-11 ***
## Residuals 13756 821.9 0.05975
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova_eta2 <- summary(anova_data)
ss_between2 <- anova_eta2[[1]]$`Sum Sq`[1] # Between-group sum of squares
ss_total2 <- sum(anova_eta2[[1]]$`Sum Sq`) # Total sum of squares
eta_squared2 <- ss_between2 / ss_total2
# Print eta-squared
cat("Eta-squared:", eta_squared, "\n")
## Eta-squared: 0.00718316
F value 4.147*** Sentiment differs significantly across universities (p<0.001) eta-squared: 0.00718316 (very small effect) This is compelling evidence that there are meaningful differences between university groups. This will be examined with institutional characteristics.
# Density plots
ggplot(data, aes(x = avg_sentiment)) +
geom_density(color = "blue", fill = "blue", alpha = 0.5) +
labs(
title = "Density Plot of avg_sentiment",
x = "avg_sentiment",
y = "Density"
) +
theme_minimal()
density_plot2 <- ggplot(longdata, aes(x = sentiment)) +
geom_density(fill = "green", alpha = 0.5) +
labs(title = "Density Plot of Dependent Variable", x = "Dependent Variable", y = "Density") +
theme_minimal()
print(density_plot2)
longdata density plot: most sentiment samples are scored by VADER as
neutral with many values hovering around 0.0. There seems to be slightly
more positively scored samples than negatively scored samples across all
sentiment observations. aggdata density plot: most average sentiment
samples are neutral/slightly positive.
#regression model to predict avg_sentiment based on institutional characteristics
model_main <- lm(avg_sentiment ~ size + sfratio + livingcost + ranking + t_count, data = data)
summary(model_main)
##
## Call:
## lm(formula = avg_sentiment ~ size + sfratio + livingcost + ranking +
## t_count, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.027096 -0.012114 -0.003291 0.004386 0.072253
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.490e-02 4.130e-02 0.361 0.7222
## size 3.738e-07 3.827e-07 0.977 0.3410
## sfratio 8.605e-04 9.092e-04 0.946 0.3558
## livingcost 1.073e-04 1.340e-04 0.800 0.4334
## ranking -8.655e-05 4.071e-05 -2.126 0.0468 *
## t_count -2.908e-03 1.138e-03 -2.554 0.0194 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02344 on 19 degrees of freedom
## Multiple R-squared: 0.4549, Adjusted R-squared: 0.3115
## F-statistic: 3.172 on 5 and 19 DF, p-value: 0.03007
vif_values <- vif(model_main)
print(vif_values)
## size sfratio livingcost ranking t_count
## 1.533861 2.336337 1.979676 2.196959 1.485234
INTERPRETATION: Ranking is significant (p<0.05) where the higher the ranking (aka the lower ranked the university) the lower the sentiment. t-count is significant (p<0.05) where higher counts of academic stress words were associated with lower sentiment.
Model fit: R-squared: 0.4549, adjusted R-squared: 0.3115 Moderately good fit, lower adjusted R-squared due to the nonsignificance of some predictors. The model is statistically significant at p<0.05.
Check for multicollinearity (VIF): Correlation matrix suggested some strong correlations between institutional characteristics, but VIF values are all < 5.
# Visualize Relationships
plot_sfratio <- ggplot(data, aes(x = sfratio, y = avg_sentiment)) +
geom_point() +
geom_smooth(method = "lm", color = "blue") +
labs(title = "Sentiment vs Student-Faculty Ratio", x = "Student-Faculty Ratio", y = "Sentiment") +
theme_minimal()
print(plot_sfratio)
## `geom_smooth()` using formula = 'y ~ x'
plot_tcount <- ggplot(data, aes(x = t_count, y = avg_sentiment)) +
geom_point() +
geom_smooth(method = "lm", color = "blue") +
labs(title = "Sentiment vs Academic Stress Word Count", x = "ASWC", y = "Sentiment") +
theme_minimal()
print(plot_tcount)
## `geom_smooth()` using formula = 'y ~ x'
plot_livingcost <- ggplot(data, aes(x = livingcost, y = avg_sentiment)) +
geom_point() +
geom_smooth(method = "lm", color = "blue") +
labs(title = "Sentiment vs Living Cost", x = "Living Cost", y = "Sentiment") +
theme_minimal()
print(plot_livingcost)
## `geom_smooth()` using formula = 'y ~ x'
plot_ranking <- ggplot(data, aes(x = ranking, y = avg_sentiment)) +
geom_point() +
geom_smooth(method = "lm", color = "blue") +
labs(title = "Sentiment vs Ranking", x = "Ranking", y = "Sentiment") +
theme_minimal()
print(plot_ranking)
## `geom_smooth()` using formula = 'y ~ x'
plot_size <- ggplot(data, aes(x = size, y = avg_sentiment)) +
geom_point() +
geom_smooth(method = "lm", color = "blue") +
labs(title = "Sentiment vs Size", x = "Size", y = "Sentiment") +
theme_minimal()
print(plot_size)
## `geom_smooth()` using formula = 'y ~ x'
Visualizations of each relationship. Grey area is standard error.
# Diagnostic Plots
par(mfrow = c(2, 2))
plot(model_main)
No major issues. Uni 13 and 23 seem to be outliers, but nothing has a
Cooks distance > 4/n.