library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.0.2     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Load the prefrosh 2022-2023 survey export (file dated 28 September 2022).
# Empty strings and the literal text "NA" are both read in as missing values.
prefrosh <- read.csv(
  "~/Downloads/Stanford Communities Project - Prefrosh 2022-2023_September 28, 2022_10.26.csv",
  header = TRUE,
  na.strings = c("", "NA")
)

# Removing unnecessary rows
# NOTE(review): the row indices are hard-coded. Rows 1-2 are presumably extra
# header rows from the survey export and row 548 a stray record -- confirm
# against the file before reusing these numbers.
prefrosh <- prefrosh[-c(1, 2, 548), ]

# Keep a single row (the first encountered) per recipient e-mail address.
prefrosh <- prefrosh %>%
  distinct(RecipientEmail, .keep_all = TRUE)

For the presentation: (1) an overview of what’s included in the prefrosh survey, (2) some sample demographics (gender, political ideology, URM status, etc.), (3) some temporal trends (e.g., how this class rates in happiness, stress, and depression compared to past prefrosh samples), and (4) some hypothesis testing (e.g., for the intervention perception questions, we can look at whether the difference in other vs. self ratings predicts their social behaviors like ERQ)

Visualisations

# Race
# Count of respondents per Race category. Rows whose Race is "Unknown", the
# free-text "Other" option, or missing are excluded (filter() drops rows where
# a condition evaluates to NA).
prefrosh_race <- filter(
  prefrosh,
  Race != "Unknown",
  Race != "Other. Please specify:"
)

p <- ggplot(data = prefrosh_race, mapping = aes(x = Race)) +
  geom_bar(color = "black", fill = "white")

# Small, angled tick labels so the category names do not overlap.
p + theme(axis.text.x = element_text(size = 7, angle = 45, hjust = 1))

# Gender
# Count of respondents per Gender category, leaving out rows whose Gender is
# "Unknown" or missing.
prefrosh_gender <- filter(prefrosh, Gender != "Unknown")

p <- ggplot(data = prefrosh_gender, mapping = aes(x = Gender)) +
  geom_bar(color = "black", fill = "white")

# Small, angled tick labels so the category names do not overlap.
p + theme(axis.text.x = element_text(size = 7, angle = 45, hjust = 1))

# Re-load the survey from the later export (file dated 7 October 2022).
# This overwrites `prefrosh`; objects derived from the earlier load are not
# refreshed. Empty strings and the literal text "NA" are read as missing.
prefrosh <- read.csv(
  "~/Downloads/Stanford Communities Project - Prefrosh 2022-2023_October 7, 2022_15.20.csv",
  header = TRUE,
  na.strings = c("", "NA")
)

# NOTE(review): the same hard-coded rows (1, 2, 548) are dropped as for the
# 28 September export. Confirm row 548 is still the record that should be
# removed in this newer file.
prefrosh <- prefrosh[-c(1, 2, 548), ]

# Keep a single row (the first encountered) per recipient e-mail address.
prefrosh <- prefrosh %>%
  distinct(RecipientEmail, .keep_all = TRUE)

# Political Ideology
# Drop respondents with no answer. The survey is read with
# na.strings = c("", "NA"), so missing answers are real NA values; test them
# with is.na() instead of comparing against the string "NA".
prefrosh_poli <- prefrosh %>%
  filter(!is.na(PoliIdeology_general))

p <- ggplot(prefrosh_poli, aes(x = PoliIdeology_general)) +
  geom_bar(color = "black", fill = "white")

# Show the scale labels in place of the response codes "1"-"7".
p +
  scale_x_discrete(
    name = "Political Ideology",
    labels = c(
      "1" = "Extremely Liberal",
      "2" = "Liberal",
      "3" = "Moderately Liberal",
      "4" = "Moderate",
      "5" = "Moderately Conservative",
      "6" = "Conservative",
      "7" = "Extremely Conservative"
    )
  ) +
  theme(axis.text.x = element_text(size = 7, angle = 45, hjust = 1))

Family Income

# We have to take steps to re-order the labels

# Hand-entered table of family-income brackets, listed in ascending order.
#   ID     - bracket index, 1 to 17
#   labels - bracket text
#   income - value per bracket (presumably the respondent count; confirm)
# Columns are named with `=`: using `<-` inside data.frame() assigns a global
# variable and leaves the column with an auto-generated name.
income_data <- data.frame(
  ID = 1:17,
  labels = c(
    "$0-$20,000", "$20,001-$40,000", "$40,001-$60,000", "$60,001-$80,000",
    "$80,001-$100,000", "$100,001-$120,000", "$120,001-$140,000",
    "$140,001-$160,000", "$160,001-$180,000", "$180,001-$200,000",
    "$200,001-$250,000", "$250,001-$300,000", "$300,001-$350,000",
    "$350,001-$400,000", "$400,001-$450,000", "$450,001-$500,000",
    "$500,001+"
  ),
  income = c(40, 76, 50, 45, 41, 41, 26, 25, 15, 21, 43, 37, 26, 23, 13, 15, 85)
)


#barplot(income_data$income, names.arg=labels,xlab="Income Bracket",
        #ylab="Count") 

Brief Preprocessing

library(ggsci)
# Read the summary table (one row per Survey label with its Mean) and print it.
df1 <- read.csv(file = "~/Downloads/df1.csv")
df1
##          Survey      Mean
## 1      TAI_5_21 12.866720
## 2      TAI_5_22 13.284450
## 3       EROS_21  3.722222
## 4       EROS_22  3.758606
## 5 loneliness_21  2.367347
## 6 loneliness_22  2.414708
## 7       CESD_21 16.354050
## 8       CESD_22 16.750940
# Bar chart of the Mean value for each Survey entry in df1.
# `fill` is mapped to Survey so that scale_fill_jco() has a fill scale to
# recolour; with no fill aesthetic mapped, the palette call had no effect.
p <- ggplot(data = df1, aes(x = Survey, y = Mean, fill = Survey)) +
  geom_col(show.legend = FALSE) # a legend would only repeat the x-axis labels

p +
  theme(axis.text.x = element_text(size = 7, angle = 45, hjust = 1)) +
  scale_fill_jco()

# Welch two-sample t-test of CESD: prefrosh_21 versus the current prefrosh data.
# NOTE(review): prefrosh_21 is not created in this section -- presumably the
# 2021 cohort; confirm where it is loaded.
t.test(x = prefrosh_21$CESD, y = prefrosh$CESD)
## 
##  Welch Two Sample t-test
## 
## data:  prefrosh_21$CESD and prefrosh$CESD
## t = -1.7511, df = 1688.2, p-value = 0.0801
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.84143832  0.04764637
## sample estimates:
## mean of x mean of y 
##  16.35405  16.75094
# Welch two-sample t-test of TAI_5: prefrosh_21 versus the current prefrosh data.
# NOTE(review): prefrosh_21 is not created in this section -- presumably the
# 2021 cohort; confirm where it is loaded.
t.test(x = prefrosh_21$TAI_5, y = prefrosh$TAI_5)
## 
##  Welch Two Sample t-test
## 
## data:  prefrosh_21$TAI_5 and prefrosh$TAI_5
## t = -2.7059, df = 1648.4, p-value = 0.006882
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.7205224 -0.1149353
## sample estimates:
## mean of x mean of y 
##  12.86672  13.28445
  1. Some hypothesis testing (e.g., for the intervention perception questions, we can look at whether the difference in other vs. self ratings predicts their social behaviors like ERQ)
library(jtools)

# miscalibration_1 is "If someone else is feeling bad, how much do you expect a typical Stanford Student to try and help them feel better by listening to them and acting kindly?"

# miscalibration_3 is "If someone else is feeling bad, how likely are you to try and help them feel better by listening to them and acting kindly?"

# Gap between the rating of a typical student (miscalibration_1) and the
# self rating (miscalibration_3). Note the direction: other minus self.
# abs_diff_self_other is the size of that gap regardless of sign.
prefrosh <- mutate(
  prefrosh,
  diff_self_other = miscalibration_1 - miscalibration_3,
  abs_diff_self_other = abs(diff_self_other)
)


# EROS regressed on the signed other-minus-self gap, controlling for the
# self rating (miscalibration_3).
model <- lm(
  formula = EROS ~ 1 + diff_self_other + miscalibration_3,
  data = prefrosh
)
summary(model)
## 
## Call:
## lm(formula = EROS ~ 1 + diff_self_other + miscalibration_3, data = prefrosh)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.32135 -0.48310  0.01404  0.51404  1.76506 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       2.48186    0.12904  19.233   <2e-16 ***
## diff_self_other  -0.03972    0.02456  -1.617    0.106    
## miscalibration_3  0.25103    0.02666   9.415   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7006 on 758 degrees of freedom
##   (230 observations deleted due to missingness)
## Multiple R-squared:  0.1373, Adjusted R-squared:  0.135 
## F-statistic: 60.31 on 2 and 758 DF,  p-value: < 2.2e-16
# jtools summary of the current `model`, including variance inflation factors.
jtools::summ(model, vifs = TRUE)
Observations 761 (230 missing obs. deleted)
Dependent variable EROS
Type OLS linear regression
F(2,758) 60.31
0.14
Adj. R² 0.13
Est. S.E. t val. p VIF
(Intercept) 2.48 0.13 19.23 0.00 NA
diff_self_other -0.04 0.02 -1.62 0.11 1.17
miscalibration_3 0.25 0.03 9.42 0.00 1.17
Standard errors: OLS
# EROS regressed on the absolute other-vs-self gap, controlling for the
# self rating (miscalibration_3).
model <- lm(
  formula = EROS ~ 1 + abs_diff_self_other + miscalibration_3,
  data = prefrosh
)
summary(model)
## 
## Call:
## lm(formula = EROS ~ 1 + abs_diff_self_other + miscalibration_3, 
##     data = prefrosh)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.29784 -0.44265  0.03549  0.50513  1.81828 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          2.39894    0.12489  19.209   <2e-16 ***
## abs_diff_self_other  0.06568    0.02755   2.384   0.0174 *  
## miscalibration_3     0.26093    0.02474  10.549   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6992 on 758 degrees of freedom
##   (230 observations deleted due to missingness)
## Multiple R-squared:  0.1407, Adjusted R-squared:  0.1385 
## F-statistic: 62.08 on 2 and 758 DF,  p-value: < 2.2e-16
# jtools summary of the current `model`, including variance inflation factors.
jtools::summ(model, vifs = TRUE)
Observations 761 (230 missing obs. deleted)
Dependent variable EROS
Type OLS linear regression
F(2,758) 62.08
0.14
Adj. R² 0.14
Est. S.E. t val. p VIF
(Intercept) 2.40 0.12 19.21 0.00 NA
abs_diff_self_other 0.07 0.03 2.38 0.02 1.01
miscalibration_3 0.26 0.02 10.55 0.00 1.01
Standard errors: OLS
# ERQ cognitive reappraisal (ERQ_C) and expressive suppression (ERQ_E)

# Sum the item responses into the two ERQ subscale totals:
#   ERQ_C = items 1, 3, 5, 7, 8, 10
#   ERQ_E = items 2, 4, 6, 9
# Plain vectorised addition gives the same per-row totals as
# rowwise() + sum(c(...)) -- a missing item still yields NA -- but does not
# leave `prefrosh` as a rowwise data frame for every later operation.
prefrosh <- prefrosh %>%
  mutate(
    ERQ_C = ERQ_1 + ERQ_3 + ERQ_5 + ERQ_7 + ERQ_8 + ERQ_10,
    ERQ_E = ERQ_2 + ERQ_4 + ERQ_6 + ERQ_9
  )

# ERQ_C regressed on the signed other-minus-self gap, controlling for the
# self rating (miscalibration_3).
model <- lm(
  formula = ERQ_C ~ 1 + diff_self_other + miscalibration_3,
  data = prefrosh
)
summary(model)
## 
## Call:
## lm(formula = ERQ_C ~ 1 + diff_self_other + miscalibration_3, 
##     data = prefrosh)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.3745  -3.6627   0.5219   3.9165  14.1635 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       24.7323     1.0623  23.281  < 2e-16 ***
## diff_self_other    0.6573     0.2023   3.249  0.00121 ** 
## miscalibration_3   0.9404     0.2195   4.285 2.07e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.762 on 757 degrees of freedom
##   (231 observations deleted due to missingness)
## Multiple R-squared:  0.02746,    Adjusted R-squared:  0.02489 
## F-statistic: 10.69 on 2 and 757 DF,  p-value: 2.647e-05
# jtools summary of the current `model`, including variance inflation factors.
jtools::summ(model, vifs = TRUE)
Observations 760 (231 missing obs. deleted)
Dependent variable ERQ_C
Type OLS linear regression
F(2,757) 10.69
0.03
Adj. R² 0.02
Est. S.E. t val. p VIF
(Intercept) 24.73 1.06 23.28 0.00 NA
diff_self_other 0.66 0.20 3.25 0.00 1.17
miscalibration_3 0.94 0.22 4.28 0.00 1.17
Standard errors: OLS
# ERQ_C regressed on the absolute other-vs-self gap, controlling for the
# self rating (miscalibration_3).
model <- lm(
  formula = ERQ_C ~ 1 + abs_diff_self_other + miscalibration_3,
  data = prefrosh
)
summary(model)
## 
## Call:
## lm(formula = ERQ_C ~ 1 + abs_diff_self_other + miscalibration_3, 
##     data = prefrosh)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.2493  -3.6583   0.4769   4.0679  13.7941 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          25.8923     1.0325  25.078  < 2e-16 ***
## abs_diff_self_other  -0.5910     0.2279  -2.593 0.009695 ** 
## miscalibration_3      0.7262     0.2045   3.552 0.000406 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.776 on 757 degrees of freedom
##   (231 observations deleted due to missingness)
## Multiple R-squared:  0.02258,    Adjusted R-squared:   0.02 
## F-statistic: 8.744 on 2 and 757 DF,  p-value: 0.000176
# jtools summary of the current `model`, including variance inflation factors.
jtools::summ(model, vifs = TRUE)
Observations 760 (231 missing obs. deleted)
Dependent variable ERQ_C
Type OLS linear regression
F(2,757) 8.74
0.02
Adj. R² 0.02
Est. S.E. t val. p VIF
(Intercept) 25.89 1.03 25.08 0.00 NA
abs_diff_self_other -0.59 0.23 -2.59 0.01 1.01
miscalibration_3 0.73 0.20 3.55 0.00 1.01
Standard errors: OLS
# ERQ_E regressed on the signed other-minus-self gap, controlling for the
# self rating (miscalibration_3).
model <- lm(
  formula = ERQ_E ~ 1 + diff_self_other + miscalibration_3,
  data = prefrosh
)
summary(model)
## 
## Call:
## lm(formula = ERQ_E ~ 1 + diff_self_other + miscalibration_3, 
##     data = prefrosh)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -14.189  -3.582   0.418   3.418  13.326 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       20.1239     0.9276  21.696  < 2e-16 ***
## diff_self_other    0.1179     0.1767   0.667    0.505    
## miscalibration_3  -0.9084     0.1917  -4.739 2.57e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.034 on 758 degrees of freedom
##   (230 observations deleted due to missingness)
## Multiple R-squared:  0.03773,    Adjusted R-squared:  0.03519 
## F-statistic: 14.86 on 2 and 758 DF,  p-value: 4.667e-07
# jtools summary of the current `model`, including variance inflation factors.
jtools::summ(model, vifs = TRUE)
Observations 761 (230 missing obs. deleted)
Dependent variable ERQ_E
Type OLS linear regression
F(2,758) 14.86
0.04
Adj. R² 0.04
Est. S.E. t val. p VIF
(Intercept) 20.12 0.93 21.70 0.00 NA
diff_self_other 0.12 0.18 0.67 0.51 1.17
miscalibration_3 -0.91 0.19 -4.74 0.00 1.17
Standard errors: OLS
# ERQ_E regressed on the absolute other-vs-self gap, controlling for the
# self rating (miscalibration_3).
model <- lm(
  formula = ERQ_E ~ 1 + abs_diff_self_other + miscalibration_3,
  data = prefrosh
)
summary(model)
## 
## Call:
## lm(formula = ERQ_E ~ 1 + abs_diff_self_other + miscalibration_3, 
##     data = prefrosh)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.4778  -3.5174   0.4694   3.4826  13.7569 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          20.1639     0.8980  22.454  < 2e-16 ***
## abs_diff_self_other   0.2875     0.1983   1.450    0.148    
## miscalibration_3     -0.9868     0.1779  -5.546 4.03e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.028 on 758 degrees of freedom
##   (230 observations deleted due to missingness)
## Multiple R-squared:  0.03983,    Adjusted R-squared:  0.0373 
## F-statistic: 15.72 on 2 and 758 DF,  p-value: 2.041e-07
# jtools summary of the current `model`, including variance inflation factors.
jtools::summ(model, vifs = TRUE)
Observations 761 (230 missing obs. deleted)
Dependent variable ERQ_E
Type OLS linear regression
F(2,758) 15.72
0.04
Adj. R² 0.04
Est. S.E. t val. p VIF
(Intercept) 20.16 0.90 22.45 0.00 NA
abs_diff_self_other 0.29 0.20 1.45 0.15 1.01
miscalibration_3 -0.99 0.18 -5.55 0.00 1.01
Standard errors: OLS