1 Load Libraries

library(naniar) # for the gg_miss_upset() command
library(psych) # for the describe() and corr.test() commands
library(kableExtra) # for tables
library(corrplot) # for correlation plots
library(afex) # for ANOVA
library(emmeans) # for emmeans() follow-up comparisons after the ANOVAs
library(ggplot2) # for plots
library(sjPlot) # for regression plots
library(dplyr) # for %>%, mutate(), na_if(), and distinct()

2 Load Data

# Read the raw survey export.
df <- read.csv("data/data_9-16.csv")

# Treat empty and single-space race/ethnicity responses as missing.
df <- mutate(df, raceeth = na_if(na_if(raceeth, ""), " "))

3 Notes

Variables of interest:

  • Gender (self-reported demographic)
  • Shared racial identity
  • Science interest
  • Science confidence

We came up with two hypotheses to test in our next meeting:

  • Hypothesis 1: There is a significant difference in interest/confidence between men and women
  • Hypothesis 2: There is a significant relationship between shared racial identity and science confidence.

Shared racial identity:

  • I feel like I am only doing well if people like me, as a whole, are doing well
  • I like living around people that look like me
  • People from similar backgrounds need to stick together
  • Having political leaders who share my race/background makes me feel like I can do more to help my community
  • My background is an important part of who I am

Interest:

  • Enthusiastic about this subject
  • Interested in discussing this subject area with friends or family
  • Interested in taking or planning to take additional classes in this subject
  • Interested in pursuing a science career

Confidence:

  • Confident that I understand this subject
  • Confident that I can do this subject
  • Comfortable working with complex ideas

4 Create Variables

# Start the analysis frame with the participant ID only.
df2 <- df["UID"]

# Composite scores are item means; a missing item leaves the composite NA.
# ri_si = shared racial identity, int = interest, con = confidence.
df2$ri_si <- with(df, (rid9 + rid11 + rid13 + rid14 + rid15) / 5)
df2$int <- with(df, (salg1 + salg2 + salg3 + salg4) / 4)
df2$con <- with(df, (salg5 + salg6 + salg7) / 3)

# NOTE(review): the meaning of the two cvs composites is not documented in
# this file -- confirm what cvs_g and cvs_r measure.
df2$cvs_g <- with(df, (cvs3 + cvs5 + cvs7) / 3)
df2$cvs_r <- with(df, (cvs10 + cvs12 + cvs14) / 3)


# Collapse the free-text gender responses into three analysis groups:
#   "F" = woman, "M" = man, "N" = anything else (including blank or missing).
# Responses are lower-cased and trimmed first so that case and stray
# whitespace variants (e.g. "Female ", "MALE") cannot fall through to "N".
gender_clean <- tolower(trimws(df$gender))

df2$gender2 <- "N"
df2$gender2[gender_clean %in% c("f", "female", "woman")] <- "F"
# "man" mirrors the "woman" label accepted for F; previously only "Woman"
# was recognised, so a respondent answering "Man" was coded as "N".
df2$gender2[gender_clean %in% c("m", "male", "man")] <- "M"
df2$gender2 <- as.factor(df2$gender2)

# Read the sorted race categories; blank and single-space cells become NA.
# NOTE(review): this file is attached to df2 by row position, not by UID --
# confirm that racesort2.csv has the same row order as the survey file.
race <- read.csv(
  file = "racesort2.csv",
  header = TRUE,
  na.strings = c("", " ")
)
# Fail loudly instead of recycling or misaligning when the row counts differ.
stopifnot(nrow(race) == nrow(df2))
df2$race_fin <- race$newrace

# Collapse the smaller groups into "other". Values that are missing or not
# listed here stay NA.
race_collapse <- c(
  asian = "asian",
  black = "other",
  latino = "latino",
  mena = "other",
  multi = "other",
  pi = "other",
  white = "white"
)
df2$race_fin2 <- unname(race_collapse[as.character(df2$race_fin)])

# Remove rows that are exact duplicates across every column.
df2 <- distinct(df2)

4.1 View Missing Data

# Upset plot of co-occurring missingness across the columns of df2.
gg_miss_upset(data = df2, nsets = 8)

4.2 View Item Normality

# Descriptive statistics for every column except the first (the ID).
desc <- describe(df2[-1])

# Highlight rows whose absolute kurtosis (bold) or skew (italic) exceeds 2.
kable(round(desc, digits = 2)) %>%
  kable_styling() %>%
  row_spec(which(abs(desc$kurtosis) > 2), bold = TRUE) %>%
  row_spec(which(abs(desc$skew) > 2), italic = TRUE)
vars n mean sd median trimmed mad min max range skew kurtosis se
ri_si 1 252 3.25 0.81 3.20 3.26 0.89 1.00 5 4.00 -0.18 -0.02 0.05
int 2 264 3.59 0.96 3.75 3.64 1.11 1.25 5 3.75 -0.40 -0.64 0.06
con 3 264 3.40 0.92 3.33 3.40 0.99 1.00 5 4.00 -0.02 -0.46 0.06
cvs_g 4 259 3.41 1.34 3.33 3.41 1.48 1.00 6 5.00 0.06 -0.87 0.08
cvs_r 5 259 3.05 1.45 3.00 2.98 1.48 1.00 6 5.00 0.31 -0.87 0.09
gender2* 6 291 1.55 0.75 1.00 1.44 0.00 1.00 3 2.00 0.95 -0.61 0.04
race_fin* 7 247 3.40 2.32 3.00 3.26 2.97 1.00 7 6.00 0.33 -1.45 0.15
race_fin2* 8 247 2.17 1.11 2.00 2.09 1.48 1.00 4 3.00 0.34 -1.30 0.07
# Gender group counts, with missing values shown as their own category.
table(df2[["gender2"]], useNA = "always")

## 
##    F    M    N <NA> 
##  177   68   46    0 

# Counts for the detailed race categories, with missing values shown.
table(df2[["race_fin"]], useNA = "always")

## 
##  asian  black latino   mena  multi     pi  white   <NA> 
##     96      3     51      4     26     29     38     44 

# Counts for the collapsed race categories, with missing values shown.
table(df2[["race_fin2"]], useNA = "always")

## 
##  asian latino  other  white   <NA> 
##     96     51     62     38     44 

4.3 Check Outliers

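Multivariate outliers were identified using Mahalanobis distance, flagging cases above the 99th percentile of the chi-square distribution. Two outliers were dropped.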

# Keep the ID plus the numeric composites and drop incomplete cases.
# The categorical columns are excluded by name rather than by position, so
# adding or reordering columns in df2 cannot change which variables enter
# the distance calculation.
d <- na.omit(subset(df2, select = -c(gender2, race_fin, race_fin2)))

# Squared Mahalanobis distance of each case from the multivariate centroid,
# computed on every column of d except the ID in column 1.
m_dist <- mahalanobis(d[-1], colMeans(d[-1]), cov(d[-1]))
d$MD <- round(m_dist, 1)
plot(d$MD)
describe(m_dist)
##    vars   n mean   sd median trimmed mad  min   max range skew kurtosis  se
## X1    1 252 4.98 3.13   4.49    4.59 2.6 0.62 22.12  21.5 1.66     4.59 0.2
# Chi-square cutoff for the squared distances. The degrees of freedom must
# equal the number of variables that entered mahalanobis(): every column of
# d except the ID and the MD column that was added afterwards. Using
# ncol(d) - 1 counted MD as a variable and set the cutoff one degree of
# freedom too high.
n_vars <- length(setdiff(names(d), c("UID", "MD")))
cut <- qchisq(.99, df = n_vars)
abline(h = cut, col = "red")

# Flag on the unrounded distances so that rounding cannot move a case
# across the cutoff.
# NOTE: re-knit the document to refresh the counts printed for this table.
d$outlier <- as.vector(m_dist > cut)
table(d$outlier)
## 
## FALSE  TRUE 
##   250     2
# IDs of the flagged cases, then the full data set without those cases.
outs <- d[which(d$outlier), c("UID", "outlier")]
df3 <- df2[!(df2$UID %in% outs$UID), ]

5 H1

Hypothesis 1: There is a significant difference in interest/confidence between men and women

  • Int/Con: lower score = less interested/less confident
  • Gender: F = woman, M = man, N = non-binary, genderqueer, or NA

No significant difference in interest between men and women. Significant difference in confidence (p < .001). Men (M = 3.77, SE = .11) are significantly higher in confidence than women (M = 3.26, SE = .07).

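# NOTE: this block is disabled. Most of the models below use df4, which is
# only created in the H2 section, so define df4 first before re-enabling.
# table(df3$gender2)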
# 
# aov_out <- aov_ez(id = "UID", dv = "int", data = df3, between = c("gender2"))
# nice(aov_out)
# afex_plot(aov_out, x = "gender2")
# 
# aov_out <- aov_ez(id = "UID", dv = "con", data = df4, between = c("gender2"))
# nice(aov_out)
# emmeans(aov_out, specs = "gender2")
# pairs(emmeans(aov_out, specs = "gender2"))
# afex_plot(aov_out, x = "gender2")
# 
# aov_out <- aov_ez(id = "UID", dv = "con", data = df4, between = c("gender2","race_fin2"))
# nice(aov_out)
# emmeans(aov_out, specs = "gender2")
# pairs(emmeans(aov_out, specs = "gender2"))
# afex_plot(aov_out, x = "gender2", trace = "race_fin2")
# 
# aov_out <- aov_ez(id = "UID", dv = "int", data = df4, between = c("gender2","race_fin2"))
# nice(aov_out)
# emmeans(aov_out, specs = "gender2")
# pairs(emmeans(aov_out, specs = "gender2"))
# afex_plot(aov_out, x = "gender2", trace = "race_fin2")

6 H2

Hypothesis 2: There is a significant relationship between shared racial identity and science confidence.

  • Shared racial identity: higher score = more shared identity
  • Int/Con: lower score = less interested/less confident

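Shared racial identity is negatively correlated with both interest (r = -.14) and confidence (r = -.13). The unadjusted p-values are .030 and .045, respectively; after the Holm adjustment that corr.test() applies by default, neither correlation is significant (p = .060 for both).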

# Pairwise correlations among interest, confidence, and shared racial identity.
out <- corr.test(subset(df3, select = c(int, con, ri_si)))

# corr.test() stores unadjusted p-values below the diagonal of out$p and
# multiplicity-adjusted ones above it. corrplot() reorders p.mat together
# with the correlations when order = "hclust", so passing the asymmetric
# matrix can mix the two kinds of p-value in the displayed triangle.
# Mirror the adjusted values so the significance stars are well defined.
# (To mark unadjusted significance instead, mirror the lower triangle.)
p_adj <- out$p
p_adj[lower.tri(p_adj)] <- t(p_adj)[lower.tri(p_adj)]

corrplot(out$r, type = "upper", method = "color", tl.col = "black",
         tl.cex = .75,
         p.mat = p_adj,
         sig.level = c(.001, .01, .05), pch.cex = .9,
         insig = "label_sig", pch.col = "white",
         order = "hclust")

out$r
##              int        con      ri_si
## int    1.0000000  0.6178160 -0.1372743
## con    0.6178160  1.0000000 -0.1266393
## ri_si -0.1372743 -0.1266393  1.0000000
# p-value matrix from corr.test(); per the psych documentation, entries
# above the diagonal are adjusted for multiple tests.
out[["p"]]
##                int          con      ri_si
## int   0.000000e+00 1.713435e-28 0.06002838
## con   5.711451e-29 0.000000e+00 0.06002838
## ri_si 3.001419e-02 4.546043e-02 0.00000000
# Keep only the F and M gender groups and drop the now-empty "N" level.
df4 <- df3[which(df3$gender2 != "N"), ]
df4[["gender2"]] <- droplevels(df4[["gender2"]])

# Use "white" as the reference category for the race contrasts.
df4[["race_fin2"]] <- relevel(as.factor(df4[["race_fin2"]]), ref = "white")

# Confidence regressed on shared racial identity, race, and interest.
reg_out <- lm(con ~ ri_si + race_fin2 + int, data = df4)
summary(reg_out)
## 
## Call:
## lm(formula = con ~ ri_si + race_fin2 + int, data = df4)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.06932 -0.54042  0.00254  0.49161  1.76804 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      1.42408    0.29093   4.895 1.83e-06 ***
## ri_si           -0.03475    0.05973  -0.582    0.561    
## race_fin2asian  -0.01221    0.14231  -0.086    0.932    
## race_fin2latino -0.10394    0.15956  -0.651    0.515    
## race_fin2other  -0.03325    0.15146  -0.220    0.826    
## int              0.58872    0.04931  11.940  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7233 on 235 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.3898, Adjusted R-squared:  0.3768 
## F-statistic: 30.03 on 5 and 235 DF,  p-value: < 2.2e-16
# Effect plot for the race term of the regression model.
plot_model(model = reg_out, terms = "race_fin2", type = "eff")