library(naniar) # for the gg_miss_upset() command
library(psych) # for the describe() command
library(kableExtra) # for tables
library(corrplot) # for correlation plots
library(afex) # for ANOVA
library(emmeans)
library(ggplot2) # for plots
library(sjPlot) # for regression plots
library(dplyr)
# Load the raw survey export, then convert blank race/ethnicity responses
# (an empty string or a single space) to NA.
df <- read.csv(file = "data/data_9-16.csv")
df <- mutate(df, raceeth = na_if(na_if(raceeth, ""), " "))
Variables of interest:

* Gender (self-reported demographic)
* Shared racial identity
* Science interest
* Science confidence
We came up with two hypotheses to test in our next meeting:
Shared racial identity:
Interest:
Confidence:
# Start the analysis data frame from the participant ID column only, then add
# one composite per item set as the unweighted mean of its items. The items are
# added without any NA handling, so one missing item makes the composite NA.
df2 <- df["UID"]
df2$ri_si <- with(df, (rid9 + rid11 + rid13 + rid14 + rid15) / 5)
df2$int <- with(df, (salg1 + salg2 + salg3 + salg4) / 4)
df2$con <- with(df, (salg5 + salg6 + salg7) / 3)
df2$cvs_g <- with(df, (cvs3 + cvs5 + cvs7) / 3)
df2$cvs_r <- with(df, (cvs10 + cvs12 + cvs14) / 3)
# Collapse the free-text gender responses into a three-level factor:
# "F", "M", or "N" for every other response (missing values included).
# NOTE(review): "Woman" is mapped to "F" but there is no matching "Man" entry,
# and only the exact spellings listed below are recognised.
female_labels <- c("f", "F", "female", "Female", "female ", "Female ", "Woman")
male_labels <- c("m", "M", "male", "Male", "male ", "Male ")
df2$gender2 <- as.factor(
  ifelse(
    df$gender %in% female_labels, "F",
    ifelse(df$gender %in% male_labels, "M", "N")
  )
)
# Read the sorted race codes and attach them to df2 by row position.
# NOTE(review): nothing here matches on UID, so this assumes racesort2.csv has
# its rows in the same order as df2 -- TODO confirm.
race <- read.csv(
  file = "racesort2.csv", header = TRUE, na.strings = c("", " ")
)
df2$race_fin <- race$newrace

# Collapse the detailed codes into four groups; any code not listed stays NA.
race_groups <- c(
  asian = "asian", black = "other", latino = "latino", mena = "other",
  multi = "other", pi = "other", white = "white"
)
df2$race_fin2 <- unname(race_groups[as.character(df2$race_fin)])
# Remove rows that are exact duplicates of another row.
df2 <- df2 %>% distinct()

# Plot which variables tend to be missing together.
gg_miss_upset(df2, nsets = 8)

# Descriptive statistics for every column except the first (UID). In the
# rendered table, rows with |kurtosis| > 2 are bold and rows with |skew| > 2
# are italic.
desc <- describe(df2[-1])
high_kurtosis <- which(abs(desc$kurtosis) > 2)
high_skew <- which(abs(desc$skew) > 2)
desc %>%
  round(digits = 2) %>%
  kable() %>%
  kable_styling() %>%
  row_spec(high_kurtosis, bold = TRUE) %>%
  row_spec(high_skew, italic = TRUE)
| | vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ri_si | 1 | 252 | 3.25 | 0.81 | 3.20 | 3.26 | 0.89 | 1.00 | 5 | 4.00 | -0.18 | -0.02 | 0.05 |
| int | 2 | 264 | 3.59 | 0.96 | 3.75 | 3.64 | 1.11 | 1.25 | 5 | 3.75 | -0.40 | -0.64 | 0.06 |
| con | 3 | 264 | 3.40 | 0.92 | 3.33 | 3.40 | 0.99 | 1.00 | 5 | 4.00 | -0.02 | -0.46 | 0.06 |
| cvs_g | 4 | 259 | 3.41 | 1.34 | 3.33 | 3.41 | 1.48 | 1.00 | 6 | 5.00 | 0.06 | -0.87 | 0.08 |
| cvs_r | 5 | 259 | 3.05 | 1.45 | 3.00 | 2.98 | 1.48 | 1.00 | 6 | 5.00 | 0.31 | -0.87 | 0.09 |
| gender2* | 6 | 291 | 1.55 | 0.75 | 1.00 | 1.44 | 0.00 | 1.00 | 3 | 2.00 | 0.95 | -0.61 | 0.04 |
| race_fin* | 7 | 247 | 3.40 | 2.32 | 3.00 | 3.26 | 2.97 | 1.00 | 7 | 6.00 | 0.33 | -1.45 | 0.15 |
| race_fin2* | 8 | 247 | 2.17 | 1.11 | 2.00 | 2.09 | 1.48 | 1.00 | 4 | 3.00 | 0.34 | -1.30 | 0.07 |
# Counts per recoded gender level; an NA count is always shown.
table(df2[["gender2"]], useNA = "always")
F M N
# Counts per detailed race code; an NA count is always shown.
table(df2[["race_fin"]], useNA = "always")
asian black latino mena multi pi white
# Counts per collapsed race group; an NA count is always shown.
table(df2[["race_fin2"]], useNA = "always")
asian latino other white
Multivariate outliers were screened using Mahalanobis’ distance on the five composite scores; two outliers were dropped.
# Keep complete cases only, after removing columns 7-9 (gender2, race_fin,
# race_fin2) so that just UID and the five numeric composites remain.
d <- na.omit(df2[, -(7:9)])

# Mahalanobis distance of each row from the centroid of the composites
# (the first column, UID, is excluded from the computation).
composites <- d[-1]
m_dist <- mahalanobis(composites, colMeans(composites), cov(composites))
d$MD <- round(m_dist, 1)
plot(d$MD)
describe(m_dist)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 252 4.98 3.13 4.49 4.59 2.6 0.62 22.12 21.5 1.66 4.59 0.2
# Chi-square cutoff (alpha = .01) for the Mahalanobis distances. The degrees of
# freedom must equal the number of variables that went into the distance. `d`
# already carries the added MD column at this point, so ncol(d) - 1 counts one
# variable too many; count only the score columns instead.
md_vars <- setdiff(names(d), c("UID", "MD", "outlier"))
cut <- qchisq(.99, df = length(md_vars))
abline(h = cut, col = "red")

# Flag rows whose (rounded) distance exceeds the cutoff.
# NOTE(review): the corrected df lowers the cutoff, so re-check the outlier
# count quoted in the write-up after re-running.
d$outlier <- d$MD > cut
table(d$outlier)
##
## FALSE TRUE
## 250 2
# IDs of the flagged rows, then a copy of df2 without those participants.
outs <- d[which(d$outlier), c("UID", "outlier")]
df3 <- df2[!(df2$UID %in% outs$UID), ]
Hypothesis 1: There is a significant difference in interest/confidence between men and women
No significant difference in interest between men and women. Significant difference in confidence (p < .001). Men (M = 3.77, SE = .11) are significantly higher in confidence than women (M = 3.26, SE = .07).
# Disabled ANOVA code for Hypothesis 1 (interest and confidence by gender,
# alone and crossed with race group).
# NOTE(review): all models after the first use df4, which is only created
# further down in this file; move that step above this code before re-enabling.
# The first model uses df3, which still contains the "N" gender level.
# table(df3$gender2)
#
# aov_out <- aov_ez(id = "UID", dv = "int", data = df3, between = c("gender2"))
# nice(aov_out)
# afex_plot(aov_out, x = "gender2")
#
# aov_out <- aov_ez(id = "UID", dv = "con", data = df4, between = c("gender2"))
# nice(aov_out)
# emmeans(aov_out, specs = "gender2")
# pairs(emmeans(aov_out, specs = "gender2"))
# afex_plot(aov_out, x = "gender2")
#
# aov_out <- aov_ez(id = "UID", dv = "con", data = df4, between = c("gender2","race_fin2"))
# nice(aov_out)
# emmeans(aov_out, specs = "gender2")
# pairs(emmeans(aov_out, specs = "gender2"))
# afex_plot(aov_out, x = "gender2", trace = "race_fin2")
#
# aov_out <- aov_ez(id = "UID", dv = "int", data = df4, between = c("gender2","race_fin2"))
# nice(aov_out)
# emmeans(aov_out, specs = "gender2")
# pairs(emmeans(aov_out, specs = "gender2"))
# afex_plot(aov_out, x = "gender2", trace = "race_fin2")
Hypothesis 2: There is a significant relationship between shared racial identity and science confidence.
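Shared racial identity is weakly negatively correlated with both interest (r = -.14, unadjusted p = .030) and confidence (r = -.13, unadjusted p = .045); neither correlation remains significant after adjustment for multiple tests (adjusted p = .060 for both).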
# Pairwise correlations among interest, confidence and shared racial identity.
out <- corr.test(df3[c("int", "con", "ri_si")])

# Upper-triangle colour correlogram with significance stars at .001/.01/.05.
# NOTE(review): psych::corr.test documents that p-values above the diagonal are
# adjusted for multiple tests and those below are raw -- confirm the plot is
# meant to show the adjusted ones.
corrplot(
  out$r,
  type = "upper", method = "color", order = "hclust",
  tl.col = "black", tl.cex = .75,
  p.mat = out$p, insig = "label_sig", sig.level = c(.001, .01, .05),
  pch.cex = .9, pch.col = "white"
)
out$r
## int con ri_si
## int 1.0000000 0.6178160 -0.1372743
## con 0.6178160 1.0000000 -0.1266393
## ri_si -0.1372743 -0.1266393 1.0000000
# Print the p-value matrix that goes with the correlations.
out[["p"]]
## int con ri_si
## int 0.000000e+00 1.713435e-28 0.06002838
## con 5.711451e-29 0.000000e+00 0.06002838
## ri_si 3.001419e-02 4.546043e-02 0.00000000
# Keep only participants coded "F" or "M" and drop the now-empty "N" level.
df4 <- df3[which(df3$gender2 != "N"), ]
df4$gender2 <- droplevels(df4$gender2)

# Treat race group as a factor with "white" as the reference category.
df4$race_fin2 <- relevel(as.factor(df4$race_fin2), ref = "white")

# Confidence regressed on shared racial identity, race group and interest.
reg_out <- lm(con ~ ri_si + race_fin2 + int, data = df4)
summary(reg_out)
##
## Call:
## lm(formula = con ~ ri_si + race_fin2 + int, data = df4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.06932 -0.54042 0.00254 0.49161 1.76804
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.42408 0.29093 4.895 1.83e-06 ***
## ri_si -0.03475 0.05973 -0.582 0.561
## race_fin2asian -0.01221 0.14231 -0.086 0.932
## race_fin2latino -0.10394 0.15956 -0.651 0.515
## race_fin2other -0.03325 0.15146 -0.220 0.826
## int 0.58872 0.04931 11.940 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7233 on 235 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.3898, Adjusted R-squared: 0.3768
## F-statistic: 30.03 on 5 and 235 DF, p-value: < 2.2e-16
# Effect plot for the race_fin2 term of the regression above.
plot_model(reg_out, terms = "race_fin2", type = "eff")