library(naniar) # for the gg_miss_upset() command
library(psych) # for the describe() command
library(kableExtra) # for tables
library(corrplot) # for correlation plots
library(car) # for VIF
library(sjPlot) # for regression plots
library(ggplot2) # for plots
# Load the raw data; the path is relative to the working directory.
df <- read.csv("data/data_9-16.csv")
Selected hypotheses:
TMFS:
# Build the analysis data frame: one row per respondent, keyed by UID, with
# each scale score computed as the mean of its items. Because plain arithmetic
# is used, a respondent missing any item of a scale gets NA for that scale.
df2 <- subset(df, select = c(UID))
df2$tmfs_ideal <- (df$tmfs2 + df$tmfs4 + df$tmfs5) / 3
df2$tmfs_consider <- (df$tmfs1 + df$tmfs3 + df$tmfs6) / 3
# NOTE(review): the overall score uses items 2-5 only (the three "ideal"
# items plus item 3); items 1 and 6 are excluded -- confirm this is intended.
df2$tmfs <- (df$tmfs2 + df$tmfs3 + df$tmfs4 + df$tmfs5) / 4
df2$int <- (df$salg1 + df$salg2 + df$salg3 + df$salg4) / 4
df2$con <- (df$salg5 + df$salg6 + df$salg7) / 3

# Recode free-text gender into F / M / N, where "N" is the fallback for
# anything not recognised (including missing responses).
# Responses are trimmed and lower-cased before matching so that variants such
# as "Female ", "FEMALE" or " m" do not each need to be listed by hand.
gender_clean <- tolower(trimws(df$gender))
df2$gender2 <- "N"
df2$gender2[gender_clean %in% c("f", "female", "woman")] <- "F"
df2$gender2[gender_clean %in% c("m", "male", "man")] <- "M"
df2$gender2 <- as.factor(df2$gender2)
# Plot which combinations of variables are missing together.
gg_miss_upset(df2)

# Descriptive statistics for every column except the UID key (column 1).
desc <- describe(df2[-1])

# Rows to highlight in the table: bold when |kurtosis| > 2, italic when
# |skew| > 2.
extreme_kurtosis <- which(abs(desc$kurtosis) > 2)
extreme_skew <- which(abs(desc$skew) > 2)

desc_table <- kable_styling(kable(round(desc, digits = 2)))
desc_table <- row_spec(desc_table, extreme_kurtosis, bold = TRUE)
row_spec(desc_table, extreme_skew, italic = TRUE)
| | vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| tmfs_ideal | 1 | 394 | 5.08 | 1.75 | 5.00 | 5.19 | 1.98 | 1.00 | 8.33 | 7.33 | -0.48 | -0.40 | 0.09 |
| tmfs_consider | 2 | 394 | 4.12 | 1.55 | 4.00 | 4.11 | 1.48 | 1.00 | 7.33 | 6.33 | 0.05 | -0.67 | 0.08 |
| tmfs | 3 | 394 | 4.86 | 1.57 | 5.00 | 4.97 | 1.85 | 1.00 | 7.50 | 6.50 | -0.55 | -0.22 | 0.08 |
| int | 4 | 451 | 3.53 | 0.97 | 3.75 | 3.56 | 1.11 | 1.25 | 5.00 | 3.75 | -0.33 | -0.71 | 0.05 |
| con | 5 | 451 | 3.39 | 0.94 | 3.33 | 3.39 | 0.99 | 1.00 | 5.00 | 4.00 | -0.04 | -0.47 | 0.04 |
| gender2* | 6 | 498 | 1.56 | 0.75 | 1.00 | 1.45 | 0.00 | 1.00 | 3.00 | 2.00 | 0.92 | -0.64 | 0.03 |
Multivariate outliers were identified using Mahalanobis distance; two outliers were dropped.
# Complete cases on the numeric scale scores. gender2 (the 7th column of df2)
# is a factor and is left out of the distance calculation.
d <- na.omit(df2[setdiff(names(df2), "gender2")])

# Squared Mahalanobis distance of each case from the centroid of the scale
# scores; the first column (UID) is an identifier, not a variable.
md_vars <- d[-1]
m_dist <- mahalanobis(md_vars, center = colMeans(md_vars), cov = cov(md_vars))

# Keep a rounded copy alongside the data, then inspect the distribution.
d$MD <- round(m_dist, 1)
plot(d$MD)
describe(m_dist)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 394 4.99 2.97 4.59 4.7 2.82 0.38 19.42 19.04 1.19 2.42 0.15
# Flag multivariate outliers: cases whose squared Mahalanobis distance exceeds
# the 99th percentile of a chi-square distribution with one degree of freedom
# per variable that entered the distance.
# Count only those variables: d also holds the UID key and the derived MD
# column, neither of which may contribute a degree of freedom. (Using
# ncol(d) - 1 at this point would count MD as a variable and give df = 6 for
# a 5-variable distance, making the cutoff too lenient.)
n_vars <- length(setdiff(names(d), c("UID", "MD", "outlier")))
cut <- qchisq(0.99, df = n_vars)

# Draw the cutoff on the distance plot produced earlier.
abline(h = cut, col = "red")

# Compare the exact distances rather than MD, which is rounded to one decimal
# and could push a borderline case across the cutoff.
d$outlier <- unname(m_dist) > cut
table(d$outlier)
##
## FALSE TRUE
## 392 2
# IDs of the cases flagged as multivariate outliers.
outs <- d[which(d$outlier), c("UID", "outlier")]

# Analysis sample: everyone in df2 except the flagged cases. Respondents with
# missing scale scores were never assessed for outliers and are kept here.
is_outlier_id <- df2$UID %in% outs$UID
df3 <- df2[!is_outlier_id, ]
Hypothesis: There is a significant relationship between TMFS (masculine-feminine) and interest/confidence.
The correlation matrix suggests that there is no significant relationship between TMFS and interest (p > .05) or between TMFS and confidence (p > .05).
# Correlations (with p-values) among the interest, confidence and TMFS scores.
out <- corr.test(df3[c("int", "con", "tmfs", "tmfs_ideal", "tmfs_consider")])

# corr.test() returns an asymmetric p matrix: raw p-values below the diagonal
# and p-values adjusted for multiple tests above it. corrplot() reorders p.mat
# together with the correlation matrix when order = "hclust", so passing the
# asymmetric matrix can leave a mix of raw and adjusted p-values in the
# displayed upper triangle. Mirror the adjusted values into the lower triangle
# so the significance marks do not depend on the ordering.
p_adj <- out$p
p_adj[lower.tri(p_adj)] <- t(p_adj)[lower.tri(p_adj)]

corrplot(
  out$r,
  type = "upper", method = "color", order = "hclust",
  tl.col = "black", tl.cex = .75,
  p.mat = p_adj,
  sig.level = c(.001, .01, .05),
  insig = "label_sig", pch.cex = .9, pch.col = "white"
)
out$r
## int con tmfs tmfs_ideal tmfs_consider
## int 1.000000000 0.61173923 0.001803163 -0.05728685 0.09009992
## con 0.611739227 1.00000000 -0.015857991 -0.04630012 -0.02386500
## tmfs 0.001803163 -0.01585799 1.000000000 0.95607678 0.50978219
## tmfs_ideal -0.057286849 -0.04630012 0.956076777 1.00000000 0.35750063
## tmfs_consider 0.090099915 -0.02386500 0.509782192 0.35750063 1.00000000
# p-values from corr.test(): entries below the diagonal are unadjusted,
# entries above the diagonal are adjusted for multiple tests.
print(out$p)
## int con tmfs tmfs_ideal
## int 0.000000e+00 1.746244e-46 1.000000e+00 1.000000e+00
## con 1.940271e-47 0.000000e+00 1.000000e+00 1.000000e+00
## tmfs 9.716118e-01 7.542888e-01 0.000000e+00 5.904070e-209
## tmfs_ideal 2.578325e-01 3.605811e-01 5.904070e-210 0.000000e+00
## tmfs_consider 7.477923e-02 6.375971e-01 2.569236e-27 2.917963e-13
## tmfs_consider
## int 4.486754e-01
## con 1.000000e+00
## tmfs 2.055389e-26
## tmfs_ideal 2.042574e-12
## tmfs_consider 0.000000e+00
Do significant differences in interest/confidence emerge when using TMFS scale versus self-reported gender?
# Keep only respondents whose gender was recoded to F or M ("N" marks an
# unrecognised or missing response), and make M the reference category so the
# gender coefficient is reported for F relative to M.
has_binary_gender <- df3$gender2 != "N"
df3 <- df3[has_binary_gender, ]
df3$gender2 <- relevel(df3$gender2, ref = "M")

# Confidence regressed on TMFS, self-reported gender and interest.
reg_model <- lm(formula = con ~ tmfs + gender2 + int, data = df3)

# Variance inflation factors as a multicollinearity check.
vif(reg_model)
## tmfs gender2 int
## 1.095629 1.106730 1.010879
# Regression diagnostics: draw plot.lm() panels 1 through 5 in turn.
for (panel in seq_len(5L)) {
  plot(reg_model, which = panel)
}

# Coefficient table and overall model fit.
summary(reg_model)
##
## Call:
## lm(formula = con ~ tmfs + gender2 + int, data = df3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.95718 -0.53168 0.00806 0.53513 1.79978
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.56619 0.20181 7.761 8.37e-14 ***
## tmfs 0.02537 0.02501 1.014 0.311
## gender2F -0.40533 0.08696 -4.661 4.40e-06 ***
## int 0.56734 0.04184 13.558 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7241 on 369 degrees of freedom
## (44 observations deleted due to missingness)
## Multiple R-squared: 0.3746, Adjusted R-squared: 0.3695
## F-statistic: 73.67 on 3 and 369 DF, p-value: < 2.2e-16