# File that the per-word mean ratings are written to.
RATINGS_OUT <- "mean_gender_ratings.csv"

# Raw demographics and ratings files, one pair under each of v1/ and v2/.
DEMOPATH <- file.path("raw_data", "v1", "demographics.csv")
RATINGPATH1 <- file.path("raw_data", "v1", "ratings.csv")
DEMOPATH2 <- file.path("raw_data", "v2", "demographics.csv")
RATINGPATH2 <- file.path("raw_data", "v2", "ratings.csv")
# v1 demographics already carry question_name / response_str columns; drop
# the X1 column and lower-case both text columns.
demo1 <- select(read_csv(DEMOPATH), -X1) %>%
  mutate_at(c("question_name", "response_str"), tolower)

# v2 demographics have one column per question: align the column names with
# v1, then reshape to question_name / response_str pairs keyed by subj_id.
demo2 <- read_csv(DEMOPATH2) %>%
  select(-X1) %>%
  rename(subj_id = subjCode, education = degree) %>%
  gather("question_name", "response_str", -subj_id) %>%
  mutate_at(c("question_name", "response_str"), tolower)

# Stack both versions into a single long table.
demo <- bind_rows(demo1, demo2)
# Age per participant. Responses are stored as text, so they are converted
# to numbers here; anything that is not a number becomes NA.
age_data <- demo %>%
  filter(question_name == "age") %>%
  mutate(response_str = as.numeric(response_str)) %>%
  rename(age = response_str) %>%
  select(-question_name)

# Histogram of ages with the overall mean marked in red. The mean is a single
# constant, so it is passed as a fixed xintercept rather than mapped through
# aes() (which would draw one overplotted line per row).
ggplot(age_data, aes(x = age)) +
  geom_histogram(binwidth = 2) +
  geom_vline(xintercept = mean(age_data$age, na.rm = TRUE), color = "red") +
  theme_classic() +
  xlab("age")
# Gender per participant. Responses "1"/"2"/"3" are mapped to labels; every
# other response is kept as entered.
# NOTE(review): the 1 = male, 2 = female, 3 = other coding was flagged
# "check this!!" by the author and still needs confirming.
demo_data <- demo %>%
  filter(question_name == "gender") %>%
  select(-question_name) %>%
  rename(gender = response_str) %>%
  mutate(
    gender = as.character(gender),
    gender = case_when(
      gender == "1" ~ "male",
      gender == "2" ~ "female",
      gender == "3" ~ "other",
      TRUE ~ gender
    ),
    gender = as.factor(gender)
  )

# Number of participants per gender.
kable(count(demo_data, gender))
gender | n |
---|---|
female | 130 |
male | 166 |
other | 3 |
# Age and gender side by side, one row per age record.
age_demo <- age_data %>%
  left_join(demo_data, by = "subj_id") %>%
  mutate(subj_id = as.factor(subj_id))

# Age densities for female and male participants, with a vertical line at
# each group's mean age. Ages come from as.numeric() and may contain NA, so
# the means use na.rm = TRUE; otherwise a single missing age would silently
# drop that group's line. The means are constants, hence outside aes().
age_demo %>%
  filter(gender != "other") %>%
  ggplot(aes(x = age, fill = gender)) +
  geom_density(alpha = .3) +
  geom_vline(
    xintercept = mean(
      age_demo %>% filter(gender == "female") %>% pull(age),
      na.rm = TRUE
    )
  ) +
  geom_vline(
    xintercept = mean(
      age_demo %>% filter(gender == "male") %>% pull(age),
      na.rm = TRUE
    )
  ) +
  theme_classic() +
  xlab("age")
There are more male participants, and they are younger on average.
# Education response per participant, stored as a factor.
edu_data <- demo %>%
  filter(question_name == "education") %>%
  select(-question_name) %>%
  rename(edu = response_str) %>%
  mutate(edu = as.factor(edu))

# Number of participants per education response.
edu_data %>%
  count(edu) %>%
  kable()
edu | n |
---|---|
1 | 2 |
2 | 43 |
3 | 70 |
4 | 77 |
5 | 71 |
6 | 35 |
NA | 1 |
# Native-English response per participant. The question appears under two
# names ("native_english" and "native"), so both are accepted.
native_data <- demo %>%
  filter(question_name %in% c("native_english", "native")) %>%
  rename(native_english = response_str) %>%
  mutate(native_english = as.factor(native_english))

# Number of participants per response.
native_data %>%
  count(native_english) %>%
  kable()
native_english | n |
---|---|
0 | 6 |
1 | 293 |
Each facet is a participant. All seemed to use the whole scale.
# Read both ratings files; the second one names the participant column
# subjCode, so it is renamed to subj_id to match the first.
ratings1 <- read_csv(RATINGPATH1)
ratings2 <- rename(read_csv(RATINGPATH2), subj_id = subjCode)

# Stack the two files and keep only trials that have a response.
ratings <- filter(bind_rows(ratings1, ratings2), !is.na(resp))
Let’s exclude the 6 non-native speakers and the 3 participants who reported their gender as “other”.
# Participants who answered 0 to the native-English question.
non_natives <- native_data %>%
  filter(native_english == 0) %>%
  pull(subj_id)

# Participants whose gender response is "other" (despite the variable name,
# these are not participants with a missing gender).
no_gender <- demo_data %>%
  filter(gender == "other") %>%
  pull(subj_id)

# Ratings from the remaining participants, with education, gender and age
# attached. The joins use whatever columns the tables have in common.
ratings_clean <- ratings %>%
  filter(!(subj_id %in% non_natives)) %>%
  filter(!(subj_id %in% no_gender)) %>%
  left_join(edu_data) %>%
  left_join(demo_data) %>%
  left_join(age_data) %>%
  # edu is a factor: go through character so we get the recorded value rather
  # than the factor's internal level code (the two differ whenever a level is
  # absent or the levels are not exactly 1..k).
  mutate(edu = as.numeric(as.character(edu))) %>%
  select(-X1)
# Number of ratings collected for each word, most-rated words first.
ratings_per_word <- ratings_clean %>%
  count(word) %>%
  arrange(desc(n))
#ggplot(ratings_per_word, aes(x = n)) +
# geom_histogram(binwidth = 1) +
# theme_classic()
There are 2280 items, with 10-12 participants/item.
# Per-word summary of the ratings, sorted from lowest to highest mean.
# NOTE(review): multi_boot_standard is defined outside this file; it
# presumably returns a bootstrapped mean with ci_lower / ci_upper columns
# (those names are used further down) - confirm.
mean_ratings <- ratings_clean %>%
  group_by(word) %>%
  multi_boot_standard(col = "resp", na.rm = TRUE) %>%
  arrange(mean) %>%
  rename(mean_gender_rating = mean)

# Save the per-word summary.
write_csv(mean_ratings, RATINGS_OUT)

# Distribution of per-word means; the red line marks the grand mean. It is a
# single constant, so it is passed as a fixed xintercept instead of through
# aes().
ggplot(mean_ratings, aes(x = mean_gender_rating)) +
  geom_histogram() +
  theme_classic(base_size = 15) +
  ggtitle("Mean ratings by item") +
  xlab("Mean gender rating (feminine-ness)") +
  geom_vline(
    xintercept = mean(mean_ratings$mean_gender_rating),
    color = "red"
  )
Here are the words sorted by mean rating:
# Interactive table with word and mean rating as the first two columns.
DT::datatable(
  select(mean_ratings, word, mean_gender_rating, everything())
)
# Location of the Glasgow norms, named like the other input paths.
# NOTE(review): this is an absolute path on one machine; the file needs to
# move next to the other raw data for the analysis to run elsewhere.
GLASGOW_PATH <- "/Users/mollylewis/Documents/research/Projects/1_in_progress/KIDBOOK_GENDER/data/raw/norms/GlasgowNorms.csv"

# Glasgow gender rating (GEND_M column) for each word.
glasgow <- read_csv(GLASGOW_PATH) %>%
  select(word, GEND_M) %>%
  rename(gender_rating_glasgow = GEND_M)

# Words present in both data sets, keeping only the two ratings to compare.
# The join key is spelled out so it cannot silently change if either table
# gains a column.
merged_ratings <- glasgow %>%
  inner_join(mean_ratings, by = "word") %>%
  select(gender_rating_glasgow, mean_gender_rating)
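951 of our words overlapped with the Glasgow data set. The two sets of ratings are strongly correlated, r = -.90; the correlation is negative because the Glasgow norms rate maleness whereas ours rate femaleness.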
# Our mean rating against the Glasgow rating for each shared word, with a
# linear fit drawn over the points.
ggplot(
  merged_ratings,
  aes(x = mean_gender_rating, y = gender_rating_glasgow)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    title = "Glasgow vs. Our ratings ",
    x = "our gender ratings (femaleness)",
    y = "Glasgow gender ratings (maleness)"
  ) +
  theme_classic()

# Pearson correlation between the two sets of ratings.
cor.test(merged_ratings$mean_gender_rating, merged_ratings$gender_rating_glasgow)
##
## Pearson's product-moment correlation
##
## data: merged_ratings$mean_gender_rating and merged_ratings$gender_rating_glasgow
## t = -64.063, df = 949, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.9125091 -0.8885522
## sample estimates:
## cor
## -0.9012169
# Per-word summary of the ratings, computed separately for each participant
# gender. ratings_clean already carries the gender column, so demo_data is
# not joined a second time. The "other" filter is kept as a guard; it also
# drops rows whose gender is missing.
# na.rm = TRUE matches the overall per-word summary.
mean_ratings_gender <- ratings_clean %>%
  filter(gender != "other") %>%
  group_by(word, gender) %>%
  multi_boot_standard(col = "resp", na.rm = TRUE) %>%
  arrange(mean) %>%
  rename(mean_gender_rating = mean)
# One row per word with the female and male columns side by side, built
# separately for the mean, the lower CI bound and the upper CI bound.
ratings_by_gender <- mean_ratings_gender %>%
  select(word, gender, mean_gender_rating) %>%
  spread(gender, mean_gender_rating) %>%
  rename(mean_female = female, mean_male = male)

ratings_by_gender_low <- mean_ratings_gender %>%
  select(word, gender, ci_lower) %>%
  spread(gender, ci_lower) %>%
  rename(ci_lower_female = female, ci_lower_male = male)

ratings_by_gender_high <- mean_ratings_gender %>%
  select(word, gender, ci_upper) %>%
  spread(gender, ci_upper) %>%
  rename(ci_upper_female = female, ci_upper_male = male)

# Combine the three wide tables into one.
all_ratings <- full_join(
  full_join(ratings_by_gender, ratings_by_gender_low),
  ratings_by_gender_high
)
# Words where the two groups differ: the male mean lies outside the female
# interval AND the female mean lies outside the male interval. Despite the
# "_incl" suffix, each flag is TRUE when the other group's mean is NOT
# inside the interval.
all_ratings_gender_diff <- all_ratings %>%
  mutate(
    female_incl = !(ci_upper_female >= mean_male &
      ci_lower_female <= mean_male),
    male_incl = !(ci_upper_male >= mean_female &
      ci_lower_male <= mean_female)
  ) %>%
  filter(female_incl & male_incl)
# Distribution of per-word mean ratings, one density per participant gender.
ggplot(mean_ratings_gender, aes(x = mean_gender_rating, fill = gender)) +
  geom_density(alpha = .4) +
  labs(
    title = "Mean rating by participant gender",
    x = "Mean gender rating (feminine-ness)"
  ) +
  theme_classic(base_size = 15)
# Female-participant mean against male-participant mean for every word.
# Points on the diagonal were rated the same by both groups; grey bars show
# each group's interval.
ggplot(all_ratings, aes(x = mean_male, y = mean_female)) +
  geom_abline(intercept = 0, slope = 1) +
  annotate(
    "text",
    x = 4.5, y = 1.5,
    label = "females < males", color = "red", size = 3
  ) +
  annotate(
    "text",
    x = 1.6, y = 4.5,
    label = "females > males", color = "red", size = 3
  ) +
  geom_errorbarh(
    aes(xmin = ci_lower_male, xmax = ci_upper_male),
    height = 0, color = "grey"
  ) +
  geom_errorbar(
    aes(ymin = ci_lower_female, ymax = ci_upper_female),
    width = 0, color = "grey"
  ) +
  geom_point(size = .4) +
  labs(
    x = "male-participant feminine-ness rating",
    y = "female-participant feminine-ness rating"
  ) +
  theme_classic(base_size = 15)
# Same comparison for every word, with the word itself drawn in place of a
# point.
ggplot(all_ratings, aes(x = mean_male, y = mean_female)) +
  geom_abline(intercept = 0, slope = 1) +
  geom_text(aes(label = word), size = 1.5) +
  annotate(
    "text",
    x = 4.5, y = 1.5,
    label = "females < males", color = "red", size = 3
  ) +
  annotate(
    "text",
    x = 1.6, y = 4.5,
    label = "females > males", color = "red", size = 3
  ) +
  labs(
    x = "male-participant femaleness rating",
    y = "female-participant femaleness rating"
  ) +
  theme_classic()
# Female- against male-participant means, restricted to the words in
# all_ratings_gender_diff; grey bars show each group's interval.
ggplot(all_ratings_gender_diff, aes(x = mean_male, y = mean_female)) +
  geom_abline(intercept = 0, slope = 1) +
  annotate(
    "text",
    x = 4.5, y = 1.5,
    label = "females < males", color = "red", size = 3
  ) +
  annotate(
    "text",
    x = 1.6, y = 4.5,
    label = "females > males", color = "red", size = 3
  ) +
  geom_errorbarh(
    aes(xmin = ci_lower_male, xmax = ci_upper_male),
    height = 0, color = "grey"
  ) +
  geom_errorbar(
    aes(ymin = ci_lower_female, ymax = ci_upper_female),
    width = 0, color = "grey"
  ) +
  geom_point(size = .4) +
  labs(
    x = "male-participant feminine-ness rating",
    y = "female-participant feminine-ness rating"
  ) +
  theme_classic(base_size = 15)
# The words in all_ratings_gender_diff, drawn as text at their female- and
# male-participant means.
ggplot(all_ratings_gender_diff, aes(x = mean_male, y = mean_female)) +
  geom_abline(intercept = 0, slope = 1) +
  annotate(
    "text",
    x = 4.5, y = 1.5,
    label = "females < males", color = "red", size = 3
  ) +
  geom_text(aes(label = word), size = 2) +
  annotate(
    "text",
    x = 1.6, y = 4.5,
    label = "females > males", color = "red", size = 3
  ) +
  labs(
    x = "male-participant feminine-ness rating",
    y = "female-participant feminine-ness rating"
  ) +
  theme_classic(base_size = 15)