library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.4 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.1 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(here)
## here() starts at /Users/caoanjie/Desktop/projects/looking_time/adult_analysis
library(stringr)
# Paths to the trimmed rating data, resolved with here() relative to the
# project root.
SIMILARITY_DATA_PATH <- here("data/processed_data/trimmed_similaritydata.csv")
COMPLEXITY_DATA_PATH <- here("data/processed_data/trimmed_complexitydata.csv")
# Load the similarity ratings; read_csv() guesses the column types and reports
# the specification it used.
df.similarity <- read_csv(SIMILARITY_DATA_PATH)
## Parsed with column specification:
## cols(
## subject = col_character(),
## question_type = col_character(),
## stimulus_left = col_character(),
## stimulus_right = col_character(),
## rating = col_double()
## )
# Load the complexity ratings from the trimmed CSV; column types are guessed
# by read_csv().
df.complexity <- read_csv(COMPLEXITY_DATA_PATH)
## Parsed with column specification:
## cols(
## subject = col_character(),
## question_type = col_character(),
## stimulus = col_character(),
## rating = col_double(),
## complexity = col_character()
## )
Sanity check: make sure that the average ratings for similar pairs are higher than those for dissimilar pairs.
# Derive per-trial labels for the similarity ratings.
#   complexity            "complex" if either stimulus name contains "complex";
#                         otherwise "simple" if either name contains "simple";
#                         NA when neither pattern matches.
#   stimulus_left_number  first run of digits found in stimulus_left (numeric).
#   stimulus_right_number first run of digits found in stimulus_right (numeric).
#   similarity            "similar" when the two numbers are equal, otherwise
#                         "dissimilar" (this also covers pairs where a number
#                         could not be extracted, since NA == x is not TRUE).
df.similarity <- df.similarity %>%
  mutate(
    complexity = case_when(
      grepl("complex", stimulus_left) | grepl("complex", stimulus_right) ~ "complex",
      # Check BOTH sides of the pair. The earlier version tested
      # stimulus_right twice, so a "simple" marker that appeared only in
      # stimulus_left was never detected and the row was labelled NA.
      grepl("simple", stimulus_left) | grepl("simple", stimulus_right) ~ "simple"
    ),
    stimulus_left_number = as.numeric(str_extract(stimulus_left, "[[:digit:]]+")),
    stimulus_right_number = as.numeric(str_extract(stimulus_right, "[[:digit:]]+")),
    similarity = case_when(
      stimulus_left_number == stimulus_right_number ~ "similar",
      TRUE ~ "dissimilar"
    )
  )
# Distribution of the raw similarity ratings (30 bins).
ggplot(data = df.similarity, mapping = aes(x = rating)) +
  geom_histogram(bins = 30)
# Ratings by similarity label, one panel per complexity level: jittered raw
# points overlaid with the "mean_cl_boot" summary.
ggplot(data = df.similarity, mapping = aes(x = similarity, y = rating)) +
  geom_point(
    position = position_jitter(width = 0.3),
    alpha = 0.1
  ) +
  stat_summary(fun.data = "mean_cl_boot") +
  facet_wrap(vars(complexity))
# Linear model: rating predicted by similarity label and complexity label
# (additive, no interaction term), followed by its summary table.
lm.similarity <- lm(
  rating ~ similarity + complexity,
  data = df.similarity
)
summary(lm.similarity)
##
## Call:
## lm(formula = rating ~ similarity + complexity, data = df.similarity)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5855 -1.0683 -0.0683 0.9317 3.9317
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.06826 0.04425 46.738 <2e-16 ***
## similaritysimilar 2.49707 0.05085 49.108 <2e-16 ***
## complexitysimple 0.02019 0.05084 0.397 0.691
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.442 on 3217 degrees of freedom
## Multiple R-squared: 0.4285, Adjusted R-squared: 0.4281
## F-statistic: 1206 on 2 and 3217 DF, p-value: < 2.2e-16
# Recompute the complexity label from the stimulus name: "complex" when the
# name contains "complex", else "simple" when it contains "simple", else NA.
# This overwrites the complexity column that was read from the CSV.
df.complexity <- mutate(
  df.complexity,
  complexity = case_when(
    grepl("complex", stimulus) ~ "complex",
    grepl("simple", stimulus) ~ "simple"
  )
)
# Distribution of the raw complexity ratings (30 bins).
ggplot(data = df.complexity, mapping = aes(x = rating)) +
  geom_histogram(bins = 30)
# Ratings by complexity label: jittered raw points overlaid with the
# "mean_cl_boot" summary.
ggplot(data = df.complexity, mapping = aes(x = complexity, y = rating)) +
  geom_point(
    position = position_jitter(width = 0.3),
    alpha = 0.1
  ) +
  stat_summary(fun.data = "mean_cl_boot")
# Linear model: rating predicted by the complexity label alone, followed by
# its summary table.
lm.complexity <- lm(
  rating ~ complexity,
  data = df.complexity
)
summary(lm.complexity)
##
## Call:
## lm(formula = rating ~ complexity, data = df.complexity)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.3649 -1.3649 -0.3649 0.6351 3.6291
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.36488 0.03627 120.35 <2e-16 ***
## complexitysimple -1.99400 0.05128 -38.88 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.453 on 3211 degrees of freedom
## Multiple R-squared: 0.3201, Adjusted R-squared: 0.3199
## F-statistic: 1512 on 1 and 3211 DF, p-value: < 2.2e-16