Here’s an overview of the packages we will need:
#LIBRARIES
library(plyr) #for summary tables throughout
library(dplyr) #because you need both?
library(reshape2) #for other data wrangling
library(tidyverse) #for science
library(ggplot2) #for all plots
# library(ggdark) #convert themes to dark backgrounds
library(readxl) #for reading in separate sheets from excel
# devtools::source_gist("2a1bb0133ff568cbe28d", filename = "geom_flat_violin.R") # raincloud plots. source = 10.7287/peerj.preprints.27137v1
library(lme4) #for linear mixed regressions
library(lmerTest) #for pvalues from lmes
# library(wesanderson) #for pleasing plot colors
# library(cowplot) #for combining plots
# library(broom) #for tidy model outputs
library(MASS) #for negative binomial regression
library(pscl) #for likelihood ratio test of a negative binomial model
library(jtools) #for actually legible model printouts -- where have you been all my life??
#FUNCTIONS
number_ticks <- function(n) {function(limits) pretty(limits, n)}#pretty scale breaks function
(also include some summary stats) This takes 2 files. The first file is the actual annotations (“bks” - BKStokens), the second file are full demographics. This section merges the files and reduces the factor creep of some of the annotations. Then it creates a summary of counts of fingerspelling by the item and participant variables and calculates proportions for fingerspelling per participant
import1 <- read_excel("BKS.xlsx", sheet = "export")
# import1
import2 <- read_excel("BKSstudyparticipants.xlsx", sheet = "export") #,range = cell_cols("A:H"))
# import2
full_df <- merge(import2, import1, by = "Participant")
#turn strings into factors
full_df <- full_df %>%
mutate_if(sapply(full_df, is.character), as.factor)
# str(full_df)
#reduce system coding to 1, 2, and other
summary(full_df$System) #ah = 4, b1-2 = 1, bah = 3
## 1 2 ah B1-2 bah
## 898 38 4 1 3
full_df$System <- mapvalues(full_df$System, from = c("ah", "bah","B1-2"), to = c("other", "other","other"))
#reduce uncertain coding to one category
full_df$Type <- mapvalues(full_df$Type, from = c("FULL?", "INIT?","MIX?","RED?"), to = c("?", "?","?", "?"))
summary(full_df$Type)
## ? FULL INIT MIX RED
## 13 360 270 231 70
type_df <- subset(full_df, !Type %in% c("?"))
nrow(full_df)-nrow(type_df) #removing uncertain coding excludes 13 observations.
## [1] 13
full_df <- type_df
full_df$Type <- factor(full_df$Type)
summary(full_df$Type)
## FULL INIT MIX RED
## 360 270 231 70
rm(type_df)
#reduce word class coding to Nloc, Nprop, and other
summary(full_df$WordClass)
## Adj Conj INT NLoc NorV Nprop NProp PREP SUB Unsure
## 8 1 1 223 1 2 672 3 1 19
full_df$WordClassOriginal <- full_df$WordClass
full_df$WordClassFull <- mapvalues(full_df$WordClassOriginal,
from = c("Adj", "Conj","INT","NorV","Nprop","NProp","PREP","SUB","Unsure"),
to = c("Adj", "Conj","INT", "NorV","NProp", "NProp","PREP","NLoc","Unsure"))
summary(full_df$WordClassFull)
## Adj Conj INT NLoc NorV NProp PREP Unsure
## 8 1 1 224 1 674 3 19
full_df$WordClass <- mapvalues(full_df$WordClassFull,
from = c("Adj", "Conj", "INT", "NLoc", "NorV", "NProp", "PREP","Unsure"),
to = c("other", "other","other", "NLoc","other","NProp","other","other"))
class_df <- subset(full_df, !WordClass %in% c("other"))
nrow(full_df)-nrow(class_df) #removing uncertain coding excludes 33 observations.
## [1] 33
full_df <- class_df
full_df$WordClass <- factor(full_df$WordClass)
summary(full_df$WordClass)
## NLoc NProp
## 224 674
rm(class_df)
Do demographic variables affect fingerspelling frequency?
demographics <- ddply(full_df, c("Participant","Region","Age","Gender"),summarise,
Count = length(Participant),
Signs = mean(SignCount),
Freq = round(Count/Signs,4)*100,
LogFreq = log(Count/Signs))
# demographics
d_mod <- glm(Count~Age+Region+Gender+offset(log(Signs)), data = demographics, family = poisson)
summ(d_mod)
## MODEL INFO:
## Observations: 29
## Dependent Variable: Count
## Type: Generalized linear model
## Family: poisson
## Link function: log
##
## MODEL FIT:
## <U+03C7>²(4) = 100.46, p = 0.00
## Pseudo-R² (Cragg-Uhler) = 0.97
## Pseudo-R² (McFadden) = 0.20
## AIC = 421.85, BIC = 428.68
##
## Standard errors: MLE
## ----------------------------------------------------
## Est. S.E. z val. p
## --------------------- ------- ------ -------- ------
## (Intercept) -3.17 0.13 -24.23 0.00
## Age -0.01 0.00 -7.70 0.00
## RegionOslo 0.37 0.11 3.41 0.00
## RegionTrondheim 0.39 0.10 3.75 0.00
## GenderMale -0.12 0.07 -1.70 0.09
## ----------------------------------------------------
# #test foroverdispersion following winter 227-9 if using only a generalized linear regression
#library(MASS)
d_mod_nb <- glm.nb(Count~Age+Region+Gender+offset(log(Signs)), data = demographics)
summ(d_mod_nb)
## Error in glm.control(...) :
## unused argument (offset = c(6.28599809450886, 6.42648845745769, 6.52941883826223, 7.9298464297425, 6.93342302573071, 6.79234442747081, 6.8596149036542, 7.11882624906208, 6.73815249459596, 7.44716835960004, 6.91869521902047, 6.54391184556479, 6.70686233660275, 7.01301578963963, 7.29437729928882, 7.22548147278229, 6.67456139181443, 6.4393503711001, 6.6631326959908, 6.40522845803084, 7.2291138777933, 6.80128303447162, 6.4707995037826, 6.3919171133926, 6.52356230614951, 7.22329567956231, 7.66528471847135, 7.24708058458576,
## 7.29641326877392))
## MODEL INFO:
## Observations: 29
## Dependent Variable: Count
## Type: Generalized linear model
## Family: Negative Binomial(3.5513)
## Link function: log
##
## MODEL FIT:
## <U+03C7>²(NA) = NA, p = NA
## Pseudo-R² (Cragg-Uhler) = NA
## Pseudo-R² (McFadden) = NA
## AIC = 253.99, BIC = 262.19
##
## Standard errors: MLE
## ----------------------------------------------------
## Est. S.E. z val. p
## --------------------- ------- ------ -------- ------
## (Intercept) -2.80 0.37 -7.55 0.00
## Age -0.02 0.01 -3.49 0.00
## RegionOslo 0.31 0.30 1.04 0.30
## RegionTrondheim 0.42 0.28 1.48 0.14
## GenderMale -0.16 0.22 -0.71 0.48
## ----------------------------------------------------
#library(pscl)
odTest(d_mod_nb)
## Likelihood ratio test of H0: Poisson, as restricted NB model:
## n.b., the distribution of the test-statistic under H0 is non-standard
## e.g., see help(odTest) for details/references
##
## Critical value of test statistic at the alpha= 0.05 level: 2.7055
## Chi-Square Test Statistic = 169.8594 p-value = < 2.2e-16
#winter p.129
hist(residuals(d_mod_nb), col = "skyblue") #not great
{qqnorm(residuals(d_mod_nb))
qqline(residuals(d_mod_nb))} #fine?
plot(fitted(d_mod_nb), residuals(d_mod_nb)) #no idea what this is showing
effect_plot(d_mod_nb, pred = Age, interval = TRUE, partial.residuals = TRUE,jitter = c(0.1,0),
y.label = "rate of fgs", x.label = "age at time of signing")
## Using data demographics from global environment. This could cause incorrect
## results if demographics has been altered since the model was fit. You can
## manually provide the data to the "data =" argument.
## Outcome is based on a total of 1 exposures
effect_plot(d_mod_nb, pred = Region, interval = TRUE, partial.residuals = TRUE,jitter = c(0.1,0),
y.label = "rate of fgs", x.label = "region")
## Using data demographics from global environment. This could cause incorrect
## results if demographics has been altered since the model was fit. You can
## manually provide the data to the "data =" argument.
## Outcome is based on a total of 1 exposures
effect_plot(d_mod_nb, pred = Gender, interval = TRUE, partial.residuals = TRUE,jitter = c(0.1,0),
y.label = "rate of fgs", x.label = "gender")
## Using data demographics from global environment. This could cause incorrect
## results if demographics has been altered since the model was fit. You can
## manually provide the data to the "data =" argument.
## Outcome is based on a total of 1 exposures
# demo_age <- ddply(demographics, c("AgeRange"),summarise, average = round(mean(Freq),2))
# demo_age
# ggplot(data=demographics, aes(x=Age, y=Freq, fill=Region)) +
# # geom_bar(stat="identity", position=position_dodge()) +
# # geom_boxplot(aes(x=Age, y=Freq, fill=Region)) +
# + geom_point()
# scale_fill_brewer(palette="Oranges") +
# scale_y_continuous(breaks=number_ticks(10)) +
# # coord_cartesian(ylim=c(0,24)) +
# # facet_wrap(~Gender, nrow = 1, strip.position = "top") +
# # geom_errorbar(data=demo_age, aes(AgeRange, ymax = average, ymin = average),
# # size=0.75, linetype = "solid", color = "black", inherit.aes = F, width = .9) +
# theme_bw()
ggplot(demographics, aes(x=Age, y=Freq, color=Region, shape=Region)) +
geom_point() +
geom_smooth(method=lm, se=FALSE, fullrange=TRUE) +
scale_color_brewer(palette="Dark2") +
theme_bw()
We were unable to run a Mixed effects generalized linear regression since we have not yet collected balanced groups for all demographic categories. A likelihood ratio test of a negative binomial model again a Poisson model revealed significant difference chi-squared(1) = 169.86, p < .0001. The more conservative model was selected. This revealed more fingerspelling for younger signs than older signers. No main effects of gender or region.
Does sharing geographical knowledge impact how signers fingerspell?
# full_df
fgs <- ddply(full_df, c("Participant","WordClass","RegionShared","Type"),summarise,
Count = length(Participant),
Signs = mean(SignCount),
Freq = round(Count/Signs,4)*100,
LogFreq = log(Count/Signs))
# fgs
# contrast coding if needed
# fgs <- mutate(fgs, RegionSum = RegionShared)
# contrasts(fgs$RegionSum) <- contr.sum(2) #RegionSum1 = different
# fgs
f_mmod <- glmer(Count~Type * RegionShared + WordClass + offset(log(Signs)) + (1+Type|Participant), data = fgs, family = poisson)
summ(f_mmod)
## MODEL INFO:
## Observations: 156
## Dependent Variable: Count
## Type: Mixed effects generalized linear regression
## Error Distribution: poisson
## Link function: log
##
## MODEL FIT:
## AIC = 948.62, BIC = 1006.56
## Pseudo-R² (fixed effects) = 0.36
## Pseudo-R² (total) = 0.83
##
## FIXED EFFECTS:
## --------------------------------------------------------------
## Est. S.E. z val. p
## ------------------------------- ------- ------ -------- ------
## (Intercept) -5.45 0.21 -25.76 0.00
## TypeINIT -0.98 0.30 -3.27 0.00
## TypeMIX -0.92 0.36 -2.57 0.01
## TypeRED -1.83 0.36 -5.14 0.00
## RegionSharedsame -0.15 0.26 -0.58 0.56
## WordClassNProp 0.79 0.09 9.18 0.00
## TypeINIT:RegionSharedsame 1.02 0.36 2.79 0.01
## TypeMIX:RegionSharedsame 0.82 0.44 1.87 0.06
## TypeRED:RegionSharedsame 0.71 0.39 1.85 0.06
## --------------------------------------------------------------
##
## RANDOM EFFECTS:
## ---------------------------------------
## Group Parameter Std. Dev.
## ------------- ------------- -----------
## Participant (Intercept) 0.62
## Participant TypeINIT 0.82
## Participant TypeMIX 1.01
## Participant TypeRED 0.37
## ---------------------------------------
##
## Grouping variables:
## -------------------------------
## Group # groups ICC
## ------------- ---------- ------
## Participant 29 0.28
## -------------------------------
hist(residuals(f_mmod), col = "skyblue")
{qqnorm(residuals(f_mmod))
qqline(residuals(f_mmod))} #winter p.129 needs the {} to add qqline
plot(fitted(f_mmod), residuals(f_mmod)) #no idea what this is showing
#Winter p. 77 R-squared is an effect size measurement from 0-1. High values indicate larger effects.
effect_plot(f_mmod, pred = RegionShared, interval = TRUE, partial.residuals = TRUE,jitter = c(0.1,0),
y.label = "rate of fgs", x.label = "shared geographical knowledge")
## Using data fgs from global environment. This could cause incorrect results
## if fgs has been altered since the model was fit. You can manually provide
## the data to the "data =" argument.
## Outcome is based on a total of 1 exposures
## Confidence intervals for merMod models is an experimental feature. The
## intervals reflect only the variance of the fixed effects, not the random
## effects.
effect_plot(f_mmod, pred = Type, interval = TRUE, partial.residuals = TRUE,jitter = c(0.1,0),
y.label = "rate of fgs", x.label = "type of fgs")
## Using data fgs from global environment. This could cause incorrect results
## if fgs has been altered since the model was fit. You can manually provide
## the data to the "data =" argument.
## Outcome is based on a total of 1 exposures
## Confidence intervals for merMod models is an experimental feature. The
## intervals reflect only the variance of the fixed effects, not the random
## effects.
effect_plot(f_mmod, pred = WordClass, interval = TRUE, partial.residuals = TRUE,jitter = c(0.1,0),
y.label = "rate of fgs", x.label = "word class")
## Using data fgs from global environment. This could cause incorrect results
## if fgs has been altered since the model was fit. You can manually provide
## the data to the "data =" argument.
## Outcome is based on a total of 1 exposures
## Confidence intervals for merMod models is an experimental feature. The
## intervals reflect only the variance of the fixed effects, not the random
## effects.
levels(fgs$Type)
## [1] "FULL" "INIT" "MIX" "RED"
# freq_df <- freq_df %>% mutate(Type = fct_reorder(Type, LogFreq)) #order by another value. doesn't work?
fgs$Type <- factor(fgs$Type, levels = c("FULL","INIT","MIX","RED")) #manually set order
# freq_df <- freq_df %>% mutate(RegionShared = fct_reorder(RegionShared, Freq))
fgs_means <- ddply(fgs, c("Type","WordClass"),summarise, average = round(mean(Freq),2))
fgs_means
## Type WordClass average
## 1 FULL NLoc 0.55
## 2 FULL NProp 0.90
## 3 INIT NLoc 0.22
## 4 INIT NProp 0.89
## 5 MIX NLoc 0.18
## 6 MIX NProp 0.96
## 7 RED NLoc 0.19
## 8 RED NProp 0.29
ggplot(data=fgs, aes(x=Type, y=Freq, fill=RegionShared)) +
# geom_bar(stat="identity", position=position_dodge()) +
geom_boxplot(aes(x=Type, y=Freq, fill=RegionShared)) +
scale_fill_brewer(palette="Blues") +
scale_y_continuous(breaks=number_ticks(10)) +
# coord_cartesian(ylim=c(0,24)) +
facet_wrap(~WordClass, nrow = 1, strip.position = "top") +
geom_errorbar(data=fgs_means, aes(Type, ymax = average, ymin = average),
size=0.75, linetype = "solid", color = "red", inherit.aes = F, width = .9) +
theme_bw()
We ran a Mixed effects generalized linear regression that included participant variation as a random effect to explore the contribution of shared geographic knowledge and noun type on the frequency of fingerspelling type. R-squared and stuff. Interaction of participant effects with predictors and whatnot.
Common nouns are fingerspelled less than proper nouns. Full common noun fingerspells more common than other strategies. Reduced proper noun fingerspellings less common than other proper noun strategies. (check red lines)
Full fingerspellings of common nouns are more frequent without commonground. Mostly equal rates of full fingerspelling of proper nouns given common ground but initializations mixing and reduction are significantly more common when commonground is shared. (check boxplot color differences)