Load libraries and functions

Here’s an overview of the packages we will need:

#LIBRARIES
library(plyr) #for summary tables throughout
library(dplyr) #because you need both?
library(reshape2) #for other data wrangling
library(tidyverse) #for science
library(ggplot2) #for all plots
# library(ggdark) #convert themes to dark backgrounds
library(readxl) #for reading in separate sheets from excel
# devtools::source_gist("2a1bb0133ff568cbe28d", filename = "geom_flat_violin.R") # raincloud plots. source = 10.7287/peerj.preprints.27137v1

library(lme4) #for linear mixed regressions
library(lmerTest) #for pvalues from lmes
# library(wesanderson) #for pleasing plot colors
# library(cowplot) #for combining plots
# library(broom) #for tidy model outputs
library(MASS) #for negative binomial regression
library(pscl) #for likelihood ratio test of a negative binomial model

library(jtools) #for actually legible model printouts -- where have you been all my life??

#FUNCTIONS
number_ticks <- function(n) {function(limits) pretty(limits, n)}#pretty scale breaks function

Dataframe preparation

(also include some summary stats) This takes 2 files. The first file is the actual annotations (“bks” - BKStokens), the second file are full demographics. This section merges the files and reduces the factor creep of some of the annotations. Then it creates a summary of counts of fingerspelling by the item and participant variables and calculates proportions for fingerspelling per participant

import1 <- read_excel("BKS.xlsx", sheet = "export")
# import1
import2 <- read_excel("BKSstudyparticipants.xlsx", sheet = "export") #,range = cell_cols("A:H"))
# import2
full_df <- merge(import2, import1, by = "Participant")

#turn strings into factors
full_df <- full_df %>%
  mutate_if(sapply(full_df, is.character), as.factor)
# str(full_df)

Collapse and summarize factors

#reduce system coding to 1, 2, and other
summary(full_df$System) #ah = 4, b1-2 = 1, bah = 3

##    1    2   ah B1-2  bah 
##  898   38    4    1    3

full_df$System <- mapvalues(full_df$System, from = c("ah", "bah","B1-2"), to = c("other", "other","other"))

#reduce uncertain coding to one category
full_df$Type <- mapvalues(full_df$Type, from = c("FULL?", "INIT?","MIX?","RED?"), to = c("?", "?","?", "?"))
summary(full_df$Type)

##    ? FULL INIT  MIX  RED 
##   13  360  270  231   70

type_df <- subset(full_df, !Type %in% c("?"))
nrow(full_df)-nrow(type_df) #removing uncertain coding excludes 13 observations.

## [1] 13

full_df <- type_df
full_df$Type <- factor(full_df$Type)
summary(full_df$Type)

## FULL INIT  MIX  RED 
##  360  270  231   70

rm(type_df)

#reduce word class coding to Nloc, Nprop, and other
summary(full_df$WordClass)

##    Adj   Conj    INT   NLoc   NorV  Nprop  NProp   PREP    SUB Unsure 
##      8      1      1    223      1      2    672      3      1     19

full_df$WordClassOriginal <- full_df$WordClass
 
full_df$WordClassFull <- mapvalues(full_df$WordClassOriginal, 
                      from = c("Adj", "Conj","INT","NorV","Nprop","NProp","PREP","SUB","Unsure"), 
                      to = c("Adj", "Conj","INT", "NorV","NProp", "NProp","PREP","NLoc","Unsure"))

summary(full_df$WordClassFull)

##    Adj   Conj    INT   NLoc   NorV  NProp   PREP Unsure 
##      8      1      1    224      1    674      3     19

full_df$WordClass <- mapvalues(full_df$WordClassFull, 
                      from = c("Adj", "Conj", "INT", "NLoc", "NorV", "NProp", "PREP","Unsure"), 
                      to = c("other", "other","other", "NLoc","other","NProp","other","other"))

class_df <- subset(full_df, !WordClass %in% c("other"))
nrow(full_df)-nrow(class_df) #removing uncertain coding excludes 33 observations.

## [1] 33

full_df <- class_df
full_df$WordClass <- factor(full_df$WordClass)
summary(full_df$WordClass)

##  NLoc NProp 
##   224   674

rm(class_df)

Demo Model

Do demographic variables affect fingerspelling frequency?

demographics <- ddply(full_df, c("Participant","Region","Age","Gender"),summarise, 
                  Count = length(Participant),
                  Signs = mean(SignCount),
                  Freq = round(Count/Signs,4)*100,
                  LogFreq = log(Count/Signs))
# demographics

d_mod <- glm(Count~Age+Region+Gender+offset(log(Signs)), data = demographics, family = poisson)
summ(d_mod)

## MODEL INFO:
## Observations: 29
## Dependent Variable: Count
## Type: Generalized linear model
##   Family: poisson 
##   Link function: log 
## 
## MODEL FIT:
## <U+03C7>²(4) = 100.46, p = 0.00
## Pseudo-R² (Cragg-Uhler) = 0.97
## Pseudo-R² (McFadden) = 0.20
## AIC = 421.85, BIC = 428.68 
## 
## Standard errors: MLE
## ----------------------------------------------------
##                          Est.   S.E.   z val.      p
## --------------------- ------- ------ -------- ------
## (Intercept)             -3.17   0.13   -24.23   0.00
## Age                     -0.01   0.00    -7.70   0.00
## RegionOslo               0.37   0.11     3.41   0.00
## RegionTrondheim          0.39   0.10     3.75   0.00
## GenderMale              -0.12   0.07    -1.70   0.09
## ----------------------------------------------------

# #test foroverdispersion following winter 227-9 if using only a generalized linear regression
#library(MASS)
d_mod_nb <- glm.nb(Count~Age+Region+Gender+offset(log(Signs)), data = demographics)
summ(d_mod_nb)

## Error in glm.control(...) : 
##   unused argument (offset = c(6.28599809450886, 6.42648845745769, 6.52941883826223, 7.9298464297425, 6.93342302573071, 6.79234442747081, 6.8596149036542, 7.11882624906208, 6.73815249459596, 7.44716835960004, 6.91869521902047, 6.54391184556479, 6.70686233660275, 7.01301578963963, 7.29437729928882, 7.22548147278229, 6.67456139181443, 6.4393503711001, 6.6631326959908, 6.40522845803084, 7.2291138777933, 6.80128303447162, 6.4707995037826, 6.3919171133926, 6.52356230614951, 7.22329567956231, 7.66528471847135, 7.24708058458576, 
## 7.29641326877392))

## MODEL INFO:
## Observations: 29
## Dependent Variable: Count
## Type: Generalized linear model
##   Family: Negative Binomial(3.5513) 
##   Link function: log 
## 
## MODEL FIT:
## <U+03C7>²(NA) = NA, p = NA
## Pseudo-R² (Cragg-Uhler) = NA
## Pseudo-R² (McFadden) = NA
## AIC = 253.99, BIC = 262.19 
## 
## Standard errors: MLE
## ----------------------------------------------------
##                          Est.   S.E.   z val.      p
## --------------------- ------- ------ -------- ------
## (Intercept)             -2.80   0.37    -7.55   0.00
## Age                     -0.02   0.01    -3.49   0.00
## RegionOslo               0.31   0.30     1.04   0.30
## RegionTrondheim          0.42   0.28     1.48   0.14
## GenderMale              -0.16   0.22    -0.71   0.48
## ----------------------------------------------------

#library(pscl)
odTest(d_mod_nb)

## Likelihood ratio test of H0: Poisson, as restricted NB model:
## n.b., the distribution of the test-statistic under H0 is non-standard
## e.g., see help(odTest) for details/references
## 
## Critical value of test statistic at the alpha= 0.05 level: 2.7055 
## Chi-Square Test Statistic =  169.8594 p-value = < 2.2e-16

#winter p.129
hist(residuals(d_mod_nb), col = "skyblue") #not great

{qqnorm(residuals(d_mod_nb)) 
  qqline(residuals(d_mod_nb))} #fine?

plot(fitted(d_mod_nb), residuals(d_mod_nb)) #no idea what this is showing

effect_plot(d_mod_nb, pred = Age, interval = TRUE, partial.residuals = TRUE,jitter = c(0.1,0),
            y.label = "rate of fgs", x.label = "age at time of signing")

## Using data demographics from global environment. This could cause incorrect
## results if demographics has been altered since the model was fit. You can
## manually provide the data to the "data =" argument.

## Outcome is based on a total of 1 exposures

effect_plot(d_mod_nb, pred = Region, interval = TRUE, partial.residuals = TRUE,jitter = c(0.1,0),
            y.label = "rate of fgs", x.label = "region")

## Using data demographics from global environment. This could cause incorrect
## results if demographics has been altered since the model was fit. You can
## manually provide the data to the "data =" argument.
## Outcome is based on a total of 1 exposures

effect_plot(d_mod_nb, pred = Gender, interval = TRUE, partial.residuals = TRUE,jitter = c(0.1,0),
            y.label = "rate of fgs", x.label = "gender")

## Using data demographics from global environment. This could cause incorrect
## results if demographics has been altered since the model was fit. You can
## manually provide the data to the "data =" argument.
## Outcome is based on a total of 1 exposures

Demo Plot

# demo_age <- ddply(demographics, c("AgeRange"),summarise, average = round(mean(Freq),2))
# demo_age
# ggplot(data=demographics, aes(x=Age, y=Freq, fill=Region)) +
#   # geom_bar(stat="identity", position=position_dodge()) +
#   # geom_boxplot(aes(x=Age, y=Freq, fill=Region)) +
#   + geom_point()
#   scale_fill_brewer(palette="Oranges") +
#   scale_y_continuous(breaks=number_ticks(10)) +
#   # coord_cartesian(ylim=c(0,24)) +
#   # facet_wrap(~Gender, nrow = 1, strip.position = "top") +
#   # geom_errorbar(data=demo_age, aes(AgeRange, ymax = average, ymin = average),
#              # size=0.75, linetype = "solid", color = "black", inherit.aes = F, width = .9) +
#   theme_bw() 

ggplot(demographics, aes(x=Age, y=Freq, color=Region, shape=Region)) +
  geom_point() + 
  geom_smooth(method=lm, se=FALSE, fullrange=TRUE) +
  scale_color_brewer(palette="Dark2") +
  theme_bw()

Interpretation

We were unable to run a Mixed effects generalized linear regression since we have not yet collected balanced groups for all demographic categories. A likelihood ratio test of a negative binomial model again a Poisson model revealed significant difference chi-squared(1) = 169.86, p < .0001. The more conservative model was selected. This revealed more fingerspelling for younger signs than older signers. No main effects of gender or region.

FGS-Type Model

Does sharing geographical knowledge impact how signers fingerspell?

# full_df
fgs <- ddply(full_df, c("Participant","WordClass","RegionShared","Type"),summarise,
                Count = length(Participant),
                Signs = mean(SignCount),
                Freq = round(Count/Signs,4)*100,
                LogFreq = log(Count/Signs))
# fgs

# contrast coding if needed
# fgs <- mutate(fgs, RegionSum = RegionShared)
# contrasts(fgs$RegionSum) <- contr.sum(2) #RegionSum1 = different
# fgs

f_mmod <- glmer(Count~Type * RegionShared + WordClass + offset(log(Signs)) + (1+Type|Participant), data = fgs, family = poisson)
summ(f_mmod)

## MODEL INFO:
## Observations: 156
## Dependent Variable: Count
## Type: Mixed effects generalized linear regression
## Error Distribution: poisson
## Link function: log 
## 
## MODEL FIT:
## AIC = 948.62, BIC = 1006.56
## Pseudo-R² (fixed effects) = 0.36
## Pseudo-R² (total) = 0.83 
## 
## FIXED EFFECTS:
## --------------------------------------------------------------
##                                    Est.   S.E.   z val.      p
## ------------------------------- ------- ------ -------- ------
## (Intercept)                       -5.45   0.21   -25.76   0.00
## TypeINIT                          -0.98   0.30    -3.27   0.00
## TypeMIX                           -0.92   0.36    -2.57   0.01
## TypeRED                           -1.83   0.36    -5.14   0.00
## RegionSharedsame                  -0.15   0.26    -0.58   0.56
## WordClassNProp                     0.79   0.09     9.18   0.00
## TypeINIT:RegionSharedsame          1.02   0.36     2.79   0.01
## TypeMIX:RegionSharedsame           0.82   0.44     1.87   0.06
## TypeRED:RegionSharedsame           0.71   0.39     1.85   0.06
## --------------------------------------------------------------
## 
## RANDOM EFFECTS:
## ---------------------------------------
##     Group       Parameter    Std. Dev. 
## ------------- ------------- -----------
##  Participant   (Intercept)     0.62    
##  Participant    TypeINIT       0.82    
##  Participant     TypeMIX       1.01    
##  Participant     TypeRED       0.37    
## ---------------------------------------
## 
## Grouping variables:
## -------------------------------
##     Group      # groups   ICC  
## ------------- ---------- ------
##  Participant      29      0.28 
## -------------------------------

hist(residuals(f_mmod), col = "skyblue")

{qqnorm(residuals(f_mmod)) 
  qqline(residuals(f_mmod))} #winter p.129 needs the {} to add qqline

plot(fitted(f_mmod), residuals(f_mmod)) #no idea what this is showing

#Winter p. 77 R-squared is an effect size measurement from 0-1. High values indicate larger effects.

effect_plot(f_mmod, pred = RegionShared, interval = TRUE, partial.residuals = TRUE,jitter = c(0.1,0),
            y.label = "rate of fgs", x.label = "shared geographical knowledge")

## Using data fgs from global environment. This could cause incorrect results
## if fgs has been altered since the model was fit. You can manually provide
## the data to the "data =" argument.

## Outcome is based on a total of 1 exposures

## Confidence intervals for merMod models is an experimental feature. The
## intervals reflect only the variance of the fixed effects, not the random
## effects.

effect_plot(f_mmod, pred = Type, interval = TRUE, partial.residuals = TRUE,jitter = c(0.1,0),
            y.label = "rate of fgs", x.label = "type of fgs")

## Using data fgs from global environment. This could cause incorrect results
## if fgs has been altered since the model was fit. You can manually provide
## the data to the "data =" argument.

## Outcome is based on a total of 1 exposures

## Confidence intervals for merMod models is an experimental feature. The
## intervals reflect only the variance of the fixed effects, not the random
## effects.

effect_plot(f_mmod, pred = WordClass, interval = TRUE, partial.residuals = TRUE,jitter = c(0.1,0),
            y.label = "rate of fgs", x.label = "word class")

## Using data fgs from global environment. This could cause incorrect results
## if fgs has been altered since the model was fit. You can manually provide
## the data to the "data =" argument.

## Outcome is based on a total of 1 exposures

## Confidence intervals for merMod models is an experimental feature. The
## intervals reflect only the variance of the fixed effects, not the random
## effects.

FGS-Type Plot

levels(fgs$Type)

## [1] "FULL" "INIT" "MIX"  "RED"

# freq_df <- freq_df %>% mutate(Type = fct_reorder(Type, LogFreq)) #order by another value. doesn't work?
fgs$Type <- factor(fgs$Type, levels = c("FULL","INIT","MIX","RED")) #manually set order
# freq_df <- freq_df %>% mutate(RegionShared = fct_reorder(RegionShared, Freq))

fgs_means <- ddply(fgs, c("Type","WordClass"),summarise, average = round(mean(Freq),2))
fgs_means

##   Type WordClass average
## 1 FULL      NLoc    0.55
## 2 FULL     NProp    0.90
## 3 INIT      NLoc    0.22
## 4 INIT     NProp    0.89
## 5  MIX      NLoc    0.18
## 6  MIX     NProp    0.96
## 7  RED      NLoc    0.19
## 8  RED     NProp    0.29

ggplot(data=fgs, aes(x=Type, y=Freq, fill=RegionShared)) +
  # geom_bar(stat="identity", position=position_dodge()) +
  geom_boxplot(aes(x=Type, y=Freq, fill=RegionShared)) +
  scale_fill_brewer(palette="Blues") +
  scale_y_continuous(breaks=number_ticks(10)) +
  # coord_cartesian(ylim=c(0,24)) +
  facet_wrap(~WordClass, nrow = 1, strip.position = "top") +
  geom_errorbar(data=fgs_means, aes(Type, ymax = average, ymin = average),
             size=0.75, linetype = "solid", color = "red", inherit.aes = F, width = .9) +
  theme_bw()

Interpretation

We ran a Mixed effects generalized linear regression that included participant variation as a random effect to explore the contribution of shared geographic knowledge and noun type on the frequency of fingerspelling type. R-squared and stuff. Interaction of participant effects with predictors and whatnot.

Common nouns are fingerspelled less than proper nouns. Full common noun fingerspells more common than other strategies. Reduced proper noun fingerspellings less common than other proper noun strategies. (check red lines)

Full fingerspellings of common nouns are more frequent without commonground. Mostly equal rates of full fingerspelling of proper nouns given common ground but initializations mixing and reduction are significantly more common when commonground is shared. (check boxplot color differences)

Fingerspelling in NTS - data processing

@bananabull

Last updated: 19 april 2021

Load libraries and functions

Dataframe preparation

Collapse and summarize factors

Demo Model

Demo Plot

Interpretation

FGS-Type Model

FGS-Type Plot

Interpretation