R code for fitting a mixed-effects model of preposition development in British children’s writing

## 3.3.1 Data preparation
#1-1) Load in libraries 
# install them first if not already installed in your R environment: e.g., to install Hmisc, type the following command: install.packages("Hmisc")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lme4)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
library(Hmisc)
## 
## Attaching package: 'Hmisc'
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units
#1-2) read two CSV files in your working directory
setwd("~/R folder/tutorial_GiG_TAASSC")

#1-3) read two CSV files named 'metadata_gig.csv' and 'resultsSK.csv'
metadata <- read.csv('metadata_gig.csv')
results <- read.csv('resultsSK.csv')

#1-4) remove the '.txt' extension from the 'filename' column of the 'results'
results$filename <- gsub(' .txt', '', results$filename)

#1-5) match the 'file_id' column of the 'metadata' data frame with the 'filename' column of 'results' data frame
order <- match(metadata$file_id, results$filename)

#1-6) use indexing to match the rows in the 'results' data frame to the 'metadata' dataframe
results <- results[order,]

#1-7) create a new data frame named 'output' by combining both dataframes
results_meta <- data.frame(metadata, results)

#1-8) Rename variables
results_meta <- rename(results_meta, student = text_id)
results_meta <- rename(results_meta, year = year_group)
results_meta <- rename(results_meta, title = text_title)
results_meta <- rename(results_meta, PP = prep_nominal)
head(results_meta)
## 3.3.2 Visualize the frequency of prepositions per nominal 
#2-1) Convert grouping variable to factor
results_meta$year <- factor(results_meta$year, levels = c('Year_2', 'Year_6', 'Year_9', 'Year_11'))

#2-2) visualise the normed frequencies of prepostions per nouns across school years 
ggplot(results_meta, aes(year, PP, linetype = genre)) +
  stat_summary(fun = 'mean', geom = 'line', aes(group = genre)) +
  stat_summary(fun.data = 'mean_cl_boot', geom = 'errorbar', width = 0.2) +
  labs(y = "Prepositions per Nominal", x = "Year Group")

#2-3) Save the visual to the 'outputs' folder
ggsave('output/PP_graph.jpeg')
## Saving 7 x 5 in image
#3.3.3 Fit mixed-effects model with the desired order of factor levels
#3-1) fit a mixed-effects model
results_meta.model <- lmer(PP ~ year*genre + (1 | title), data = results_meta, REML = FALSE)

#3-2) Print the model summary
summary(results_meta.model)
## Linear mixed model fit by maximum likelihood  ['lmerMod']
## Formula: PP ~ year * genre + (1 | title)
##    Data: results_meta
## 
##      AIC      BIC   logLik deviance df.resid 
##  28710.9  28770.4 -14345.4  28690.9     2815 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.1748 -0.1948 -0.0890  0.0129 31.5489 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  title    (Intercept)   57.46   7.58   
##  Residual             1464.21  38.26   
## Number of obs: 2825, groups:  title, 326
## 
## Fixed effects:
##                               Estimate Std. Error t value
## (Intercept)                    10.9623     2.8996   3.781
## yearYear_6                     -5.1888     4.2399  -1.224
## yearYear_9                     -7.0254     5.9557  -1.180
## yearYear_11                    -6.4775     6.5134  -0.994
## genrenon_literary               4.5324     4.0061   1.131
## yearYear_6:genrenon_literary    0.8179     5.4926   0.149
## yearYear_9:genrenon_literary    4.7361     6.8407   0.692
## yearYear_11:genrenon_literary   4.4056     7.3789   0.597
## 
## Correlation of Fixed Effects:
##             (Intr) yrYr_6 yrYr_9 yrY_11 gnrnn_ yY_6:_ yY_9:_
## yearYear_6  -0.684                                          
## yearYear_9  -0.487  0.333                                   
## yearYear_11 -0.445  0.304  0.217                            
## gnrnn_ltrry -0.724  0.495  0.352  0.322                     
## yrYr_6:gnr_  0.528 -0.772 -0.257 -0.235 -0.729              
## yrYr_9:gnr_  0.424 -0.290 -0.871 -0.189 -0.586  0.427       
## yrYr_11:gn_  0.393 -0.269 -0.191 -0.883 -0.543  0.396  0.318