Prowrite: major writing blocks

1 Calculation

Load by-event data and extract variables for non-linearity:

# get file name
file <- list.files("data", pattern = "event.csv$", full.names = T)

# load data
data <- read_csv(file) %>% 
  filter(session == "diagnostic") %>% 
  select(ppt, n_text, contains("edge"), contains("jump"), n_major_block)  %>% 
  select(-jump_text, -is_jump) %>% 
  unique()

Data overview:

glimpse(data)

Rows: 2,425
Columns: 6
$ ppt           <dbl> 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, …
$ n_text        <dbl> 1268, 12, 1442, 7, 6, 53, 5, 0, 16, 27, 0, 32, 26, 106, …
$ edge_id       <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
$ n_edges       <dbl> 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, …
$ n_jumps       <dbl> 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, …
$ n_major_block <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…

In this approach we determine the number of major blocks relative to the total number of characters produced across writing edges. There are two free parameters. First, we disregard very short writing edges, defined as edges shorter than 5 characters. Second, and more importantly we need to set a proportion that serves as the minimum proportion of text that constitutes a major writing for which we used .15. In other words, for a block to be considered a major writing block it needs to be at least 15% of the overall text.

# Disregard very short writing edges
short_edges <- 5
# Threshold for the min proportion of characters in a major writing block 
prop_threshold <- .15

Figure 1.1 illustrates and the code that generates it show how this algorithm determines major blocks. The red line shows the threshold for major blocks, with major blocks on the right of it.

data %>% 
  # just a few participants as example
  filter(ppt %in% sample(ppt, 30)) %>% 
  mutate(is_minor = n_text < short_edges,
         total_chars = sum(n_text[!is_minor]),
         prop_block = ifelse(!is_minor, n_text / total_chars, 0),
         is_block = prop_block >= prop_threshold,
         min_block = min(n_text[is_block]),
         n_blocks = sum(is_block),
         .by = ppt) %>% 
  ggplot(aes(x = n_text)) +
  geom_histogram() +
  geom_vline(aes(xintercept = min_block), colour = "red") +
  facet_wrap(~ppt) +
  labs(x = "Length of text per writing edge")

Figure 1.1: By-participant histograms with block thresholds.

On the basis of that we can create a function that classifies and counts major blocks (return = 'total_blocks'), the minimum length of a major block (return = 'len_min_block') or the number of characters across all major blocks (return = 'len_all_blocks').

large_blocks <- function(x, block_threshold = .15, 
                         short_edges = 5, 
                         return = c('total_blocks', 'len_min_block', 'len_all_blocks'),
                         return_min_block = FALSE,
                         return_len_all_blocks = FALSE) {
    
   is_minor <- x < short_edges
   x <- x[!is_minor]
   total_chars <- sum(x)
   prop_block <- x / total_chars
   is_block <- prop_block >= block_threshold
   min_block <- min(x[is_block])
   n_blocks <- sum(is_block)
   len_all_blocks <- sum(x[is_block])
   
   
   if(return == 'total_blocks'){
     return(n_blocks)
   } else if(return == 'len_min_block'){
     return(min_block)
   } else if(return == "len_all_blocks"){
     return(len_all_blocks)
   }
   
}

This function can then be used to summarise block information for each participant ppt.

data_block <- data %>%
  summarise(across(n_text, list(large_blocks = ~large_blocks(., return = 'total_blocks'), 
                                min_blocks_len = ~large_blocks(., return = 'len_min_block'),
                                all_blocks_len = ~large_blocks(., return = 'len_all_blocks'),
                                max_blocks_len = max,
                                text_len = sum), 
                   .names = "{.fn}"),
            .by = ppt) %>% 
  mutate(prop_max_block = max_blocks_len / text_len,
         prop_min_block = min_blocks_len / text_len,
         prop_all_block = all_blocks_len / text_len); data_block

# A tibble: 155 × 9
     ppt large_blocks min_blocks_len all_blocks_len max_blocks_len text_len
   <dbl>        <int>          <dbl>          <dbl>          <dbl>    <dbl>
 1    19            2           1268           2710           1442     3213
 2    18            1           2829           2829           2829     4423
 3    13            1            696            696            696     3391
 4    14            1           3079           3079           3079     3165
 5     9            2            515           1679           1164     3443
 6    15            3            601           1835            629     2745
 7     3            3            463           1983            817     2743
 8     8            4            774           3461            969     4102
 9    10            2            885           3227           2342     3560
10     4            4            368           1830            613     2054
# ℹ 145 more rows
# ℹ 3 more variables: prop_max_block <dbl>, prop_min_block <dbl>,
#   prop_all_block <dbl>

The results are visualised in Figure 1.2.

data_block_long <- data_block %>% 
  pivot_longer(-ppt) %>% 
  mutate(across(name, ~recode(., large_blocks = "Number of major blocks",
                                 max_blocks_len = "Length of largest block",
                                 min_blocks_len = "Length of shorest major block",
                                 all_blocks_len = "Length of all major blocks",
                                 prop_all_block = "Prop. of all major block",
                                 prop_max_block = "Prop. of largest block",
                                 prop_min_block = "Prop. of smallest major block",
                                 text_len = "Text length"))) 

ggplot(data_block_long, aes(x = value)) +
  geom_histogram(bins = 50) +
  facet_wrap(~name, scales = "free", nrow = 3) +
  labs(y = "Number of participants")

Figure 1.2: Results of block analysis.

2 Correlations

Correlations can be found in Figure 2.1. There are clear dependencies in these measures. The visualizations suggests that some dependencies are not linear and possibly the result of a leptokurtic distribution for correlations including the length of the shortest major block and strong bi-modality for the proportion of the shortest major block. For the latter the bi-modality is related to cases where the shortest major block is also the largest major block (where participants wrote only one major block and many other smaller blocks).

library(psyntur)
data_block_long %>% 
  pivot_wider(names_from = name, values_from = value) %>% 
  select(-starts_with("Length")) %>% 
  scatterplot_matrix(-ppt)

Figure 2.1: Correlation matrix

3 Threshold simulation

Iterate over different thresholds to inspect the effects of thresholds on the calculated block measures.

short_edges_thresholds <- seq_len(10)
prop_threshold <- seq(0, .5, .005)

data_sims <- list()
idx <- 0 
for(prop in prop_threshold){
  for(short_edge in short_edges_thresholds){
    idx <- idx + 1
    data_sims[[idx]] <- data %>%
      summarise(across(n_text, 
                       list(large_blocks = ~large_blocks(.,
                                                         short_edges = short_edge,
                                                         block_threshold = prop,
                                                         return = 'total_blocks'), 
                            min_blocks_len = ~large_blocks(.,
                                                         short_edges = short_edge,
                                                         block_threshold = prop,
                                                         return = 'len_min_block'),
                            all_blocks_len = ~large_blocks(.,
                                                         short_edges = short_edge,
                                                         block_threshold = prop,
                                                         return = 'len_all_blocks'),
                            
                            text_len = sum), 
                    .names = "{.fn}"),
                .by = ppt) %>% 
      mutate(prop_min_block = min_blocks_len / text_len,
             prop_all_block = all_blocks_len / text_len,
             prop_threshold = prop,
             short_edges = short_edge)
  }
}

data_sims <- bind_rows(data_sims) %>% 
  mutate(across(short_edges, factor))

Simulation results are shown in Figure 3.1. The simulation suggests:

A larger threshold for the relative length of a major block are positively associated with the proportion of text that belongs to the shortest major block. Higher thresholds require more text to be a major block.
A larger threshold has a negative association with the proportion of text associated with all major blocks. There is less text associated with major blocks if the cutoff value is higher.
Similarly, there is a negative relationship between the threshold value and the number blocks. A lower threshold generates more major blocks.
The length of short edges only seem to impact the resulting number of blocks for proportion thresholds below \(\approx\) .15 but the short edges cut-offs doesn’t seem to affect block numbers for larger thresholds.

data_sims %>% 
  pivot_longer(c(large_blocks, prop_min_block, prop_all_block)) %>% 
  mutate(across(name, ~recode_factor(., prop_all_block = "Prop. of text of all major blocks",
                                        prop_min_block = "Prop. of text of smallest major block",
                                        large_blocks = "Number of major blocks", 
                                     .ordered = TRUE))) %>% 
  ggplot(aes(x = prop_threshold, 
             y = value, 
             colour = short_edges)) +
  stat_smooth(fullrange = T, se = F) +
  facet_wrap(~name, scales = "free", nrow = 2) +
  labs(x = "Min. proportion of text needed for major block",
       colour = "Length of short edges\nto be disregarded") +
  scale_colour_viridis_d()

Figure 3.1: Simulation results

4 Session Info

R version 4.3.3 (2024-02-29)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 18.04.6 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/openblas/libblas.so.3 
LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so;  LAPACK version 3.7.1

locale:
 [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
 [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
 [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       

time zone: Europe/London
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] psyntur_0.1.0    rmdformats_1.0.4 patchwork_1.2.0  kableExtra_1.4.0
 [5] lubridate_1.9.3  forcats_1.0.0    stringr_1.5.1    dplyr_1.1.4     
 [9] purrr_1.0.2      readr_2.1.5      tidyr_1.3.1      tibble_3.2.1    
[13] ggplot2_3.5.1    tidyverse_2.0.0 

loaded via a namespace (and not attached):
 [1] gtable_0.3.5         xfun_0.44            bslib_0.7.0         
 [4] formula.tools_1.7.1  lattice_0.22-6       GGally_2.2.1        
 [7] tzdb_0.4.0           vctrs_0.6.5          tools_4.3.3         
[10] generics_0.1.3       parallel_4.3.3       fansi_1.0.6         
[13] highr_0.11           pkgconfig_2.0.3      Matrix_1.6-5        
[16] RColorBrewer_1.1-3   lifecycle_1.0.4      compiler_4.3.3      
[19] farver_2.1.2         munsell_0.5.1        carData_3.0-5       
[22] htmltools_0.5.8.1    sass_0.4.9           yaml_2.3.8          
[25] nloptr_2.0.3         car_3.1-2            pillar_1.9.0        
[28] crayon_1.5.2         jquerylib_0.1.4      MASS_7.3-60         
[31] cachem_1.1.0         boot_1.3-30          abind_1.4-5         
[34] nlme_3.1-164         ggstats_0.6.0        tidyselect_1.2.1    
[37] digest_0.6.35        stringi_1.8.4        reshape2_1.4.4      
[40] bookdown_0.39        splines_4.3.3        ggthemes_5.1.0      
[43] labeling_0.4.3       operator.tools_1.6.3 fastmap_1.2.0       
[46] grid_4.3.3           colorspace_2.1-0     cli_3.6.2           
[49] magrittr_2.0.3       utf8_1.2.4           withr_3.0.0         
[52] scales_1.3.0         bit64_4.0.5          timechange_0.3.0    
[55] rmarkdown_2.27       lme4_1.1-35.3        bit_4.0.5           
[58] hms_1.1.3            evaluate_0.23        knitr_1.46          
[61] viridisLite_0.4.2    mgcv_1.9-1           ez_4.4-0            
[64] rlang_1.1.3          Rcpp_1.0.12          glue_1.7.0          
[67] xml2_1.3.6           minqa_1.2.7          svglite_2.1.3       
[70] rstudioapi_0.16.0    vroom_1.6.5          jsonlite_1.8.8      
[73] R6_2.5.1             plyr_1.8.9           systemfonts_1.1.0

Prowrite: major writing blocks

1 Calculation

2 Correlations

3 Threshold simulation

4 Session Info

5 References