library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
# Read the 4 CSV files
collapsible_1 <- read.csv("Collapsible-1.csv")
collapsible_2 <- read.csv("Collapsible-2.csv")
noncollapsible_1 <- read.csv("Noncollapsible-1.csv")
noncollapsible_2 <- read.csv("Noncollapsible-2.csv")

# Add Home and Period columns
collapsible_1$Version <- "B"
collapsible_1$Period <- 1

collapsible_2$Version <- "B"
collapsible_2$Period <- 2

noncollapsible_1$Version <- "A"
noncollapsible_1$Period <- 1

noncollapsible_2$Version <- "A"
noncollapsible_2$Period <- 2

df <- bind_rows(collapsible_1,collapsible_2,noncollapsible_1,noncollapsible_2) %>% 
      select(Tester.ID, Value, Version, Period) %>%
      distinct() %>%
      rename(Subject = Tester.ID, Score = Value)
print(df)
##      Subject Score Version Period
## 1  457895107     5       B      1
## 2  464101650     7       B      1
## 3  464168937     6       B      1
## 4  464176866     9       B      1
## 5  460185122     6       B      1
## 6  464199019     8       B      1
## 7  464243752     8       B      1
## 8  463230127     5       B      1
## 9  464279823     3       B      1
## 10 464283964     3       B      1
## 11 463682411    10       B      1
## 12 464386065    10       B      1
## 13 464386169     6       B      1
## 14 463510048     8       B      1
## 15 456103220     6       B      2
## 16 464022343    10       B      2
## 17 464204034     6       B      2
## 18 464289978     7       B      2
## 19 462862639     2       B      2
## 20 302888762     4       B      2
## 21 462498662     7       B      2
## 22 464350923     9       B      2
## 23 464370209     5       B      2
## 24 464372331     8       B      2
## 25 463237021     7       B      2
## 26 464376961     8       B      2
## 27 464378135     4       B      2
## 28 464402783     6       B      2
## 29 464409779    10       B      2
## 30 463256322     8       B      2
## 31 461180896     8       B      2
## 32 456103220     9       A      1
## 33 464022343    10       A      1
## 34 464204034     6       A      1
## 35 464289978     4       A      1
## 36 462862639     9       A      1
## 37 302888762     4       A      1
## 38 462498662     7       A      1
## 39 464350923     9       A      1
## 40 464370209    10       A      1
## 41 464372331     4       A      1
## 42 463237021     9       A      1
## 43 464376961     8       A      1
## 44 464378135     8       A      1
## 45 464402783     8       A      1
## 46 464409779    10       A      1
## 47 463256322    10       A      1
## 48 461180896     8       A      1
## 49 457895107    10       A      2
## 50 464101650     8       A      2
## 51 464168937     6       A      2
## 52 464176866     8       A      2
## 53 460185122     6       A      2
## 54 464199019    10       A      2
## 55 464243752     8       A      2
## 56 463230127     4       A      2
## 57 464279823     8       A      2
## 58 464283964     3       A      2
## 59 463682411    10       A      2
## 60 464386065    10       A      2
## 61 464386169    10       A      2
## 62 463510048     9       A      2
# Data cleaning, keeping subjects that completed both rating questionnaires

## Count how many unique Version each subject has
subjects_complete <- df %>%
  group_by(Subject) %>%
  summarise(n_version = n_distinct(Version)) %>%
  filter(n_version == 2) %>%   # only subjects who did both questionnaires
  pull(Subject)

## Filter the main dataset
df_complete <- df %>% filter(Subject %in% subjects_complete)

n_distinct(df_complete$Subject)  # check how many subjects remain
## [1] 31
print(df_complete)
##      Subject Score Version Period
## 1  457895107     5       B      1
## 2  464101650     7       B      1
## 3  464168937     6       B      1
## 4  464176866     9       B      1
## 5  460185122     6       B      1
## 6  464199019     8       B      1
## 7  464243752     8       B      1
## 8  463230127     5       B      1
## 9  464279823     3       B      1
## 10 464283964     3       B      1
## 11 463682411    10       B      1
## 12 464386065    10       B      1
## 13 464386169     6       B      1
## 14 463510048     8       B      1
## 15 456103220     6       B      2
## 16 464022343    10       B      2
## 17 464204034     6       B      2
## 18 464289978     7       B      2
## 19 462862639     2       B      2
## 20 302888762     4       B      2
## 21 462498662     7       B      2
## 22 464350923     9       B      2
## 23 464370209     5       B      2
## 24 464372331     8       B      2
## 25 463237021     7       B      2
## 26 464376961     8       B      2
## 27 464378135     4       B      2
## 28 464402783     6       B      2
## 29 464409779    10       B      2
## 30 463256322     8       B      2
## 31 461180896     8       B      2
## 32 456103220     9       A      1
## 33 464022343    10       A      1
## 34 464204034     6       A      1
## 35 464289978     4       A      1
## 36 462862639     9       A      1
## 37 302888762     4       A      1
## 38 462498662     7       A      1
## 39 464350923     9       A      1
## 40 464370209    10       A      1
## 41 464372331     4       A      1
## 42 463237021     9       A      1
## 43 464376961     8       A      1
## 44 464378135     8       A      1
## 45 464402783     8       A      1
## 46 464409779    10       A      1
## 47 463256322    10       A      1
## 48 461180896     8       A      1
## 49 457895107    10       A      2
## 50 464101650     8       A      2
## 51 464168937     6       A      2
## 52 464176866     8       A      2
## 53 460185122     6       A      2
## 54 464199019    10       A      2
## 55 464243752     8       A      2
## 56 463230127     4       A      2
## 57 464279823     8       A      2
## 58 464283964     3       A      2
## 59 463682411    10       A      2
## 60 464386065    10       A      2
## 61 464386169    10       A      2
## 62 463510048     9       A      2
ggplot(df_complete, aes(x = Version, y = Score, fill = factor(Period))) +
  geom_boxplot(position = position_dodge(width = 0.8)) +
  labs(
    title = "Scores by Version and Period",
    x = "Version",
    y = "Score",
    fill = "Period"
  ) +
  theme_minimal()

ggplot(df_complete, aes(x = Score, fill = Version)) +
  geom_histogram(position = "identity", alpha = 0.6, bins = 10, color = "black") +
  facet_wrap(~Version) +
  labs(
    title = "Distribution of Scores by Version",
    x = "Score",
    y = "Count"
  ) +
  theme_minimal()

df_complete %>%
  group_by(Version) %>%
  summarise(
    n = n(),
    mean_score = mean(Score),
    sd_score = sd(Score),
    median_score = median(Score),
    min_score = min(Score),
    max_score = max(Score),
    .groups = "drop"
  )
## # A tibble: 2 × 7
##   Version     n mean_score sd_score median_score min_score max_score
##   <chr>   <int>      <dbl>    <dbl>        <int>     <int>     <int>
## 1 A          31       7.84     2.18            8         3        10
## 2 B          31       6.74     2.18            7         2        10
df_wide <- df_complete %>%
  select(Subject, Version, Score) %>%
  pivot_wider(names_from = Version, values_from = Score)

print(df_wide)
## # A tibble: 31 × 3
##      Subject     B     A
##        <int> <int> <int>
##  1 457895107     5    10
##  2 464101650     7     8
##  3 464168937     6     6
##  4 464176866     9     8
##  5 460185122     6     6
##  6 464199019     8    10
##  7 464243752     8     8
##  8 463230127     5     4
##  9 464279823     3     8
## 10 464283964     3     3
## # ℹ 21 more rows
# Run Wilcoxon signed-rank test
wilcox.test(df_wide$A, df_wide$B, paired = TRUE, exact=TRUE)
## Warning in wilcox.test.default(df_wide$A, df_wide$B, paired = TRUE, exact =
## TRUE): cannot compute exact p-value with ties
## Warning in wilcox.test.default(df_wide$A, df_wide$B, paired = TRUE, exact =
## TRUE): cannot compute exact p-value with zeroes
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  df_wide$A and df_wide$B
## V = 126.5, p-value = 0.01863
## alternative hypothesis: true location shift is not equal to 0
df_wide %>%
  summarise(
    median_A = median(A, na.rm = TRUE),
    median_B = median(B, na.rm = TRUE)
  )
## # A tibble: 1 × 2
##   median_A median_B
##      <int>    <int>
## 1        8        7