Data A-priori Yes/No

#{r packages} #install.packages(c("psych", "irr","pwr","writexl")) #

data_dir <- "/Users/haisaosmanli/Desktop/CORE/SDA/DATA"
excel_files <- list.files(path = data_dir, pattern = "\\.xlsx$", full.names = TRUE)
excel_files <- excel_files[!grepl("~\\$", excel_files)]
excel_files

## [1] "/Users/haisaosmanli/Desktop/CORE/SDA/DATA/AD.xlsx"
## [2] "/Users/haisaosmanli/Desktop/CORE/SDA/DATA/AK.xlsx"
## [3] "/Users/haisaosmanli/Desktop/CORE/SDA/DATA/CD.xlsx"
## [4] "/Users/haisaosmanli/Desktop/CORE/SDA/DATA/EM.xlsx"
## [5] "/Users/haisaosmanli/Desktop/CORE/SDA/DATA/RL.xlsx"
## [6] "/Users/haisaosmanli/Desktop/CORE/SDA/DATA/VG.xlsx"

definitions <- read_excel(excel_files[1], sheet = 1)

## New names:
## • `` -> `...1`
## • `` -> `...3`

glimpse(definitions)

## Rows: 81
## Columns: 3
## $ ...1                                                                                                                                        <chr> …
## $ `Present = 1 point; absent=0 points: Place X in each box to indicate the presence of each indicator (total “X”s at the end in yellow box).` <chr> …
## $ ...3                                                                                                                                        <chr> …

read_researcher_data <- function(file_path) {
  # extract researcher initials (eg. "AD.xlsx" → "AD")
  researcher_id <- toupper(str_extract(basename(file_path), "^[A-Z]+"))
  
  # get all sheet names
  sheets <- excel_sheets(file_path)
  
  # decide which sheets to read
  if (researcher_id == "EM") {
    target_sheets <- sheets[2:4]   # EM → sheets 2,3 and 4
  } else if (researcher_id == "VG") {
    target_sheets <- sheets[2:4]   # VG → sheets 2, 3, and 4
  } else {
    target_sheets <- sheets[2]     # others → only sheet 2
  }
  
  # read and combine data from selected sheets
  df_list <- lapply(target_sheets, function(sheet_name) {
    df <- read_excel(file_path, sheet = sheet_name) %>%
      clean_names() %>%
      mutate(
        researcher = researcher_id,
        source_file = basename(file_path),
        # clean sheet name (remove "(Pilot)" and convert to date)
        date_collected = suppressWarnings(
          ymd(str_trim(str_remove(sheet_name, "\\(.*\\)")))
        )
      )
    return(df)
  })
  
  # combine all sheets vertically
  bind_rows(df_list)
}

# read all files and merge vertically
metadata_all <- map_dfr(excel_files, read_researcher_data)

# preview the merged data 
glimpse(metadata_all)

## Rows: 1,650
## Columns: 16
## $ researcher_initials          <chr> "AD", "N/A", "N/A", "N/A", "N/A", "N/A", …
## $ weather_brief_description    <chr> "Slightly Rainy, Cloudy", "N/A", "N/A", "…
## $ temperature_celsius          <chr> "7", "N/A", "N/A", "N/A", "N/A", "N/A", "…
## $ number_of_photos_taken       <chr> "4", "N/A", "N/A", "N/A", "N/A", "N/A", "…
## $ time_start_00_00am_pm        <chr> "0.375", "N/A", "N/A", "N/A", "N/A", "N/A…
## $ time_end_00_00am_pm          <chr> "0.41666666666666669", "N/A", "N/A", "N/A…
## $ location                     <chr> "Sunalta", "Sunalta", "Sunalta", "Sunalta…
## $ indicator_type               <chr> "N/A", "Physical disorder", "Physical dis…
## $ code                         <chr> "N/A", "001", "002", "003a", "003b", "004…
## $ indicator_description        <chr> "N/A", "Pile of discarded cigarette butts…
## $ present_yes_1                <chr> "N/A", "1", "1", "1", NA, "1", "1", NA, N…
## $ frequency_tally_total        <chr> "N/A", "7", "2", "14", NA, "4", "1", NA, …
## $ field_notes_where_applicable <chr> "N/A", NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ researcher                   <chr> "AD", "AD", "AD", "AD", "AD", "AD", "AD",…
## $ source_file                  <chr> "AD.xlsx", "AD.xlsx", "AD.xlsx", "AD.xlsx…
## $ date_collected               <date> 2025-10-19, 2025-10-19, 2025-10-19, 2025…

metadata_all$present_yes_1[
  is.na(metadata_all$present_yes_1) |
  metadata_all$present_yes_1 == "" |
  metadata_all$present_yes_1 == "N/A"
] <- 0

# Rows where present_yes_1 == 1
present_1 <- metadata_all[metadata_all$present_yes_1 == 1, ]

# Rows where present_yes_1 == 0
present_0 <- metadata_all[metadata_all$present_yes_1 == 0, ]

allowed_codes <- c("013", "015a", "016", "017", "030", "034", 
                   "044", "045", "047", "048", "064", "065a", "067")

present_1 <- metadata_all %>%
  filter(present_yes_1 == 1, code %in% allowed_codes) %>%
  select(location, code, present_yes_1)

present_0 <- metadata_all %>%
  filter(present_yes_1 == 0, code %in% allowed_codes) %>%
  select(location, code, present_yes_1) %>%
  mutate(location = ifelse(location == "" | is.na(location), "Sunalta", location))

make_code_location_table <- function(df) {
  df %>%
    filter(code %in% allowed_codes) %>%      # keep only allowed codes
    group_by(code, location) %>%             # group by code & location
    summarise(count = n(), .groups = "drop") %>%  # count rows
    pivot_wider(names_from = location, values_from = count, values_fill = 0) %>% # pivot to 13x3
    arrange(code)
}

# For present_1
table_present_1 <- make_code_location_table(present_1)

# For present_0
table_present_0 <- make_code_location_table(present_0)

physical_disorder <- c("013", "015a", "016", "017", "030")
social_disorder <- c("034", "044", "045", "047", "048")
social_order <- c("064", "065a", "067")
library(dplyr)

collapse_to_groups <- function(df) {
  df %>%
    mutate(group = case_when(
      code %in% physical_disorder ~ "physical_disorder",
      code %in% social_disorder ~ "social_disorder",
      code %in% social_order ~ "social_order",
      TRUE ~ "other"   # optional, can remove this if not needed
    )) %>%
    select(-code) %>%                     # remove old code column
    group_by(group) %>%                   # group by new group
    summarise(across(everything(), sum), .groups = "drop")  # sum all locations
}

# For present_1
table_present_1_grouped <- collapse_to_groups(table_present_1)

# For present_0
table_present_0_grouped <- collapse_to_groups(table_present_0)

# 2x2 matrix: Yes/No × Location
physical_matrix <- matrix(
  c(
    table_present_1_grouped$`Beltline Sheldon Chumir`[table_present_1_grouped$group == "physical_disorder"],  # Yes Chumir
    table_present_1_grouped$Sunalta[table_present_1_grouped$group == "physical_disorder"], # Yes Sunalta
    table_present_0_grouped$`Beltline Sheldon Chumir`[table_present_0_grouped$group == "physical_disorder"],  # No Chumir
    table_present_0_grouped$Sunalta[table_present_0_grouped$group == "physical_disorder"]  # No Sunalta
  ),
  nrow = 2,
  byrow = TRUE
)

rownames(physical_matrix) <- c("Yes", "No")
colnames(physical_matrix) <- c("Chumir", "Sunalta")

# Chi-square test
chisq.test(physical_matrix)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  physical_matrix
## X-squared = 1.9726, df = 1, p-value = 0.1602

# 2x2 matrix: Yes/No × Location
social_disorder_matrix <- matrix(
  c(
    table_present_1_grouped$`Beltline Sheldon Chumir`[table_present_1_grouped$group == "social_disorder"],  # Yes Chumir
    table_present_1_grouped$Sunalta[table_present_1_grouped$group == "social_disorder"], # Yes Sunalta
    table_present_0_grouped$`Beltline Sheldon Chumir`[table_present_0_grouped$group == "social_disorder"],  # No Chumir
    table_present_0_grouped$Sunalta[table_present_0_grouped$group == "social_disorder"]  # No Sunalta
  ),
  nrow = 2,
  byrow = TRUE
)

rownames(social_disorder_matrix) <- c("Yes", "No")
colnames(social_disorder_matrix) <- c("Chumir", "Sunalta")

# Chi-square test
chisq.test(social_disorder_matrix)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  social_disorder_matrix
## X-squared = 13.281, df = 1, p-value = 0.0002681

# 2x2 matrix: Yes/No × Location
social_order_matrix <- matrix(
  c(
    table_present_1_grouped$`Beltline Sheldon Chumir`[table_present_1_grouped$group == "social_order"],  # Yes Chumir
    table_present_1_grouped$Sunalta[table_present_1_grouped$group == "social_order"], # Yes Sunalta
    table_present_0_grouped$`Beltline Sheldon Chumir`[table_present_0_grouped$group == "social_order"],  # No Chumir
    table_present_0_grouped$Sunalta[table_present_0_grouped$group == "social_order"]  # No Sunalta
  ),
  nrow = 2,
  byrow = TRUE
)

rownames(social_order_matrix) <- c("Yes", "No")
colnames(social_order_matrix) <- c("Chumir", "Sunalta")

# Chi-square test
chisq.test(social_order_matrix)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  social_order_matrix
## X-squared = 0.28202, df = 1, p-value = 0.5954

physical_matrix_double <- matrix(
  c(
    62, 46,   # Yes Chumir, Yes Sunalta (doubled counts)
    38, 54    # No Chumir, No Sunalta (doubled counts)
  ),
  nrow = 2,
  byrow = TRUE
)

rownames(physical_matrix_double) <- c("Yes", "No")
colnames(physical_matrix_double) <- c("Chumir", "Sunalta")
chisq.test(physical_matrix_double)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  physical_matrix_double
## X-squared = 4.529, df = 1, p-value = 0.03333

# Multiply the counts by 3
yes_chumir <- 31 * 3
yes_sunalta <- 23 * 3
no_chumir <- 19 * 3
no_sunalta <- 27 * 3

# Create the 2x2 matrix
physical_matrix_triple <- matrix(
  c(yes_chumir, yes_sunalta,
    no_chumir, no_sunalta),
  nrow = 2,
  byrow = TRUE
)

rownames(physical_matrix_triple) <- c("Yes", "No")
colnames(physical_matrix_triple) <- c("Chumir", "Sunalta")

chisq.test(physical_matrix_triple)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  physical_matrix_triple
## X-squared = 7.0988, df = 1, p-value = 0.007714

Data A-priori Yes/No

Haisa Osmanli

2025-10-24