Data All Yes/No

#{r} #install.packages(c("psych", "irr","pwr","writexl","patchwork")) #

data_dir <- "/Users/haisaosmanli/Desktop/CORE/SDA/DATA"
excel_files <- list.files(path = data_dir, pattern = "\\.xlsx$", full.names = TRUE)
excel_files <- excel_files[!grepl("~\\$", excel_files)]
excel_files

## [1] "/Users/haisaosmanli/Desktop/CORE/SDA/DATA/AD.xlsx"
## [2] "/Users/haisaosmanli/Desktop/CORE/SDA/DATA/AK.xlsx"
## [3] "/Users/haisaosmanli/Desktop/CORE/SDA/DATA/CD.xlsx"
## [4] "/Users/haisaosmanli/Desktop/CORE/SDA/DATA/EM.xlsx"
## [5] "/Users/haisaosmanli/Desktop/CORE/SDA/DATA/RL.xlsx"
## [6] "/Users/haisaosmanli/Desktop/CORE/SDA/DATA/VG.xlsx"

definitions <- read_excel(excel_files[1], sheet = 1)

## New names:
## • `` -> `...1`
## • `` -> `...3`

glimpse(definitions)

## Rows: 81
## Columns: 3
## $ ...1                                                                                                                                        <chr> …
## $ `Present = 1 point; absent=0 points: Place X in each box to indicate the presence of each indicator (total “X”s at the end in yellow box).` <chr> …
## $ ...3                                                                                                                                        <chr> …

read_researcher_data <- function(file_path) {
  # extract researcher initials (eg. "AD.xlsx" → "AD")
  researcher_id <- toupper(str_extract(basename(file_path), "^[A-Z]+"))
  
  # get all sheet names
  sheets <- excel_sheets(file_path)
  
  # decide which sheets to read
  if (researcher_id == "EM") {
    target_sheets <- sheets[2:4]   # EM → sheets 2,3 and 4
  } else if (researcher_id == "VG") {
    target_sheets <- sheets[2:4]   # VG → sheets 2, 3, and 4
  } else {
    target_sheets <- sheets[2]     # others → only sheet 2
  }
  
  # read and combine data from selected sheets
  df_list <- lapply(target_sheets, function(sheet_name) {
    df <- read_excel(file_path, sheet = sheet_name) %>%
      clean_names() %>%
      mutate(
        researcher = researcher_id,
        source_file = basename(file_path),
        # clean sheet name (remove "(Pilot)" and convert to date)
        date_collected = suppressWarnings(
          ymd(str_trim(str_remove(sheet_name, "\\(.*\\)")))
        )
      )
    return(df)
  })
  
  # combine all sheets vertically
  bind_rows(df_list)
}

# read all files and merge vertically
metadata_all <- map_dfr(excel_files, read_researcher_data)

# preview the merged data 
glimpse(metadata_all)

## Rows: 1,650
## Columns: 16
## $ researcher_initials          <chr> "AD", "N/A", "N/A", "N/A", "N/A", "N/A", …
## $ weather_brief_description    <chr> "Slightly Rainy, Cloudy", "N/A", "N/A", "…
## $ temperature_celsius          <chr> "7", "N/A", "N/A", "N/A", "N/A", "N/A", "…
## $ number_of_photos_taken       <chr> "4", "N/A", "N/A", "N/A", "N/A", "N/A", "…
## $ time_start_00_00am_pm        <chr> "0.375", "N/A", "N/A", "N/A", "N/A", "N/A…
## $ time_end_00_00am_pm          <chr> "0.41666666666666669", "N/A", "N/A", "N/A…
## $ location                     <chr> "Sunalta", "Sunalta", "Sunalta", "Sunalta…
## $ indicator_type               <chr> "N/A", "Physical disorder", "Physical dis…
## $ code                         <chr> "N/A", "001", "002", "003a", "003b", "004…
## $ indicator_description        <chr> "N/A", "Pile of discarded cigarette butts…
## $ present_yes_1                <chr> "N/A", "1", "1", "1", NA, "1", "1", NA, N…
## $ frequency_tally_total        <chr> "N/A", "7", "2", "14", NA, "4", "1", NA, …
## $ field_notes_where_applicable <chr> "N/A", NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ researcher                   <chr> "AD", "AD", "AD", "AD", "AD", "AD", "AD",…
## $ source_file                  <chr> "AD.xlsx", "AD.xlsx", "AD.xlsx", "AD.xlsx…
## $ date_collected               <date> 2025-10-19, 2025-10-19, 2025-10-19, 2025…

metadata_all$present_yes_1[
  is.na(metadata_all$present_yes_1) |
  metadata_all$present_yes_1 == ""
] <- 0

# Rows where present_yes_1 == 1
present_1 <- metadata_all[metadata_all$present_yes_1 == 1, ]

# Rows where present_yes_1 == 0
present_0 <- metadata_all[metadata_all$present_yes_1 == 0, ]

codes <- c(
  # physical_disorder
  "001", "002", "003a", "003b", "004", "005", "006", "007", "008", "009",
  "010", "011", "012", "013", "014", "015a", "015b", "016", "017", "018",
  "019", "020", "021", "022", "023", "024", "025", "026", "027", "028",
  "029", "030", "031", "032",
  
  # social_disorder
  "033", "034", "035", "036", "037", "038", "039", "040", "041", "042",
  "043", "044", "045", "046", "047", "048", "049", "050", "051", "052",
  "053", "054", "055", "056", "057", "058",
  
  # social_order
  "059", "060", "061", "062", "063", "064", "065a", "065b", "066",
  "067", "068", "069", "070"
)

present_1 <- metadata_all %>%
  filter(present_yes_1 == 1, code %in% codes) %>%
  select(location, code, present_yes_1)

present_0 <- metadata_all %>%
  filter(present_yes_1 == 0, code %in% codes) %>%
  select(location, code, present_yes_1) %>%
  mutate(location = ifelse(location == "" | is.na(location), "Sunalta", location))

make_code_location_table <- function(df) {
  df %>%
    filter(code %in% codes) %>%      # keep only allowed codes
    group_by(code, location) %>%             # group by code & location
    summarise(count = n(), .groups = "drop") %>%  # count rows
    pivot_wider(names_from = location, values_from = count, values_fill = 0) %>% # pivot to 13x3
    arrange(code)
}

# For present_1
table_present_1 <- make_code_location_table(present_1)

# For present_0
table_present_0 <- make_code_location_table(present_0)

table_present_1 <- table_present_1 %>%
  mutate(
    Sunalta = Sunalta + if_else(is.na(`NA`), 0, `NA`)
  ) %>%
  select(-`NA`)

physical_disorder <- c("001", "002", "003a", "003b", "004", "005", "006", "007", "008", "009", "010", "011", "012", "013", "014", "015a", "015b", "016", "017", "018", "019", "020", "021", "022", "023", "024", "025", "026", "027", "028", "029", "030", "031", "032")
social_disorder <- c( "033", "034", "035", "036", "037", "038", "039", "040", "041", "042", "043", "044", "045", "046", "047", "048", "049", "050", "051", "052", "053", "054", "055", "056", "057", "058")
social_order <- c("059", "060", "061", "062", "063", "064", "065a", "065b", "066", "067", "068", "069", "070")

collapse_to_groups <- function(df) {
  df %>%
    mutate(group = case_when(
      code %in% physical_disorder ~ "physical_disorder",
      code %in% social_disorder ~ "social_disorder",
      code %in% social_order ~ "social_order",
      TRUE ~ "other"   # optional, can remove this if not needed
    )) %>%
    select(-code) %>%                     # remove old code column
    group_by(group) %>%                   # group by new group
    summarise(across(everything(), sum), .groups = "drop")  # sum all locations
}

# For present_1
table_present_1_grouped <- collapse_to_groups(table_present_1)

# For present_0
table_present_0_grouped <- collapse_to_groups(table_present_0)

# 2x2 matrix: Yes/No × Location
physical_matrix <- matrix(
  c(
    table_present_1_grouped$`Beltline Sheldon Chumir`[table_present_1_grouped$group == "physical_disorder"],  # Yes Chumir
    table_present_1_grouped$Sunalta[table_present_1_grouped$group == "physical_disorder"], # Yes Sunalta
    table_present_0_grouped$`Beltline Sheldon Chumir`[table_present_0_grouped$group == "physical_disorder"],  # No Chumir
    table_present_0_grouped$Sunalta[table_present_0_grouped$group == "physical_disorder"]  # No Sunalta
  ),
  nrow = 2,
  byrow = TRUE
)

rownames(physical_matrix) <- c("Yes", "No")
colnames(physical_matrix) <- c("Chumir", "Sunalta")

# Chi-square test
chisq.test(physical_matrix)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  physical_matrix
## X-squared = 11.913, df = 1, p-value = 0.0005573

# 2x2 matrix: Yes/No × Location
social_disorder_matrix <- matrix(
  c(
    table_present_1_grouped$`Beltline Sheldon Chumir`[table_present_1_grouped$group == "social_disorder"],  # Yes Chumir
    table_present_1_grouped$Sunalta[table_present_1_grouped$group == "social_disorder"], # Yes Sunalta
    table_present_0_grouped$`Beltline Sheldon Chumir`[table_present_0_grouped$group == "social_disorder"],  # No Chumir
    table_present_0_grouped$Sunalta[table_present_0_grouped$group == "social_disorder"]  # No Sunalta
  ),
  nrow = 2,
  byrow = TRUE
)

rownames(social_disorder_matrix) <- c("Yes", "No")
colnames(social_disorder_matrix) <- c("Chumir", "Sunalta")

# Chi-square test
chisq.test(social_disorder_matrix)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  social_disorder_matrix
## X-squared = 28.858, df = 1, p-value = 7.788e-08

# 2x2 matrix: Yes/No × Location
social_order_matrix <- matrix(
  c(
    table_present_1_grouped$`Beltline Sheldon Chumir`[table_present_1_grouped$group == "social_order"],  # Yes Chumir
    table_present_1_grouped$Sunalta[table_present_1_grouped$group == "social_order"], # Yes Sunalta
    table_present_0_grouped$`Beltline Sheldon Chumir`[table_present_0_grouped$group == "social_order"],  # No Chumir
    table_present_0_grouped$Sunalta[table_present_0_grouped$group == "social_order"]  # No Sunalta
  ),
  nrow = 2,
  byrow = TRUE
)

rownames(social_order_matrix) <- c("Yes", "No")
colnames(social_order_matrix) <- c("Chumir", "Sunalta")

# Chi-square test
chisq.test(social_order_matrix)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  social_order_matrix
## X-squared = 0.077502, df = 1, p-value = 0.7807

physical_matrix_double <- matrix(
  c(
    62, 46,   # Yes Chumir, Yes Sunalta (doubled counts)
    38, 54    # No Chumir, No Sunalta (doubled counts)
  ),
  nrow = 2,
  byrow = TRUE
)

rownames(physical_matrix_double) <- c("Yes", "No")
colnames(physical_matrix_double) <- c("Chumir", "Sunalta")
chisq.test(physical_matrix_double)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  physical_matrix_double
## X-squared = 4.529, df = 1, p-value = 0.03333

# Multiply the counts by 3
yes_chumir <- 31 * 3
yes_sunalta <- 23 * 3
no_chumir <- 19 * 3
no_sunalta <- 27 * 3

# Create the 2x2 matrix
physical_matrix_triple <- matrix(
  c(yes_chumir, yes_sunalta,
    no_chumir, no_sunalta),
  nrow = 2,
  byrow = TRUE
)

rownames(physical_matrix_triple) <- c("Yes", "No")
colnames(physical_matrix_triple) <- c("Chumir", "Sunalta")

chisq.test(physical_matrix_triple)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  physical_matrix_triple
## X-squared = 7.0988, df = 1, p-value = 0.007714

# Values for Social Order doubled
so_yes_sunalta <- 37 * 2
so_yes_chumir <- 34 * 2
so_no_sunalta <- 96 * 2
so_no_chumir <- 93 * 2

# Create 2x2 matrix
social_order_matrix_double <- matrix(
  c(
    so_yes_chumir, so_yes_sunalta,  # Yes row
    so_no_chumir,  so_no_sunalta   # No row
  ),
  nrow = 2,
  byrow = TRUE
)

rownames(social_order_matrix_double) <- c("Yes", "No")
colnames(social_order_matrix_double) <- c("Chumir", "Sunalta")

# Run Chi-square test
chisq.test(social_order_matrix_double)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  social_order_matrix_double
## X-squared = 0.028778, df = 1, p-value = 0.8653

# Social Order scenario: double only Group 1 values

# Yes group doubled
so_yes_sunalta <- 37 * 2
so_yes_chumir  <- 34 * 2

# No group unchanged
so_no_sunalta  <- 96
so_no_chumir   <- 93

# Build 2x2 matrix
social_order_matrix_double <- matrix(
  c(
    so_yes_chumir,  so_yes_sunalta,   # Yes row
    so_no_chumir,   so_no_sunalta     # No row
  ),
  nrow = 2,
  byrow = TRUE
)

rownames(social_order_matrix_double) <- c("Yes", "No")
colnames(social_order_matrix_double) <- c("Chumir", "Sunalta")

# Chi-square test
chisq.test(social_order_matrix_double)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  social_order_matrix_double
## X-squared = 0.016011, df = 1, p-value = 0.8993

# Social Order scenario: triple only Group 1 values

# Yes group tripled
so_yes_sunalta <- 37 * 3
so_yes_chumir  <- 34 * 3

# No group unchanged
so_no_sunalta  <- 96
so_no_chumir   <- 93

# Build 2x2 matrix
social_order_matrix_triple_yes_only <- matrix(
  c(
    so_yes_chumir,  so_yes_sunalta,   # Yes row
    so_no_chumir,   so_no_sunalta     # No row
  ),
  nrow = 2,
  byrow = TRUE
)

rownames(social_order_matrix_triple_yes_only) <- c("Yes", "No")
colnames(social_order_matrix_triple_yes_only) <- c("Chumir", "Sunalta")

# Chi-square test
chisq.test(social_order_matrix_triple_yes_only)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  social_order_matrix_triple_yes_only
## X-squared = 0.026941, df = 1, p-value = 0.8696

# Social Order scenario: quadruple only Group 1 values

# Yes group quadrupled
so_yes_sunalta <- 37 * 4
so_yes_chumir  <- 34 * 4

# No group unchanged
so_no_sunalta  <- 96
so_no_chumir   <- 93

# Build 2x2 matrix
social_order_matrix_quad_yes_only <- matrix(
  c(
    so_yes_chumir,  so_yes_sunalta,   # Yes row
    so_no_chumir,   so_no_sunalta     # No row
  ),
  nrow = 2,
  byrow = TRUE
)

rownames(social_order_matrix_quad_yes_only) <- c("Yes", "No")
colnames(social_order_matrix_quad_yes_only) <- c("Chumir", "Sunalta")

# Chi-square test
chisq.test(social_order_matrix_quad_yes_only)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  social_order_matrix_quad_yes_only
## X-squared = 0.035061, df = 1, p-value = 0.8515

# ---- GROUP 1 ----
df_group1 <- data.frame(
  group = c("physical disorder", "social disorder", "social order"),
  Chumir = c(195, 77, 37),
  Sunalta = c(149, 27, 34)
)

df_long1 <- df_group1 %>%
  pivot_longer(cols = c(Sunalta, Chumir),
               names_to = "Location",
               values_to = "Count") %>%
  group_by(group) %>%
  mutate(
    Percent = round(Count / sum(Count) * 100, 1),
    Label = paste0(Count, " (", Percent, "%)")
  ) %>%
  ungroup()

plot_group1 <- ggplot(df_long1, aes(x = group, y = Count, fill = Location)) +
  geom_col() +
  geom_text(aes(label = Label),
            position = position_stack(vjust = 0.5),
            color = "white",
            size = 4) +
  labs(
    title = "G1 Aggregated Indicators by Location",
    x = "Indicator Category",
    y = "Count",
    fill = "Location"
  ) +
  theme_minimal(base_size = 13) +
  theme(axis.text.x = element_text(angle = 20, hjust = 1),
        plot.title = element_text(face = "bold"))


# ---- GROUP 0 ----
df_group0 <- data.frame(
  group = c("physical disorder", "social disorder", "social order"),
  Chumir = c(145, 183, 93),
  Sunalta = c(191, 233, 96)
)

df_long0 <- df_group0 %>%
  pivot_longer(cols = c(Sunalta, Chumir),
               names_to = "Location",
               values_to = "Count") %>%
  group_by(group) %>%
  mutate(
    Percent = round(Count / sum(Count) * 100, 1),
    Label = paste0(Count, " (", Percent, "%)")
  ) %>%
  ungroup()

plot_group0 <- ggplot(df_long0, aes(x = group, y = Count, fill = Location)) +
  geom_col() +
  geom_text(aes(label = Label),
            position = position_stack(vjust = 0.5),
            color = "white",
            size = 4) +
  labs(
    title = "G0 Aggregated Indicators by Location",
    x = "Indicator Category",
    y = "Count",
    fill = "Location"
  ) +
  theme_minimal(base_size = 13) +
  theme(axis.text.x = element_text(angle = 20, hjust = 1),
        plot.title = element_text(face = "bold"))

# ---- Combine both charts ----
combined_plot <- plot_group1 + plot_group0
print(combined_plot)

Data All Yes/No

Haisa Osmanli

2025-10-24