Cleaning data

  1. Merge all d2d observations together:
     1.1. keeping only full observations (those who reported for all 11 days);
     1.2. dropping duplicates (taking the latest one if there is more than one observation per ID);
     1.3. merging the d2d, new-year, and final surveys.
  2. Keep only id, the 3 main DVs, and the expanded experience variables.
  3. Collapse on id, calculating the average of each DV and the count of each experience type (a sketch of this step appears after the merged data frame below).
library(pacman)
p_load(tidyverse, readxl, janitor, lubridate, plotly, glue, ggpubr, haven, modelsummary, kableExtra)

Loading data

d2d_fn<-  "day-to-day+UKR+proj_January+11,+2025_01.02.sav"
final_fn<-  "final_January+11,+2025_01.02.sav"
ny_fn<-  "new_year_January+11,+2025_01.00.sav"
starting_fn<-  "starting_January+11,+2025_01.01.sav"
# we need the uuid file to merge the starting wave with the rest of the data correctly, because the starting file lacks a usable assignment_id field from which to derive the unique participant id.
uuid_fn<-'uuids.csv'
# derive a unique participant id from assignment_id values of the form "dnum<day>_<id>"
add_id <- function(df) {
  df %>%
    mutate(
      id = str_extract(assignment_id, "dnum\\d+_(.*)") %>% str_remove("^dnum\\d+_")
    )
}
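For illustration, given a made-up assignment_id (the "dnum3_ab12cd" value below is hypothetical), the function strips the day prefix and keeps the participant part:

add_id(tibble(assignment_id = "dnum3_ab12cd"))  # yields id = "ab12cd"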

d2d_df<-read_spss(d2d_fn) %>% add_id()
  
final_df<-read_spss(final_fn)%>% add_id() %>% 
  group_by(id) %>% 
  filter(StartDate == max(StartDate)) %>% # Keep the row with the latest StartDate
  ungroup()
ny_df<-read_spss(ny_fn)%>% add_id()

uuid_df <- read_csv(uuid_fn) %>% 
  select(id=uuid, ResponseId)
## Rows: 997 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): uuid, ResponseId
## dbl (3): completion_code, start_time_difference, end_time_difference
## lgl (1): duplicate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
starting_df<-read_spss(starting_fn) %>% 
  right_join(uuid_df) %>%
  group_by(id) %>% 
  filter(StartDate == max(StartDate)) %>% # Keep the row with the latest StartDate
  ungroup()
## Joining with `by = join_by(ResponseId)`
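The "keep the latest StartDate per participant" step recurs for every wave, so one optional refactor is a small helper (keep_latest is a name introduced here, not in the original code):

# keep only the most recent submission per grouping variable(s); ties are all kept
keep_latest <- function(df, ...) {
  df %>%
    group_by(...) %>%
    filter(StartDate == max(StartDate)) %>%
    ungroup()
}

# usage: final_df <- read_spss(final_fn) %>% add_id() %>% keep_latest(id)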

Merging d2d data only

cols_to_keep<-c('id', 'StartDate', 'd_num', 'experience_1', 'experience_2', 'experience_3', 'experience_4', 'experience_5', 'experience_6', 
                'happyness', 'hope', 'region_situ_eval', 'news_consumption', 'change_eval')



# we keep only those who completed ALL waves (for now, for simplicity!)
full_df<-bind_rows(d2d_df, ny_df, final_df) %>% 
  select(all_of(cols_to_keep)) %>% 
  group_by(id, d_num) %>%                # Group by participant ID and wave
  filter(StartDate == max(StartDate)) %>% # Keep the row with the latest StartDate
  ungroup() %>% 
  group_by(id) %>% 
  mutate(n=n()) %>% 
  filter(n==11) %>%
  select(-n) %>%
  ungroup() %>% 
  rename(
    air_raid_alert = experience_1,
    power_outage = experience_2,
    limited_access_services = experience_3,
    positive_event = experience_4,
    negative_event = experience_5,
    none_above = experience_6
  )
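
Step 3 of the plan above, collapsing to one row per participant, can be sketched like this (collapsed_df is a name introduced here for illustration; it assumes the experience indicators are 0/1, so their sum is a count):

collapsed_df <- full_df %>%
  group_by(id) %>%
  summarize(
    # average of each DV per participant
    across(c(happyness, hope, region_situ_eval), ~ mean(.x, na.rm = TRUE)),
    # count of each experience type per participant
    across(c(air_raid_alert, power_outage, limited_access_services,
             positive_event, negative_event, none_above), ~ sum(.x, na.rm = TRUE))
  )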

Some plots:

# Function to generate summary and plot for a specific experience
generate_experience_plot <- function(df, experience_col, experience_label, dv_col) {
  result_df <- df %>%
    group_by(id) %>% 
    summarize(
      avg_dv = mean({{ dv_col }}, na.rm = TRUE), # Calculate mean for the specified DV
      total_experience = sum({{ experience_col }}, na.rm = TRUE)
    ) %>% 
    group_by(total_experience) %>% 
    summarize(
      mean_dv_per_level = mean(avg_dv, na.rm = TRUE),
      count_participants = n()
    )
  
  # Generate the plot
  plot <- result_df %>%
    ggplot(aes(x = total_experience, y = mean_dv_per_level)) +
    geom_point() +
    geom_smooth(method = "lm") +
    labs(
      title = paste("Mean", deparse(substitute(dv_col)), "per level of", experience_label),
      x = paste("Total count of", experience_label, "per participant"),
      y = paste("Mean", deparse(substitute(dv_col)))
    ) +
    theme_pubclean()
  
  return(plot)
}
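The embrace operator {{ }} forwards the unquoted column names to the dplyr verbs, while deparse(substitute(dv_col)) recovers the DV's name as a string for the plot labels, so callers can pass columns without quoting them.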

generate_experience_plot(full_df, air_raid_alert, "Air Raid Alert", happyness)
## `geom_smooth()` using formula = 'y ~ x'

generate_experience_plot(full_df, air_raid_alert, "Air Raid Alert", hope)
## `geom_smooth()` using formula = 'y ~ x'

generate_experience_plot(full_df, air_raid_alert, "Air Raid Alert", region_situ_eval)
## `geom_smooth()` using formula = 'y ~ x'

First, merge the starting and final waves to get changes in attitudes

alert_counts <- full_df %>%
  group_by(id) %>%
  summarize(
    total_experience = sum(air_raid_alert, na.rm = TRUE)
  )
merged_df <- final_df %>%
  full_join(starting_df, by = "id", suffix = c("_final", "_starting")) %>% 
  filter(id %in% full_df$id) %>% 
  left_join(alert_counts, by = "id")



# Create rus_win variable
merged_df <- merged_df %>%
  filter(!ukr_win_final %in% c(5, 6) | !ukr_win_starting %in% c(5, 6)) %>%
  mutate(
    diff_ukr_win = ukr_win_final - ukr_win_starting,
    rus_win = diff_ukr_win > 0
  )
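
Note that with |, a row is kept whenever either wave has an answer outside codes 5–6. If those codes mark non-substantive answers (an assumption here) and both waves should be substantive, the stricter condition would be:

# hypothetical stricter filter: require substantive answers in BOTH waves
filter(!ukr_win_final %in% c(5, 6) & !ukr_win_starting %in% c(5, 6))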

# Logistic regression
logit_model <- glm(rus_win ~ total_experience, data = merged_df, family = binomial)

# Summary of the model
modelsummary(logit_model, stars = TRUE, exponentiate = TRUE)
                   (1)
(Intercept)        0.263***
                   (0.056)
total_experience   1.025
                   (0.029)
Num.Obs.           689
AIC                757.1
BIC                766.2
Log.Lik.           -376.570
F                  0.759
RMSE               0.42
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
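
Since exponentiate = TRUE, the coefficients are odds ratios: each additional air-raid alert multiplies the odds of rus_win by about 1.025, a small and statistically insignificant association.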
# Predicted probabilities (optional)
merged_df <- merged_df %>%
  mutate(predicted_rus_win = predict(logit_model, type = "response"))

# Visualization of the relationship
merged_df %>%
  ggplot(aes(x = total_experience, y = predicted_rus_win)) +
  geom_point() +
  geom_smooth(method = "glm", method.args = list(family = binomial), se = TRUE) +
  labs(
    title = "Predicted probability of Rus Win by Total Alerts",
    x = "Total Air Raid Alerts",
    y = "Predicted Probability of Rus Win"
  ) +
  theme_pubclean() ->p1
ggplotly(p1)
## `geom_smooth()` using formula = 'y ~ x'
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
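
The warning arises because geom_smooth refits a binomial GLM on predicted_rus_win, which is a fitted probability rather than a 0/1 outcome. Fitting the smoother on the observed outcome avoids it:

# same curve, fit on the observed 0/1 outcome instead of fitted probabilities
merged_df %>%
  ggplot(aes(x = total_experience, y = as.numeric(rus_win))) +
  geom_smooth(method = "glm", method.args = list(family = binomial), se = TRUE)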

Now let’s fit a fixest model predicting happyness from air-raid exposure (a lagged specification is sketched after the output below), using id as the unit identifier and d_num as the time identifier

p_load(fixest)
full_df<-full_df %>% 
  mutate(across(
    c(air_raid_alert, power_outage, limited_access_services, positive_event, negative_event, none_above),
    ~ replace_na(., 0)
  )) %>% 
  mutate(d_num=as.numeric(d_num)) 

fe_model <- feols(happyness ~ air_raid_alert + power_outage + limited_access_services  | id + d_num, 
                  cluster = ~id, 
                  data = full_df)
## NOTE: 8 observations removed because of NA values (LHS: 8).
modelsummary(fe_model, stars=T)
                          (1)
air_raid_alert            -0.054***
                          (0.015)
power_outage              -0.001
                          (0.024)
limited_access_services   -0.023
                          (0.033)
Num.Obs.                  8484
R2                        0.741
R2 Adj.                   0.714
R2 Within                 0.002
R2 Within Adj.            0.001
AIC                       12372.1
BIC                       17903.1
RMSE                      0.46
Std.Errors                by: id
FE: id                    X
FE: d_num                 X
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
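
The lagged air-raid term can be added with fixest's l() operator once the panel structure is declared; a sketch, assuming d_num is a consecutive daily index (fe_model_lag is a name introduced here):

# contemporaneous and one-day-lagged air raid alert; panel declared via panel.id
fe_model_lag <- feols(
  happyness ~ l(air_raid_alert, 0:1) + power_outage + limited_access_services | id + d_num,
  panel.id = ~ id + d_num,
  cluster = ~ id,
  data = full_df
)
modelsummary(fe_model_lag, stars = TRUE)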