Cleaning data
- merge all d2d obs together: 1.1 keeping only full obs (those who
reported for all 11 days) 1.2. dropping duplicates (taking the last one
if more than one obs per ID) 1.3. merging: d2d, new year, final
survey
- keep only id, 3 main dv, experience expanded
- collapsing on id, calculating: average per each dv, count of each
experience type
library(pacman)
p_load(tidyverse, readxl, janitor, lubridate, plotly, glue, ggpubr, haven, modelsummary, kableExtra)
loading data
d2d_fn<- "day-to-day+UKR+proj_January+11,+2025_01.02.sav"
final_fn<- "final_January+11,+2025_01.02.sav"
ny_fn<- "new_year_January+11,+2025_01.00.sav"
starting_fn<- "starting_January+11,+2025_01.01.sav"
# we need uiid for merging correctly starting with the rest of the data because starting file misses correct assignment_id field to retrieve unique participant id from.
uuid_fn<-'uuids.csv'
add_id<-function(df){
df<-df %>%
mutate(
id = str_extract(assignment_id, "dnum\\d+_(.*)") %>% str_remove("^dnum\\d+_")
)
return(df)
}
d2d_df<-read_spss(d2d_fn) %>% add_id()
final_df<-read_spss(final_fn)%>% add_id() %>%
group_by(id) %>%
filter(StartDate == max(StartDate)) %>% # Keep the row with the latest StartDate
ungroup()
ny_df<-read_spss(ny_fn)%>% add_id()
uuid_df <- read_csv(uuid_fn) %>%
select(id=uuid, ResponseId)
## Rows: 997 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): uuid, ResponseId
## dbl (3): completion_code, start_time_difference, end_time_difference
## lgl (1): duplicate
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
starting_df<-read_spss(starting_fn) %>%
right_join(uuid_df) %>%
group_by(id) %>%
filter(StartDate == max(StartDate)) %>% # Keep the row with the latest StartDate
ungroup()
## Joining with `by = join_by(ResponseId)`
merging d2d data only
cols_to_keep<-c('id', 'StartDate', 'd_num', 'experience_1', 'experience_2', 'experience_3', 'experience_4', 'experience_5', 'experience_6',
'happyness', 'hope', 'region_situ_eval', 'news_consumption', 'change_eval')
# we select only those who completed ALL waves (now only, for simplicity!)
full_df<-bind_rows(d2d_df, ny_df, final_df) %>%
select(all_of(cols_to_keep)) %>%
group_by(id, d_num) %>% # Group by participant ID and wave
filter(StartDate == max(StartDate)) %>% # Keep the row with the latest StartDate
ungroup() %>%
group_by(id) %>%
mutate(n=n()) %>%
filter(n==11) %>%
select(-n) %>%
ungroup() %>%
rename(
air_raid_alert = experience_1,
power_outage = experience_2,
limited_access_services = experience_3,
positive_event = experience_4,
negative_event = experience_5,
none_above = experience_6
)
Some plots (with plotly):
- mean reported experience per type at X, mean happyness/etc at y
# Function to generate summary and plot for a specific experience
generate_experience_plot <- function(df, experience_col, experience_label, dv_col) {
result_df <- df %>%
group_by(id) %>%
summarize(
avg_dv = mean({{ dv_col }}, na.rm = TRUE), # Calculate mean for the specified DV
total_experience = sum({{ experience_col }}, na.rm = TRUE)
) %>%
group_by(total_experience) %>%
summarize(
mean_dv_per_level = mean(avg_dv, na.rm = TRUE),
count_participants = n()
)
# Generate the plot
plot <- result_df %>%
ggplot(aes(x = total_experience, y = mean_dv_per_level)) +
geom_point() +
geom_smooth(method = "lm") +
labs(
title = paste("Mean", deparse(substitute(dv_col)), "per level of", experience_label),
x = paste("Total count of", experience_label, "per participant"),
y = paste("Mean", deparse(substitute(dv_col)))
) +
theme_pubclean()
return(plot)
}
generate_experience_plot(full_df, air_raid_alert, "Air Raid Alert", happyness)
## `geom_smooth()` using formula = 'y ~ x'

generate_experience_plot(full_df, air_raid_alert, "Air Raid Alert", hope)
## `geom_smooth()` using formula = 'y ~ x'

generate_experience_plot(full_df, air_raid_alert, "Air Raid Alert", region_situ_eval)
## `geom_smooth()` using formula = 'y ~ x'

- prob to change the opinion (diff types) towards more favorable to
Russia as a function of number of experiences of a certain type
First merge start and final waves to get changes in attitudes
full_df %>%
group_by(id) %>%
summarize(
total_experience = sum(air_raid_alert, na.rm = TRUE)
) -> alert_counts
merged_df <- final_df %>%
full_join(starting_df, by = "id", suffix = c("_final", "_starting")) %>%
filter(id %in% full_df$id) %>%
left_join(alert_counts, by = "id")
# Create rus_win variable
merged_df <- merged_df %>%
filter(!ukr_win_final %in% c(5, 6) | !ukr_win_starting %in% c(5, 6)) %>%
mutate(
diff_ukr_win = ukr_win_final - ukr_win_starting,
rus_win = diff_ukr_win > 0
)
# Logistic regression
logit_model <- glm(rus_win ~ total_experience, data = merged_df, family = binomial)
# Summary of the model
modelsummary(logit_model, stars = TRUE, exponentiate = TRUE)
| |
(1) |
| + p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001 |
| (Intercept) |
0.263*** |
| |
(0.056) |
| total_experience |
1.025 |
| |
(0.029) |
| Num.Obs. |
689 |
| AIC |
757.1 |
| BIC |
766.2 |
| Log.Lik. |
-376.570 |
| F |
0.759 |
| RMSE |
0.42 |
# Predicted probabilities (optional)
merged_df <- merged_df %>%
mutate(predicted_rus_win = predict(logit_model, type = "response"))
# Visualization of the relationship
merged_df %>%
ggplot(aes(x = total_experience, y = predicted_rus_win)) +
geom_point() +
geom_smooth(method = "glm", method.args = list(family = binomial), se = TRUE) +
labs(
title = "Predicted probability of Rus Win by Total Alerts",
x = "Total Air Raid Alerts",
y = "Predicted Probability of Rus Win"
) +
theme_pubclean() ->p1
ggplotly(p1)
## `geom_smooth()` using formula = 'y ~ x'
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
now let’s do a fixest model trying to predict happyness from airraid
and lagged airraid. we use id as identifier, and d_num as
time identifier
p_load(fixest)
full_df<-full_df %>%
mutate(across(
c(air_raid_alert, power_outage, limited_access_services, positive_event, negative_event, none_above),
~ replace_na(., 0)
)) %>%
mutate(d_num=as.numeric(d_num))
fe_model <- feols(happyness ~ air_raid_alert + power_outage + limited_access_services | id + d_num,
cluster = ~id,
data = full_df)
## NOTE: 8 observations removed because of NA values (LHS: 8).
modelsummary(fe_model, stars=T)
| |
(1) |
| + p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001 |
| air_raid_alert |
-0.054*** |
| |
(0.015) |
| power_outage |
-0.001 |
| |
(0.024) |
| limited_access_services |
-0.023 |
| |
(0.033) |
| Num.Obs. |
8484 |
| R2 |
0.741 |
| R2 Adj. |
0.714 |
| R2 Within |
0.002 |
| R2 Within Adj. |
0.001 |
| AIC |
12372.1 |
| BIC |
17903.1 |
| RMSE |
0.46 |
| Std.Errors |
by: id |
| FE: id |
X |
| FE: d_num |
X |