library(tidyverse)
library(readxl)
library(janitor)
library(survey)
library(knitr)
library(kableExtra)

# ── File path ─────────────────────────────────────────────────────────────────
# Absolute location of the survey workbook; stop right away if it is not there.
datafile <- "D:/Populism and Democrary/India civic behaviour survey/Survey_Data.xlsx"
stopifnot(file.exists(datafile))

# ── Load raw sheets (never modified after this point) ─────────────────────────
# Sheet 1 feeds the rural objects and sheet 2 the urban ones.
raw_rural <- readxl::read_excel(path = datafile, sheet = 1)
raw_urban <- readxl::read_excel(path = datafile, sheet = 2)

# ── Sanity check ──────────────────────────────────────────────────────────────
# Number of repeated respondent ids in one raw sheet.
# The id column is looked up case-insensitively because the raw headers have
# not been through clean_names() yet. When no such column exists the result is
# NA instead of 0, so a missing column can never pass as "no duplicates".
count_dup_ids <- function(sheet) {
  id_pos <- which(tolower(names(sheet)) == "id")
  if (length(id_pos) == 0) {
    return(NA_integer_)
  }
  # duplicated() flags every repeat after the first occurrence, so the sum is a
  # real count; anyDuplicated() only returns the position of the first repeat.
  sum(duplicated(sheet[[id_pos[1]]]))
}

# One row per sheet: dimensions plus the number of duplicated ids.
dims <- tibble(
  sheet   = c("Rural", "Urban"),
  rows    = c(nrow(raw_rural), nrow(raw_urban)),
  cols    = c(ncol(raw_rural), ncol(raw_urban)),
  dup_ids = c(count_dup_ids(raw_rural), count_dup_ids(raw_urban))
)

kable(dims, caption = "Sanity check: datasets loaded (rows × cols)") |>
  kable_styling(full_width = FALSE,
                bootstrap_options = c("striped", "hover", "condensed"))
Sanity check: datasets loaded (rows × cols)
sheet rows cols dup_ids
Rural 4187 51 0
Urban 5001 51 0
# ── Create working copies — raw_* objects remain untouched ────────────────────
# Build a cleaned working copy of one raw survey sheet.
#
# df            Raw sheet as returned by read_excel().
# sample_label  Value stored in the new `sample_type` column.
#
# Returns `df` with janitor-cleaned column names, a `sample_type` column, and
# the demographic columns plus every questionnaire item column coerced to
# numeric with the codes 98 and 99 replaced by NA.
make_workfile <- function(df, sample_label) {
  # Coerce to numeric first, then blank out the 98/99 codes.
  to_numeric_na <- function(x) {
    x <- as.numeric(x)
    x[x %in% c(98, 99)] <- NA
    x
  }

  df |>
    clean_names() |>
    mutate(
      sample_type = sample_label,
      across(c(a2, a3, a4, a5, a6, a6a, a7, a7a, a8), to_numeric_na),
      # Item columns are "r<number>" in the rural sheet and "u<number>" in the
      # urban sheet. Requiring a digit after the prefix recodes both sheets the
      # same way and leaves other columns that merely begin with "r" or "u"
      # untouched instead of coercing them to numeric.
      across(matches("^[ru][0-9]"), to_numeric_na)
    )
}

# Working copies, each tagged with its sample type.
rural <- make_workfile(df = raw_rural, sample_label = "Rural")
urban <- make_workfile(df = raw_urban, sample_label = "Urban")

# ── Column alignment check before binding ─────────────────────────────────────
# Single-row table: TRUE only when both working copies carry exactly the same
# column names in the same order.
col_check <- tibble(
  check  = "Column names identical across sheets",
  result = identical(colnames(rural), colnames(urban))
)

kableExtra::kable_styling(
  knitr::kable(col_check, caption = "Column alignment check"),
  bootstrap_options = c("striped", "hover", "condensed"),
  full_width        = FALSE
)
Column alignment check
check result
Column names identical across sheets FALSE
# ── Stack into one analysis file ──────────────────────────────────────────────
# Rows of both working copies, rural first.
df <- bind_rows(rural, urban)

# ── Coverage: n per state per sample type ─────────────────────────────────────
# One row per state, one count column per sample type; a state missing from a
# sample gets 0 rather than NA.
n_table <- count(df, state, sample_type) |>
  pivot_wider(
    names_from  = sample_type,
    values_from = n,
    values_fill = 0
  ) |>
  arrange(state)

kableExtra::kable_styling(
  knitr::kable(n_table,
               caption = "Raw N per state by sample type (unweighted)"),
  bootstrap_options = c("striped", "hover", "condensed"),
  full_width        = FALSE
)
Raw N per state by sample type (unweighted)
state Rural Urban
Andhra Pradesh 217 215
Assam 202 205
Bihar 203 200
Chandigarh 0 400
Chhattisgarh 210 205
Gujrat 203 201
Haryana 209 215
Himachal Pradesh 211 214
Jharkhand 217 211
Karnataka 217 219
Kerala 207 211
Madhya Pradesh 201 213
Maharashtra 211 204
NCT of Delhi 0 415
Odisha 201 206
Punjab 215 210
Rajasthan 227 218
Tamil Nadu 211 213
Telangana 215 208
Uttar Pradesh 205 213
Uttarkhand 201 201
West Bengal 204 204

Demographics

# ── Demographic recoding ───────────────────────────────────────────────────────
# Adds four factor columns derived from the numeric demographic codes. Any code
# outside the listed values becomes NA.
df <- df |>
  mutate(
    # Age: a2 codes 1-2, 3-4 and 5 collapse into three ordered levels.
    age_cohort = factor(
      case_when(
        a2 %in% c(1, 2) ~ "Young (18–35)",
        a2 %in% c(3, 4) ~ "Middle (36–60)",
        a2 == 5         ~ "Senior (60+)"
      ),
      levels = c("Young (18–35)", "Middle (36–60)", "Senior (60+)")
    ),

    # Gender: a3 codes 1 and 2.
    gender = factor(a3, levels = 1:2, labels = c("Man", "Woman")),

    # Education: a5 codes 1-4, 5-6 and 7-9 collapse into three levels.
    edu_cohort = factor(
      case_when(
        a5 %in% 1:4 ~ "Below Secondary",
        a5 %in% 5:6 ~ "Secondary / HSC",
        a5 %in% 7:9 ~ "Graduate & Above"
      ),
      levels = c("Below Secondary", "Secondary / HSC", "Graduate & Above")
    ),

    # Religion: a7 codes 1-6, labelled in code order.
    religion = factor(
      a7,
      levels = 1:6,
      labels = c("Hindu", "Muslim", "Sikh", "Christian", "Atheist", "Other")
    )
  )

# ── Verification table ────────────────────────────────────────────────────────
# Frequency table for one demographic column.
#
# df     Data frame holding the column.
# var    Unquoted column to tabulate.
# label  Text stored in the `variable` column of the result.
#
# Returns one row per non-missing category with its count `n` and `pct`, the
# share of non-missing rows in percent rounded to one decimal.
verify_demo <- function(df, var, label) {
  tallies <- df |>
    filter(!is.na({{ var }})) |>
    count(category = {{ var }})

  tallies |>
    mutate(
      pct      = round(n / sum(n) * 100, 1),
      variable = label
    ) |>
    select(variable, category, n, pct)
}

# Stack the four per-variable frequency tables into one.
demo_check <- list(
  verify_demo(df, age_cohort, "Age Cohort"),
  verify_demo(df, gender,     "Gender"),
  verify_demo(df, edu_cohort, "Education"),
  verify_demo(df, religion,   "Religion")
) |>
  bind_rows()

# Repeated values in the first column are merged into one top-aligned cell.
knitr::kable(demo_check,
             caption = "Demographic distributions (unweighted)") |>
  kableExtra::kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width        = FALSE
  ) |>
  kableExtra::collapse_rows(columns = 1, valign = "top")
Demographic distributions (unweighted)
variable category n pct
Age Cohort Young (18–35) 4121 44.9
Middle (36–60) 3863 42.0
Senior (60+) 1204 13.1
Gender Man 4663 50.8
Woman 4525 49.2
Education Below Secondary 3435 37.4
Secondary / HSC 4108 44.7
Graduate & Above 1645 17.9
Religion Hindu 7703 83.8
Muslim 921 10.0
Sikh 346 3.8
Christian 197 2.1
Atheist 1 0.0
Other 20 0.2

Discrimination and diversity

Rural

# ── Variable mapping: Discrimination / Diversity — RURAL ─────────────────────
# Named character vector: values are column names, names are the plot labels.
disc_vars_rural <- setNames(
  c("r28", "r29", "r30", "r31", "r32", "r33"),
  c(
    "Employer right\nnot to hire by religion",
    "Free to marry —\ndifferent religions",
    "Free to marry —\ndifferent castes",
    "Villagers exclude those\nwho eat certain foods",
    "Comfortable with diff.\nreligion neighbours",
    "Lower caste workers —\nsame water/toilets"
  )
)

# ── Clean helper: 1/2 = Agree, 3/4 = Disagree ────────────────────────────────
# Collapse a vector of response codes into two labels.
#
# x  Vector of response codes.
#
# Returns a character vector as long as `x`: "Agree" for codes 1 and 2,
# "Disagree" for codes 3 and 4, NA for everything else (including NA).
clean_agree <- function(x) {
  labelled <- rep(NA_character_, length(x))
  labelled[x %in% c(1, 2)] <- "Agree"
  labelled[x %in% c(3, 4)] <- "Disagree"
  labelled
}

# ── Compute % Agree per question by demographic ───────────────────────────────
# Agree/Disagree shares for a set of items, split by one grouping column.
#
# df         Data frame holding the item columns and the grouping column.
# vars       Named character vector: values are item column names, names are
#            the labels written to `question`.
# group_var  Unquoted grouping column.
#
# Returns the stacked per-item tables with columns `group`, `response`, `n`,
# `pct` (share within the group, in percent, one decimal) and `question`.
# Rows where the response or the group is NA are dropped before counting.
compute_disc <- function(df, vars, group_var) {
  grp <- enquo(group_var)

  tabulate_item <- function(v, label) {
    answered <- df |>
      mutate(response = clean_agree(.data[[v]])) |>
      filter(!is.na(response), !is.na(!!grp))

    answered |>
      count(group = !!grp, response) |>
      group_by(group) |>
      mutate(pct = round(n / sum(n) * 100, 1)) |>
      ungroup() |>
      mutate(question = label)
  }

  bind_rows(imap(vars, tabulate_item))
}

# ── Rural data only ───────────────────────────────────────────────────────────
rural_df <- filter(df, sample_type == "Rural")

# ── Age cohort — Rural ────────────────────────────────────────────────────────
# Agree/Disagree shares for the rural discrimination items, by age cohort.
disc_rural_age <- compute_disc(
  df        = rural_df,
  vars      = disc_vars_rural,
  group_var = age_cohort
)

# One facet per question; each bar stacks the Agree and Disagree percentages.
ggplot(
  data = disc_rural_age |>
    filter(!is.na(group)) |>
    mutate(question = str_wrap(question, 30)),
  mapping = aes(x = group, y = pct, fill = response)
) +
  geom_col(width = 0.6, position = "stack") +
  geom_text(
    # Only segments of at least 8% get a label; smaller ones stay empty.
    mapping  = aes(label = ifelse(pct >= 8, paste0(pct, "%"), "")),
    position = position_stack(vjust = 0.5),
    colour   = "white",
    fontface = "bold",
    size     = 3
  ) +
  facet_wrap(vars(question), nrow = 2) +
  scale_fill_manual(
    name   = NULL,
    values = c("Agree" = "#4E79A7", "Disagree" = "#E15759")
  ) +
  scale_y_continuous(labels = scales::label_percent(scale = 1)) +
  labs(
    x        = NULL,
    y        = "% of respondents",
    title    = "Discrimination & Diversity — Rural: by Age Cohort",
    subtitle = "India Civic Behaviour Survey — unweighted | 1+2 = Agree, 3+4 = Disagree"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title         = element_text(size = 13, face = "bold"),
    plot.subtitle      = element_text(size = 10, colour = "grey40"),
    strip.text         = element_text(size = 9, face = "bold"),
    panel.grid.major.x = element_blank(),
    legend.position    = "bottom"
  )

# ── Gender — Rural ────────────────────────────────────────────────────────────
# Agree/Disagree shares for the rural discrimination items, by gender.
disc_rural_gender <- compute_disc(
  df        = rural_df,
  vars      = disc_vars_rural,
  group_var = gender
)

# One facet per question; each bar stacks the Agree and Disagree percentages.
ggplot(
  data = disc_rural_gender |>
    filter(!is.na(group)) |>
    mutate(question = str_wrap(question, 30)),
  mapping = aes(x = group, y = pct, fill = response)
) +
  geom_col(width = 0.6, position = "stack") +
  geom_text(
    # Only segments of at least 8% get a label; smaller ones stay empty.
    mapping  = aes(label = ifelse(pct >= 8, paste0(pct, "%"), "")),
    position = position_stack(vjust = 0.5),
    colour   = "white",
    fontface = "bold",
    size     = 3
  ) +
  facet_wrap(vars(question), nrow = 2) +
  scale_fill_manual(
    name   = NULL,
    values = c("Agree" = "#4E79A7", "Disagree" = "#E15759")
  ) +
  scale_y_continuous(labels = scales::label_percent(scale = 1)) +
  labs(
    x        = NULL,
    y        = "% of respondents",
    title    = "Discrimination & Diversity — Rural: by Gender",
    subtitle = "India Civic Behaviour Survey — unweighted | 1+2 = Agree, 3+4 = Disagree"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title         = element_text(size = 13, face = "bold"),
    plot.subtitle      = element_text(size = 10, colour = "grey40"),
    strip.text         = element_text(size = 9, face = "bold"),
    panel.grid.major.x = element_blank(),
    legend.position    = "bottom"
  )

Urban

# ── Variable mapping: Discrimination / Diversity — URBAN ─────────────────────
# Named character vector: values are column names, names are the plot labels.
disc_vars_urban <- setNames(
  c("u28", "u29", "u30", "u31", "u32", "u33"),
  c(
    "Employer right\nnot to hire by religion",
    "Free to marry —\ndifferent religions",
    "Free to marry —\ndifferent castes",
    "Housing societies\nright to ban food",
    "Comfortable with diff.\nreligion neighbours",
    "Domestic help —\nsame bathroom"
  )
)

# Urban rows only.
urban_df <- filter(df, sample_type == "Urban")

# ── Age cohort — Urban ────────────────────────────────────────────────────────
# Agree/Disagree shares for the urban discrimination items, by age cohort.
disc_urban_age <- compute_disc(
  df        = urban_df,
  vars      = disc_vars_urban,
  group_var = age_cohort
)

# One facet per question; each bar stacks the Agree and Disagree percentages.
ggplot(
  data = disc_urban_age |>
    filter(!is.na(group)) |>
    mutate(question = str_wrap(question, 30)),
  mapping = aes(x = group, y = pct, fill = response)
) +
  geom_col(width = 0.6, position = "stack") +
  geom_text(
    # Only segments of at least 8% get a label; smaller ones stay empty.
    mapping  = aes(label = ifelse(pct >= 8, paste0(pct, "%"), "")),
    position = position_stack(vjust = 0.5),
    colour   = "white",
    fontface = "bold",
    size     = 3
  ) +
  facet_wrap(vars(question), nrow = 2) +
  scale_fill_manual(
    name   = NULL,
    values = c("Agree" = "#4E79A7", "Disagree" = "#E15759")
  ) +
  scale_y_continuous(labels = scales::label_percent(scale = 1)) +
  labs(
    x        = NULL,
    y        = "% of respondents",
    title    = "Discrimination & Diversity — Urban: by Age Cohort",
    subtitle = "India Civic Behaviour Survey — unweighted | 1+2 = Agree, 3+4 = Disagree"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title         = element_text(size = 13, face = "bold"),
    plot.subtitle      = element_text(size = 10, colour = "grey40"),
    strip.text         = element_text(size = 9, face = "bold"),
    panel.grid.major.x = element_blank(),
    legend.position    = "bottom"
  )

# ── Gender — Urban ────────────────────────────────────────────────────────────
# Agree/Disagree shares for the urban discrimination items, by gender.
disc_urban_gender <- compute_disc(
  df        = urban_df,
  vars      = disc_vars_urban,
  group_var = gender
)

# One facet per question; each bar stacks the Agree and Disagree percentages.
ggplot(
  data = disc_urban_gender |>
    filter(!is.na(group)) |>
    mutate(question = str_wrap(question, 30)),
  mapping = aes(x = group, y = pct, fill = response)
) +
  geom_col(width = 0.6, position = "stack") +
  geom_text(
    # Only segments of at least 8% get a label; smaller ones stay empty.
    mapping  = aes(label = ifelse(pct >= 8, paste0(pct, "%"), "")),
    position = position_stack(vjust = 0.5),
    colour   = "white",
    fontface = "bold",
    size     = 3
  ) +
  facet_wrap(vars(question), nrow = 2) +
  scale_fill_manual(
    name   = NULL,
    values = c("Agree" = "#4E79A7", "Disagree" = "#E15759")
  ) +
  scale_y_continuous(labels = scales::label_percent(scale = 1)) +
  labs(
    x        = NULL,
    y        = "% of respondents",
    title    = "Discrimination & Diversity — Urban: by Gender",
    subtitle = "India Civic Behaviour Survey — unweighted | 1+2 = Agree, 3+4 = Disagree"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title         = element_text(size = 13, face = "bold"),
    plot.subtitle      = element_text(size = 10, colour = "grey40"),
    strip.text         = element_text(size = 9, face = "bold"),
    panel.grid.major.x = element_blank(),
    legend.position    = "bottom"
  )

Gender attitudes

Rural

# ── Variable mapping: Gender Attitudes — RURAL ────────────────────────────────
# Named character vector: values are column names, names are the plot labels.
gender_vars_rural <- setNames(
  c("r21", "r22", "r23", "r24", "r25", "r26", "r27"),
  c(
    "Husband justified\nto beat wife",
    "Woman free to decide\non her earnings",
    "Male members take\nfinal household decisions",
    "Woman free to marry\nagainst parents' wishes",
    "Women vote as\nmale family members",
    "Women encouraged\nto work outside",
    "Daughters encouraged\nto study like sons"
  )
)

# ── Age cohort — Rural ────────────────────────────────────────────────────────
# Agree/Disagree shares for the rural gender-attitude items, by age cohort.
gender_rural_age <- compute_disc(
  df        = rural_df,
  vars      = gender_vars_rural,
  group_var = age_cohort
)

# One facet per question; each bar stacks the Agree and Disagree percentages.
ggplot(
  data = gender_rural_age |>
    filter(!is.na(group)) |>
    mutate(question = str_wrap(question, 30)),
  mapping = aes(x = group, y = pct, fill = response)
) +
  geom_col(width = 0.6, position = "stack") +
  geom_text(
    # Only segments of at least 8% get a label; smaller ones stay empty.
    mapping  = aes(label = ifelse(pct >= 8, paste0(pct, "%"), "")),
    position = position_stack(vjust = 0.5),
    colour   = "white",
    fontface = "bold",
    size     = 3
  ) +
  facet_wrap(vars(question), nrow = 2) +
  scale_fill_manual(
    name   = NULL,
    values = c("Agree" = "#4E79A7", "Disagree" = "#E15759")
  ) +
  scale_y_continuous(labels = scales::label_percent(scale = 1)) +
  labs(
    x        = NULL,
    y        = "% of respondents",
    title    = "Gender Attitudes — Rural: by Age Cohort",
    subtitle = "India Civic Behaviour Survey — unweighted | 1+2 = Agree, 3+4 = Disagree"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title         = element_text(size = 13, face = "bold"),
    plot.subtitle      = element_text(size = 10, colour = "grey40"),
    strip.text         = element_text(size = 9, face = "bold"),
    panel.grid.major.x = element_blank(),
    legend.position    = "bottom"
  )

# ── Gender — Rural ────────────────────────────────────────────────────────────
# Agree/Disagree shares for the rural gender-attitude items, by gender.
gender_rural_gender <- compute_disc(
  df        = rural_df,
  vars      = gender_vars_rural,
  group_var = gender
)

# One facet per question; each bar stacks the Agree and Disagree percentages.
ggplot(
  data = gender_rural_gender |>
    filter(!is.na(group)) |>
    mutate(question = str_wrap(question, 30)),
  mapping = aes(x = group, y = pct, fill = response)
) +
  geom_col(width = 0.6, position = "stack") +
  geom_text(
    # Only segments of at least 8% get a label; smaller ones stay empty.
    mapping  = aes(label = ifelse(pct >= 8, paste0(pct, "%"), "")),
    position = position_stack(vjust = 0.5),
    colour   = "white",
    fontface = "bold",
    size     = 3
  ) +
  facet_wrap(vars(question), nrow = 2) +
  scale_fill_manual(
    name   = NULL,
    values = c("Agree" = "#4E79A7", "Disagree" = "#E15759")
  ) +
  scale_y_continuous(labels = scales::label_percent(scale = 1)) +
  labs(
    x        = NULL,
    y        = "% of respondents",
    title    = "Gender Attitudes — Rural: by Gender",
    subtitle = "India Civic Behaviour Survey — unweighted | 1+2 = Agree, 3+4 = Disagree"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title         = element_text(size = 13, face = "bold"),
    plot.subtitle      = element_text(size = 10, colour = "grey40"),
    strip.text         = element_text(size = 9, face = "bold"),
    panel.grid.major.x = element_blank(),
    legend.position    = "bottom"
  )

Urban

# ── Variable mapping: Gender Attitudes — URBAN ────────────────────────────────
# Named character vector: values are column names, names are the plot labels.
gender_vars_urban <- setNames(
  c("u21", "u22", "u23", "u24", "u25", "u26", "u27"),
  c(
    "Husband justified\nto beat wife",
    "Woman free to decide\non her earnings",
    "Male members take\nfinal household decisions",
    "Woman free to marry\nagainst parents' wishes",
    "Women vote as\nmale family members",
    "Women encouraged\nto work outside",
    "Daughters encouraged\nto study like sons"
  )
)

# Urban rows only (same filter as used for the discrimination section).
urban_df <- filter(df, sample_type == "Urban")

# ── Age cohort — Urban ────────────────────────────────────────────────────────
# Agree/Disagree shares for the urban gender-attitude items, by age cohort.
gender_urban_age <- compute_disc(
  df        = urban_df,
  vars      = gender_vars_urban,
  group_var = age_cohort
)

# One facet per question; each bar stacks the Agree and Disagree percentages.
ggplot(
  data = gender_urban_age |>
    filter(!is.na(group)) |>
    mutate(question = str_wrap(question, 30)),
  mapping = aes(x = group, y = pct, fill = response)
) +
  geom_col(width = 0.6, position = "stack") +
  geom_text(
    # Only segments of at least 8% get a label; smaller ones stay empty.
    mapping  = aes(label = ifelse(pct >= 8, paste0(pct, "%"), "")),
    position = position_stack(vjust = 0.5),
    colour   = "white",
    fontface = "bold",
    size     = 3
  ) +
  facet_wrap(vars(question), nrow = 2) +
  scale_fill_manual(
    name   = NULL,
    values = c("Agree" = "#4E79A7", "Disagree" = "#E15759")
  ) +
  scale_y_continuous(labels = scales::label_percent(scale = 1)) +
  labs(
    x        = NULL,
    y        = "% of respondents",
    title    = "Gender Attitudes — Urban: by Age Cohort",
    subtitle = "India Civic Behaviour Survey — unweighted | 1+2 = Agree, 3+4 = Disagree"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title         = element_text(size = 13, face = "bold"),
    plot.subtitle      = element_text(size = 10, colour = "grey40"),
    strip.text         = element_text(size = 9, face = "bold"),
    panel.grid.major.x = element_blank(),
    legend.position    = "bottom"
  )

# ── Gender — Urban ────────────────────────────────────────────────────────────
# Agree/Disagree shares for the urban gender-attitude items, by gender.
gender_urban_gender <- compute_disc(
  df        = urban_df,
  vars      = gender_vars_urban,
  group_var = gender
)

# One facet per question; each bar stacks the Agree and Disagree percentages.
ggplot(
  data = gender_urban_gender |>
    filter(!is.na(group)) |>
    mutate(question = str_wrap(question, 30)),
  mapping = aes(x = group, y = pct, fill = response)
) +
  geom_col(width = 0.6, position = "stack") +
  geom_text(
    # Only segments of at least 8% get a label; smaller ones stay empty.
    mapping  = aes(label = ifelse(pct >= 8, paste0(pct, "%"), "")),
    position = position_stack(vjust = 0.5),
    colour   = "white",
    fontface = "bold",
    size     = 3
  ) +
  facet_wrap(vars(question), nrow = 2) +
  scale_fill_manual(
    name   = NULL,
    values = c("Agree" = "#4E79A7", "Disagree" = "#E15759")
  ) +
  scale_y_continuous(labels = scales::label_percent(scale = 1)) +
  labs(
    x        = NULL,
    y        = "% of respondents",
    title    = "Gender Attitudes — Urban: by Gender",
    subtitle = "India Civic Behaviour Survey — unweighted | 1+2 = Agree, 3+4 = Disagree"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title         = element_text(size = 13, face = "bold"),
    plot.subtitle      = element_text(size = 10, colour = "grey40"),
    strip.text         = element_text(size = 9, face = "bold"),
    panel.grid.major.x = element_blank(),
    legend.position    = "bottom"
  )