library(tidyverse)
library(readxl)
library(janitor)
library(survey)
library(knitr)
library(kableExtra)

# ── File path ─────────────────────────────────────────────────────────────────
# Absolute location of the survey workbook. Stop right away if it is not
# reachable instead of failing later inside read_excel().
datafile <- "D:/Populism and Democrary/India civic behaviour survey/Survey_Data.xlsx"
stopifnot(file.exists(datafile))

# ── Load raw sheets (never modified after this point) ─────────────────────────
# Worksheet 1 is read as the rural sample and worksheet 2 as the urban sample.
# Sheets are addressed by position, so the workbook's sheet order matters.
raw_rural <- read_excel(path = datafile, sheet = 1)
raw_urban <- read_excel(path = datafile, sheet = 2)

# ── Sanity check ──────────────────────────────────────────────────────────────
# One row per sheet: dimensions plus the number of rows whose `id` repeats an
# earlier row.
#   * duplicated() is summed so `dup_ids` is a real count; anyDuplicated() only
#     returns the position of the first repeat.
#   * NA is reported when a sheet has no `id` column, so a missing key cannot
#     pass as "0 duplicates".
dims <- tibble(
  sheet   = c("Rural", "Urban"),
  rows    = c(nrow(raw_rural), nrow(raw_urban)),
  cols    = c(ncol(raw_rural), ncol(raw_urban)),
  dup_ids = vapply(
    list(raw_rural, raw_urban),
    \(sheet) {
      if ("id" %in% names(sheet)) {
        sum(duplicated(sheet[["id"]]))
      } else {
        NA_integer_
      }
    },
    integer(1)
  )
)

kable(dims, caption = "Sanity check: datasets loaded (rows × cols)") |>
  kable_styling(full_width = FALSE,
                bootstrap_options = c("striped", "hover", "condensed"))

Sanity check: datasets loaded (rows × cols)
sheet	rows	cols	dup_ids
Rural	4187	51	0
Urban	5001	51	0

# ── Create working copies — raw_* objects remain untouched ────────────────────
#' Build the cleaned working copy of one raw sheet.
#'
#' Column names are normalised with `clean_names()`, a `sample_type` column is
#' added, and the demographic and item columns are coerced to numeric with the
#' codes 98 and 99 set to `NA`.
#'
#' @param df A raw sheet as returned by `read_excel()`.
#' @param sample_label Label stored in `sample_type` (e.g. "Rural", "Urban").
#' @return `df` with cleaned names, `sample_type`, and recoded columns.
make_workfile <- function(df, sample_label) {
  # Coerce to numeric, then blank out the 98 / 99 codes.
  recode_missing <- function(x) {
    x <- as.numeric(x)
    x[x %in% c(98, 99)] <- NA
    x
  }

  df |>
    clean_names() |>
    mutate(
      sample_type = sample_label,
      across(c(a2, a3, a4, a5, a6, a6a, a7, a7a, a8), recode_missing),
      # Item columns are r<number> on the rural sheet and u<number> on the
      # urban sheet. Matching "r or u followed by a digit" recodes both, and
      # leaves unrelated columns that merely start with "r" untouched.
      across(matches("^[ru][0-9]"), recode_missing)
    )
}

# Cleaned working copies of each sheet, tagged with their sample type.
rural <- raw_rural |> make_workfile("Rural")
urban <- raw_urban |> make_workfile("Urban")

# ── Column alignment check before binding ─────────────────────────────────────
# Single-row table stating whether both working copies have exactly the same
# column names in the same order.
# NOTE(review): this reports FALSE in the rendered output, presumably because
# item columns are named r* in the rural sheet and u* in the urban sheet.
# Confirm that these are the only differences.
col_check <- tibble(
  check  = "Column names identical across sheets",
  result = identical(names(rural), names(urban))
)

col_check |>
  kable(caption = "Column alignment check") |>
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width        = FALSE
  )

Column alignment check
check	result
Column names identical across sheets	FALSE

# ── Stack into one analysis file ──────────────────────────────────────────────
# Rural rows first, then urban. bind_rows() matches columns by name and fills
# columns that exist in only one input with NA.
df <- list(rural, urban) |>
  bind_rows()

# ── Coverage: n per state per sample type ─────────────────────────────────────
# Wide table with one row per state and one column per sample type. A state
# absent from a sample shows 0 rather than NA.
n_table <- count(df, state, sample_type) |>
  pivot_wider(values_from = n, names_from = sample_type, values_fill = 0) |>
  arrange(state)

n_table |>
  kable(caption = "Raw N per state by sample type (unweighted)") |>
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width        = FALSE
  )

Raw N per state by sample type (unweighted)
state	Rural	Urban
Andhra Pradesh	217	215
Assam	202	205
Bihar	203	200
Chandigarh	0	400
Chhattisgarh	210	205
Gujrat	203	201
Haryana	209	215
Himachal Pradesh	211	214
Jharkhand	217	211
Karnataka	217	219
Kerala	207	211
Madhya Pradesh	201	213
Maharashtra	211	204
NCT of Delhi	0	415
Odisha	201	206
Punjab	215	210
Rajasthan	227	218
Tamil Nadu	211	213
Telangana	215	208
Uttar Pradesh	205	213
Uttarkhand	201	201
West Bengal	204	204

Demographics

# ── Demographic recoding ───────────────────────────────────────────────────────
# Adds four labelled factors to `df`: age_cohort (from a2), gender (from a3),
# edu_cohort (from a5) and religion (from a7). Codes outside the listed ranges
# become NA.
df <- df |>
  mutate(
    # Age: codes 1–2, 3–4 and 5 map to three cohorts, levels in ascending order.
    # NOTE(review): the labels "Middle (36–60)" and "Senior (60+)" both include
    # age 60. Confirm the cut-off against the questionnaire.
    age_cohort = factor(
      case_when(
        a2 %in% c(1, 2) ~ "Young (18–35)",
        a2 %in% c(3, 4) ~ "Middle (36–60)",
        a2 == 5         ~ "Senior (60+)",
        TRUE            ~ NA_character_
      ),
      levels = c("Young (18–35)", "Middle (36–60)", "Senior (60+)")
    ),

    # Gender: code 1 = Man, code 2 = Woman.
    gender = factor(a3, levels = 1:2, labels = c("Man", "Woman")),

    # Education: codes 1–4, 5–6 and 7–9 map to three bands.
    edu_cohort = factor(
      case_when(
        a5 %in% 1:4 ~ "Below Secondary",
        a5 %in% 5:6 ~ "Secondary / HSC",
        a5 %in% 7:9 ~ "Graduate & Above",
        TRUE        ~ NA_character_
      ),
      levels = c("Below Secondary", "Secondary / HSC", "Graduate & Above")
    ),

    # Religion: codes 1–6, labelled in code order.
    religion = factor(
      a7,
      levels = 1:6,
      labels = c("Hindu", "Muslim", "Sikh", "Christian", "Atheist", "Other")
    )
  )

# ── Verification table ────────────────────────────────────────────────────────
#' Tabulate one demographic variable for the verification table.
#'
#' @param df Data frame holding the variable.
#' @param var Unquoted column to tabulate.
#' @param label Text stored in the `variable` column of the result.
#' @return One row per non-missing category with columns `variable`,
#'   `category`, `n` and `pct` (share of non-missing rows, one decimal).
verify_demo <- function(df, var, label) {
  # Missing values are dropped before counting, so percentages are computed
  # over valid answers only.
  tallied <- df |>
    filter(!is.na({{ var }})) |>
    count(category = {{ var }})

  tallied |>
    mutate(
      pct      = round(n / sum(n) * 100, 1),
      variable = label
    ) |>
    relocate(variable)
}

# Stack the four per-variable summaries into one long verification table.
demo_check <- list(
  verify_demo(df, age_cohort, "Age Cohort"),
  verify_demo(df, gender,     "Gender"),
  verify_demo(df, edu_cohort, "Education"),
  verify_demo(df, religion,   "Religion")
) |>
  bind_rows()

# Render the table; collapse_rows() is asked to merge repeated labels in the
# first column.
demo_check |>
  kable(caption = "Demographic distributions (unweighted)") |>
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width        = FALSE
  ) |>
  collapse_rows(columns = 1, valign = "top")

Demographic distributions (unweighted)
variable	category	n	pct
Age Cohort	Young (18–35)	4121	44.9
	Middle (36–60)	3863	42.0
	Senior (60+)	1204	13.1
Gender	Man	4663	50.8
Gender	Woman	4525	49.2
Education	Below Secondary	3435	37.4
	Secondary / HSC	4108	44.7
	Graduate & Above	1645	17.9
Religion	Hindu	7703	83.8
	Muslim	921	10.0
	Sikh	346	3.8
	Christian	197	2.1
	Atheist	1	0.0
	Other	20	0.2

Discrimination and diversity

Rural

# ── Variable mapping: Discrimination / Diversity — RURAL ─────────────────────
# Written as item code → facet label so it reads in questionnaire order, then
# flipped so the final vector has labels as names and column names as values.
disc_vars_rural <- c(
  r28 = "Employer right\nnot to hire by religion",
  r29 = "Free to marry —\ndifferent religions",
  r30 = "Free to marry —\ndifferent castes",
  r31 = "Villagers exclude those\nwho eat certain foods",
  r32 = "Comfortable with diff.\nreligion neighbours",
  r33 = "Lower caste workers —\nsame water/toilets"
)
disc_vars_rural <- setNames(names(disc_vars_rural), unname(disc_vars_rural))

# ── Clean helper: 1/2 = Agree, 3/4 = Disagree ────────────────────────────────
#' Collapse a four-point response code into Agree / Disagree.
#'
#' @param x Vector of response codes.
#' @return Character vector the same length as `x`: "Agree" for codes 1 and 2,
#'   "Disagree" for codes 3 and 4, `NA` for anything else (including `NA`).
clean_agree <- function(x) {
  # Start from all-missing and fill in only the recognised codes.
  collapsed <- rep(NA_character_, length(x))
  collapsed[x %in% c(1, 2)] <- "Agree"
  collapsed[x %in% c(3, 4)] <- "Disagree"
  collapsed
}

# ── Compute % Agree per question by demographic ───────────────────────────────
#' Agree / Disagree shares for a set of items, split by one grouping variable.
#'
#' @param df Data frame containing the item columns and the grouping column.
#' @param vars Named character vector: names are question labels, values are
#'   the item column names in `df`.
#' @param group_var Unquoted grouping column.
#' @return One row per question × group × response with columns `group`,
#'   `response`, `n`, `pct` (share within the group, one decimal) and
#'   `question`. Rows with a missing response or group are excluded.
compute_disc <- function(df, vars, group_var) {
  # Summarise a single item; called with (column name, label) by imap_dfr().
  summarise_item <- function(item_col, item_label) {
    answered <- df |>
      mutate(response = clean_agree(.data[[item_col]])) |>
      filter(!is.na(response), !is.na({{ group_var }}))

    answered |>
      count(group = {{ group_var }}, response) |>
      group_by(group) |>
      mutate(pct = round(n / sum(n) * 100, 1)) |>
      ungroup() |>
      mutate(question = item_label)
  }

  imap_dfr(vars, summarise_item)
}

# ── Rural data only ───────────────────────────────────────────────────────────
rural_df <- filter(df, sample_type == "Rural")

# ── Age cohort — Rural ────────────────────────────────────────────────────────
# Stacked Agree / Disagree bars per age cohort, one facet per rural item.
disc_rural_age <- compute_disc(rural_df, disc_vars_rural, age_cohort)

disc_rural_age |>
  drop_na(group) |>
  mutate(question = str_wrap(question, width = 30)) |>
  ggplot(mapping = aes(x = group, y = pct, fill = response)) +
  geom_col(width = 0.6, position = position_stack()) +
  geom_text(
    # Segments below 8% are left unlabelled.
    mapping  = aes(label = ifelse(pct >= 8, paste0(pct, "%"), "")),
    position = position_stack(vjust = 0.5),
    colour   = "white",
    fontface = "bold",
    size     = 3
  ) +
  facet_wrap(vars(question), nrow = 2) +
  scale_fill_manual(
    name   = NULL,
    values = c("Agree" = "#4E79A7", "Disagree" = "#E15759")
  ) +
  scale_y_continuous(labels = scales::label_percent(scale = 1)) +
  labs(
    title    = "Discrimination & Diversity — Rural: by Age Cohort",
    subtitle = "India Civic Behaviour Survey — unweighted | 1+2 = Agree, 3+4 = Disagree",
    x        = NULL,
    y        = "% of respondents"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title         = element_text(size = 13, face = "bold"),
    plot.subtitle      = element_text(size = 10, colour = "grey40"),
    strip.text         = element_text(size = 9, face = "bold"),
    panel.grid.major.x = element_blank(),
    legend.position    = "bottom"
  )

# ── Gender — Rural ────────────────────────────────────────────────────────────
# Stacked Agree / Disagree bars per gender, one facet per rural item.
disc_rural_gender <- compute_disc(rural_df, disc_vars_rural, gender)

disc_rural_gender |>
  drop_na(group) |>
  mutate(question = str_wrap(question, width = 30)) |>
  ggplot(mapping = aes(x = group, y = pct, fill = response)) +
  geom_col(width = 0.6, position = position_stack()) +
  geom_text(
    # Segments below 8% are left unlabelled.
    mapping  = aes(label = ifelse(pct >= 8, paste0(pct, "%"), "")),
    position = position_stack(vjust = 0.5),
    colour   = "white",
    fontface = "bold",
    size     = 3
  ) +
  facet_wrap(vars(question), nrow = 2) +
  scale_fill_manual(
    name   = NULL,
    values = c("Agree" = "#4E79A7", "Disagree" = "#E15759")
  ) +
  scale_y_continuous(labels = scales::label_percent(scale = 1)) +
  labs(
    title    = "Discrimination & Diversity — Rural: by Gender",
    subtitle = "India Civic Behaviour Survey — unweighted | 1+2 = Agree, 3+4 = Disagree",
    x        = NULL,
    y        = "% of respondents"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title         = element_text(size = 13, face = "bold"),
    plot.subtitle      = element_text(size = 10, colour = "grey40"),
    strip.text         = element_text(size = 9, face = "bold"),
    panel.grid.major.x = element_blank(),
    legend.position    = "bottom"
  )

Urban

# ── Variable mapping: Discrimination / Diversity — URBAN ─────────────────────
# Written as item code → facet label so it reads in questionnaire order, then
# flipped so the final vector has labels as names and column names as values.
disc_vars_urban <- c(
  u28 = "Employer right\nnot to hire by religion",
  u29 = "Free to marry —\ndifferent religions",
  u30 = "Free to marry —\ndifferent castes",
  u31 = "Housing societies\nright to ban food",
  u32 = "Comfortable with diff.\nreligion neighbours",
  u33 = "Domestic help —\nsame bathroom"
)
disc_vars_urban <- setNames(names(disc_vars_urban), unname(disc_vars_urban))

# Urban respondents only.
urban_df <- filter(df, sample_type == "Urban")

# ── Age cohort — Urban ────────────────────────────────────────────────────────
# Stacked Agree / Disagree bars per age cohort, one facet per urban item.
disc_urban_age <- compute_disc(urban_df, disc_vars_urban, age_cohort)

disc_urban_age |>
  drop_na(group) |>
  mutate(question = str_wrap(question, width = 30)) |>
  ggplot(mapping = aes(x = group, y = pct, fill = response)) +
  geom_col(width = 0.6, position = position_stack()) +
  geom_text(
    # Segments below 8% are left unlabelled.
    mapping  = aes(label = ifelse(pct >= 8, paste0(pct, "%"), "")),
    position = position_stack(vjust = 0.5),
    colour   = "white",
    fontface = "bold",
    size     = 3
  ) +
  facet_wrap(vars(question), nrow = 2) +
  scale_fill_manual(
    name   = NULL,
    values = c("Agree" = "#4E79A7", "Disagree" = "#E15759")
  ) +
  scale_y_continuous(labels = scales::label_percent(scale = 1)) +
  labs(
    title    = "Discrimination & Diversity — Urban: by Age Cohort",
    subtitle = "India Civic Behaviour Survey — unweighted | 1+2 = Agree, 3+4 = Disagree",
    x        = NULL,
    y        = "% of respondents"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title         = element_text(size = 13, face = "bold"),
    plot.subtitle      = element_text(size = 10, colour = "grey40"),
    strip.text         = element_text(size = 9, face = "bold"),
    panel.grid.major.x = element_blank(),
    legend.position    = "bottom"
  )

# ── Gender — Urban ────────────────────────────────────────────────────────────
# Stacked Agree / Disagree bars per gender, one facet per urban item.
disc_urban_gender <- compute_disc(urban_df, disc_vars_urban, gender)

disc_urban_gender |>
  drop_na(group) |>
  mutate(question = str_wrap(question, width = 30)) |>
  ggplot(mapping = aes(x = group, y = pct, fill = response)) +
  geom_col(width = 0.6, position = position_stack()) +
  geom_text(
    # Segments below 8% are left unlabelled.
    mapping  = aes(label = ifelse(pct >= 8, paste0(pct, "%"), "")),
    position = position_stack(vjust = 0.5),
    colour   = "white",
    fontface = "bold",
    size     = 3
  ) +
  facet_wrap(vars(question), nrow = 2) +
  scale_fill_manual(
    name   = NULL,
    values = c("Agree" = "#4E79A7", "Disagree" = "#E15759")
  ) +
  scale_y_continuous(labels = scales::label_percent(scale = 1)) +
  labs(
    title    = "Discrimination & Diversity — Urban: by Gender",
    subtitle = "India Civic Behaviour Survey — unweighted | 1+2 = Agree, 3+4 = Disagree",
    x        = NULL,
    y        = "% of respondents"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title         = element_text(size = 13, face = "bold"),
    plot.subtitle      = element_text(size = 10, colour = "grey40"),
    strip.text         = element_text(size = 9, face = "bold"),
    panel.grid.major.x = element_blank(),
    legend.position    = "bottom"
  )

Gender attitudes

Rural

# ── Variable mapping: Gender Attitudes — RURAL ────────────────────────────────
# Written as item code → facet label so it reads in questionnaire order, then
# flipped so the final vector has labels as names and column names as values.
gender_vars_rural <- c(
  r21 = "Husband justified\nto beat wife",
  r22 = "Woman free to decide\non her earnings",
  r23 = "Male members take\nfinal household decisions",
  r24 = "Woman free to marry\nagainst parents' wishes",
  r25 = "Women vote as\nmale family members",
  r26 = "Women encouraged\nto work outside",
  r27 = "Daughters encouraged\nto study like sons"
)
gender_vars_rural <- setNames(names(gender_vars_rural), unname(gender_vars_rural))

# ── Age cohort — Rural ────────────────────────────────────────────────────────
# Stacked Agree / Disagree bars per age cohort, one facet per rural
# gender-attitude item.
gender_rural_age <- compute_disc(rural_df, gender_vars_rural, age_cohort)

gender_rural_age |>
  drop_na(group) |>
  mutate(question = str_wrap(question, width = 30)) |>
  ggplot(mapping = aes(x = group, y = pct, fill = response)) +
  geom_col(width = 0.6, position = position_stack()) +
  geom_text(
    # Segments below 8% are left unlabelled.
    mapping  = aes(label = ifelse(pct >= 8, paste0(pct, "%"), "")),
    position = position_stack(vjust = 0.5),
    colour   = "white",
    fontface = "bold",
    size     = 3
  ) +
  facet_wrap(vars(question), nrow = 2) +
  scale_fill_manual(
    name   = NULL,
    values = c("Agree" = "#4E79A7", "Disagree" = "#E15759")
  ) +
  scale_y_continuous(labels = scales::label_percent(scale = 1)) +
  labs(
    title    = "Gender Attitudes — Rural: by Age Cohort",
    subtitle = "India Civic Behaviour Survey — unweighted | 1+2 = Agree, 3+4 = Disagree",
    x        = NULL,
    y        = "% of respondents"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title         = element_text(size = 13, face = "bold"),
    plot.subtitle      = element_text(size = 10, colour = "grey40"),
    strip.text         = element_text(size = 9, face = "bold"),
    panel.grid.major.x = element_blank(),
    legend.position    = "bottom"
  )

# ── Gender — Rural ────────────────────────────────────────────────────────────
# Stacked Agree / Disagree bars per gender, one facet per rural
# gender-attitude item.
gender_rural_gender <- compute_disc(rural_df, gender_vars_rural, gender)

gender_rural_gender |>
  drop_na(group) |>
  mutate(question = str_wrap(question, width = 30)) |>
  ggplot(mapping = aes(x = group, y = pct, fill = response)) +
  geom_col(width = 0.6, position = position_stack()) +
  geom_text(
    # Segments below 8% are left unlabelled.
    mapping  = aes(label = ifelse(pct >= 8, paste0(pct, "%"), "")),
    position = position_stack(vjust = 0.5),
    colour   = "white",
    fontface = "bold",
    size     = 3
  ) +
  facet_wrap(vars(question), nrow = 2) +
  scale_fill_manual(
    name   = NULL,
    values = c("Agree" = "#4E79A7", "Disagree" = "#E15759")
  ) +
  scale_y_continuous(labels = scales::label_percent(scale = 1)) +
  labs(
    title    = "Gender Attitudes — Rural: by Gender",
    subtitle = "India Civic Behaviour Survey — unweighted | 1+2 = Agree, 3+4 = Disagree",
    x        = NULL,
    y        = "% of respondents"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title         = element_text(size = 13, face = "bold"),
    plot.subtitle      = element_text(size = 10, colour = "grey40"),
    strip.text         = element_text(size = 9, face = "bold"),
    panel.grid.major.x = element_blank(),
    legend.position    = "bottom"
  )

Urban

# ── Variable mapping: Gender Attitudes — URBAN ────────────────────────────────
# Written as item code → facet label so it reads in questionnaire order, then
# flipped so the final vector has labels as names and column names as values.
gender_vars_urban <- c(
  u21 = "Husband justified\nto beat wife",
  u22 = "Woman free to decide\non her earnings",
  u23 = "Male members take\nfinal household decisions",
  u24 = "Woman free to marry\nagainst parents' wishes",
  u25 = "Women vote as\nmale family members",
  u26 = "Women encouraged\nto work outside",
  u27 = "Daughters encouraged\nto study like sons"
)
gender_vars_urban <- setNames(names(gender_vars_urban), unname(gender_vars_urban))

# Urban respondents only (same filter as applied earlier in the script).
urban_df <- filter(df, sample_type == "Urban")

# ── Age cohort — Urban ────────────────────────────────────────────────────────
# Stacked Agree / Disagree bars per age cohort, one facet per urban
# gender-attitude item.
gender_urban_age <- compute_disc(urban_df, gender_vars_urban, age_cohort)

gender_urban_age |>
  drop_na(group) |>
  mutate(question = str_wrap(question, width = 30)) |>
  ggplot(mapping = aes(x = group, y = pct, fill = response)) +
  geom_col(width = 0.6, position = position_stack()) +
  geom_text(
    # Segments below 8% are left unlabelled.
    mapping  = aes(label = ifelse(pct >= 8, paste0(pct, "%"), "")),
    position = position_stack(vjust = 0.5),
    colour   = "white",
    fontface = "bold",
    size     = 3
  ) +
  facet_wrap(vars(question), nrow = 2) +
  scale_fill_manual(
    name   = NULL,
    values = c("Agree" = "#4E79A7", "Disagree" = "#E15759")
  ) +
  scale_y_continuous(labels = scales::label_percent(scale = 1)) +
  labs(
    title    = "Gender Attitudes — Urban: by Age Cohort",
    subtitle = "India Civic Behaviour Survey — unweighted | 1+2 = Agree, 3+4 = Disagree",
    x        = NULL,
    y        = "% of respondents"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title         = element_text(size = 13, face = "bold"),
    plot.subtitle      = element_text(size = 10, colour = "grey40"),
    strip.text         = element_text(size = 9, face = "bold"),
    panel.grid.major.x = element_blank(),
    legend.position    = "bottom"
  )

# ── Gender — Urban ────────────────────────────────────────────────────────────
# Stacked Agree / Disagree bars per gender, one facet per urban
# gender-attitude item.
gender_urban_gender <- compute_disc(urban_df, gender_vars_urban, gender)

gender_urban_gender |>
  drop_na(group) |>
  mutate(question = str_wrap(question, width = 30)) |>
  ggplot(mapping = aes(x = group, y = pct, fill = response)) +
  geom_col(width = 0.6, position = position_stack()) +
  geom_text(
    # Segments below 8% are left unlabelled.
    mapping  = aes(label = ifelse(pct >= 8, paste0(pct, "%"), "")),
    position = position_stack(vjust = 0.5),
    colour   = "white",
    fontface = "bold",
    size     = 3
  ) +
  facet_wrap(vars(question), nrow = 2) +
  scale_fill_manual(
    name   = NULL,
    values = c("Agree" = "#4E79A7", "Disagree" = "#E15759")
  ) +
  scale_y_continuous(labels = scales::label_percent(scale = 1)) +
  labs(
    title    = "Gender Attitudes — Urban: by Gender",
    subtitle = "India Civic Behaviour Survey — unweighted | 1+2 = Agree, 3+4 = Disagree",
    x        = NULL,
    y        = "% of respondents"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title         = element_text(size = 13, face = "bold"),
    plot.subtitle      = element_text(size = 10, colour = "grey40"),
    strip.text         = element_text(size = 9, face = "bold"),
    panel.grid.major.x = element_blank(),
    legend.position    = "bottom"
  )

India civic behaviour

Arslan

2026-03-16

Demographics

Discrimination and diversity

Rural

Urban

Gender attitudes

Rural

Urban