HCES 2022-23

The task for this assignment was to recreate some graphs from the Data for India dashboard that use the Household Consumption Expenditure Survey 2022-23 data to calculate monthly per capita consumption expenditure.

library(tidyverse)
library(readxl)
library(haven)
library(forcats)
library(gridExtra)
library(Hmisc)
library(scales)
library(sf)
library(RColorBrewer)

setwd("~/Downloads/univ/sem 2/communicating econ/data assignment")

loading data

StateList <- read_excel("tabulation_state_code.xlsx") %>% 
  select(st, stn) 

StateList$st <- as.numeric(StateList$st)

hces_level3 <- read_dta("HCESdta/hces22_lvl_3.dta")
hces_level14 <- read_dta("HCESdta/hces22_lvl_14.dta")
hces_level15 <- read_dta("HCESdta/hces22_lvl_15.dta")

data wrangling

hces_level3_new <- hces_level3 %>% 
  mutate(state = as.numeric(hces_level3$state)) %>%
  mutate(Weights = mult / 100,
         HHID = paste0(fsu, b1q1pt11, b1q1pt12),
         StateName = factor(state,
                            levels = StateList$st,
                            labels = StateList$stn)) 

hces_level14_new <- hces_level14 %>% 
  mutate(state = as.numeric(hces_level14$state)) %>%
  mutate(Weights = mult / 100,
         HHID = paste0(fsu, b1q1pt11, b1q1pt12),
         StateName = factor(state,
                            levels = StateList$st,
                            labels = StateList$stn)) 

hces_level15_new <- hces_level15 %>% 
  mutate(state = as.numeric(hces_level15$state)) %>%
  mutate(Weights = mult / 100,
         HHID = paste0(fsu, b1q1pt11, b1q1pt12),
         StateName = factor(state,
                            levels = StateList$st,
                            labels = StateList$stn)) 

separating data

FoodSummary <- hces_level14_new %>% 
  filter(questionaire_no == "F")

ConsumablesSummary <- hces_level14_new %>% 
  filter(questionaire_no == "C")

DurablesSummary <- hces_level14_new %>% 
  filter(questionaire_no == "D")

food expenses

Food1 <- FoodSummary %>% 
  filter(ba1b1c1_2 %in% c(129, 139, 159, 179)) %>% 
  group_by(HHID) %>% 
  summarise(
    Sum1 = sum(ba1b1c1_3)
  )

Food2 <- FoodSummary %>% 
  filter(ba1b1c1_2 %in% c(169, 219, 239, 249, 199, 189, 269, 279, 289, 299)) %>% 
  group_by(HHID) %>% 
  summarise(
    Sum2 = sum(ba1b1c1_3 * (30/7))
  )

Food <- full_join(Food1, Food2, by = "HHID")
Food[is.na(Food)] <- 0
Food <- Food %>% 
  mutate(FoodExpense = Sum1 + Sum2) %>% 
  select(HHID, FoodExpense)

consumables expenses

# items with 30 day period
Consumables1 <- ConsumablesSummary %>% 
  filter(ba1b1c1_2 %in% c(349, 459, 479, 429, 519, 499, 439, 529)) %>% 
  group_by(HHID) %>% 
  summarise(Sum1 = sum(ba1b1c1_3))

# items with 7 day period
Consumables2 <- ConsumablesSummary %>% 
  filter(ba1b1c1_2 %in% c(309, 319, 329)) %>% 
  group_by(HHID) %>% 
  summarise(Sum2 = sum(ba1b1c1_3) * (30/7))

# items with 365 day period
Consumables3 <- ConsumablesSummary %>% 
  filter(ba1b1c1_2 %in% c(409, 419, 899)) %>% 
  group_by(HHID) %>% 
  summarise(Sum3 = sum(ba1b1c1_3) * (30/365))

Consumables <- Consumables1 %>%
  full_join(Consumables2, by = "HHID") %>%
  full_join(Consumables3, by = "HHID")

Consumables[is.na(Consumables)] <- 0

Consumables <- Consumables %>% 
  mutate(ConsumablesExpense = Sum1 + Sum2 + Sum3) %>% 
  select(HHID, ConsumablesExpense)

anyNA(Consumables)
## [1] FALSE

durables expenses

Durables <- DurablesSummary %>% 
  group_by(HHID) %>% 
  summarise(DurablesExpense = sum(ba1b1c1_3) * (30/365)) %>% 
  select(HHID, DurablesExpense)

Durables[is.na(Durables)] <- 0

anyNA(Durables)
## [1] FALSE

merging all expenses

AllExpenses <- Food %>% 
  full_join(Consumables) %>% 
  full_join(Durables)
## Joining with `by = join_by(HHID)`
## Joining with `by = join_by(HHID)`
AllExpenses[is.na(AllExpenses)] <- 0
anyNA(AllExpenses)
## [1] FALSE

household sizes

FoodHH <- hces_level15_new %>% 
  filter(questionaire_no == "F") %>% 
  select(HHID, ba2b2c2q9)

ConsumablesHH <- hces_level15_new %>% 
  filter(questionaire_no == "C") %>% 
  select(HHID, ba2b2c2q9)

DurablesHH <- hces_level15_new %>% 
  filter(questionaire_no == "D") %>% 
  select(HHID, ba2b2c2q9)

Additional <- hces_level15_new %>% 
  filter(questionaire_no == "F") %>% 
  select(HHID, Weights, sector, state, StateName)

AdditionalInfo <- Additional %>% 
  full_join(FoodHH, by = "HHID") %>% 
  rename(Household.size.F = ba2b2c2q9)

AdditionalInfo <- AdditionalInfo %>% 
  full_join(ConsumablesHH, by = "HHID") %>% 
  rename(Household.size.C = ba2b2c2q9)

AdditionalInfo <- AdditionalInfo %>% 
  full_join(DurablesHH, by = "HHID") %>% 
  rename(Household.size.D = ba2b2c2q9) 

AdditionalInfo <- AdditionalInfo %>% 
  full_join(hces_level3_new, by = "HHID") %>% 
  rename(Household.size.FDQ = b2q2pt1)

anyNA(Additional)
## [1] FALSE

final merge

Results <- full_join(AllExpenses, AdditionalInfo)
## Joining with `by = join_by(HHID)`
anyNA(Results)
## [1] FALSE
Results <- Results %>%
  mutate(
    TE = FoodExpense + (ConsumablesExpense * (Household.size.F/Household.size.C)) + (DurablesExpense * (Household.size.F/Household.size.D)),
    MPCE = TE / Household.size.F
  )

sum(Results$TE * Results$Weights.x)/sum(Results$Household.size.F * Results$Weights.x)
## [1] 4534.269
SummaryTable1 <- Results %>%
  group_by(sector.x) %>%
  summarise(
    MPCE_mean = sum(TE * Weights.x) / sum(Household.size.F * Weights.x),
    MHCE_mean = sum(TE * Weights.x) / sum(Weights.x)
  ) %>%
  mutate(Sector = ifelse(sector.x == 1, "Rural", "Urban"))

print(SummaryTable1)
## # A tibble: 2 × 4
##   sector.x MPCE_mean MHCE_mean Sector
##   <chr>        <dbl>     <dbl> <chr> 
## 1 1            3773.    16839. Rural 
## 2 2            6459.    24828. Urban

household and per capita monthly expenditure (2023)

lorem ipsum

# plot 1
chart1 <- ggplot(SummaryTable1, aes(x = Sector, y = MPCE_mean)) +
  geom_bar(stat = "identity", fill = "deepskyblue4") +
  coord_flip() +
  theme_minimal() +
  geom_text(aes(label = paste0("Rs. ", round(MPCE_mean))),
            vjust = 0.5, 
            hjust = 1.2, 
            color = "white") +
  scale_y_continuous(labels = scales::comma, limits = c(0, 25000)) +
  labs(title = "Monthly Per Capita Consumption Expenditure (mean)",
       x = "",
       y = "") +
  theme(legend.position = "none",
        plot.title = element_text(size = 16))

# plot 2
chart2 <- ggplot(SummaryTable1, aes(x = Sector, y = MHCE_mean)) +
  geom_bar(stat = "identity", fill = "deepskyblue4") +
  coord_flip() +
  theme_minimal() +
  geom_text(aes(label = paste0("Rs. ", round(MHCE_mean))),
            vjust = 0.5,
            hjust = 1.2,
            color = "white") +
  scale_y_continuous(labels = scales::comma) +
  labs(title = "Monthly Household Consumption Expenditure (mean)",
       x = "",
       y = "") +
    theme(legend.position = "none",
          plot.title = element_text(size = 16))

grid.arrange(chart1, chart2)

Monthly per capita expenditure among consumption classes

rural_data <- Results %>% 
  filter(sector.x == 1)

urban_data <- Results %>% 
  filter(sector.x == 2)

rural_data <- rural_data %>% 
  mutate(Quintile = cut(MPCE,
                        breaks = wtd.quantile(MPCE, weights = Weights.x,
                                              probs = seq(0, 1, 0.2)),
                        labels = c("0-20", "20-40", "40-60", "60-80", "80-100"),
                        include.lowest = TRUE))

urban_data <- urban_data %>% 
  mutate(Quintile = cut(MPCE,
                        breaks = wtd.quantile(MPCE, weights = Weights.x,
                                              probs = seq(0, 1, 0.2)),
                        labels = c("0-20", "20-40", "40-60", "60-80", "80-100"),
                        include.lowest = TRUE))

Rural_Summary <- rural_data %>%
  group_by(Quintile) %>%
  summarise(meanMPCE_avg = sum(TE * Weights.x) / sum(Household.size.F * Weights.x))

Urban_Summary <- urban_data %>%
  group_by(Quintile) %>%
  summarise(meanMPCE_avg = sum(TE * Weights.x) / sum(Household.size.F * Weights.x))

Combined_Summary <- left_join(Rural_Summary, Urban_Summary, by = "Quintile", suffix = c("_Rural", "_Urban"))

# plot 3
ggplot(Combined_Summary) +
  geom_point(aes(x = meanMPCE_avg_Rural, y = Quintile, color = "Rural"), size = 2) +
  geom_point(aes(x = meanMPCE_avg_Urban, y = Quintile, color = "Urban"), size = 2) +
  geom_text(aes(x = meanMPCE_avg_Rural, y = Quintile, label = paste0("Rs. ", round(meanMPCE_avg_Rural, 0))), hjust = -0.2) +
  geom_text(aes(x = meanMPCE_avg_Urban, y = Quintile, label = paste0("Rs. ", round(meanMPCE_avg_Urban, 0))), hjust = -0.2) +
  scale_color_manual(values = c("Rural" = "deepskyblue4", "Urban" = "orange")) +
  labs(title = "Monthly Per Capita Expenditure by Consumption Quintile",
       x = "", y = "", color = "") +
  theme_minimal() +
  theme(legend.position = "bottom")

Average monthly per capita expenditure by social group and religion (2023)

# Filter the Results data for selected religions
Results_filtered <- Results %>%
  filter(b4q4pt11 %in% c(1, 2, 3, 4, 5, 6))

# Compute MPCE by religion using weighted mean
MPCE_by_religion <- Results_filtered %>%
  group_by(b4q4pt11) %>%
  summarise(Mean_MPCE = sum(TE * Weights.x) / sum(Household.size.F * Weights.x))

MPCE_by_religion <- MPCE_by_religion %>% 
  mutate(
    Religion = recode(b4q4pt11,
                      "1" = "Hinduism",
                      "2" = "Islam",
                      "3" = "Christianity",
                      "4" = "Sikhism",
                      "5" = "Jainism",
                      "6" = "Buddhism"))

# plot 3
ggplot(MPCE_by_religion, aes(x = fct_reorder(Religion, Mean_MPCE), y = Mean_MPCE)) +
  geom_bar(stat = "identity", fill = "deepskyblue4") +
  coord_flip() +
  theme_minimal() +
  geom_text(aes(label = paste0("Rs. ", round(Mean_MPCE))),
            vjust = 0.5,
            hjust = 1.2,
            color = "white") +
  scale_y_continuous(labels = scales::comma) +
  labs(title = "Average monthly per capita expenditure by religion (2023)",
       x = "",
       y = "") +
  theme(legend.position = "none",
        plot.title = element_text(size = 16)) 

Results_filtered2 <- Results %>% 
  filter(b4q4pt12 %in% c(1, 2, 3, 9))

MPCE_by_social_group <- Results_filtered2 %>%
  group_by(b4q4pt12) %>% 
  summarise(Mean_MPCE = sum(TE * Weights.x) / sum(Household.size.F * Weights.x))

MPCE_by_social_group <- MPCE_by_social_group %>% 
  mutate(
    Social_group = recode(b4q4pt12,
                          "1" = "Scheduled Tribes (ST)",
                          "2" = "Scheduled Castes (SC)",
                          "3" = "Other Backward Classes (OBC)",
                          "9" = "Others (General)")
    )

# plot 4
ggplot(MPCE_by_social_group, aes(x = fct_reorder(Social_group, Mean_MPCE), y = Mean_MPCE)) +
  geom_bar(stat = "identity", fill = "deepskyblue4") +
  coord_flip() +
  theme_minimal() +
  geom_text(aes(label = paste0("Rs. ", round(Mean_MPCE))),
            vjust = 0.5,
            hjust = 1.2,
            color = "white"
            ) +
  scale_y_continuous(labels = scales::comma) +
  scale_y_continuous(labels = scales::comma, limits = c(0, 6000)) +
  labs(title = "Average monthly per capita expenditure by social group (2023)",
       x = "",
       y = "") +
  theme(legend.position = "none",
        plot.title = element_text(size = 16)) 

Average monthly per capita expenditure by state (2023)

MPCE_by_state <- Results %>%
  group_by(state.x, StateName.x) %>%
  summarise(
    Mean_MPCE = sum(TE * Weights.x) / sum(Household.size.F * Weights.x),
    .groups = 'drop'
  ) %>%
  rename(state = state.x, StateName = StateName.x)

india_map <- st_read("in.shp")
## Reading layer `in' from data source 
##   `/Users/tahoorapu/Downloads/univ/sem 2/communicating econ/data assignment/in.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 36 features and 3 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 68.18625 ymin: 6.754256 xmax: 97.41163 ymax: 37.07827
## Geodetic CRS:  WGS 84
state_codes_manual <- data.frame(
  name = c("Andaman and Nicobar", "Telangana", "Andhra Pradesh", "Arunachal Pradesh",
           "Assam", "Bihar", "Chandigarh", "Chhattisgarh",
           "Dādra and Nagar Haveli and Damān and Diu", "Delhi", "Goa", "Gujarat",
           "Haryana", "Himachal Pradesh", "Jharkhand", "Karnataka",
           "Kerala", "Madhya Pradesh", "Maharashtra", "Manipur",
           "Meghalaya", "Mizoram", "Nagaland", "Orissa",
           "Puducherry", "Punjab", "Rajasthan", "Sikkim",
           "Tamil Nadu", "Tripura", "Uttar Pradesh", "Uttaranchal",
           "West Bengal", "Lakshadweep", "Jammu and Kashmir", "Ladakh"),
  state = c(35, 36, 28, 12,
            18, 10, 4, 22, 
            25, 7, 30, 24,
            6, 2, 20, 29,
            32, 23, 27, 14,
            17, 15, 13, 21,
            34, 3, 8, 11,
            33, 16, 9, 5, 
            19, 31, 1, 37))

india_map <- india_map %>% 
  left_join(state_codes_manual, by = "name")

map_data <- left_join(india_map, MPCE_by_state)

# plot 5
ggplot(map_data) +
  geom_sf(aes(fill = Mean_MPCE)) +
  scale_fill_gradientn(colours = brewer.pal(9, "GnBu")[4:9],
                       name = "Monthly Per Capita\nExpenditure",
                       labels = scales::comma) +
  theme_minimal() +
  labs(title = "Average monthly per capita expenditure by state (2023)") +
  theme(
    plot.title = element_text(size = 16)
  )