The task for this assignment was to recreate some graphs from the Data for India dashboard that use the Household Consumption Expenditure Survey 2022-23 data to calculate monthly per capita consumption expenditure.
library(tidyverse)
library(readxl)
library(haven)
library(forcats)
library(gridExtra)
library(Hmisc)
library(scales)
library(sf)
library(RColorBrewer)
setwd("~/Downloads/univ/sem 2/communicating econ/data assignment")
StateList <- read_excel("tabulation_state_code.xlsx") %>%
select(st, stn)
StateList$st <- as.numeric(StateList$st)
hces_level3 <- read_dta("HCESdta/hces22_lvl_3.dta")
hces_level14 <- read_dta("HCESdta/hces22_lvl_14.dta")
hces_level15 <- read_dta("HCESdta/hces22_lvl_15.dta")
hces_level3_new <- hces_level3 %>%
mutate(state = as.numeric(hces_level3$state)) %>%
mutate(Weights = mult / 100,
HHID = paste0(fsu, b1q1pt11, b1q1pt12),
StateName = factor(state,
levels = StateList$st,
labels = StateList$stn))
hces_level14_new <- hces_level14 %>%
mutate(state = as.numeric(hces_level14$state)) %>%
mutate(Weights = mult / 100,
HHID = paste0(fsu, b1q1pt11, b1q1pt12),
StateName = factor(state,
levels = StateList$st,
labels = StateList$stn))
hces_level15_new <- hces_level15 %>%
mutate(state = as.numeric(hces_level15$state)) %>%
mutate(Weights = mult / 100,
HHID = paste0(fsu, b1q1pt11, b1q1pt12),
StateName = factor(state,
levels = StateList$st,
labels = StateList$stn))
FoodSummary <- hces_level14_new %>%
filter(questionaire_no == "F")
ConsumablesSummary <- hces_level14_new %>%
filter(questionaire_no == "C")
DurablesSummary <- hces_level14_new %>%
filter(questionaire_no == "D")
Food1 <- FoodSummary %>%
filter(ba1b1c1_2 %in% c(129, 139, 159, 179)) %>%
group_by(HHID) %>%
summarise(
Sum1 = sum(ba1b1c1_3)
)
Food2 <- FoodSummary %>%
filter(ba1b1c1_2 %in% c(169, 219, 239, 249, 199, 189, 269, 279, 289, 299)) %>%
group_by(HHID) %>%
summarise(
Sum2 = sum(ba1b1c1_3 * (30/7))
)
Food <- full_join(Food1, Food2, by = "HHID")
Food[is.na(Food)] <- 0
Food <- Food %>%
mutate(FoodExpense = Sum1 + Sum2) %>%
select(HHID, FoodExpense)
# items with 30 day period
Consumables1 <- ConsumablesSummary %>%
filter(ba1b1c1_2 %in% c(349, 459, 479, 429, 519, 499, 439, 529)) %>%
group_by(HHID) %>%
summarise(Sum1 = sum(ba1b1c1_3))
# items with 7 day period
Consumables2 <- ConsumablesSummary %>%
filter(ba1b1c1_2 %in% c(309, 319, 329)) %>%
group_by(HHID) %>%
summarise(Sum2 = sum(ba1b1c1_3) * (30/7))
# items with 365 day period
Consumables3 <- ConsumablesSummary %>%
filter(ba1b1c1_2 %in% c(409, 419, 899)) %>%
group_by(HHID) %>%
summarise(Sum3 = sum(ba1b1c1_3) * (30/365))
Consumables <- Consumables1 %>%
full_join(Consumables2, by = "HHID") %>%
full_join(Consumables3, by = "HHID")
Consumables[is.na(Consumables)] <- 0
Consumables <- Consumables %>%
mutate(ConsumablesExpense = Sum1 + Sum2 + Sum3) %>%
select(HHID, ConsumablesExpense)
anyNA(Consumables)
## [1] FALSE
Durables <- DurablesSummary %>%
group_by(HHID) %>%
summarise(DurablesExpense = sum(ba1b1c1_3) * (30/365)) %>%
select(HHID, DurablesExpense)
Durables[is.na(Durables)] <- 0
anyNA(Durables)
## [1] FALSE
AllExpenses <- Food %>%
full_join(Consumables) %>%
full_join(Durables)
## Joining with `by = join_by(HHID)`
## Joining with `by = join_by(HHID)`
AllExpenses[is.na(AllExpenses)] <- 0
anyNA(AllExpenses)
## [1] FALSE
FoodHH <- hces_level15_new %>%
filter(questionaire_no == "F") %>%
select(HHID, ba2b2c2q9)
ConsumablesHH <- hces_level15_new %>%
filter(questionaire_no == "C") %>%
select(HHID, ba2b2c2q9)
DurablesHH <- hces_level15_new %>%
filter(questionaire_no == "D") %>%
select(HHID, ba2b2c2q9)
Additional <- hces_level15_new %>%
filter(questionaire_no == "F") %>%
select(HHID, Weights, sector, state, StateName)
AdditionalInfo <- Additional %>%
full_join(FoodHH, by = "HHID") %>%
rename(Household.size.F = ba2b2c2q9)
AdditionalInfo <- AdditionalInfo %>%
full_join(ConsumablesHH, by = "HHID") %>%
rename(Household.size.C = ba2b2c2q9)
AdditionalInfo <- AdditionalInfo %>%
full_join(DurablesHH, by = "HHID") %>%
rename(Household.size.D = ba2b2c2q9)
AdditionalInfo <- AdditionalInfo %>%
full_join(hces_level3_new, by = "HHID") %>%
rename(Household.size.FDQ = b2q2pt1)
anyNA(Additional)
## [1] FALSE
Results <- full_join(AllExpenses, AdditionalInfo)
## Joining with `by = join_by(HHID)`
anyNA(Results)
## [1] FALSE
Results <- Results %>%
mutate(
TE = FoodExpense + (ConsumablesExpense * (Household.size.F/Household.size.C)) + (DurablesExpense * (Household.size.F/Household.size.D)),
MPCE = TE / Household.size.F
)
sum(Results$TE * Results$Weights.x)/sum(Results$Household.size.F * Results$Weights.x)
## [1] 4534.269
SummaryTable1 <- Results %>%
group_by(sector.x) %>%
summarise(
MPCE_mean = sum(TE * Weights.x) / sum(Household.size.F * Weights.x),
MHCE_mean = sum(TE * Weights.x) / sum(Weights.x)
) %>%
mutate(Sector = ifelse(sector.x == 1, "Rural", "Urban"))
print(SummaryTable1)
## # A tibble: 2 × 4
## sector.x MPCE_mean MHCE_mean Sector
## <chr> <dbl> <dbl> <chr>
## 1 1 3773. 16839. Rural
## 2 2 6459. 24828. Urban
lorem ipsum
# plot 1
chart1 <- ggplot(SummaryTable1, aes(x = Sector, y = MPCE_mean)) +
geom_bar(stat = "identity", fill = "deepskyblue4") +
coord_flip() +
theme_minimal() +
geom_text(aes(label = paste0("Rs. ", round(MPCE_mean))),
vjust = 0.5,
hjust = 1.2,
color = "white") +
scale_y_continuous(labels = scales::comma, limits = c(0, 25000)) +
labs(title = "Monthly Per Capita Consumption Expenditure (mean)",
x = "",
y = "") +
theme(legend.position = "none",
plot.title = element_text(size = 16))
# plot 2
chart2 <- ggplot(SummaryTable1, aes(x = Sector, y = MHCE_mean)) +
geom_bar(stat = "identity", fill = "deepskyblue4") +
coord_flip() +
theme_minimal() +
geom_text(aes(label = paste0("Rs. ", round(MHCE_mean))),
vjust = 0.5,
hjust = 1.2,
color = "white") +
scale_y_continuous(labels = scales::comma) +
labs(title = "Monthly Household Consumption Expenditure (mean)",
x = "",
y = "") +
theme(legend.position = "none",
plot.title = element_text(size = 16))
grid.arrange(chart1, chart2)
rural_data <- Results %>%
filter(sector.x == 1)
urban_data <- Results %>%
filter(sector.x == 2)
rural_data <- rural_data %>%
mutate(Quintile = cut(MPCE,
breaks = wtd.quantile(MPCE, weights = Weights.x,
probs = seq(0, 1, 0.2)),
labels = c("0-20", "20-40", "40-60", "60-80", "80-100"),
include.lowest = TRUE))
urban_data <- urban_data %>%
mutate(Quintile = cut(MPCE,
breaks = wtd.quantile(MPCE, weights = Weights.x,
probs = seq(0, 1, 0.2)),
labels = c("0-20", "20-40", "40-60", "60-80", "80-100"),
include.lowest = TRUE))
Rural_Summary <- rural_data %>%
group_by(Quintile) %>%
summarise(meanMPCE_avg = sum(TE * Weights.x) / sum(Household.size.F * Weights.x))
Urban_Summary <- urban_data %>%
group_by(Quintile) %>%
summarise(meanMPCE_avg = sum(TE * Weights.x) / sum(Household.size.F * Weights.x))
Combined_Summary <- left_join(Rural_Summary, Urban_Summary, by = "Quintile", suffix = c("_Rural", "_Urban"))
# plot 3
ggplot(Combined_Summary) +
geom_point(aes(x = meanMPCE_avg_Rural, y = Quintile, color = "Rural"), size = 2) +
geom_point(aes(x = meanMPCE_avg_Urban, y = Quintile, color = "Urban"), size = 2) +
geom_text(aes(x = meanMPCE_avg_Rural, y = Quintile, label = paste0("Rs. ", round(meanMPCE_avg_Rural, 0))), hjust = -0.2) +
geom_text(aes(x = meanMPCE_avg_Urban, y = Quintile, label = paste0("Rs. ", round(meanMPCE_avg_Urban, 0))), hjust = -0.2) +
scale_color_manual(values = c("Rural" = "deepskyblue4", "Urban" = "orange")) +
labs(title = "Monthly Per Capita Expenditure by Consumption Quintile",
x = "", y = "", color = "") +
theme_minimal() +
theme(legend.position = "bottom")
MPCE_by_state <- Results %>%
group_by(state.x, StateName.x) %>%
summarise(
Mean_MPCE = sum(TE * Weights.x) / sum(Household.size.F * Weights.x),
.groups = 'drop'
) %>%
rename(state = state.x, StateName = StateName.x)
india_map <- st_read("in.shp")
## Reading layer `in' from data source
## `/Users/tahoorapu/Downloads/univ/sem 2/communicating econ/data assignment/in.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 36 features and 3 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 68.18625 ymin: 6.754256 xmax: 97.41163 ymax: 37.07827
## Geodetic CRS: WGS 84
state_codes_manual <- data.frame(
name = c("Andaman and Nicobar", "Telangana", "Andhra Pradesh", "Arunachal Pradesh",
"Assam", "Bihar", "Chandigarh", "Chhattisgarh",
"Dādra and Nagar Haveli and Damān and Diu", "Delhi", "Goa", "Gujarat",
"Haryana", "Himachal Pradesh", "Jharkhand", "Karnataka",
"Kerala", "Madhya Pradesh", "Maharashtra", "Manipur",
"Meghalaya", "Mizoram", "Nagaland", "Orissa",
"Puducherry", "Punjab", "Rajasthan", "Sikkim",
"Tamil Nadu", "Tripura", "Uttar Pradesh", "Uttaranchal",
"West Bengal", "Lakshadweep", "Jammu and Kashmir", "Ladakh"),
state = c(35, 36, 28, 12,
18, 10, 4, 22,
25, 7, 30, 24,
6, 2, 20, 29,
32, 23, 27, 14,
17, 15, 13, 21,
34, 3, 8, 11,
33, 16, 9, 5,
19, 31, 1, 37))
india_map <- india_map %>%
left_join(state_codes_manual, by = "name")
map_data <- left_join(india_map, MPCE_by_state)
# plot 5
ggplot(map_data) +
geom_sf(aes(fill = Mean_MPCE)) +
scale_fill_gradientn(colours = brewer.pal(9, "GnBu")[4:9],
name = "Monthly Per Capita\nExpenditure",
labels = scales::comma) +
theme_minimal() +
labs(title = "Average monthly per capita expenditure by state (2023)") +
theme(
plot.title = element_text(size = 16)
)