# Read in HHS data
fastfield <- read_csv(here("data", "raw", "hhs_fastfield.csv"))
#kobo_1 <- read_excel(here("data", "raw", "hhs_kobo_mod_1.xlsx")) # Same as fp
kobo_1_fp <- read_csv(here("data", "raw", "hhs_fp.csv"))
#kobo_2 <- read_excel(here("data", "raw", "hhs_kobo_mod_2.xlsx")) # Same as fp (divided for hon and phi)
# devtools::install_github("datadotworld/data.world-r")
# devtools::install_github("datadotworld/dwapi-r")
dwapi::configure(auth_token = "eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiJyLWFuZC1yLXN0dWRpbzptYXJpYW5vdml6IiwiaXNzIjoiY2xpZW50OnItYW5kLXItc3R1ZGlvOmFnZW50Om1hcmlhbm92aXo6OjEyYzRkOTYzLTk0NDctNDg5Ni04ZmM0LTQ5YTM1ZWQ2MWQ0NSIsImlhdCI6MTcyODUwNTU0MCwicm9sZSI6WyJ1c2VyX2FwaV9hZG1pbiIsInVzZXJfYXBpX3JlYWQiLCJ1c2VyX2FwaV93cml0ZSJdLCJnZW5lcmFsLXB1cnBvc2UiOnRydWUsInNhbWwiOnt9fQ.MSVsDGnQK4y3WWiWYrDluIYsojOmmbrJC-PeFSgZY7qDq63pIobopiDTHcaCX8LG7pwuBC5HBQrO8Kk8ifahpQ")
sql_stmt <- qry_sql("SELECT * FROM hhs_fp_hnd")  
kobo_2_fp_hon <- data.world::query(
  sql_stmt, "rare/household-surveys"
)
kobo_2_fp_phi <- read_csv(here("data", "raw", "hhs_fp_phl.csv"))
#kobo_3 <- read_excel(here("data", "raw", "hhs_kobo_mod_3.xlsx")) #Check which sheets are relevant!


# Read in Income Data

# World Bank Data
wb_median_income <- read.csv("https://ourworldindata.org/grapher/daily-median-income.csv?v=1&csvType=full&useColumnShortNames=true")
wb_median_income <- wb_median_income %>% 
  filter(Entity == c("Brazil", "Honduras", "Indonesia", "Mozambique", "Philippines"))



# Honduras
hon_2023 <- read_excel(here("data", "raw","hh_income","hon", "10.-Cuadros-de-Ingreso-marzo-2023.xlsx"), sheet = "C01", range = "D15:E15", col_names = FALSE) # Rural Income! (in Honduran Lempira)
colnames(hon_2023) <- c("household_size", "per_capita_income")
hon_2023$year <- 2023

hon_2019 <- read_excel(here("data", "raw","hh_income", "hon", "5.-Cuadros-de-Ingreso-2019.xlsx"), sheet = "C01", range = "D15:F15", col_names = FALSE) # Rural Income! (in Honduran Lempira)
hon_2019 <- hon_2019[, c(1, 3)]
colnames(hon_2019) <- c("household_size", "per_capita_income")
hon_2019$year <- 2019

hon_2021 <- read_excel(here("data", "raw","hh_income", "hon", "4.-Cuadros-de-Mercado-Laboral-octubre-2021.xlsx"), sheet = "C05", range = "B15", col_names = FALSE) # Rural Income! (in Honduran Lempira)
colnames(hon_2021) <- c("per_capita_income")
household_size_value <- (hon_2023[[1, 1]]+hon_2019[[1, 1]])/2
hon_2021$household_size <- household_size_value
hon_2021$year <- 2021

hon_hh_income <- bind_rows(hon_2019, hon_2021, hon_2023)

# Add the new column for household income
hon_hh_income <- hon_hh_income %>%
  mutate(household_income_rural = household_size * per_capita_income)



# Brazil:
# 2019 Average Household Size: 3.0 people (as per IBGE data).
# 2022 Average Household Size: 2.79 people (as per IBGE data).
# 2023 Average Household Size (estimated based on the decline 2019/2022 of 0.21 people over 3 years): 2.75 people (estimated)
bra_2023 <- read_excel(here("data", "raw","hh_income","bra", "Brazil per capita monthly nominal household earnings - 2019_2023.xlsx"), sheet = "2023", range = "B3", col_names = FALSE)
colnames(bra_2023) <- "per_capita_income"
bra_2023$household_size <- 2.75
bra_2023$year <- 2023

bra_2022 <- read_excel(here("data", "raw","hh_income","bra", "Brazil per capita monthly nominal household earnings - 2019_2023.xlsx"), sheet = "2022", range = "B3", col_names = FALSE)
colnames(bra_2022) <- "per_capita_income"
bra_2022$household_size <- 2.79
bra_2022$year <- 2022

bra_2019 <- read_excel(here("data", "raw","hh_income","bra", "Brazil per capita monthly nominal household earnings - 2019_2023.xlsx"), sheet = "2019", range = "B3", col_names = FALSE)
colnames(bra_2019) <- "per_capita_income"
bra_2019$household_size <- 3
bra_2019$year <- 2019


bra_hh_income <- bind_rows(bra_2019, bra_2022, bra_2023)
bra_hh_income <- bra_hh_income %>%
  mutate(household_income = household_size * per_capita_income)


# Indonesia
# For this case since it's expressed in Average of Net Income per Month of Casual Worker, we would need not only Average Household Size (which is easy to get) but also the Number of Income Earners per Household (info about this is really limited). So, for this exercise we assume that the casual worker is the sole income earner in the household; i.e., the household income would be equivalent to the worker's net monthly income.

ind_2019 <- read_excel(here("data", "raw","hh_income","ind", "Average of Net Income per Month of Casual Worker) by Province and Main Industry (thousand rupiahs), 2019.xlsx"), sheet = "3.3.1", range = "C51:G51", col_names = FALSE)
ind_2019 <- ind_2019[, c(1, 5)]
ind_2019$average <- rowMeans(ind_2019, na.rm = TRUE)
ind_2019 <- ind_2019[, 3]
colnames(ind_2019) <- "casual_worker_income"
ind_2019$year <- 2019


ind_2020 <- read_excel(here("data", "raw","hh_income","ind", "Average of Net Income per Month of Casual Worker) by Province and Main Industry (thousand rupiahs), 2020.xlsx"), sheet = "ENG", range = "C51:G51", col_names = FALSE)
ind_2020 <- ind_2020[, c(1, 5)]
ind_2020$average <- rowMeans(ind_2020, na.rm = TRUE)
ind_2020 <- ind_2020[, 3]
colnames(ind_2020) <- "casual_worker_income"
ind_2020$year <- 2020

ind_2021 <- read_excel(here("data", "raw","hh_income","ind", "Average of Net Income per Month of Casual Worker) by Province and Main Industry (thousand rupiahs), 2021.xlsx"), sheet = "Eng", range = "C51:G51", col_names = FALSE)
ind_2021 <- ind_2021[, c(1, 5)]
ind_2021$average <- rowMeans(ind_2021, na.rm = TRUE)
ind_2021 <- ind_2021[, 3]
colnames(ind_2021) <- "casual_worker_income"
ind_2021$year <- 2021

ind_2023 <- read_excel(here("data", "raw","hh_income","ind", "Average of Net Income per Month of Casual Worker) by Province and Main Industry (thousand rupiahs), 2023.xlsx"), sheet = "Eng", range = "C51", col_names = FALSE)
colnames(ind_2023) <- "casual_worker_income"
ind_2023$year <- 2023

ind_2024 <- read_excel(here("data", "raw","hh_income","ind", "Average of Net Income per Month of Casual Worker) by Province and Main Industry (thousand rupiahs), 2024.xlsx"), sheet = "Eng", range = "C54", col_names = FALSE)
colnames(ind_2024) <- "casual_worker_income"
ind_2024$year <- 2024

ind_hh_income <- bind_rows(ind_2019, ind_2020, ind_2021, ind_2023, ind_2024)
ind_hh_income <- ind_hh_income %>%
  mutate(household_income = casual_worker_income * 1000) #


# Phillipines
# Average Annual Family Income (in thousand PhP)
phi_hh_income <- read_excel(here("data", "raw","hh_income","phi", "Average Annual Family Income (in thousand PhP).xlsx"), sheet = "data", range = "A3:B5", col_names = FALSE)
colnames(phi_hh_income) <- c("household_income", "year")

phi_hh_income <- phi_hh_income %>%
  mutate(household_income = (household_income * 1000)/12)
# Data Wrangling

# Sites
fastfield$ma_name <- tolower(gsub(" ", "_", fastfield$maa))
kobo_1_fp$ma_name <- tolower(gsub(" ", "_", kobo_1_fp$ma_name))
kobo_2_fp_hon$ma_name <- tolower(gsub(" ", "_", kobo_2_fp_hon$ma_name))
kobo_2_fp_phi$ma_name <- tolower(gsub(" ", "_", kobo_2_fp_phi$level4_name))

fastfield <- fastfield %>% 
  filter(!is.na(ma_name))

kobo_1_fp <- kobo_1_fp %>% 
  filter(!is.na(ma_name))

kobo_2_fp_hon <- kobo_2_fp_hon %>% 
  filter(!is.na(ma_name))

kobo_2_fp_phi <- kobo_2_fp_phi %>% 
  filter(!is.na(ma_name))

# Get years
kobo_1_fp$year <- year(ymd(kobo_1_fp$submission_time)) 
kobo_2_fp_hon$year <- year(ymd(kobo_2_fp_hon$submission_time))
kobo_2_fp_phi$year <- year(ymd(kobo_2_fp_phi$submission_time))

# Country
fastfield <- fastfield %>% 
  rename(country = iso3)

fastfield <- fastfield %>%
  mutate(country = recode(country,
                          "HND" = "Honduras",
                          "BRA" = "Brazil",
                          "FSM" = "Federated States of Micronesia",
                          "IDN" = "Indonesia",
                          "PLW" = "Palau",
                          "GTM" = "Guatemala",
                          "MOZ" = "Mozambique",
                          "PHL" = "Philippines"))


kobo_1_fp <- kobo_1_fp %>%
  filter(!is.na(country)) %>% # Remove rows with NA in 'country'
  mutate(country = recode(country,
                          "HND" = "Honduras",
                          "BRA" = "Brazil",
                          "FSM" = "Federated States of Micronesia",
                          "IDN" = "Indonesia",
                          "PLW" = "Palau",
                          "MOZ" = "Mozambique",
                          "PHL" = "Philippines"))

kobo_2_fp_hon <- kobo_2_fp_hon %>%
  mutate(country = recode(country,
                          "HND" = "Honduras"))

kobo_2_fp_phi <- kobo_2_fp_phi %>%
  mutate(country = recode(country,
                          "PHL" = "Philippines"))
# Q83

# Find data
colnames(fastfield)[grepl("83", colnames(fastfield))]
#> [1] "83_hh_average_income"
colnames(kobo_1_fp)[grepl("average_income", colnames(kobo_1_fp))] # 83_hh_average_income
#> [1] "83_hh_average_income"
colnames(kobo_2_fp_hon)[grepl("average_income", colnames(kobo_2_fp_hon))] # 57_hh_average_income
#> [1] "57_hh_average_income"
colnames(kobo_2_fp_phi)[grepl("average_income", colnames(kobo_2_fp_phi))]
#> [1] "57_hh_average_income"

# Rename
kobo_2_fp_hon <- kobo_2_fp_hon %>% 
  rename('83_hh_average_income' = '57_hh_average_income')
kobo_2_fp_phi <- kobo_2_fp_phi %>% 
  rename('83_hh_average_income' = '57_hh_average_income')

# Answer in local currency
fastfield_83 <- fastfield %>% 
  select(country, year, '83_hh_average_income')
kobo_1_fp_83 <- kobo_1_fp %>% 
  select(country, year, '83_hh_average_income')
kobo_2_fp_hon_83 <- kobo_2_fp_hon %>% 
  select(country, year, '83_hh_average_income')
kobo_2_fp_phi_83 <- kobo_2_fp_phi %>% 
  select(country, year, '83_hh_average_income')


# Combine data for question 83
q83 <- bind_rows(
  fastfield_83 %>% select(year, country, `83_hh_average_income`),
  kobo_1_fp_83 %>% select(year, country, `83_hh_average_income`),
  kobo_2_fp_hon_83 %>% select(year, country, `83_hh_average_income`),
  kobo_2_fp_phi_83 %>% select(year, country, `83_hh_average_income`))

q83<- q83 %>% 
  filter(complete.cases(.)) 
# Honduras

q83_hon <- q83 %>% 
  filter(country == "Honduras") %>%
  filter(`83_hh_average_income` < quantile(`83_hh_average_income`, 0.99, na.rm = TRUE))  # Remove values above the 99th percentile (errors)



# Recalculate the trend with filtered data
q83_hon_trend <- q83_hon %>%
  group_by(year) %>%
  summarize(avg_income = mean(`83_hh_average_income`, na.rm = TRUE))

# Plot with filtered data
hon <- ggplot() +
  geom_violin(data = q83_hon, aes(x = as.factor(year), y = `83_hh_average_income`), 
              fill = "#BEBADA", alpha = 0.5) +
  geom_line(data = q83_hon_trend, aes(x = as.factor(year), y = avg_income, group = 1, color = "Average HH Income for FF"), 
            size = 0.5) +
  geom_point(data = q83_hon_trend, aes(x = as.factor(year), y = avg_income, color = "Average HH Income for FF"), 
             size = 2) +
  geom_line(data = hon_hh_income, aes(x = as.factor(year), y = household_income_rural, group = 1, color = "Average HH Rural Income (Source: INE)"), 
            size = 0.5) +
  geom_point(data = hon_hh_income, aes(x = as.factor(year), y = household_income_rural, color = "Average HH Rural Income (Source: INE)"), 
             size = 2) +
  labs(
    title = "Trend of Average Monthly HH Income (Q83) for Honduras",
    x = "Year",
    y = "Average Monthly Income per Household (HNL)",
    color = ""  # Blank legend title
  ) +
  theme_minimal() +
  theme(legend.position = "bottom") +
  scale_color_manual(values = c("Average HH Income for FF" = "black", 
                                "Average HH Rural Income (Source: INE)" = "red"))



# show_col(brewer_pal(palette = "Set3")(6)) #To find the colour used in the original violin plot

# DiD
ff_data <- q83_hon_trend %>%
  mutate(group = 1, income = avg_income) %>%
  select(year, group, income) %>% 
  filter(year!= 2024)

rural_data <- hon_hh_income %>%
  mutate(group = 0, income = household_income_rural) %>%
  select(year, group, income)

did_data <- bind_rows(ff_data, rural_data)

did_data <- did_data %>%
  mutate(treatment = ifelse(year >= 2021, 1, 0))

did_model <- lm(income ~ group * treatment, data = did_data)

did_hon <- summary(did_model)

# The interaction term group:treatment (DiD estimate) measures the differential effect of the program on Fish Forever households compared to rural households over time.



# 2019 to 2021
did_data_2019_2021 <- did_data %>% filter(year %in% c(2019, 2021))
did_model_2019_2021 <- lm(income ~ group, data = did_data_2019_2021)
summary(did_model_2019_2021)
#> 
#> Call:
#> lm(formula = income ~ group, data = did_data_2019_2021)
#> 
#> Residuals:
#>     1     2     3     4 
#>  1115 -1115 -3843  3843 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)  
#> (Intercept)    11715       2830   4.140   0.0537 .
#> group          -6658       4002  -1.664   0.2381  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 4002 on 2 degrees of freedom
#> Multiple R-squared:  0.5805, Adjusted R-squared:  0.3708 
#> F-statistic: 2.768 on 1 and 2 DF,  p-value: 0.2381

# 2021 to 2023
did_data_2021_2023 <- did_data %>% filter(year %in% c(2021, 2023)) 
did_model_2021_2023 <- lm(income ~ group, data = did_data_2021_2023)
summary(did_model_2021_2023)
#> 
#> Call:
#> lm(formula = income ~ group, data = did_data_2021_2023)
#> 
#> Residuals:
#>       1       2       3       4 
#> -1593.5  1593.5   880.3  -880.3 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)   
#> (Intercept)    14678       1287  11.402   0.0076 **
#> group          -9142       1820  -5.022   0.0374 * 
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 1820 on 2 degrees of freedom
#> Multiple R-squared:  0.9265, Adjusted R-squared:  0.8898 
#> F-statistic: 25.22 on 1 and 2 DF,  p-value: 0.03744


# - The Difference-in-Differences approach estimates the interaction effect (group:treatment), which captures the relative impact of the program (FF vs. Rural households) after the treatment year (2019).
# 
# - The interaction term group:treatment (DiD estimate) measures the differential effect of the program on FF households compared to rural households over time.
# 
# **Key Coefficients:**
# - Intercept = 7872 : Baseline income for rural households before the program (2019).
# - group = -1700 : Fish Forever households had 1700 HNL lower income than rural households before the program.
# - treatment = 6806 : Rural households' income increased by 6806 HNL post-program.
# - group:treatment (Interaction) = -7443: FF households experienced a relative decrease of 7443 HNL compared to rural households after the program, though this result is not statistically significant (p=0.14).
# 
# 
# **Significance:**
# - The p-value for the interaction term suggests limited evidence of a statistically significant effect of the program.
# Fixed effects mode
fe_model <- feols(income ~ group + year, data = did_data)
fe_hon <- summary(fe_model)



# - This model uses fixed effects to capture the average differences between FF and rural households (group) and year-specific trends (year).
# 
# **Key Coefficients:**
# - group = -6661 : FF households, on average, had 6661 HNL lower income than rural households, independent of time.
# - year = 860 : Average income increased by 860 HNL per year, reflecting the general time trend across both groups.
# 
# 
# **Significance:**
# - The group coefficient (p=0.069) suggests marginal evidence that FF households had consistently lower income than rural households.
# - The year coefficient (p=0.326) suggests no strong evidence of a significant yearly trend in income.
# 
# 
# 
# 
# KEY PROBLEM: very small sample size (which limits the ability to get valid models)! 
# 
# SOLUTION:
# - Adding more years (e.g., pre-2019 or post-2023 data).
# - Including more observations!! --> Here we are working with the averages and not individual-level samples!!
#     Individual-level samples (rather than the averages):
#     - Higher statistical power
#     - Accurate std errors (with averages we ignore the within-group variation in income, underestimating the std errors)
#     - Robustness (weight observations appropriately if needed-e.g. if groups or years have different sample sizes)
#     
# (- Also: increase data comparability --> include additional filtering: income source, hh location, or other demographics)
# Brazil

q83_bra <- q83 %>% 
  filter(country == "Brazil") %>%
  filter(`83_hh_average_income` < quantile(`83_hh_average_income`, 0.99, na.rm = TRUE))  # Remove values above the 99th percentile (errors)



# Recalculate the trend with filtered data
q83_bra_trend <- q83_bra %>%
  group_by(year) %>%
  summarize(avg_income = mean(`83_hh_average_income`, na.rm = TRUE))

# Plot with filtered data
bra <- ggplot() +
  geom_violin(data = q83_bra, aes(x = as.factor(year), y = `83_hh_average_income`), 
              fill = "#8DD3C7", alpha = 0.5) +
  geom_line(data = q83_bra_trend, aes(x = as.factor(year), y = avg_income, group = 1, color = "Average HH Income for FF"), 
            size = 0.5) +
  geom_point(data = q83_bra_trend, aes(x = as.factor(year), y = avg_income, color = "Average HH Income for FF"), 
             size = 2) +
  geom_line(data = bra_hh_income, aes(x = as.factor(year), y = household_income, group = 1, color = "Average HH Income (Source: IBGE)"), 
            size = 0.5) +
  geom_point(data = bra_hh_income, aes(x = as.factor(year), y = household_income, color = "Average HH Income (Source: IBGE)"), 
             size = 2) +
  labs(
    title = "Trend of Average Monthly HH Income (Q83) for Brazil",
    x = "Year",
    y = "Average Monthly Income per Household (BRL)",
    color = ""  # Blank legend title
  ) +
  theme_minimal() +
  theme(legend.position = "bottom") +
  scale_color_manual(values = c("Average HH Income for FF" = "black", 
                                "Average HH Income (Source: IBGE)" = "red"))


# show_col(brewer_pal(palette = "Set3")(6)) #To find the colour used in the original violin plot
# Indonesia

q83_ind <- q83 %>% 
  filter(country == "Indonesia") %>%
  filter(`83_hh_average_income` < quantile(`83_hh_average_income`, 0.99, na.rm = TRUE))  # Remove values above the 99th percentile (errors)



# Recalculate the trend with filtered data
q83_ind_trend <- q83_ind %>%
  group_by(year) %>%
  summarize(avg_income = mean(`83_hh_average_income`, na.rm = TRUE))

q83_ind$year <- factor(q83_ind$year, levels = as.character(2018:2024))
q83_ind_trend$year <- factor(q83_ind_trend$year, levels = as.character(2018:2024))
ind_hh_income$year <- factor(ind_hh_income$year, levels = as.character(2018:2024))

# Plot
ind <- ggplot() +
  geom_violin(data = q83_ind, aes(x = year, y = `83_hh_average_income`), 
              fill = "#FB8072", alpha = 0.5) +
  geom_line(data = q83_ind_trend, aes(x = year, y = avg_income, group = 1, color = "Average HH Income for FF"), 
            size = 0.5) +
  geom_point(data = q83_ind_trend, aes(x = year, y = avg_income, color = "Average HH Income for FF"), 
             size = 2) +
  geom_line(data = ind_hh_income, aes(x = year, y = household_income, group = 1, color = "Average HH Agricultural Income (Source: BPS)"), 
            size = 0.5) +
  geom_point(data = ind_hh_income, aes(x = year, y = household_income, color = "Average HH Agricultural Income (Source: BPS)"), 
             size = 2) +
  labs(
    title = "Trend of Average Monthly HH Income (Q83) for Indonesia",
    x = "Year",
    y = "Average Monthly Income per Household (IDR)",
    color = ""  # Blank legend title
  ) +
  scale_y_continuous(labels = scales::label_comma()) +  # Use `scales::label_comma()` for formatted y-axis
  theme_minimal() +
  theme(legend.position = "bottom") +
  scale_color_manual(values = c("Average HH Income for FF" = "black", 
                                "Average HH Agricultural Income (Source: BPS)" = "red")) +
  scale_x_discrete(drop = FALSE)


# show_col(brewer_pal(palette = "Set3")(6)) 
# Philippines

q83_phi <- q83 %>% 
  filter(country == "Philippines") %>%
  filter(`83_hh_average_income` < quantile(`83_hh_average_income`, 0.99, na.rm = TRUE))  # Remove values above the 99th percentile (errors)



# Recalculate the trend with filtered data
q83_phi_trend <- q83_phi %>%
  group_by(year) %>%
  summarize(avg_income = mean(`83_hh_average_income`, na.rm = TRUE))

# Plot with filtered data

q83_phi$year <- factor(q83_phi$year, levels = as.character(2018:2024))
q83_phi_trend$year <- factor(q83_phi_trend$year, levels = as.character(2018:2024))
phi_hh_income$year <- factor(phi_hh_income$year, levels = as.character(2018:2024))

# Plot
phi <- ggplot() +
  geom_violin(data = q83_phi, aes(x = year, y = `83_hh_average_income`), 
              fill = "#FDB462", alpha = 0.5) +
  geom_line(data = q83_phi_trend, aes(x = year, y = avg_income, group = 1, color = "Average HH Income for FF"), 
            size = 0.5) +
  geom_point(data = q83_phi_trend, aes(x = year, y = avg_income, color = "Average HH Income for FF"), 
             size = 2) +
  geom_line(data = phi_hh_income, aes(x = year, y = household_income, group = 1, color = "Average HH Income (Source: PSA)"), 
            size = 0.5) +
  geom_point(data = phi_hh_income, aes(x = year, y = household_income, color = "Average HH Income (Source: PSA)"), 
             size = 2) +
  scale_x_discrete(drop = FALSE) +  # Ensure all levels (2018-2024) are shown, even if missing data
  labs(
    title = "Trend of Average Monthly HH Income (Q83) for the Philippines",
    x = "Year",
    y = "Average Monthly Income per Household (PHP)",
    color = ""  # Blank legend title
  ) +
  theme_minimal() +
  theme(legend.position = "bottom") +
  scale_color_manual(values = c("Average HH Income for FF" = "black", 
                                "Average HH Income (Source: PSA)" = "red"))
# Read PPP-adjusted international dollars dataset
ppp <- read_csv(here("data", "raw", "PPP_world_bank", "PPP_world_bank.csv"),
                skip = 4)


fastfield_83 <- fastfield %>% 
  select(country, year, '83_hh_average_income')
kobo_1_fp_83 <- kobo_1_fp %>% 
  select(country, year, '83_hh_average_income')
kobo_2_fp_hon_83 <- kobo_2_fp_hon %>% 
  select(country, year, '83_hh_average_income')
kobo_2_fp_phi_83 <- kobo_2_fp_phi %>% 
  select(country, year, '83_hh_average_income')


# Convert to PPP-adjusted income

country_mapping <- c(
  "Honduras" = "Honduras",
  "Brazil" = "Brazil",
  "Micronesia, Fed. Sts." = "Federated States of Micronesia",
  "Indonesia" = "Indonesia",
  "Mozambique" = "Mozambique",
  "Philippines" = "Philippines"
)


ppp <- ppp %>%
  mutate(`2024` = `2023`) # using 2023 values for 2024 since the dataset has no values for 2024


ppp_long <- ppp %>%
  rename(country = `Country Name`) %>%
  mutate(country = recode(country, !!!country_mapping)) %>%
  pivot_longer(cols = starts_with("19") | starts_with("20"),  
               names_to = "year",
               values_to = "ppp_value") %>%
  mutate(year = as.numeric(year))  

ppp_long <- ppp_long %>% 
  select(country, year, ppp_value)


convert_to_ppp <- function(data) {
  data %>%
    left_join(ppp_long, by = c("country", "year")) %>%
    mutate(`83_hh_average_income_ppp` = `83_hh_average_income` / ppp_value) %>%
    select(-ppp_value)  # Remove PPP column after calculation if desired
}

# Apply the function to each dataset to get PPP-adjusted income
fastfield_83 <- convert_to_ppp(fastfield_83)
kobo_1_fp_83 <- convert_to_ppp(kobo_1_fp_83)
kobo_2_fp_hon_83 <- convert_to_ppp(kobo_2_fp_hon_83)
kobo_2_fp_phi_83 <- convert_to_ppp(kobo_2_fp_phi_83)


# sum(is.na(fastfield_83$'83_hh_average_income_ppp'))
# sum(is.na(kobo_1_fp_83$'83_hh_average_income_ppp'))
# sum(is.na(kobo_2_fp_hon_83$'83_hh_average_income_ppp'))
# sum(is.na(kobo_2_fp_phi_83$'83_hh_average_income_ppp'))

# Combine data for question 83
q83_data <- bind_rows(
  fastfield_83 %>% select(year, country, `83_hh_average_income_ppp`),
  kobo_1_fp_83 %>% select(year, country, `83_hh_average_income_ppp`),
  kobo_2_fp_hon_83 %>% select(year, country, `83_hh_average_income_ppp`),
  kobo_2_fp_phi_83 %>% select(year, country, `83_hh_average_income_ppp`))

q83_data <- q83_data %>% 
  filter(country != "Palau")


# Remove NAs
q83_data <- q83_data %>%
  mutate(`83_hh_average_income_ppp` = as.numeric(`83_hh_average_income_ppp`))
q83_data <- q83_data %>%
  filter(!is.na(`83_hh_average_income_ppp`))

# Round PPP-adjusted income
q83_data <- q83_data %>%
  mutate(`83_hh_average_income_ppp` = round(`83_hh_average_income_ppp`, 2))



# Remove values above the 99th percentile (errors)
q83_data_filtered <- q83_data %>%
  filter(`83_hh_average_income_ppp` < quantile(`83_hh_average_income_ppp`, 0.99, na.rm = TRUE))


# Recalculate the trend with filtered data
q83_trend_filtered <- q83_data_filtered %>%
  group_by(country, year) %>%
  summarize(avg_income = mean(`83_hh_average_income_ppp`, na.rm = TRUE))

# Plot with filtered data
plot <- ggplot(q83_data_filtered, aes(x = as.factor(year), y = `83_hh_average_income_ppp`, fill = country)) +
  geom_violin(alpha = 0.5) +
  geom_line(data = q83_trend_filtered, aes(x = as.factor(year), y = avg_income, group = 1), color = "black", size = 0.5) +
  geom_point(data = q83_trend_filtered, aes(x = as.factor(year), y = avg_income), color = "black", size = 2) +
  labs(
    title = "Trend of Average Monthly HH Income (Q83) by Country",
    x = "Year",
    y = "Average Monthly Income per Household (PPP)"
  ) +
    scale_fill_brewer(palette = "Set3") +
  theme_minimal() +
  theme(legend.position = "none") +
  facet_wrap(~ country)

Evaluating FF Program Impact Using Difference-in-Differences and Fixed Effects Analysis

We conducted a Difference-in-Differences (DiD) and Fixed Effects (FE) analysis for Honduras to explore their usefulness in evaluating the available data and assessing the impact of the FF program on household incomes compared to national rural household incomes.

hon

Below are the key findings form the analysiss:

DiD Model

did_hon
#> 
#> Call:
#> lm(formula = income ~ group * treatment, data = did_data)
#> 
#> Residuals:
#>          1          2          3          4          5          6 
#>  3.909e-13 -1.594e+03  1.594e+03 -3.871e-15  8.803e+02 -8.803e+02 
#> 
#> Coefficients:
#>                 Estimate Std. Error t value Pr(>|t|)  
#> (Intercept)         7872       1820   4.324   0.0495 *
#> group              -1700       2574  -0.660   0.5770  
#> treatment           6806       2230   3.053   0.0926 .
#> group:treatment    -7443       3153  -2.360   0.1422  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 1820 on 2 degrees of freedom
#> Multiple R-squared:  0.9365, Adjusted R-squared:  0.8412 
#> F-statistic: 9.828 on 3 and 2 DF,  p-value: 0.09376

Objective: Measure the relative impact of the FF program on household incomes compared to rural households over time (pre- and post-treatment year: 2019).

Key Findings:

  • Baseline Differences: FF households had 1700 HNL lower income than rural households before the program (2019).

  • Program Impact (Interaction): FF households experienced a relative income decrease of 7443 HNL compared to rural households after the program, but this effect is not statistically significant (p = 0.14).

Significance Issue: The small sample size limits the model’s ability to detect meaningful effects or establish statistical significance.

FE Model

fe_hon
#> OLS estimation, Dep. Var.: income
#> Observations: 6
#> Standard-errors: IID 
#>                 Estimate  Std. Error  t value Pr(>|t|)    
#> (Intercept) -1726370.938 1485622.329 -1.16205 0.329261    
#> group          -6661.298    2400.801 -2.77461 0.069301 .  
#> year             860.356     735.092  1.17041 0.326354    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> RMSE: 2,079.2   Adj. R2: 0.585693

Objective: Capture consistent differences between FF and rural households (group effect) and time trends (year effect).

Key Findings: Group Effect: FF households had 6661 HNL lower income than rural households on average, independent of time, with marginal significance (p = 0.069). Time Trend: Income increased by 860 HNL per year across both groups, but this trend is not statistically significant (p = 0.326).

Significance Issue: Both group and time effects lack strong statistical significance due to the small sample size and use of aggregated averages.

Key Limitation: Both analysis are severely constrained by the very small sample size (reliance on averages instead of individual-level data) which limits the ability to draw valid and robust conclusions.

Proposed Next Steps for Enhancing the Usefulness of Income Data for FF Impact Evaluation

  1. Expand the Dataset:
  • Include additional years: Add post-2023 data to capture longer-term trends and allow for better comparisons across time.

  • Increase observations: Use individual-level data rather than averages. Using raw data from national household income surveys would:

    • Increase statistical power.

    • Provide more accurate standard errors.

    • Reflect within-group income variability.

  1. Improve Data Specificity:
  • Focus on obtaining more specific data to improve the comparability of FF and non-FF households and enhance the robustness of the analysis: Targeting fishing communities outside the FF program to serve as more precise control groups.

  • Filtering data by income source (e.g., fishing vs. non-fishing), household location (rural coastal vs. inland rural), and other demographics such as household size and education levels.

  1. Explore Alternative Models:
  • Use random effects or hierarchical models to account for variations within and across groups (e.g., years, provinces).

  • Explore propensity score matching or synthetic control methods to construct better counterfactuals for FF households.

  1. Explore Alternative Data Sources