Income and expenditure analysis

R Markdown

## Loading the required packages
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.5     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/

library(readxl)
library(dplyr)
library(ggplot2)
library(GGally)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

library(ROCR)
library(readr)
library(mlbench)
library(gamlss)

## Loading required package: splines
## Loading required package: gamlss.data
## 
## Attaching package: 'gamlss.data'
## 
## The following object is masked from 'package:datasets':
## 
##     sleep
## 
## Loading required package: gamlss.dist
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## 
## The following object is masked from 'package:dplyr':
## 
##     collapse
## 
## Loading required package: parallel
##  **********   GAMLSS Version 5.4-20  ********** 
## For more on GAMLSS look at https://www.gamlss.com/
## Type gamlssNews() to see new features/changes/bug fixes.

library(magrittr)

## 
## Attaching package: 'magrittr'
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## The following object is masked from 'package:tidyr':
## 
##     extract

library(aod)
library(moments)
library(fpp)

## Loading required package: forecast
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo 
## 
## Attaching package: 'forecast'
## 
## The following object is masked from 'package:gamlss':
## 
##     CV
## 
## The following object is masked from 'package:nlme':
## 
##     getResponse
## 
## The following object is masked from 'package:yardstick':
## 
##     accuracy
## 
## Loading required package: fma
## 
## Attaching package: 'fma'
## 
## The following object is masked from 'package:GGally':
## 
##     pigs
## 
## Loading required package: expsmooth
## Loading required package: lmtest
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Loading required package: tseries
## 
## Attaching package: 'fpp'
## 
## The following object is masked from 'package:gamlss.data':
## 
##     oil

library(caret)

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:gamlss':
## 
##     calibration
## 
## The following objects are masked from 'package:yardstick':
## 
##     precision, recall, sensitivity, specificity
## 
## The following object is masked from 'package:purrr':
## 
##     lift

library(Metrics)

## 
## Attaching package: 'Metrics'
## 
## The following objects are masked from 'package:caret':
## 
##     precision, recall
## 
## The following object is masked from 'package:forecast':
## 
##     accuracy
## 
## The following objects are masked from 'package:yardstick':
## 
##     accuracy, mae, mape, mase, precision, recall, rmse, smape

library(aod)
##Importing the dataset
options(scipen = 999)
rm(list = ls())
gc()

##           used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 2827975 151.1    4272271 228.2  4272271 228.2
## Vcells 4814786  36.8   10146329  77.5  7903531  60.3

library(readxl)
income_data <- read_excel("income_data.xlsx") %>% 
  tibble::as_tibble() %>% 
  janitor::clean_names()
dplyr::glimpse(income_data)

## Rows: 1,131
## Columns: 11
## $ food_exp           <dbl> 41310, 55449, 77842, 106158, 63724, 47363, 216442, …
## $ clothing_exp       <dbl> 1915, 3716, 6500, 16083, 1285, 2435, 14613, 2283, 1…
## $ house_water_exp    <dbl> 8262, 31968, 57462, 78630, 6000, 12012, 159000, 841…
## $ transpo_exp        <dbl> 360, 3060, 10086, 13656, 2460, 1560, 35346, 9432, 3…
## $ educ_exp           <dbl> 955, 0, 70820, 1510, 120, 485, 86240, 0, 950, 2000,…
## $ goods_services_exp <dbl> 4530, 3888, 22194, 14130, 2712, 1404, 10764, 25866,…
## $ inc_entrep         <dbl> 0, 27122, 7500, 233700, 6140, 5284, 0, 0, 23472, 10…
## $ head_sex           <chr> "Male", "Female", "Male", "Male", "Male", "Male", "…
## $ head_age           <dbl> 35, 64, 49, 42, 39, 70, 61, 79, 49, 68, 55, 36, 66,…
## $ empl_members       <dbl> 1, 2, 0, 0, 2, 3, 3, 2, 1, 1, 3, 2, 1, 2, 2, 2, 0, …
## $ dispo_house_inc    <dbl> 64985, 124027, 321405, 329661, 78035, 125302, 70453…

dim(income_data)

## [1] 1131   11

##Summary statistics of income data

#summary statistics for Household income data
library(fBasics)

## 
## Attaching package: 'fBasics'

## The following objects are masked from 'package:moments':
## 
##     kurtosis, skewness

statistics <-data.frame(
  'Disposable income' = c(mean(income_data$dispo_house_inc), 
    median(income_data$dispo_house_inc),sd(income_data$dispo_house_inc),
skewness(income_data$dispo_house_inc),kurtosis(income_data$dispo_house_inc),
    min(income_data$dispo_house_inc),max(income_data$dispo_house_inc)),
  'Food' =c(mean(income_data$food_exp), 
  median(income_data$food_exp), sd(income_data$food_exp),
 skewness(income_data$food_exp),kurtosis(income_data$food_exp),
 min(income_data$food_exp),
  max(income_data$food_exp)),
  'Transportation' = c(mean(income_data$transpo_exp),
median(income_data$transpo_exp),sd(income_data$transpo_exp),
skewness(income_data$transpo_exp),kurtosis(income_data$transpo_exp),
min(income_data$transpo_exp),max(income_data$transpo_exp)),
  'Housing and Water' = c(mean(income_data$house_water_exp),
median(income_data$house_water_exp),sd(income_data$house_water_exp),
 skewness(income_data$house_water_exp),kurtosis(income_data$house_water_exp),
min(income_data$house_water_exp),max(income_data$house_water_exp)),
  'Clothing and Footwear' = c(mean(income_data$clothing_exp),
median(income_data$clothing_exp),sd(income_data$clothing_exp),
skewness(income_data$clothing_exp),kurtosis(income_data$clothing_exp),
min(income_data$clothing_exp),max(income_data$clothing_exp)),
  'Education' = c(mean(income_data$educ_exp),median(income_data$educ_exp),
sd(income_data$educ_exp), skewness(income_data$educ_exp),
kurtosis(income_data$educ_exp),min(income_data$educ_exp),
max(income_data$educ_exp)),
  'Other expenditures' = c(mean(income_data$goods_services_exp),
median(income_data$goods_services_exp),sd(income_data$goods_services_exp),
 skewness(income_data$goods_services_exp),
kurtosis(income_data$goods_services_exp),min(income_data$goods_services_exp),
max(income_data$goods_services_exp)),
  'entrepreneur income' = c(mean(income_data$inc_entrep),
median(income_data$inc_entrep),sd(income_data$inc_entrep),
skewness(income_data$inc_entrep),kurtosis(income_data$inc_entrep), 
min(income_data$inc_entrep),max(income_data$inc_entrep)),
  'Number employed' = c(mean(income_data$empl_members),
median(income_data$empl_members),sd(income_data$empl_members),
skewness(income_data$empl_members),kurtosis(income_data$empl_members),
min(income_data$empl_members),max(income_data$empl_members)),
  'Head age' = c(mean(income_data$head_age),median(income_data$head_age),
sd(income_data$head_age),
skewness(income_data$head_age),kurtosis(income_data$head_age),
min(income_data$head_age),max(income_data$head_age)))
row.names(statistics) <- c("mean", "median","standard deviation",
"skewness","kurtosis","minimum","maximum")
#table of statistics.
print(statistics)

##                    Disposable.income          Food Transportation
## mean                    238584.77365  82569.718833    11746.56764
## median                  161722.00000  70482.000000     5748.00000
## standard deviation      257355.71932  47751.863323    22506.01750
## skewness                     4.45757      1.610935        9.55107
## kurtosis                    30.50598      3.755881      150.93559
## minimum                  17840.00000   2947.000000        0.00000
## maximum                2887946.00000 329490.000000   458100.00000
##                    Housing.and.Water Clothing.and.Footwear     Education
## mean                    38822.769231           4692.481874   7218.274978
## median                  23016.000000           2760.000000    900.000000
## standard deviation      58038.408265           6484.261324  17873.136260
## skewness                    8.398693              4.988403      4.407618
## kurtosis                  107.758311             41.410816     24.474383
## minimum                  2550.000000              0.000000      0.000000
## maximum               1004160.000000          89150.000000 179500.000000
##                    Other.expenditures entrepreneur.income Number.employed
## mean                     11536.286472        50696.327144        1.275862
## median                    6378.000000        15040.000000        1.000000
## standard deviation       15208.836936       122421.921467        1.152117
## skewness                     4.417933            8.624963        1.020415
## kurtosis                    30.269646          104.231038        1.259017
## minimum                     18.000000            0.000000        0.000000
## maximum                 166806.000000      2025250.000000        7.000000
##                      Head.age
## mean               51.5137047
## median             50.0000000
## standard deviation 14.3594974
## skewness            0.2291373
## kurtosis           -0.4492665
## minimum            16.0000000
## maximum            96.0000000

# Features with missing values (graphically)
library(DataExplorer)
DataExplorer::plot_missing(income_data[1:6],  ggtheme = theme_minimal())

DataExplorer::plot_missing(income_data[7:11],  ggtheme = theme_minimal())

## dataset for modeling
income_data$head_sex <-factor(income_data$head_sex, levels = 
                                c("Male", "Female"))
dplyr::glimpse(income_data)

## Rows: 1,131
## Columns: 11
## $ food_exp           <dbl> 41310, 55449, 77842, 106158, 63724, 47363, 216442, …
## $ clothing_exp       <dbl> 1915, 3716, 6500, 16083, 1285, 2435, 14613, 2283, 1…
## $ house_water_exp    <dbl> 8262, 31968, 57462, 78630, 6000, 12012, 159000, 841…
## $ transpo_exp        <dbl> 360, 3060, 10086, 13656, 2460, 1560, 35346, 9432, 3…
## $ educ_exp           <dbl> 955, 0, 70820, 1510, 120, 485, 86240, 0, 950, 2000,…
## $ goods_services_exp <dbl> 4530, 3888, 22194, 14130, 2712, 1404, 10764, 25866,…
## $ inc_entrep         <dbl> 0, 27122, 7500, 233700, 6140, 5284, 0, 0, 23472, 10…
## $ head_sex           <fct> Male, Female, Male, Male, Male, Male, Male, Female,…
## $ head_age           <dbl> 35, 64, 49, 42, 39, 70, 61, 79, 49, 68, 55, 36, 66,…
## $ empl_members       <dbl> 1, 2, 0, 0, 2, 3, 3, 2, 1, 1, 3, 2, 1, 2, 2, 2, 0, …
## $ dispo_house_inc    <dbl> 64985, 124027, 321405, 329661, 78035, 125302, 70453…

# Split the Data
set.seed(7)
income_data_split <- income_data %>% rsample::initial_split(strata= 
         "dispo_house_inc",
                    prop = 0.8)
income_data_train <- rsample::training(income_data_split)
income_data_test <- rsample::testing(income_data_split)
glimpse(income_data_train)

## Rows: 903
## Columns: 11
## $ food_exp           <dbl> 41310, 63724, 47599, 25571, 65708, 51028, 66251, 54…
## $ clothing_exp       <dbl> 1915, 1285, 1550, 850, 1110, 0, 1640, 1600, 2910, 1…
## $ house_water_exp    <dbl> 8262, 6000, 33282, 12348, 9648, 14580, 18426, 18600…
## $ transpo_exp        <dbl> 360, 2460, 7884, 5874, 2496, 4824, 2040, 4902, 480,…
## $ educ_exp           <dbl> 955, 120, 1830, 0, 720, 0, 965, 500, 0, 0, 890, 155…
## $ goods_services_exp <dbl> 4530, 2712, 9066, 2292, 4146, 2580, 8724, 7374, 202…
## $ inc_entrep         <dbl> 0, 6140, 59490, 1460, 48635, 36198, 1825, 84850, 15…
## $ head_sex           <fct> Male, Male, Female, Female, Male, Female, Female, M…
## $ head_age           <dbl> 35, 39, 53, 58, 44, 66, 68, 51, 66, 75, 68, 41, 60,…
## $ empl_members       <dbl> 1, 2, 0, 1, 1, 0, 2, 0, 0, 1, 1, 0, 2, 1, 1, 2, 0, …
## $ dispo_house_inc    <dbl> 64985, 78035, 95610, 59644, 77379, 88203, 94358, 10…

glimpse(income_data_test)

## Rows: 228
## Columns: 11
## $ food_exp           <dbl> 106158, 47363, 79830, 104279, 138169, 100475, 95706…
## $ clothing_exp       <dbl> 16083, 2435, 3910, 3037, 2210, 5023, 9175, 509, 244…
## $ house_water_exp    <dbl> 78630, 12012, 23868, 33198, 18876, 116964, 14838, 1…
## $ transpo_exp        <dbl> 13656, 1560, 4320, 4728, 18684, 10920, 10800, 2892,…
## $ educ_exp           <dbl> 1510, 485, 26700, 1927, 1310, 39400, 3250, 155, 435…
## $ goods_services_exp <dbl> 14130, 1404, 5160, 12558, 10692, 21270, 9582, 1962,…
## $ inc_entrep         <dbl> 233700, 5284, 0, 43800, 45700, 0, 53870, 36160, 0, …
## $ head_sex           <fct> Male, Male, Male, Male, Male, Male, Male, Male, Fem…
## $ head_age           <dbl> 42, 70, 41, 36, 43, 68, 49, 44, 76, 53, 55, 20, 31,…
## $ empl_members       <dbl> 0, 3, 2, 1, 2, 1, 1, 1, 2, 1, 6, 0, 0, 0, 1, 1, 1, …
## $ dispo_house_inc    <dbl> 329661, 125302, 138760, 212200, 194121, 370571, 180…

# Fitting GLMs to model disposable household income
#GAMMA DISTRIBUTION
# Define the recipe
gamma_rec <- recipes::recipe(dispo_house_inc ~ .,data = income_data_train
) %>%step_dummy(all_nominal(), -all_outcomes()) %>%
  step_center(all_predictors(), -all_nominal()) %>%
step_scale(all_predictors(), -all_nominal()) %>%
step_log(dispo_house_inc, base = 10) %>%
prep(training = income_data_train, retain = TRUE)
# Fit a glm with gamma distribution and log link
fit_gamma <- glm(formula = dispo_house_inc ~ .,data = recipes::
  juice(gamma_rec),
family = Gamma(link = "log")  ## Specify gamma distribution and log link
)
summary(fit_gamma)

## 
## Call:
## glm(formula = dispo_house_inc ~ ., family = Gamma(link = "log"), 
##     data = recipes::juice(gamma_rec))
## 
## Coefficients:
##                     Estimate Std. Error  t value             Pr(>|t|)    
## (Intercept)         1.655077   0.001072 1543.987 < 0.0000000000000002 ***
## food_exp            0.028430   0.001627   17.469 < 0.0000000000000002 ***
## clothing_exp        0.004471   0.001632    2.739              0.00629 ** 
## house_water_exp     0.013585   0.001429    9.508 < 0.0000000000000002 ***
## transpo_exp        -0.001094   0.001596   -0.685              0.49341    
## educ_exp            0.005161   0.001227    4.208           0.00002842 ***
## goods_services_exp  0.008411   0.001613    5.214           0.00000023 ***
## inc_entrep          0.010503   0.001162    9.043 < 0.0000000000000002 ***
## head_age            0.001050   0.001132    0.928              0.35374    
## empl_members        0.004742   0.001205    3.935           0.00008970 ***
## head_sex_Female     0.001361   0.001149    1.185              0.23638    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 0.001037616)
## 
##     Null deviance: 3.43327  on 902  degrees of freedom
## Residual deviance: 0.94107  on 892  degrees of freedom
## AIC: -625.52
## 
## Number of Fisher Scoring iterations: 4

BIC(fit_gamma)

## [1] -567.8544

test_gamma <- recipes::bake(gamma_rec,new_data = income_data_test,
                            all_predictors()
)
# get predictions
pred <- predict(fit_gamma, test_gamma, se.fit = FALSE, scale = NULL, df = Inf,
interval = "prediction",level = 0.95, type = "response")
results <-income_data_test %>% select(dispo_house_inc) %>% 
  mutate(dispo_house_inc = log10(dispo_house_inc)) %>% 
bind_cols(pred)

## New names:
## • `` -> `...2`

rmarkdown::paged_table(results)

mse <- mean((fit_gamma$residuals)^2)
cat("Mean Squared Error:", mse)

## Mean Squared Error: 0.001024977

##Calculate accuracy (as a percentage)
accuracy <- 100 - mse
cat("Accuracy (MSE):", accuracy, "%\n")

## Accuracy (MSE): 99.99898 %

##Calculate the mean absolute error
mae <- mean(abs(fit_gamma$residuals))
cat("Mean Absolute Error:", mae)

## Mean Absolute Error: 0.02401644

##Calculate accuracy (as a percentage)
accuracy <- 100 - mae
cat("Accuracy (MAE):", accuracy, "%\n")

## Accuracy (MAE): 99.97598 %

##Plotting the results
plot(results$dispo_house_inc,type = "l",lty= 1.8,
     main="Actual income and Predicted income",col = "grey")
lines(results$...2, type = "l", col = "red") 
legend("topleft",legend=c("Actual income", "Predicted income"),
col=c("grey", "red"), lty=1:2, cex=0.80)

# Create a data frame for plotting
plot_data <- data.frame(Actual = 
  results$dispo_house_inc,  Predicted = results$...2)
##Create the point plot
point_plot <- ggplot(plot_data, aes(x = Actual, y = Predicted)) +
geom_point() +geom_abline(intercept = 0, slope = 1, color = "red", 
                    linetype = "dashed") +
labs(x = "Actual income", y = "Predicted income", 
title = "Actual vs. Predicted income") + theme_minimal()
## Print the plot
print(point_plot)

# Load the necessary libraries
library(stats)
# Calculate the chi-square statistic
chi_square_statistic <- chisq.test(results$dispo_house_inc, results$...2)

## Warning in chisq.test(results$dispo_house_inc, results$...2): Chi-squared
## approximation may be incorrect

# print the results
print(chi_square_statistic)

## 
##  Pearson's Chi-squared test
## 
## data:  results$dispo_house_inc and results$...2
## X-squared = 51756, df = 51529, p-value = 0.2394

# Calculate the degrees of freedom
df <- length(results$dispo_house_inc) - 1
print(df)

## [1] 227

# Calculate the p-value
p_value <- pchisq(51756,df=227,lower.tail = FALSE)
# print the results
print(p_value)

## [1] 0

## Wald test
wald.test(Sigma = vcov(fit_gamma), b = coef(fit_gamma), Terms = seq(1,11))

## Wald test:
## ----------
## 
## Chi-squared test:
## X2 = 2386300.7, df = 11, P(> X2) = 0.0

Income and expenditure analysis

JUDETADEUS MASIKA

2023-11-12

R Markdown