## Loading the required packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.5 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
library(readxl)
library(dplyr)
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ROCR)
library(readr)
library(mlbench)
library(gamlss)
## Loading required package: splines
## Loading required package: gamlss.data
##
## Attaching package: 'gamlss.data'
##
## The following object is masked from 'package:datasets':
##
## sleep
##
## Loading required package: gamlss.dist
## Loading required package: nlme
##
## Attaching package: 'nlme'
##
## The following object is masked from 'package:dplyr':
##
## collapse
##
## Loading required package: parallel
## ********** GAMLSS Version 5.4-20 **********
## For more on GAMLSS look at https://www.gamlss.com/
## Type gamlssNews() to see new features/changes/bug fixes.
library(magrittr)
##
## Attaching package: 'magrittr'
##
## The following object is masked from 'package:purrr':
##
## set_names
##
## The following object is masked from 'package:tidyr':
##
## extract
library(aod)
library(moments)
library(fpp)
## Loading required package: forecast
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
##
## Attaching package: 'forecast'
##
## The following object is masked from 'package:gamlss':
##
## CV
##
## The following object is masked from 'package:nlme':
##
## getResponse
##
## The following object is masked from 'package:yardstick':
##
## accuracy
##
## Loading required package: fma
##
## Attaching package: 'fma'
##
## The following object is masked from 'package:GGally':
##
## pigs
##
## Loading required package: expsmooth
## Loading required package: lmtest
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Loading required package: tseries
##
## Attaching package: 'fpp'
##
## The following object is masked from 'package:gamlss.data':
##
## oil
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:gamlss':
##
## calibration
##
## The following objects are masked from 'package:yardstick':
##
## precision, recall, sensitivity, specificity
##
## The following object is masked from 'package:purrr':
##
## lift
library(Metrics)
##
## Attaching package: 'Metrics'
##
## The following objects are masked from 'package:caret':
##
## precision, recall
##
## The following object is masked from 'package:forecast':
##
## accuracy
##
## The following objects are masked from 'package:yardstick':
##
## accuracy, mae, mape, mase, precision, recall, rmse, smape
library(aod)
##Importing the dataset
options(scipen = 999)
rm(list = ls())
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2827975 151.1 4272271 228.2 4272271 228.2
## Vcells 4814786 36.8 10146329 77.5 7903531 60.3
library(readxl)
income_data <- read_excel("income_data.xlsx") %>%
tibble::as_tibble() %>%
janitor::clean_names()
dplyr::glimpse(income_data)
## Rows: 1,131
## Columns: 11
## $ food_exp <dbl> 41310, 55449, 77842, 106158, 63724, 47363, 216442, …
## $ clothing_exp <dbl> 1915, 3716, 6500, 16083, 1285, 2435, 14613, 2283, 1…
## $ house_water_exp <dbl> 8262, 31968, 57462, 78630, 6000, 12012, 159000, 841…
## $ transpo_exp <dbl> 360, 3060, 10086, 13656, 2460, 1560, 35346, 9432, 3…
## $ educ_exp <dbl> 955, 0, 70820, 1510, 120, 485, 86240, 0, 950, 2000,…
## $ goods_services_exp <dbl> 4530, 3888, 22194, 14130, 2712, 1404, 10764, 25866,…
## $ inc_entrep <dbl> 0, 27122, 7500, 233700, 6140, 5284, 0, 0, 23472, 10…
## $ head_sex <chr> "Male", "Female", "Male", "Male", "Male", "Male", "…
## $ head_age <dbl> 35, 64, 49, 42, 39, 70, 61, 79, 49, 68, 55, 36, 66,…
## $ empl_members <dbl> 1, 2, 0, 0, 2, 3, 3, 2, 1, 1, 3, 2, 1, 2, 2, 2, 0, …
## $ dispo_house_inc <dbl> 64985, 124027, 321405, 329661, 78035, 125302, 70453…
dim(income_data)
## [1] 1131 11
##Summary statistics of income data
#summary statistics for Household income data
library(fBasics)
##
## Attaching package: 'fBasics'
## The following objects are masked from 'package:moments':
##
## kurtosis, skewness
statistics <-data.frame(
'Disposable income' = c(mean(income_data$dispo_house_inc),
median(income_data$dispo_house_inc),sd(income_data$dispo_house_inc),
skewness(income_data$dispo_house_inc),kurtosis(income_data$dispo_house_inc),
min(income_data$dispo_house_inc),max(income_data$dispo_house_inc)),
'Food' =c(mean(income_data$food_exp),
median(income_data$food_exp), sd(income_data$food_exp),
skewness(income_data$food_exp),kurtosis(income_data$food_exp),
min(income_data$food_exp),
max(income_data$food_exp)),
'Transportation' = c(mean(income_data$transpo_exp),
median(income_data$transpo_exp),sd(income_data$transpo_exp),
skewness(income_data$transpo_exp),kurtosis(income_data$transpo_exp),
min(income_data$transpo_exp),max(income_data$transpo_exp)),
'Housing and Water' = c(mean(income_data$house_water_exp),
median(income_data$house_water_exp),sd(income_data$house_water_exp),
skewness(income_data$house_water_exp),kurtosis(income_data$house_water_exp),
min(income_data$house_water_exp),max(income_data$house_water_exp)),
'Clothing and Footwear' = c(mean(income_data$clothing_exp),
median(income_data$clothing_exp),sd(income_data$clothing_exp),
skewness(income_data$clothing_exp),kurtosis(income_data$clothing_exp),
min(income_data$clothing_exp),max(income_data$clothing_exp)),
'Education' = c(mean(income_data$educ_exp),median(income_data$educ_exp),
sd(income_data$educ_exp), skewness(income_data$educ_exp),
kurtosis(income_data$educ_exp),min(income_data$educ_exp),
max(income_data$educ_exp)),
'Other expenditures' = c(mean(income_data$goods_services_exp),
median(income_data$goods_services_exp),sd(income_data$goods_services_exp),
skewness(income_data$goods_services_exp),
kurtosis(income_data$goods_services_exp),min(income_data$goods_services_exp),
max(income_data$goods_services_exp)),
'entrepreneur income' = c(mean(income_data$inc_entrep),
median(income_data$inc_entrep),sd(income_data$inc_entrep),
skewness(income_data$inc_entrep),kurtosis(income_data$inc_entrep),
min(income_data$inc_entrep),max(income_data$inc_entrep)),
'Number employed' = c(mean(income_data$empl_members),
median(income_data$empl_members),sd(income_data$empl_members),
skewness(income_data$empl_members),kurtosis(income_data$empl_members),
min(income_data$empl_members),max(income_data$empl_members)),
'Head age' = c(mean(income_data$head_age),median(income_data$head_age),
sd(income_data$head_age),
skewness(income_data$head_age),kurtosis(income_data$head_age),
min(income_data$head_age),max(income_data$head_age)))
row.names(statistics) <- c("mean", "median","standard deviation",
"skewness","kurtosis","minimum","maximum")
#table of statistics.
print(statistics)
## Disposable.income Food Transportation
## mean 238584.77365 82569.718833 11746.56764
## median 161722.00000 70482.000000 5748.00000
## standard deviation 257355.71932 47751.863323 22506.01750
## skewness 4.45757 1.610935 9.55107
## kurtosis 30.50598 3.755881 150.93559
## minimum 17840.00000 2947.000000 0.00000
## maximum 2887946.00000 329490.000000 458100.00000
## Housing.and.Water Clothing.and.Footwear Education
## mean 38822.769231 4692.481874 7218.274978
## median 23016.000000 2760.000000 900.000000
## standard deviation 58038.408265 6484.261324 17873.136260
## skewness 8.398693 4.988403 4.407618
## kurtosis 107.758311 41.410816 24.474383
## minimum 2550.000000 0.000000 0.000000
## maximum 1004160.000000 89150.000000 179500.000000
## Other.expenditures entrepreneur.income Number.employed
## mean 11536.286472 50696.327144 1.275862
## median 6378.000000 15040.000000 1.000000
## standard deviation 15208.836936 122421.921467 1.152117
## skewness 4.417933 8.624963 1.020415
## kurtosis 30.269646 104.231038 1.259017
## minimum 18.000000 0.000000 0.000000
## maximum 166806.000000 2025250.000000 7.000000
## Head.age
## mean 51.5137047
## median 50.0000000
## standard deviation 14.3594974
## skewness 0.2291373
## kurtosis -0.4492665
## minimum 16.0000000
## maximum 96.0000000
# Features with missing values (graphically)
library(DataExplorer)
DataExplorer::plot_missing(income_data[1:6], ggtheme = theme_minimal())
DataExplorer::plot_missing(income_data[7:11], ggtheme = theme_minimal())
## dataset for modeling
income_data$head_sex <-factor(income_data$head_sex, levels =
c("Male", "Female"))
dplyr::glimpse(income_data)
## Rows: 1,131
## Columns: 11
## $ food_exp <dbl> 41310, 55449, 77842, 106158, 63724, 47363, 216442, …
## $ clothing_exp <dbl> 1915, 3716, 6500, 16083, 1285, 2435, 14613, 2283, 1…
## $ house_water_exp <dbl> 8262, 31968, 57462, 78630, 6000, 12012, 159000, 841…
## $ transpo_exp <dbl> 360, 3060, 10086, 13656, 2460, 1560, 35346, 9432, 3…
## $ educ_exp <dbl> 955, 0, 70820, 1510, 120, 485, 86240, 0, 950, 2000,…
## $ goods_services_exp <dbl> 4530, 3888, 22194, 14130, 2712, 1404, 10764, 25866,…
## $ inc_entrep <dbl> 0, 27122, 7500, 233700, 6140, 5284, 0, 0, 23472, 10…
## $ head_sex <fct> Male, Female, Male, Male, Male, Male, Male, Female,…
## $ head_age <dbl> 35, 64, 49, 42, 39, 70, 61, 79, 49, 68, 55, 36, 66,…
## $ empl_members <dbl> 1, 2, 0, 0, 2, 3, 3, 2, 1, 1, 3, 2, 1, 2, 2, 2, 0, …
## $ dispo_house_inc <dbl> 64985, 124027, 321405, 329661, 78035, 125302, 70453…
# Split the Data
set.seed(7)
income_data_split <- income_data %>% rsample::initial_split(strata=
"dispo_house_inc",
prop = 0.8)
income_data_train <- rsample::training(income_data_split)
income_data_test <- rsample::testing(income_data_split)
glimpse(income_data_train)
## Rows: 903
## Columns: 11
## $ food_exp <dbl> 41310, 63724, 47599, 25571, 65708, 51028, 66251, 54…
## $ clothing_exp <dbl> 1915, 1285, 1550, 850, 1110, 0, 1640, 1600, 2910, 1…
## $ house_water_exp <dbl> 8262, 6000, 33282, 12348, 9648, 14580, 18426, 18600…
## $ transpo_exp <dbl> 360, 2460, 7884, 5874, 2496, 4824, 2040, 4902, 480,…
## $ educ_exp <dbl> 955, 120, 1830, 0, 720, 0, 965, 500, 0, 0, 890, 155…
## $ goods_services_exp <dbl> 4530, 2712, 9066, 2292, 4146, 2580, 8724, 7374, 202…
## $ inc_entrep <dbl> 0, 6140, 59490, 1460, 48635, 36198, 1825, 84850, 15…
## $ head_sex <fct> Male, Male, Female, Female, Male, Female, Female, M…
## $ head_age <dbl> 35, 39, 53, 58, 44, 66, 68, 51, 66, 75, 68, 41, 60,…
## $ empl_members <dbl> 1, 2, 0, 1, 1, 0, 2, 0, 0, 1, 1, 0, 2, 1, 1, 2, 0, …
## $ dispo_house_inc <dbl> 64985, 78035, 95610, 59644, 77379, 88203, 94358, 10…
glimpse(income_data_test)
## Rows: 228
## Columns: 11
## $ food_exp <dbl> 106158, 47363, 79830, 104279, 138169, 100475, 95706…
## $ clothing_exp <dbl> 16083, 2435, 3910, 3037, 2210, 5023, 9175, 509, 244…
## $ house_water_exp <dbl> 78630, 12012, 23868, 33198, 18876, 116964, 14838, 1…
## $ transpo_exp <dbl> 13656, 1560, 4320, 4728, 18684, 10920, 10800, 2892,…
## $ educ_exp <dbl> 1510, 485, 26700, 1927, 1310, 39400, 3250, 155, 435…
## $ goods_services_exp <dbl> 14130, 1404, 5160, 12558, 10692, 21270, 9582, 1962,…
## $ inc_entrep <dbl> 233700, 5284, 0, 43800, 45700, 0, 53870, 36160, 0, …
## $ head_sex <fct> Male, Male, Male, Male, Male, Male, Male, Male, Fem…
## $ head_age <dbl> 42, 70, 41, 36, 43, 68, 49, 44, 76, 53, 55, 20, 31,…
## $ empl_members <dbl> 0, 3, 2, 1, 2, 1, 1, 1, 2, 1, 6, 0, 0, 0, 1, 1, 1, …
## $ dispo_house_inc <dbl> 329661, 125302, 138760, 212200, 194121, 370571, 180…
# Fitting GLMs to model disposable household income
#GAMMA DISTRIBUTION
# Define the recipe
gamma_rec <- recipes::recipe(dispo_house_inc ~ .,data = income_data_train
) %>%step_dummy(all_nominal(), -all_outcomes()) %>%
step_center(all_predictors(), -all_nominal()) %>%
step_scale(all_predictors(), -all_nominal()) %>%
step_log(dispo_house_inc, base = 10) %>%
prep(training = income_data_train, retain = TRUE)
# Fit a glm with gamma distribution and log link
fit_gamma <- glm(formula = dispo_house_inc ~ .,data = recipes::
juice(gamma_rec),
family = Gamma(link = "log") ## Specify gamma distribution and log link
)
summary(fit_gamma)
##
## Call:
## glm(formula = dispo_house_inc ~ ., family = Gamma(link = "log"),
## data = recipes::juice(gamma_rec))
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.655077 0.001072 1543.987 < 0.0000000000000002 ***
## food_exp 0.028430 0.001627 17.469 < 0.0000000000000002 ***
## clothing_exp 0.004471 0.001632 2.739 0.00629 **
## house_water_exp 0.013585 0.001429 9.508 < 0.0000000000000002 ***
## transpo_exp -0.001094 0.001596 -0.685 0.49341
## educ_exp 0.005161 0.001227 4.208 0.00002842 ***
## goods_services_exp 0.008411 0.001613 5.214 0.00000023 ***
## inc_entrep 0.010503 0.001162 9.043 < 0.0000000000000002 ***
## head_age 0.001050 0.001132 0.928 0.35374
## empl_members 0.004742 0.001205 3.935 0.00008970 ***
## head_sex_Female 0.001361 0.001149 1.185 0.23638
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Gamma family taken to be 0.001037616)
##
## Null deviance: 3.43327 on 902 degrees of freedom
## Residual deviance: 0.94107 on 892 degrees of freedom
## AIC: -625.52
##
## Number of Fisher Scoring iterations: 4
BIC(fit_gamma)
## [1] -567.8544
test_gamma <- recipes::bake(gamma_rec,new_data = income_data_test,
all_predictors()
)
# get predictions
pred <- predict(fit_gamma, test_gamma, se.fit = FALSE, scale = NULL, df = Inf,
interval = "prediction",level = 0.95, type = "response")
results <-income_data_test %>% select(dispo_house_inc) %>%
mutate(dispo_house_inc = log10(dispo_house_inc)) %>%
bind_cols(pred)
## New names:
## • `` -> `...2`
rmarkdown::paged_table(results)
mse <- mean((fit_gamma$residuals)^2)
cat("Mean Squared Error:", mse)
## Mean Squared Error: 0.001024977
##Calculate accuracy (as a percentage)
accuracy <- 100 - mse
cat("Accuracy (MSE):", accuracy, "%\n")
## Accuracy (MSE): 99.99898 %
##Calculate the mean absolute error
mae <- mean(abs(fit_gamma$residuals))
cat("Mean Absolute Error:", mae)
## Mean Absolute Error: 0.02401644
##Calculate accuracy (as a percentage)
accuracy <- 100 - mae
cat("Accuracy (MAE):", accuracy, "%\n")
## Accuracy (MAE): 99.97598 %
##Plotting the results
plot(results$dispo_house_inc,type = "l",lty= 1.8,
main="Actual income and Predicted income",col = "grey")
lines(results$...2, type = "l", col = "red")
legend("topleft",legend=c("Actual income", "Predicted income"),
col=c("grey", "red"), lty=1:2, cex=0.80)
# Create a data frame for plotting
plot_data <- data.frame(Actual =
results$dispo_house_inc, Predicted = results$...2)
##Create the point plot
point_plot <- ggplot(plot_data, aes(x = Actual, y = Predicted)) +
geom_point() +geom_abline(intercept = 0, slope = 1, color = "red",
linetype = "dashed") +
labs(x = "Actual income", y = "Predicted income",
title = "Actual vs. Predicted income") + theme_minimal()
## Print the plot
print(point_plot)
# Load the necessary libraries
library(stats)
# Calculate the chi-square statistic
chi_square_statistic <- chisq.test(results$dispo_house_inc, results$...2)
## Warning in chisq.test(results$dispo_house_inc, results$...2): Chi-squared
## approximation may be incorrect
# print the results
print(chi_square_statistic)
##
## Pearson's Chi-squared test
##
## data: results$dispo_house_inc and results$...2
## X-squared = 51756, df = 51529, p-value = 0.2394
# Calculate the degrees of freedom
df <- length(results$dispo_house_inc) - 1
print(df)
## [1] 227
# Calculate the p-value
p_value <- pchisq(51756,df=227,lower.tail = FALSE)
# print the results
print(p_value)
## [1] 0
## Wald test
wald.test(Sigma = vcov(fit_gamma), b = coef(fit_gamma), Terms = seq(1,11))
## Wald test:
## ----------
##
## Chi-squared test:
## X2 = 2386300.7, df = 11, P(> X2) = 0.0