R Markdown

library(readxl)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.1
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.5.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.5.1
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(sandwich)
## Warning: package 'sandwich' was built under R version 4.5.2
df <- read_excel("C:/Users/Hp/Desktop/HCES_2025_PUF.xlsx")
## New names:
## • `other vehicles` -> `other vehicles...40`
## • `other vehicles` -> `other vehicles...41`
## • `Total monthly value of cereals in ngultrums` -> `Total monthly value of
##   cereals in ngultrums...225`
## • `Total monthly value of Other vegetables, tubers, plantains and cooking
##   bananas,` -> `Total monthly value of Other vegetables, tubers, plantains and
##   cooking bananas,...257`
## • `Total monthly value of Other vegetables, tubers, plantains and cooking
##   bananas,` -> `Total monthly value of Other vegetables, tubers, plantains and
##   cooking bananas,...258`
## • `Total monthly value of cereals in ngultrums` -> `Total monthly value of
##   cereals in ngultrums...283`
## • `Total monthly value of electricity in ngultrums` -> `Total monthly value of
##   electricity in ngultrums...360`
## • `Total monthly value of cameras in ngultrums` -> `Total monthly value of
##   cameras in ngultrums...461`
## • `Total monthly value of electricity in ngultrums` -> `Total monthly value of
##   electricity in ngultrums...524`
## • `Total monthly value of cameras in ngultrums` -> `Total monthly value of
##   cameras in ngultrums...542`
data <- df %>%
  dplyr::select(
    consumption = `Monthly per capita hh total consumption expenditure`,
    hhsize = `(first) hhsize`,
    area = `Rural and urban`
  ) %>%
  filter(!is.na(consumption), !is.na(hhsize), consumption > 0)
data <- data %>%
  mutate(log_consumption = log(consumption))
summary(data$consumption)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1745    8219   12622   16636   19480  323893
summary(data$hhsize)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   4.000   3.715   5.000  15.000
ggplot(data, aes(consumption)) +
  geom_histogram(bins = 40) +
  labs(title = "Distribution of Household Consumption")

ggplot(data, aes(log_consumption)) +
  geom_histogram(bins = 40) +
  labs(title = "Distribution of Log Consumption")

ggplot(data, aes(hhsize, log_consumption)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm") +
  labs(title = "Household Size vs Log Consumption")
## `geom_smooth()` using formula = 'y ~ x'

model <- lm(log_consumption ~ hhsize, data = data)
summary(model)
## 
## Call:
## lm(formula = log_consumption ~ hhsize, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.81959 -0.41451 -0.02568  0.37212  2.73610 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.12834    0.01823   555.7   <2e-16 ***
## hhsize      -0.17627    0.00444   -39.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5948 on 5881 degrees of freedom
## Multiple R-squared:  0.2114, Adjusted R-squared:  0.2112 
## F-statistic:  1576 on 1 and 5881 DF,  p-value: < 2.2e-16
coeftest(model, vcov = vcovHC(model, type = "HC1"))
## 
## t test of coefficients:
## 
##               Estimate Std. Error t value  Pr(>|t|)    
## (Intercept) 10.1283437  0.0198305 510.746 < 2.2e-16 ***
## hhsize      -0.1762727  0.0048071 -36.669 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
bptest(model)
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 7.6454, df = 1, p-value = 0.005692
library(nortest)
## Warning: package 'nortest' was built under R version 4.5.2
ad.test(resid(model))    # Anderson-Darling test
## 
##  Anderson-Darling normality test
## 
## data:  resid(model)
## A = 6.1874, p-value = 3.375e-15
plot(model, which = 1)

urban_data <- data %>% filter(area == "Urban")
model_urban <- lm(log_consumption ~ hhsize, data = urban_data)
summary(model_urban)
## 
## Call:
## lm(formula = log_consumption ~ hhsize, data = urban_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.59776 -0.36815 -0.02429  0.34187  2.25809 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.358561   0.029570  350.31   <2e-16 ***
## hhsize      -0.182499   0.007545  -24.19   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5646 on 2186 degrees of freedom
## Multiple R-squared:  0.2111, Adjusted R-squared:  0.2108 
## F-statistic: 585.1 on 1 and 2186 DF,  p-value: < 2.2e-16
rural_data <- data %>% filter(area == "Rural")
model_rural <- lm(log_consumption ~ hhsize, data = rural_data)
summary(model_rural)
## 
## Call:
## lm(formula = log_consumption ~ hhsize, data = rural_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.68777 -0.38931 -0.03109  0.35261  2.88977 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.963747   0.021942  454.09   <2e-16 ***
## hhsize      -0.165349   0.005211  -31.73   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5775 on 3693 degrees of freedom
## Multiple R-squared:  0.2143, Adjusted R-squared:  0.214 
## F-statistic:  1007 on 1 and 3693 DF,  p-value: < 2.2e-16
semi_data <- data %>% filter(area == "Semi-urban")
if(nrow(semi_data) > 0){
  model_semi <- lm(log_consumption ~ hhsize, data = semi_data)
  summary(model_semi)
}
model_poly <- lm(log_consumption ~ hhsize + I(hhsize^2), data = data)
summary(model_poly)
## 
## Call:
## lm(formula = log_consumption ~ hhsize + I(hhsize^2), data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.79470 -0.41259 -0.01701  0.36637  2.61509 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.368597   0.030193 343.409   <2e-16 ***
## hhsize      -0.310955   0.014255 -21.814   <2e-16 ***
## I(hhsize^2)  0.015434   0.001554   9.934   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5899 on 5880 degrees of freedom
## Multiple R-squared:  0.2244, Adjusted R-squared:  0.2241 
## F-statistic: 850.5 on 2 and 5880 DF,  p-value: < 2.2e-16
p1 <- quantile(data$log_consumption, 0.01)
p99 <- quantile(data$log_consumption, 0.99)

data_trimmed <- data %>% filter(log_consumption > p1, log_consumption < p99)

model_trimmed <- lm(log_consumption ~ hhsize, data = data_trimmed)
summary(model_trimmed)
## 
## Call:
## lm(formula = log_consumption ~ hhsize, data = data_trimmed)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.61523 -0.39569 -0.02075  0.36503  1.98949 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.047149   0.017518  573.52   <2e-16 ***
## hhsize      -0.155966   0.004287  -36.38   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5594 on 5763 degrees of freedom
## Multiple R-squared:  0.1868, Adjusted R-squared:  0.1866 
## F-statistic:  1324 on 1 and 5763 DF,  p-value: < 2.2e-16
summary(data)
##   consumption         hhsize           area           log_consumption 
##  Min.   :  1745   Min.   : 1.000   Length:5883        Min.   : 7.465  
##  1st Qu.:  8219   1st Qu.: 2.000   Class :character   1st Qu.: 9.014  
##  Median : 12622   Median : 4.000   Mode  :character   Median : 9.443  
##  Mean   : 16636   Mean   : 3.715                      Mean   : 9.474  
##  3rd Qu.: 19480   3rd Qu.: 5.000                      3rd Qu.: 9.877  
##  Max.   :323893   Max.   :15.000                      Max.   :12.688
write.csv(data, "cleaned_data.csv", row.names = FALSE)