R Markdown
library(readxl)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.1
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.5.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.5.1
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(sandwich)
## Warning: package 'sandwich' was built under R version 4.5.2
df <- read_excel("C:/Users/Hp/Desktop/HCES_2025_PUF.xlsx")
## New names:
## • `other vehicles` -> `other vehicles...40`
## • `other vehicles` -> `other vehicles...41`
## • `Total monthly value of cereals in ngultrums` -> `Total monthly value of
## cereals in ngultrums...225`
## • `Total monthly value of Other vegetables, tubers, plantains and cooking
## bananas,` -> `Total monthly value of Other vegetables, tubers, plantains and
## cooking bananas,...257`
## • `Total monthly value of Other vegetables, tubers, plantains and cooking
## bananas,` -> `Total monthly value of Other vegetables, tubers, plantains and
## cooking bananas,...258`
## • `Total monthly value of cereals in ngultrums` -> `Total monthly value of
## cereals in ngultrums...283`
## • `Total monthly value of electricity in ngultrums` -> `Total monthly value of
## electricity in ngultrums...360`
## • `Total monthly value of cameras in ngultrums` -> `Total monthly value of
## cameras in ngultrums...461`
## • `Total monthly value of electricity in ngultrums` -> `Total monthly value of
## electricity in ngultrums...524`
## • `Total monthly value of cameras in ngultrums` -> `Total monthly value of
## cameras in ngultrums...542`
data <- df %>%
dplyr::select(
consumption = `Monthly per capita hh total consumption expenditure`,
hhsize = `(first) hhsize`,
area = `Rural and urban`
) %>%
filter(!is.na(consumption), !is.na(hhsize), consumption > 0)
data <- data %>%
mutate(log_consumption = log(consumption))
summary(data$consumption)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1745 8219 12622 16636 19480 323893
summary(data$hhsize)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 4.000 3.715 5.000 15.000
ggplot(data, aes(consumption)) +
geom_histogram(bins = 40) +
labs(title = "Distribution of Household Consumption")

ggplot(data, aes(log_consumption)) +
geom_histogram(bins = 40) +
labs(title = "Distribution of Log Consumption")

ggplot(data, aes(hhsize, log_consumption)) +
geom_point(alpha = 0.3) +
geom_smooth(method = "lm") +
labs(title = "Household Size vs Log Consumption")
## `geom_smooth()` using formula = 'y ~ x'

model <- lm(log_consumption ~ hhsize, data = data)
summary(model)
##
## Call:
## lm(formula = log_consumption ~ hhsize, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.81959 -0.41451 -0.02568 0.37212 2.73610
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.12834 0.01823 555.7 <2e-16 ***
## hhsize -0.17627 0.00444 -39.7 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5948 on 5881 degrees of freedom
## Multiple R-squared: 0.2114, Adjusted R-squared: 0.2112
## F-statistic: 1576 on 1 and 5881 DF, p-value: < 2.2e-16
coeftest(model, vcov = vcovHC(model, type = "HC1"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.1283437 0.0198305 510.746 < 2.2e-16 ***
## hhsize -0.1762727 0.0048071 -36.669 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
bptest(model)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 7.6454, df = 1, p-value = 0.005692
library(nortest)
## Warning: package 'nortest' was built under R version 4.5.2
ad.test(resid(model)) # Anderson-Darling test
##
## Anderson-Darling normality test
##
## data: resid(model)
## A = 6.1874, p-value = 3.375e-15
plot(model, which = 1)

urban_data <- data %>% filter(area == "Urban")
model_urban <- lm(log_consumption ~ hhsize, data = urban_data)
summary(model_urban)
##
## Call:
## lm(formula = log_consumption ~ hhsize, data = urban_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.59776 -0.36815 -0.02429 0.34187 2.25809
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.358561 0.029570 350.31 <2e-16 ***
## hhsize -0.182499 0.007545 -24.19 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5646 on 2186 degrees of freedom
## Multiple R-squared: 0.2111, Adjusted R-squared: 0.2108
## F-statistic: 585.1 on 1 and 2186 DF, p-value: < 2.2e-16
rural_data <- data %>% filter(area == "Rural")
model_rural <- lm(log_consumption ~ hhsize, data = rural_data)
summary(model_rural)
##
## Call:
## lm(formula = log_consumption ~ hhsize, data = rural_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.68777 -0.38931 -0.03109 0.35261 2.88977
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.963747 0.021942 454.09 <2e-16 ***
## hhsize -0.165349 0.005211 -31.73 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5775 on 3693 degrees of freedom
## Multiple R-squared: 0.2143, Adjusted R-squared: 0.214
## F-statistic: 1007 on 1 and 3693 DF, p-value: < 2.2e-16
semi_data <- data %>% filter(area == "Semi-urban")
if(nrow(semi_data) > 0){
model_semi <- lm(log_consumption ~ hhsize, data = semi_data)
summary(model_semi)
}
model_poly <- lm(log_consumption ~ hhsize + I(hhsize^2), data = data)
summary(model_poly)
##
## Call:
## lm(formula = log_consumption ~ hhsize + I(hhsize^2), data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.79470 -0.41259 -0.01701 0.36637 2.61509
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.368597 0.030193 343.409 <2e-16 ***
## hhsize -0.310955 0.014255 -21.814 <2e-16 ***
## I(hhsize^2) 0.015434 0.001554 9.934 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5899 on 5880 degrees of freedom
## Multiple R-squared: 0.2244, Adjusted R-squared: 0.2241
## F-statistic: 850.5 on 2 and 5880 DF, p-value: < 2.2e-16
p1 <- quantile(data$log_consumption, 0.01)
p99 <- quantile(data$log_consumption, 0.99)
data_trimmed <- data %>% filter(log_consumption > p1, log_consumption < p99)
model_trimmed <- lm(log_consumption ~ hhsize, data = data_trimmed)
summary(model_trimmed)
##
## Call:
## lm(formula = log_consumption ~ hhsize, data = data_trimmed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.61523 -0.39569 -0.02075 0.36503 1.98949
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.047149 0.017518 573.52 <2e-16 ***
## hhsize -0.155966 0.004287 -36.38 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5594 on 5763 degrees of freedom
## Multiple R-squared: 0.1868, Adjusted R-squared: 0.1866
## F-statistic: 1324 on 1 and 5763 DF, p-value: < 2.2e-16
summary(data)
## consumption hhsize area log_consumption
## Min. : 1745 Min. : 1.000 Length:5883 Min. : 7.465
## 1st Qu.: 8219 1st Qu.: 2.000 Class :character 1st Qu.: 9.014
## Median : 12622 Median : 4.000 Mode :character Median : 9.443
## Mean : 16636 Mean : 3.715 Mean : 9.474
## 3rd Qu.: 19480 3rd Qu.: 5.000 3rd Qu.: 9.877
## Max. :323893 Max. :15.000 Max. :12.688
write.csv(data, "cleaned_data.csv", row.names = FALSE)