IIM-A Research Intern Assessment

Author

Pranish Shinde

Step 1: Load the libraries & data

# Load necessary libraries
library(tidyverse)
Warning: package 'tidyverse' was built under R version 4.5.2
Warning: package 'ggplot2' was built under R version 4.5.2
Warning: package 'tibble' was built under R version 4.5.2
Warning: package 'tidyr' was built under R version 4.5.2
Warning: package 'readr' was built under R version 4.5.2
Warning: package 'purrr' was built under R version 4.5.2
Warning: package 'dplyr' was built under R version 4.5.2
Warning: package 'stringr' was built under R version 4.5.2
Warning: package 'forcats' was built under R version 4.5.2
Warning: package 'lubridate' was built under R version 4.5.2
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.6
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.1     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.2
✔ purrr     1.2.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
library(MASS)

Attaching package: 'MASS'

The following object is masked from 'package:dplyr':

    select
library(fitdistrplus)
Warning: package 'fitdistrplus' was built under R version 4.5.2
Loading required package: survival
library(broom)
Warning: package 'broom' was built under R version 4.5.2
library(moments)
Warning: package 'moments' was built under R version 4.5.2
library(stargazer)
Warning: package 'stargazer' was built under R version 4.5.2

Please cite as: 

 Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
 R package version 5.2.3. https://CRAN.R-project.org/package=stargazer 
library(knitr)
Warning: package 'knitr' was built under R version 4.5.2
library(stats4)


# Step 1: Load the data
data <- read.csv("C:/Users/Administrator/Downloads/data_income_consumption_gender.xlsx - Data Task_Intern.csv", header = FALSE)
colnames(data) <- c("Income", "Consumption", "Gender")

view(data)

Step 2: Summary statistics

# Step 2: Summary statistics
summary_stats <- data %>%
  summarise(
    Income_Mean = mean(Income, na.rm = TRUE),
    Income_Median = median(Income, na.rm = TRUE),
    Income_SD = sd(Income, na.rm = TRUE),
    Income_Min = min(Income, na.rm = TRUE),
    Income_Max = max(Income, na.rm = TRUE),
    
    Consumption_Mean = mean(Consumption, na.rm = TRUE),
    Consumption_Median = median(Consumption, na.rm = TRUE),
    Consumption_SD = sd(Consumption, na.rm = TRUE),
    Consumption_Min = min(Consumption, na.rm = TRUE),
    Consumption_Max = max(Consumption, na.rm = TRUE),
    
    Male_Proportion = mean(Gender == 1, na.rm = TRUE)
  )

print(summary_stats)
  Income_Mean Income_Median Income_SD Income_Min Income_Max Consumption_Mean
1    257556.5        205001  184649.7      18901    1641001         240165.9
  Consumption_Median Consumption_SD Consumption_Min Consumption_Max
1           226272.2       142579.2               0         1141876
  Male_Proportion
1       0.5296566
# Create a more detailed summary table by gender
summary_by_gender <- data %>%
  group_by(Gender) %>%
  summarise(
    Count = n(),
    Income_Mean = mean(Income, na.rm = TRUE),
    Income_SD = sd(Income, na.rm = TRUE),
    Consumption_Mean = mean(Consumption, na.rm = TRUE),
    Consumption_SD = sd(Consumption, na.rm = TRUE)
  )

print(summary_by_gender)
# A tibble: 2 × 6
  Gender Count Income_Mean Income_SD Consumption_Mean Consumption_SD
   <int> <int>       <dbl>     <dbl>            <dbl>          <dbl>
1      0   452     250898.   177335.          233613.        137158.
2      1   509     263469.   190890.          245985.        147116.
view(summary_by_gender)

Step 3: Plot normalized histogram of income

# Step 3: Plot normalized histogram of income
# Calculate the bin width using Freedman-Diaconis rule
bin_width <- 2 * IQR(data$Income) / (length(data$Income)^(1/3))
num_bins <- ceiling((max(data$Income) - min(data$Income)) / bin_width)

# Create the histogram
hist_data <- hist(data$Income, breaks = num_bins, plot = FALSE)
hist_data$density <- hist_data$counts / sum(hist_data$counts)

# Plot the normalized histogram
pdf("income_histogram.pdf", width = 10, height = 6)
plot(hist_data, freq = FALSE, 
     main = "Normalized Histogram of Income",
     xlab = "Income",
     ylab = "Density",
     col = "lightblue1",
     prob = TRUE)  # This normalises the histogram
Warning in plot.window(xlim, ylim, "", ...): "prob" is not a graphical
parameter
Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...): "prob"
is not a graphical parameter
Warning in axis(1, ...): "prob" is not a graphical parameter
Warning in axis(2, at = yt, ...): "prob" is not a graphical parameter
dev.off()
png 
  2 
ggplot(data = data, aes(x = Income)) +
  geom_histogram(bins = 30, fill = "lightblue1", color = "black", alpha = 0.7) +
  labs(title = "Histogram of Income", x = "Income", y = "Frequency") +
  theme_minimal()

ggplot(data = data, aes(x = Income)) +
  geom_histogram(aes(y = after_stat(density)), bins = 30, fill = "lightblue1", color = "black", alpha = 0.7) +
  labs(title = "Normalized Histogram of Income", x = "Income", y = "Density") +
  theme_minimal()

Step 4: Fit distributions to the income data

# Step 4: Fit distributions to the income data
# Remove zeros or negative values if any for fitting

df <- data
df <- df %>%
  rename(income=Income, consumption=Consumption, gender=Gender)

# Read the data
income_data <- df

# Basic exploration
summary(income_data$income)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  18901  147001  205001  257556  308701 1641001 
# --- FIX START ---
# 1. Clean: Remove NAs and values <= 0
income_positive <- income_data$income[!is.na(income_data$income) & income_data$income > 0]

# 2. Scale: Divide by 1000 to prevent "Error Code 100"
# (Optimizers fail with very large numbers; scaling fixes this numerical instability)
scale_factor <- 1000
income_scaled <- income_positive / scale_factor

# Fit distributions on the SCALED data
lnorm_fit <- fitdist(income_scaled, "lnorm")
gamma_fit <- fitdist(income_scaled, "gamma")

# Compare the fits
par(mfrow=c(2,2))

# 1. Density Plot
plot(lnorm_fit)

plot(gamma_fit)

# Reset plotting parameters
par(mfrow=c(1,1))

# Plot the fitted distributions against the empirical density (using SCALED data)
hist(income_scaled, breaks=50, probability=TRUE, 
     main="Income (in '000s) Distribution with Fitted Models", 
     xlab="Income (Thousands)", 
     ylim=c(0, max(density(income_scaled)$y)*1.1))

# Add density curves
x_seq <- seq(min(income_scaled), max(income_scaled), length.out=1000)

# Add lognormal density
lnorm_density <- dlnorm(x_seq, meanlog=lnorm_fit$estimate[1], sdlog=lnorm_fit$estimate[2])
lines(x_seq, lnorm_density, col="blue", lwd=2)

# Add gamma density
gamma_density <- dgamma(x_seq, shape=gamma_fit$estimate[1], rate=gamma_fit$estimate[2])
lines(x_seq, gamma_density, col="red", lwd=2)

# Add legend
legend("topright", legend=c("Lognormal", "Gamma"), 
       col=c("blue", "red"), lwd=2)

# Compute goodness-of-fit statistics
lnorm_gof <- gofstat(lnorm_fit)
gamma_gof <- gofstat(gamma_fit)

# Print AIC and BIC
cat("Lognormal AIC:", lnorm_fit$aic, "\n")
Lognormal AIC: 12083.91 
cat("Gamma AIC:", gamma_fit$aic, "\n")
Gamma AIC: 12165.65 
cat("Lognormal BIC:", lnorm_fit$bic, "\n")
Lognormal BIC: 12093.65 
cat("Gamma BIC:", gamma_fit$bic, "\n")
Gamma BIC: 12175.38 
# Calculate and print the Kolmogorov-Smirnov statistics
cat("Lognormal KS statistic:", lnorm_gof$ks, "\n")
Lognormal KS statistic: 0.0540778 
cat("Gamma KS statistic:", gamma_gof$ks, "\n")
Gamma KS statistic: 0.08410316 
# Print the Anderson-Darling statistics
cat("Lognormal AD statistic:", lnorm_gof$ad, "\n")
Lognormal AD statistic: 4.799638 
cat("Gamma AD statistic:", gamma_gof$ad, "\n")
Gamma AD statistic: 13.93761 
# Q-Q plots for visual comparison
par(mfrow=c(1,2))
qqcomp(lnorm_fit, main="Lognormal Q-Q Plot")
qqcomp(gamma_fit, main="Gamma Q-Q Plot")

# P-P plots for visual comparison
par(mfrow=c(1,2))
ppcomp(lnorm_fit, main="Lognormal P-P Plot")
ppcomp(gamma_fit, main="Gamma P-P Plot")

# Compare the empirical CDF with the theoretical CDF
par(mfrow=c(1,2))
cdfcomp(list(lnorm_fit, gamma_fit), legendtext=c("Lognormal", "Gamma"))
cdfcomp(list(lnorm_fit, gamma_fit), legendtext=c("Lognormal", "Gamma"), 
        xlogscale=TRUE, main="CDF Comparison (Log Scale)")

# Reset plot parameters
par(mfrow=c(1,1))

Step 5: Estimating MPC

# Step 5: Estimating MPC
# Run regression (consumption ~ income + gender)

model <- lm(consumption ~ income + gender, data = df)
summary(model)

Call:
lm(formula = consumption ~ income + gender, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-278887  -57758    1056   57861  304128 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 8.473e+04  5.866e+03  14.444   <2e-16 ***
income      5.934e-01  1.595e-02  37.204   <2e-16 ***
gender      4.913e+03  5.897e+03   0.833    0.405    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 91200 on 958 degrees of freedom
Multiple R-squared:  0.5917,    Adjusted R-squared:  0.5909 
F-statistic: 694.3 on 2 and 958 DF,  p-value: < 2.2e-16
mpc <- coef(model)["income"]
cat("Estimated MPC (Income coefficient):", mpc, "\n") 
Estimated MPC (Income coefficient): 0.593381 
# On average-- males have a consumption level that is 4912.594 units higher than females, holding income constant.(male=1, female = 0)

gender_coef <- coef(model)["gender"]
cat("Estimated Gender coefficient:", gender_coef, "\n") 
Estimated Gender coefficient: 4912.594 
# means that, on average, for every additional unit of income, consumption increases by about 59.3% of that amount