Econometrics Final

Author

AS - Song Lu

Libraries

remove(list=ls())
# Load necessary libraries
library(dplyr)


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

library(car)

Loading required package: carData


Attaching package: 'car'

The following object is masked from 'package:dplyr':

    recode

library(corrplot)

corrplot 0.94 loaded

library(GGally)

Loading required package: ggplot2

Registered S3 method overwritten by 'GGally':
  method from   
  +.gg   ggplot2

library(ggplot2)
library(glmnet)

Loading required package: Matrix

Loaded glmnet 4.1-8

library(gridExtra)


Attaching package: 'gridExtra'

The following object is masked from 'package:dplyr':

    combine

library(visdat)
library(car)
library(stargazer)


Please cite as:

 Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.

 R package version 5.2.3. https://CRAN.R-project.org/package=stargazer

library(plm)


Attaching package: 'plm'

The following objects are masked from 'package:dplyr':

    between, lag, lead

library(quantreg)

Loading required package: SparseM


Attaching package: 'SparseM'

The following object is masked from 'package:Matrix':

    det

Load in Dataset

file_path <- "SCFP2022.csv"  
data <- read.csv(file_path)

length(unique(data$Y1))  # total obs

[1] 22975

length(unique(data$YY1)) # HH

[1] 4595

Aggregate Data (cross-sectional)

vis_dat(data[,1:30])

vis_dat(data[,31:60])

vis_dat(data[,327:357])

# Aggregate by HH1

aggregated_data <- data %>%
  group_by(YY1) %>%
  summarize(
    AGE = mean(AGE, na.rm=TRUE), 
    KIDS = mean(KIDS, na.rm=TRUE), 
    INCOME = mean(INCOME, na.rm=TRUE), 
    HHSEX = mean(HHSEX, na.rm=TRUE), 
    EDCL=mean(EDCL, na.rm=TRUE), 
    MARRIED = mean(MARRIED, na.rm=TRUE), 
    WSAVED = mean(WSAVED, na.rm=TRUE), 
    DEBT = mean(DEBT, na.rm=TRUE), 
    YESFINRISK = mean(YESFINRISK, na.rm=TRUE), 
    NOFINRISK = mean(NOFINRISK, na.rm=TRUE), 
    ANYPEN = mean(ANYPEN, na.rm=TRUE), 
    CURRPEN = mean(CURRPEN, na.rm=TRUE),
  
    n = n()  # Count of rows per group
  )


lm_df <- lm(formula = I(log(CURRPEN+.0000001)) ~ AGE + KIDS + INCOME + EDCL + MARRIED + WSAVED + DEBT + YESFINRISK,
            data = aggregated_data)
stargazer(lm_df, type = "text")


===============================================
                        Dependent variable:    
                    ---------------------------
                      I(log(CURRPEN + 1e-07))  
-----------------------------------------------
AGE                          0.039***          
                              (0.004)          
                                               
KIDS                         -0.206***         
                              (0.062)          
                                               
INCOME                         0.000           
                              (0.000)          
                                               
EDCL                         0.215***          
                              (0.064)          
                                               
MARRIED                      -0.544***         
                              (0.139)          
                                               
WSAVED                        -0.066           
                              (0.088)          
                                               
DEBT                         -0.00000          
                             (0.00000)         
                                               
YESFINRISK                    -0.341           
                              (0.281)          
                                               
Constant                    -17.163***         
                              (0.452)          
                                               
-----------------------------------------------
Observations                   4,595           
R2                             0.037           
Adjusted R2                    0.035           
Residual Std. Error      4.260 (df = 4586)     
F Statistic          21.858*** (df = 8; 4586)  
===============================================
Note:               *p<0.1; **p<0.05; ***p<0.01

Panel Data

Multicollinearity Visualization

as.numeric code made more efficient

# Step 2: Select relevant columns for analysis
selected_columns <- data %>%
  select(YY1, Y1,AGE, KIDS, INCOME, HHSEX, EDCL, MARRIED, WSAVED, DEBT, YESFINRISK, NOFINRISK, ANYPEN, CURRPEN)


# Convert factors to numeric for correlation calculation (dummy encoding)
selected_columns <- selected_columns %>%
  mutate(across(c(YY1,  Y1,HHSEX, EDCL, MARRIED, YESFINRISK, NOFINRISK, ANYPEN, CURRPEN), as.numeric))


# Now compute the correlation matrix
cor_matrix <- cor(selected_columns %>% select(YY1, Y1, AGE, KIDS, INCOME, HHSEX, EDCL, MARRIED, WSAVED, DEBT, YESFINRISK, NOFINRISK, CURRPEN, ANYPEN), use = "complete.obs")

# Plot the correlation matrix as a heatmap
corrplot(cor_matrix, method = "circle")

# Fit a linear regression model for CURRPEN
lm_currpen <- lm(CURRPEN ~ AGE + KIDS + INCOME + EDCL + MARRIED + WSAVED + DEBT + YESFINRISK + NOFINRISK, data = selected_columns)


# Calculate VIF for CURRPEN model
vif_currpen <- vif(lm_currpen)

# Create a table of VIFs for CURRPEN model
vif_currpen_table <- data.frame(Variable = names(vif_currpen), VIF = vif_currpen)

# Print the table
print(vif_currpen_table)

             Variable      VIF
AGE               AGE 1.161610
KIDS             KIDS 1.199761
INCOME         INCOME 1.125151
EDCL             EDCL 1.262541
MARRIED       MARRIED 1.154818
WSAVED         WSAVED 1.115705
DEBT             DEBT 1.112265
YESFINRISK YESFINRISK 1.039653
NOFINRISK   NOFINRISK 1.321477

summary(lm_currpen)


Call:
lm(formula = CURRPEN ~ AGE + KIDS + INCOME + EDCL + MARRIED + 
    WSAVED + DEBT + YESFINRISK + NOFINRISK, data = selected_columns)

Residuals:
     Min       1Q   Median       3Q      Max 
-1182816   -51546   -22004     5154 16354486 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -4.967e+04  1.777e+04  -2.796 0.005185 ** 
AGE          1.606e+03  1.598e+02  10.045  < 2e-16 ***
KIDS        -8.763e+03  2.373e+03  -3.692 0.000223 ***
INCOME       2.388e-03  2.047e-04  11.668  < 2e-16 ***
EDCL         7.298e+03  2.629e+03   2.776 0.005512 ** 
MARRIED     -2.921e+04  5.350e+03  -5.460 4.81e-08 ***
WSAVED       4.855e+03  3.390e+03   1.432 0.152158    
DEBT        -4.682e-03  9.952e-04  -4.704 2.56e-06 ***
YESFINRISK   2.776e+04  1.091e+04   2.544 0.010965 *  
NOFINRISK   -9.318e+03  5.929e+03  -1.572 0.116048    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 363900 on 22965 degrees of freedom
Multiple R-squared:  0.01813,   Adjusted R-squared:  0.01774 
F-statistic: 47.11 on 9 and 22965 DF,  p-value: < 2.2e-16

FE model

Failing as not much time variation

# estimate the fixed effects regression with plm()
fe_currpen <- plm(formula = CURRPEN ~ AGE + KIDS + INCOME + EDCL + MARRIED + WSAVED + DEBT + YESFINRISK , 
                    data    = selected_columns, 
                    index   = c("YY1", "Y1"),     # declaring data to be panel
                    model   = "within"            # fixed effects model
                    )

# estimate the fixed effects regression with plm()
fe_currpen <- plm(formula = CURRPEN ~ INCOME + DEBT + YESFINRISK , 
                    data    = selected_columns, 
                    index   = c("YY1", "Y1"),     # declaring data to be panel
                    model   = "within"            # fixed effects model
                    )

# Calculate variation (e.g., standard deviation) within YY1 groups over time
variation_summary <- selected_columns %>%
  group_by(YY1) %>%  # Group by YY1 
  summarize(
    currpen_sd = sd(CURRPEN, na.rm = TRUE),
    income_sd = sd(INCOME, na.rm = TRUE),
    debt_sd = sd(DEBT, na.rm = TRUE),
    yesfinrisk_sd = sd(YESFINRISK, na.rm = TRUE)
  ) %>%
  ungroup()  # Remove grouping
print(variation_summary)

# A tibble: 4,595 × 5
     YY1 currpen_sd income_sd  debt_sd yesfinrisk_sd
   <dbl>      <dbl>     <dbl>    <dbl>         <dbl>
 1     1          0     2145.  147164.             0
 2     2          0        0      548.             0
 3     3          0    23773. 1602476.             0
 4     4          0     3114.       0              0
 5     5          0     4116.     548.             0
 6     6          0     7229.   23700.             0
 7     7          0   479357.       0              0
 8     8          0      483.       0              0
 9     9          0        0      548.             0
10    10          0        0        0              0
# ℹ 4,585 more rows

Data Preparation

Make sure dummy creation/recode is correct.
Make as.factor more efficient

# Step 3: Convert categorical variables to factors for correlation analysis and modeling
selected_columns <- selected_columns %>%
  mutate(across(c(HHSEX, EDCL, MARRIED, YESFINRISK, NOFINRISK, WSAVED), as.factor))


# Recode ANYPEN from 1 and 2 to 0 and 1 using ifelse
selected_columns$ANYPEN <- ifelse(selected_columns$ANYPEN == 1,
                                  yes = 0, 
                                  no = 1
                                  )

# Convert ANYPEN to a factor (binary outcome for logistic regression)
selected_columns$ANYPEN <- as.factor(selected_columns$ANYPEN)

Quantile regression can be run if -

Multicollinearity not an issue.
Enough variation in the data.
No missing values.

lm_df <- lm(formula = CURRPEN ~ AGE + KIDS + INCOME + EDCL + MARRIED + WSAVED + DEBT + YESFINRISK,
            data = selected_columns)

plot(lm_df)

vif(lm_df )

               GVIF Df GVIF^(1/(2*Df))
AGE        1.161160  1        1.077571
KIDS       1.200294  1        1.095579
INCOME     1.125711  1        1.060995
EDCL       1.135215  3        1.021362
MARRIED    1.148578  1        1.071717
WSAVED     1.120905  2        1.028945
DEBT       1.112255  1        1.054635
YESFINRISK 1.010812  1        1.005391

hist(selected_columns$CURRPEN)

summary(selected_columns$CURRPEN)

    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
       0        0        0    26634        0 16690000

lm_df <- lm(formula = I(log(CURRPEN+.0000001)) ~ AGE + KIDS + INCOME + EDCL + MARRIED + WSAVED + DEBT + YESFINRISK,
            data = selected_columns)

plot(lm_df)

vif(lm_df )

               GVIF Df GVIF^(1/(2*Df))
AGE        1.161160  1        1.077571
KIDS       1.200294  1        1.095579
INCOME     1.125711  1        1.060995
EDCL       1.135215  3        1.021362
MARRIED    1.148578  1        1.071717
WSAVED     1.120905  2        1.028945
DEBT       1.112255  1        1.054635
YESFINRISK 1.010812  1        1.005391

summary(lm_df)


Call:
lm(formula = I(log(CURRPEN + 1e-07)) ~ AGE + KIDS + INCOME + 
    EDCL + MARRIED + WSAVED + DEBT + YESFINRISK, data = selected_columns)

Residuals:
    Min      1Q  Median      3Q     Max 
-4.0031 -1.2373 -0.6148  0.0177 30.5607 

Coefficients:
              Estimate Std. Error  t value Pr(>|t|)    
(Intercept) -1.770e+01  1.617e-01 -109.454  < 2e-16 ***
AGE          3.834e-02  1.850e-03   20.727  < 2e-16 ***
KIDS        -1.978e-01  2.748e-02   -7.201 6.17e-13 ***
INCOME       4.318e-09  2.369e-09    1.822  0.06842 .  
EDCL2        5.458e-01  1.114e-01    4.900 9.66e-07 ***
EDCL3        5.246e-01  1.103e-01    4.755 2.00e-06 ***
EDCL4        8.076e-01  1.028e-01    7.859 4.05e-15 ***
MARRIED2    -5.165e-01  6.176e-02   -8.364  < 2e-16 ***
WSAVED2     -9.933e-02  9.175e-02   -1.083  0.27901    
WSAVED3     -1.517e-01  8.157e-02   -1.860  0.06292 .  
DEBT        -3.524e-08  1.152e-08   -3.060  0.00222 ** 
YESFINRISK1 -3.395e-01  1.245e-01   -2.726  0.00641 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.212 on 22963 degrees of freedom
Multiple R-squared:  0.03638,   Adjusted R-squared:  0.03592 
F-statistic: 78.81 on 11 and 22963 DF,  p-value: < 2.2e-16

Multicollinearity not an issue.

summary(selected_columns)

      YY1             Y1             AGE             KIDS        
 Min.   :   1   Min.   :   11   Min.   :18.00   Min.   : 0.0000  
 1st Qu.:1152   1st Qu.:11524   1st Qu.:42.00   1st Qu.: 0.0000  
 Median :2303   Median :23033   Median :56.00   Median : 0.0000  
 Mean   :2303   Mean   :23029   Mean   :54.47   Mean   : 0.7386  
 3rd Qu.:3454   3rd Qu.:34542   3rd Qu.:67.00   3rd Qu.: 1.0000  
 Max.   :4603   Max.   :46035   Max.   :95.00   Max.   :10.0000  
     INCOME          HHSEX     EDCL      MARRIED   WSAVED   
 Min.   :        0   1:17485   1: 2146   1:14525   1: 3574  
 1st Qu.:    42156   2: 5490   2: 4522   2: 8450   2: 5250  
 Median :    94039             3: 4939             3:14151  
 Mean   :  1592855             4:11368                      
 3rd Qu.:   264823                                          
 Max.   :458230949                                          
      DEBT           YESFINRISK NOFINRISK ANYPEN       CURRPEN        
 Min.   :        0   0:21754    0:15679   0:12897   Min.   :       0  
 1st Qu.:        0   1: 1221    1: 7296   1:10078   1st Qu.:       0  
 Median :    29000                                  Median :       0  
 Mean   :   360643                                  Mean   :   26634  
 3rd Qu.:   217000                                  3rd Qu.:       0  
 Max.   :131990000                                  Max.   :16690000

sapply(selected_columns, function(x) length(unique(x)))

       YY1         Y1        AGE       KIDS     INCOME      HHSEX       EDCL 
      4595      22975         78         10       2593          2          4 
   MARRIED     WSAVED       DEBT YESFINRISK  NOFINRISK     ANYPEN    CURRPEN 
         2          3       4547          2          2          2        255

selected_columns2 <- selected_columns %>%
  select_if(function(x) length(unique(x)) > 1)

Enough variation in the data.

sum(is.na(selected_columns))  # Count missing values

[1] 0

# Remove rows with missing values
selected_columns <- na.omit(selected_columns)

No missing values.

Quantile Regression

Watch the video for explanation.
In a picture
Implementation in Stata ; Python
Interpretation

Still cannot run quantile regression. Simplify the regression further - remove some variables.

Identify and Remove Outliers

You would naturally assume right skewness in these three vars (lower bound is zero).
Defend your assumption of Z score threshold of 3. Addressing skewness in independent variables (X variables) in an OLS regression can be important, but it’s not strictly required in every case.
- Can winsorise the top 5% observations or so, or try to log them to reduce skewness, or do both.
Use quantile regression. This makes your paper a bit fancier as well.

quantile_model <- rq(
  formula = CURRPEN ~ AGE + KIDS + INCOME + EDCL + MARRIED + WSAVED + DEBT + YESFINRISK,
  data = aggregated_data,
  tau = 0.5,
  lambda = 0.1
)

# Summary
summary(quantile_model)

Standard errors seem to be an issue. Not a lot of variation in data.

# Fit lasso-regularized quantile regression
quantile_model_lasso <- rq(
  formula = CURRPEN ~ AGE + KIDS + INCOME + EDCL + MARRIED + WSAVED + DEBT + YESFINRISK,
  data = aggregated_data,
  tau = c(0.1, 0.25, 0.5, 0.75, 0.9),
  method = "lasso"
)

# Extract coefficients
coef(quantile_model_lasso)

                tau= 0.10     tau= 0.25     tau= 0.50     tau= 0.75
(Intercept) -6.305314e-11 -1.235397e-10  9.020230e-12  1.803980e-11
AGE          5.188849e-17  1.443716e-16  1.592052e-16  5.894242e-17
KIDS        -6.102776e-13 -6.491629e-12 -3.421326e-12 -9.772582e-13
INCOME       1.322198e-28 -2.420799e-28  7.418144e-28 -1.978610e-28
EDCL         1.986470e-13 -3.448319e-13  8.207572e-13  2.691709e-13
MARRIED     -1.270683e-12 -1.899526e-11 -1.857585e-12  6.644150e-13
WSAVED      -6.239952e-14  4.978797e-13  9.656998e-13 -3.097413e-14
DEBT         6.139817e-26 -4.437688e-26  2.852260e-26  1.143532e-26
YESFINRISK  -1.112190e-12 -2.465705e-11 -8.290218e-12 -2.524586e-12
                tau= 0.90
(Intercept)  8.454461e-10
AGE          4.324559e-16
KIDS        -6.782490e-11
INCOME      -7.214120e-28
EDCL         4.223731e-12
MARRIED     -2.172013e-11
WSAVED       2.374944e-12
DEBT         1.080709e-26
YESFINRISK  -1.217901e-10

Coefficients are not an issue. Some variation in data.

Try to play with this more - try different packages.