# Step 2: Select relevant columns for analysisselected_columns <- data %>%select(YY1, Y1,AGE, KIDS, INCOME, HHSEX, EDCL, MARRIED, WSAVED, DEBT, YESFINRISK, NOFINRISK, ANYPEN, CURRPEN)# Convert factors to numeric for correlation calculation (dummy encoding)selected_columns <- selected_columns %>%mutate(across(c(YY1, Y1,HHSEX, EDCL, MARRIED, YESFINRISK, NOFINRISK, ANYPEN, CURRPEN), as.numeric))# Now compute the correlation matrixcor_matrix <-cor(selected_columns %>%select(YY1, Y1, AGE, KIDS, INCOME, HHSEX, EDCL, MARRIED, WSAVED, DEBT, YESFINRISK, NOFINRISK, CURRPEN, ANYPEN), use ="complete.obs")# Plot the correlation matrix as a heatmapcorrplot(cor_matrix, method ="circle")
# Fit a linear regression model for CURRPENlm_currpen <-lm(CURRPEN ~ AGE + KIDS + INCOME + EDCL + MARRIED + WSAVED + DEBT + YESFINRISK + NOFINRISK, data = selected_columns)# Calculate VIF for CURRPEN modelvif_currpen <-vif(lm_currpen)# Create a table of VIFs for CURRPEN modelvif_currpen_table <-data.frame(Variable =names(vif_currpen), VIF = vif_currpen)# Print the tableprint(vif_currpen_table)
Variable VIF
AGE AGE 1.161610
KIDS KIDS 1.199761
INCOME INCOME 1.125151
EDCL EDCL 1.262541
MARRIED MARRIED 1.154818
WSAVED WSAVED 1.115705
DEBT DEBT 1.112265
YESFINRISK YESFINRISK 1.039653
NOFINRISK NOFINRISK 1.321477
summary(lm_currpen)
Call:
lm(formula = CURRPEN ~ AGE + KIDS + INCOME + EDCL + MARRIED +
WSAVED + DEBT + YESFINRISK + NOFINRISK, data = selected_columns)
Residuals:
Min 1Q Median 3Q Max
-1182816 -51546 -22004 5154 16354486
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -4.967e+04 1.777e+04 -2.796 0.005185 **
AGE 1.606e+03 1.598e+02 10.045 < 2e-16 ***
KIDS -8.763e+03 2.373e+03 -3.692 0.000223 ***
INCOME 2.388e-03 2.047e-04 11.668 < 2e-16 ***
EDCL 7.298e+03 2.629e+03 2.776 0.005512 **
MARRIED -2.921e+04 5.350e+03 -5.460 4.81e-08 ***
WSAVED 4.855e+03 3.390e+03 1.432 0.152158
DEBT -4.682e-03 9.952e-04 -4.704 2.56e-06 ***
YESFINRISK 2.776e+04 1.091e+04 2.544 0.010965 *
NOFINRISK -9.318e+03 5.929e+03 -1.572 0.116048
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 363900 on 22965 degrees of freedom
Multiple R-squared: 0.01813, Adjusted R-squared: 0.01774
F-statistic: 47.11 on 9 and 22965 DF, p-value: < 2.2e-16
FE model
Failing as not much time variation
# estimate the fixed effects regression with plm()fe_currpen <-plm(formula = CURRPEN ~ AGE + KIDS + INCOME + EDCL + MARRIED + WSAVED + DEBT + YESFINRISK , data = selected_columns, index =c("YY1", "Y1"), # declaring data to be panelmodel ="within"# fixed effects model )# estimate the fixed effects regression with plm()fe_currpen <-plm(formula = CURRPEN ~ INCOME + DEBT + YESFINRISK , data = selected_columns, index =c("YY1", "Y1"), # declaring data to be panelmodel ="within"# fixed effects model )
# Calculate variation (e.g., standard deviation) within YY1 groups over timevariation_summary <- selected_columns %>%group_by(YY1) %>%# Group by YY1 summarize(currpen_sd =sd(CURRPEN, na.rm =TRUE),income_sd =sd(INCOME, na.rm =TRUE),debt_sd =sd(DEBT, na.rm =TRUE),yesfinrisk_sd =sd(YESFINRISK, na.rm =TRUE) ) %>%ungroup() # Remove groupingprint(variation_summary)
# Step 3: Convert categorical variables to factors for correlation analysis and modelingselected_columns <- selected_columns %>%mutate(across(c(HHSEX, EDCL, MARRIED, YESFINRISK, NOFINRISK, WSAVED), as.factor))# Recode ANYPEN from 1 and 2 to 0 and 1 using ifelseselected_columns$ANYPEN <-ifelse(selected_columns$ANYPEN ==1,yes =0, no =1 )# Convert ANYPEN to a factor (binary outcome for logistic regression)selected_columns$ANYPEN <-as.factor(selected_columns$ANYPEN)
Quantile regression can be run if -
Multicollinearity not an issue.
Enough variation in the data.
No missing values.
lm_df <-lm(formula = CURRPEN ~ AGE + KIDS + INCOME + EDCL + MARRIED + WSAVED + DEBT + YESFINRISK,data = selected_columns)plot(lm_df)
Still cannot run quantile regression. Simplify the regression further - remove some variables.
Identify and Remove Outliers
You would naturally assume right skewness in these three vars (lower bound is zero).
Defend your assumption of Z score threshold of 3. Addressing skewness in independent variables (X variables) in an OLS regression can be important, but it’s not strictly required in every case.
Can winsorise the top 5% observations or so, or try to log them to reduce skewness, or do both.
Use quantile regression. This makes your paper a bit fancier as well.
quantile_model <-rq(formula = CURRPEN ~ AGE + KIDS + INCOME + EDCL + MARRIED + WSAVED + DEBT + YESFINRISK,data = aggregated_data,tau =0.5,lambda =0.1)# Summarysummary(quantile_model)
Standard errors seem to be an issue. Not a lot of variation in data.
# Fit lasso-regularized quantile regressionquantile_model_lasso <-rq(formula = CURRPEN ~ AGE + KIDS + INCOME + EDCL + MARRIED + WSAVED + DEBT + YESFINRISK,data = aggregated_data,tau =c(0.1, 0.25, 0.5, 0.75, 0.9),method ="lasso")# Extract coefficientscoef(quantile_model_lasso)