Goal - Predict income based on demographics for NYC residents. The data sets is from ACS using the tidycensus package, using the latest 5 year estimates. The goal is to understand which demographic factors can predict median household income for NYC residents. As a NYC resident, this data set seemed interesting because it can help us understand whether certain demographics plays a significant role in determining income for new yorkers

The variables we are using are:

Load data

library(tidycensus)
library(tidyverse)
library(dplyr)
library(stringr)
library(tidyr)
library(ggcorrplot)
library(rpart)
library(gbm)
library(nnet)
census_api_key("d0dd6f2e8d522579907cb8f5d50d832d4887bbe7", install = TRUE, overwrite = TRUE)
## Your original .Renviron will be backed up and stored in your R HOME directory if needed.
## Your API key has been stored in your .Renviron and can be accessed by Sys.getenv("CENSUS_API_KEY"). 
## To use now, restart R or run `readRenviron("~/.Renviron")`
## [1] "d0dd6f2e8d522579907cb8f5d50d832d4887bbe7"
readRenviron("~/.Renviron")
vars <- c(
  # Income & Population
  median_income = "B19013_001",

  
  # Race
  white_pop = "B02001_002",
  black_pop = "B02001_003",
  asian_pop = "B02001_005",
  hispanic_pop = "B03003_003",
  
  # Education
  bachelors = "B15003_022",
  masters = "B15003_023",
  
  # Age groups
  age_under_18 = "B01001_003",    
  age_18_64 = "B01001_020",       
  age_65_over = "B01001_020",     
  
  # Household info
  total_households = "B11001_001",
  single_parent_households = "B11003_010",  # Female head, children under 18
  
  # Immigration 
  foreign_born = "B05002_013",
  native_born = "B05002_002",
  
  # Dependents 
  under_18_total = "B09001_001"

)
data <- get_acs(
  geography = "zcta",
  variables = vars,
  year = 2023,
  survey = "acs5",
  output = "wide"
)
## Getting data from the 2019-2023 5-year ACS

filter by nyc

nyc <- data %>%
  filter(GEOID %in% nyc_zips)

glimpse(nyc)
## Rows: 240
## Columns: 30
## $ GEOID                     <chr> "10001", "10002", "10003", "10004", "10005",…
## $ NAME                      <chr> "ZCTA5 10001", "ZCTA5 10002", "ZCTA5 10003",…
## $ median_incomeE            <dbl> 123393, 46525, 153750, 220592, 211810, 20997…
## $ median_incomeM            <dbl> 22852, 5561, 11191, 53147, 29291, 43452, NA,…
## $ white_popE                <dbl> 15893, 23597, 36193, 2460, 6254, 2932, 5593,…
## $ white_popM                <dbl> 1561, 1946, 2666, 635, 977, 547, 998, 2435, …
## $ black_popE                <dbl> 3070, 7074, 2367, 278, 405, 271, 704, 5002, …
## $ black_popM                <dbl> 727, 1161, 313, 165, 280, 235, 324, 698, 133…
## $ asian_popE                <dbl> 5228, 27700, 9639, 923, 1311, 1058, 1179, 75…
## $ asian_popM                <dbl> 698, 1989, 982, 586, 461, 628, 511, 1209, 80…
## $ hispanic_popE             <dbl> 5547, 18646, 5285, 189, 935, 125, 1016, 1424…
## $ hispanic_popM             <dbl> 803, 2053, 720, 105, 488, 110, 600, 1456, 61…
## $ bachelorsE                <dbl> 9424, 15771, 16346, 1283, 3703, 1499, 2936, …
## $ bachelorsM                <dbl> 1009, 1236, 2279, 392, 658, 382, 751, 1818, …
## $ mastersE                  <dbl> 5018, 5620, 7964, 839, 1377, 741, 1362, 9037…
## $ mastersM                  <dbl> 720, 812, 803, 307, 437, 229, 442, 1673, 807…
## $ age_under_18E             <dbl> 675, 1351, 895, 153, 56, 92, 295, 1101, 708,…
## $ age_under_18M             <dbl> 324, 411, 264, 111, 91, 73, 176, 406, 293, 2…
## $ age_18_64E                <dbl> 174, 1154, 484, 59, 151, 0, 114, 498, 287, 4…
## $ age_18_64M                <dbl> 85, 662, 188, 91, 128, 13, 80, 265, 210, 177…
## $ total_householdsE         <dbl> 15097, 35771, 25080, 1775, 5156, 2305, 3309,…
## $ total_householdsM         <dbl> 796, 1265, 1152, 384, 588, 270, 472, 1230, 1…
## $ single_parent_householdsE <dbl> 176, 227, 154, 11, 5, 0, 21, 643, 276, 132, …
## $ single_parent_householdsM <dbl> 193, 117, 134, 11, 7, 13, 27, 401, 62, 93, 8…
## $ foreign_bornE             <dbl> 7796, 28899, 10633, 856, 2285, 1508, 1469, 1…
## $ foreign_bornM             <dbl> 995, 1988, 1128, 329, 604, 646, 459, 1645, 9…
## $ native_bornE              <dbl> 21283, 46618, 43192, 3019, 6953, 2967, 6333,…
## $ native_bornM              <dbl> 1571, 2525, 2385, 785, 1038, 515, 999, 2903,…
## $ under_18_totalE           <dbl> 3322, 10121, 3792, 776, 843, 301, 1550, 6049…
## $ under_18_totalM           <dbl> 724, 1368, 668, 341, 279, 98, 399, 831, 657,…
nyc <- nyc[, !grepl("M$", names(nyc))]

Data Exploration

summary(nyc)
##     GEOID               NAME           median_incomeE     white_popE   
##  Length:240         Length:240         Min.   : 27500   Min.   :    0  
##  Class :character   Class :character   1st Qu.: 69786   1st Qu.: 4085  
##  Mode  :character   Mode  :character   Median : 91042   Median : 9838  
##                                        Mean   :101374   Mean   :14073  
##                                        3rd Qu.:122390   3rd Qu.:19889  
##                                        Max.   :250001   Max.   :61357  
##                                        NA's   :27                      
##    black_popE      asian_popE    hispanic_popE     bachelorsE   
##  Min.   :    0   Min.   :    0   Min.   :    0   Min.   :    0  
##  1st Qu.:  544   1st Qu.:  764   1st Qu.: 2041   1st Qu.: 2579  
##  Median : 2367   Median : 2594   Median : 6702   Median : 5166  
##  Mean   : 8566   Mean   : 5482   Mean   :10890   Mean   : 6498  
##  3rd Qu.:10534   3rd Qu.: 7211   3rd Qu.:12665   3rd Qu.: 9780  
##  Max.   :76552   Max.   :59546   Max.   :79956   Max.   :20536  
##                                                                 
##     mastersE     age_under_18E      age_18_64E     total_householdsE
##  Min.   :    0   Min.   :   0.0   Min.   :   0.0   Min.   :    0    
##  1st Qu.: 1362   1st Qu.: 331.8   1st Qu.: 129.5   1st Qu.: 5868    
##  Median : 2763   Median : 862.0   Median : 337.0   Median :13178    
##  Mean   : 3457   Mean   :1144.8   Mean   : 387.2   Mean   :14865    
##  3rd Qu.: 4762   3rd Qu.:1772.0   3rd Qu.: 571.8   3rd Qu.:22948    
##  Max.   :17867   Max.   :7033.0   Max.   :1494.0   Max.   :41029    
##                                                                     
##  single_parent_householdsE foreign_bornE    native_bornE   under_18_totalE
##  Min.   :   0.0            Min.   :    0   Min.   :    0   Min.   :    0  
##  1st Qu.:  50.0            1st Qu.: 3788   1st Qu.:10152   1st Qu.: 2805  
##  Median : 191.5            Median :10511   Median :22212   Median : 6286  
##  Mean   : 253.6            Mean   :13812   Mean   :24588   Mean   : 7873  
##  3rd Qu.: 368.2            3rd Qu.:19627   3rd Qu.:38235   3rd Qu.:11352  
##  Max.   :1443.0            Max.   :65326   Max.   :71276   Max.   :37775  
## 
nyc<- nyc %>%
  filter(complete.cases(.))%>%
  mutate(GEOID = as.factor(GEOID))

glimpse(nyc)
## Rows: 213
## Columns: 16
## $ GEOID                     <fct> 10001, 10002, 10003, 10004, 10005, 10006, 10…
## $ NAME                      <chr> "ZCTA5 10001", "ZCTA5 10002", "ZCTA5 10003",…
## $ median_incomeE            <dbl> 123393, 46525, 153750, 220592, 211810, 20997…
## $ white_popE                <dbl> 15893, 23597, 36193, 2460, 6254, 2932, 5593,…
## $ black_popE                <dbl> 3070, 7074, 2367, 278, 405, 271, 704, 5002, …
## $ asian_popE                <dbl> 5228, 27700, 9639, 923, 1311, 1058, 1179, 75…
## $ hispanic_popE             <dbl> 5547, 18646, 5285, 189, 935, 125, 1016, 1424…
## $ bachelorsE                <dbl> 9424, 15771, 16346, 1283, 3703, 1499, 2936, …
## $ mastersE                  <dbl> 5018, 5620, 7964, 839, 1377, 741, 1362, 9037…
## $ age_under_18E             <dbl> 675, 1351, 895, 153, 56, 92, 295, 1101, 708,…
## $ age_18_64E                <dbl> 174, 1154, 484, 59, 151, 0, 114, 498, 287, 4…
## $ total_householdsE         <dbl> 15097, 35771, 25080, 1775, 5156, 2305, 3309,…
## $ single_parent_householdsE <dbl> 176, 227, 154, 11, 5, 0, 21, 643, 276, 132, …
## $ foreign_bornE             <dbl> 7796, 28899, 10633, 856, 2285, 1508, 1469, 1…
## $ native_bornE              <dbl> 21283, 46618, 43192, 3019, 6953, 2967, 6333,…
## $ under_18_totalE           <dbl> 3322, 10121, 3792, 776, 843, 301, 1550, 6049…

Normalize variables as calculation of total population to interpretable results

nyc$total_pop <- nyc$foreign_bornE + nyc$native_bornE

nyc$percent_white <- nyc$white_popE / nyc$total_pop
nyc$percent_black <- nyc$black_popE / nyc$total_pop
nyc$percent_asian <- nyc$asian_popE / nyc$total_pop
nyc$percent_hispanic <- nyc$hispanic_popE / nyc$total_pop

nyc$percent_bachelors <- nyc$bachelorsE / nyc$total_householdsE
nyc$percent_masters <- nyc$mastersE / nyc$total_householdsE

nyc$percent_under_18 <- nyc$under_18_totalE / nyc$total_pop
nyc$percent_age_18_64 <- nyc$age_18_64E / nyc$total_pop

nyc$percent_single_parent <- nyc$single_parent_householdsE / nyc$total_householdsE

nyc$percent_foreign_born <- nyc$foreign_bornE / nyc$total_pop

Many NYC residents average income is less than $122k, the distribution is right skewed as there are some higher earners. The education distribution has a bell shape, with slight skews. All races are right skewed except for white as it is mostly

nyc_data <- nyc %>%
  select(median_incomeE, total_householdsE ,single_parent_householdsE,under_18_totalE,   percent_white, percent_black, percent_asian, percent_hispanic, percent_bachelors, percent_masters, percent_under_18,
         percent_age_18_64, percent_single_parent, percent_foreign_born)


long_percent_data <-nyc_data %>%
  pivot_longer(cols = everything(), names_to = "variable", values_to = "value")


ggplot(long_percent_data, aes(x = value)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "black") +
  facet_wrap(~ variable, scales = "free") +
  theme_minimal() +
  labs(title = "Distribution of Percent-Based Variables",
       x = "Proportion", y = "Count")

Bachelors and Masters variables are highly correlated, probably because many residents holding a bachelors also hold a Masters degree. However, median income and degrees have a high positive correlation, meaning that it’s relationship may be linear.

cor_matrix <- cor(nyc_data, use = "complete.obs")


chunk_size <- 10
var_names <- colnames(cor_matrix)
chunks <- split(var_names, ceiling(seq_along(var_names)/chunk_size))

# Plot each chunk
for (chunk in chunks) {
  chunk_cor <- cor_matrix[chunk, chunk]
  print(
    ggcorrplot(chunk_cor,
               method = "circle",
               type = "lower",
               lab = TRUE,
               lab_size = 2.5,
               title = paste("Correlation Plot -", paste(chunk, collapse = ", ")),
               tl.cex = 8,
               tl.srt = 45,
               colors = c("red", "white", "blue"),
               outline.color = "gray",
               hc.order = TRUE)
  )
}

Modeling

Splitting the data

set.seed(123)
sample <- sample(nrow(nyc_data), round(nrow(nyc_data)*.8),
                 replace = FALSE)

nyc_train <- nyc_data[sample,]
nyc_test <- nyc_data[-sample,]

Model 1 Decision Tree - Anova method for regression

Areas with bachelors degrees above 32% will probably have higher median incomes than below. The left split predicts that areas with a black population higher than 23% have the lowest median income.In addition on the left side, areas with white residents greater than 66% and a masters greater than 32% have a meadian income of $119K.

set.seed(123)
tree_model <- rpart(median_incomeE ~ ., data = nyc_train, method = "anova")
plot(tree_model)
text(tree_model, use.n = TRUE, cex = 0.4)

predictions <- predict(tree_model, newdata = nyc_test)

actuals <- nyc_test$median_incomeE

rmse <- sqrt(mean((predictions - actuals)^2))
mae <- mean(abs(predictions - actuals))
r2 <- 1 - sum((predictions - actuals)^2) / sum((actuals - mean(actuals))^2)

cat("RMSE:", round(rmse, 2), "\n")
## RMSE: 22999.37
cat("MAE:", round(mae, 2), "\n")
## MAE: 17161.66
cat("R-squared:", round(r2, 3), "\n")
## R-squared: 0.608

Education background stands as the most important factor for predicting income. Surprisingly, having dependents are also an important predictor of income. On the other hand, being a single parent or age are not important variables to predict a nyc resident’s income

barplot(tree_model$variable.importance, 
        main = "Variable Importance", 
        las = 2, col = "steelblue")

Model 2 Gradient Boosting model for a better predictive performance.

This model indicates that bachelors, masters degree, racial backround of white, and hispanic have the highest incfluence for predicting median income.

set.seed(123)
gbm_model <- gbm(
  formula = median_incomeE ~ percent_masters + percent_bachelors + under_18_totalE + 
            percent_white + percent_black + percent_asian + percent_hispanic,
  data = nyc_train,
  distribution = "gaussian",  
  n.trees = 500,
  interaction.depth = 3,
  shrinkage = 0.01,
  cv.folds = 5,
  n.cores = NULL, 
  verbose = FALSE
)

best_iter <- gbm.perf(gbm_model, method = "cv")

summary(gbm_model)

##                                 var   rel.inf
## percent_bachelors percent_bachelors 38.669599
## percent_masters     percent_masters 24.399806
## percent_white         percent_white 12.571100
## percent_hispanic   percent_hispanic 11.142181
## under_18_totalE     under_18_totalE  9.519230
## percent_black         percent_black  1.872323
## percent_asian         percent_asian  1.825761

R squared improved by 20%, meaning that this model explains 84% of the factors related to predicting income.

predictions2 <- predict(gbm_model, newdata = nyc_test)
## Using 499 trees...
rmse <- sqrt(mean((predictions2 - actuals)^2))
mae <- mean(abs(predictions2 - actuals))
r2 <- 1 - sum((predictions2 - actuals)^2) / sum((actuals - mean(actuals))^2)

cat("RMSE:", round(rmse, 2), "\n")
## RMSE: 14626.27
cat("MAE:", round(mae, 2), "\n")
## MAE: 11260.76
cat("R-squared:", round(r2, 3), "\n")
## R-squared: 0.842

Model 3 Neural Network - capture non-linear and complex relationships

Worst performer possibly due to linear relationships between the variables.

set.seed(123)
nn_model <- nnet(median_incomeE ~ percent_masters + percent_bachelors + under_18_totalE + 
            percent_white + percent_black + percent_asian + percent_hispanic,
  data = nyc_train, size = 5, linout = TRUE)
## # weights:  46
## initial  value 2192887354300.679443 
## final  value 388163041288.594055 
## converged
predictions <- predict(nn_model, newdata = nyc_test)

rmse <- sqrt(mean((predictions - actuals)^2))
mae <- mean(abs(predictions - actuals))
r2 <- 1 - sum((predictions - actuals)^2) / sum((actuals - mean(actuals))^2)

cat("RMSE:", round(rmse, 2), "\n")
## RMSE: 37656.09
cat("MAE:", round(mae, 2), "\n")
## MAE: 31706.21
cat("R-squared:", round(r2, 3), "\n")
## R-squared: -0.05