Goal - Predict income based on demographics for NYC residents. The data sets is from ACS using the tidycensus package, using the latest 5 year estimates. The goal is to understand which demographic factors can predict median household income for NYC residents. As a NYC resident, this data set seemed interesting because it can help us understand whether certain demographics plays a significant role in determining income for new yorkers
The variables we are using are:
library(tidycensus)
library(tidyverse)
library(dplyr)
library(stringr)
library(tidyr)
library(ggcorrplot)
library(rpart)
library(gbm)
library(nnet)
census_api_key("d0dd6f2e8d522579907cb8f5d50d832d4887bbe7", install = TRUE, overwrite = TRUE)
## Your original .Renviron will be backed up and stored in your R HOME directory if needed.
## Your API key has been stored in your .Renviron and can be accessed by Sys.getenv("CENSUS_API_KEY").
## To use now, restart R or run `readRenviron("~/.Renviron")`
## [1] "d0dd6f2e8d522579907cb8f5d50d832d4887bbe7"
readRenviron("~/.Renviron")
vars <- c(
# Income & Population
median_income = "B19013_001",
# Race
white_pop = "B02001_002",
black_pop = "B02001_003",
asian_pop = "B02001_005",
hispanic_pop = "B03003_003",
# Education
bachelors = "B15003_022",
masters = "B15003_023",
# Age groups
age_under_18 = "B01001_003",
age_18_64 = "B01001_020",
age_65_over = "B01001_020",
# Household info
total_households = "B11001_001",
single_parent_households = "B11003_010", # Female head, children under 18
# Immigration
foreign_born = "B05002_013",
native_born = "B05002_002",
# Dependents
under_18_total = "B09001_001"
)
data <- get_acs(
geography = "zcta",
variables = vars,
year = 2023,
survey = "acs5",
output = "wide"
)
## Getting data from the 2019-2023 5-year ACS
filter by nyc
nyc <- data %>%
filter(GEOID %in% nyc_zips)
glimpse(nyc)
## Rows: 240
## Columns: 30
## $ GEOID <chr> "10001", "10002", "10003", "10004", "10005",…
## $ NAME <chr> "ZCTA5 10001", "ZCTA5 10002", "ZCTA5 10003",…
## $ median_incomeE <dbl> 123393, 46525, 153750, 220592, 211810, 20997…
## $ median_incomeM <dbl> 22852, 5561, 11191, 53147, 29291, 43452, NA,…
## $ white_popE <dbl> 15893, 23597, 36193, 2460, 6254, 2932, 5593,…
## $ white_popM <dbl> 1561, 1946, 2666, 635, 977, 547, 998, 2435, …
## $ black_popE <dbl> 3070, 7074, 2367, 278, 405, 271, 704, 5002, …
## $ black_popM <dbl> 727, 1161, 313, 165, 280, 235, 324, 698, 133…
## $ asian_popE <dbl> 5228, 27700, 9639, 923, 1311, 1058, 1179, 75…
## $ asian_popM <dbl> 698, 1989, 982, 586, 461, 628, 511, 1209, 80…
## $ hispanic_popE <dbl> 5547, 18646, 5285, 189, 935, 125, 1016, 1424…
## $ hispanic_popM <dbl> 803, 2053, 720, 105, 488, 110, 600, 1456, 61…
## $ bachelorsE <dbl> 9424, 15771, 16346, 1283, 3703, 1499, 2936, …
## $ bachelorsM <dbl> 1009, 1236, 2279, 392, 658, 382, 751, 1818, …
## $ mastersE <dbl> 5018, 5620, 7964, 839, 1377, 741, 1362, 9037…
## $ mastersM <dbl> 720, 812, 803, 307, 437, 229, 442, 1673, 807…
## $ age_under_18E <dbl> 675, 1351, 895, 153, 56, 92, 295, 1101, 708,…
## $ age_under_18M <dbl> 324, 411, 264, 111, 91, 73, 176, 406, 293, 2…
## $ age_18_64E <dbl> 174, 1154, 484, 59, 151, 0, 114, 498, 287, 4…
## $ age_18_64M <dbl> 85, 662, 188, 91, 128, 13, 80, 265, 210, 177…
## $ total_householdsE <dbl> 15097, 35771, 25080, 1775, 5156, 2305, 3309,…
## $ total_householdsM <dbl> 796, 1265, 1152, 384, 588, 270, 472, 1230, 1…
## $ single_parent_householdsE <dbl> 176, 227, 154, 11, 5, 0, 21, 643, 276, 132, …
## $ single_parent_householdsM <dbl> 193, 117, 134, 11, 7, 13, 27, 401, 62, 93, 8…
## $ foreign_bornE <dbl> 7796, 28899, 10633, 856, 2285, 1508, 1469, 1…
## $ foreign_bornM <dbl> 995, 1988, 1128, 329, 604, 646, 459, 1645, 9…
## $ native_bornE <dbl> 21283, 46618, 43192, 3019, 6953, 2967, 6333,…
## $ native_bornM <dbl> 1571, 2525, 2385, 785, 1038, 515, 999, 2903,…
## $ under_18_totalE <dbl> 3322, 10121, 3792, 776, 843, 301, 1550, 6049…
## $ under_18_totalM <dbl> 724, 1368, 668, 341, 279, 98, 399, 831, 657,…
nyc <- nyc[, !grepl("M$", names(nyc))]
summary(nyc)
## GEOID NAME median_incomeE white_popE
## Length:240 Length:240 Min. : 27500 Min. : 0
## Class :character Class :character 1st Qu.: 69786 1st Qu.: 4085
## Mode :character Mode :character Median : 91042 Median : 9838
## Mean :101374 Mean :14073
## 3rd Qu.:122390 3rd Qu.:19889
## Max. :250001 Max. :61357
## NA's :27
## black_popE asian_popE hispanic_popE bachelorsE
## Min. : 0 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 544 1st Qu.: 764 1st Qu.: 2041 1st Qu.: 2579
## Median : 2367 Median : 2594 Median : 6702 Median : 5166
## Mean : 8566 Mean : 5482 Mean :10890 Mean : 6498
## 3rd Qu.:10534 3rd Qu.: 7211 3rd Qu.:12665 3rd Qu.: 9780
## Max. :76552 Max. :59546 Max. :79956 Max. :20536
##
## mastersE age_under_18E age_18_64E total_householdsE
## Min. : 0 Min. : 0.0 Min. : 0.0 Min. : 0
## 1st Qu.: 1362 1st Qu.: 331.8 1st Qu.: 129.5 1st Qu.: 5868
## Median : 2763 Median : 862.0 Median : 337.0 Median :13178
## Mean : 3457 Mean :1144.8 Mean : 387.2 Mean :14865
## 3rd Qu.: 4762 3rd Qu.:1772.0 3rd Qu.: 571.8 3rd Qu.:22948
## Max. :17867 Max. :7033.0 Max. :1494.0 Max. :41029
##
## single_parent_householdsE foreign_bornE native_bornE under_18_totalE
## Min. : 0.0 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 50.0 1st Qu.: 3788 1st Qu.:10152 1st Qu.: 2805
## Median : 191.5 Median :10511 Median :22212 Median : 6286
## Mean : 253.6 Mean :13812 Mean :24588 Mean : 7873
## 3rd Qu.: 368.2 3rd Qu.:19627 3rd Qu.:38235 3rd Qu.:11352
## Max. :1443.0 Max. :65326 Max. :71276 Max. :37775
##
nyc<- nyc %>%
filter(complete.cases(.))%>%
mutate(GEOID = as.factor(GEOID))
glimpse(nyc)
## Rows: 213
## Columns: 16
## $ GEOID <fct> 10001, 10002, 10003, 10004, 10005, 10006, 10…
## $ NAME <chr> "ZCTA5 10001", "ZCTA5 10002", "ZCTA5 10003",…
## $ median_incomeE <dbl> 123393, 46525, 153750, 220592, 211810, 20997…
## $ white_popE <dbl> 15893, 23597, 36193, 2460, 6254, 2932, 5593,…
## $ black_popE <dbl> 3070, 7074, 2367, 278, 405, 271, 704, 5002, …
## $ asian_popE <dbl> 5228, 27700, 9639, 923, 1311, 1058, 1179, 75…
## $ hispanic_popE <dbl> 5547, 18646, 5285, 189, 935, 125, 1016, 1424…
## $ bachelorsE <dbl> 9424, 15771, 16346, 1283, 3703, 1499, 2936, …
## $ mastersE <dbl> 5018, 5620, 7964, 839, 1377, 741, 1362, 9037…
## $ age_under_18E <dbl> 675, 1351, 895, 153, 56, 92, 295, 1101, 708,…
## $ age_18_64E <dbl> 174, 1154, 484, 59, 151, 0, 114, 498, 287, 4…
## $ total_householdsE <dbl> 15097, 35771, 25080, 1775, 5156, 2305, 3309,…
## $ single_parent_householdsE <dbl> 176, 227, 154, 11, 5, 0, 21, 643, 276, 132, …
## $ foreign_bornE <dbl> 7796, 28899, 10633, 856, 2285, 1508, 1469, 1…
## $ native_bornE <dbl> 21283, 46618, 43192, 3019, 6953, 2967, 6333,…
## $ under_18_totalE <dbl> 3322, 10121, 3792, 776, 843, 301, 1550, 6049…
Normalize variables as calculation of total population to interpretable results
nyc$total_pop <- nyc$foreign_bornE + nyc$native_bornE
nyc$percent_white <- nyc$white_popE / nyc$total_pop
nyc$percent_black <- nyc$black_popE / nyc$total_pop
nyc$percent_asian <- nyc$asian_popE / nyc$total_pop
nyc$percent_hispanic <- nyc$hispanic_popE / nyc$total_pop
nyc$percent_bachelors <- nyc$bachelorsE / nyc$total_householdsE
nyc$percent_masters <- nyc$mastersE / nyc$total_householdsE
nyc$percent_under_18 <- nyc$under_18_totalE / nyc$total_pop
nyc$percent_age_18_64 <- nyc$age_18_64E / nyc$total_pop
nyc$percent_single_parent <- nyc$single_parent_householdsE / nyc$total_householdsE
nyc$percent_foreign_born <- nyc$foreign_bornE / nyc$total_pop
Many NYC residents average income is less than $122k, the distribution is right skewed as there are some higher earners. The education distribution has a bell shape, with slight skews. All races are right skewed except for white as it is mostly
nyc_data <- nyc %>%
select(median_incomeE, total_householdsE ,single_parent_householdsE,under_18_totalE, percent_white, percent_black, percent_asian, percent_hispanic, percent_bachelors, percent_masters, percent_under_18,
percent_age_18_64, percent_single_parent, percent_foreign_born)
long_percent_data <-nyc_data %>%
pivot_longer(cols = everything(), names_to = "variable", values_to = "value")
ggplot(long_percent_data, aes(x = value)) +
geom_histogram(bins = 30, fill = "steelblue", color = "black") +
facet_wrap(~ variable, scales = "free") +
theme_minimal() +
labs(title = "Distribution of Percent-Based Variables",
x = "Proportion", y = "Count")
Bachelors and Masters variables are highly correlated, probably because many residents holding a bachelors also hold a Masters degree. However, median income and degrees have a high positive correlation, meaning that it’s relationship may be linear.
cor_matrix <- cor(nyc_data, use = "complete.obs")
chunk_size <- 10
var_names <- colnames(cor_matrix)
chunks <- split(var_names, ceiling(seq_along(var_names)/chunk_size))
# Plot each chunk
for (chunk in chunks) {
chunk_cor <- cor_matrix[chunk, chunk]
print(
ggcorrplot(chunk_cor,
method = "circle",
type = "lower",
lab = TRUE,
lab_size = 2.5,
title = paste("Correlation Plot -", paste(chunk, collapse = ", ")),
tl.cex = 8,
tl.srt = 45,
colors = c("red", "white", "blue"),
outline.color = "gray",
hc.order = TRUE)
)
}
Splitting the data
set.seed(123)
sample <- sample(nrow(nyc_data), round(nrow(nyc_data)*.8),
replace = FALSE)
nyc_train <- nyc_data[sample,]
nyc_test <- nyc_data[-sample,]
Areas with bachelors degrees above 32% will probably have higher median incomes than below. The left split predicts that areas with a black population higher than 23% have the lowest median income.In addition on the left side, areas with white residents greater than 66% and a masters greater than 32% have a meadian income of $119K.
set.seed(123)
tree_model <- rpart(median_incomeE ~ ., data = nyc_train, method = "anova")
plot(tree_model)
text(tree_model, use.n = TRUE, cex = 0.4)
predictions <- predict(tree_model, newdata = nyc_test)
actuals <- nyc_test$median_incomeE
rmse <- sqrt(mean((predictions - actuals)^2))
mae <- mean(abs(predictions - actuals))
r2 <- 1 - sum((predictions - actuals)^2) / sum((actuals - mean(actuals))^2)
cat("RMSE:", round(rmse, 2), "\n")
## RMSE: 22999.37
cat("MAE:", round(mae, 2), "\n")
## MAE: 17161.66
cat("R-squared:", round(r2, 3), "\n")
## R-squared: 0.608
Education background stands as the most important factor for predicting income. Surprisingly, having dependents are also an important predictor of income. On the other hand, being a single parent or age are not important variables to predict a nyc resident’s income
barplot(tree_model$variable.importance,
main = "Variable Importance",
las = 2, col = "steelblue")
This model indicates that bachelors, masters degree, racial backround of white, and hispanic have the highest incfluence for predicting median income.
set.seed(123)
gbm_model <- gbm(
formula = median_incomeE ~ percent_masters + percent_bachelors + under_18_totalE +
percent_white + percent_black + percent_asian + percent_hispanic,
data = nyc_train,
distribution = "gaussian",
n.trees = 500,
interaction.depth = 3,
shrinkage = 0.01,
cv.folds = 5,
n.cores = NULL,
verbose = FALSE
)
best_iter <- gbm.perf(gbm_model, method = "cv")
summary(gbm_model)
## var rel.inf
## percent_bachelors percent_bachelors 38.669599
## percent_masters percent_masters 24.399806
## percent_white percent_white 12.571100
## percent_hispanic percent_hispanic 11.142181
## under_18_totalE under_18_totalE 9.519230
## percent_black percent_black 1.872323
## percent_asian percent_asian 1.825761
R squared improved by 20%, meaning that this model explains 84% of the factors related to predicting income.
predictions2 <- predict(gbm_model, newdata = nyc_test)
## Using 499 trees...
rmse <- sqrt(mean((predictions2 - actuals)^2))
mae <- mean(abs(predictions2 - actuals))
r2 <- 1 - sum((predictions2 - actuals)^2) / sum((actuals - mean(actuals))^2)
cat("RMSE:", round(rmse, 2), "\n")
## RMSE: 14626.27
cat("MAE:", round(mae, 2), "\n")
## MAE: 11260.76
cat("R-squared:", round(r2, 3), "\n")
## R-squared: 0.842
Worst performer possibly due to linear relationships between the variables.
set.seed(123)
nn_model <- nnet(median_incomeE ~ percent_masters + percent_bachelors + under_18_totalE +
percent_white + percent_black + percent_asian + percent_hispanic,
data = nyc_train, size = 5, linout = TRUE)
## # weights: 46
## initial value 2192887354300.679443
## final value 388163041288.594055
## converged
predictions <- predict(nn_model, newdata = nyc_test)
rmse <- sqrt(mean((predictions - actuals)^2))
mae <- mean(abs(predictions - actuals))
r2 <- 1 - sum((predictions - actuals)^2) / sum((actuals - mean(actuals))^2)
cat("RMSE:", round(rmse, 2), "\n")
## RMSE: 37656.09
cat("MAE:", round(mae, 2), "\n")
## MAE: 31706.21
cat("R-squared:", round(r2, 3), "\n")
## R-squared: -0.05