Data cleaning

There are 10 HUGE tables in this dataset so it took a bit to clean. I opted to use the excel spreadsheet source as it has the more usable data for me. But because of this I had to manually clean up the column names to be relational data friendly since it was written to be human read. I also did a bit of tweaking to the data so I can try to combine the state and cz_name aka city so I can try to join with to map the data.

I then filtered by state or cz_name aka city as applicable to the plots.

library(tidyverse)
library(dplyr)
library(janitor) # not a part of the tidyverse, so need to be installed separately
library(haven)
library(readxl)
library(usmap)
library(ggrepel)
library(patchwork)
library(scales)
# Load original data

# https://1drv.ms/x/c/9985fb69ac3e352c/IQAnHPTxxZFJR43FdGJPLU2xAfhuxw-lv5ZaNPiQRGUmNRs?e=IUOSOY
#dataTable_raw <- read_excel("LoP/online_data_tables.xls", sheet="Online Data Table 5", skip=49) %>% clean_names() %>% arrange(am_college_attendance)
data_organized <- read_csv("LoP/IntergenerationalMobilityEstimatesNEW.csv") %>% clean_names() %>% arrange(state)

cleaned_Data <- data_organized %>% drop_na()
cleaned_Data

## # A tibble: 517 × 35
##       cz cz_name    state childrenin1980_82cohorts am80_82cohort rm80_82cohort
##    <dbl> <chr>      <chr>                    <dbl>         <dbl>         <dbl>
##  1 34102 Anchorage  AK                       15326          45.5         0.268
##  2 34115 Fairbanks  AK                        4243          45.6         0.26 
##  3  6000 Huntsville AL                       18969          38.5         0.363
##  4  6100 Gadsden    AL                       11274          41.2         0.368
##  5  6200 Florence   AL                        8496          41.5         0.361
##  6  9500 Talladega  AL                        5584          37.9         0.422
##  7  9800 Auburn     AL                        4611          35.8         0.39 
##  8 10302 Dothan     AL                        8993          39           0.407
##  9 10600 Jasper     AL                        6330          41.9         0.379
## 10 10700 Birmingham AL                       38677          37.6         0.392
## # ℹ 507 more rows
## # ℹ 29 more variables: p_childin_q5_parentin_q1_80_85cohort <dbl>,
## #   am_college_attendance <dbl>, rm_college_attendance <dbl>,
## #   am_teenagebirth <dbl>, rm_teenagebirth <dbl>, am_females <dbl>,
## #   rm_females <dbl>, am_males <dbl>, rm_males <dbl>,
## #   am_males_individual_income <dbl>, rm_males_individual_income <dbl>,
## #   am_costof_living_adjusted <dbl>, rm_costof_living_adjusted <dbl>, …

sumofstudents = sum(cleaned_Data$childrenin1980_82cohorts)
sumofstudents

## [1] 9339992

data_organized$location <- paste(data_organized$cz_name, data_organized$state, sep=", ")
#View(data_organized)

####################################################################
# map data for R to read
# https://r-graph-gallery.com/map.html

# https://catalog.data.gov/dataset/commuting-zones-and-labor-market-areas/resource/6ab410f1-5384-4b59-a244-955abe1a0d31

##info on how to use us map library
#https://cran.r-project.org/web/packages/usmap/vignettes/usmap2.html

##
#https://jtr13.github.io/cc19/different-ways-of-plotting-u-s-map-in-r.html

## parameters documentation
# https://www.rdocumentation.org/packages/usmap/versions/1.0.0/topics/plot_usmap
#####################################################################

##FIGURE 1
# plot all of college attendance data and highlight recognizable cities for comparison
collegeAttendance_compare <- data_organized %>% mutate(should_be_labeled = ifelse(cz_name %in% c("Seattle", "San Diego", "Dallas", "Orlando", "Mobile", "Detroit", "Boston", "Spokane"), TRUE, FALSE)) 

##Figure 2
#selected state college attendance
selected_states <- c("WA", "CA", "TX", "FL", "AL", "MI", "MA")
collegeAttendance_states <- filter(data_organized, state %in% selected_states)


##Figure 3
# parent income 
income_raw <- read_csv("LoP/incomeCZ.csv") %>% clean_names() %>% arrange(state)

income_state <-filter(income_raw, state %in% selected_states)

income_city <- filter(income_raw, cz_name %in% c("Seattle", "San Diego", "Dallas", "Orlando", "Mobile", "Detroit", "Boston", "Spokane"))
income_WAcity <- filter(income_raw, cz_name %in% c("Seattle", "Spokane"))

##Figure 4
incomeMobility_raw <- filter(data_organized, state %in% selected_states)

Figure 1

AM, College Attendance Predicted college attendance rate for child with parent at 25th percentile of national family income distribution based on OLS regression of indicator for college attendance between ages 18-21 on parent income rank in core sample
RM, College Attendance Slope of OLS regression of indicator for college attendance between ages 18-21 on parent income rank in core sample

I picked point for first figure to show full college attendance dataset and comparison between the different CZ and the different mobility scales. And be able to highlight recognizable cities as comparison.

Added linear regression line to have a common point of how well the cities are doing over the median.

library(ggplot2)
library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

ca_city_mobility_plot <-ggplot(collegeAttendance_compare, aes(x=am_college_attendance, y=rm_college_attendance)) + 
  geom_point(aes(color="blue")) + 
  geom_label_repel(data = filter(collegeAttendance_compare, should_be_labeled == TRUE), aes(label=cz_name, fill=should_be_labeled), color="white") + scale_color_manual(values = c("grey50", "white")) + scale_fill_manual(values = c("blue")) + 
  guides(color = "none", fill="none") + 
  labs(title = "US College Attendance based on Parent Income Rank by Commuting Zone", 
       x = "Actual Mobility", y = "Relative Mobility")

coef(lm(rm_college_attendance~am_college_attendance,data=collegeAttendance_compare))

##           (Intercept) am_college_attendance 
##           1.015013113          -0.008006391

#ca_city_mobility_interactive <- 
  ca_city_mobility_plot + geom_abline(intercept=1.015013113,slope=-0.008006391, color="red") + annotate(geom="label",
          x = 15, y = .9,
          label = "Linear regression") +
annotate(geom="label", 
         x = 65, y = .85,
         label = "AM right is better") +
annotate(geom="label", 
         x = 20, y = .35,
         label = "RM higher is better")

#ggploty(ca_city_mobility_interactive)
ggsave("plots/ca_city_mobility_plot.png")

## Saving 7 x 5 in image

Figure 2

Basically using same data as Figure 1 but statewide. I picked histogram mostly as a condensed way to show within states from Fig 1 how wide the mobility scales can be. How does the economic background of a college student from one location compare to another state? It’s interesting to see the states with higher popular has such wide range of mobility. It’s interesting to see Alabama doesn’t break 40 percentile and Michigan is in the middle.

I added the yellow box to show how 68% of the student’s mobility is at.

library(ggridges)
cleaned_am = collegeAttendance_states %>% drop_na()
cleaned_am_mobility = cleaned_am$am_college_attendance
quantile(cleaned_am_mobility)

##     0%    25%    50%    75%   100% 
## 23.100 35.875 39.150 43.150 60.000

sumAM = sum(cleaned_am$childrenin1980_82cohorts)
majorityStudent= sumAM*.68
majorityStudent

## [1] 2054307

sumAM

## [1] 3021039

ca_state_mobility_plot <- ggplot(cleaned_am, aes(x=am_college_attendance, fill=state)) + 
  geom_histogram(binwidth = 1, color="blue", boundary=1) + scale_x_continuous() +
  labs(title = "US College Attendance Mobility by Selected State (3,021,039 total students)", 
       x = "Actual Mobility", y = "Relative Mobility", fill = "State") + 
  annotate(geom = "rect",
           xmin = 35.875, xmax = 43.150,
           ymin = 0, ymax = 12.5,
           fill = "yellow", alpha = 0.3) +
  annotate(geom = "label",
           x= 39, y = 13,
           label="68% or 2,054,307 of students here")
ggsave("plots/ca_state_mobility_plot.png")

## Saving 7 x 5 in image

ca_state_mobility_plot

Figure 3

I picked density ridges here to help compare the distribution of income between parent and child of between 1 generation and emphasizing the upper and lower income median brackets. AKA how many people are in each state and each parental income bracket.

I added lines for the quartiles to see the difference of scale and the spread of the data.

cleaned_income = income_state %>% drop_na()
cleaned_parent_income = cleaned_income$median_parent_income
quantile(cleaned_parent_income)

##    0%   25%   50%   75%  100% 
## 20300 41000 49050 58400 82100

cleaned_child_income = cleaned_income$median_child_income
quantile(cleaned_child_income)

##    0%   25%   50%   75%  100% 
## 21800 30225 33500 36900 53300

parent_income_state <- ggplot(income_state, aes(x=median_parent_income, y=state, fill=..x..)) +   geom_density_ridges_gradient() + scale_fill_viridis_c(option="cividis") +
  labs(title="Parent of College Child Income by Selected State", x="Median Parent Income", y="Selected States", fill = "Income") + scale_x_continuous(labels = label_comma(), breaks = seq(0, 100000, 10000)) + scale_fill_continuous(labels = label_comma()) +
  annotate(geom = "segment",
           x = 82100, xend = 82100,
           y = 0, yend = 9,
           color="red") + 
  annotate(geom = "segment",
           x = 58400, xend = 58400,
           y = 0, yend = 9,
           color="red") + 
  annotate(geom = "segment",
           x = 49050, xend = 49050,
           y = 0, yend = 9,
           color="blue") + 
  annotate(geom = "segment",
           x = 41000, xend = 41000,
           y = 0, yend = 9,
           color="red") + 
  annotate(geom = "segment",
           x = 20300, xend = 20300,
           y = 0, yend = 9,
           color="red")

## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.

child_income_state <- ggplot(income_state, aes(x=median_child_income, y=state, fill=..x..)) + geom_density_ridges_gradient() + scale_fill_viridis_c(option="cividis") +
  labs(title="College Child Income by Selected State", x="Median Parent Income", y="Selected States", fill = "Income") + scale_x_continuous(labels = scales::comma, breaks = seq(0, 100000, 10000)) + scale_fill_continuous(labels = label_comma()) +
  annotate(geom = "segment",
           x = 53300, xend = 53300,
           y = 0, yend = 9,
           color="red") + 
  annotate(geom = "segment",
           x = 36900, xend = 36900,
           y = 0, yend = 9,
           color="red") + 
  annotate(geom = "segment",
           x = 33500, xend = 33500,
           y = 0, yend = 9,
           color="blue") + 
  annotate(geom = "segment",
           x = 30225, xend = 30225,
           y = 0, yend = 9,
           color="red") + 
  annotate(geom = "segment",
           x = 21800, xend = 21800,
           y = 0, yend = 9,
           color="red")

## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.

combined_plot <- (parent_income_state + child_income_state) + plot_layout(ncol = 1, heights = c(1, 1)) + plot_annotation(title="Parent to Child Income") #prevent overlap
print(combined_plot)

## Picking joint bandwidth of 3760
## Picking joint bandwidth of 1560

ggsave("plots/combined_income_state_plot.png")

## Saving 7 x 5 in image
## Picking joint bandwidth of 3760
## 
## Picking joint bandwidth of 1560

Figure 4

After seeing how college attendance affects one’s economic mobility, I wanted to have a comparison to parent’s income affecting it to see the contrast. I picked these values to contrast to pure parental income impact mobility and see if there is any patterns. I broke this down by state to help show the range differences per state.

I added the lines as it’s the AM/RM medians so it’s easier to compare the scales between the states.

# "WA", "CA", "TX", "FL", "AL", "MI", "MA"
cleaned_incomeMobility = incomeMobility_raw %>% drop_na()
AMparentsIncome = cleaned_incomeMobility$am2011_2012parent_income
quantile(AMparentsIncome)

##     0%    25%    50%    75%   100% 
## 35.900 40.375 42.250 43.850 50.500

RMparentsIncome = cleaned_incomeMobility$rm2011_2012parent_income
quantile(RMparentsIncome)

##    0%   25%   50%   75%  100% 
## 0.191 0.258 0.285 0.324 0.386

parent_income_mobility <- ggplot(incomeMobility_raw, aes(x=am2011_2012parent_income, y=rm2011_2012parent_income, fill=state)) + geom_area() + labs(title="Parent Income Mobility by Selected State", x="Actual Mobility", y="Relative Mobility") + 
  theme(legend.position="none", panel.spacing = unit(0.1, "lines"), strip.text.x = element_text(size = 8)) + facet_wrap(~state, scale="free_y") +
  annotate(geom = "segment",
           x = 42.250, xend = 42.250,
           y = 0, yend = 0.8,
           color="black") +
  annotate(geom = "segment",
           x = 30, xend = 57,
           y = 0.285, yend = 0.285,
           color="black")
parent_income_mobility

ggsave("plots/parent_income_mobility_plot.png")

## Saving 7 x 5 in image

Figure 5

I picked simple bar chart as I wanted to do comparsion of the Parent Child Income Difference between the different income brackets. What is the intergenerational income difference between parents and child? Obviously it was best to compare the income percentile between the generation to help show the differences and the outcome is fairly surprising. I also narrowed this to Seattle and Spokane to make it more managable of seeing the differences between the “big city” vs “small city”.

The blue box is to make it easier to compare between the cities.

income_difference <- income_WAcity %>% 
mutate(incp99=parent_income_p99-child_income_p99) %>% mutate(incp90=parent_income_p90-child_income_p90) %>% mutate(incp75=parent_income_p75-child_income_p75) %>% mutate(incp25=parent_income_p25-child_income_p25) %>% mutate(incp10=parent_income_p10-child_income_p10) %>% mutate(medianIncome=median_parent_income-median_child_income) %>% mutate(meanIncome=mean_parent_income-mean_child_income)

grouped_income_difference <- income_difference %>% gather(Description, Income, incp99:meanIncome)

#grouped_income_difference2 <- income_difference %>% pivot_longer(c(incp99:meanIncome), names_to=Description, values_to=values)

Seattle_Spokane_income_plot <- ggplot(grouped_income_difference, aes(x=cz_name, y=Income, fill=Description)) +
  geom_bar(stat="identity", width=10, position="dodge") + scale_fill_manual(labels = c("Bottom 90%", "Bottom 75%", "Top 25%", "Top 10%", "Top 1%", "Mean Income", "Median Income"), values = c("tan4", "burlywood3", "darkgoldenrod", "deepskyblue3", "darkolivegreen", "darkorchid4", "pink3")) + scale_y_continuous(labels = comma) + facet_wrap(~cz_name) + labs(title="Parent Child Income Difference by City", fill="Income Bracket") +
theme(axis.title.x=element_blank(), axis.text.x = element_blank(),axis.ticks.x = element_blank()) + 
  annotate(geom = "rect",
           xmin = -4, xmax = 7,
           ymin = 0, ymax = 60000,
           fill = "blue", alpha = 0.3) +
  annotate(geom = "label",
           x= 0, y = 70000,
           label = "Bottom 99%")
Seattle_Spokane_income_plot

ggsave("plots/Seattle_Spokane_income_plot.png")

## Saving 7 x 5 in image

Final Project

Steven Ngu

02/15/2026

Executive summary

Data background