Ed Attainment by Race/Eth

Author

CRG

Dallas County Median Income by Educational Attainment and Race_Ethnicity, ACS 2022 Microdata from IPUMS

library(quarto)

Warning: package 'quarto' was built under R version 4.2.3

# Hide warning messages
options(warn=-1)

# Hide library messages
quiet <- suppressPackageStartupMessages(library(quarto))

library(ipumsr)
library(dplyr)


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

usa_00061<- read_ipums_ddi("usa_00061.xml")
acs_00061 <- read_ipums_micro(usa_00061, data_file = ("usa_00061.dat.gz"), verbose = FALSE)

names(acs_00061) <- tolower(names(acs_00061))

head(acs_00061)

# A tibble: 6 × 22
   year multyear sample         serial cbserial  hhwt cluster statefip countyfip
  <int>    <dbl> <int+lbl>       <dbl>    <dbl> <dbl>   <dbl> <int+lb> <dbl+lbl>
1  2022     2018 202203 [2018-… 5.82e6  2.02e12     6 2.02e12 48 [Tex…   0 [Cou…
2  2022     2018 202203 [2018-… 5.82e6  2.02e12     4 2.02e12 48 [Tex… 355      
3  2022     2018 202203 [2018-… 5.82e6  2.02e12    12 2.02e12 48 [Tex… 245      
4  2022     2018 202203 [2018-… 5.82e6  2.02e12    54 2.02e12 48 [Tex… 485      
5  2022     2018 202203 [2018-… 5.82e6  2.02e12     3 2.02e12 48 [Tex…   0 [Cou…
6  2022     2018 202203 [2018-… 5.82e6  2.02e12    11 2.02e12 48 [Tex…   0 [Cou…
# ℹ 13 more variables: strata <dbl>, gq <int+lbl>, pernum <dbl>, perwt <dbl>,
#   sex <int+lbl>, age <int+lbl>, race <int+lbl>, raced <int+lbl>,
#   hispan <int+lbl>, hispand <int+lbl>, educ <int+lbl>, educd <int+lbl>,
#   inctot <dbl+lbl>

str(acs_00061)

tibble [795,260 × 22] (S3: tbl_df/tbl/data.frame)
 $ year     : int [1:795260] 2022 2022 2022 2022 2022 2022 2022 2022 2022 2022 ...
  ..- attr(*, "label")= chr "Census year"
  ..- attr(*, "var_desc")= chr "YEAR reports the four-digit year when the household was enumerated or included in the census, the ACS, and the "| __truncated__
 $ multyear : num [1:795260] 2018 2018 2018 2018 2018 ...
  ..- attr(*, "label")= chr "Actual year of survey, multi-year ACS/PRCS"
  ..- attr(*, "var_desc")= chr "MULTYEAR identifies the actual year of survey in multi-year ACS/PRCS samples.  \n\nFor example, the 3-year ACS "| __truncated__
 $ sample   : int+lbl [1:795260] 202203, 202203, 202203, 202203, 202203, 202203, 202...
   ..@ labels  : Named int [1:146] 185001 185002 186001 186002 186003 187001 187002 187003 188001 188002 ...
   .. ..- attr(*, "names")= chr [1:146] "1850 1%" "1850 100% database" "1860 1%" "1860 1% sample with black oversample" ...
   ..@ label   : chr "IPUMS sample identifier"
   ..@ var_desc: chr "SAMPLE identifies the IPUMS sample from which the case is drawn. Each sample receives a unique 6-digit code. Th"| __truncated__
 $ serial   : num [1:795260] 5821940 5821941 5821942 5821943 5821944 ...
  ..- attr(*, "label")= chr "Household serial number"
  ..- attr(*, "var_desc")= chr "SERIAL is an identifying number unique to each household record in a given sample. All person records are assig"| __truncated__
 $ cbserial : num [1:795260] 2.02e+12 2.02e+12 2.02e+12 2.02e+12 2.02e+12 ...
  ..- attr(*, "label")= chr "Original Census Bureau household serial number"
  ..- attr(*, "var_desc")= chr "CBSERIAL is the unique, original identification number assigned to each household record in a given sample by t"| __truncated__
 $ hhwt     : num [1:795260] 6 4 12 54 3 11 15 12 5 5 ...
  ..- attr(*, "label")= chr "Household weight"
  ..- attr(*, "var_desc")= chr "HHWT indicates how many households in the U.S. population are represented by a given household in an IPUMS samp"| __truncated__
 $ cluster  : num [1:795260] 2.02e+12 2.02e+12 2.02e+12 2.02e+12 2.02e+12 ...
  ..- attr(*, "label")= chr "Household cluster for variance estimation"
  ..- attr(*, "var_desc")= chr "CLUSTER is designed for use with STRATA in Taylor series linear approximation for correction of complex sample "| __truncated__
 $ statefip : int+lbl [1:795260] 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,...
   ..@ labels  : Named int [1:62] 1 2 4 5 6 8 9 10 11 12 ...
   .. ..- attr(*, "names")= chr [1:62] "Alabama" "Alaska" "Arizona" "Arkansas" ...
   ..@ label   : chr "State (FIPS code)"
   ..@ var_desc: chr "STATEFIP reports the state in which the household was located, using the Federal Information Processing Standar"| __truncated__
 $ countyfip: dbl+lbl [1:795260]   0, 355, 245, 485,   0,   0,   0,   0, 245, 439, 3...
   ..@ labels  : Named num 0
   .. ..- attr(*, "names")= chr "County not identifiable from public-use data (1950-onward)"
   ..@ label   : chr "County (FIPS code, identifiable counties only)"
   ..@ var_desc: chr "IPUMS USA cannot identify most counties in recent samples.\n\nCOUNTYFIP identifies the county where the househo"| __truncated__
 $ strata   : num [1:795260] 500048 660148 430248 70048 90048 ...
  ..- attr(*, "label")= chr "Household strata for variance estimation"
  ..- attr(*, "var_desc")= chr "STRATA is designed for use with CLUSTER in Taylor series linear approximation for correction of complex sample "| __truncated__
 $ gq       : int+lbl [1:795260] 4, 4, 3, 4, 4, 3, 3, 3, 3, 4, 4, 3, 3, 4, 3, 3, 3, ...
   ..@ labels  : Named int [1:7] 0 1 2 3 4 5 6
   .. ..- attr(*, "names")= chr [1:7] "Vacant unit" "Households under 1970 definition" "Additional households under 1990 definition" "Group quarters--Institutions" ...
   ..@ label   : chr "Group quarters status"
   ..@ var_desc: chr "GQ classifies all housing units as falling into one of three main categories: households, group quarters, or va"| __truncated__
 $ pernum   : num [1:795260] 1 1 1 1 1 1 1 1 1 1 ...
  ..- attr(*, "label")= chr "Person number in sample unit"
  ..- attr(*, "var_desc")= chr "PERNUM numbers all persons within each household consecutively in the order in which they appear on the origina"| __truncated__
 $ perwt    : num [1:795260] 6 4 12 54 3 11 15 12 5 5 ...
  ..- attr(*, "label")= chr "Person weight"
  ..- attr(*, "var_desc")= chr "PERWT indicates how many persons in the U.S. population are represented by a given person in an IPUMS sample. \"| __truncated__
 $ sex      : int+lbl [1:795260] 2, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, ...
   ..@ labels  : Named int [1:3] 1 2 9
   .. ..- attr(*, "names")= chr [1:3] "Male" "Female" "Missing/blank"
   ..@ label   : chr "Sex"
   ..@ var_desc: chr "SEX reports whether the person was male or female."
 $ age      : int+lbl [1:795260] 21, 19, 24, 20, 31, 27, 39, 30, 48, 52, 18, 39, 19,...
   ..@ labels  : Named int [1:6] 0 90 100 112 115 999
   .. ..- attr(*, "names")= chr [1:6] "Less than 1 year old" "90 (90+ in 1980 and 1990)" "100 (100+ in 1960-1970)" "112 (112+ in the 1980 internal data)" ...
   ..@ label   : chr "Age"
   ..@ var_desc: chr "AGE reports the person's age in years as of the last birthday.\n\nPlease see the Comparability section regardin"| __truncated__
 $ race     : int+lbl [1:795260] 2, 1, 2, 1, 1, 7, 1, 1, 1, 2, 6, 1, 1, 1, 1, 1, 1, ...
   ..@ labels  : Named int [1:9] 1 2 3 4 5 6 7 8 9
   .. ..- attr(*, "names")= chr [1:9] "White" "Black/African American" "American Indian or Alaska Native" "Chinese" ...
   ..@ label   : chr "Race [general version]"
   ..@ var_desc: chr "The concept of race has changed over the more than 150 years represented in IPUMS. Currently, the Census Bureau"| __truncated__
 $ raced    : int+lbl [1:795260] 200, 100, 200, 100, 100, 700, 100, 100, 100, 200, 6...
   ..@ labels  : Named int [1:239] 100 110 120 130 140 150 200 210 300 302 ...
   .. ..- attr(*, "names")= chr [1:239] "White" "Spanish write_in" "Blank (white) (1850)" "Portuguese" ...
   ..@ label   : chr "Race [detailed version]"
   ..@ var_desc: chr "The concept of race has changed over the more than 150 years represented in IPUMS. Currently, the Census Bureau"| __truncated__
 $ hispan   : int+lbl [1:795260] 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, ...
   ..@ labels  : Named int [1:6] 0 1 2 3 4 9
   .. ..- attr(*, "names")= chr [1:6] "Not Hispanic" "Mexican" "Puerto Rican" "Cuban" ...
   ..@ label   : chr "Hispanic origin [general version]"
   ..@ var_desc: chr "HISPAN identifies persons of Hispanic/Spanish/Latino origin and classifies them according to their country of o"| __truncated__
 $ hispand  : int+lbl [1:795260]   0, 100,   0,   0,   0, 100,   0, 100,   0,   0,  ...
   ..@ labels  : Named int [1:55] 0 100 102 103 104 105 106 107 200 300 ...
   .. ..- attr(*, "names")= chr [1:55] "Not Hispanic" "Mexican" "Mexican American" "Mexicano/Mexicana" ...
   ..@ label   : chr "Hispanic origin [detailed version]"
   ..@ var_desc: chr "HISPAN identifies persons of Hispanic/Spanish/Latino origin and classifies them according to their country of o"| __truncated__
 $ educ     : int+lbl [1:795260]  6,  6,  3,  6,  6,  6, 10,  4,  7,  6,  6,  6,  1,...
   ..@ labels  : Named int [1:13] 0 1 2 3 4 5 6 7 8 9 ...
   .. ..- attr(*, "names")= chr [1:13] "N/A or no schooling" "Nursery school to grade 4" "Grade 5, 6, 7, or 8" "Grade 9" ...
   ..@ label   : chr "Educational attainment [general version]"
   ..@ var_desc: chr "EDUC indicates respondents' educational attainment, as measured by the highest year of school or degree complet"| __truncated__
 $ educd    : int+lbl [1:795260]  63,  63,  30,  63,  64,  65, 101,  40,  71,  65,  ...
   ..@ labels  : Named int [1:44] 0 1 2 10 11 12 13 14 15 16 ...
   .. ..- attr(*, "names")= chr [1:44] "N/A or no schooling" "N/A" "No schooling completed" "Nursery school to grade 4" ...
   ..@ label   : chr "Educational attainment [detailed version]"
   ..@ var_desc: chr "EDUC indicates respondents' educational attainment, as measured by the highest year of school or degree complet"| __truncated__
 $ inctot   : dbl+lbl [1:795260]  3741,     0,     0, 40917, 11691,     0, 35072,   ...
   ..@ labels  : Named num [1:6] -1e+04 -1e+00 0e+00 1e+00 1e+07 ...
   .. ..- attr(*, "names")= chr [1:6] "$9,900 (1980)" "Net loss (1950)" "None" "1 or break even (2000, 2005-onward ACS and PRCS)" ...
   ..@ label   : chr "Total personal income"
   ..@ var_desc: chr "INCTOT reports each respondent's total pre-tax personal income or losses from all sources for the previous year"| __truncated__

tx_data <- acs_00061 %>%
  filter(statefip == 48)

Here I recode the educd variable and create a combined race_ethnicity variable

library(dplyr)

tx_data1 <- tx_data %>%
  mutate(
    educ_level = case_when(
      educd %in% c(0, 1) ~ NA_character_,  # N/A 
      educd == 2 ~ "None",                  # No school
      educd %in% c(10:17, 20:26, 30, 40, 50, 61) ~ "LT_HS",  # Nursery school to grade 4, Grade 5, 6, 7, or 8, Grade 9, Grade 10, Grade 11, 12th grade, no diploma
      educd %in% c(62:64) ~ "HSD_GED",      # High school graduate or GED, Regular high school diploma, GED or alternative credential
      educd %in% c(65, 70:71, 80, 90, 100) ~ "SomeColl",  # Some college, but less than 1 year, 1 year of college, 1 or more years of college credit, no degree, 2 years of college, 3 years of college, 4 years of college
      educd %in% c(81:83) ~ "AssocDegree",  # Associate's degree, type not specified, Associate's degree, occupational program, Associate's degree, academic program
      educd == 101 ~ "BachelorDegree",       # Bachelor's degree
      educd %in% c(110:116) ~ "GradDegree",  # 5+ years of college, 6 years of college (6+ in 1960-1970), 7 years of college, 8+ years of college, Master's degree, Professional degree beyond a bachelor's degree, Doctoral degree
      TRUE ~ NA_character_                  # Other cases
    ),
   race_ethnicity = case_when(
      hispan %in% 1:4 ~ "Hispanic",
      race == 1 & hispan == 0 ~ "NH White",
      race == 2 & hispan == 0 ~ "NH Black",
      race == 3 & hispan == 0 ~ "NH AIAN",
      race %in% c(4, 5, 6) & hispan == 0 ~ "NH Asian",
      race %in% c(7, 8, 9) & hispan == 0 ~ "NH Other Race",
      TRUE ~ "Unknown"
    ),
       mywage = ifelse(inctot %in% c(999998, 999999), NA, inctot)  #fromCsparks
  ) %>%
  filter(!is.na(inctot), inctot > 0)

One way to get weighted median

library(survey)

Loading required package: grid

Loading required package: Matrix

Loading required package: survival


Attaching package: 'survey'

The following object is masked from 'package:graphics':

    dotchart

des <- svydesign(ids = ~1, 
                    strata = ~strata, 
                    weights = ~perwt, 
                    data = tx_data1)

library(spatstat)

Loading required package: spatstat.data

Loading required package: spatstat.geom

spatstat.geom 3.1-0


Attaching package: 'spatstat.geom'

The following object is masked from 'package:grid':

    as.mask

Loading required package: spatstat.random

spatstat.random 3.1-4

Loading required package: spatstat.explore

Loading required package: nlme


Attaching package: 'nlme'

The following object is masked from 'package:dplyr':

    collapse

spatstat.explore 3.1-0

Loading required package: spatstat.model

Loading required package: rpart

spatstat.model 3.2-3

Loading required package: spatstat.linnet

spatstat.linnet 3.1-0


spatstat 3.0-5 
For an introduction to spatstat, type 'beginner'

table <- tx_data1 %>% 
     group_by(educ_level, race_ethnicity) %>%
     dplyr::summarise(MedianIncome = weighted.median(mywage, w= perwt),
           .groups = 'drop')

table

# A tibble: 42 × 3
   educ_level     race_ethnicity MedianIncome
   <chr>          <chr>                 <dbl>
 1 AssocDegree    Hispanic             35997 
 2 AssocDegree    NH AIAN              43862 
 3 AssocDegree    NH Asian             35047 
 4 AssocDegree    NH Black             39997 
 5 AssocDegree    NH Other Race        39997 
 6 AssocDegree    NH White             49974.
 7 BachelorDegree Hispanic             53983 
 8 BachelorDegree NH AIAN              54798.
 9 BachelorDegree NH Asian             64954.
10 BachelorDegree NH Black             55988.
# ℹ 32 more rows

library(writexl)

write_xlsx(table, "dalmedian.xlsx")

A different way from an old code of mine I found from a Csparks class using the matrixstats library

library(matrixStats)


Attaching package: 'matrixStats'

The following object is masked from 'package:dplyr':

    count

library(dplyr)

# Calculating  weighted mean and median income for each education level and race_ethnicity
inctot_summary <- tx_data1 %>%
  group_by(educ_level, race_ethnicity) %>%
  summarise(mean_income = weighted.mean(mywage, w = perwt, na.rm = TRUE),
            median_income = weightedMedian(mywage, w = perwt, na.rm = TRUE))

`summarise()` has grouped output by 'educ_level'. You can override using the
`.groups` argument.

print(inctot_summary)

# A tibble: 42 × 4
# Groups:   educ_level [7]
   educ_level     race_ethnicity mean_income median_income
   <chr>          <chr>                <dbl>         <dbl>
 1 AssocDegree    Hispanic            45460.        36000 
 2 AssocDegree    NH AIAN             53918.        44377.
 3 AssocDegree    NH Asian            48203.        35072 
 4 AssocDegree    NH Black            46266.        40000 
 5 AssocDegree    NH Other Race       49767.        40000 
 6 AssocDegree    NH White            61793.        50000 
 7 BachelorDegree Hispanic            64226.        54000 
 8 BachelorDegree NH AIAN             76139.        55000 
 9 BachelorDegree NH Asian            79134.        65000 
10 BachelorDegree NH Black            64389.        56000 
# ℹ 32 more rows

library(writexl)

write_xlsx(table, "dalmedmatrix.xlsx")

Some Viz’s

library(ggplot2)

ggplot(inctot_summary, aes(x = educ_level, y = median_income, fill = race_ethnicity)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Median Income by Education Level and Race/Ethnicity",
       x = "Education Level",
       y = "Median Income") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_brewer(palette = "Dark2")

library(ggplot2)


ggplot(inctot_summary, aes(x = race_ethnicity, y = educ_level, fill = median_income)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +  
  labs(title = "Median Income by Race/Ethnicity and Education Level",
       x = "Race/Ethnicity", y = "Education Level", fill = "Median Income") +
  theme_minimal()