Income, Nativity, Educ

Author

Coda

Median Income Using 1-Year ACS Data from 2021

#Commented code was for 5-year data, did not filter for age. Reformat.

# library(tidyverse)
# library(broom)
# library (ipumsr)
# library(dplyr)
# 
# 
# 
# # Set the working directory
# setwd("C:/Users/rayo-garza/Documents/R 2022")
# 
# usa_00035<- read_ipums_ddi("usa_00035.xml")
# kc_00035 <- read_ipums_micro(usa_00035, data_file = ("usa_00035.dat.gz"), verbose = FALSE)
# 
# 
# names(kc_00035) <- tolower(names(kc_00035))
# 
# 
# head(kc_00035)
# 
# names(kc_00035)
# 
# na.omit(kc_00035)
# 
# 
# #Going to remove the birthplace and single (2021) year data to only use 5 year data from 2019. I also dont need the bpl variable. 
# newpums1 <- select(kc_00035, -year, -bpl, -bpld)

# # Create a new variable for education categories
# newpums2<-newpums1%>%
#    mutate(education_category = case_when(
#     as.character(educd) %in% c("10", "11", "12", "13", "14", "15", "16", "17", "20", "21", "22", "23", "24", "25", "26", "30", "40", "50", "60", "61", "999") ~ "Less than a high school diploma",
#     as.character(educd) %in% c("62, 63", "64") ~ "HS diploma/GED alternative credential",
#     as.character(educd) %in% c("65", "70, 71") ~ "Some college (no degree)",
#     as.character(educd) %in% c("80", "81", "82", "83") ~ "Associate's degree",
#     as.character(educd) %in% c("101") ~ "Bachelor's degree",
#     as.character(educd) %in% c("114", "115", "116") ~ "Professional degree beyond Bachelor's",
#     TRUE ~ NA_character_
#   ))
# 
# # na.omit(newpums2)
# 
# newpums2 <- filter(newpums2, !is.na(education_category))
# 
# 
# 
# # Count the number of observations in each education category
# counts <- newpums2 %>%
#   group_by(education_category) %>%
#   summarize(n = n())
# 
# counts   #we've got plenty of observations

The library matrixStats is interesting: ….It might be useful later

2021 1 Year Data with Age Variable for US

Load the Data, Libraries and minor changes

library(tidyverse)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.4.0      ✔ purrr   1.0.0 
✔ tibble  3.1.8      ✔ dplyr   1.0.10
✔ tidyr   1.2.1      ✔ stringr 1.5.0 
✔ readr   2.1.3      ✔ forcats 0.5.2 
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

library(broom)
library (ipumsr)
library(dplyr)



# Set the working directory
setwd("C:/Users/rayo-garza/Documents/R 2022")

usa_00038<- read_ipums_ddi("usa_00038.xml")
kc_00038 <- read_ipums_micro(usa_00038, data_file = ("usa_00038.dat.gz"), verbose = FALSE)


names(kc_00038) <- tolower(names(kc_00038))


head(kc_00038)

# A tibble: 6 × 22
   year sample            serial cbser…¹  hhwt cluster stateicp countyfip strata
  <int> <int+lbl>          <dbl>   <dbl> <dbl>   <dbl> <int+lb> <dbl+lbl>  <dbl>
1  2021 202101 [2021 ACS]      1 2.02e12    13 2.02e12 41 [Ala…   0 [Cou…  80001
2  2021 202101 [2021 ACS]      2 2.02e12    51 2.02e12 41 [Ala…   0 [Cou…  80001
3  2021 202101 [2021 ACS]      3 2.02e12    17 2.02e12 41 [Ala… 117       120001
4  2021 202101 [2021 ACS]      5 2.02e12    15 2.02e12 41 [Ala…   0 [Cou…  50001
5  2021 202101 [2021 ACS]      7 2.02e12    55 2.02e12 41 [Ala…  73       130201
6  2021 202101 [2021 ACS]      8 2.02e12    31 2.02e12 41 [Ala…   0 [Cou… 210001
# … with 13 more variables: gq <int+lbl>, pernum <dbl>, perwt <dbl>,
#   age <int+lbl>, race <int+lbl>, raced <int+lbl>, hispan <int+lbl>,
#   hispand <int+lbl>, citizen <int+lbl>, educ <int+lbl>, educd <int+lbl>,
#   inctot <dbl+lbl>, incwage <dbl+lbl>, and abbreviated variable name
#   ¹cbserial

names(kc_00038)

 [1] "year"      "sample"    "serial"    "cbserial"  "hhwt"      "cluster"  
 [7] "stateicp"  "countyfip" "strata"    "gq"        "pernum"    "perwt"    
[13] "age"       "race"      "raced"     "hispan"    "hispand"   "citizen"  
[19] "educ"      "educd"     "inctot"    "incwage"

# na.omit(kc_00037)

Re-Coding Variables. Basically, I am creating educational att. categories that mirror the pums cuts. I then filter by age, mutate the citizen variable to group citizen vs. non citizen, clean up the inctot variable, remove any missing or NA’s, specifically from the education and income (mywage) vectors.

library(dplyr)
# Create a new variable for education categories
US21<-kc_00038%>%
   mutate(education_category = case_when(
    as.character(educd) %in% c("10", "11", "12", "13", "14", "15", "16", "17", "20", "21", "22", "23", "24", "25", "26", "30", "40", "50", "60", "61", "999") ~ "Less than a high school diploma",
    as.character(educd) %in% c("62, 63", "64") ~ "HS diploma/GED alternative credential",
    as.character(educd) %in% c("65", "70, 71") ~ "Some college (no degree)",
    as.character(educd) %in% c("80", "81", "82", "83") ~ "Associate's degree",
    as.character(educd) %in% c("101") ~ "Bachelor's degree",
    as.character(educd) %in% c("114", "115", "116") ~ "Professional degree beyond Bachelor's",
    TRUE ~ NA_character_
  ))

us21a <- US21%>% filter (age >= 22)


us21c <- us21a %>%
  # mutate(citizen = case_when(citizen == 1:2 ~ 1, 
  #                            citizen == 3:5 ~ 0, 
  #                            TRUE ~  NA_real_))  #0 is not a citizen 
  mutate(citizen = case_when(citizen %in% c(1, 2) ~ 1, 
                             citizen %in% c(3, 4, 5) ~ 0, 
                             TRUE ~  NA_real_))  #0 is not a citizen


us21d <- us21c %>%
  filter(complete.cases (education_category, age, perwt, inctot, strata, citizen)) %>%
  select(education_category, age, perwt, inctot, strata, citizen)


us21e <- us21d%>%
  mutate(mywage= ifelse(inctot%in%c(999998,999999), NA, inctot))

names(us21e)

[1] "education_category" "age"                "perwt"             
[4] "inctot"             "strata"             "citizen"           
[7] "mywage"

us21e <- filter(us21e, !is.na(education_category))
us21e <- filter(us21e, !is.na(mywage))


# %>%    summarise(meanold=mean(inctot), meannew=mean(mywage, na.rm=T), n=n()) ##Mean

us21e$education_category2 <-as.factor(us21e$education_category)

Survey Design

library(survey)

Loading required package: grid

Loading required package: Matrix


Attaching package: 'Matrix'

The following objects are masked from 'package:tidyr':

    expand, pack, unpack

Loading required package: survival


Attaching package: 'survey'

The following object is masked from 'package:graphics':

    dotchart

options(survey.lonely.psu = "adjust")
us21e$perwt<-us21e$perwt
des <- svydesign(ids = ~1, 
                    strata = ~strata, 
                    weights = ~perwt, 
                    data = us21e)



names(us21e)

[1] "education_category"  "age"                 "perwt"              
[4] "inctot"              "strata"              "citizen"            
[7] "mywage"              "education_category2"

is.numeric(us21e$inctot) #True numeric

[1] TRUE

is.numeric(us21e$citizen) #True numeric

[1] TRUE

is.character(us21e$education_category)  #Characther variable

[1] TRUE

# us21e$education_category2 <-as.numeric(us21e$education_category)


names

function (x)  .Primitive("names")

library(survey)
library(srvyr)


Attaching package: 'srvyr'

The following object is masked from 'package:stats':

    filter

svymean(~education_category2, des, na.rm = TRUE)

                                                             mean     SE
education_category2Associate's degree                    0.100915 0.0008
education_category2Bachelor's degree                     0.292179 0.0011
education_category2HS diploma/GED alternative credential 0.041842 0.0005
education_category2Less than a high school diploma       0.276844 0.0011
education_category2Professional degree beyond Bachelor's 0.227038 0.0010
education_category2Some college (no degree)              0.061182 0.0006

output <- svyglm(inctot ~ education_category2 +
                       citizen,
          family = gaussian(),
          data   = nhanesAnalysis,
          design = des)

output

Stratified Independent Sampling design (with replacement)
svydesign(ids = ~1, strata = ~strata, weights = ~perwt, data = us21e)

Call:  svyglm(formula = inctot ~ education_category2 + citizen, design = des, 
    family = gaussian(), data = nhanesAnalysis)

Coefficients:
                                             (Intercept)  
                                                   31917  
                    education_category2Bachelor's degree  
                                                   22240  
education_category2HS diploma/GED alternative credential  
                                                   -6016  
      education_category2Less than a high school diploma  
                                                  -12653  
education_category2Professional degree beyond Bachelor's  
                                                   64931  
             education_category2Some college (no degree)  
                                                   -2876  
                                                 citizen  
                                                   11608  

Degrees of Freedom: 262898 Total (i.e. Null);  260542 Residual
Null Deviance:      1.782e+15 
Residual Deviance: 1.539e+15    AIC: 6722000

output2 <- us21e %>%
  as_survey_design(weights = c(perwt)) %>%
  group_by(education_category2, citizen)%>%
  summarize(n= survey_mean(inctot, na.rm =T, vartype = "ci"))

# Plot it
ggplot(output2, aes(x = education_category2, y = n)) +
  geom_bar(aes(fill = education_category2), stat = "identity") +
  geom_errorbar(aes(ymin = n_low, max = n_upp), width = 0.2) +
  facet_wrap(~citizen) +
  theme_bw()

library(dplyr)
library(spatstat)

Loading required package: spatstat.data

Loading required package: spatstat.geom

spatstat.geom 3.0-3


Attaching package: 'spatstat.geom'

The following object is masked from 'package:grid':

    as.mask

Loading required package: spatstat.random

spatstat.random 3.0-1

Loading required package: spatstat.explore

Loading required package: nlme


Attaching package: 'nlme'

The following object is masked from 'package:dplyr':

    collapse

spatstat.explore 3.0-5

Loading required package: spatstat.model

Loading required package: rpart

spatstat.model 3.0-2

Loading required package: spatstat.linnet

spatstat.linnet 3.0-3


spatstat 3.0-2 
For an introduction to spatstat, type 'beginner'

table <- us21e %>% 
     group_by(education_category2, citizen) %>%
     dplyr::summarise(MedianIncome = weighted.median(mywage, w= perwt),
           .groups = 'drop')

table

# A tibble: 12 × 3
   education_category2                   citizen MedianIncome
   <fct>                                   <dbl>        <dbl>
 1 Associate's degree                          0        23950
 2 Associate's degree                          1        33675
 3 Bachelor's degree                           0        29900
 4 Bachelor's degree                           1        47990
 5 HS diploma/GED alternative credential       0        21850
 6 HS diploma/GED alternative credential       1        24950
 7 Less than a high school diploma             0        17950
 8 Less than a high school diploma             1        16350
 9 Professional degree beyond Bachelor's       0        64995
10 Professional degree beyond Bachelor's       1        79950
11 Some college (no degree)                    0        23150
12 Some college (no degree)                    1        29950

library(writexl)

write_xlsx(table, "21acsmedian.xlsx")

Visualizations of the Data

library(ggplot2)
library(questionr)



ggplot(data = table, aes(x= MedianIncome, y= education_category2, color = education_category2, fill = education_category2, group = citizen)) +
  geom_bar(stat = "identity") + # scatter plot
  facet_wrap(~citizen)

library(plotly)


Attaching package: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout

p <- ggplot(data = table, aes(x= MedianIncome, y= education_category2, color = education_category2, fill= education_category2, group = citizen)) +
  geom_bar(stat = "identity") + # scatter plot
  facet_wrap(~citizen)

plotly_obj <- ggplotly(p)

plotly_obj

Notes on the matrixStats library - alternative approach to calc stats

# 
# library(plyr)
# library(matrixStats)
# 
# # Calculate the weighted mean and median income for each education level and citizenship status
# inctot1 <- ddply(us21d, .(education_category, citizen), function(x) {
#   data.frame(mean_income = weighted.mean(x$inctot, x$perwt),
#              median_income = weightedMedian(x$inctot, x$perwt))
# })
# 
# # Print the resulting table
# print(inctot1)