Income, Nativity, Educ

Author

Coda

Median Income Using 1-Year ACS Data from 2021

#Commented code was for 5-year data, did not filter for age. Reformat.

# library(tidyverse)
# library(broom)
# library (ipumsr)
# library(dplyr)
# 
# 
# 
# # Set the working directory
# setwd("C:/Users/rayo-garza/Documents/R 2022")
# 
# usa_00035<- read_ipums_ddi("usa_00035.xml")
# kc_00035 <- read_ipums_micro(usa_00035, data_file = ("usa_00035.dat.gz"), verbose = FALSE)
# 
# 
# names(kc_00035) <- tolower(names(kc_00035))
# 
# 
# head(kc_00035)
# 
# names(kc_00035)
# 
# na.omit(kc_00035)
# 
# 
# #Going to remove the birthplace and single (2021) year data to only use 5 year data from 2019. I also dont need the bpl variable. 
# newpums1 <- select(kc_00035, -year, -bpl, -bpld)
# # Create a new variable for education categories
# newpums2<-newpums1%>%
#    mutate(education_category = case_when(
#     as.character(educd) %in% c("10", "11", "12", "13", "14", "15", "16", "17", "20", "21", "22", "23", "24", "25", "26", "30", "40", "50", "60", "61", "999") ~ "Less than a high school diploma",
#     as.character(educd) %in% c("62, 63", "64") ~ "HS diploma/GED alternative credential",
#     as.character(educd) %in% c("65", "70, 71") ~ "Some college (no degree)",
#     as.character(educd) %in% c("80", "81", "82", "83") ~ "Associate's degree",
#     as.character(educd) %in% c("101") ~ "Bachelor's degree",
#     as.character(educd) %in% c("114", "115", "116") ~ "Professional degree beyond Bachelor's",
#     TRUE ~ NA_character_
#   ))
# 
# # na.omit(newpums2)
# 
# newpums2 <- filter(newpums2, !is.na(education_category))
# 
# 
# 
# # Count the number of observations in each education category
# counts <- newpums2 %>%
#   group_by(education_category) %>%
#   summarize(n = n())
# 
# counts   #we've got plenty of observations

The library matrixStats is interesting: ….It might be useful later

2021 1 Year Data with Age Variable for US

  1. Load the Data, Libraries and minor changes
library(tidyverse)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.4.0      ✔ purrr   1.0.0 
✔ tibble  3.1.8      ✔ dplyr   1.0.10
✔ tidyr   1.2.1      ✔ stringr 1.5.0 
✔ readr   2.1.3      ✔ forcats 0.5.2 
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
library(broom)
library (ipumsr)
library(dplyr)



# Set the working directory
setwd("C:/Users/rayo-garza/Documents/R 2022")

usa_00038<- read_ipums_ddi("usa_00038.xml")
kc_00038 <- read_ipums_micro(usa_00038, data_file = ("usa_00038.dat.gz"), verbose = FALSE)


names(kc_00038) <- tolower(names(kc_00038))


head(kc_00038)
# A tibble: 6 × 22
   year sample            serial cbser…¹  hhwt cluster stateicp countyfip strata
  <int> <int+lbl>          <dbl>   <dbl> <dbl>   <dbl> <int+lb> <dbl+lbl>  <dbl>
1  2021 202101 [2021 ACS]      1 2.02e12    13 2.02e12 41 [Ala…   0 [Cou…  80001
2  2021 202101 [2021 ACS]      2 2.02e12    51 2.02e12 41 [Ala…   0 [Cou…  80001
3  2021 202101 [2021 ACS]      3 2.02e12    17 2.02e12 41 [Ala… 117       120001
4  2021 202101 [2021 ACS]      5 2.02e12    15 2.02e12 41 [Ala…   0 [Cou…  50001
5  2021 202101 [2021 ACS]      7 2.02e12    55 2.02e12 41 [Ala…  73       130201
6  2021 202101 [2021 ACS]      8 2.02e12    31 2.02e12 41 [Ala…   0 [Cou… 210001
# … with 13 more variables: gq <int+lbl>, pernum <dbl>, perwt <dbl>,
#   age <int+lbl>, race <int+lbl>, raced <int+lbl>, hispan <int+lbl>,
#   hispand <int+lbl>, citizen <int+lbl>, educ <int+lbl>, educd <int+lbl>,
#   inctot <dbl+lbl>, incwage <dbl+lbl>, and abbreviated variable name
#   ¹​cbserial
names(kc_00038)
 [1] "year"      "sample"    "serial"    "cbserial"  "hhwt"      "cluster"  
 [7] "stateicp"  "countyfip" "strata"    "gq"        "pernum"    "perwt"    
[13] "age"       "race"      "raced"     "hispan"    "hispand"   "citizen"  
[19] "educ"      "educd"     "inctot"    "incwage"  
# na.omit(kc_00037)
  1. Re-Coding Variables. Basically, I am creating educational att. categories that mirror the pums cuts. I then filter by age, mutate the citizen variable to group citizen vs. non citizen, clean up the inctot variable, remove any missing or NA’s, specifically from the education and income (mywage) vectors.
library(dplyr)
# Create a new variable for education categories
US21<-kc_00038%>%
   mutate(education_category = case_when(
    as.character(educd) %in% c("10", "11", "12", "13", "14", "15", "16", "17", "20", "21", "22", "23", "24", "25", "26", "30", "40", "50", "60", "61", "999") ~ "Less than a high school diploma",
    as.character(educd) %in% c("62, 63", "64") ~ "HS diploma/GED alternative credential",
    as.character(educd) %in% c("65", "70, 71") ~ "Some college (no degree)",
    as.character(educd) %in% c("80", "81", "82", "83") ~ "Associate's degree",
    as.character(educd) %in% c("101") ~ "Bachelor's degree",
    as.character(educd) %in% c("114", "115", "116") ~ "Professional degree beyond Bachelor's",
    TRUE ~ NA_character_
  ))

us21a <- US21%>% filter (age >= 22)


us21c <- us21a %>%
  # mutate(citizen = case_when(citizen == 1:2 ~ 1, 
  #                            citizen == 3:5 ~ 0, 
  #                            TRUE ~  NA_real_))  #0 is not a citizen 
  mutate(citizen = case_when(citizen %in% c(1, 2) ~ 1, 
                             citizen %in% c(3, 4, 5) ~ 0, 
                             TRUE ~  NA_real_))  #0 is not a citizen


us21d <- us21c %>%
  filter(complete.cases (education_category, age, perwt, inctot, strata, citizen)) %>%
  select(education_category, age, perwt, inctot, strata, citizen)


us21e <- us21d%>%
  mutate(mywage= ifelse(inctot%in%c(999998,999999), NA, inctot))

names(us21e)
[1] "education_category" "age"                "perwt"             
[4] "inctot"             "strata"             "citizen"           
[7] "mywage"            
us21e <- filter(us21e, !is.na(education_category))
us21e <- filter(us21e, !is.na(mywage))


# %>%    summarise(meanold=mean(inctot), meannew=mean(mywage, na.rm=T), n=n()) ##Mean

us21e$education_category2 <-as.factor(us21e$education_category)
  1. Survey Design
library(survey)
Loading required package: grid
Loading required package: Matrix

Attaching package: 'Matrix'
The following objects are masked from 'package:tidyr':

    expand, pack, unpack
Loading required package: survival

Attaching package: 'survey'
The following object is masked from 'package:graphics':

    dotchart
options(survey.lonely.psu = "adjust")
us21e$perwt<-us21e$perwt
des <- svydesign(ids = ~1, 
                    strata = ~strata, 
                    weights = ~perwt, 
                    data = us21e)



names(us21e)
[1] "education_category"  "age"                 "perwt"              
[4] "inctot"              "strata"              "citizen"            
[7] "mywage"              "education_category2"
is.numeric(us21e$inctot) #True numeric 
[1] TRUE
is.numeric(us21e$citizen) #True numeric 
[1] TRUE
is.character(us21e$education_category)  #Characther variable 
[1] TRUE
# us21e$education_category2 <-as.numeric(us21e$education_category)


names
function (x)  .Primitive("names")
library(survey)
library(srvyr)

Attaching package: 'srvyr'
The following object is masked from 'package:stats':

    filter
svymean(~education_category2, des, na.rm = TRUE)
                                                             mean     SE
education_category2Associate's degree                    0.100915 0.0008
education_category2Bachelor's degree                     0.292179 0.0011
education_category2HS diploma/GED alternative credential 0.041842 0.0005
education_category2Less than a high school diploma       0.276844 0.0011
education_category2Professional degree beyond Bachelor's 0.227038 0.0010
education_category2Some college (no degree)              0.061182 0.0006
output <- svyglm(inctot ~ education_category2 +
                       citizen,
          family = gaussian(),
          data   = nhanesAnalysis,
          design = des)

output
Stratified Independent Sampling design (with replacement)
svydesign(ids = ~1, strata = ~strata, weights = ~perwt, data = us21e)

Call:  svyglm(formula = inctot ~ education_category2 + citizen, design = des, 
    family = gaussian(), data = nhanesAnalysis)

Coefficients:
                                             (Intercept)  
                                                   31917  
                    education_category2Bachelor's degree  
                                                   22240  
education_category2HS diploma/GED alternative credential  
                                                   -6016  
      education_category2Less than a high school diploma  
                                                  -12653  
education_category2Professional degree beyond Bachelor's  
                                                   64931  
             education_category2Some college (no degree)  
                                                   -2876  
                                                 citizen  
                                                   11608  

Degrees of Freedom: 262898 Total (i.e. Null);  260542 Residual
Null Deviance:      1.782e+15 
Residual Deviance: 1.539e+15    AIC: 6722000
output2 <- us21e %>%
  as_survey_design(weights = c(perwt)) %>%
  group_by(education_category2, citizen)%>%
  summarize(n= survey_mean(inctot, na.rm =T, vartype = "ci"))

# Plot it
ggplot(output2, aes(x = education_category2, y = n)) +
  geom_bar(aes(fill = education_category2), stat = "identity") +
  geom_errorbar(aes(ymin = n_low, max = n_upp), width = 0.2) +
  facet_wrap(~citizen) +
  theme_bw()

library(dplyr)
library(spatstat)
Loading required package: spatstat.data
Loading required package: spatstat.geom
spatstat.geom 3.0-3

Attaching package: 'spatstat.geom'
The following object is masked from 'package:grid':

    as.mask
Loading required package: spatstat.random
spatstat.random 3.0-1
Loading required package: spatstat.explore
Loading required package: nlme

Attaching package: 'nlme'
The following object is masked from 'package:dplyr':

    collapse
spatstat.explore 3.0-5
Loading required package: spatstat.model
Loading required package: rpart
spatstat.model 3.0-2
Loading required package: spatstat.linnet
spatstat.linnet 3.0-3

spatstat 3.0-2 
For an introduction to spatstat, type 'beginner' 
table <- us21e %>% 
     group_by(education_category2, citizen) %>%
     dplyr::summarise(MedianIncome = weighted.median(mywage, w= perwt),
           .groups = 'drop')

table
# A tibble: 12 × 3
   education_category2                   citizen MedianIncome
   <fct>                                   <dbl>        <dbl>
 1 Associate's degree                          0        23950
 2 Associate's degree                          1        33675
 3 Bachelor's degree                           0        29900
 4 Bachelor's degree                           1        47990
 5 HS diploma/GED alternative credential       0        21850
 6 HS diploma/GED alternative credential       1        24950
 7 Less than a high school diploma             0        17950
 8 Less than a high school diploma             1        16350
 9 Professional degree beyond Bachelor's       0        64995
10 Professional degree beyond Bachelor's       1        79950
11 Some college (no degree)                    0        23150
12 Some college (no degree)                    1        29950
library(writexl)

write_xlsx(table, "21acsmedian.xlsx")

Visualizations of the Data

library(ggplot2)
library(questionr)



ggplot(data = table, aes(x= MedianIncome, y= education_category2, color = education_category2, fill = education_category2, group = citizen)) +
  geom_bar(stat = "identity") + # scatter plot
  facet_wrap(~citizen)

library(plotly)

Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':

    last_plot
The following object is masked from 'package:stats':

    filter
The following object is masked from 'package:graphics':

    layout
p <- ggplot(data = table, aes(x= MedianIncome, y= education_category2, color = education_category2, fill= education_category2, group = citizen)) +
  geom_bar(stat = "identity") + # scatter plot
  facet_wrap(~citizen)

plotly_obj <- ggplotly(p)

plotly_obj

Notes on the matrixStats library - alternative approach to calc stats

# 
# library(plyr)
# library(matrixStats)
# 
# # Calculate the weighted mean and median income for each education level and citizenship status
# inctot1 <- ddply(us21d, .(education_category, citizen), function(x) {
#   data.frame(mean_income = weighted.mean(x$inctot, x$perwt),
#              median_income = weightedMedian(x$inctot, x$perwt))
# })
# 
# # Print the resulting table
# print(inctot1)