# library(tidyverse)
# library(broom)
# library (ipumsr)
# library(dplyr)
#
#
#
# # Set the working directory
# setwd("C:/Users/rayo-garza/Documents/R 2022")
#
# usa_00035<- read_ipums_ddi("usa_00035.xml")
# kc_00035 <- read_ipums_micro(usa_00035, data_file = ("usa_00035.dat.gz"), verbose = FALSE)
#
#
# names(kc_00035) <- tolower(names(kc_00035))
#
#
# head(kc_00035)
#
# names(kc_00035)
#
# na.omit(kc_00035)
#
#
# #Going to remove the birthplace and single (2021) year data to only use 5 year data from 2019. I also dont need the bpl variable.
# newpums1 <- select(kc_00035, -year, -bpl, -bpld)
Income, Nativity, Educ
Median Income Using 1-Year ACS Data from 2021
#Commented code was for 5-year data, did not filter for age. Reformat.
# # Create a new variable for education categories
# newpums2<-newpums1%>%
# mutate(education_category = case_when(
# as.character(educd) %in% c("10", "11", "12", "13", "14", "15", "16", "17", "20", "21", "22", "23", "24", "25", "26", "30", "40", "50", "60", "61", "999") ~ "Less than a high school diploma",
# as.character(educd) %in% c("62, 63", "64") ~ "HS diploma/GED alternative credential",
# as.character(educd) %in% c("65", "70, 71") ~ "Some college (no degree)",
# as.character(educd) %in% c("80", "81", "82", "83") ~ "Associate's degree",
# as.character(educd) %in% c("101") ~ "Bachelor's degree",
# as.character(educd) %in% c("114", "115", "116") ~ "Professional degree beyond Bachelor's",
# TRUE ~ NA_character_
# ))
#
# # na.omit(newpums2)
#
# newpums2 <- filter(newpums2, !is.na(education_category))
#
#
#
# # Count the number of observations in each education category
# counts <- newpums2 %>%
# group_by(education_category) %>%
# summarize(n = n())
#
# counts #we've got plenty of observations
The library matrixStats is interesting: ….It might be useful later
2021 1 Year Data with Age Variable for US
- Load the Data, Libraries and minor changes
library(tidyverse)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.4.0 ✔ purrr 1.0.0
✔ tibble 3.1.8 ✔ dplyr 1.0.10
✔ tidyr 1.2.1 ✔ stringr 1.5.0
✔ readr 2.1.3 ✔ forcats 0.5.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
library(broom)
library (ipumsr)
library(dplyr)
# Set the working directory
setwd("C:/Users/rayo-garza/Documents/R 2022")
<- read_ipums_ddi("usa_00038.xml")
usa_00038<- read_ipums_micro(usa_00038, data_file = ("usa_00038.dat.gz"), verbose = FALSE)
kc_00038
names(kc_00038) <- tolower(names(kc_00038))
head(kc_00038)
# A tibble: 6 × 22
year sample serial cbser…¹ hhwt cluster stateicp countyfip strata
<int> <int+lbl> <dbl> <dbl> <dbl> <dbl> <int+lb> <dbl+lbl> <dbl>
1 2021 202101 [2021 ACS] 1 2.02e12 13 2.02e12 41 [Ala… 0 [Cou… 80001
2 2021 202101 [2021 ACS] 2 2.02e12 51 2.02e12 41 [Ala… 0 [Cou… 80001
3 2021 202101 [2021 ACS] 3 2.02e12 17 2.02e12 41 [Ala… 117 120001
4 2021 202101 [2021 ACS] 5 2.02e12 15 2.02e12 41 [Ala… 0 [Cou… 50001
5 2021 202101 [2021 ACS] 7 2.02e12 55 2.02e12 41 [Ala… 73 130201
6 2021 202101 [2021 ACS] 8 2.02e12 31 2.02e12 41 [Ala… 0 [Cou… 210001
# … with 13 more variables: gq <int+lbl>, pernum <dbl>, perwt <dbl>,
# age <int+lbl>, race <int+lbl>, raced <int+lbl>, hispan <int+lbl>,
# hispand <int+lbl>, citizen <int+lbl>, educ <int+lbl>, educd <int+lbl>,
# inctot <dbl+lbl>, incwage <dbl+lbl>, and abbreviated variable name
# ¹cbserial
names(kc_00038)
[1] "year" "sample" "serial" "cbserial" "hhwt" "cluster"
[7] "stateicp" "countyfip" "strata" "gq" "pernum" "perwt"
[13] "age" "race" "raced" "hispan" "hispand" "citizen"
[19] "educ" "educd" "inctot" "incwage"
# na.omit(kc_00037)
- Re-Coding Variables. Basically, I am creating educational att. categories that mirror the pums cuts. I then filter by age, mutate the citizen variable to group citizen vs. non citizen, clean up the inctot variable, remove any missing or NA’s, specifically from the education and income (mywage) vectors.
library(dplyr)
# Create a new variable for education categories
<-kc_00038%>%
US21mutate(education_category = case_when(
as.character(educd) %in% c("10", "11", "12", "13", "14", "15", "16", "17", "20", "21", "22", "23", "24", "25", "26", "30", "40", "50", "60", "61", "999") ~ "Less than a high school diploma",
as.character(educd) %in% c("62, 63", "64") ~ "HS diploma/GED alternative credential",
as.character(educd) %in% c("65", "70, 71") ~ "Some college (no degree)",
as.character(educd) %in% c("80", "81", "82", "83") ~ "Associate's degree",
as.character(educd) %in% c("101") ~ "Bachelor's degree",
as.character(educd) %in% c("114", "115", "116") ~ "Professional degree beyond Bachelor's",
TRUE ~ NA_character_
))
<- US21%>% filter (age >= 22)
us21a
<- us21a %>%
us21c # mutate(citizen = case_when(citizen == 1:2 ~ 1,
# citizen == 3:5 ~ 0,
# TRUE ~ NA_real_)) #0 is not a citizen
mutate(citizen = case_when(citizen %in% c(1, 2) ~ 1,
%in% c(3, 4, 5) ~ 0,
citizen TRUE ~ NA_real_)) #0 is not a citizen
<- us21c %>%
us21d filter(complete.cases (education_category, age, perwt, inctot, strata, citizen)) %>%
select(education_category, age, perwt, inctot, strata, citizen)
<- us21d%>%
us21e mutate(mywage= ifelse(inctot%in%c(999998,999999), NA, inctot))
names(us21e)
[1] "education_category" "age" "perwt"
[4] "inctot" "strata" "citizen"
[7] "mywage"
<- filter(us21e, !is.na(education_category))
us21e <- filter(us21e, !is.na(mywage))
us21e
# %>% summarise(meanold=mean(inctot), meannew=mean(mywage, na.rm=T), n=n()) ##Mean
$education_category2 <-as.factor(us21e$education_category) us21e
- Survey Design
library(survey)
Loading required package: grid
Loading required package: Matrix
Attaching package: 'Matrix'
The following objects are masked from 'package:tidyr':
expand, pack, unpack
Loading required package: survival
Attaching package: 'survey'
The following object is masked from 'package:graphics':
dotchart
options(survey.lonely.psu = "adjust")
$perwt<-us21e$perwt
us21e<- svydesign(ids = ~1,
des strata = ~strata,
weights = ~perwt,
data = us21e)
names(us21e)
[1] "education_category" "age" "perwt"
[4] "inctot" "strata" "citizen"
[7] "mywage" "education_category2"
is.numeric(us21e$inctot) #True numeric
[1] TRUE
is.numeric(us21e$citizen) #True numeric
[1] TRUE
is.character(us21e$education_category) #Characther variable
[1] TRUE
# us21e$education_category2 <-as.numeric(us21e$education_category)
names
function (x) .Primitive("names")
library(survey)
library(srvyr)
Attaching package: 'srvyr'
The following object is masked from 'package:stats':
filter
svymean(~education_category2, des, na.rm = TRUE)
mean SE
education_category2Associate's degree 0.100915 0.0008
education_category2Bachelor's degree 0.292179 0.0011
education_category2HS diploma/GED alternative credential 0.041842 0.0005
education_category2Less than a high school diploma 0.276844 0.0011
education_category2Professional degree beyond Bachelor's 0.227038 0.0010
education_category2Some college (no degree) 0.061182 0.0006
<- svyglm(inctot ~ education_category2 +
output
citizen,family = gaussian(),
data = nhanesAnalysis,
design = des)
output
Stratified Independent Sampling design (with replacement)
svydesign(ids = ~1, strata = ~strata, weights = ~perwt, data = us21e)
Call: svyglm(formula = inctot ~ education_category2 + citizen, design = des,
family = gaussian(), data = nhanesAnalysis)
Coefficients:
(Intercept)
31917
education_category2Bachelor's degree
22240
education_category2HS diploma/GED alternative credential
-6016
education_category2Less than a high school diploma
-12653
education_category2Professional degree beyond Bachelor's
64931
education_category2Some college (no degree)
-2876
citizen
11608
Degrees of Freedom: 262898 Total (i.e. Null); 260542 Residual
Null Deviance: 1.782e+15
Residual Deviance: 1.539e+15 AIC: 6722000
<- us21e %>%
output2 as_survey_design(weights = c(perwt)) %>%
group_by(education_category2, citizen)%>%
summarize(n= survey_mean(inctot, na.rm =T, vartype = "ci"))
# Plot it
ggplot(output2, aes(x = education_category2, y = n)) +
geom_bar(aes(fill = education_category2), stat = "identity") +
geom_errorbar(aes(ymin = n_low, max = n_upp), width = 0.2) +
facet_wrap(~citizen) +
theme_bw()
library(dplyr)
library(spatstat)
Loading required package: spatstat.data
Loading required package: spatstat.geom
spatstat.geom 3.0-3
Attaching package: 'spatstat.geom'
The following object is masked from 'package:grid':
as.mask
Loading required package: spatstat.random
spatstat.random 3.0-1
Loading required package: spatstat.explore
Loading required package: nlme
Attaching package: 'nlme'
The following object is masked from 'package:dplyr':
collapse
spatstat.explore 3.0-5
Loading required package: spatstat.model
Loading required package: rpart
spatstat.model 3.0-2
Loading required package: spatstat.linnet
spatstat.linnet 3.0-3
spatstat 3.0-2
For an introduction to spatstat, type 'beginner'
<- us21e %>%
table group_by(education_category2, citizen) %>%
::summarise(MedianIncome = weighted.median(mywage, w= perwt),
dplyr.groups = 'drop')
table
# A tibble: 12 × 3
education_category2 citizen MedianIncome
<fct> <dbl> <dbl>
1 Associate's degree 0 23950
2 Associate's degree 1 33675
3 Bachelor's degree 0 29900
4 Bachelor's degree 1 47990
5 HS diploma/GED alternative credential 0 21850
6 HS diploma/GED alternative credential 1 24950
7 Less than a high school diploma 0 17950
8 Less than a high school diploma 1 16350
9 Professional degree beyond Bachelor's 0 64995
10 Professional degree beyond Bachelor's 1 79950
11 Some college (no degree) 0 23150
12 Some college (no degree) 1 29950
library(writexl)
write_xlsx(table, "21acsmedian.xlsx")
Visualizations of the Data
library(ggplot2)
library(questionr)
ggplot(data = table, aes(x= MedianIncome, y= education_category2, color = education_category2, fill = education_category2, group = citizen)) +
geom_bar(stat = "identity") + # scatter plot
facet_wrap(~citizen)
library(plotly)
Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':
last_plot
The following object is masked from 'package:stats':
filter
The following object is masked from 'package:graphics':
layout
<- ggplot(data = table, aes(x= MedianIncome, y= education_category2, color = education_category2, fill= education_category2, group = citizen)) +
p geom_bar(stat = "identity") + # scatter plot
facet_wrap(~citizen)
<- ggplotly(p)
plotly_obj
plotly_obj
Notes on the matrixStats library - alternative approach to calc stats
#
# library(plyr)
# library(matrixStats)
#
# # Calculate the weighted mean and median income for each education level and citizenship status
# inctot1 <- ddply(us21d, .(education_category, citizen), function(x) {
# data.frame(mean_income = weighted.mean(x$inctot, x$perwt),
# median_income = weightedMedian(x$inctot, x$perwt))
# })
#
# # Print the resulting table
# print(inctot1)