library(haven)
# Read the Stata (.dta) extract directly from GitHub into `ipums`.
ipums <- read_dta(
  "https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true"
)
## `curl` package not installed, falling back to using `url()`
# Print the column names of the loaded data.
names(ipums)
## [1] "year" "datanum" "serial" "hhwt" "statefip"
## [6] "met2013" "puma" "gq" "pernum" "perwt"
## [11] "famsize" "nchild" "nchlt5" "eldch" "nsibs"
## [16] "relate" "related" "sex" "age" "marst"
## [21] "birthyr" "fertyr" "race" "raced" "hispan"
## [26] "hispand" "bpl" "bpld" "citizen" "yrsusa1"
## [31] "language" "languaged" "speakeng" "educ" "educd"
## [36] "empstat" "empstatd" "labforce" "occ" "ind"
## [41] "inctot" "incwage" "poverty" "hwsei" "migrate1"
## [46] "migrate1d" "carpool" "trantime"
We review the codebook to identify the codes in incwage that do not represent actual wage values (999998 and 999999).
We calculate the mean wage after first recoding these codes to NA, and compare it with the mean of the raw variable.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Mean wage before and after recoding the codes 999998 and 999999 to NA.
#   meanold: mean of the raw incwage column (special codes still included)
#   meannew: mean of the recoded column, ignoring the NA values
#   n:       total number of rows
ipums %>%
  mutate(mywage = ifelse(incwage %in% c(999998, 999999), NA, incwage)) %>%
  summarise(
    meanold = mean(incwage),
    # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
    meannew = mean(mywage, na.rm = TRUE),
    n = n()
  )
## # A tibble: 1 x 3
## meanold meannew n
## <dbl> <dbl> <int>
## 1 205672.4 27489.69 300552
We calculate the median
# Median wage before and after recoding the codes 999998 and 999999 to NA.
#   medianold: median of the raw incwage column (special codes still included)
#   mediannew: median of the recoded column, ignoring the NA values
#   n:         total number of rows
ipums %>%
  mutate(mywage = ifelse(incwage %in% c(999998, 999999), NA, incwage)) %>%
  summarise(
    medianold = median(incwage),
    # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
    mediannew = median(mywage, na.rm = TRUE),
    n = n()
  )
## # A tibble: 1 x 3
## medianold mediannew n
## <dbl> <dbl> <int>
## 1 20000 7000 300552
We calculate the standard deviation and sample size
# Standard deviation of wage before and after recoding the codes 999998 and
# 999999 to NA, together with the sample size.
#   sdold: sd of the raw incwage column (special codes still included)
#   sdnew: sd of the recoded column, ignoring the NA values
#   n:     total number of rows
ipums %>%
  mutate(mywage = ifelse(incwage %in% c(999998, 999999), NA, incwage)) %>%
  summarise(
    sdold = sd(incwage),
    # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
    sdnew = sd(mywage, na.rm = TRUE),
    n = n()
  )
## # A tibble: 1 x 3
## sdold sdnew n
## <dbl> <dbl> <int>
## 1 378988.6 50665.1 300552
We calculate the median wage of those individuals aged 25 and over who are in the labor force (labforce == 2)
# Median wage restricted to rows with labforce == 2 and age >= 25.
#   mednold: median of the raw incwage column within the filtered rows
#   mednew:  median of the recoded column (999998/999999 -> NA), NAs ignored
#   n:       number of rows remaining after the filter
ipums %>%
  mutate(mywage = ifelse(incwage %in% c(999998, 999999), NA, incwage)) %>%
  filter(labforce == 2, age >= 25) %>%
  summarise(
    mednold = median(incwage),
    # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
    mednew = median(mywage, na.rm = TRUE),
    n = n()
  )
## # A tibble: 1 x 3
## mednold mednew n
## <dbl> <dbl> <int>
## 1 36000 36000 127061
Also, we look at this summary by sex
# Median and standard deviation of the recoded wage, by sex, restricted to
# rows with labforce == 2 and age >= 25.
ipums %>%
  mutate(mywage = ifelse(incwage %in% c(999998, 999999), NA, incwage)) %>%
  filter(labforce == 2, age >= 25) %>%
  # sex == 1 is labelled "male"; every other value is labelled "female".
  mutate(sexrecode = ifelse(sex == 1, "male", "female")) %>%
  group_by(sexrecode) %>%
  summarise(
    # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
    mednew = median(mywage, na.rm = TRUE),
    sdwage = sd(mywage, na.rm = TRUE),
    n = n()
  )
## # A tibble: 2 x 4
## sexrecode mednew sdwage n
## <chr> <dbl> <dbl> <int>
## 1 female 30000 44576.08 60194
## 2 male 42000 71848.51 66867
Education groups are divided into less than high school, high school, some college, and college graduate; we then summarise wages by sex and education group
# Median, mean and standard deviation of the recoded wage by sex and
# education group, restricted to rows with labforce == 2 and age > 25.
# NOTE(review): this block filters on age > 25, whereas the earlier summaries
# in this file use age >= 25 -- confirm which cut-off is intended.
ipums %>%
  mutate(mywage = ifelse(incwage %in% c(999998, 999999), NA, incwage)) %>%
  filter(labforce == 2, age > 25) %>%
  mutate(
    # sex == 1 is labelled "male"; every other value is labelled "female".
    sexrecode = ifelse(sex == 1, "male", "female"),
    # Refer to the column directly instead of through the `.` placeholder
    # (`.$educd`), which bypasses dplyr's data masking and would return the
    # whole column, not the per-group slice, if the data were grouped.
    # educd values not covered by any condition below become NA.
    edurec = case_when(
      educd %in% 0:61 ~ "nohs",
      educd %in% 62:64 ~ "hs",
      educd %in% 65:100 ~ "somecoll",
      educd %in% 101:116 ~ "collgrad",
      educd == 999 ~ "missing"
    )
  ) %>%
  group_by(sexrecode, edurec) %>%
  summarise(
    # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
    medinc = median(mywage, na.rm = TRUE),
    meaninc = mean(mywage, na.rm = TRUE),
    sdwage = sd(mywage, na.rm = TRUE),
    n = n()
  )
## # A tibble: 8 x 6
## # Groups: sexrecode [?]
## sexrecode edurec medinc meaninc sdwage n
## <chr> <chr> <dbl> <dbl> <dbl> <int>
## 1 female collgrad 49500 58349.99 57186.03 22975
## 2 female hs 21900 25729.43 26523.98 13199
## 3 female nohs 15000 18361.46 23312.47 3724
## 4 female somecoll 28000 33058.59 31811.67 18914
## 5 male collgrad 70000 93302.39 98629.45 23391
## 6 male hs 33000 38268.80 37317.79 17005
## 7 male nohs 23000 28374.60 33113.55 6261
## 8 male somecoll 40550 48672.07 47372.76 18670