library(haven)
  # Read the Stata-format extract straight from GitHub and keep it as `ipums`.
  ipums <- read_dta(
    "https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true"
  )
## `curl` package not installed, falling back to using `url()`
  colnames(ipums) # list every variable name available in the extract
##  [1] "year"      "datanum"   "serial"    "hhwt"      "statefip" 
##  [6] "met2013"   "puma"      "gq"        "pernum"    "perwt"    
## [11] "famsize"   "nchild"    "nchlt5"    "eldch"     "nsibs"    
## [16] "relate"    "related"   "sex"       "age"       "marst"    
## [21] "birthyr"   "fertyr"    "race"      "raced"     "hispan"   
## [26] "hispand"   "bpl"       "bpld"      "citizen"   "yrsusa1"  
## [31] "language"  "languaged" "speakeng"  "educ"      "educd"    
## [36] "empstat"   "empstatd"  "labforce"  "occ"       "ind"      
## [41] "inctot"    "incwage"   "poverty"   "hwsei"     "migrate1" 
## [46] "migrate1d" "carpool"   "trantime"

We review the codebook to identify the codes of incwage that do not represent actual wage values (999998 and 999999)

We calculate the mean of incwage, first recoding the no-value codes (999998 and 999999) to NA

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Compare the mean of the raw incwage column with the mean obtained after the
# placeholder codes are recoded to NA.
ipums %>%
  # 999998 and 999999 are codes rather than wages; turn them into NA so they
  # can be excluded from the cleaned statistic.
  mutate(mywage = ifelse(incwage %in% c(999998, 999999), NA, incwage)) %>%
  summarise(
    meanold = mean(incwage),              # raw column, codes still included
    meannew = mean(mywage, na.rm = TRUE), # recoded column, NAs dropped
    n = n()                               # number of rows summarised
  )
## # A tibble: 1 x 3
##    meanold  meannew      n
##      <dbl>    <dbl>  <int>
## 1 205672.4 27489.69 300552

We calculate the median

# Compare the median of the raw incwage column with the median obtained after
# the placeholder codes are recoded to NA.
ipums %>%
  # 999998 and 999999 are codes rather than wages; recode them to NA.
  mutate(mywage = ifelse(incwage %in% c(999998, 999999), NA, incwage)) %>%
  summarise(
    medianold = median(incwage),              # raw column, codes included
    mediannew = median(mywage, na.rm = TRUE), # recoded column, NAs dropped
    n = n()                                   # number of rows summarised
  )
## # A tibble: 1 x 3
##   medianold mediannew      n
##       <dbl>     <dbl>  <int>
## 1     20000      7000 300552

We calculate the standard deviation and sample size

# Compare the standard deviation of the raw incwage column with the one
# obtained after the placeholder codes are recoded to NA; also report the
# row count.
ipums %>%
  # 999998 and 999999 are codes rather than wages; recode them to NA.
  mutate(mywage = ifelse(incwage %in% c(999998, 999999), NA, incwage)) %>%
  summarise(
    sdold = sd(incwage),              # raw column, codes included
    sdnew = sd(mywage, na.rm = TRUE), # recoded column, NAs dropped
    n = n()                           # number of rows summarised
  )
## # A tibble: 1 x 3
##      sdold   sdnew      n
##      <dbl>   <dbl>  <int>
## 1 378988.6 50665.1 300552

We calculate the median wage of those individuals aged 25 and over who are in the labor force (labforce == 2)

# Median wage, raw and recoded, for the subset of rows with labforce == 2 and
# age of at least 25.
ipums %>%
  # 999998 and 999999 are codes rather than wages; recode them to NA.
  mutate(mywage = ifelse(incwage %in% c(999998, 999999), NA, incwage)) %>%
  # Keep rows with labforce == 2 (presumably "in the labor force" -- confirm
  # against the codebook) and age >= 25.
  filter(labforce == 2, age >= 25) %>%
  summarise(
    mednold = median(incwage),             # raw column
    mednew = median(mywage, na.rm = TRUE), # recoded column, NAs dropped
    n = n()                                # rows remaining after the filter
  )
## # A tibble: 1 x 3
##   mednold mednew      n
##     <dbl>  <dbl>  <int>
## 1   36000  36000 127061

Also, we look at this summary by sex

# Median and standard deviation of the recoded wage, split by sex, for rows
# with labforce == 2 and age of at least 25.
ipums %>%
  # 999998 and 999999 are codes rather than wages; recode them to NA.
  mutate(mywage = ifelse(incwage %in% c(999998, 999999), NA, incwage)) %>%
  filter(labforce == 2, age >= 25) %>%
  # Readable label for the grouping variable: sex == 1 becomes "male",
  # every other value becomes "female".
  mutate(sexrecode = ifelse(sex == 1, "male", "female")) %>%
  group_by(sexrecode) %>%
  summarise(
    mednew = median(mywage, na.rm = TRUE),
    sdwage = sd(mywage, na.rm = TRUE),
    n = n() # rows per sex group
  )
## # A tibble: 2 x 4
##   sexrecode mednew   sdwage     n
##       <chr>  <dbl>    <dbl> <int>
## 1    female  30000 44576.08 60194
## 2      male  42000 71848.51 66867

Education groups are divided into less than high school, high school, some college, and college graduate

# Median, mean and standard deviation of the recoded wage for every
# combination of sex and education group.
# NOTE(review): this filter uses age > 25, whereas the preceding summaries use
# age >= 25 -- confirm which cutoff is intended before comparing the counts.
ipums %>%
  # 999998 and 999999 are codes rather than wages; recode them to NA.
  mutate(mywage = ifelse(incwage %in% c(999998, 999999), NA, incwage)) %>%
  filter(labforce == 2, age > 25) %>%
  mutate(
    sexrecode = ifelse(sex == 1, "male", "female"),
    # Refer to educd through data masking instead of `.$educd`, which reads
    # the column from the whole piped-in data frame regardless of context.
    # Values of educd matched by none of the conditions become NA.
    edurec = case_when(
      educd %in% 0:61 ~ "nohs",
      educd %in% 62:64 ~ "hs",
      educd %in% 65:100 ~ "somecoll",
      educd %in% 101:116 ~ "collgrad",
      educd == 999 ~ "missing"
    )
  ) %>%
  group_by(sexrecode, edurec) %>%
  summarise(
    medinc = median(mywage, na.rm = TRUE),
    meaninc = mean(mywage, na.rm = TRUE),
    sdwage = sd(mywage, na.rm = TRUE),
    n = n() # rows per sex-by-education cell
  )
## # A tibble: 8 x 6
## # Groups:   sexrecode [?]
##   sexrecode   edurec medinc  meaninc   sdwage     n
##       <chr>    <chr>  <dbl>    <dbl>    <dbl> <int>
## 1    female collgrad  49500 58349.99 57186.03 22975
## 2    female       hs  21900 25729.43 26523.98 13199
## 3    female     nohs  15000 18361.46 23312.47  3724
## 4    female somecoll  28000 33058.59 31811.67 18914
## 5      male collgrad  70000 93302.39 98629.45 23391
## 6      male       hs  33000 38268.80 37317.79 17005
## 7      male     nohs  23000 28374.60 33113.55  6261
## 8      male somecoll  40550 48672.07 47372.76 18670