This is mostly a note to myself. Import the data set:
# haven supplies read_dta(), used here to read the .dta extract directly
# from its GitHub URL.
library(haven)
ipums <- read_dta(
  "https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true"
)
colnames(ipums)  # list the variable names in the extract
## [1] "year" "datanum" "serial" "hhwt" "statefip"
## [6] "met2013" "puma" "gq" "pernum" "perwt"
## [11] "famsize" "nchild" "nchlt5" "eldch" "nsibs"
## [16] "relate" "related" "sex" "age" "marst"
## [21] "birthyr" "fertyr" "race" "raced" "hispan"
## [26] "hispand" "bpl" "bpld" "citizen" "yrsusa1"
## [31] "language" "languaged" "speakeng" "educ" "educd"
## [36] "empstat" "empstatd" "labforce" "occ" "ind"
## [41] "inctot" "incwage" "poverty" "hwsei" "migrate1"
## [46] "migrate1d" "carpool" "trantime"
For all respondents,
create a subset incwagenew1 that excludes the codes 999998 and 999999:
# Keep only the rows whose incwage is below 999998 (this drops the codes
# 999998 and 999999), then report the centre and spread of what remains.
incwagenew1 <- subset(ipums, incwage < 999998)
with(incwagenew1, mean(incwage))
## [1] 27489.69
with(incwagenew1, median(incwage))
## [1] 7000
with(incwagenew1, sd(incwage))
## [1] 50665.1
For those in the workforce,
create a subset incwagenew2 that excludes 0 as well as the codes 999998 and 999999:
# Strictly positive incwage below 999998: drops zero wages and both top codes.
incwagenew2 <- subset(ipums, incwage > 0 & incwage < 999998)
Use mean(incwagenew2$incwage) for the plain mean, and mean(incwagenew2$incwage, trim=[value]) for a trimmed mean:
# Mean, median and standard deviation of incwage in the incwagenew2 subset.
with(incwagenew2, mean(incwage))
## [1] 46781.04
with(incwagenew2, median(incwage))
## [1] 32000
with(incwagenew2, sd(incwage))
## [1] 58871.82
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Summarise wage income by education group and sex for rows with
# labforce == 2 and age >= 25.
#
# Fixes relative to the earlier version of this pipeline:
#   * The second mutate() used to assign to `edurec` again, which threw away
#     the case_when() recode and left the table grouped by raw educd codes.
#     The cleaned value now lives in its own column (`mywage`).
#   * 999998 / 999999 are the codes these notes exclude from incwage, so the
#     NA recode is applied to incwage rather than educd.
#     NOTE(review): assumes the goal was mean wage per education group;
#     confirm against the assignment.
#   * `T` replaced by `TRUE`; the `.$` pronoun dropped in favour of bare
#     column names; grouping removed explicitly at the end.
#
# The result table printed beneath this pipeline in the notes predates these
# fixes and must be regenerated by re-running the chunk.
ipums %>%
  # Collapse detailed education codes into broad groups; any code not listed
  # below falls through to NA.
  mutate(
    edurec = case_when(
      educd %in% 0:61 ~ "nohs",
      educd %in% 62:64 ~ "hs",
      educd %in% 65:100 ~ "somecoll",
      educd %in% 101:116 ~ "collgrad",
      educd == 999 ~ "missing"
    )
  ) %>%
  # Blank out the two top codes so they cannot inflate the mean.
  mutate(mywage = ifelse(incwage %in% c(999998, 999999), NA, incwage)) %>%
  filter(labforce == 2, age >= 25) %>%
  mutate(sexrecode = ifelse(sex == 1, "male", "female")) %>%
  group_by(edurec, sexrecode) %>%
  summarise(
    meanold = mean(incwage),               # raw values, top codes included
    meannew = mean(mywage, na.rm = TRUE),  # top codes treated as missing
    n = n()
  ) %>%
  ungroup()
## # A tibble: 48 x 5
## # Groups: edurec [?]
## edurec sexrecode meanold meannew n
## <dbl> <chr> <dbl> <dbl> <int>
## 1 2 female 2 2 500
## 2 2 male 2 2 809
## 3 11 female 11 11 12
## 4 11 male 11 11 11
## 5 12 female 12 12 7
## 6 12 male 12 12 8
## 7 14 female 14 14 17
## 8 14 male 14 14 25
## 9 15 female 15 15 25
## 10 15 male 15 15 56
## # ... with 38 more rows