First Things First…

This is more of a note to myself. Import the data set:

library(haven)
# Read the Stata-format extract directly from its GitHub URL.
ipums <- read_dta(
  "https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true"
)
colnames(ipums) # print the column names
##  [1] "year"      "datanum"   "serial"    "hhwt"      "statefip" 
##  [6] "met2013"   "puma"      "gq"        "pernum"    "perwt"    
## [11] "famsize"   "nchild"    "nchlt5"    "eldch"     "nsibs"    
## [16] "relate"    "related"   "sex"       "age"       "marst"    
## [21] "birthyr"   "fertyr"    "race"      "raced"     "hispan"   
## [26] "hispand"   "bpl"       "bpld"      "citizen"   "yrsusa1"  
## [31] "language"  "languaged" "speakeng"  "educ"      "educd"    
## [36] "empstat"   "empstatd"  "labforce"  "occ"       "ind"      
## [41] "inctot"    "incwage"   "poverty"   "hwsei"     "migrate1" 
## [46] "migrate1d" "carpool"   "trantime"

For all,

Create a subset incwagenew1 without 999998 and 999999:

# Keep only rows whose incwage is below the 999998/999999 sentinel codes;
# which() drops rows where the comparison is NA, just as subset() does.
incwagenew1 <- ipums[which(ipums$incwage < 999998), ]

1.1) All persons - mean, median, STD, sample size of incwage:

a) mean = 27,489.69

# Arithmetic mean of incwage across the incwagenew1 subset.
with(incwagenew1, mean(incwage))
## [1] 27489.69

b) median = 7,000

# Median of incwage across the incwagenew1 subset.
with(incwagenew1, median(incwage))
## [1] 7000

c) Standard Deviation = 50,665.1

# Sample standard deviation of incwage across the incwagenew1 subset.
with(incwagenew1, sd(incwage))
## [1] 50665.1

For those in the workforce,

Create a subset incwagenew2 without 0, 999998 and 999999:

# Keep only rows with a strictly positive incwage that is below the
# 999998/999999 sentinel codes; which() drops NA comparisons like subset().
incwagenew2 <- ipums[which(ipums$incwage > 0 & ipums$incwage < 999998), ]

1.2) in the workforce - mean, median, STD, sample size of incwage:

a) mean = 46,781.04

Use `mean(incwagenew2$incwage)` for the direct mean and `mean(incwagenew2$incwage, trim=[value])` for a trimmed mean.

# Arithmetic mean of incwage across the incwagenew2 subset.
with(incwagenew2, mean(incwage))
## [1] 46781.04

b) median = 32,000

# Median of incwage across the incwagenew2 subset.
with(incwagenew2, median(incwage))
## [1] 32000

c) Standard Deviation = 58,871.82

# Sample standard deviation of incwage across the incwagenew2 subset.
with(incwagenew2, sd(incwage))
## [1] 58871.82

2) Age, Sex, Education…

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Summarise wage income by recoded education and sex for rows with
# labforce == 2 and age >= 25.
#
# Fixes relative to the earlier version of this chunk:
#   * A second mutate() overwrote the categorical `edurec` with the raw
#     `educd` codes, so the table was grouped by 48 detailed codes rather
#     than by the nohs/hs/somecoll/collgrad recode.
#   * 999998/999999 were tested against `educd`, but they are the `incwage`
#     sentinel values excluded when building incwagenew1/incwagenew2, so the
#     NA recode now targets `incwage` and is stored in `mywage`.
#   * `T` replaced by `TRUE`; `.$educd` replaced by the bare column name;
#     the result is ungrouped so later steps do not inherit the grouping.
# NOTE(review): assumes this summary was meant to describe `incwage`
# (meanold = raw mean, meannew = mean with sentinels set to NA) -- confirm.
ipums %>%
  mutate(
    edurec = case_when(
      educd %in% 0:61 ~ "nohs",
      educd %in% 62:64 ~ "hs",
      educd %in% 65:100 ~ "somecoll",
      educd %in% 101:116 ~ "collgrad",
      educd == 999 ~ "missing"
    ),
    mywage = ifelse(incwage %in% c(999998, 999999), NA, incwage)
  ) %>%
  filter(labforce == 2, age >= 25) %>%
  mutate(sexrecode = ifelse(sex == 1, "male", "female")) %>%
  group_by(edurec, sexrecode) %>%
  summarise(
    meanold = mean(incwage),
    meannew = mean(mywage, na.rm = TRUE),
    n = n()
  ) %>%
  ungroup()
## (Output not shown: re-run this chunk to regenerate the table. The table
## printed previously had 48 rows keyed by raw educd codes because edurec
## was being overwritten.)