Question 1:
# Read the ACS microdata extract (a Stata .dta file) directly from GitHub
library(haven)
ipums <- read_dta(
  "https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true"
)
# Show the names of the variables contained in the extract
colnames(ipums)
## [1] "year" "datanum" "serial" "hhwt" "statefip"
## [6] "met2013" "puma" "gq" "pernum" "perwt"
## [11] "famsize" "nchild" "nchlt5" "eldch" "nsibs"
## [16] "relate" "related" "sex" "age" "marst"
## [21] "birthyr" "fertyr" "race" "raced" "hispan"
## [26] "hispand" "bpl" "bpld" "citizen" "yrsusa1"
## [31] "language" "languaged" "speakeng" "educ" "educd"
## [36] "empstat" "empstatd" "labforce" "occ" "ind"
## [41] "inctot" "incwage" "poverty" "hwsei" "migrate1"
## [46] "migrate1d" "carpool" "trantime"
Mean
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Mean of 'incwage' before and after recoding the 999998/999999 codes to NA
ipums %>%
  mutate(
    # 'mywage' is 'incwage' with the codes 999998 and 999999 set to missing
    mywage = ifelse(incwage %in% c(999998, 999999), NA, incwage)
  ) %>%
  summarise(
    # mean of the raw variable, special codes included
    meanold = mean(incwage),
    # mean of the recoded variable, ignoring the NA values
    meannew = mean(mywage, na.rm = TRUE),
    n = n()
  )
## # A tibble: 1 x 3
## meanold meannew n
## <dbl> <dbl> <int>
## 1 205672.4 27489.69 300552
Standard Deviation
# Standard deviation of 'incwage' before and after recoding the
# 999998/999999 codes to NA
ipums %>%
  mutate(
    # 'mywage' is 'incwage' with the codes 999998 and 999999 set to missing
    mywage = ifelse(incwage %in% c(999998, 999999), NA, incwage)
  ) %>%
  summarise(
    # standard deviation of the raw variable, special codes included
    sdold = sd(incwage),
    # sd(), not median(): this column reports the spread of the recoded wages
    sdnew = sd(mywage, na.rm = TRUE),
    n = n()
  )
## # A tibble: 1 x 3
## sdold sdnew n
## <dbl> <dbl> <int>
## 1 378988.6 50665.1 300552
Summary Statistics
# Mean, median and standard deviation of the recoded wage variable
ipums %>%
  mutate(
    # 'mywage' is 'incwage' with the codes 999998 and 999999 set to missing
    mywage = ifelse(incwage %in% c(999998, 999999), NA, incwage)
  ) %>%
  summarise(
    meannew = mean(mywage, na.rm = TRUE),
    mednew = median(mywage, na.rm = TRUE),
    sdnew = sd(mywage, na.rm = TRUE),
    n = n()
  )
## # A tibble: 1 x 4
## meannew mednew sdnew n
## <dbl> <dbl> <dbl> <int>
## 1 27489.69 7000 50665.1 300552
Question 2: Calculating summary statistics
Comparing Mean
library(dplyr)
# Mean wage by education level and sex, restricted to people who are in the
# labor force and aged 25 or older
ipums %>%
  mutate(
    # collapse the detailed education codes in 'educd' into 'edurec'
    edurec = case_when(
      educd %in% 0:61 ~ "nohs",
      educd %in% 62:64 ~ "hs",
      educd %in% 65:100 ~ "somecoll",
      educd %in% 101:116 ~ "collgrad",
      educd == 999 ~ "missing"
    ),
    # 'mywage' is 'incwage' with the codes 999998 and 999999 set to missing
    mywage = ifelse(incwage %in% c(999998, 999999), NA, incwage)
  ) %>%
  # keep respondents currently in the labor force who are at least 25
  filter(labforce == 2, age >= 25) %>%
  # label 'sex' so the groups print as "male" / "female"
  mutate(sexrecode = ifelse(sex == 1, "male", "female")) %>%
  # one row per education-by-sex combination
  group_by(edurec, sexrecode) %>%
  summarise(
    meanold = mean(incwage),
    meannew = mean(mywage, na.rm = TRUE),
    n = n()
  )
## # A tibble: 8 x 5
## # Groups: edurec [?]
## edurec sexrecode meanold meannew n
## <chr> <chr> <dbl> <dbl> <int>
## 1 collgrad female 57775.23 57775.23 23539
## 2 collgrad male 92236.55 92236.55 23860
## 3 hs female 25607.74 25607.74 13454
## 4 hs male 37929.02 37929.02 17412
## 5 nohs female 18332.41 18332.41 3797
## 6 nohs male 28148.86 28148.86 6394
## 7 somecoll female 32798.44 32798.44 19404
## 8 somecoll male 48098.73 48098.73 19201
Comparing Standard Deviation
library(dplyr)
# Standard deviation of wages by education level and sex, restricted to
# people who are in the labor force and aged 25 or older
ipums %>%
  mutate(
    # collapse the detailed education codes in 'educd' into 'edurec'
    edurec = case_when(
      educd %in% 0:61 ~ "nohs",
      educd %in% 62:64 ~ "hs",
      educd %in% 65:100 ~ "somecoll",
      educd %in% 101:116 ~ "collgrad",
      educd == 999 ~ "missing"
    ),
    # 'mywage' is 'incwage' with the codes 999998 and 999999 set to missing
    mywage = ifelse(incwage %in% c(999998, 999999), NA, incwage)
  ) %>%
  # keep respondents currently in the labor force who are at least 25
  filter(labforce == 2, age >= 25) %>%
  # label 'sex' so the groups print as "male" / "female"
  mutate(sexrecode = ifelse(sex == 1, "male", "female")) %>%
  # one row per education-by-sex combination
  group_by(edurec, sexrecode) %>%
  summarise(
    sdold = sd(incwage),
    sdnew = sd(mywage, na.rm = TRUE),
    n = n()
  )
## # A tibble: 8 x 5
## # Groups: edurec [?]
## edurec sexrecode sdold sdnew n
## <chr> <chr> <dbl> <dbl> <int>
## 1 collgrad female 56722.86 56722.86 23539
## 2 collgrad male 98040.42 98040.42 23860
## 3 hs female 26361.37 26361.37 13454
## 4 hs male 37056.63 37056.63 17412
## 5 nohs female 23212.05 23212.05 3797
## 6 nohs male 32879.64 32879.64 6394
## 7 somecoll female 31673.19 31673.19 19404
## 8 somecoll male 47090.02 47090.02 19201
Summary Statistics
library(dplyr)
# Mean, median and standard deviation of the recoded wage by education level
# and sex, restricted to people who are in the labor force and aged 25 or older
ipums %>%
  mutate(
    # collapse the detailed education codes in 'educd' into 'edurec'
    edurec = case_when(
      educd %in% 0:61 ~ "nohs",
      educd %in% 62:64 ~ "hs",
      educd %in% 65:100 ~ "somecoll",
      educd %in% 101:116 ~ "collgrad",
      educd == 999 ~ "missing"
    ),
    # 'mywage' is 'incwage' with the codes 999998 and 999999 set to missing
    mywage = ifelse(incwage %in% c(999998, 999999), NA, incwage)
  ) %>%
  # keep respondents currently in the labor force who are at least 25
  filter(labforce == 2, age >= 25) %>%
  # label 'sex' so the groups print as "male" / "female"
  mutate(sexrecode = ifelse(sex == 1, "male", "female")) %>%
  # one row per education-by-sex combination
  group_by(edurec, sexrecode) %>%
  summarise(
    meannew = mean(mywage, na.rm = TRUE),
    mednew = median(mywage, na.rm = TRUE),
    sdnew = sd(mywage, na.rm = TRUE),
    n = n()
  )
## # A tibble: 8 x 6
## # Groups: edurec [?]
## edurec sexrecode meannew mednew sdnew n
## <chr> <chr> <dbl> <dbl> <dbl> <int>
## 1 collgrad female 57775.23 48000 56722.86 23539
## 2 collgrad male 92236.55 70000 98040.42 23860
## 3 hs female 25607.74 21600 26361.37 13454
## 4 hs male 37929.02 32000 37056.63 17412
## 5 nohs female 18332.41 15000 23212.05 3797
## 6 nohs male 28148.86 23000 32879.64 6394
## 7 somecoll female 32798.44 28000 31673.19 19404
## 8 somecoll male 48098.73 40000 47090.02 19201