Question 1:

# Loading ACS Microdata from GitHub
library(haven)
# Read the Stata extract directly from its GitHub URL into a tibble
ipums <- read_dta(
  "https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true"
)

# Show which variables the extract contains
colnames(ipums)

##  [1] "year"      "datanum"   "serial"    "hhwt"      "statefip" 
##  [6] "met2013"   "puma"      "gq"        "pernum"    "perwt"    
## [11] "famsize"   "nchild"    "nchlt5"    "eldch"     "nsibs"    
## [16] "relate"    "related"   "sex"       "age"       "marst"    
## [21] "birthyr"   "fertyr"    "race"      "raced"     "hispan"   
## [26] "hispand"   "bpl"       "bpld"      "citizen"   "yrsusa1"  
## [31] "language"  "languaged" "speakeng"  "educ"      "educd"    
## [36] "empstat"   "empstatd"  "labforce"  "occ"       "ind"      
## [41] "inctot"    "incwage"   "poverty"   "hwsei"     "migrate1" 
## [46] "migrate1d" "carpool"   "trantime"

Mean

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

#for the variable 'incwage'

# Mean wage income with and without the missing-value codes.
# 'mywage' is 'incwage' with the codes 999998 and 999999 set to NA;
# 'meanold' is computed on the raw column so the two can be compared.
ipums %>%
  mutate(mywage = ifelse(incwage %in% c(999998, 999999), NA, incwage)) %>%
  summarise(
    meanold = mean(incwage),
    # Spell out TRUE: the alias T is an ordinary variable and can be reassigned
    meannew = mean(mywage, na.rm = TRUE),
    n = n()
  )

## # A tibble: 1 x 3
##    meanold  meannew      n
##      <dbl>    <dbl>  <int>
## 1 205672.4 27489.69 300552

Median

# Median wage income with and without the missing-value codes.
# 'mywage' is 'incwage' with the codes 999998 and 999999 set to NA;
# 'medianold' is computed on the raw column so the two can be compared.
ipums %>%
  mutate(mywage = ifelse(incwage %in% c(999998, 999999), NA, incwage)) %>%
  summarise(
    medianold = median(incwage),
    # Spell out TRUE: the alias T is an ordinary variable and can be reassigned
    mediannew = median(mywage, na.rm = TRUE),
    n = n()
  )

## # A tibble: 1 x 3
##   medianold mediannew      n
##       <dbl>     <dbl>  <int>
## 1     20000      7000 300552

Standard Deviation

# Standard deviation of wage income with and without the missing-value codes.
# 'mywage' is 'incwage' with the codes 999998 and 999999 set to NA;
# 'sdold' is computed on the raw column so the two can be compared.
ipums %>%
  mutate(mywage = ifelse(incwage %in% c(999998, 999999), NA, incwage)) %>%
  summarise(
    sdold = sd(incwage),
    # Must be sd(), not median(): this column reports the spread of 'mywage'
    sdnew = sd(mywage, na.rm = TRUE),
    n = n()
  )

## # A tibble: 1 x 3
##      sdold   sdnew      n
##      <dbl>   <dbl>  <int>
## 1 378988.6 50665.1 300552

Summary Statistics

# Mean, median and standard deviation of wage income in one table, all
# computed on 'mywage' ('incwage' with codes 999998 and 999999 set to NA).
ipums %>%
  mutate(mywage = ifelse(incwage %in% c(999998, 999999), NA, incwage)) %>%
  summarise(
    # Spell out TRUE: the alias T is an ordinary variable and can be reassigned
    meannew = mean(mywage, na.rm = TRUE),
    mednew = median(mywage, na.rm = TRUE),
    sdnew = sd(mywage, na.rm = TRUE),
    n = n()
  )

## # A tibble: 1 x 4
##    meannew mednew   sdnew      n
##      <dbl>  <dbl>   <dbl>  <int>
## 1 27489.69   7000 50665.1 300552

Question 2: Calculating summary statistics

Comparing Mean

library(dplyr)

# Mean wage income by education group and sex, restricted to people in the
# labor force who are aged 25 or older.
ipums %>%
  # Recode detailed education codes ('educd') into broad groups ('edurec').
  # Columns are referenced by bare name instead of `.$educd`, which bypasses
  # dplyr's data masking and breaks on grouped data or with the native pipe.
  mutate(
    edurec = case_when(
      educd %in% 0:61 ~ "nohs",
      educd %in% 62:64 ~ "hs",
      educd %in% 65:100 ~ "somecoll",
      educd %in% 101:116 ~ "collgrad",
      educd == 999 ~ "missing"
    )
  ) %>%
  # Set the 999998/999999 codes in 'incwage' to NA in the new 'mywage' column
  mutate(mywage = ifelse(incwage %in% c(999998, 999999), NA, incwage)) %>%
  # Keep those currently in the labor force (labforce == 2), age 25 and older
  filter(labforce == 2, age >= 25) %>%
  # Recode 'sex': 1 becomes "male", anything else "female"
  mutate(sexrecode = ifelse(sex == 1, "male", "female")) %>%
  # One row per education group and sex, to compare men and women
  group_by(edurec, sexrecode) %>%
  summarise(
    meanold = mean(incwage),
    meannew = mean(mywage, na.rm = TRUE),
    n = n()
  )

## # A tibble: 8 x 5
## # Groups:   edurec [?]
##     edurec sexrecode  meanold  meannew     n
##      <chr>     <chr>    <dbl>    <dbl> <int>
## 1 collgrad    female 57775.23 57775.23 23539
## 2 collgrad      male 92236.55 92236.55 23860
## 3       hs    female 25607.74 25607.74 13454
## 4       hs      male 37929.02 37929.02 17412
## 5     nohs    female 18332.41 18332.41  3797
## 6     nohs      male 28148.86 28148.86  6394
## 7 somecoll    female 32798.44 32798.44 19404
## 8 somecoll      male 48098.73 48098.73 19201

Comparing Median

library(dplyr)

# Median wage income by education group and sex, restricted to people in the
# labor force who are aged 25 or older.

ipums %>%
  # Recode detailed education codes ('educd') into broad groups ('edurec').
  # Columns are referenced by bare name instead of `.$educd`, which bypasses
  # dplyr's data masking and breaks on grouped data or with the native pipe.
  mutate(
    edurec = case_when(
      educd %in% 0:61 ~ "nohs",
      educd %in% 62:64 ~ "hs",
      educd %in% 65:100 ~ "somecoll",
      educd %in% 101:116 ~ "collgrad",
      educd == 999 ~ "missing"
    )
  ) %>%
  # Set the 999998/999999 codes in 'incwage' to NA in the new 'mywage' column
  mutate(mywage = ifelse(incwage %in% c(999998, 999999), NA, incwage)) %>%
  # Keep those currently in the labor force (labforce == 2), age 25 and older
  filter(labforce == 2, age >= 25) %>%
  # Recode 'sex': 1 becomes "male", anything else "female"
  mutate(sexrecode = ifelse(sex == 1, "male", "female")) %>%
  # One row per education group and sex, to compare men and women
  group_by(edurec, sexrecode) %>%
  summarise(
    medold = median(incwage),
    mednew = median(mywage, na.rm = TRUE),
    n = n()
  )

## # A tibble: 8 x 5
## # Groups:   edurec [?]
##     edurec sexrecode medold mednew     n
##      <chr>     <chr>  <dbl>  <dbl> <int>
## 1 collgrad    female  48000  48000 23539
## 2 collgrad      male  70000  70000 23860
## 3       hs    female  21600  21600 13454
## 4       hs      male  32000  32000 17412
## 5     nohs    female  15000  15000  3797
## 6     nohs      male  23000  23000  6394
## 7 somecoll    female  28000  28000 19404
## 8 somecoll      male  40000  40000 19201

Comparing Standard Deviation

library(dplyr)

# Standard deviation of wage income by education group and sex, restricted to
# people in the labor force who are aged 25 or older.

ipums %>%
  # Recode detailed education codes ('educd') into broad groups ('edurec').
  # Columns are referenced by bare name instead of `.$educd`, which bypasses
  # dplyr's data masking and breaks on grouped data or with the native pipe.
  mutate(
    edurec = case_when(
      educd %in% 0:61 ~ "nohs",
      educd %in% 62:64 ~ "hs",
      educd %in% 65:100 ~ "somecoll",
      educd %in% 101:116 ~ "collgrad",
      educd == 999 ~ "missing"
    )
  ) %>%
  # Set the 999998/999999 codes in 'incwage' to NA in the new 'mywage' column
  mutate(mywage = ifelse(incwage %in% c(999998, 999999), NA, incwage)) %>%
  # Keep those currently in the labor force (labforce == 2), age 25 and older
  filter(labforce == 2, age >= 25) %>%
  # Recode 'sex': 1 becomes "male", anything else "female"
  mutate(sexrecode = ifelse(sex == 1, "male", "female")) %>%
  # One row per education group and sex, to compare men and women
  group_by(edurec, sexrecode) %>%
  summarise(
    sdold = sd(incwage),
    sdnew = sd(mywage, na.rm = TRUE),
    n = n()
  )

## # A tibble: 8 x 5
## # Groups:   edurec [?]
##     edurec sexrecode    sdold    sdnew     n
##      <chr>     <chr>    <dbl>    <dbl> <int>
## 1 collgrad    female 56722.86 56722.86 23539
## 2 collgrad      male 98040.42 98040.42 23860
## 3       hs    female 26361.37 26361.37 13454
## 4       hs      male 37056.63 37056.63 17412
## 5     nohs    female 23212.05 23212.05  3797
## 6     nohs      male 32879.64 32879.64  6394
## 7 somecoll    female 31673.19 31673.19 19404
## 8 somecoll      male 47090.02 47090.02 19201

Summary Statistics

library(dplyr)

# Mean, median and standard deviation of wage income by education group and
# sex, restricted to people in the labor force who are aged 25 or older.

ipums %>%
  # Recode detailed education codes ('educd') into broad groups ('edurec').
  # Columns are referenced by bare name instead of `.$educd`, which bypasses
  # dplyr's data masking and breaks on grouped data or with the native pipe.
  mutate(
    edurec = case_when(
      educd %in% 0:61 ~ "nohs",
      educd %in% 62:64 ~ "hs",
      educd %in% 65:100 ~ "somecoll",
      educd %in% 101:116 ~ "collgrad",
      educd == 999 ~ "missing"
    )
  ) %>%
  # Set the 999998/999999 codes in 'incwage' to NA in the new 'mywage' column
  mutate(mywage = ifelse(incwage %in% c(999998, 999999), NA, incwage)) %>%
  # Keep those currently in the labor force (labforce == 2), age 25 and older
  filter(labforce == 2, age >= 25) %>%
  # Recode 'sex': 1 becomes "male", anything else "female"
  mutate(sexrecode = ifelse(sex == 1, "male", "female")) %>%
  # One row per education group and sex, to compare men and women
  group_by(edurec, sexrecode) %>%
  summarise(
    meannew = mean(mywage, na.rm = TRUE),
    mednew = median(mywage, na.rm = TRUE),
    sdnew = sd(mywage, na.rm = TRUE),
    n = n()
  )

## # A tibble: 8 x 6
## # Groups:   edurec [?]
##     edurec sexrecode  meannew mednew    sdnew     n
##      <chr>     <chr>    <dbl>  <dbl>    <dbl> <int>
## 1 collgrad    female 57775.23  48000 56722.86 23539
## 2 collgrad      male 92236.55  70000 98040.42 23860
## 3       hs    female 25607.74  21600 26361.37 13454
## 4       hs      male 37929.02  32000 37056.63 17412
## 5     nohs    female 18332.41  15000 23212.05  3797
## 6     nohs      male 28148.86  23000 32879.64  6394
## 7 somecoll    female 32798.44  28000 31673.19 19404
## 8 somecoll      male 48098.73  40000 47090.02 19201

DEM 7273 - Homework 2

Muntasir

September 5, 2017

Question 1:

Mean

Median

Standard Deviation

Summary Statistics

Question 2: Calculating summary statistics

Comparing Mean

Comparing Median

Comparing Standard Deviation

Summary Statistics