1) First things first, load the libraries, add a function for Standard Error (mySE) and load ipums data:
library(haven)
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
mySE<-function(x){sd(x,na.rm=T)/length(x)}
ipums<-read_dta("https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true")
Filter to only have household heads by creating a new variable: onlyheads
onlyheads<-subset(ipums, relate==1)
2) Average, SD, Standard Error of Family Size, Head inside U.S. / outside U.S. born
onlyheads<-ipums%>%
mutate(bornus= case_when(ipums$bpl %in% c(1:120)~"US Born",ipums$bpl %in% c(120:998)~"Foreign Born"))%>%
filter(relate==1)
onlyheads%>%
group_by(bornus) %>%
summarise(mean(famsize), sd(famsize), mySE(famsize))
## # A tibble: 2 x 4
## bornus `mean(famsize)` `sd(famsize)` `mySE(famsize)`
## <chr> <dbl> <dbl> <dbl>
## 1 Foreign Born 2.934445 1.683525 1.059154e-04
## 2 US Born 2.290301 1.332221 1.313672e-05
3)t-Test
US Born:
t.test(onlyheads$famsize[onlyheads$bornus=="US Born"])
##
## One Sample t-test
##
## data: onlyheads$famsize[onlyheads$bornus == "US Born"]
## t = 547.47, df = 101410, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 2.282101 2.298500
## sample estimates:
## mean of x
## 2.290301
Foreign Born:
t.test(onlyheads$famsize[onlyheads$bornus=="Foreign Born"])
##
## One Sample t-test
##
## data: onlyheads$famsize[onlyheads$bornus == "Foreign Born"]
## t = 219.75, df = 15894, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 2.908271 2.960619
## sample estimates:
## mean of x
## 2.934445
4) Confidence Interval using Bootsrap
US Born:
n.sim<-1000
mus<-numeric(n.sim)
OUTvars<-numeric(n.sim)
for (i in 1:n.sim){
dat<-sample(onlyheads$famsize[onlyheads$bornus=="US Born"],size=length(onlyheads$famsize[onlyheads$bornus=="US Born"]), replace=T)
mus[i]<-mean(dat, na.rm=T)
}
quantile(mus, p=c(.025,.975))
## 2.5% 97.5%
## 2.282304 2.298013
# or t.test(dat)
Foreign Born:
n.sim<-1000
mus<-numeric(n.sim)
OUTvars<-numeric(n.sim)
for (i in 1:n.sim){
dat<-sample(onlyheads$famsize[onlyheads$bornus=="Foreign Born"],size=length(onlyheads$famsize[onlyheads$bornus=="Foreign Born"]), replace=T)
mus[i]<-mean(dat, na.rm=T)
}
quantile(mus, p=c(.025,.975))
## 2.5% 97.5%
## 2.909446 2.962631
# or t.test(dat)
The result should be very similar (I’m to 95 percent confident).