library(haven)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the ACS extract (Stata format) directly from GitHub.
acs <- read_dta(
  "https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true"
)

# Family-size summary for household heads (relate == 1), split by birthplace.
# `bpl` is referenced through dplyr's data mask rather than as `acs$bpl`, so
# the recode always lines up with the rows currently flowing through the pipe.
acs %>%
  mutate(
    usborn = case_when(
      bpl %in% 1:120 ~ "US born",
      bpl %in% 121:998 ~ "Foreign born"
    )
  ) %>%
  filter(relate == 1) %>%
  group_by(usborn) %>%
  summarise(
    avefs = mean(famsize),
    sd(famsize),
    n(),
    # Standard error of the mean within each group.
    sem = sd(famsize) / sqrt(n())
  )
## # A tibble: 2 x 5
## usborn avefs `sd(famsize)` `n()` sem
## <chr> <dbl> <dbl> <int> <dbl>
## 1 Foreign born 2.934445 1.683525 15895 0.013353324
## 2 US born 2.290301 1.332221 101412 0.004183421
# Household heads (relate == 1) whose birthplace code is 1-120 ("US born").
# The condition is applied to `bpl` directly instead of recoding `acs$bpl`
# inside mutate() and then filtering on the label; the `usborn` label column
# is still attached so the resulting data frame has the same columns.
usborn <- acs %>%
  filter(relate == 1, bpl %in% 1:120) %>%
  mutate(usborn = "US born")

# Household heads (relate == 1) whose birthplace code is 121-998
# ("Foreign born").
foreignborn <- acs %>%
  filter(relate == 1, bpl %in% 121:998) %>%
  mutate(usborn = "Foreign born")
# Normal-approximation confidence interval for the mean of `data`.
#
# Args:
#   data:       numeric vector; NA values are ignored.
#   conf.level: confidence level of the interval (default 0.95).
#
# Returns:
#   Numeric vector of length 2: c(lower bound, upper bound).
norm.interval <- function(data, conf.level = 0.95) {
  # Two-sided critical value of the standard normal distribution.
  z <- qnorm((1 - conf.level) / 2, lower.tail = FALSE)
  xbar <- mean(data, na.rm = TRUE)
  # Count only non-missing observations: mean() and var() already drop NAs,
  # so dividing by length(data) would understate the standard error.
  n_obs <- sum(!is.na(data))
  se <- sqrt(var(data, na.rm = TRUE) / n_obs)
  c(xbar - z * se, xbar + z * se)
}
# Normal-approximation intervals (default 95% level) for mean family size
# in each group of household heads.
usbci <- norm.interval(data = usborn[["famsize"]])
fbci <- norm.interval(data = foreignborn[["famsize"]])
The confidence intervals for the average family size of US-born household heads and foreign-born household heads do not overlap.
# Side-by-side view of the two intervals: row 1 = lower bound, row 2 = upper.
tibble(usbci = usbci, fbci = fbci)
## # A tibble: 2 x 2
## usbci fbci
## <dbl> <dbl>
## 1 2.282102 2.908273
## 2 2.298500 2.960617
# Bootstrap of family size for the US-born household heads: draw n.sim
# resamples (with replacement, same size as the original sample) and keep the
# mean and variance of each resample.
n.sim <- 999
mus <- numeric(n.sim)
vars <- numeric(n.sim)
# seq_len() yields an empty sequence when n.sim is 0, unlike 1:n.sim.
for (i in seq_len(n.sim)) {
  dat <- sample(usborn$famsize, size = length(usborn$famsize), replace = TRUE)
  mus[i] <- mean(dat, na.rm = TRUE)
  vars[i] <- var(dat, na.rm = TRUE)
}
# Bootstrap of family size for the foreign-born household heads: draw n.sim.f
# resamples (with replacement, same size as the original sample) and keep the
# mean and variance of each resample.
n.sim.f <- 999
mus.f <- numeric(n.sim.f)
vars.f <- numeric(n.sim.f)
# Iterate over this block's own replicate count (n.sim.f) so the loop always
# matches the length of mus.f / vars.f; seq_len() is also safe for a count of 0.
for (i in seq_len(n.sim.f)) {
  dat <- sample(
    foreignborn$famsize,
    size = length(foreignborn$famsize),
    replace = TRUE
  )
  mus.f[i] <- mean(dat, na.rm = TRUE)
  vars.f[i] <- var(dat, na.rm = TRUE)
}
# 2 x 2 panel of bootstrap distributions: top row US born, bottom row foreign
# born; left column resampled means, right column resampled variances.
# The red vertical line in each panel marks the statistic in the original
# sample. Explicit x-axis labels distinguish the mean and variance panels,
# which otherwise share the same title.
par(mfrow = c(2, 2))

hist(
  mus,
  freq = FALSE,
  main = "Bootstrap distribution. US born-family size",
  xlab = "Bootstrap mean of family size"
)
abline(v = mean(usborn$famsize, na.rm = TRUE), col = 2, lwd = 3)

hist(
  vars,
  freq = FALSE,
  main = "Bootstrap distribution. US born-family size",
  xlab = "Bootstrap variance of family size"
)
abline(v = var(usborn$famsize, na.rm = TRUE), col = 2, lwd = 3)

hist(
  mus.f,
  freq = FALSE,
  main = "Bootstrap distribution. Foreign born-family size",
  xlab = "Bootstrap mean of family size"
)
abline(v = mean(foreignborn$famsize, na.rm = TRUE), col = 2, lwd = 3)

hist(
  vars.f,
  freq = FALSE,
  main = "Bootstrap distribution. Foreign born-family size",
  xlab = "Bootstrap variance of family size"
)
abline(v = var(foreignborn$famsize, na.rm = TRUE), col = 2, lwd = 3)
# 95% bootstrap percentile intervals: the 2.5th and 97.5th percentiles of the
# resampled means and variances. `probs` is spelled out in full rather than
# relying on partial matching of `p`.
usbbm <- quantile(mus, probs = c(0.025, 0.975))
usbbv <- quantile(vars, probs = c(0.025, 0.975))
fbbm <- quantile(mus.f, probs = c(0.025, 0.975))
fbbv <- quantile(vars.f, probs = c(0.025, 0.975))

# Print the four intervals in the same order as before.
usbbm
usbbv
fbbm
fbbv
## 2.5% 97.5%
## 2.281608 2.298374
## 2.5% 97.5%
## 1.749834 1.800022
## 2.5% 97.5%
## 2.907867 2.964470
## 2.5% 97.5%
## 2.756549 2.920287
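As the next table shows, the bootstrap percentile intervals are very close to the normal-approximation intervals; in this run they are in fact slightly wider, not narrower (for example, 2.908 to 2.964 versus 2.908 to 2.961 for foreign-born household heads).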
# Normal-approximation vs. bootstrap percentile intervals for the mean,
# one column per interval: row 1 = lower bound, row 2 = upper bound.
tibble(
  usbci = usbci,
  usbbm = usbbm,
  fbci = fbci,
  fbbm = fbbm
)
## # A tibble: 2 x 4
## usbci usbbm fbci fbbm
## <dbl> <dbl> <dbl> <dbl>
## 1 2.282102 2.281608 2.908273 2.907867
## 2 2.298500 2.298374 2.960617 2.964470