Homework 4

Applied Probability

Calculate the standard error of the mean household size for families with foreign-born household heads and for families with US-born household heads.

library(haven)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the ACS extract (Stata .dta file) directly from GitHub.
acs <- read_dta(
  "https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true"
)

# Mean family size, its standard deviation, the group size and the standard
# error of the mean, by birthplace group.
acs %>%
  # Use the bare column name: `acs$bpl` bypasses the piped data and would
  # break or misalign if a row-changing or reordering step preceded this one.
  # Birthplace codes outside 1-998 fall through to NA.
  mutate(
    usborn = case_when(
      bpl %in% 1:120 ~ "US born",
      bpl %in% 121:998 ~ "Foreign born"
    )
  ) %>%
  # relate == 1 is taken to identify the household head (per the task text).
  filter(relate == 1) %>%
  group_by(usborn) %>%
  # sd() and n() are intentionally left unnamed so the printed column names
  # stay `sd(famsize)` and `n()`.
  summarise(
    avefs = mean(famsize),
    sd(famsize),
    n(),
    sem = sd(famsize) / sqrt(n())
  )
## # A tibble: 2 x 5
##         usborn    avefs `sd(famsize)`  `n()`         sem
##          <chr>    <dbl>         <dbl>  <int>       <dbl>
## 1 Foreign born 2.934445      1.683525  15895 0.013353324
## 2      US born 2.290301      1.332221 101412 0.004183421

Confidence intervals for the mean family size using the normal approximation.

# Household heads (relate == 1) split by birthplace code into one data frame
# per group. The bare column `bpl` is used instead of `acs$bpl` so the
# classification always follows the rows flowing through the pipe.
usborn <- acs %>%
  mutate(usborn = case_when(bpl %in% 1:120 ~ "US born")) %>%
  filter(relate == 1, usborn == "US born")

# Inside filter(), `usborn` resolves to the column created by mutate(), not to
# the data frame of the same name defined just above.
foreignborn <- acs %>%
  mutate(usborn = case_when(bpl %in% 121:998 ~ "Foreign born")) %>%
  filter(relate == 1, usborn == "Foreign born")
# Normal-approximation confidence interval for the mean of `data`.
#
# Args:
#   data:       numeric vector; missing values are ignored.
#   conf.level: coverage of the two-sided interval (default 0.95).
#
# Returns:
#   Numeric vector c(lower, upper).
norm.interval <- function(data, conf.level = 0.95) {
  # Upper-tail critical value of the standard normal distribution.
  z <- qnorm((1 - conf.level) / 2, lower.tail = FALSE)

  # Count only the observed values so that n agrees with the NA-removed mean
  # and variance; length(data) would include NAs and understate the
  # standard error.
  n <- sum(!is.na(data))
  xbar <- mean(data, na.rm = TRUE)
  sdx <- sqrt(var(data, na.rm = TRUE) / n)

  c(xbar - z * sdx, xbar + z * sdx)
}

# Normal-approximation intervals (default 95% level) for the mean family size
# in each group of household heads.
usbci <- norm.interval(usborn[["famsize"]])
fbci <- norm.interval(foreignborn[["famsize"]])

The confidence intervals for the average family size of US-born household heads and foreign-born household heads do not overlap.

# Side-by-side intervals: row 1 holds the lower limits, row 2 the upper limits.
tibble(usbci = usbci, fbci = fbci)
## # A tibble: 2 x 2
##      usbci     fbci
##      <dbl>    <dbl>
## 1 2.282102 2.908273
## 2 2.298500 2.960617

Confidence intervals for the mean family size using the bootstrap.

# Bootstrap the mean and variance of family size for US-born household heads.
# NOTE(review): no seed is set in the visible code, so the results vary from
# run to run; consider calling set.seed() beforehand.
n.sim <- 999  # number of bootstrap resamples

mus <- numeric(n.sim)
vars <- numeric(n.sim)
for (i in seq_len(n.sim)) {
  # Resample with replacement, keeping the original sample size.
  dat <- sample(usborn$famsize, size = length(usborn$famsize), replace = TRUE)
  mus[i] <- mean(dat, na.rm = TRUE)
  vars[i] <- var(dat, na.rm = TRUE)
}

# Bootstrap the mean and variance of family size for foreign-born household
# heads.
n.sim.f <- 999  # number of bootstrap resamples

mus.f <- numeric(n.sim.f)
vars.f <- numeric(n.sim.f)
# Iterate over n.sim.f, the size used to allocate the result vectors, rather
# than the resample count of the other bootstrap, so the two cannot drift
# apart if one of them is changed.
for (i in seq_len(n.sim.f)) {
  # Resample with replacement, keeping the original sample size.
  dat <- sample(
    foreignborn$famsize,
    size = length(foreignborn$famsize),
    replace = TRUE
  )
  mus.f[i] <- mean(dat, na.rm = TRUE)
  vars.f[i] <- var(dat, na.rm = TRUE)
}

# 2 x 2 panel of bootstrap distributions, filled row by row:
# US born (top) and foreign born (bottom); means (left) and variances (right).
# The red vertical line marks the corresponding full-sample statistic.
par(mfrow = c(2, 2))

# The panels in each row share a title, so the x-axis label states which
# statistic is shown.
hist(mus, freq = FALSE,
     main = "Bootstrap distribution. US born-family size",
     xlab = "Bootstrap mean")
abline(v = mean(usborn$famsize, na.rm = TRUE), col = 2, lwd = 3)

hist(vars, freq = FALSE,
     main = "Bootstrap distribution. US born-family size",
     xlab = "Bootstrap variance")
abline(v = var(usborn$famsize, na.rm = TRUE), col = 2, lwd = 3)

hist(mus.f, freq = FALSE,
     main = "Bootstrap distribution. Foreign born-family size",
     xlab = "Bootstrap mean")
abline(v = mean(foreignborn$famsize, na.rm = TRUE), col = 2, lwd = 3)

hist(vars.f, freq = FALSE,
     main = "Bootstrap distribution. Foreign born-family size",
     xlab = "Bootstrap variance")
abline(v = var(foreignborn$famsize, na.rm = TRUE), col = 2, lwd = 3)

# 95% percentile bootstrap intervals: the 2.5% and 97.5% quantiles of the
# bootstrap means and variances. `probs` is spelled out in full instead of
# relying on partial matching of `p`.
usbbm <- quantile(mus, probs = c(0.025, 0.975))
usbbv <- quantile(vars, probs = c(0.025, 0.975))

fbbm <- quantile(mus.f, probs = c(0.025, 0.975))
fbbv <- quantile(vars.f, probs = c(0.025, 0.975))

# Print the four intervals: US born mean and variance, then foreign born.
usbbm
usbbv
fbbm
fbbv
##     2.5%    97.5% 
## 2.281608 2.298374
##     2.5%    97.5% 
## 1.749834 1.800022
##     2.5%    97.5% 
## 2.907867 2.964470
##     2.5%    97.5% 
## 2.756549 2.920287

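As the next table shows, the bootstrap percentile intervals are very close to the normal-approximation intervals; in this run they are slightly wider rather than narrower (widths of about 0.0168 vs. 0.0164 for US-born and 0.0566 vs. 0.0523 for foreign-born household heads).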

# Normal-approximation (usbci, fbci) and bootstrap percentile (usbbm, fbbm)
# intervals for the mean, side by side: row 1 is the lower limit, row 2 the
# upper limit.
tibble(usbci = usbci, usbbm = usbbm, fbci = fbci, fbbm = fbbm)
## # A tibble: 2 x 4
##      usbci    usbbm     fbci     fbbm
##      <dbl>    <dbl>    <dbl>    <dbl>
## 1 2.282102 2.281608 2.908273 2.907867
## 2 2.298500 2.298374 2.960617 2.964470