1. Using IPUMS Data

library(dplyr) #to manipulate data
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2) #to visualize data

## Loading Data
library(haven) # provides read_dta() for Stata .dta files
# Read the IPUMS extract directly from its GitHub URL.
ipums <- read_dta(
  "https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true"
)

2. Standard Error for the Mean Family Size, by BirthPlace of Household Heads

# Mean, standard deviation and standard error of famsize for household heads,
# grouped by whether the head's bpl code is <= 120 ("US Born") or not.
ipums %>%
  filter(relate == 1) %>% # keep household heads only
  mutate(BirthPlace = ifelse(bpl <= 120, "US Born", "Foreign Born")) %>%
  group_by(BirthPlace) %>%
  summarise(
    mean_familysize = mean(famsize),
    sd = sd(famsize),
    standard_error = sd(famsize) / sqrt(n()) # n() is the group's row count
  )
## # A tibble: 2 x 4
##     BirthPlace mean_familysize       sd standard_error
##          <chr>           <dbl>    <dbl>          <dbl>
## 1 Foreign Born        2.934445 1.683525    0.013353324
## 2      US Born        2.290301 1.332221    0.004183421

3. Confidence Intervals for the Mean Family Size, by BirthPlace of Household Heads

Defining a function to compute a confidence interval

# Normal-approximation confidence interval for the mean of a numeric vector.
#
# data:       numeric vector; missing values are dropped before any computation.
# conf.level: confidence level of the interval (default 0.95).
#
# Returns a length-2 numeric vector c(lower, upper).
norm.interval <- function(data, conf.level = 0.95) {
  # Drop NAs once, up front, so the sample size in the standard error matches
  # the observations that actually enter the mean and the variance. Dividing
  # by the raw length would understate the standard error when NAs are present.
  obs <- data[!is.na(data)]

  # Upper-tail normal quantile for a two-sided interval.
  z <- qnorm((1 - conf.level) / 2, lower.tail = FALSE)

  xbar <- mean(obs)
  sdx <- sqrt(var(obs) / length(obs)) # standard error of the mean
  c(xbar - z * sdx, xbar + z * sdx)
}

# Interval (default conf.level = 0.95) for mean famsize over every row of
# ipums; unlike the later calls, no relate == 1 (household head) filter is
# applied here.
norm.interval(ipums$famsize)
## [1] 3.043024 3.055395

Calculating Confidence Intervals for the Mean Family Size for Families of Foreign Born and US Born Household Heads

# Household heads (relate == 1) whose bpl code is above 120, tagged
# "FOREIGN_BORN" in the added birthplace column.
foreign_born <- ipums %>%
  mutate(birthplace = ifelse(bpl <= 120, "US_BORN", "FOREIGN_BORN")) %>%
  filter(relate == 1, birthplace == "FOREIGN_BORN")

# Household heads (relate == 1) whose bpl code is at most 120, tagged
# "US_BORN" in the added birthplace column.
us_born <- ipums %>%
  mutate(birthplace = ifelse(bpl <= 120, "US_BORN", "FOREIGN_BORN")) %>%
  filter(relate == 1, birthplace == "US_BORN")

norm.interval(foreign_born$famsize) # normal-approximation CI (default 95%) for mean famsize, foreign-born household heads
## [1] 2.908273 2.960617
norm.interval(us_born$famsize) # normal-approximation CI (default 95%) for mean famsize, US-born household heads
## [1] 2.282102 2.298500

The confidence interval for the mean family size is (2.908273, 2.960617) for foreign-born household heads and (2.282102, 2.298500) for US-born household heads. Hence, the two intervals do not overlap.

4. Calculating Confidence Intervals for the Mean Family Size for Households by BirthPlace using the Bootstrap Method

For Foreign Born Household Heads

# Nonparametric bootstrap of famsize for the foreign_born sample: each
# replicate resamples the observed values with replacement (same size as the
# original sample) and stores the replicate's mean and variance.
# NOTE(review): no seed is set in the visible code, so the draws (and the
# percentile interval computed from them) will differ slightly between runs.
n.sim <- 1000

# Preallocate the result vectors.
foreign_mus <- numeric(n.sim)
foreign_vars <- numeric(n.sim)
for (i in seq_len(n.sim)) {
  dat <- sample(
    foreign_born$famsize,
    size = length(foreign_born$famsize),
    replace = TRUE
  )
  foreign_mus[i] <- mean(dat, na.rm = TRUE)
  foreign_vars[i] <- var(dat, na.rm = TRUE)
}

# Side-by-side density histograms of the bootstrap replicates; the vertical
# line in each panel marks the statistic computed on the observed sample.
par(mfrow = c(1, 2))
hist(foreign_mus,
     freq = FALSE,
     main = "Bootstrap distribution of\nforeign born famsize means")
abline(v = mean(foreign_born$famsize, na.rm = TRUE), col = 2, lwd = 3)

hist(foreign_vars,
     freq = FALSE,
     main = "Bootstrap distribution of\nforeign born famsize variance")
abline(v = var(foreign_born$famsize, na.rm = TRUE), col = 2, lwd = 3)

Bootstrap confidence intervals using percentile method for Foreign Born Means

# Percentile bootstrap interval: 2.5th and 97.5th percentiles of the replicate
# means. The argument is spelled out as `probs` rather than relying on partial
# matching of `p`.
quantile(foreign_mus, probs = c(0.025, 0.975))
##     2.5%    97.5% 
## 2.910346 2.960503

For US Born Household Heads

# Nonparametric bootstrap of famsize for the us_born sample: each replicate
# resamples the observed values with replacement (same size as the original
# sample) and stores the replicate's mean and variance.
# NOTE(review): no seed is set in the visible code, so the draws (and the
# percentile interval computed from them) will differ slightly between runs.
n.sim <- 1000

# Preallocate the result vectors.
us_mus <- numeric(n.sim)
us_vars <- numeric(n.sim)
for (i in seq_len(n.sim)) {
  dat <- sample(
    us_born$famsize,
    size = length(us_born$famsize),
    replace = TRUE
  )
  us_mus[i] <- mean(dat, na.rm = TRUE)
  us_vars[i] <- var(dat, na.rm = TRUE)
}

# Side-by-side density histograms of the bootstrap replicates; the vertical
# line in each panel marks the statistic computed on the observed sample.
par(mfrow = c(1, 2))
hist(us_mus,
     freq = FALSE,
     main = "Bootstrap distribution of\nUS born famsize means")
abline(v = mean(us_born$famsize, na.rm = TRUE), col = 2, lwd = 3)

# This panel shows the replicate variances, so its title says "variance"
# (it previously repeated the "means" title).
hist(us_vars,
     freq = FALSE,
     main = "Bootstrap distribution of\nUS born famsize variance")
abline(v = var(us_born$famsize, na.rm = TRUE), col = 2, lwd = 3)

Bootstrap confidence intervals using percentile method for US Born Means

# Percentile bootstrap interval: 2.5th and 97.5th percentiles of the replicate
# means. The argument is spelled out as `probs` rather than relying on partial
# matching of `p`.
quantile(us_mus, probs = c(0.025, 0.975))
##     2.5%    97.5% 
## 2.282194 2.298930

Comparing the confidence intervals

for foreign born

# Normal-approximation interval followed by the percentile bootstrap interval
# for the foreign_born sample (`probs` spelled out instead of partial `p`).
norm.interval(foreign_born$famsize)
## [1] 2.908273 2.960617
quantile(foreign_mus, probs = c(0.025, 0.975))
##     2.5%    97.5% 
## 2.910346 2.960503

for US born

# Normal-approximation interval followed by the percentile bootstrap interval
# for the us_born sample (`probs` spelled out instead of partial `p`).
norm.interval(us_born$famsize)
## [1] 2.282102 2.298500
quantile(us_mus, probs = c(0.025, 0.975))
##     2.5%    97.5% 
## 2.282194 2.298930

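The comparison shows that, for each group, the bootstrap percentile confidence interval is very close to the normal-approximation confidence interval. As with the normal-approximation intervals, the bootstrap intervals for foreign-born and US-born household heads do not overlap each other.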