1. Using IPUMS Data
library(dplyr) #to manipulate data
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2) #to visualize data
## Loading Data
library(haven)
# Read the Stata (.dta) IPUMS extract straight from GitHub with haven::read_dta and keep it as `ipums`.
ipums<-read_dta("https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true")
2. Standard Error for the Mean Family Size, by BirthPlace of Household Heads
# Mean, standard deviation and standard error of family size among
# household heads, split by the birthplace label derived from bpl.
ipums %>%
  filter(relate == 1) %>% # keep household heads only
  mutate(
    # bpl codes up to 120 are labelled "US Born", everything else "Foreign Born"
    BirthPlace = ifelse(bpl <= 120, "US Born", "Foreign Born")
  ) %>%
  group_by(BirthPlace) %>%
  summarise(
    mean_familysize = mean(famsize),
    sd = sd(famsize),
    # standard error = sd / sqrt(group size)
    standard_error = sd(famsize) / sqrt(n())
  )
## # A tibble: 2 x 4
## BirthPlace mean_familysize sd standard_error
## <chr> <dbl> <dbl> <dbl>
## 1 Foreign Born 2.934445 1.683525 0.013353324
## 2 US Born 2.290301 1.332221 0.004183421
3. Confidence Intervals for the Mean Family Size, by BirthPlace of Household Heads
Defining the function to compute a confidence interval
# Normal-approximation confidence interval for the mean of a numeric vector.
#
# Args:
#   data: numeric vector; NA values are dropped before any computation.
#   conf.level: confidence level in (0, 1); defaults to 0.95.
#
# Returns:
#   Numeric vector of length 2: c(lower bound, upper bound).
norm.interval <- function(data, conf.level = 0.95) {
  # Drop missing values first so that the sample size in the standard error
  # counts exactly the observations that feed the mean and the variance.
  observed <- data[!is.na(data)]
  # Upper-tail critical value of the standard normal for a two-sided interval.
  z <- qnorm((1 - conf.level) / 2, lower.tail = FALSE)
  xbar <- mean(observed)
  # Standard error of the mean: sqrt(sample variance / n).
  std_err <- sqrt(var(observed) / length(observed))
  c(xbar - z * std_err, xbar + z * std_err)
}
# Interval at the default conf.level (0.95) for famsize over every row of ipums;
# note that no relate == 1 household-head filter is applied to this call.
norm.interval(ipums$famsize)
## [1] 3.043024 3.055395
Calculating Confidence Intervals for the Mean Family Size for Families of Foreign Born and US Born Household Heads
# Rows for household heads (relate == 1) that receive the "FOREIGN_BORN"
# label, i.e. whose bpl code is not <= 120.
foreign_born <- ipums %>%
  filter(relate == 1) %>%
  mutate(
    birthplace = ifelse(bpl <= 120, "US_BORN", "FOREIGN_BORN")
  ) %>%
  filter(birthplace == "FOREIGN_BORN")

# Rows for household heads (relate == 1) that receive the "US_BORN"
# label, i.e. whose bpl code is <= 120.
us_born <- ipums %>%
  filter(relate == 1) %>%
  mutate(
    birthplace = ifelse(bpl <= 120, "US_BORN", "FOREIGN_BORN")
  ) %>%
  filter(birthplace == "US_BORN")
norm.interval(foreign_born$famsize) # normal-approximation interval (default conf.level = 0.95), foreign-born household heads
## [1] 2.908273 2.960617
norm.interval(us_born$famsize) # normal-approximation interval (default conf.level = 0.95), US-born household heads
## [1] 2.282102 2.298500
The 95% confidence interval for the mean family size is (2.908273, 2.960617) for foreign-born household heads and (2.282102, 2.298500) for US-born household heads. Hence, the two intervals do not overlap.
4. Calculating Confidence Intervals for the Mean Family Size for Households by BirthPlace using the Bootstrap Method
For Foreign Born Household Heads
# Bootstrap of the mean and variance of family size for foreign-born
# household heads: resample the observed values with replacement n.sim
# times and store the statistics of each resample.
n.sim <- 1000
foreign_mus <- numeric(n.sim)
foreign_vars <- numeric(n.sim)
# seq_len() stays empty when n.sim is 0, whereas 1:n.sim would yield c(1, 0).
for (i in seq_len(n.sim)) {
  # Each resample has the same size as the observed sample.
  dat <- sample(
    foreign_born$famsize,
    size = length(foreign_born$famsize),
    replace = TRUE
  )
  foreign_mus[i] <- mean(dat, na.rm = TRUE)
  foreign_vars[i] <- var(dat, na.rm = TRUE)
}
# Side-by-side histograms of the bootstrap means and variances for
# foreign-born household heads; the vertical line on each panel marks the
# same statistic computed on the observed sample.
par(mfrow = c(1, 2))
hist(
  foreign_mus,
  freq = FALSE,
  main = "Bootstrap distribution of\nforeign born famsize means"
)
abline(v = mean(foreign_born$famsize, na.rm = TRUE), col = 2, lwd = 3)
hist(
  foreign_vars,
  freq = FALSE,
  main = "Bootstrap distribution of\nforeign born famsize variance"
)
abline(v = var(foreign_born$famsize, na.rm = TRUE), col = 2, lwd = 3)

Bootstrap confidence intervals using percentile method for Foreign Born Means
# Percentile bootstrap interval: 2.5th and 97.5th percentiles of the bootstrap means.
# `probs` is spelled out rather than relying on partial matching of `p`.
quantile(foreign_mus, probs = c(.025, .975))
## 2.5% 97.5%
## 2.910346 2.960503
For US Born Household Heads
# Bootstrap of the mean and variance of family size for US-born household
# heads: resample the observed values with replacement n.sim times and
# store the statistics of each resample.
n.sim <- 1000
us_mus <- numeric(n.sim)
us_vars <- numeric(n.sim)
# seq_len() stays empty when n.sim is 0, whereas 1:n.sim would yield c(1, 0).
for (i in seq_len(n.sim)) {
  # Each resample has the same size as the observed sample.
  dat <- sample(
    us_born$famsize,
    size = length(us_born$famsize),
    replace = TRUE
  )
  us_mus[i] <- mean(dat, na.rm = TRUE)
  us_vars[i] <- var(dat, na.rm = TRUE)
}
# Side-by-side histograms of the bootstrap means and variances for US-born
# household heads; the vertical line on each panel marks the same statistic
# computed on the observed sample.
par(mfrow = c(1, 2))
hist(
  us_mus,
  freq = FALSE,
  main = "Bootstrap distribution of\nUS born famsize means"
)
abline(v = mean(us_born$famsize, na.rm = TRUE), col = 2, lwd = 3)
# This panel shows the bootstrap variances, so it is titled "variance".
hist(
  us_vars,
  freq = FALSE,
  main = "Bootstrap distribution of\nUS born famsize variance"
)
abline(v = var(us_born$famsize, na.rm = TRUE), col = 2, lwd = 3)

Bootstrap confidence intervals using percentile method for US Born Means
# Percentile bootstrap interval: 2.5th and 97.5th percentiles of the bootstrap means.
# `probs` is spelled out rather than relying on partial matching of `p`.
quantile(us_mus, probs = c(.025, .975))
## 2.5% 97.5%
## 2.282194 2.298930
Comparing the confidence intervals
for foreign born
# Normal-approximation interval for foreign-born household heads, repeated for comparison.
norm.interval(foreign_born$famsize)
## [1] 2.908273 2.960617
# Percentile bootstrap interval for foreign-born household heads
# (`probs` spelled out rather than relying on partial matching of `p`).
quantile(foreign_mus, probs = c(.025, .975))
## 2.5% 97.5%
## 2.910346 2.960503
for US born
# Normal-approximation interval for US-born household heads, repeated for comparison.
norm.interval(us_born$famsize)
## [1] 2.282102 2.298500
# Percentile bootstrap interval for US-born household heads
# (`probs` spelled out rather than relying on partial matching of `p`).
quantile(us_mus, probs = c(.025, .975))
## 2.5% 97.5%
## 2.282194 2.298930
The comparison shows that, for both groups, the bootstrap percentile confidence intervals are very close to the normal-approximation confidence intervals, and the bootstrap intervals for foreign-born and US-born household heads do not overlap either.