library(haven)
ipums<-read_dta("https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true")
## `curl` package not installed, falling back to using `url()`
names(ipums) #print the column names
##  [1] "year"      "datanum"   "serial"    "hhwt"      "statefip" 
##  [6] "met2013"   "puma"      "gq"        "pernum"    "perwt"    
## [11] "famsize"   "nchild"    "nchlt5"    "eldch"     "nsibs"    
## [16] "relate"    "related"   "sex"       "age"       "marst"    
## [21] "birthyr"   "fertyr"    "race"      "raced"     "hispan"   
## [26] "hispand"   "bpl"       "bpld"      "citizen"   "yrsusa1"  
## [31] "language"  "languaged" "speakeng"  "educ"      "educd"    
## [36] "empstat"   "empstatd"  "labforce"  "occ"       "ind"      
## [41] "inctot"    "incwage"   "poverty"   "hwsei"     "migrate1" 
## [46] "migrate1d" "carpool"   "trantime"
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

We calculate the standard error for the mean household size of the American population

newpums2<-ipums%>%
mutate(familysize= ifelse(famsize%in%c(999998,999999), NA, famsize))%>%
filter(relate==1, bpl<=120)
mean(newpums2$familysize)
## [1] 2.290301
newpums2%>%
  summarise(newsd=sd(familysize), meannew=mean(familysize, na.rm=T), n=n())
## # A tibble: 1 x 3
##      newsd  meannew      n
##      <dbl>    <dbl>  <int>
## 1 1.332221 2.290301 101412
norm.interval = function(data, conf.level = 0.95) 
{z = qnorm((1 - conf.level)/2, lower.tail = FALSE)

 variance = var(data, na.rm=T)
 xbar = mean(data, na.rm=T)
 sdx = sqrt(variance/length(data))
 c(xbar - z * sdx, xbar + z * sdx) }

norm.interval(newpums2$familysize)
## [1] 2.282102 2.298500

We calculate the confidence interval for the mean family size for the American households using the normal approximation. The confidence intervals do not overlap.

 var.interval = function(data, conf.level = 0.95) {
  df = length(data) - 1
       chilower = qchisq((1 - conf.level)/2, df)
       chiupper = qchisq((1 - conf.level)/2, df, lower.tail = FALSE)
       v = var(data, na.rm=T)
   c(df * v/chiupper, df * v/chilower) }
var(newpums2$familysize, na.rm=T)
## [1] 1.774813
var.interval(newpums2$familysize)
## [1] 1.759466 1.790363

We complete the Bootstrap method for the American households

n.sim<-1000

mus<-numeric(n.sim)
vars<-numeric(n.sim)
for (i in 1:n.sim){  
  dat<-sample(newpums2$familysize,size=length(newpums2$familysize), replace=T)
  mus[i]<-mean(dat, na.rm=T)
  vars[i]<-var(dat, na.rm=T)
}


par(mfrow=c(1,2))
hist(mus,freq=F, main="Bootstrap distribution of means")
abline(v=mean(newpums2$familysize, na.rm=T), col=2, lwd=3)

hist(vars, freq=F,main="Bootstrap distribution of variance")
abline(v=var(newpums2$familysize, na.rm=T), col=2, lwd=3)

We calculate the standard error for the mean household size of the Foreign population

newpums2<-ipums%>%
  mutate(familysize2= ifelse(famsize%in%c(999998,999999), NA, famsize))%>%
  filter(relate==1, bpl>=120)
mean(newpums2$familysize2)
## [1] 2.934445
newpums2%>%
  summarise(newsd=sd(familysize2), meannew=mean(familysize2, na.rm=T), n=n())
## # A tibble: 1 x 3
##      newsd  meannew     n
##      <dbl>    <dbl> <int>
## 1 1.683525 2.934445 15895
norm.interval = function(data, conf.level = 0.95) 
{z = qnorm((1 - conf.level)/2, lower.tail = FALSE)

variance = var(data, na.rm=T)
xbar = mean(data, na.rm=T)
sdx = sqrt(variance/length(data))
c(xbar - z * sdx, xbar + z * sdx) }

norm.interval(newpums2$familysize2)
## [1] 2.908273 2.960617

We repeat the calculations of the confidence interval for the mean family size but this time for the foreign households using the normal approximation. The confidence intervals do not overlap.

var.interval = function(data, conf.level = 0.95) {
  df = length(data) - 1
  chilower = qchisq((1 - conf.level)/2, df)
  chiupper = qchisq((1 - conf.level)/2, df, lower.tail = FALSE)
  v = var(data, na.rm=T)
  c(df * v/chiupper, df * v/chilower) }
var(newpums2$familysize2, na.rm=T)
## [1] 2.834258
var.interval(newpums2$familysize2)
## [1] 2.772961 2.897619

We complete the Bootstrap method for the Foreign households

n.sim<-1000

mus<-numeric(n.sim)
vars<-numeric(n.sim)
for (i in 1:n.sim){  
  dat<-sample(newpums2$familysize2,size=length(newpums2$familysize2), replace=T)
  mus[i]<-mean(dat, na.rm=T)
  vars[i]<-var(dat, na.rm=T)
}


par(mfrow=c(1,2))
hist(mus,freq=F, main="Bootstrap distribution of means")
abline(v=mean(newpums2$familysize2, na.rm=T), col=2, lwd=3)

hist(vars, freq=F,main="Bootstrap distribution of variance")
abline(v=var(newpums2$familysize2, na.rm=T), col=2, lwd=3)