library(haven)
ipums<-read_dta("https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true")
## `curl` package not installed, falling back to using `url()`
names(ipums) #print the column names
## [1] "year" "datanum" "serial" "hhwt" "statefip"
## [6] "met2013" "puma" "gq" "pernum" "perwt"
## [11] "famsize" "nchild" "nchlt5" "eldch" "nsibs"
## [16] "relate" "related" "sex" "age" "marst"
## [21] "birthyr" "fertyr" "race" "raced" "hispan"
## [26] "hispand" "bpl" "bpld" "citizen" "yrsusa1"
## [31] "language" "languaged" "speakeng" "educ" "educd"
## [36] "empstat" "empstatd" "labforce" "occ" "ind"
## [41] "inctot" "incwage" "poverty" "hwsei" "migrate1"
## [46] "migrate1d" "carpool" "trantime"
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
We calculate the standard error for the mean household size of the American population
newpums2<-ipums%>%
mutate(familysize= ifelse(famsize%in%c(999998,999999), NA, famsize))%>%
filter(relate==1, bpl<=120)
mean(newpums2$familysize)
## [1] 2.290301
newpums2%>%
summarise(newsd=sd(familysize), meannew=mean(familysize, na.rm=T), n=n())
## # A tibble: 1 x 3
## newsd meannew n
## <dbl> <dbl> <int>
## 1 1.332221 2.290301 101412
norm.interval = function(data, conf.level = 0.95)
{z = qnorm((1 - conf.level)/2, lower.tail = FALSE)
variance = var(data, na.rm=T)
xbar = mean(data, na.rm=T)
sdx = sqrt(variance/length(data))
c(xbar - z * sdx, xbar + z * sdx) }
norm.interval(newpums2$familysize)
## [1] 2.282102 2.298500
We calculate the confidence interval for the mean family size for the American households using the normal approximation. The confidence intervals do not overlap.
var.interval = function(data, conf.level = 0.95) {
df = length(data) - 1
chilower = qchisq((1 - conf.level)/2, df)
chiupper = qchisq((1 - conf.level)/2, df, lower.tail = FALSE)
v = var(data, na.rm=T)
c(df * v/chiupper, df * v/chilower) }
var(newpums2$familysize, na.rm=T)
## [1] 1.774813
var.interval(newpums2$familysize)
## [1] 1.759466 1.790363
We complete the Bootstrap method for the American households
n.sim<-1000
mus<-numeric(n.sim)
vars<-numeric(n.sim)
for (i in 1:n.sim){
dat<-sample(newpums2$familysize,size=length(newpums2$familysize), replace=T)
mus[i]<-mean(dat, na.rm=T)
vars[i]<-var(dat, na.rm=T)
}
par(mfrow=c(1,2))
hist(mus,freq=F, main="Bootstrap distribution of means")
abline(v=mean(newpums2$familysize, na.rm=T), col=2, lwd=3)
hist(vars, freq=F,main="Bootstrap distribution of variance")
abline(v=var(newpums2$familysize, na.rm=T), col=2, lwd=3)
We calculate the standard error for the mean household size of the Foreign population
newpums2<-ipums%>%
mutate(familysize2= ifelse(famsize%in%c(999998,999999), NA, famsize))%>%
filter(relate==1, bpl>=120)
mean(newpums2$familysize2)
## [1] 2.934445
newpums2%>%
summarise(newsd=sd(familysize2), meannew=mean(familysize2, na.rm=T), n=n())
## # A tibble: 1 x 3
## newsd meannew n
## <dbl> <dbl> <int>
## 1 1.683525 2.934445 15895
norm.interval = function(data, conf.level = 0.95)
{z = qnorm((1 - conf.level)/2, lower.tail = FALSE)
variance = var(data, na.rm=T)
xbar = mean(data, na.rm=T)
sdx = sqrt(variance/length(data))
c(xbar - z * sdx, xbar + z * sdx) }
norm.interval(newpums2$familysize2)
## [1] 2.908273 2.960617
We repeat the calculations of the confidence interval for the mean family size but this time for the foreign households using the normal approximation. The confidence intervals do not overlap.
var.interval = function(data, conf.level = 0.95) {
df = length(data) - 1
chilower = qchisq((1 - conf.level)/2, df)
chiupper = qchisq((1 - conf.level)/2, df, lower.tail = FALSE)
v = var(data, na.rm=T)
c(df * v/chiupper, df * v/chilower) }
var(newpums2$familysize2, na.rm=T)
## [1] 2.834258
var.interval(newpums2$familysize2)
## [1] 2.772961 2.897619
We complete the Bootstrap method for the Foreign households
n.sim<-1000
mus<-numeric(n.sim)
vars<-numeric(n.sim)
for (i in 1:n.sim){
dat<-sample(newpums2$familysize2,size=length(newpums2$familysize2), replace=T)
mus[i]<-mean(dat, na.rm=T)
vars[i]<-var(dat, na.rm=T)
}
par(mfrow=c(1,2))
hist(mus,freq=F, main="Bootstrap distribution of means")
abline(v=mean(newpums2$familysize2, na.rm=T), col=2, lwd=3)
hist(vars, freq=F,main="Bootstrap distribution of variance")
abline(v=var(newpums2$familysize2, na.rm=T), col=2, lwd=3)