Assignment #4

library(haven)
# Read the Stata extract usa_00045.dta from its GitHub URL into `ipums`.
ipums <- read_dta(
  "https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true"
)

## `curl` package not installed, falling back to using `url()`

colnames(ipums) # print the column names

##  [1] "year"      "datanum"   "serial"    "hhwt"      "statefip" 
##  [6] "met2013"   "puma"      "gq"        "pernum"    "perwt"    
## [11] "famsize"   "nchild"    "nchlt5"    "eldch"     "nsibs"    
## [16] "relate"    "related"   "sex"       "age"       "marst"    
## [21] "birthyr"   "fertyr"    "race"      "raced"     "hispan"   
## [26] "hispand"   "bpl"       "bpld"      "citizen"   "yrsusa1"  
## [31] "language"  "languaged" "speakeng"  "educ"      "educd"    
## [36] "empstat"   "empstatd"  "labforce"  "occ"       "ind"      
## [41] "inctot"    "incwage"   "poverty"   "hwsei"     "migrate1" 
## [46] "migrate1d" "carpool"   "trantime"

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

We calculate the standard error for the mean household size of the American population

# Subset for the "American" group: rows with relate == 1 and bpl <= 120
# (presumably householders born in the US or its territories -- confirm
# against the IPUMS codebook). famsize values 999998/999999 become NA.
newpums2 <- ipums %>%
  mutate(
    familysize = ifelse(famsize %in% c(999998, 999999), NA, famsize)
  ) %>%
  filter(relate == 1, bpl <= 120)

# na.rm = TRUE so a recoded NA cannot turn the mean into NA; this matches
# the other summaries in this script, which all drop missing values.
mean(newpums2$familysize, na.rm = TRUE)

## [1] 2.290301

# Standard deviation, mean and row count of family size for this group.
# sd() now drops NAs like mean() does, so both use the same observations.
newpums2 %>%
  summarise(
    newsd = sd(familysize, na.rm = TRUE),
    meannew = mean(familysize, na.rm = TRUE),
    n = n()
  )

## # A tibble: 1 x 3
##      newsd  meannew      n
##      <dbl>    <dbl>  <int>
## 1 1.332221 2.290301 101412

# Normal-approximation confidence interval for the mean of `data`.
#
# data:       numeric vector; NA values are ignored.
# conf.level: confidence level of the interval (default 0.95).
# Returns c(lower, upper).
norm.interval <- function(data, conf.level = 0.95) {
  z <- qnorm((1 - conf.level) / 2, lower.tail = FALSE)
  # Count only non-missing values: var() and mean() drop NAs, so dividing
  # by length(data) would understate the standard error when NAs exist.
  n <- sum(!is.na(data))
  xbar <- mean(data, na.rm = TRUE)
  se <- sqrt(var(data, na.rm = TRUE) / n)
  c(xbar - z * se, xbar + z * se)
}

# Confidence interval for mean family size at the default 95% level.
norm.interval(newpums2[["familysize"]])

## [1] 2.282102 2.298500

We calculate the 95% confidence interval for the mean family size of American households using the normal approximation. The confidence intervals for American and foreign households do not overlap.

 # Chi-square confidence interval for the variance of `data`.
#
# data:       numeric vector; NA values are ignored.
# conf.level: confidence level of the interval (default 0.95).
# Returns c(lower, upper).
var.interval <- function(data, conf.level = 0.95) {
  # Degrees of freedom come from the non-missing count, because var()
  # below drops NAs; length(data) would overstate them.
  df <- sum(!is.na(data)) - 1
  tail_prob <- (1 - conf.level) / 2
  chilower <- qchisq(tail_prob, df)
  chiupper <- qchisq(tail_prob, df, lower.tail = FALSE)
  v <- var(data, na.rm = TRUE)
  c(df * v / chiupper, df * v / chilower)
}
# Sample variance of family size, followed by its confidence interval.
var(newpums2[["familysize"]], na.rm = TRUE)

## [1] 1.774813

var.interval(newpums2[["familysize"]])

## [1] 1.759466 1.790363

We complete the Bootstrap method for the American households

# Bootstrap: draw n.sim resamples (with replacement, same size as the data)
# and store the mean and variance of each resample.
n.sim <- 1000

mus <- numeric(n.sim)
vars <- numeric(n.sim)
for (i in seq_len(n.sim)) {
  dat <- sample(
    newpums2$familysize,
    size = length(newpums2$familysize),
    replace = TRUE
  )
  mus[i] <- mean(dat, na.rm = TRUE)
  vars[i] <- var(dat, na.rm = TRUE)
}


# Two histograms side by side: bootstrap means and bootstrap variances.
# Each vertical line marks the same statistic computed on the full sample.
par(mfrow = c(1, 2))
hist(mus, freq = FALSE, main = "Bootstrap distribution of means")
abline(v = mean(newpums2$familysize, na.rm = TRUE), col = 2, lwd = 3)

hist(vars, freq = FALSE, main = "Bootstrap distribution of variance")
abline(v = var(newpums2$familysize, na.rm = TRUE), col = 2, lwd = 3)

We calculate the standard error for the mean household size of the Foreign population

# Subset for the "Foreign" group: rows with relate == 1 and bpl > 120.
# The cut-off is strict because the American group is selected with
# bpl <= 120; using bpl >= 120 here would put code 120 in both groups.
# NOTE(review): printed results further down were produced with the old
# filter and may shift slightly once the script is re-run.
newpums2 <- ipums %>%
  mutate(
    familysize2 = ifelse(famsize %in% c(999998, 999999), NA, famsize)
  ) %>%
  filter(relate == 1, bpl > 120)

# na.rm = TRUE so a recoded NA cannot turn the mean into NA.
mean(newpums2$familysize2, na.rm = TRUE)

## [1] 2.934445

# Standard deviation, mean and row count of family size for this group.
# sd() now drops NAs like mean() does, so both use the same observations.
newpums2 %>%
  summarise(
    newsd = sd(familysize2, na.rm = TRUE),
    meannew = mean(familysize2, na.rm = TRUE),
    n = n()
  )

## # A tibble: 1 x 3
##      newsd  meannew     n
##      <dbl>    <dbl> <int>
## 1 1.683525 2.934445 15895

# Normal-approximation confidence interval for the mean of `data`.
#
# data:       numeric vector; NA values are ignored.
# conf.level: confidence level of the interval (default 0.95).
# Returns c(lower, upper).
norm.interval <- function(data, conf.level = 0.95) {
  z <- qnorm((1 - conf.level) / 2, lower.tail = FALSE)
  # Use the number of non-missing values: var() and mean() drop NAs, so
  # length(data) would make the standard error too small when NAs exist.
  n_obs <- sum(!is.na(data))
  xbar <- mean(data, na.rm = TRUE)
  sdx <- sqrt(var(data, na.rm = TRUE) / n_obs)
  c(xbar - z * sdx, xbar + z * sdx)
}

# Confidence interval for mean family size at the default 95% level.
norm.interval(newpums2[["familysize2"]])

## [1] 2.908273 2.960617

We repeat the calculation of the 95% confidence interval for the mean family size, this time for foreign households, using the normal approximation. The confidence intervals for American and foreign households do not overlap.

# Chi-square confidence interval for the variance of `data`.
#
# data:       numeric vector; NA values are ignored.
# conf.level: confidence level of the interval (default 0.95).
# Returns c(lower, upper).
var.interval <- function(data, conf.level = 0.95) {
  # Degrees of freedom are based on the non-missing count, since var()
  # below drops NAs; length(data) would overstate them.
  df <- sum(!is.na(data)) - 1
  tail_prob <- (1 - conf.level) / 2
  chilower <- qchisq(tail_prob, df)
  chiupper <- qchisq(tail_prob, df, lower.tail = FALSE)
  v <- var(data, na.rm = TRUE)
  c(df * v / chiupper, df * v / chilower)
}
# Sample variance of family size, followed by its confidence interval.
var(newpums2[["familysize2"]], na.rm = TRUE)

## [1] 2.834258

var.interval(newpums2[["familysize2"]])

## [1] 2.772961 2.897619

We complete the Bootstrap method for the Foreign households

# Bootstrap: draw n.sim resamples (with replacement, same size as the data)
# and store the mean and variance of each resample.
n.sim <- 1000

mus <- numeric(n.sim)
vars <- numeric(n.sim)
for (i in seq_len(n.sim)) {
  dat <- sample(
    newpums2$familysize2,
    size = length(newpums2$familysize2),
    replace = TRUE
  )
  mus[i] <- mean(dat, na.rm = TRUE)
  vars[i] <- var(dat, na.rm = TRUE)
}


# Two histograms side by side: bootstrap means and bootstrap variances.
# Each vertical line marks the same statistic computed on the full sample.
par(mfrow = c(1, 2))
hist(mus, freq = FALSE, main = "Bootstrap distribution of means")
abline(v = mean(newpums2$familysize2, na.rm = TRUE), col = 2, lwd = 3)

hist(vars, freq = FALSE, main = "Bootstrap distribution of variance")
abline(v = var(newpums2$familysize2, na.rm = TRUE), col = 2, lwd = 3)

Assignment #4

Paulina Cano

September 15, 2017