library(haven)
library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
# Standard error of the mean of `x`.
#
# Missing values are ignored both when computing the standard deviation and
# when counting observations, so the result is sd / sqrt(n) over the non-NA
# values only.
#
# x: numeric vector (may contain NA).
# Returns a single numeric value (NA when fewer than two non-NA values exist,
# because sd() is undefined there).
mySE <- function(x) {
  n_obs <- sum(!is.na(x))
  sd(x, na.rm = TRUE) / sqrt(n_obs)
}
# Load the Stata extract straight from GitHub.
ipums <- read_dta("https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true")

# Split records by birthplace code (bpl): 1-120 vs. 121-998.
# Both bounds must be combined with `&` in a single condition: the third
# positional argument of subset() is `select` (column selection), so a second
# comparison passed there is never applied as a row filter.
myForeignHead <- subset(ipums, bpl >= 121 & bpl <= 998)
myUSHead <- subset(ipums, bpl >= 1 & bpl <= 120)

# Keep only the rows with relate == 1 in each group.
myForeignHeadR <- subset(myForeignHead, relate == 1)
myUSHeadR <- subset(myUSHead, relate == 1)

1) Descriptive Statistics and Graphs for each Group (US vs. Foreign Born Household Heads)

Creating a subset to only use households with “Head of Household” (as done before):

# Keep only the rows where relate == 1.
onlyheads <- ipums %>%
  filter(relate == 1)

Mean, standard deviation, and standard error of family size for U.S.-born vs. foreign-born household heads (from the previous homework)

# Rows with relate == 1, tagged by birthplace group.
# `bpl` is referenced as a column of the piped data (not `ipums$bpl`), so the
# expression stays correct regardless of where it sits in the pipeline.
# The ranges are disjoint and use the same 120/121 cut-off as the subsets
# built earlier; codes outside 1-998 yield NA for `bornus`.
onlyheads <- ipums %>%
  filter(relate == 1) %>%
  mutate(
    bornus = case_when(
      bpl %in% 1:120 ~ "US Born",
      bpl %in% 121:998 ~ "Foreign Born"
    )
  )

# Per-group mean, standard deviation and standard error of family size.
# The summary expressions are left unnamed so the printed column headers stay
# `mean(famsize)`, `sd(famsize)` and `mySE(famsize)`.
summarise(
  group_by(onlyheads, bornus),
  mean(famsize),
  sd(famsize),
  mySE(famsize)
)
## # A tibble: 2 x 4
##         bornus `mean(famsize)` `sd(famsize)` `mySE(famsize)`
##          <chr>           <dbl>         <dbl>           <dbl>
## 1 Foreign Born        2.934445      1.683525    1.059154e-04
## 2      US Born        2.290301      1.332221    1.313672e-05

Histogram

# Histogram of family size, one panel per birthplace group.
ipums %>%
  filter(relate == 1) %>%
  mutate(
    bornus = case_when(
      bpl %in% 1:120 ~ "US Born",
      bpl %in% 121:998 ~ "Foreign Born"
    )
  ) %>%
  # as.numeric() strips the haven label class so ggplot2 sees a plain number.
  ggplot(mapping = aes(x = as.numeric(famsize))) +
  # Family size is a whole number: one bin per value avoids the empty
  # half-unit bins that binwidth = .5 produces.
  geom_histogram(binwidth = 1) +
  # The x variable is numeric, so use a continuous scale with explicit breaks
  # rather than a discrete scale. The scale name is the axis title; a separate
  # xlab() would only conflict with it.
  scale_x_continuous(
    "number of family members",
    breaks = seq(0, 20, by = 2)
  ) +
  facet_wrap(~bornus) +
  ggtitle("Family Size by Birth Place") +
  ylab("number of households")

Box Plot

# Box plot of family size by birthplace group.
ipums %>%
  filter(relate == 1) %>%
  mutate(
    bornus = case_when(
      bpl %in% 1:120 ~ "US Born",
      bpl %in% 121:998 ~ "Foreign Born"
    )
  ) %>%
  # as.numeric() strips the haven label class, which avoids the
  # "Don't know how to automatically pick scale" message.
  ggplot(mapping = aes(x = bornus, y = as.numeric(famsize))) +
  # ymin/ymax are whisker aesthetics of geom_boxplot; passing constants for
  # them does not set axis limits. Let the stat compute the whiskers and
  # restrict the visible range with coord_cartesian(), which zooms without
  # dropping observations.
  # NOTE(review): assumes the original ymin = 1 / ymax = 20 were meant as the
  # y-axis range -- confirm.
  geom_boxplot() +
  coord_cartesian(ylim = c(1, 20)) +
  ggtitle("Family Size by Birth Place") +
  xlab("head by birthplace") +
  ylab("number of household members")
## Don't know how to automatically pick scale for object of type labelled. Defaulting to continuous.

2) Linear Model Test for Equality of the Family Size Variable

Summary using the Linear Model

# Regress untransformed family size on the birthplace group and print the
# coefficient table.
library(broom)
myPlot1 <- lm(
  formula = famsize ~ bornus,
  data = onlyheads
)
tidy(x = myPlot1)
##            term   estimate  std.error statistic p.value
## 1   (Intercept)  2.9344448 0.01098588 267.11061       0
## 2 bornusUS Born -0.6441438 0.01181550 -54.51685       0

Graph

# Normal Q-Q plot of the studentized residuals from the untransformed model.
qqnorm(
  y = rstudent(myPlot1),
  main = "Q-Q Plot for Family Size"
)

Transformation using Logarithm

# Same model with a log-transformed response; print the coefficient table.
library(broom)
myPlot2 <- lm(
  formula = log(famsize) ~ bornus,
  data = onlyheads
)
tidy(x = myPlot2)
##            term   estimate   std.error statistic p.value
## 1   (Intercept)  0.9053601 0.004402847 205.63063       0
## 2 bornusUS Born -0.2286906 0.004735337 -48.29448       0

Logarithmic Graph

# Normal Q-Q plot of the studentized residuals from the log model.
qqnorm(
  y = rstudent(myPlot2),
  main = "Q-Q Plot for Logarithmic Family Size Graph"
)

Transformation using Square Root

# Same model with a square-root-transformed response; print the coefficient
# table.
library(broom)
myPlot3 <- lm(
  formula = sqrt(famsize) ~ bornus,
  data = onlyheads
)
tidy(x = myPlot3)
##            term   estimate   std.error statistic p.value
## 1   (Intercept)  1.6438702 0.003334192 493.03400       0
## 2 bornusUS Born -0.1870618 0.003585981 -52.16474       0

Square Rooted Graph

# Normal Q-Q plot of the studentized residuals from the square-root model.
qqnorm(
  y = rstudent(myPlot3),
  main = "Q-Q Plot for Square Rooted Family Size"
)

Reciprocal Transformation

# Same model with a reciprocal response; I() makes 1/famsize arithmetic
# rather than formula syntax. Print the coefficient table.
library(broom)
myPlot4 <- lm(
  formula = I(1 / famsize) ~ bornus,
  data = onlyheads
)
tidy(x = myPlot4)
##            term   estimate   std.error statistic p.value
## 1   (Intercept) 0.48544443 0.002368842 204.92901       0
## 2 bornusUS Born 0.09986373 0.002547730  39.19714       0

Reciprocal Graph

# Normal Q-Q plot of the studentized residuals from the reciprocal model.
qqnorm(
  y = rstudent(myPlot4),
  main = "Q-Q Plot for Reciprocal Family Size"
)