Homework 5.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(haven)
library(haven)
library(ggplot2)
# Read the Stata (.dta) extract from GitHub into `acs`; every later step in
# this document starts from this data frame.
acs <- read_dta(
  "https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true"
)

Descriptive statistics for U.S.-born and foreign-born household heads.

# Household heads (relate == 1) split by the head's birthplace code `bpl`:
# codes 1-120 are tagged "US born", codes 121-998 "Foreign born".
# `bpl` is referenced as a bare column so the recode always uses the rows
# flowing through the pipeline rather than reaching back into the global
# `acs` object (which would silently misalign if a filter came first).
usborn <- acs %>%
  mutate(usborn = case_when(bpl %in% 1:120 ~ "US born")) %>%
  filter(relate == 1, usborn == "US born")

foreignborn <- acs %>%
  mutate(usborn = case_when(bpl %in% 121:998 ~ "Foreign born")) %>%
  filter(relate == 1, usborn == "Foreign born")

# Summary table: one row per group with the mean, median and standard
# deviation of family size and the number of household heads.
tribble(
  ~"Birth place of household head",
  ~"Mean family size",
  ~"Median family size",
  ~"Standard deviation",
  ~"Number of cases",
  "U.S. Born",
  mean(usborn$famsize),
  median(usborn$famsize),
  sd(usborn$famsize),
  length(usborn$famsize),
  "Foreign born",
  mean(foreignborn$famsize),
  median(foreignborn$famsize),
  sd(foreignborn$famsize),
  length(foreignborn$famsize)
)
## # A tibble: 2 x 5
##   `Birth place of household head` `Mean family size` `Median family size`
##                             <chr>              <dbl>                <dbl>
## 1                       U.S. Born           2.290301                    2
## 2                    Foreign born           2.934445                    3
## # ... with 2 more variables: `Standard deviation` <dbl>, `Number of
## #   cases` <int>

Histograms and boxplots.

# Histogram of family size for household heads, one panel per birthplace
# group of the head.
acs %>%
  mutate(
    usborn = case_when(
      bpl %in% 1:120 ~ "US born",
      bpl %in% 121:998 ~ "Foreign born"
    ),
    # Strip the haven value labels so ggplot2 sees a plain numeric variable.
    famsize = as.numeric(famsize)
  ) %>%
  filter(relate == 1) %>%
  ggplot(aes(x = famsize)) +
  geom_histogram(binwidth = .5) +
  # famsize is a count, so it belongs on a continuous scale; `breaks` puts
  # the ticks at 0, 2, ..., 20 and the scale name is the single source of
  # the axis title.
  scale_x_continuous(
    "Number of own family members in household",
    breaks = seq(0, 20, by = 2)
  ) +
  # Keep the axis spanning at least 0-20 without dropping any observations.
  expand_limits(x = c(0, 20)) +
  facet_wrap(~usborn) +
  ggtitle("Family size by birthplace of household head", "Data from ACS 2015") +
  ylab("Number of households")

# Boxplot of family size for household heads, by birthplace group of the head.
acs %>%
  mutate(
    usborn = case_when(
      bpl %in% 1:120 ~ "US born",
      bpl %in% 121:998 ~ "Foreign born"
    ),
    # Plain numeric y avoids ggplot2's "Don't know how to automatically pick
    # scale for object of type labelled" message.
    famsize = as.numeric(famsize)
  ) %>%
  filter(relate == 1) %>%
  ggplot(aes(x = usborn, y = famsize)) +
  # No fixed ymin/ymax here: as constants they replace the whisker ends
  # computed from the data, drawing every whisker from 1 to 20.
  geom_boxplot() +
  # NOTE(review): presumably the 1-20 range was meant as the visible axis
  # range; coord_cartesian() zooms without discarding data -- confirm.
  coord_cartesian(ylim = c(1, 20)) +
  ggtitle(
    "Boxplot of family size by birthplace of household head",
    "Data from ACS 2015"
  ) +
  xlab("Household head by birthplace") +
  ylab("Number of own family members in household")

Do family sizes differ by the birthplace of the household head?

library(broom)
# Analysis sample: household heads (relate == 1) with a birthplace group
# derived from `bpl` (1-120 -> "US born", 121-998 -> "Foreign born").
# `bpl` is used as a bare column so the recode follows the piped data.
acs_hh <- acs %>%
  mutate(
    usborn = case_when(
      bpl %in% 1:120 ~ "US born",
      bpl %in% 121:998 ~ "Foreign born"
    )
  ) %>%
  filter(relate == 1)

# Linear model of family size on birthplace group. "Foreign born" is the
# reference level, so the intercept is its mean and the `usbornUS born`
# coefficient is the US-born minus foreign-born difference.
hh <- lm(famsize ~ usborn, data = acs_hh)
tidy(hh)
##            term   estimate  std.error statistic p.value
## 1   (Intercept)  2.9344448 0.01098588 267.11061       0
## 2 usbornUS born -0.6441438 0.01181550 -54.51685       0
# Welch two-sample t-test of family size between the two birthplace groups.
# It does not pool the group variances, which is why its t statistic (46.0)
# differs in size from the lm() one above (54.5) even though the estimated
# difference in means is the same.
t.test(acs_hh$famsize~acs_hh$usborn)
## 
##  Welch Two Sample t-test
## 
## data:  acs_hh$famsize by acs_hh$usborn
## t = 46.032, df = 19138, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.6167158 0.6715719
## sample estimates:
## mean in group Foreign born      mean in group US born 
##                   2.934445                   2.290301
# Normal Q-Q plot of the studentized residuals of `hh`, used to judge how far
# the residuals depart from normality before trying transformations.
qqnorm(rstudent(hh), main="Q-Q Plot for Model Residuals")

Applying a log-transformation

# Same model as `hh`, with the outcome on the natural-log scale.
hh_log <- lm(log(famsize) ~ usborn, data = acs_hh)
hh_log %>% tidy()
##            term   estimate   std.error statistic p.value
## 1   (Intercept)  0.9053601 0.004402847 205.63063       0
## 2 usbornUS born -0.2286906 0.004735337 -48.29448       0
# Back-transform the fitted group means to the family-size scale. exp() is the
# inverse of log(), so these are geometric means: the intercept gives the
# foreign-born group, intercept + slope gives the US-born group.

exp(coef(hh_log))["(Intercept)"]
## (Intercept) 
##    2.472822
hh_log %>% coef() %>% sum() %>% exp()
## [1] 1.967315
tribble(
  ~"US born", ~"Foreign born",
  exp(sum(coef(hh_log))), exp(coef(hh_log))["(Intercept)"]
)
## # A tibble: 1 x 2
##   `US born` `Foreign born`
##       <dbl>          <dbl>
## 1  1.967315       2.472822

Applying a square root transformation

# Same model as `hh`, with the outcome on the square-root scale.
hh_sqrt <- lm(sqrt(famsize) ~ usborn, data = acs_hh)
tidy(hh_sqrt)
##            term   estimate   std.error statistic p.value
## 1   (Intercept)  1.6438702 0.003334192 493.03400       0
## 2 usbornUS born -0.1870618 0.003585981 -52.16474       0
# Back-transform the fitted group means to the family-size scale. The inverse
# of sqrt() is squaring (not exp(), which belongs to the log model only).
# Intercept = foreign-born group; intercept + slope = US-born group.
# The approximate values below are derived from the coefficients printed
# above; re-knit the document to refresh the echoed output.

coef(hh_sqrt)["(Intercept)"]^2
# Foreign born: 1.6438702^2, approximately 2.70
sum(coef(hh_sqrt))^2
# US born: (1.6438702 - 0.1870618)^2, approximately 2.12
tribble(
  ~"US born", ~"Foreign born",
  sum(coef(hh_sqrt))^2, coef(hh_sqrt)["(Intercept)"]^2
)
# Expected table: US born approximately 2.12, Foreign born approximately 2.70

Applying a reciprocal transformation

# Same model as `hh`, with the outcome on the reciprocal scale (1 / famsize).
hh_recip <- lm(I(1 / famsize) ~ usborn, data = acs_hh)
tidy(hh_recip)
##            term   estimate   std.error statistic p.value
## 1   (Intercept) 0.48544443 0.002368842 204.92901       0
## 2 usbornUS born 0.09986373 0.002547730  39.19714       0
# Back-transform the fitted group means to the family-size scale. The inverse
# of 1/y is again 1/x (not exp(), which here would even reverse the ordering
# of the two groups). Intercept = foreign-born group; intercept + slope =
# US-born group. The approximate values below are derived from the
# coefficients printed above; re-knit the document to refresh the output.

1 / coef(hh_recip)["(Intercept)"]
# Foreign born: 1 / 0.48544443, approximately 2.06
1 / sum(coef(hh_recip))
# US born: 1 / (0.48544443 + 0.09986373), approximately 1.71
tribble(
  ~"US born", ~"Foreign born",
  1 / sum(coef(hh_recip)), 1 / coef(hh_recip)["(Intercept)"]
)
# Expected table: US born approximately 1.71, Foreign born approximately 2.06
# Side-by-side normal Q-Q plots of the studentized residuals from the three
# transformed models. par() returns the previous settings, which are restored
# afterwards so later plots are not forced into the 1 x 3 layout.
op <- par(mfrow = c(1, 3))
qqnorm(rstudent(hh_log), main = "Q-Q Plot. Residuals-log transformation")
qqnorm(rstudent(hh_sqrt), main = "Q-Q Plot. Residuals-sqrt transformation")
qqnorm(rstudent(hh_recip), main = "Q-Q Plot. Residuals-reciprocal transformation")
par(op)