# Install dplyr from CRAN. The package name must be spelled "dplyr" to match
# the library(dplyr) call below; a misspelled name is reported by CRAN as
# "not available" and nothing is installed.
install.packages("dplyr", repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/Uriel/Documents/R/win-library/3.4'
## (as 'lib' is unspecified)
install.packages("ggplot2", repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/Uriel/Documents/R/win-library/3.4'
## (as 'lib' is unspecified)
## package 'ggplot2' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Uriel\AppData\Local\Temp\Rtmpe46sfR\downloaded_packages
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(haven)
library(ggplot2)
# Read the Stata (.dta) extract from GitHub into `acs`; every analysis step
# below starts from this data frame.
acs <- read_dta(
  "https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true"
)

1) Filter the data to keep only household heads and create a histogram of family size.

# Histogram of family size, restricted to household heads (relate == 1).
acs %>%
  filter(relate == 1) %>%
  # famsize is read by haven as a labelled column; strip the labels so
  # ggplot2 receives a plain numeric vector.
  mutate(famsize = as.numeric(famsize)) %>%
  ggplot(aes(x = famsize)) +
  # Only `binwidth` is supplied: ggplot2 ignores `bins` whenever `binwidth`
  # is also given, so passing both was misleading.
  geom_histogram(binwidth = 0.5) +
  # famsize is numeric, so the tick marks belong on a continuous scale
  # (a discrete scale with numeric limits is not meant for numeric data).
  scale_x_continuous(
    name = "Number of own family members in household",
    breaks = 1:20
  ) +
  labs(
    title = "Family size",
    subtitle = "Data from ACS 2015",
    y = "Number of Households"
  )

2) Estimate the average family size for families whose head was born outside the US and for families whose head was born in the US.

# Family-size summary statistics (mean, median, sd, count) for household
# heads, split by the head's birthplace group: bpl codes 1-120 are coded
# "US born" and codes 121-998 "Foreign born"; any other code becomes NA.
acs %>%
  filter(relate == 1) %>%
  # Use the piped column `bpl` rather than `acs$bpl`: the `$` form always
  # reads the full, unfiltered data frame, so it breaks (or misaligns) as
  # soon as rows are filtered before this step.
  mutate(
    usborn = case_when(
      bpl %in% 1:120 ~ "US born",
      bpl %in% 121:998 ~ "Foreign born"
    )
  ) %>%
  group_by(usborn) %>%
  summarise(mean(famsize), median(famsize), sd(famsize), n())
## # A tibble: 2 x 5
##         usborn `mean(famsize)` `median(famsize)` `sd(famsize)`  `n()`
##          <chr>           <dbl>         <dbl+lbl>         <dbl>  <int>
## 1 Foreign born        2.934445                 3      1.683525  15895
## 2      US born        2.290301                 2      1.332221 101412

3) Create the histogram from part 2 for US born and Foreign born, as well as a box and whisker plot for the two groups.

Histogram

# Histograms of family size for household heads (relate == 1), one panel per
# birthplace group of the head.
acs %>%
  filter(relate == 1) %>%
  mutate(
    # Use the piped column `bpl` rather than `acs$bpl`, which always reads
    # the full, unfiltered data frame.
    usborn = case_when(
      bpl %in% 1:120 ~ "US born",
      bpl %in% 121:998 ~ "Foreign born"
    ),
    # Strip haven value labels so ggplot2 receives a plain numeric vector.
    famsize = as.numeric(famsize)
  ) %>%
  # No group_by() here: grouping has no effect on ggplot(); the panels come
  # from facet_wrap().
  ggplot(aes(x = famsize)) +
  geom_histogram(binwidth = 0.5) +
  # famsize is numeric, so the tick marks belong on a continuous scale.
  scale_x_continuous(
    name = "Number of own family members in household",
    breaks = seq(0, 20, by = 2)
  ) +
  facet_wrap(~usborn) +
  labs(
    title = "Family size by birthplace of household head",
    subtitle = "Data from ACS 2015",
    y = "Number of households"
  )

Boxplot

# Box-and-whisker plot of family size for household heads (relate == 1),
# one box per birthplace group of the head.
acs %>%
  filter(relate == 1) %>%
  mutate(
    # Use the piped column `bpl` rather than `acs$bpl`, which always reads
    # the full, unfiltered data frame.
    usborn = case_when(
      bpl %in% 1:120 ~ "US born",
      bpl %in% 121:998 ~ "Foreign born"
    ),
    # Strip haven value labels; otherwise ggplot2 reports that it cannot
    # pick a scale for a 'labelled' object and falls back to continuous.
    famsize = as.numeric(famsize)
  ) %>%
  ggplot(aes(x = usborn, y = famsize)) +
  # `ymin` / `ymax` are deliberately not set as constants: for a boxplot they
  # are the whisker-end aesthetics computed from the data, not axis limits.
  geom_boxplot() +
  labs(
    title = "Boxplot of family size by birthplace of household head",
    subtitle = "Data from ACS 2015",
    x = "Household head by birthplace",
    y = "Number of own family members in household"
  )

4) Estimate the average family size by age and create an x-y scatter plot of these estimates using the average family size as the y-axis and age of the household head as the x-axis.

# Scatter plot of mean family size against the age of the household head
# (relate == 1), with a smoothed trend line on top.
acs %>%
  filter(relate == 1) %>%
  # as.numeric() drops any haven value labels on age (it is a no-op for a
  # plain numeric column), so the continuous x scale gets plain numbers.
  mutate(age = as.numeric(age)) %>%
  group_by(age) %>%
  # Only the per-age mean is plotted, so it is the only summary computed;
  # the birthplace grouping is not needed for this plot.
  summarise(avefs = mean(famsize)) %>%
  ggplot(aes(x = age, y = avefs)) +
  geom_point(size = 0.9) +
  geom_smooth() +
  # Age is numeric, so the tick marks belong on a continuous scale.
  scale_x_continuous(name = "Age", breaks = seq(15, 105, by = 5)) +
  labs(
    title = "Average of family size by age of the household head",
    subtitle = "Data from ACS 2015",
    y = "Average of family size"
  )
## `geom_smooth()` using method = 'loess'