# haven supplies read_dta() for reading Stata .dta files.
library(haven)
# Read the Stata extract straight from GitHub into `ipums`; this needs network
# access (the `?raw=true` suffix requests the file itself, not the HTML page).
ipums<-read_dta("https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true")
## `curl` package not installed, falling back to using `url()`
names(ipums) # print the column names of the loaded data
##  [1] "year"      "datanum"   "serial"    "hhwt"      "statefip" 
##  [6] "met2013"   "puma"      "gq"        "pernum"    "perwt"    
## [11] "famsize"   "nchild"    "nchlt5"    "eldch"     "nsibs"    
## [16] "relate"    "related"   "sex"       "age"       "marst"    
## [21] "birthyr"   "fertyr"    "race"      "raced"     "hispan"   
## [26] "hispand"   "bpl"       "bpld"      "citizen"   "yrsusa1"  
## [31] "language"  "languaged" "speakeng"  "educ"      "educd"    
## [36] "empstat"   "empstatd"  "labforce"  "occ"       "ind"      
## [41] "inctot"    "incwage"   "poverty"   "hwsei"     "migrate1" 
## [46] "migrate1d" "carpool"   "trantime"
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library("ggplot2")

The histogram below shows the distribution of family size among household heads (relate == 1).

# Histogram of family size, restricted to rows with relate == 1.
# The famsize codes 999998 and 999999 are recoded to NA before plotting.
ipums %>%
  filter(relate == 1) %>%
  mutate(
    familysize = ifelse(famsize %in% c(999998, 999999), NA, famsize)
  ) %>%
  ggplot(mapping = aes(x = familysize)) +
  geom_histogram(binwidth = 0.5)

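We calculate the mean and standard deviation of family size, together with the number of household heads, for each birthplace group.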

# Family-size summary for rows with relate == 1, split by birthplace
# (bpl codes 1-120 are labelled "American", everything else "Foreign").
# Returns one row per group with the standard deviation, mean and row count.
ipums %>%
  mutate(
    familysize = ifelse(famsize %in% c(999998, 999999), NA, famsize)
  ) %>%
  filter(relate == 1) %>%
  mutate(birthplace = ifelse(bpl %in% 1:120, "American", "Foreign")) %>%
  group_by(birthplace) %>%
  summarise(
    # na.rm is set on both statistics: familysize may contain the NAs created
    # by the recode above, and a single NA would otherwise make the sd NA.
    newsd = sd(familysize, na.rm = TRUE),
    meannew = mean(familysize, na.rm = TRUE),
    n = n()
  )
## # A tibble: 2 x 4
##   birthplace    newsd  meannew      n
##        <chr>    <dbl>    <dbl>  <int>
## 1   American 1.332221 2.290301 101412
## 2    Foreign 1.683525 2.934445  15895

Below are histograms of family size for American-born and foreign-born household heads, respectively.

# Family-size histograms for rows with relate == 1, one panel per birthplace
# group (bpl codes 1-120 are "American", all other codes "Foreign").
ipums %>%
  filter(relate == 1) %>%
  mutate(
    familysize = ifelse(famsize %in% c(999998, 999999), NA, famsize),
    birthplace = ifelse(bpl %in% 1:120, "American", "Foreign")
  ) %>%
  ggplot(mapping = aes(x = familysize)) +
  geom_histogram(binwidth = 0.5) +
  facet_wrap(~ birthplace)

Below is a box plot of family size comparing United States-born and foreign-born household heads.

# Box plot of family size for rows with relate == 1, comparing birthplaces
# coded 1-120 ("United States") with those coded 150-999 ("Foreign").
ipums %>%
  mutate(
    familysize = ifelse(famsize %in% c(999998, 999999), NA, famsize)
  ) %>%
  filter(relate == 1, bpl %in% c(1:120, 150:999)) %>%
  mutate(
    birthplace = ifelse(bpl %in% 1:120, "American", "Foreign"),
    # Membership must be tested with %in%. `bpl == 1:120` compares row i with
    # the i-th element of a recycled range, which misclassifies rows and emits
    # "longer object length is not a multiple of shorter object length".
    CountryRegion = case_when(
      bpl %in% 1:120 ~ "United States",
      bpl %in% 150:999 ~ "Foreign"
    )
  ) %>%
  ggplot() +
  geom_boxplot(aes(x = CountryRegion, y = familysize))

Below is a more detailed box plot in which birthplace is broken down into world regions.

 # Horizontal box plot of family size for rows with relate == 1, with
 # birthplace (bpl) grouped into world regions.
 ipums %>%
   mutate(
     familysize = ifelse(famsize %in% c(999998, 999999), NA, famsize)
   ) %>%
   filter(
     relate == 1,
     bpl %in% c(
       1:120, 150:300, 400:499, 500:519, 520:547, 548:599,
       600, 700, 710, 800, 900, 950, 999
     )
   ) %>%
   mutate(
     birthplace = ifelse(bpl %in% 1:120, "American", "Foreign"),
     # Ranges are matched with %in%. `bpl == 150:300` would compare row i with
     # the i-th element of a recycled range, misclassifying most rows and
     # emitting "longer object length is not a multiple of shorter object
     # length" warnings.
     CountryRegion = case_when(
       bpl %in% 1:120 ~ "United States",
       # NOTE(review): 150-300 also covers codes outside Central and South
       # America (150 appears to be Canada) - confirm the intended label.
       bpl %in% 150:300 ~ "Central and South America",
       bpl %in% 400:499 ~ "Europe",
       bpl %in% 500:519 ~ "Asia",
       bpl %in% 520:547 ~ "Middle East",
       bpl %in% 548:599 ~ "Other Asia",
       # Code 600 is Africa in the IPUMS BPL general codes; it was previously
       # labelled "Asia", which merged it with codes 500-519.
       bpl == 600 ~ "Africa",
       bpl == 700 ~ "Australia New Zealand",
       bpl == 710 ~ "Pacific Islands",
       bpl == 800 ~ "Antarctica",
       bpl == 900 ~ "Unknown",
       bpl == 950 ~ "Other",
       bpl == 999 ~ "Missing"
     )
   ) %>%
   ggplot() +
   geom_boxplot(aes(x = CountryRegion, y = familysize)) +
   coord_flip()

Below is a scatterplot of family size against the age of the household head, with a smoothed trend line for each birthplace group.

# Scatter plot of family size against age for rows with relate == 1, with one
# smoothed trend line per birthplace group (bpl 1-120 "American", otherwise
# "Foreign").
ipums %>%
  mutate(
    familysize = ifelse(famsize %in% c(999998, 999999), NA, famsize)
  ) %>%
  filter(relate == 1) %>%
  mutate(birthplace = ifelse(bpl %in% 1:120, "American", "Foreign")) %>%
  # No group_by() here: ggplot2 ignores dplyr grouping, and the per-group
  # smoothers come from the colour aesthetic on geom_smooth() below.
  ggplot(mapping = aes(x = age, y = familysize)) +
  geom_point() +
  geom_smooth(aes(colour = birthplace)) +
  ggtitle(label = "Age of Household Head by Average Family Size") +
  xlab(label = "age of the household head") +
  ylab(label = "average family size")
## Don't know how to automatically pick scale for object of type labelled. Defaulting to continuous.
## `geom_smooth()` using method = 'gam'