library(dplyr) #to manipulate data
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2) #to visualize data

## Loading Data
library(haven)
# Read the Stata (.dta) extract straight from GitHub into `acs`,
# then list the column names it provides.
acs <- read_dta(
  file = "https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true"
)
names(acs)
##  [1] "year"      "datanum"   "serial"    "hhwt"      "statefip" 
##  [6] "met2013"   "puma"      "gq"        "pernum"    "perwt"    
## [11] "famsize"   "nchild"    "nchlt5"    "eldch"     "nsibs"    
## [16] "relate"    "related"   "sex"       "age"       "marst"    
## [21] "birthyr"   "fertyr"    "race"      "raced"     "hispan"   
## [26] "hispand"   "bpl"       "bpld"      "citizen"   "yrsusa1"  
## [31] "language"  "languaged" "speakeng"  "educ"      "educd"    
## [36] "empstat"   "empstatd"  "labforce"  "occ"       "ind"      
## [41] "inctot"    "incwage"   "poverty"   "hwsei"     "migrate1" 
## [46] "migrate1d" "carpool"   "trantime"

1 & 2) Filtering the data and Creating Histogram

# Histogram of family size, keeping only rows where relate == 1
# (household heads) so that each household is counted once.
acs %>%
  filter(relate == 1) %>%
  ggplot(mapping = aes(x = famsize)) +
  geom_histogram() +
  labs(
    title = "Family Size",
    subtitle = "2015 American Community Survey Data",
    x = "Number of own Family Members in Household",
    y = "Number of Households"
  )
## Don't know how to automatically pick scale for object of type labelled. Defaulting to continuous.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

3) Numerical Estimation

Mean and Standard Deviation of Family Size according to Birth Place

# Mean, standard deviation, and count of family size for household heads
# (relate == 1), split by a two-level birthplace label derived from `bpl`:
# codes 1-120 are labelled "US Born", codes 121-998 "Foreign Born".
acs %>%
  filter(relate == 1) %>%
  mutate(
    BirthPlace = case_when(
      bpl %in% 1:120 ~ "US Born",
      bpl %in% 121:998 ~ "Foreign Born"
    )
  ) %>%
  group_by(BirthPlace) %>%
  summarise(mean(famsize), sd(famsize), n())
## # A tibble: 2 x 4
##     BirthPlace `mean(famsize)` `sd(famsize)`  `n()`
##          <chr>           <dbl>         <dbl>  <int>
## 1 Foreign Born        2.934445      1.683525  15895
## 2      US Born        2.290301      1.332221 101412

4) Histogram according to Birthplace

# Family-size histograms for household heads (relate == 1), one facet per
# birthplace label: `bpl` codes 1-120 are labelled "US Born", codes 121-998
# "Foreign Born".
acs %>%
  filter(relate == 1) %>%
  mutate(
    BirthPlace = case_when(
      bpl %in% 1:120 ~ "US Born",
      bpl %in% 121:998 ~ "Foreign Born"
    )
  ) %>%
  ggplot() +
  # `bins` is a layer parameter, not an aesthetic: placed inside aes() it was
  # ignored with a warning and the default of 30 bins was used instead.
  geom_histogram(mapping = aes(famsize, colour = BirthPlace), bins = 10) +
  facet_wrap(~BirthPlace) +
  xlab(label = "Number of Own Family Members in Household") +
  ylab(label = "Number of Households") +
  # The title has to be chained with `+`; as a separate statement it only
  # printed a labels object and never reached the plot.
  ggtitle(
    label = "Family Size by BirthPlace of Household Head",
    subtitle = "2015 American Community Survey Data"
  )
## Don't know how to automatically pick scale for object of type labelled. Defaulting to continuous.

4.1) Box and Whisker Plot

# Box-and-whisker plot of family size for household heads (relate == 1),
# one box per birthplace label, drawn horizontally via coord_flip().
acs %>%
  filter(relate == 1) %>%
  mutate(
    BirthPlace = case_when(
      bpl %in% 1:120 ~ "US Born",
      bpl %in% 121:998 ~ "Foreign Born"
    )
  ) %>%
  group_by(BirthPlace) %>%
  ggplot(mapping = aes(x = BirthPlace, y = famsize, fill = BirthPlace)) +
  geom_boxplot() +
  coord_flip() +
  labs(
    title = "Family Size by BirthPlace of Household Head",
    subtitle = "2015 American Community Survey Data"
  )
## Don't know how to automatically pick scale for object of type labelled. Defaulting to continuous.

5) X-Y Scatterplot with a smooth line

# Average family size of household heads (relate == 1) at each age, drawn as
# points with a smoothed trend line on top.
# The BirthPlace recode was removed here: summarise() by age discarded it,
# so it never influenced the plot.
acs %>%
  filter(relate == 1) %>%
  group_by(age) %>%
  summarise(avefs = mean(famsize), sd(famsize), n()) %>%
  # x and y are shared by both layers, so map them once.
  ggplot(mapping = aes(x = age, y = avefs)) +
  geom_point(aes(color = age), size = .5) +
  # No colour mapping on the smoother: a colour that varies point by point
  # cannot be carried through the fitted curve, so it had no effect.
  geom_smooth() +
  xlab(label = "age of the Household Head") +
  ylab(label = "Average Family Size") +
  ggtitle(
    label = "Average Family Size by Age of the Household Head",
    subtitle = "2015 American Community Survey Data"
  )
## Don't know how to automatically pick scale for object of type labelled. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type labelled. Defaulting to continuous.
## `geom_smooth()` using method = 'loess'