library(dplyr) #to manipulate data
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2) #to visualize data
## Loading Data
library(haven)
# Read the ACS extract (a Stata .dta file) straight from GitHub with haven.
acs <- read_dta(
  "https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true"
)

# Show the column names available in the extract.
names(acs)
## [1] "year" "datanum" "serial" "hhwt" "statefip"
## [6] "met2013" "puma" "gq" "pernum" "perwt"
## [11] "famsize" "nchild" "nchlt5" "eldch" "nsibs"
## [16] "relate" "related" "sex" "age" "marst"
## [21] "birthyr" "fertyr" "race" "raced" "hispan"
## [26] "hispand" "bpl" "bpld" "citizen" "yrsusa1"
## [31] "language" "languaged" "speakeng" "educ" "educd"
## [36] "empstat" "empstatd" "labforce" "occ" "ind"
## [41] "inctot" "incwage" "poverty" "hwsei" "migrate1"
## [46] "migrate1d" "carpool" "trantime"
1 & 2) Filtering the data and Creating Histogram
# Histogram of family size, restricted to household heads (relate == 1).
# famsize is a count of family members, so one bin per value (binwidth = 1)
# is used instead of the default 30 bins, which split the range into
# fractional-width bins and left empty gaps between bars.
acs %>%
  filter(relate == 1) %>% # to only have household heads
  ggplot() +
  geom_histogram(aes(famsize), binwidth = 1) +
  ggtitle(
    label = "Family Size",
    subtitle = "2015 American Community Survey Data"
  ) +
  xlab(label = "Number of own Family Members in Household") +
  ylab(label = "Number of Households")
## Don't know how to automatically pick scale for object of type labelled. Defaulting to continuous.

3) Numerical Estimation
Mean and Standard Deviation of Family Size according to Birth Place
# Mean, standard deviation and number of households for family size,
# computed over household heads and split by the head's birthplace group.
# bpl codes outside both ranges match no condition and become NA.
acs %>%
  filter(relate == 1) %>% # household heads only
  mutate(
    BirthPlace = case_when(
      bpl %in% 1:120 ~ "US Born",
      bpl %in% 121:998 ~ "Foreign Born"
    )
  ) %>%
  group_by(BirthPlace) %>%
  summarise(mean(famsize), sd(famsize), n())
## # A tibble: 2 x 4
## BirthPlace `mean(famsize)` `sd(famsize)` `n()`
## <chr> <dbl> <dbl> <int>
## 1 Foreign Born 2.934445 1.683525 15895
## 2 US Born 2.290301 1.332221 101412
4) Histogram according to Birthplace
# Histograms of family size for household heads (relate == 1), one panel per
# birthplace group, with the title attached to the plot itself.
acs %>%
  filter(relate == 1) %>%
  mutate(
    BirthPlace = case_when(
      bpl %in% 1:120 ~ "US Born",
      bpl %in% 121:998 ~ "Foreign Born"
    )
  ) %>%
  ggplot() +
  # bins is a parameter of the histogram layer, not an aesthetic; inside
  # aes() it was ignored and the default of 30 bins was used instead.
  geom_histogram(mapping = aes(famsize, colour = BirthPlace), bins = 10) +
  facet_wrap(~BirthPlace) +
  xlab(label = "Number of Own Family Members in Household") +
  ylab(label = "Number of Households") +
  # The title must be joined with `+`; as a separate statement it only
  # printed a labels object and never reached the plot.
  ggtitle(
    label = "Family Size by BirthPlace of Household Head",
    subtitle = "2015 American Community Survey Data"
  )
## Don't know how to automatically pick scale for object of type labelled. Defaulting to continuous.
4.1) Box and Whisker Plot
# Horizontal box-and-whisker plot of family size for household heads
# (relate == 1), with one box per birthplace group.
acs %>%
  mutate(
    BirthPlace = case_when(
      bpl %in% 1:120 ~ "US Born",
      bpl %in% 121:998 ~ "Foreign Born"
    )
  ) %>%
  filter(relate == 1) %>%
  group_by(BirthPlace) %>%
  ggplot(mapping = aes(x = BirthPlace, y = famsize, fill = BirthPlace)) +
  geom_boxplot() +
  coord_flip() + # boxes run left to right
  labs(
    title = "Family Size by BirthPlace of Household Head",
    subtitle = "2015 American Community Survey Data"
  )
## Don't know how to automatically pick scale for object of type labelled. Defaulting to continuous.

5) X-Y Scatterplot with a smooth line
# Average family size of household heads (relate == 1) at each age, drawn as
# points with a smoothed trend line on top.
# The earlier BirthPlace recode was removed: group_by(age) + summarise()
# discarded that column, so it never affected the plot. The unused sd() and
# n() summaries were dropped for the same reason.
acs %>%
  filter(relate == 1) %>%
  group_by(age) %>%
  summarise(avefs = mean(famsize)) %>%
  ggplot(aes(x = age, y = avefs)) +
  geom_point(aes(color = age), size = .5) +
  # No colour mapping on the smoother: age varies within the single smoothing
  # group, so the mapping cannot survive the stat and the line keeps its
  # default colour either way.
  geom_smooth() +
  xlab(label = "age of the Household Head") +
  ylab(label = "Average Family Size") +
  ggtitle(
    label = "Average Family Size by Age of the Household Head",
    subtitle = "2015 American Community Survey Data"
  )
## Don't know how to automatically pick scale for object of type labelled. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type labelled. Defaulting to continuous.
## `geom_smooth()` using method = 'loess'
