library(haven)
# Load the IPUMS USA extract (Stata .dta format) directly from GitHub.
ipums <- read_dta(
  "https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true"
)
## `curl` package not installed, falling back to using `url()`
# List every variable (column) name available in the extract.
colnames(ipums)
## [1] "year" "datanum" "serial" "hhwt" "statefip"
## [6] "met2013" "puma" "gq" "pernum" "perwt"
## [11] "famsize" "nchild" "nchlt5" "eldch" "nsibs"
## [16] "relate" "related" "sex" "age" "marst"
## [21] "birthyr" "fertyr" "race" "raced" "hispan"
## [26] "hispand" "bpl" "bpld" "citizen" "yrsusa1"
## [31] "language" "languaged" "speakeng" "educ" "educd"
## [36] "empstat" "empstatd" "labforce" "occ" "ind"
## [41] "inctot" "incwage" "poverty" "hwsei" "migrate1"
## [46] "migrate1d" "carpool" "trantime"
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("ggplot2")
The histogram below shows the distribution of family size among household heads.
# Histogram of family size, one observation per household (relate == 1).
# The famsize codes 999998 and 999999 are recoded to NA before plotting.
ipums %>%
  filter(relate == 1) %>%
  mutate(
    familysize = ifelse(famsize %in% c(999998, 999999), NA, famsize)
  ) %>%
  ggplot(aes(x = familysize)) +
  geom_histogram(binwidth = 0.5)
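We calculate the standard deviation and mean of family size, and the number of household heads, separately for American-born and foreign-born heads.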
# Summary statistics of family size by birthplace of the household head.
# - famsize codes 999998 / 999999 are recoded to NA.
# - Only rows with relate == 1 are kept (one row per household).
# - bpl codes 1-120 are labelled "American"; everything else "Foreign".
# Both sd() and mean() drop NA values so that a single recoded NA cannot
# turn the standard deviation into NA while the mean is still reported.
ipums %>%
  filter(relate == 1) %>%
  mutate(
    familysize = ifelse(famsize %in% c(999998, 999999), NA, famsize),
    birthplace = ifelse(bpl %in% 1:120, "American", "Foreign")
  ) %>%
  group_by(birthplace) %>%
  summarise(
    newsd = sd(familysize, na.rm = TRUE),
    meannew = mean(familysize, na.rm = TRUE),
    n = n()
  )
## # A tibble: 2 x 4
## birthplace newsd meannew n
## <chr> <dbl> <dbl> <int>
## 1 American 1.332221 2.290301 101412
## 2 Foreign 1.683525 2.934445 15895
Below are histograms of family size for American-born and foreign-born household heads, respectively.
# Family-size histograms for household heads (relate == 1), drawn in one
# panel per birthplace group. bpl codes 1-120 are labelled "American" and
# all other codes "Foreign"; famsize codes 999998 / 999999 become NA.
ipums %>%
  filter(relate == 1) %>%
  mutate(
    familysize = ifelse(famsize %in% c(999998, 999999), NA, famsize),
    birthplace = ifelse(bpl %in% c(1:120), "American", "Foreign")
  ) %>%
  ggplot(aes(x = familysize)) +
  geom_histogram(binwidth = 0.5) +
  facet_wrap(~birthplace)
Below is a box plot of family size for household heads born in the United States versus those born abroad.
# Box plot of family size for US-born vs. foreign-born household heads.
# - famsize codes 999998 / 999999 are recoded to NA.
# - Only household heads (relate == 1) with a bpl code in 1-120 or 150-999
#   are kept.
# Region membership is tested with %in%. Comparing with `==` against a
# range (e.g. bpl == 1:120) recycles the range element-wise instead of
# testing membership, which mislabels most rows and emits
# "longer object length is not a multiple of shorter object length".
ipums %>%
  filter(relate == 1, bpl %in% c(1:120, 150:999)) %>%
  mutate(
    familysize = ifelse(famsize %in% c(999998, 999999), NA, famsize),
    birthplace = ifelse(bpl %in% 1:120, "American", "Foreign"),
    CountryRegion = case_when(
      bpl %in% 1:120 ~ "United States",
      bpl %in% 150:999 ~ "Foreign"
    )
  ) %>%
  ggplot() +
  geom_boxplot(aes(x = CountryRegion, y = familysize))
Below is a more detailed box plot that breaks family size down by the household head's region or country of birth.
# Horizontal box plot of family size by the household head's birth region.
# - famsize codes 999998 / 999999 are recoded to NA.
# - Only household heads (relate == 1) whose bpl code falls in one of the
#   listed regions are kept.
# Region membership is tested with %in%. Comparing with `==` against a
# range (e.g. bpl == 400:499) recycles the range element-wise instead of
# testing membership, which mislabels most rows and emits
# "longer object length is not a multiple of shorter object length".
# NOTE(review): code 600 is labelled "Africa" here, following the IPUMS
# BPL general codes; the previous label "Asia" looked like a copy-paste
# slip. Confirm against the codebook for this extract.
ipums %>%
  filter(
    relate == 1,
    bpl %in% c(
      1:120, 150:300, 400:499, 500:519, 520:547, 548:599,
      600, 700, 710, 800, 900, 950, 999
    )
  ) %>%
  mutate(
    familysize = ifelse(famsize %in% c(999998, 999999), NA, famsize),
    birthplace = ifelse(bpl %in% 1:120, "American", "Foreign"),
    CountryRegion = case_when(
      bpl %in% 1:120 ~ "United States",
      bpl %in% 150:300 ~ "Central and South America",
      bpl %in% 400:499 ~ "Europe",
      bpl %in% 500:519 ~ "Asia",
      bpl %in% 520:547 ~ "Middle East",
      bpl %in% 548:599 ~ "Other Asia",
      bpl == 600 ~ "Africa",
      bpl == 700 ~ "Australia New Zealand",
      bpl == 710 ~ "Pacific Islands",
      bpl == 800 ~ "Antarctica",
      bpl == 900 ~ "Unknown",
      bpl == 950 ~ "Other",
      bpl == 999 ~ "Missing"
    )
  ) %>%
  ggplot() +
  geom_boxplot(aes(x = CountryRegion, y = familysize)) +
  coord_flip()
Below is a scatterplot of family size against the age of the household head, with a smoothed trend line for each birthplace group.
# Scatterplot of family size against the household head's age, with one
# smoothed trend line per birthplace group.
# - famsize codes 999998 / 999999 are recoded to NA.
# - Only household heads (relate == 1) are kept.
# - age is converted to a plain numeric vector: read_dta() returns it as
#   a labelled vector, for which ggplot2 cannot pick a scale automatically
#   and falls back to continuous with a message.
# No group_by() is used: it has no effect on ggplot(), and the per-group
# smooths come from the colour aesthetic.
ipums %>%
  filter(relate == 1) %>%
  mutate(
    familysize = ifelse(famsize %in% c(999998, 999999), NA, famsize),
    birthplace = ifelse(bpl %in% 1:120, "American", "Foreign"),
    age = as.numeric(age)
  ) %>%
  ggplot(aes(x = age, y = familysize)) +
  geom_point() +
  geom_smooth(aes(colour = birthplace)) +
  ggtitle(label = "Age of Household Head by Average Family Size") +
  xlab(label = "age of the household head") +
  ylab(label = "average family size")
## `geom_smooth()` using method = 'gam'