Welcome to the Rplicate Series! In this article we will try to replicate the plot from The Economist article, The irresistible rise of internet dating. The graph is based on Rosenfeld’s (2012) study on how couples meet and stay together published in an article entitled Searching for a Mate: The Rise of the Internet as a Social Intermediary. This article will cover how to change the value of axes, making arrow to highlight information on our plot, and set multiple row on our title.
Below is the required packages for data wrangling and visualization.
library(extrafont)
library(haven)
library(foreign)
library(lubridate)
library(dplyr)
library(ggplot2)
library(reshape2)
library(tidyr)
library(ggthemes)
library(ggrepel)
library(png)
library(grid)
library(gridExtra)
You can use Officina Sans based on ggthemes package documentary regarding the economist theme for an exact result. Officina Sans is a commercial font and is available here.
custom_font_family <- "ITC Officina Sans"
Dataset is taken from data.stanford.edu with SPSS format.
data <- read_sav('data_input/HCMST_ver_3.04.sav')
head(data)
#> # A tibble: 6 x 387
#> caseid_new weight1 weight2 ppage ppagecat ppagect4 ppeduc ppeducat ppethm
#> <dbl> <dbl> <dbl> <dbl+> <dbl+lb> <dbl+lb> <dbl+lbl> <dbl+lb> <dbl+l>
#> 1 22526 4265 4265 52 4 [45-5~ 3 [45-5~ 12 [bach~ 4 [bach~ 4 [his~
#> 2 23286 16485 16485 28 2 [25-3~ 1 [18-2~ 13 [mast~ 4 [bach~ 1 [whi~
#> 3 25495 52464 NA 49 4 [45-5~ 3 [45-5~ 9 [high~ 2 [high~ 2 [bla~
#> 4 26315 4575 4575 31 2 [25-3~ 2 [30-4~ 11 [asso~ 3 [some~ 1 [whi~
#> 5 27355 12147 NA 35 3 [35-4~ 2 [30-4~ 9 [high~ 2 [high~ 1 [whi~
#> 6 27695 1799 NA 69 6 [65-7~ 4 [60+] 10 [some~ 3 [some~ 1 [whi~
#> # ... with 378 more variables: ppgender <dbl+lbl>, pphhhead <dbl+lbl>,
#> # pphouseholdsize <dbl>, pphouse <dbl+lbl>, ppincimp <dbl+lbl>, hhinc <dbl>,
#> # ppmarit <dbl+lbl>, ppmsacat <dbl+lbl>, ppreg4 <dbl+lbl>, ppreg9 <dbl+lbl>,
#> # pprent <dbl+lbl>, ppt01 <dbl+lbl>, ppt1317 <dbl+lbl>, ppt18ov <dbl+lbl>,
#> # ppt25 <dbl+lbl>, ppt612 <dbl+lbl>, children_in_hh <dbl>, ppwork <dbl+lbl>,
#> # ppnet <dbl+lbl>, ppq14arace <dbl+lbl>, pphispan <dbl+lbl>,
#> # pprace_white <dbl+lbl>, pprace_black <dbl+lbl>, ...
str(data, list.len = 3)
#> tibble [4,002 x 387] (S3: tbl_df/tbl/data.frame)
#> $ caseid_new : num [1:4002] 22526 23286 25495 26315 27355 ...
#> ..- attr(*, "label")= chr "unique case ID"
#> ..- attr(*, "format.spss")= chr "F12.0"
#> ..- attr(*, "display_width")= int 12
#> $ weight1 : num [1:4002] 4265 16485 52464 4575 12147 ...
#> ..- attr(*, "label")= chr "Main Weight for all respondents"
#> ..- attr(*, "format.spss")= chr "F12.0"
#> ..- attr(*, "display_width")= int 12
#> $ weight2 : num [1:4002] 4265 16485 NA 4575 NA ...
#> ..- attr(*, "label")= chr "Main Weight for Partnered Respondents"
#> ..- attr(*, "format.spss")= chr "F12.0"
#> ..- attr(*, "display_width")= int 12
#> [list output truncated]
dim(data)
#> [1] 4002 387
The dataset consist of 4002 observations and 387 variables. There are 3009 of 4002 participants of How Couples Met and Stay Together main survey based on the Brief Guide. The remaining 993 respondents did not have a spouse or partner at the time of the main survey. This information can be derived from qflag variables.
unique(data$qflag)
#> <labelled<double>[2]>: Does respondent have a spouse or partner?
#> [1] 1 2
#>
#> Labels:
#> value label
#> 1 partnered
#> 2 no spouse or partner or otherwise unqualified
For further data wrangling, it’s easier if we use the data in dataframe format and drop the SPSS labels. We do this by using read.spss function in foreign library.
data <- read.spss('data_input/HCMST_ver_3.04.sav', to.data.frame = TRUE,
use.value.labels = FALSE)
head(data)[1:5,1:7]
#> caseid_new weight1 weight2 ppage ppagecat ppagect4 ppeduc
#> 1 22526 4265 4265 52 4 3 12
#> 2 23286 16485 16485 28 2 1 13
#> 3 25495 52464 NA 49 4 3 9
#> 4 26315 4575 4575 31 2 2 11
#> 5 27355 12147 NA 35 3 2 9
The graph was plotting the year couple met and where did they meet. The dataset doesn’t have the variable that enlists what year the respondent and their couple met, but instead, they ask how long ago the respondent met their partner in years. We can get year_couple_met by subtracting how_long_ago_first_met variable with HCMST_main_interview_yrmo variable. See example below:
ym(data$HCMST_main_interview_yrmo)[1] - years(data$how_long_ago_first_met)[1]
#> [1] "2002-02-01"
The survey documented places where couples met on q24_* variables. Don’t forget that the graph is divided by whether the couple was heterosexual or not, so we also need to take the same_sex_couple variable. We can do all of this by piping with the dplyr package.
data_hcmst <- data %>%
filter(qflag == 1) %>%
select(c("HCMST_main_interview_yrmo",
"how_long_ago_first_met",
"same_sex_couple",
"q24_met_online",
"q24_school" ,
"q24_college",
"q24_church",
"q24_bar_restaurant",
"q24_R_cowork",
"met_through_friends"
)) %>%
mutate(HCMST_main_interview_yrmo = ym(HCMST_main_interview_yrmo)) %>%
mutate(year_couple_met = HCMST_main_interview_yrmo - years(how_long_ago_first_met)) %>%
mutate(year_couple_met = year(year_couple_met))
head(data_hcmst)
#> HCMST_main_interview_yrmo how_long_ago_first_met same_sex_couple
#> 1 2009-02-01 7 1
#> 2 2009-02-01 9 1
#> 3 2009-02-01 8 1
#> 4 2009-02-01 12 1
#> 5 2009-02-01 30 0
#> 6 2009-02-01 3 1
#> q24_met_online q24_school q24_college q24_church q24_bar_restaurant
#> 1 1 0 0 0 0
#> 2 0 0 1 0 0
#> 3 1 0 0 0 0
#> 4 0 0 0 0 0
#> 5 0 0 0 0 0
#> 6 1 0 0 0 0
#> q24_R_cowork met_through_friends year_couple_met
#> 1 0 0 2002
#> 2 0 1 2000
#> 3 0 1 2001
#> 4 0 0 1997
#> 5 1 0 1979
#> 6 0 0 2006
dim(data_hcmst)
#> [1] 3009 11
colSums(is.na(data_hcmst))
#> HCMST_main_interview_yrmo how_long_ago_first_met same_sex_couple
#> 0 15 0
#> q24_met_online q24_school q24_college
#> 75 75 75
#> q24_church q24_bar_restaurant q24_R_cowork
#> 75 75 75
#> met_through_friends year_couple_met
#> 75 15
Since The Economist article did not include any N sample explanation on the article, we assume they dropped the missing values instead of imputation.
data_hcmst <- data_hcmst %>%
drop_na()
dim(data_hcmst)
#> [1] 2924 11
table(data_hcmst$same_sex_couple)
#>
#> 0 1
#> 2462 462
Now we have 2924 observation, 2462 observations for heterosexual couples and 462 observations for same-sex couples. with clean dataset.
colSums(is.na(data_hcmst))
#> HCMST_main_interview_yrmo how_long_ago_first_met same_sex_couple
#> 0 0 0
#> q24_met_online q24_school q24_college
#> 0 0 0
#> q24_church q24_bar_restaurant q24_R_cowork
#> 0 0 0
#> met_through_friends year_couple_met
#> 0 0
Relative percentage: for each year, what percentage of partners of each sexual preference first met on each occasion/place?
We create custom_color_palette to customize the color of geom_smooth() for each category. We can extract color pallete from image picker or find the The Economist style guide.
custom_color_pallete <- list(met_through_friends = "#01A4DC",
q24_bar_restaurant = "#F15B40",
q24_church = "#692916",
q24_college = "#91b8bd",
q24_met_online = "#336666",
q24_R_cowork = "#6ED0F6",
q24_school = "#AC8B96"
)
custom_date_left <- as.Date(paste0(seq(1940, 2010, 10), "-01-01"))
x.labs <- ifelse((substr(as.character(custom_date_left), 1, 4) == "1940") | (substr(as.character(custom_date_left), 1, 4) == "2000"),
substr(as.character(custom_date_left), 1, 4),
substr(as.character(custom_date_left), 3, 4))
x.labs
#> [1] "1940" "50" "60" "70" "80" "90" "2000" "10"
custom_date_right <- as.Date(c("1980-01-01",
"1985-01-01",
"1990-01-01",
"2000-01-01",
"2010-01-01"))
x.labsr <- ifelse(substr(as.character(custom_date_right), 1, 4) == "2000",
substr(as.character(custom_date_right), 1, 4),
substr(as.character(custom_date_right), 3, 4))
x.labsr[2] <- ""
x.labsr
#> [1] "80" "" "90" "2000" "10"
data_hcmst_select <- data_hcmst %>%
select(-c("HCMST_main_interview_yrmo",
"how_long_ago_first_met")) %>%
pivot_longer(cols = -c(year_couple_met, same_sex_couple),
names_to = "where_couple_met",
values_to = "value",
) %>%
group_by(same_sex_couple, year_couple_met, where_couple_met) %>%
summarise(frequency = sum(value)) %>%
mutate(percentage = frequency/sum(frequency)*100) %>%
mutate(year_couple_met = ymd(year_couple_met, truncated = 2L)) %>%
filter(if(same_sex_couple == 1) year_couple_met > "1985-01-01" else TRUE)
head(data_hcmst_select)
#> # A tibble: 6 x 5
#> # Groups: same_sex_couple, year_couple_met [1]
#> same_sex_couple year_couple_met where_couple_met frequency percentage
#> <dbl> <date> <chr> <dbl> <dbl>
#> 1 0 1934-01-01 met_through_friends 0 0
#> 2 0 1934-01-01 q24_bar_restaurant 0 0
#> 3 0 1934-01-01 q24_church 1 50
#> 4 0 1934-01-01 q24_college 0 0
#> 5 0 1934-01-01 q24_met_online 0 0
#> 6 0 1934-01-01 q24_R_cowork 0 0
facet_label <- c( `0` = "Heterosexual couples",
`1` = "Same-sex couples")
unique(data_hcmst_select$same_sex_couple)
#> [1] 0 1
full_plot <- ggplot(data_hcmst_select,
aes(x = year_couple_met,
y = percentage)) +
geom_smooth(aes(group = where_couple_met,
colour = where_couple_met),
method = "loess",
span = 0.8,
se = FALSE) +
facet_grid(.~same_sex_couple,
space = "free",
scales = "free",
labeller = as_labeller(facet_label)) +
scale_x_date(breaks = custom_date_left,
labels = x.labs,
# limits = as.Date(c("1940-01-01","2010-01-01")),
# expand = c(0.01, 0)
) +
scale_y_continuous(expand = c(0, 0),
limits = c(0, 70),
breaks = seq(0, 50, by = 10),
sec.axis = sec_axis(trans = ~.,
breaks = seq(0,
70,
by = 10))) +
theme_economist_white(base_size = 11,
base_family = "ITC Officina Sans",
gray_bg = F,
horizontal = TRUE) +
theme(text = element_text(family = "custom_font_family",
color = "black"),
axis.ticks.length = unit(5, "pt"),
axis.ticks.x = element_line(size = 0.75),
axis.title.x = element_text(margin = margin(t = 10,
r = 100)),
panel.grid.major.y = element_line(color = "#B2C2CA",
size = 1),
strip.text = element_text(hjust = 0,
face = "bold"),
strip.text.x = element_text(size = 10),
plot.caption = element_text(hjust = 0,
margin = margin(t =20)),
# plot.title = element_text(margin = margin(t = -3)),
legend.position = "none") +
labs(title="Meet Markets",
subtitle = "United States, how couples meet, %",
caption='Source: "Searching for a Mate: The Rise of the Internet as a Social Intermediary", by Michael J. Rosenfield and Reuben J. Thomas',
x = expression(italic("Year Couple Met")),
y=NULL) +
scale_color_manual(values = unlist(custom_color_pallete, use.names = FALSE))
full_plot
full_plot_right_label <- full_plot +
geom_text(data = data_hcmst_select[data_hcmst_select$where_couple_met == "q24_college" & data_hcmst_select$year_couple_met == "1940-01-01",],
label = "College",
nudge_y = 5,
hjust ="left",
colour = "#383537") +
geom_text_repel(data = data_hcmst_select[data_hcmst_select$where_couple_met == "q24_school" & data_hcmst_select$year_couple_met == "1946-01-01",],
label = "Primary/\nsecondary\nschool",
nudge_y = 0.2,
hjust = "left",
direction = "y",
colour = "#383537") +
geom_text(data = data_hcmst_select[data_hcmst_select$where_couple_met == "q24_church" & data_hcmst_select$year_couple_met == "1959-01-01",],
label = "Church",
nudge_y = 1.9,
hjust = "left",
colour = "#383537") +
geom_text(data = data_hcmst_select[data_hcmst_select$where_couple_met == "q24_bar_restaurant" & data_hcmst_select$year_couple_met == "1967-01-01",],
label = "Bar/restaurant",
nudge_y = -2,
hjust = "left",
colour = "#383537") +
geom_text(data = data_hcmst_select[data_hcmst_select$where_couple_met == "met_through_friends" & data_hcmst_select$year_couple_met == "1981-01-01",],
label = "Through friends",
nudge_y = -1,
hjust = "left",
colour = "#383537") +
geom_text(data = data_hcmst_select[data_hcmst_select$where_couple_met == "q24_met_online" & data_hcmst_select$year_couple_met == "1989-01-01" & data_hcmst_select$same_sex_couple == 0,],
label = expression(bold("Online")),
nudge_y = 2,
colour = "#383537")
full_plot_right_label
full_plot_full_label <- full_plot_right_label +
geom_text(data = data_hcmst_select[data_hcmst_select$where_couple_met == "q24_church" & data_hcmst_select$year_couple_met == "1987-01-01" & data_hcmst_select$same_sex_couple == 1,],
label = "Church",
nudge_y = -10,
colour = "#383537") +
geom_text(data = data_hcmst_select[data_hcmst_select$where_couple_met == "q24_college" & data_hcmst_select$year_couple_met == "1987-01-01" & data_hcmst_select$same_sex_couple == 1,],
label = "College",
nudge_y = 5,
colour = "#383537") +
geom_text(data = data_hcmst_select[data_hcmst_select$where_couple_met == "q24_R_cowork" & data_hcmst_select$year_couple_met == "1987-01-01" & data_hcmst_select$same_sex_couple == 1,],
label = "Co-workers",
nudge_y = 3,
colour = "#383537") +
geom_text(data = data_hcmst_select[data_hcmst_select$where_couple_met == "met_through_friends" & data_hcmst_select$year_couple_met == "1987-01-01" & data_hcmst_select$same_sex_couple == 1,],
label = "Through\nfriends",
nudge_y = 27,
colour = "#383537") +
geom_text(data = data_hcmst_select[data_hcmst_select$where_couple_met == "q24_met_online" & data_hcmst_select$year_couple_met == "2004-01-01" & data_hcmst_select$same_sex_couple == 1,],
label = expression(bold("Online")),
nudge_y = 28,
colour = "#383537")
full_plot_full_label