Introduction

Welcome to the Rplicate Series! In this article we will try to replicate the plot from The Economist article, The irresistible rise of internet dating. The graph is based on Rosenfeld’s (2012) study on how couples meet and stay together published in an article entitled Searching for a Mate: The Rise of the Internet as a Social Intermediary. This article will cover how to change the value of axes, making arrow to highlight information on our plot, and set multiple row on our title.

Load Packages

Below is the required packages for data wrangling and visualization.

library(extrafont)
library(haven)
library(foreign)
library(lubridate) 
library(dplyr)
library(ggplot2)
library(reshape2)
library(tidyr)
library(ggthemes)
library(ggrepel)
library(png)
library(grid)
library(gridExtra)

Load Fonts

You can use Officina Sans based on ggthemes package documentary regarding the economist theme for an exact result. Officina Sans is a commercial font and is available here.

custom_font_family <- "ITC Officina Sans"

Data Collection

Dataset is taken from data.stanford.edu with SPSS format.

data <- read_sav('data_input/HCMST_ver_3.04.sav')
head(data)
#> # A tibble: 6 x 387
#>   caseid_new weight1 weight2  ppage ppagecat ppagect4    ppeduc ppeducat  ppethm
#>        <dbl>   <dbl>   <dbl> <dbl+> <dbl+lb> <dbl+lb> <dbl+lbl> <dbl+lb> <dbl+l>
#> 1      22526    4265    4265     52 4 [45-5~ 3 [45-5~ 12 [bach~ 4 [bach~ 4 [his~
#> 2      23286   16485   16485     28 2 [25-3~ 1 [18-2~ 13 [mast~ 4 [bach~ 1 [whi~
#> 3      25495   52464      NA     49 4 [45-5~ 3 [45-5~  9 [high~ 2 [high~ 2 [bla~
#> 4      26315    4575    4575     31 2 [25-3~ 2 [30-4~ 11 [asso~ 3 [some~ 1 [whi~
#> 5      27355   12147      NA     35 3 [35-4~ 2 [30-4~  9 [high~ 2 [high~ 1 [whi~
#> 6      27695    1799      NA     69 6 [65-7~ 4 [60+]  10 [some~ 3 [some~ 1 [whi~
#> # ... with 378 more variables: ppgender <dbl+lbl>, pphhhead <dbl+lbl>,
#> #   pphouseholdsize <dbl>, pphouse <dbl+lbl>, ppincimp <dbl+lbl>, hhinc <dbl>,
#> #   ppmarit <dbl+lbl>, ppmsacat <dbl+lbl>, ppreg4 <dbl+lbl>, ppreg9 <dbl+lbl>,
#> #   pprent <dbl+lbl>, ppt01 <dbl+lbl>, ppt1317 <dbl+lbl>, ppt18ov <dbl+lbl>,
#> #   ppt25 <dbl+lbl>, ppt612 <dbl+lbl>, children_in_hh <dbl>, ppwork <dbl+lbl>,
#> #   ppnet <dbl+lbl>, ppq14arace <dbl+lbl>, pphispan <dbl+lbl>,
#> #   pprace_white <dbl+lbl>, pprace_black <dbl+lbl>, ...
str(data, list.len = 3)
#> tibble [4,002 x 387] (S3: tbl_df/tbl/data.frame)
#>  $ caseid_new                      : num [1:4002] 22526 23286 25495 26315 27355 ...
#>   ..- attr(*, "label")= chr "unique case ID"
#>   ..- attr(*, "format.spss")= chr "F12.0"
#>   ..- attr(*, "display_width")= int 12
#>  $ weight1                         : num [1:4002] 4265 16485 52464 4575 12147 ...
#>   ..- attr(*, "label")= chr "Main Weight for all respondents"
#>   ..- attr(*, "format.spss")= chr "F12.0"
#>   ..- attr(*, "display_width")= int 12
#>  $ weight2                         : num [1:4002] 4265 16485 NA 4575 NA ...
#>   ..- attr(*, "label")= chr "Main Weight for Partnered Respondents"
#>   ..- attr(*, "format.spss")= chr "F12.0"
#>   ..- attr(*, "display_width")= int 12
#>   [list output truncated]
dim(data)
#> [1] 4002  387

The dataset consist of 4002 observations and 387 variables. There are 3009 of 4002 participants of How Couples Met and Stay Together main survey based on the Brief Guide. The remaining 993 respondents did not have a spouse or partner at the time of the main survey. This information can be derived from qflag variables.

unique(data$qflag)
#> <labelled<double>[2]>: Does respondent have a spouse or partner?
#> [1] 1 2
#> 
#> Labels:
#>  value                                         label
#>      1                                     partnered
#>      2 no spouse or partner or otherwise unqualified

For further data wrangling, it’s easier if we use the data in dataframe format and drop the SPSS labels. We do this by using read.spss function in foreign library.

data <- read.spss('data_input/HCMST_ver_3.04.sav', to.data.frame = TRUE,
                    use.value.labels = FALSE)
head(data)[1:5,1:7]
#>   caseid_new weight1 weight2 ppage ppagecat ppagect4 ppeduc
#> 1      22526    4265    4265    52        4        3     12
#> 2      23286   16485   16485    28        2        1     13
#> 3      25495   52464      NA    49        4        3      9
#> 4      26315    4575    4575    31        2        2     11
#> 5      27355   12147      NA    35        3        2      9

Data Wrangling

The graph was plotting the year couple met and where did they meet. The dataset doesn’t have the variable that enlists what year the respondent and their couple met, but instead, they ask how long ago the respondent met their partner in years. We can get year_couple_met by subtracting how_long_ago_first_met variable with HCMST_main_interview_yrmo variable. See example below:

ym(data$HCMST_main_interview_yrmo)[1] - years(data$how_long_ago_first_met)[1]
#> [1] "2002-02-01"

The survey documented places where couples met on q24_* variables. Don’t forget that the graph is divided by whether the couple was heterosexual or not, so we also need to take the same_sex_couple variable. We can do all of this by piping with the dplyr package.

data_hcmst <- data %>% 
  filter(qflag == 1) %>% 
  select(c("HCMST_main_interview_yrmo", 
         "how_long_ago_first_met",
         "same_sex_couple",
         "q24_met_online",
         "q24_school" ,                     
         "q24_college",
         "q24_church",
         "q24_bar_restaurant",
         "q24_R_cowork",
         "met_through_friends"
         )) %>% 
  mutate(HCMST_main_interview_yrmo = ym(HCMST_main_interview_yrmo)) %>% 
  mutate(year_couple_met = HCMST_main_interview_yrmo - years(how_long_ago_first_met)) %>% 
  mutate(year_couple_met = year(year_couple_met))

head(data_hcmst)
#>   HCMST_main_interview_yrmo how_long_ago_first_met same_sex_couple
#> 1                2009-02-01                      7               1
#> 2                2009-02-01                      9               1
#> 3                2009-02-01                      8               1
#> 4                2009-02-01                     12               1
#> 5                2009-02-01                     30               0
#> 6                2009-02-01                      3               1
#>   q24_met_online q24_school q24_college q24_church q24_bar_restaurant
#> 1              1          0           0          0                  0
#> 2              0          0           1          0                  0
#> 3              1          0           0          0                  0
#> 4              0          0           0          0                  0
#> 5              0          0           0          0                  0
#> 6              1          0           0          0                  0
#>   q24_R_cowork met_through_friends year_couple_met
#> 1            0                   0            2002
#> 2            0                   1            2000
#> 3            0                   1            2001
#> 4            0                   0            1997
#> 5            1                   0            1979
#> 6            0                   0            2006
dim(data_hcmst)
#> [1] 3009   11
colSums(is.na(data_hcmst))
#> HCMST_main_interview_yrmo    how_long_ago_first_met           same_sex_couple 
#>                         0                        15                         0 
#>            q24_met_online                q24_school               q24_college 
#>                        75                        75                        75 
#>                q24_church        q24_bar_restaurant              q24_R_cowork 
#>                        75                        75                        75 
#>       met_through_friends           year_couple_met 
#>                        75                        15

Since The Economist article did not include any N sample explanation on the article, we assume they dropped the missing values instead of imputation.

data_hcmst <- data_hcmst %>%
  drop_na()

dim(data_hcmst)
#> [1] 2924   11
table(data_hcmst$same_sex_couple)
#> 
#>    0    1 
#> 2462  462

Now we have 2924 observation, 2462 observations for heterosexual couples and 462 observations for same-sex couples. with clean dataset.

colSums(is.na(data_hcmst))
#> HCMST_main_interview_yrmo    how_long_ago_first_met           same_sex_couple 
#>                         0                         0                         0 
#>            q24_met_online                q24_school               q24_college 
#>                         0                         0                         0 
#>                q24_church        q24_bar_restaurant              q24_R_cowork 
#>                         0                         0                         0 
#>       met_through_friends           year_couple_met 
#>                         0                         0

Relative percentage: for each year, what percentage of partners of each sexual preference first met on each occasion/place?

We create custom_color_palette to customize the color of geom_smooth() for each category. We can extract color pallete from image picker or find the The Economist style guide.

custom_color_pallete <- list(met_through_friends = "#01A4DC",
                             q24_bar_restaurant = "#F15B40",
                             q24_church = "#692916",
                             q24_college = "#91b8bd",
                             q24_met_online = "#336666",
                             q24_R_cowork = "#6ED0F6",
                             q24_school = "#AC8B96"
                             )
custom_date_left <- as.Date(paste0(seq(1940, 2010, 10), "-01-01"))

x.labs <- ifelse((substr(as.character(custom_date_left), 1, 4) == "1940") | (substr(as.character(custom_date_left), 1, 4) == "2000"),
                 substr(as.character(custom_date_left), 1, 4),
                 substr(as.character(custom_date_left), 3, 4))

x.labs
#> [1] "1940" "50"   "60"   "70"   "80"   "90"   "2000" "10"
custom_date_right <- as.Date(c("1980-01-01", 
                             "1985-01-01", 
                             "1990-01-01",
                             "2000-01-01",
                             "2010-01-01"))

x.labsr <- ifelse(substr(as.character(custom_date_right), 1, 4) == "2000",
                 substr(as.character(custom_date_right), 1, 4),
                 substr(as.character(custom_date_right), 3, 4))

x.labsr[2] <- ""
x.labsr
#> [1] "80"   ""     "90"   "2000" "10"

Using facet

Data Wrangling

data_hcmst_select <- data_hcmst %>% 
  select(-c("HCMST_main_interview_yrmo",
         "how_long_ago_first_met")) %>% 
  pivot_longer(cols = -c(year_couple_met, same_sex_couple),
               names_to = "where_couple_met",
               values_to = "value",
               ) %>% 
  group_by(same_sex_couple, year_couple_met, where_couple_met) %>% 
  summarise(frequency = sum(value)) %>% 
  mutate(percentage = frequency/sum(frequency)*100) %>% 
  mutate(year_couple_met = ymd(year_couple_met, truncated = 2L)) %>% 
  filter(if(same_sex_couple == 1) year_couple_met > "1985-01-01" else TRUE) 

head(data_hcmst_select)
#> # A tibble: 6 x 5
#> # Groups:   same_sex_couple, year_couple_met [1]
#>   same_sex_couple year_couple_met where_couple_met    frequency percentage
#>             <dbl> <date>          <chr>                   <dbl>      <dbl>
#> 1               0 1934-01-01      met_through_friends         0          0
#> 2               0 1934-01-01      q24_bar_restaurant          0          0
#> 3               0 1934-01-01      q24_church                  1         50
#> 4               0 1934-01-01      q24_college                 0          0
#> 5               0 1934-01-01      q24_met_online              0          0
#> 6               0 1934-01-01      q24_R_cowork                0          0

Plotting

facet_label <- c( `0` = "Heterosexual couples",
                  `1` = "Same-sex couples")

unique(data_hcmst_select$same_sex_couple)
#> [1] 0 1
full_plot <- ggplot(data_hcmst_select,
       aes(x = year_couple_met,
           y = percentage)) +
  geom_smooth(aes(group = where_couple_met,
              colour = where_couple_met), 
          method = "loess",
          span = 0.8,
          se = FALSE) +
  facet_grid(.~same_sex_couple,
             space = "free",
             scales = "free",
             labeller = as_labeller(facet_label)) +
  scale_x_date(breaks = custom_date_left,
               labels = x.labs,
#               limits = as.Date(c("1940-01-01","2010-01-01")), 
 #              expand = c(0.01, 0)
               ) +
  scale_y_continuous(expand = c(0, 0), 
                     limits = c(0, 70), 
                     breaks = seq(0, 50, by = 10),
                     sec.axis = sec_axis(trans = ~., 
                                         breaks = seq(0, 
                                                      70, 
                                                      by = 10))) +
  theme_economist_white(base_size = 11, 
                        base_family = "ITC Officina Sans",
                        gray_bg = F, 
                       horizontal = TRUE) +
  theme(text = element_text(family = "custom_font_family",
                            color = "black"),
        axis.ticks.length = unit(5, "pt"),
        axis.ticks.x = element_line(size = 0.75),
        axis.title.x = element_text(margin = margin(t = 10,
                                                    r = 100)),
        panel.grid.major.y = element_line(color = "#B2C2CA", 
                                          size = 1),
        strip.text = element_text(hjust = 0, 
                                  face = "bold"),
        strip.text.x = element_text(size = 10),
        plot.caption = element_text(hjust = 0,
                                    margin = margin(t =20)),
#        plot.title = element_text(margin = margin(t = -3)),
        legend.position = "none") +
  labs(title="Meet Markets", 
       subtitle = "United States, how couples meet, %",
       caption='Source: "Searching for a Mate: The Rise of the Internet as a Social Intermediary", by Michael J. Rosenfield and Reuben J. Thomas',
       x = expression(italic("Year Couple Met")), 
       y=NULL) +
  scale_color_manual(values = unlist(custom_color_pallete, use.names = FALSE)) 

full_plot

Labels

full_plot_right_label <- full_plot +
  geom_text(data = data_hcmst_select[data_hcmst_select$where_couple_met == "q24_college" & data_hcmst_select$year_couple_met == "1940-01-01",],
            label = "College",
            nudge_y = 5,
            hjust ="left", 
            colour = "#383537") +
  geom_text_repel(data = data_hcmst_select[data_hcmst_select$where_couple_met == "q24_school" & data_hcmst_select$year_couple_met == "1946-01-01",],
            label = "Primary/\nsecondary\nschool",
            nudge_y = 0.2,
            hjust = "left",
            direction = "y", 
            colour = "#383537") +
    geom_text(data = data_hcmst_select[data_hcmst_select$where_couple_met == "q24_church" & data_hcmst_select$year_couple_met == "1959-01-01",],
            label = "Church",
            nudge_y = 1.9,
            hjust = "left", 
            colour = "#383537") +
    geom_text(data = data_hcmst_select[data_hcmst_select$where_couple_met == "q24_bar_restaurant" & data_hcmst_select$year_couple_met == "1967-01-01",],
            label = "Bar/restaurant",
            nudge_y = -2,
            hjust = "left", 
            colour = "#383537") +
    geom_text(data = data_hcmst_select[data_hcmst_select$where_couple_met == "met_through_friends" & data_hcmst_select$year_couple_met == "1981-01-01",],
            label = "Through friends",
            nudge_y = -1,
            hjust = "left", 
            colour = "#383537") +
        geom_text(data = data_hcmst_select[data_hcmst_select$where_couple_met == "q24_met_online" & data_hcmst_select$year_couple_met == "1989-01-01" & data_hcmst_select$same_sex_couple == 0,],
            label = expression(bold("Online")),
            nudge_y = 2, 
            colour = "#383537")

full_plot_right_label

full_plot_full_label <- full_plot_right_label +
    geom_text(data = data_hcmst_select[data_hcmst_select$where_couple_met == "q24_church" & data_hcmst_select$year_couple_met == "1987-01-01" & data_hcmst_select$same_sex_couple == 1,],
            label = "Church",
            nudge_y = -10, 
            colour = "#383537") +
    geom_text(data = data_hcmst_select[data_hcmst_select$where_couple_met == "q24_college" & data_hcmst_select$year_couple_met == "1987-01-01" & data_hcmst_select$same_sex_couple == 1,],
            label = "College",
            nudge_y = 5, 
            colour = "#383537") +
  geom_text(data = data_hcmst_select[data_hcmst_select$where_couple_met == "q24_R_cowork" & data_hcmst_select$year_couple_met == "1987-01-01" & data_hcmst_select$same_sex_couple == 1,],
            label = "Co-workers",
            nudge_y = 3, 
            colour = "#383537") +
    geom_text(data = data_hcmst_select[data_hcmst_select$where_couple_met == "met_through_friends" & data_hcmst_select$year_couple_met == "1987-01-01" & data_hcmst_select$same_sex_couple == 1,],
            label = "Through\nfriends",
            nudge_y = 27, 
            colour = "#383537") +
    geom_text(data = data_hcmst_select[data_hcmst_select$where_couple_met == "q24_met_online" & data_hcmst_select$year_couple_met == "2004-01-01" & data_hcmst_select$same_sex_couple == 1,],
            label = expression(bold("Online")),
            nudge_y = 28, 
            colour = "#383537")

full_plot_full_label