1 Introduction.

This file creates a dataframe of Id, Event and Date (IED) variables. This kind of data is common in data analysis and so having a dummy set of data is useful.

Journeys are also created, which are events combined in chronological order.

For each id, the number of events, and forward and reverse order (in journey) variables are also created.

2 Create event dates.

The var_number_of_events variable below can be modified to increase or reduce the number of event dates. The date range can also be varied to suit.

# Number of event rows to simulate; raise or lower it to change the data size.
var_number_of_events <- 20000

# Draw one random date per event (with replacement) from every day between
# 2020-01-01 and 2020-11-01 inclusive. The result is a one-column data frame.
d01df_id_and_events <- as.data.frame(sample(
  seq(as.Date("2020/01/01"), as.Date("2020/11/01"), by = "day"),
  var_number_of_events,
  replace = TRUE
))

2.1 Set the column name.

# Give the single column a meaningful name.
names(d01df_id_and_events) <- "date"

# Preview the first six rows.
head(d01df_id_and_events, n = 6L)
##         date
## 1 2020-04-09
## 2 2020-09-20
## 3 2020-05-19
## 4 2020-08-02
## 5 2020-04-07
## 6 2020-01-05

3 Create events from random letters.

Vary the range of letters to increase or reduce the numbers of types of events.

# Give every row a random event type, drawn with replacement from A to E.
d01df_id_and_events[["event"]] <- sample(
  x = LETTERS[1:5],
  size = nrow(d01df_id_and_events),
  replace = TRUE
)

4 Create ids.

Increasing the range of letters increases the number of unique ids thereby reducing the average length of journey each id has.

# Build a two-letter id for every row by pasting together two independent
# random letters, each drawn with replacement from the chosen letter range.
d01df_id_and_events[["id"]] <- paste0(
  sample(x = LETTERS[1:26], size = nrow(d01df_id_and_events), replace = TRUE),
  sample(x = LETTERS[1:26], size = nrow(d01df_id_and_events), replace = TRUE)
)

# Inspect the column types and the first few values.
tibble::glimpse(x = d01df_id_and_events)
## Rows: 20,000
## Columns: 3
## $ date  <date> 2020-04-09, 2020-09-20, 2020-05-19, 2020-08-02, 2020-04-07, 20…
## $ event <chr> "C", "C", "A", "A", "C", "D", "A", "D", "E", "D", "E", "E", "D"…
## $ id    <chr> "TG", "YX", "KH", "QY", "ZG", "WS", "IY", "SM", "RT", "VP", "VL…

4.1 Re-order the variables.

Calculate the lead_days between each consecutive event for an id.

Also create a lead_event that shows the next event in order for the id.

# Sort each id's events chronologically (same-day ties broken by event letter),
# put the columns in id / event / date order and then, within each id, derive:
#   lead_days  - whole days from this event to the id's next event
#   lead_event - the id's next event
#   event_pair - "<event> + <next event>"
# The last event of an id has no successor, so its lead_days and lead_event are
# NA and its event_pair ends in "NA".
d02df_id_and_events <- d01df_id_and_events %>%
  arrange(id, date, event) %>%
  relocate(id, event, date) %>%
  group_by(id) %>%
  mutate(
    lead_days = as.integer(lead(date) - date),
    lead_event = lead(event),
    event_pair = paste0(event, " + ", lead_event)
  ) %>%
  ungroup()

# Inspect the structure, then the first six rows.
tibble::glimpse(x = d02df_id_and_events)
## Rows: 20,000
## Columns: 6
## $ id         <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA"…
## $ event      <chr> "B", "E", "A", "D", "B", "A", "C", "E", "D", "E", "E", "D"…
## $ date       <date> 2020-01-05, 2020-01-05, 2020-01-06, 2020-01-06, 2020-01-2…
## $ lead_days  <int> 0, 1, 0, 17, 9, 5, 33, 26, 20, 8, 1, 37, 6, 8, 13, 7, 6, 1…
## $ lead_event <chr> "E", "A", "D", "B", "A", "C", "E", "D", "E", "E", "D", "E"…
## $ event_pair <chr> "B + E", "E + A", "A + D", "D + B", "B + A", "A + C", "C +…
head(d02df_id_and_events, n = 6L)
## # A tibble: 6 x 6
##   id    event date       lead_days lead_event event_pair
##   <chr> <chr> <date>         <int> <chr>      <chr>     
## 1 AA    B     2020-01-05         0 E          B + E     
## 2 AA    E     2020-01-05         1 A          E + A     
## 3 AA    A     2020-01-06         0 D          A + D     
## 4 AA    D     2020-01-06        17 B          D + B     
## 5 AA    B     2020-01-23         9 A          B + A     
## 6 AA    A     2020-02-01         5 C          A + C

5 Create a dataframe of journeys.

# Collapse each id's events into one "+"-separated journey string.
# The rows are sorted explicitly by id, date and event so the journey follows
# the same within-id sequence as lead_event / event_pair whatever order the
# input arrives in. A bare arrange(date) ignores the grouping and leaves
# same-day ties to the incoming row order.
d03df_journeys <- d02df_id_and_events %>%
  arrange(id, date, event) %>%
  group_by(id) %>%
  dplyr::summarise(
    journey = paste(event, collapse = "+"),
    .groups = "drop"
  )

head(d03df_journeys, 10)
## # A tibble: 10 x 2
##    id    journey                                                                
##    <chr> <chr>                                                                  
##  1 AA    B+E+A+D+B+A+C+E+D+E+E+D+E+E+D+D+E+B+C+D+D+A+B+E+E+A+E+D+A+E+C+C+C      
##  2 AB    C+D+A+E+A+E+A+B+E+E+E+D+D+C+C+B+C+C+B+C+A+E+E+E+B+B+D+D+B+E+C+D+D      
##  3 AC    A+D+D+A+C+D+D+B+E+D+A+A+D+D+D+E+A+B+C+C+D+B+A+E                        
##  4 AD    B+E+E+E+C+B+E+B+C+D+C+E+A+E+E+D+A+C+A+A+A+C+E+E+A+C+C+A+D+B            
##  5 AE    E+C+D+C+A+B+B+E+A+B+B+A+E+E+A+B+C+E+D+A+D+C+E+E+C+C+B+E+B+D+D          
##  6 AF    C+B+E+C+E+C+E+D+C+B+E+D+E+D+E+D+B+D+A+E+D+C+B+A+A                      
##  7 AG    A+E+A+A+E+A+C+D+B+D+C+B+E+B+D+D+D+B+D                                  
##  8 AH    C+E+D+D+D+A+B+D+D+A+D+E+D+D+B+D+E+B+B+A+D                              
##  9 AI    B+D+C+D+A+A+E+E+E+A+C+D+E+D+E+B+D+E+B+B+E+E+C+C+E                      
## 10 AJ    B+E+E+B+B+D+D+C+C+B+D+C+B+D+C+B+E+A+E+D+C+D+D+A+A+D+A+B+E+D+A+B+B+C+C+…

6 Create a data frame of journeys (including lead_days)

# Collapse each id's events into one string that alternates event and gap:
# "<event>-<days to next event>-<event>-...-<last event>".
# The rows are sorted explicitly by id, date and event so the string follows
# the same within-id sequence that lead_days was calculated on.
# The last event of an id has no following event (lead_days is NA), so it is
# written on its own rather than as "<event>-NA-".
d03df_journeys_lead <- d02df_id_and_events %>%
  arrange(id, date, event) %>%
  group_by(id) %>%
  dplyr::summarise(
    journey_days = paste0(
      ifelse(is.na(lead_days), event, paste0(event, "-", lead_days, "-")),
      collapse = ""
    ),
    .groups = "drop"
  )

head(d03df_journeys_lead, 10)
## # A tibble: 10 x 2
##    id    journey_days                                                           
##    <chr> <chr>                                                                  
##  1 AA    B-0-E-1-A-0-D-17-B-9-A-5-C-33-E-26-D-20-E-8-E-1-D-37-E-6-E-8-D-13-D-7-…
##  2 AB    C-0-D-2-A-27-E-2-A-36-E-11-A-8-B-5-E-18-E-23-E-10-D-8-D-23-C-9-C-9-B-6…
##  3 AC    A-43-D-4-D-19-A-15-C-5-D-14-D-9-B-0-E-8-D-25-A-15-A-12-D-19-D-3-D-54-E…
##  4 AD    B-6-E-0-E-13-E-8-C-29-B-10-E-20-B-15-C-5-D-2-C-26-E-4-A-1-E-5-E-1-D-5-…
##  5 AE    E-14-C-6-D-3-C-17-A-5-B-9-B-13-E-1-A-9-B-14-B-10-A-18-E-4-E-12-A-14-B-…
##  6 AF    C-6-B-5-E-4-C-2-E-11-C-1-E-3-D-14-C-21-B-5-E-13-D-5-E-3-D-0-E-38-D-30-…
##  7 AG    A-1-E-7-A-6-A-2-E-5-A-34-C-2-D-6-B-12-D-18-C-33-B-4-E-19-B-16-D-52-D-1…
##  8 AH    C-25-E-31-D-9-D-14-D-18-A-0-B-18-D-8-D-18-A-7-D-8-E-1-D-5-D-21-B-0-D-5…
##  9 AI    B-14-D-10-C-8-D-16-A-3-A-24-E-2-E-4-E-24-A-6-C-5-D-9-E-5-D-1-E-11-B-4-…
## 10 AJ    B-1-E-6-E-7-B-7-B-4-D-16-D-5-C-1-C-4-B-26-D-1-C-22-B-6-D-19-C-1-B-3-E-…

7 Join three data frames.

The events, the journeys, and the journeys with lead days are joined together by id.

This is so you can see the total journey for an id alongside each event.

# Attach each id's journey string to every one of its event rows. The join key
# is stated explicitly so the join cannot silently pick up any other column the
# two tables happen to share.
d04df_events_and_journeys <- d02df_id_and_events %>%
  left_join(d03df_journeys, by = "id") %>%
  arrange(id, date, event)

# Attach each id's journey-with-days string in the same way.
d05df_events_and_journeys <- d04df_events_and_journeys %>%
  left_join(d03df_journeys_lead, by = "id") %>%
  arrange(id, date, event)

head(d05df_events_and_journeys, 10)
## # A tibble: 10 x 8
##    id    event date       lead_days lead_event event_pair journey  journey_days 
##    <chr> <chr> <date>         <int> <chr>      <chr>      <chr>    <chr>        
##  1 AA    B     2020-01-05         0 E          B + E      B+E+A+D… B-0-E-1-A-0-…
##  2 AA    E     2020-01-05         1 A          E + A      B+E+A+D… B-0-E-1-A-0-…
##  3 AA    A     2020-01-06         0 D          A + D      B+E+A+D… B-0-E-1-A-0-…
##  4 AA    D     2020-01-06        17 B          D + B      B+E+A+D… B-0-E-1-A-0-…
##  5 AA    B     2020-01-23         9 A          B + A      B+E+A+D… B-0-E-1-A-0-…
##  6 AA    A     2020-02-01         5 C          A + C      B+E+A+D… B-0-E-1-A-0-…
##  7 AA    C     2020-02-06        33 E          C + E      B+E+A+D… B-0-E-1-A-0-…
##  8 AA    E     2020-03-10        26 D          E + D      B+E+A+D… B-0-E-1-A-0-…
##  9 AA    D     2020-04-05        20 E          D + E      B+E+A+D… B-0-E-1-A-0-…
## 10 AA    E     2020-04-25         8 E          E + E      B+E+A+D… B-0-E-1-A-0-…

8 Add count, order variables.

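Add the following for each id: events_in_journey (the number of events in the id's journey), order (the position of the event counting from the id's first event) and reverse_order (the position of the event counting back from the id's last event).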

# For every id add:
#   events_in_journey - the number of events the id has
#   order             - position of the event counting from the id's first event
#   reverse_order     - position counting back from the id's last event
# reverse_order is derived from order (n - order + 1) so the two always mirror
# each other. Numbering a separate desc(date) sort left same-day events in
# forward order, e.g. 32, 33, 30, 31 against order 1, 2, 3, 4.
d06df_events_and_journeys <- d05df_events_and_journeys %>%
  arrange(id, date, event) %>%
  group_by(id) %>%
  add_count(name = "events_in_journey") %>%
  mutate(
    order = row_number(),
    reverse_order = events_in_journey - order + 1L
  ) %>%
  ungroup()

head(d06df_events_and_journeys)
## # A tibble: 6 x 11
##   id    event date       lead_days lead_event event_pair journey journey_days
##   <chr> <chr> <date>         <int> <chr>      <chr>      <chr>   <chr>       
## 1 AA    B     2020-01-05         0 E          B + E      B+E+A+… B-0-E-1-A-0…
## 2 AA    E     2020-01-05         1 A          E + A      B+E+A+… B-0-E-1-A-0…
## 3 AA    A     2020-01-06         0 D          A + D      B+E+A+… B-0-E-1-A-0…
## 4 AA    D     2020-01-06        17 B          D + B      B+E+A+… B-0-E-1-A-0…
## 5 AA    B     2020-01-23         9 A          B + A      B+E+A+… B-0-E-1-A-0…
## 6 AA    A     2020-02-01         5 C          A + C      B+E+A+… B-0-E-1-A-0…
## # … with 3 more variables: events_in_journey <int>, order <int>,
## #   reverse_order <int>
tibble::glimpse(d06df_events_and_journeys)
## Rows: 20,000
## Columns: 11
## $ id                <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA…
## $ event             <chr> "B", "E", "A", "D", "B", "A", "C", "E", "D", "E", "…
## $ date              <date> 2020-01-05, 2020-01-05, 2020-01-06, 2020-01-06, 20…
## $ lead_days         <int> 0, 1, 0, 17, 9, 5, 33, 26, 20, 8, 1, 37, 6, 8, 13, …
## $ lead_event        <chr> "E", "A", "D", "B", "A", "C", "E", "D", "E", "E", "…
## $ event_pair        <chr> "B + E", "E + A", "A + D", "D + B", "B + A", "A + C…
## $ journey           <chr> "B+E+A+D+B+A+C+E+D+E+E+D+E+E+D+D+E+B+C+D+D+A+B+E+E+…
## $ journey_days      <chr> "B-0-E-1-A-0-D-17-B-9-A-5-C-33-E-26-D-20-E-8-E-1-D-…
## $ events_in_journey <int> 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,…
## $ order             <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, …
## $ reverse_order     <int> 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21,…

9 Journey length.

How many journeys of each length are there?

# Treat the journey length as a category so that each length gets its own bar.
d06df_events_and_journeys$events_in_journey <-
  as.factor(d06df_events_and_journeys$events_in_journey)

# Keep one row per id before counting. The event-level table repeats a journey
# once for every event in it, so plotting it directly would show the number of
# events, not the number of journeys, at each length.
d06df_events_and_journeys %>%
  distinct(id, events_in_journey) %>%
  ggplot(aes(x = events_in_journey)) +
  geom_bar()

10 Event Pair Frequency.

# Count how often each event pair occurs. Rows with no following event (the
# last event of each id) are dropped first, as they do not form a pair.
d07df_event_pairs <- d06df_events_and_journeys %>%
  filter(!is.na(lead_event)) %>%
  group_by(event_pair) %>%
  summarise(value = n())
## `summarise()` ungrouping output (override with `.groups` argument)

# Horizontal bars, one per event pair, ordered by frequency.
ggplot(
  d07df_event_pairs,
  aes(x = value, y = reorder(event_pair, value))
) +
  geom_bar(stat = "identity") +
  labs(
    title = "Event Pair Frequency.",
    x = "Frequency.",
    y = "Event Pair."
  ) +
  theme_bw()

11 How many Days Between Events.

# One panel per event pair, showing how many times each gap (in days) between
# the two events occurs. Rows with no following event are dropped first.
# The y axis is the count produced by geom_bar(), so it is labelled as a
# frequency rather than as the event pair (which is the facet).
d06df_events_and_journeys %>%
  filter(!is.na(lead_event)) %>%
  ggplot(aes(x = lead_days)) +
  facet_wrap(~event_pair) +
  geom_bar() +
  labs(
    title = "Days Between Pairs of Events.",
    x = "Lead Days",
    y = "Frequency."
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

12 Todo:

13 End.