This file creates a dataframe of Id, Event and Date (IED) variables. This kind of data is common in data analysis and so having a dummy set of data is useful.
Journeys are also created, which are events combined in chronological order.
For each id, the number of events and each event's forward and reverse position (order) within its journey are also created.
The var_number_of_events variable below can be modified to increase or reduce the number of event dates. The date range can also be varied to suit.
# Number of rows (event dates) to simulate; raise or lower it to resize the
# dummy data set.
var_number_of_events <- 20000

# Draw the event dates at random, with replacement, from every day between
# 2020-01-01 and 2020-11-01 (both inclusive). Change the two dates to vary
# the range. The result is a one-column data frame whose column is "date".
d01df_id_and_events <- data.frame(
  date = sample(
    seq(as.Date("2020-01-01"), as.Date("2020-11-01"), by = "day"),
    var_number_of_events,
    replace = TRUE
  )
)
head(d01df_id_and_events)
## date
## 1 2020-04-09
## 2 2020-09-20
## 3 2020-05-19
## 4 2020-08-02
## 5 2020-04-07
## 6 2020-01-05
Vary the range of letters to increase or reduce the number of event types.
d01df_id_and_events$event <- sample(LETTERS[1:5], nrow(d01df_id_and_events), replace = TRUE)
Increasing the range of letters used for the ids below increases the number of unique ids, thereby reducing the average journey length per id.
# Give every event a two-letter id. Each letter is drawn independently, with
# replacement, from the 26 upper-case letters, so at most 26 * 26 distinct
# ids can occur.
d01df_id_and_events$id <- local({
  n_rows <- nrow(d01df_id_and_events)
  first_letter <- sample(LETTERS, n_rows, replace = TRUE)
  second_letter <- sample(LETTERS, n_rows, replace = TRUE)
  paste0(first_letter, second_letter)
})
tibble::glimpse(d01df_id_and_events)
## Rows: 20,000
## Columns: 3
## $ date <date> 2020-04-09, 2020-09-20, 2020-05-19, 2020-08-02, 2020-04-07, 20…
## $ event <chr> "C", "C", "A", "A", "C", "D", "A", "D", "E", "D", "E", "E", "D"…
## $ id <chr> "TG", "YX", "KH", "QY", "ZG", "WS", "IY", "SM", "RT", "VP", "VL…
Calculate the lead_days between each consecutive event for an id.
Also create a lead_event that shows the next event in order for the id.
# Order each id's events chronologically (same-day ties broken by event
# letter) and attach to every event details of the event that follows it
# for the same id:
#   lead_days  - whole days until the next event
#   lead_event - the next event
#   event_pair - "<event> + <next event>"
# The last event of an id has no successor, so its lead_days and lead_event
# are NA and its event_pair ends in "+ NA".
d02df_id_and_events <- d01df_id_and_events %>%
  arrange(id, date, event) %>%
  relocate(id, event, date) %>%
  group_by(id) %>%
  mutate(
    lead_days = as.integer(lead(date) - date),
    lead_event = lead(event),
    event_pair = paste0(event, " + ", lead_event)
  ) %>%
  ungroup()
tibble::glimpse(d02df_id_and_events)
## Rows: 20,000
## Columns: 6
## $ id <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA"…
## $ event <chr> "B", "E", "A", "D", "B", "A", "C", "E", "D", "E", "E", "D"…
## $ date <date> 2020-01-05, 2020-01-05, 2020-01-06, 2020-01-06, 2020-01-2…
## $ lead_days <int> 0, 1, 0, 17, 9, 5, 33, 26, 20, 8, 1, 37, 6, 8, 13, 7, 6, 1…
## $ lead_event <chr> "E", "A", "D", "B", "A", "C", "E", "D", "E", "E", "D", "E"…
## $ event_pair <chr> "B + E", "E + A", "A + D", "D + B", "B + A", "A + C", "C +…
head(d02df_id_and_events)
## # A tibble: 6 x 6
## id event date lead_days lead_event event_pair
## <chr> <chr> <date> <int> <chr> <chr>
## 1 AA B 2020-01-05 0 E B + E
## 2 AA E 2020-01-05 1 A E + A
## 3 AA A 2020-01-06 0 D A + D
## 4 AA D 2020-01-06 17 B D + B
## 5 AA B 2020-01-23 9 A B + A
## 6 AA A 2020-02-01 5 C A + C
# Collapse each id's events, in chronological order, into one journey string
# such as "B+E+A" (one row per id).
# The sort is spelled out in full (id, date, event): arrange() ignores
# group_by() unless .by_group = TRUE, and using the same tie-break as the
# lead_event calculation keeps same-day events in one consistent order.
d03df_journeys <- d02df_id_and_events %>%
  arrange(id, date, event) %>%
  group_by(id) %>%
  dplyr::summarise(journey = paste(event, collapse = "+"),
                   .groups = "drop")
head(d03df_journeys, 10)
## # A tibble: 10 x 2
## id journey
## <chr> <chr>
## 1 AA B+E+A+D+B+A+C+E+D+E+E+D+E+E+D+D+E+B+C+D+D+A+B+E+E+A+E+D+A+E+C+C+C
## 2 AB C+D+A+E+A+E+A+B+E+E+E+D+D+C+C+B+C+C+B+C+A+E+E+E+B+B+D+D+B+E+C+D+D
## 3 AC A+D+D+A+C+D+D+B+E+D+A+A+D+D+D+E+A+B+C+C+D+B+A+E
## 4 AD B+E+E+E+C+B+E+B+C+D+C+E+A+E+E+D+A+C+A+A+A+C+E+E+A+C+C+A+D+B
## 5 AE E+C+D+C+A+B+B+E+A+B+B+A+E+E+A+B+C+E+D+A+D+C+E+E+C+C+B+E+B+D+D
## 6 AF C+B+E+C+E+C+E+D+C+B+E+D+E+D+E+D+B+D+A+E+D+C+B+A+A
## 7 AG A+E+A+A+E+A+C+D+B+D+C+B+E+B+D+D+D+B+D
## 8 AH C+E+D+D+D+A+B+D+D+A+D+E+D+D+B+D+E+B+B+A+D
## 9 AI B+D+C+D+A+A+E+E+E+A+C+D+E+D+E+B+D+E+B+B+E+E+C+C+E
## 10 AJ B+E+E+B+B+D+D+C+C+B+D+C+B+D+C+B+E+A+E+D+C+D+D+A+A+D+A+B+E+D+A+B+B+C+C+…
# Collapse each id's events into one string that interleaves every event with
# the number of days until the next one, e.g. "B-0-E-1-A" (one row per id).
# The sort is spelled out in full (id, date, event) because arrange() ignores
# group_by() unless .by_group = TRUE.
# The final event of an id has no next event (lead_days is NA), so it is
# written on its own; otherwise every string would end in a literal "-NA-".
d03df_journeys_lead <- d02df_id_and_events %>%
  arrange(id, date, event) %>%
  group_by(id) %>%
  dplyr::summarise(
    journey_days = paste0(
      event,
      ifelse(is.na(lead_days), "", paste0("-", lead_days, "-")),
      collapse = ""
    ),
    .groups = "drop"
  )
head(d03df_journeys_lead, 10)
## # A tibble: 10 x 2
## id journey_days
## <chr> <chr>
## 1 AA B-0-E-1-A-0-D-17-B-9-A-5-C-33-E-26-D-20-E-8-E-1-D-37-E-6-E-8-D-13-D-7-…
## 2 AB C-0-D-2-A-27-E-2-A-36-E-11-A-8-B-5-E-18-E-23-E-10-D-8-D-23-C-9-C-9-B-6…
## 3 AC A-43-D-4-D-19-A-15-C-5-D-14-D-9-B-0-E-8-D-25-A-15-A-12-D-19-D-3-D-54-E…
## 4 AD B-6-E-0-E-13-E-8-C-29-B-10-E-20-B-15-C-5-D-2-C-26-E-4-A-1-E-5-E-1-D-5-…
## 5 AE E-14-C-6-D-3-C-17-A-5-B-9-B-13-E-1-A-9-B-14-B-10-A-18-E-4-E-12-A-14-B-…
## 6 AF C-6-B-5-E-4-C-2-E-11-C-1-E-3-D-14-C-21-B-5-E-13-D-5-E-3-D-0-E-38-D-30-…
## 7 AG A-1-E-7-A-6-A-2-E-5-A-34-C-2-D-6-B-12-D-18-C-33-B-4-E-19-B-16-D-52-D-1…
## 8 AH C-25-E-31-D-9-D-14-D-18-A-0-B-18-D-8-D-18-A-7-D-8-E-1-D-5-D-21-B-0-D-5…
## 9 AI B-14-D-10-C-8-D-16-A-3-A-24-E-2-E-4-E-24-A-6-C-5-D-9-E-5-D-1-E-11-B-4-…
## 10 AJ B-1-E-6-E-7-B-7-B-4-D-16-D-5-C-1-C-4-B-26-D-1-C-22-B-6-D-19-C-1-B-3-E-…
Join the journey and the journey-with-days strings back onto the events.
This is so you can see the total journey for an id alongside each of its events.
# Attach each id's journey string to every one of its events. The join key is
# named explicitly so the join cannot silently widen to other shared columns
# (and no "Joining, by = ..." message is emitted).
d04df_events_and_journeys <- d02df_id_and_events %>%
  left_join(d03df_journeys, by = "id") %>%
  arrange(id, date, event)

# Likewise attach each id's journey-with-days string.
d05df_events_and_journeys <- d04df_events_and_journeys %>%
  left_join(d03df_journeys_lead, by = "id") %>%
  arrange(id, date, event)
head(d05df_events_and_journeys, 10)
## # A tibble: 10 x 8
## id event date lead_days lead_event event_pair journey journey_days
## <chr> <chr> <date> <int> <chr> <chr> <chr> <chr>
## 1 AA B 2020-01-05 0 E B + E B+E+A+D… B-0-E-1-A-0-…
## 2 AA E 2020-01-05 1 A E + A B+E+A+D… B-0-E-1-A-0-…
## 3 AA A 2020-01-06 0 D A + D B+E+A+D… B-0-E-1-A-0-…
## 4 AA D 2020-01-06 17 B D + B B+E+A+D… B-0-E-1-A-0-…
## 5 AA B 2020-01-23 9 A B + A B+E+A+D… B-0-E-1-A-0-…
## 6 AA A 2020-02-01 5 C A + C B+E+A+D… B-0-E-1-A-0-…
## 7 AA C 2020-02-06 33 E C + E B+E+A+D… B-0-E-1-A-0-…
## 8 AA E 2020-03-10 26 D E + D B+E+A+D… B-0-E-1-A-0-…
## 9 AA D 2020-04-05 20 E D + E B+E+A+D… B-0-E-1-A-0-…
## 10 AA E 2020-04-25 8 E E + E B+E+A+D… B-0-E-1-A-0-…
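Add the following for each id: the number of events in its journey, and each event's forward and reverse order within that journey.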
# For every event add, per id:
#   events_in_journey - number of events the id has
#   order             - position of the event counting from the id's first
#   reverse_order     - position of the event counting from the id's last
# reverse_order is derived from order rather than from a second, descending
# sort: re-sorting by desc(date) leaves same-day events in their forward
# order, so the two numberings would not mirror each other for those events.
d06df_events_and_journeys <- d05df_events_and_journeys %>%
  arrange(id, date, event) %>%
  group_by(id) %>%
  mutate(
    events_in_journey = n(),
    order = row_number(),
    reverse_order = events_in_journey - order + 1L
  ) %>%
  ungroup()
head(d06df_events_and_journeys)
## # A tibble: 6 x 11
## id event date lead_days lead_event event_pair journey journey_days
## <chr> <chr> <date> <int> <chr> <chr> <chr> <chr>
## 1 AA B 2020-01-05 0 E B + E B+E+A+… B-0-E-1-A-0…
## 2 AA E 2020-01-05 1 A E + A B+E+A+… B-0-E-1-A-0…
## 3 AA A 2020-01-06 0 D A + D B+E+A+… B-0-E-1-A-0…
## 4 AA D 2020-01-06 17 B D + B B+E+A+… B-0-E-1-A-0…
## 5 AA B 2020-01-23 9 A B + A B+E+A+… B-0-E-1-A-0…
## 6 AA A 2020-02-01 5 C A + C B+E+A+… B-0-E-1-A-0…
## # … with 3 more variables: events_in_journey <int>, order <int>,
## # reverse_order <int>
tibble::glimpse(d06df_events_and_journeys)
## Rows: 20,000
## Columns: 11
## $ id <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA…
## $ event <chr> "B", "E", "A", "D", "B", "A", "C", "E", "D", "E", "…
## $ date <date> 2020-01-05, 2020-01-05, 2020-01-06, 2020-01-06, 20…
## $ lead_days <int> 0, 1, 0, 17, 9, 5, 33, 26, 20, 8, 1, 37, 6, 8, 13, …
## $ lead_event <chr> "E", "A", "D", "B", "A", "C", "E", "D", "E", "E", "…
## $ event_pair <chr> "B + E", "E + A", "A + D", "D + B", "B + A", "A + C…
## $ journey <chr> "B+E+A+D+B+A+C+E+D+E+E+D+E+E+D+D+E+B+C+D+D+A+B+E+E+…
## $ journey_days <chr> "B-0-E-1-A-0-D-17-B-9-A-5-C-33-E-26-D-20-E-8-E-1-D-…
## $ events_in_journey <int> 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,…
## $ order <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, …
## $ reverse_order <int> 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21,…
How many journeys of each length are there?
# Treat the journey length as a category so that each length gets its own bar.
d06df_events_and_journeys$events_in_journey <-
  as.factor(d06df_events_and_journeys$events_in_journey)

# Bar chart of the number of journeys of each length. The data has one row
# per event, so it is first reduced to one row per id; otherwise each bar
# would count events rather than journeys.
d06df_events_and_journeys %>%
  distinct(id, events_in_journey) %>%
  ggplot(aes(x = events_in_journey)) +
  geom_bar()

# Number of times each event pair occurs (column "value"). Rows without a
# next event are dropped because they do not form a pair.
d07df_event_pairs <- d06df_events_and_journeys %>%
  filter(!is.na(lead_event)) %>%
  count(event_pair, name = "value")
# Horizontal bar chart of how often each event pair occurs; pairs are ordered
# by frequency, with the most frequent pair at the top.
ggplot(d07df_event_pairs,
       aes(x = value, y = reorder(event_pair, value))) +
  geom_col() +
  labs(title = "Event Pair Frequency.",
       x = "Frequency.",
       y = "Event Pair.") +
  theme_bw()

# Distribution of the number of days between the two events of a pair, with
# one panel per event pair. Rows without a next event are dropped. The y axis
# is a count of occurrences, so it is labelled as a frequency.
d06df_events_and_journeys %>%
  filter(!is.na(lead_event)) %>%
  ggplot(aes(x = lead_days)) +
  facet_wrap(~event_pair) +
  geom_bar() +
  labs(title = "Days Between Pairs of Events.",
       x = "Lead Days",
       y = "Frequency.") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))