MNL Data Prep
This R Markdown document demonstrates how to expand a crash-level dataset so that each original crash record is repeated three times, corresponding to the three severity categories:
1 = O2 = BC3 = KA
The workflow includes:
- creating a small example dataset,
- expanding each row three times,
- generating severity indicator variables,
- providing an alternative base R approach, and
- reading from a CSV file and exporting the expanded output.
Example 1: Small Illustrative Dataset
# Create a small example dataset
df <- data.frame(
Crash_ID = c(1,2,3,4,5,6,7,8,9,10,25),
Cluster = c(1,1,3,1,3,3,3,3,3,3,3),
Crash_Sev = c("BC","BC","O","O","BC","BC","BC","BC","BC","BC","KA"),
stringsAsFactors = FALSE
)
# Display original data
df## Crash_ID Cluster Crash_Sev
## 1 1 1 BC
## 2 2 1 BC
## 3 3 3 O
## 4 4 1 O
## 5 5 3 BC
## 6 6 3 BC
## 7 7 3 BC
## 8 8 3 BC
## 9 9 3 BC
## 10 10 3 BC
## 11 25 3 KA
# Convert crash severity to numeric category
# O = 1, BC = 2, KA = 3
df <- df %>%
mutate(
Crash_Sev_Cat = case_when(
Crash_Sev == "O" ~ 1,
Crash_Sev == "BC" ~ 2,
Crash_Sev == "KA" ~ 3,
TRUE ~ NA_real_
)
)
# Repeat each crash record 3 times
##################################################
####################### METHOD 01
##################################################
df2 <- df %>%
crossing(CrSev = 1:3) %>%
mutate(
# Indicator for whether the candidate severity matches the observed severity
Severity = if_else(CrSev == Crash_Sev_Cat, 1, 0),
# Dummy variables for the three severity levels
InO = if_else(CrSev == 1, 1, 0),
InBC = if_else(CrSev == 2, 1, 0),
InKA = if_else(CrSev == 3, 1, 0)
) %>%
arrange(Crash_ID, CrSev)
# Display expanded data
print(df2, n = Inf)## # A tidytable: 33 × 9
## Crash_ID Cluster Crash_Sev Crash_Sev_Cat CrSev Severity InO InBC InKA
## <dbl> <dbl> <chr> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 BC 2 1 0 1 0 0
## 2 1 1 BC 2 2 1 0 1 0
## 3 1 1 BC 2 3 0 0 0 1
## 4 2 1 BC 2 1 0 1 0 0
## 5 2 1 BC 2 2 1 0 1 0
## 6 2 1 BC 2 3 0 0 0 1
## 7 3 3 O 1 1 1 1 0 0
## 8 3 3 O 1 2 0 0 1 0
## 9 3 3 O 1 3 0 0 0 1
## 10 4 1 O 1 1 1 1 0 0
## 11 4 1 O 1 2 0 0 1 0
## 12 4 1 O 1 3 0 0 0 1
## 13 5 3 BC 2 1 0 1 0 0
## 14 5 3 BC 2 2 1 0 1 0
## 15 5 3 BC 2 3 0 0 0 1
## 16 6 3 BC 2 1 0 1 0 0
## 17 6 3 BC 2 2 1 0 1 0
## 18 6 3 BC 2 3 0 0 0 1
## 19 7 3 BC 2 1 0 1 0 0
## 20 7 3 BC 2 2 1 0 1 0
## 21 7 3 BC 2 3 0 0 0 1
## 22 8 3 BC 2 1 0 1 0 0
## 23 8 3 BC 2 2 1 0 1 0
## 24 8 3 BC 2 3 0 0 0 1
## 25 9 3 BC 2 1 0 1 0 0
## 26 9 3 BC 2 2 1 0 1 0
## 27 9 3 BC 2 3 0 0 0 1
## 28 10 3 BC 2 1 0 1 0 0
## 29 10 3 BC 2 2 1 0 1 0
## 30 10 3 BC 2 3 0 0 0 1
## 31 25 3 KA 3 1 0 1 0 0
## 32 25 3 KA 3 2 0 0 1 0
## 33 25 3 KA 3 3 1 0 0 1
##################################################
####################### METHOD 02
##################################################
df2 <- df %>%
mutate(
Crash_Sev_Cat = dplyr::case_when(
Crash_Sev == "O" ~ 1,
Crash_Sev == "BC" ~ 2,
Crash_Sev == "KA" ~ 3,
TRUE ~ NA_real_
)
) %>%
tidyr::crossing(CrSev = 1:3) %>%
mutate(
Severity = dplyr::if_else(CrSev == Crash_Sev_Cat, 1, 0),
InO = dplyr::if_else(CrSev == 1, 1, 0),
InBC = dplyr::if_else(CrSev == 2, 1, 0),
InKA = dplyr::if_else(CrSev == 3, 1, 0)
) %>%
arrange(Crash_ID, CrSev)
df2## # A tidytable: 33 × 9
## Crash_ID Cluster Crash_Sev Crash_Sev_Cat CrSev Severity InO InBC InKA
## <dbl> <dbl> <chr> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 BC 2 1 0 1 0 0
## 2 1 1 BC 2 2 1 0 1 0
## 3 1 1 BC 2 3 0 0 0 1
## 4 2 1 BC 2 1 0 1 0 0
## 5 2 1 BC 2 2 1 0 1 0
## 6 2 1 BC 2 3 0 0 0 1
## 7 3 3 O 1 1 1 1 0 0
## 8 3 3 O 1 2 0 0 1 0
## 9 3 3 O 1 3 0 0 0 1
## 10 4 1 O 1 1 1 1 0 0
## # ℹ 23 more rows
## # A tidytable: 33 × 9
## Crash_ID Cluster Crash_Sev Crash_Sev_Cat CrSev Severity InO InBC InKA
## <dbl> <dbl> <chr> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 BC 2 1 0 1 0 0
## 2 1 1 BC 2 2 1 0 1 0
## 3 1 1 BC 2 3 0 0 0 1
## 4 2 1 BC 2 1 0 1 0 0
## 5 2 1 BC 2 2 1 0 1 0
## 6 2 1 BC 2 3 0 0 0 1
## 7 3 3 O 1 1 1 1 0 0
## 8 3 3 O 1 2 0 0 1 0
## 9 3 3 O 1 3 0 0 0 1
## 10 4 1 O 1 1 1 1 0 0
## 11 4 1 O 1 2 0 0 1 0
## 12 4 1 O 1 3 0 0 0 1
## 13 5 3 BC 2 1 0 1 0 0
## 14 5 3 BC 2 2 1 0 1 0
## 15 5 3 BC 2 3 0 0 0 1
## 16 6 3 BC 2 1 0 1 0 0
## 17 6 3 BC 2 2 1 0 1 0
## 18 6 3 BC 2 3 0 0 0 1
## 19 7 3 BC 2 1 0 1 0 0
## 20 7 3 BC 2 2 1 0 1 0
## 21 7 3 BC 2 3 0 0 0 1
## 22 8 3 BC 2 1 0 1 0 0
## 23 8 3 BC 2 2 1 0 1 0
## 24 8 3 BC 2 3 0 0 0 1
## 25 9 3 BC 2 1 0 1 0 0
## 26 9 3 BC 2 2 1 0 1 0
## 27 9 3 BC 2 3 0 0 0 1
## 28 10 3 BC 2 1 0 1 0 0
## 29 10 3 BC 2 2 1 0 1 0
## 30 10 3 BC 2 3 0 0 0 1
## 31 25 3 KA 3 1 0 1 0 0
## 32 25 3 KA 3 2 0 0 1 0
## 33 25 3 KA 3 3 1 0 0 1
##################################################
####################### METHOD 03
##################################################
df$Crash_Sev_Cat <- c(O = 1, BC = 2, KA = 3)[df$Crash_Sev]
df2 <- df[rep(seq_len(nrow(df)), each = 3), ]
df2$CrSev <- rep(1:3, times = nrow(df))
df2$Severity <- ifelse(df2$CrSev == df2$Crash_Sev_Cat, 1, 0)
df2$InO <- ifelse(df2$CrSev == 1, 1, 0)
df2$InBC <- ifelse(df2$CrSev == 2, 1, 0)
df2$InKA <- ifelse(df2$CrSev == 3, 1, 0)
row.names(df2) <- NULL
print(df2, n = Inf)## # A tidytable: 33 × 9
## Crash_ID Cluster Crash_Sev Crash_Sev_Cat CrSev Severity InO InBC InKA
## <dbl> <dbl> <chr> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 BC 2 1 0 1 0 0
## 2 1 1 BC 2 2 1 0 1 0
## 3 1 1 BC 2 3 0 0 0 1
## 4 2 1 BC 2 1 0 1 0 0
## 5 2 1 BC 2 2 1 0 1 0
## 6 2 1 BC 2 3 0 0 0 1
## 7 3 3 O 1 1 1 1 0 0
## 8 3 3 O 1 2 0 0 1 0
## 9 3 3 O 1 3 0 0 0 1
## 10 4 1 O 1 1 1 1 0 0
## 11 4 1 O 1 2 0 0 1 0
## 12 4 1 O 1 3 0 0 0 1
## 13 5 3 BC 2 1 0 1 0 0
## 14 5 3 BC 2 2 1 0 1 0
## 15 5 3 BC 2 3 0 0 0 1
## 16 6 3 BC 2 1 0 1 0 0
## 17 6 3 BC 2 2 1 0 1 0
## 18 6 3 BC 2 3 0 0 0 1
## 19 7 3 BC 2 1 0 1 0 0
## 20 7 3 BC 2 2 1 0 1 0
## 21 7 3 BC 2 3 0 0 0 1
## 22 8 3 BC 2 1 0 1 0 0
## 23 8 3 BC 2 2 1 0 1 0
## 24 8 3 BC 2 3 0 0 0 1
## 25 9 3 BC 2 1 0 1 0 0
## 26 9 3 BC 2 2 1 0 1 0
## 27 9 3 BC 2 3 0 0 0 1
## 28 10 3 BC 2 1 0 1 0 0
## 29 10 3 BC 2 2 1 0 1 0
## 30 10 3 BC 2 3 0 0 0 1
## 31 25 3 KA 3 1 0 1 0 0
## 32 25 3 KA 3 2 0 0 1 0
## 33 25 3 KA 3 3 1 0 0 1
Example 2: Any Dataset
# Read CSV file
# Replace "Moto4.csv" with your file name
df_csv <- read.csv("Moto4.csv", stringsAsFactors = FALSE)
datatable(
df_csv, extensions = 'Buttons', options = list(
dom = 'Bfrtip',
buttons = c('csv', 'excel')
)
)## CRASH_VEH_NUM1 SpLim Align Eject Access Road Age Int Injury
## 1 LA14_1310110334237182 0 0 1 0 0 0 0 3
## 2 LA14_1310200250243592 0 0 1 0 0 0 1 3
## 3 LA14_1401011516425592 1 0 0 0 1 0 0 3
## 4 LA14_1401030944055782 0 0 0 1 0 0 1 3
## 5 LA14_1401031918472811 0 0 0 0 0 0 1 3
## 6 LA14_1401050857473642 0 0 1 0 1 0 1 3
# Considering your dataset already has a numeric severity column such as Injury:
# 1 = O, 2 = BC, 3 = KA
df_expanded <- df_csv %>%
crossing(CrSev = 1:3) %>%
mutate(
# Severity equals 1 when CrSev matches the observed severity category
Severity = if_else(CrSev == Injury, 1, 0),
# Severity dummy variables
InO = if_else(CrSev == 1, 1, 0),
InBC = if_else(CrSev == 2, 1, 0),
InKA = if_else(CrSev == 3, 1, 0)
) %>%
arrange(CRASH_VEH_NUM1, CrSev)
# Display first few rows of expanded data
print(df_expanded, n = 100, width = Inf)## # A tidytable: 35,799 × 14
## CRASH_VEH_NUM1 SpLim Align Eject Access Road Age Int Injury
## <chr> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 LA14_1310110334237182 0 0 1 0 0 0 0 3
## 2 LA14_1310110334237182 0 0 1 0 0 0 0 3
## 3 LA14_1310110334237182 0 0 1 0 0 0 0 3
## 4 LA14_1310200250243592 0 0 1 0 0 0 1 3
## 5 LA14_1310200250243592 0 0 1 0 0 0 1 3
## 6 LA14_1310200250243592 0 0 1 0 0 0 1 3
## 7 LA14_1401011516425592 1 0 0 0 1 0 0 3
## 8 LA14_1401011516425592 1 0 0 0 1 0 0 3
## 9 LA14_1401011516425592 1 0 0 0 1 0 0 3
## 10 LA14_1401030944055782 0 0 0 1 0 0 1 3
## 11 LA14_1401030944055782 0 0 0 1 0 0 1 3
## 12 LA14_1401030944055782 0 0 0 1 0 0 1 3
## 13 LA14_1401031918472811 0 0 0 0 0 0 1 3
## 14 LA14_1401031918472811 0 0 0 0 0 0 1 3
## 15 LA14_1401031918472811 0 0 0 0 0 0 1 3
## 16 LA14_1401050857473642 0 0 1 0 1 0 1 3
## 17 LA14_1401050857473642 0 0 1 0 1 0 1 3
## 18 LA14_1401050857473642 0 0 1 0 1 0 1 3
## 19 LA14_1401051443077561 0 0 0 0 1 1 1 3
## 20 LA14_1401051443077561 0 0 0 0 1 1 1 3
## 21 LA14_1401051443077561 0 0 0 0 1 1 1 3
## 22 LA14_1401071126026871 1 0 1 1 0 0 0 2
## 23 LA14_1401071126026871 1 0 1 1 0 0 0 2
## 24 LA14_1401071126026871 1 0 1 1 0 0 0 2
## 25 LA14_1401071719483421 1 0 0 0 1 0 0 2
## 26 LA14_1401071719483421 1 0 0 0 1 0 0 2
## 27 LA14_1401071719483421 1 0 0 0 1 0 0 2
## 28 LA14_1401081314590931 0 0 1 1 0 1 1 3
## 29 LA14_1401081314590931 0 0 1 1 0 1 1 3
## 30 LA14_1401081314590931 0 0 1 1 0 1 1 3
## 31 LA14_1401081924345451 0 0 1 0 1 1 0 3
## 32 LA14_1401081924345451 0 0 1 0 1 1 0 3
## 33 LA14_1401081924345451 0 0 1 0 1 1 0 3
## 34 LA14_1401091542368501 1 1 0 0 0 0 0 2
## 35 LA14_1401091542368501 1 1 0 0 0 0 0 2
## 36 LA14_1401091542368501 1 1 0 0 0 0 0 2
## 37 LA14_1401091927136482 1 0 0 0 1 0 0 2
## 38 LA14_1401091927136482 1 0 0 0 1 0 0 2
## 39 LA14_1401091927136482 1 0 0 0 1 0 0 2
## 40 LA14_1401101206117652 0 0 1 0 0 0 1 3
## 41 LA14_1401101206117652 0 0 1 0 0 0 1 3
## 42 LA14_1401101206117652 0 0 1 0 0 0 1 3
## 43 LA14_1401101601337942 0 0 1 0 1 0 1 3
## 44 LA14_1401101601337942 0 0 1 0 1 0 1 3
## 45 LA14_1401101601337942 0 0 1 0 1 0 1 3
## 46 LA14_1401110941031251 0 0 1 0 0 0 1 3
## 47 LA14_1401110941031251 0 0 1 0 0 0 1 3
## 48 LA14_1401110941031251 0 0 1 0 0 0 1 3
## 49 LA14_1401111708118541 0 1 0 1 0 1 0 2
## 50 LA14_1401111708118541 0 1 0 1 0 1 0 2
## 51 LA14_1401111708118541 0 1 0 1 0 1 0 2
## 52 LA14_1401112117480001 0 0 1 0 1 0 1 2
## 53 LA14_1401112117480001 0 0 1 0 1 0 1 2
## 54 LA14_1401112117480001 0 0 1 0 1 0 1 2
## 55 LA14_1401121341001201 0 1 0 0 1 0 0 3
## 56 LA14_1401121341001201 0 1 0 0 1 0 0 3
## 57 LA14_1401121341001201 0 1 0 0 1 0 0 3
## 58 LA14_1401121704511402 0 0 0 0 1 0 1 2
## 59 LA14_1401121704511402 0 0 0 0 1 0 1 2
## 60 LA14_1401121704511402 0 0 0 0 1 0 1 2
## 61 LA14_1401122029344591 0 0 0 1 0 0 0 3
## 62 LA14_1401122029344591 0 0 0 1 0 0 0 3
## 63 LA14_1401122029344591 0 0 0 1 0 0 0 3
## 64 LA14_1401122121426961 0 1 1 0 1 1 0 3
## 65 LA14_1401122121426961 0 1 1 0 1 1 0 3
## 66 LA14_1401122121426961 0 1 1 0 1 1 0 3
## 67 LA14_1401130431114861 0 1 0 0 0 0 0 3
## 68 LA14_1401130431114861 0 1 0 0 0 0 0 3
## 69 LA14_1401130431114861 0 1 0 0 0 0 0 3
## 70 LA14_1401130717385311 0 0 0 0 0 0 0 3
## 71 LA14_1401130717385311 0 0 0 0 0 0 0 3
## 72 LA14_1401130717385311 0 0 0 0 0 0 0 3
## 73 LA14_1401131034096441 0 0 1 0 1 0 0 1
## 74 LA14_1401131034096441 0 0 1 0 1 0 0 1
## 75 LA14_1401131034096441 0 0 1 0 1 0 0 1
## 76 LA14_1401140826105372 0 0 0 1 0 0 1 2
## 77 LA14_1401140826105372 0 0 0 1 0 0 1 2
## 78 LA14_1401140826105372 0 0 0 1 0 0 1 2
## 79 LA14_1401141209356441 1 0 0 0 1 0 0 2
## 80 LA14_1401141209356441 1 0 0 0 1 0 0 2
## 81 LA14_1401141209356441 1 0 0 0 1 0 0 2
## 82 LA14_1401141703461091 0 0 0 1 0 1 1 2
## 83 LA14_1401141703461091 0 0 0 1 0 1 1 2
## 84 LA14_1401141703461091 0 0 0 1 0 1 1 2
## 85 LA14_1401142228121521 1 0 0 0 1 0 1 1
## 86 LA14_1401142228121521 1 0 0 0 1 0 1 1
## 87 LA14_1401142228121521 1 0 0 0 1 0 1 1
## 88 LA14_1401161929477541 1 1 0 0 1 0 0 3
## 89 LA14_1401161929477541 1 1 0 0 1 0 0 3
## 90 LA14_1401161929477541 1 1 0 0 1 0 0 3
## 91 LA14_1401171256051901 0 1 0 0 1 0 0 2
## 92 LA14_1401171256051901 0 1 0 0 1 0 0 2
## 93 LA14_1401171256051901 0 1 0 0 1 0 0 2
## 94 LA14_1401171512395931 0 0 1 0 0 1 0 2
## 95 LA14_1401171512395931 0 0 1 0 0 1 0 2
## 96 LA14_1401171512395931 0 0 1 0 0 1 0 2
## 97 LA14_1401171810153751 1 0 0 0 0 0 0 3
## 98 LA14_1401171810153751 1 0 0 0 0 0 0 3
## 99 LA14_1401171810153751 1 0 0 0 0 0 0 3
## 100 LA14_1401171911460001 0 0 1 0 0 0 0 2
## CrSev Severity InO InBC InKA
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 0 1 0 0
## 2 2 0 0 1 0
## 3 3 1 0 0 1
## 4 1 0 1 0 0
## 5 2 0 0 1 0
## 6 3 1 0 0 1
## 7 1 0 1 0 0
## 8 2 0 0 1 0
## 9 3 1 0 0 1
## 10 1 0 1 0 0
## 11 2 0 0 1 0
## 12 3 1 0 0 1
## 13 1 0 1 0 0
## 14 2 0 0 1 0
## 15 3 1 0 0 1
## 16 1 0 1 0 0
## 17 2 0 0 1 0
## 18 3 1 0 0 1
## 19 1 0 1 0 0
## 20 2 0 0 1 0
## 21 3 1 0 0 1
## 22 1 0 1 0 0
## 23 2 1 0 1 0
## 24 3 0 0 0 1
## 25 1 0 1 0 0
## 26 2 1 0 1 0
## 27 3 0 0 0 1
## 28 1 0 1 0 0
## 29 2 0 0 1 0
## 30 3 1 0 0 1
## 31 1 0 1 0 0
## 32 2 0 0 1 0
## 33 3 1 0 0 1
## 34 1 0 1 0 0
## 35 2 1 0 1 0
## 36 3 0 0 0 1
## 37 1 0 1 0 0
## 38 2 1 0 1 0
## 39 3 0 0 0 1
## 40 1 0 1 0 0
## 41 2 0 0 1 0
## 42 3 1 0 0 1
## 43 1 0 1 0 0
## 44 2 0 0 1 0
## 45 3 1 0 0 1
## 46 1 0 1 0 0
## 47 2 0 0 1 0
## 48 3 1 0 0 1
## 49 1 0 1 0 0
## 50 2 1 0 1 0
## 51 3 0 0 0 1
## 52 1 0 1 0 0
## 53 2 1 0 1 0
## 54 3 0 0 0 1
## 55 1 0 1 0 0
## 56 2 0 0 1 0
## 57 3 1 0 0 1
## 58 1 0 1 0 0
## 59 2 1 0 1 0
## 60 3 0 0 0 1
## 61 1 0 1 0 0
## 62 2 0 0 1 0
## 63 3 1 0 0 1
## 64 1 0 1 0 0
## 65 2 0 0 1 0
## 66 3 1 0 0 1
## 67 1 0 1 0 0
## 68 2 0 0 1 0
## 69 3 1 0 0 1
## 70 1 0 1 0 0
## 71 2 0 0 1 0
## 72 3 1 0 0 1
## 73 1 1 1 0 0
## 74 2 0 0 1 0
## 75 3 0 0 0 1
## 76 1 0 1 0 0
## 77 2 1 0 1 0
## 78 3 0 0 0 1
## 79 1 0 1 0 0
## 80 2 1 0 1 0
## 81 3 0 0 0 1
## 82 1 0 1 0 0
## 83 2 1 0 1 0
## 84 3 0 0 0 1
## 85 1 1 1 0 0
## 86 2 0 0 1 0
## 87 3 0 0 0 1
## 88 1 0 1 0 0
## 89 2 0 0 1 0
## 90 3 1 0 0 1
## 91 1 0 1 0 0
## 92 2 1 0 1 0
## 93 3 0 0 0 1
## 94 1 0 1 0 0
## 95 2 1 0 1 0
## 96 3 0 0 0 1
## 97 1 0 1 0 0
## 98 2 0 0 1 0
## 99 3 1 0 0 1
## 100 1 0 1 0 0
## # ℹ 35,699 more rows