# Read Data ----
library(data.table)
library(tidytext)
library(dplyr)
library(ggplot2)
library(tidyr)
# Pull the CSV directly from GitHub and show which columns it provides.
mydat <- fread(
  input = "https://raw.githubusercontent.com/subasish/test1/master/ADis2.csv"
)
colnames(mydat)
## [1] "Business_Name" "Date" "Year" "Location_City"
## [5] "Vehicle_Was" "Pre_Crash" "Damage_Detail"
table(mydat$Year)
##
## 2016 2017 2018
## 15 26 74
# Number of records per reporting company.
table(mydat[["Business_Name"]])
##
## Apple inc Aurora Innovation Inc
## 2 1
## Autora Innovation Cruise
## 1 57
## Cruise Automation Inc. Drive.ai Inc
## 1 1
## Google Google Auto LLC
## 3 12
## Jingchi.ai Nissan North America Inc.
## 1 1
## Toyota Research Institute UATC LLC
## 1 1
## Waymo Auto LLC Waymo LLC
## 1 25
## Zoox
## 7
# Number of records per city.
table(mydat[["Location_City"]])
##
## Crockett Los Altos
## 3 1 2
## Mountain Veiw Mountain View Palo Alto
## 1 23 15
## San Francisco San Jose South San Francisco
## 63 1 1
## Sunnyvale Sunnywale
## 3 2
table(mydat$Vehicle_Was)
##
## Minor Moving
## 2 1 70
## Stopped in Traffic
## 42
table(mydat$Pre_Crash)
##
## Autonomous Conventional
## 73 42
# Each table pairs the damage narrative with one grouping column.
# Columns are picked by name; these are the same columns the positional
# indices 1, 3, 5, 6 and 7 refer to in the names(mydat) listing.

# Company: only the "Cruise" and "Waymo LLC" rows.
bness <- subset(
  mydat,
  Business_Name %in% c("Cruise", "Waymo LLC"),
  select = c(Business_Name, Damage_Detail)
)

# Vehicle state: only the "Moving" and "Stopped in Traffic" rows.
was <- subset(
  mydat,
  Vehicle_Was %in% c("Moving", "Stopped in Traffic"),
  select = c(Vehicle_Was, Damage_Detail)
)

# Year and pre-crash driving mode: all rows.
year <- subset(mydat, select = c(Year, Damage_Detail))
pre <- subset(mydat, select = c(Pre_Crash, Damage_Detail))
# Basic Text Mining ----
### Mode
# Build one "document" per pre-crash mode by pasting together every damage
# narrative recorded for that mode.
#
# The labels are taken from the tapply() result itself. tapply() returns its
# groups in sorted order, whereas unique() returns values in order of first
# appearance, so pairing unique(pre$Pre_Crash) with the tapply() output can
# attach a label to the other mode's text whenever the data are not already
# in sorted order.
damage_by_mode <- tapply(
  pre$Damage_Detail, pre$Pre_Crash, paste, collapse = " "
)
biss01 <- data.frame(
  Pre_Crash = names(damage_by_mode),
  Damage = damage_by_mode
)
# Break each mode's combined narrative into one word per row.
biss02 <- unnest_tokens(biss01, word, Damage)

# Load the stop-word table and discard every token that appears in it.
data(stop_words)
biss03 <- anti_join(biss02, stop_words)

# Frequency of the remaining words, most frequent first.
count(biss03, word, sort = TRUE)
## # A tibble: 1,054 x 2
## word n
## <chr> <int>
## 1 av 341
## 2 vehicle 274
## 3 cruise 266
## 4 autonomous 177
## 5 mode 117
## 6 rear 113
## 7 street 113
## 8 waymo 108
## 9 left 100
## 10 lane 97
## # ... with 1,044 more rows
# Horizontal bar chart of the words that occur more than 50 times,
# with bars ordered by frequency.
biss03 %>%
  count(word, sort = TRUE) %>%
  filter(n > 50) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL) +
  theme_bw()

# Tokenize each mode's narrative into consecutive two-word sequences.
bigram01 <- unnest_tokens(biss01, bigram, Damage, token = "ngrams", n = 2)

# Raw bigram frequencies (stop words still included), most frequent first.
count(bigram01, bigram, sort = TRUE)
## # A tibble: 4,069 x 2
## bigram n
## <chr> <int>
## 1 cruise av 176
## 2 the cruise 152
## 3 of the 143
## 4 autonomous vehicle 96
## 5 at the 95
## 6 autonomous mode 77
## 7 in autonomous 74
## 8 no injuries 74
## 9 av was 71
## 10 waymo av 71
## # ... with 4,059 more rows
# Split every bigram into its two component words.
bigrams_separated <- separate(
  bigram01, bigram,
  into = c("word1", "word2"), sep = " "
)

# Keep only bigrams in which neither word is a stop word.
bigrams_filtered <- filter(
  bigrams_separated,
  !word1 %in% stop_words$word,
  !word2 %in% stop_words$word
)

# Frequency of the surviving word pairs, most frequent first.
bigram_counts <- count(bigrams_filtered, word1, word2, sort = TRUE)
bigram_counts
## # A tibble: 1,185 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 cruise av 176
## 2 autonomous vehicle 96
## 3 autonomous mode 77
## 4 waymo av 71
## 5 google av 63
## 6 vehicle cruise 52
## 7 cruise autonomous 50
## 8 minor damage 41
## 9 rear bumper 39
## 10 av operating 31
## # ... with 1,175 more rows
# Glue the filtered word pairs back into a single space-separated bigram.
bigrams_united <- unite(bigrams_filtered, bigram, word1, word2, sep = " ")

# tf-idf of each bigram, treating every Pre_Crash value as one document;
# rows are sorted with the highest tf-idf first.
bigram_tf_idf <-
  count(bigrams_united, Pre_Crash, bigram) %>%
  bind_tf_idf(term = bigram, document = Pre_Crash, n = n) %>%
  arrange(desc(tf_idf))
bigram_tf_idf
## # A tibble: 1,346 x 6
## Pre_Crash bigram n tf idf tf_idf
## <fct> <chr> <int> <dbl> <dbl> <dbl>
## 1 Autonomous manual mode 19 0.0179 0.693 0.0124
## 2 Autonomous aurora test 7 0.00660 0.693 0.00457
## 3 Autonomous mason st 5 0.00471 0.693 0.00327
## 4 Autonomous rear passenger 5 0.00471 0.693 0.00327
## 5 Autonomous subject vehicle 5 0.00471 0.693 0.00327
## 6 Autonomous traveling eastbound 5 0.00471 0.693 0.00327
## 7 Conventional san antonio 8 0.00455 0.693 0.00316
## 8 Conventional antonio road 7 0.00398 0.693 0.00276
## 9 Conventional cmise av 7 0.00398 0.693 0.00276
## 10 Autonomous 2 lane 4 0.00377 0.693 0.00261
## # ... with 1,336 more rows
# Faceted bar chart of the 15 top-ranked bigrams for each pre-crash mode.
#
# `word` is a factor whose levels are the bigrams in reverse tf-idf order, so
# a higher level means a higher tf-idf. The original called top_n(15) without
# `wt`, which falls back to the last column of the table -- here `word` --
# and prints a "Selecting by word" message. Naming the column explicitly
# keeps the same rows selected, while no longer depending on `word` being
# the last column.
bigram_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(bigram, levels = rev(unique(bigram)))) %>%
  group_by(Pre_Crash) %>%
  top_n(15, wt = word) %>%
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = Pre_Crash)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  theme_bw() +
  facet_wrap(~Pre_Crash, ncol = 2, scales = "free") +
  coord_flip()

# Sentiment Analysis ----
# Attach a Bing-lexicon sentiment label to every stop-word-free token that
# the lexicon contains, then count each (word, sentiment) pair, most frequent
# first. The join key is left implicit, so dplyr reports the column it used.
bing_word_counts <-
  inner_join(biss03, get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
bing_word_counts
## # A tibble: 62 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 autonomous positive 177
## 2 damage negative 79
## 3 struck negative 18
## 4 damaging negative 10
## 5 slowly negative 6
## 6 overtake positive 5
## 7 advanced positive 4
## 8 fell negative 4
## 9 lack negative 4
## 10 stiffness negative 4
## # ... with 52 more rows
# Bar chart of the top-ranked words within each sentiment class.
# top_n(10) is given no ranking column, so it ranks by the last column (`n`),
# which is what the "Selecting by n" message reports.
bing_word_counts %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  coord_flip() +
  labs(x = NULL, y = "Contribution to sentiment") +
  theme_bw()
## Selecting by n
