TAMU ITE NLP Workshop

TAMU ITE NLP Workshop – 02/21/2019

Read Data
Basic Text Mining
Sentiment Analysis

Read Data

library(data.table)
library(tidytext)
library(dplyr)
library(ggplot2)
library(tidyr)


# Read the workshop CSV directly from GitHub (needs network access).
mydat <- fread(
  input = "https://raw.githubusercontent.com/subasish/test1/master/ADis2.csv"
)

# List the column names so later chunks can refer to them.
names(mydat)

## [1] "Business_Name" "Date"          "Year"          "Location_City"
## [5] "Vehicle_Was"   "Pre_Crash"     "Damage_Detail"

# Number of records per value of Year.
table(mydat[["Year"]])

## 
## 2016 2017 2018 
##   15   26   74

# Number of records per company name.
# NOTE(review): names are tabulated exactly as typed, so near-duplicate
# spellings in the raw data are counted as separate companies.
table(mydat[["Business_Name"]])

## 
##                 Apple inc     Aurora Innovation Inc 
##                         2                         1 
##         Autora Innovation                    Cruise 
##                         1                        57 
##    Cruise Automation Inc.              Drive.ai Inc 
##                         1                         1 
##                    Google           Google Auto LLC 
##                         3                        12 
##                Jingchi.ai Nissan North America Inc. 
##                         1                         1 
## Toyota Research Institute                  UATC LLC 
##                         1                         1 
##            Waymo Auto LLC                 Waymo LLC 
##                         1                        25 
##                      Zoox 
##                         7

# Number of records per city.
# NOTE(review): values are tabulated exactly as typed, so blank entries and
# misspelled city names in the raw data each get their own cell.
table(mydat[["Location_City"]])

## 
##                                Crockett           Los Altos 
##                   3                   1                   2 
##       Mountain Veiw       Mountain View           Palo Alto 
##                   1                  23                  15 
##       San Francisco            San Jose South San Francisco 
##                  63                   1                   1 
##           Sunnyvale           Sunnywale 
##                   3                   2

# Number of records per value of Vehicle_Was.
table(mydat[["Vehicle_Was"]])

## 
##                                 Minor             Moving 
##                  2                  1                 70 
## Stopped in Traffic 
##                 42

# Number of records per value of Pre_Crash.
table(mydat[["Pre_Crash"]])

## 
##   Autonomous Conventional 
##           73           42

# Build one two-column table per grouping variable, each pairing that
# variable with the free-text Damage_Detail column.
# Columns are selected by name instead of by position (1, 3, 5, 6, 7) so a
# reordered or extended CSV cannot silently swap in the wrong column.

# Records whose Business_Name is exactly "Cruise" or "Waymo LLC".
bness <- subset(
  mydat,
  Business_Name %in% c("Cruise", "Waymo LLC"),
  select = c(Business_Name, Damage_Detail)
)

# Records whose Vehicle_Was is "Moving" or "Stopped in Traffic"; any other
# value (including blanks) is dropped.
was <- subset(
  mydat,
  Vehicle_Was %in% c("Moving", "Stopped in Traffic"),
  select = c(Vehicle_Was, Damage_Detail)
)

# All records, keyed by Year.
year <- subset(mydat, select = c(Year, Damage_Detail))

# All records, keyed by Pre_Crash.
pre <- subset(mydat, select = c(Pre_Crash, Damage_Detail))

Basic Text Mining

### Mode
# Build one document per Pre_Crash value by pasting together every
# Damage_Detail narrative that shares that value.
# tapply() returns its groups in sorted order, while unique() returns values
# in order of first appearance; pairing the two can attach a label to the
# wrong text. The labels are therefore read from the tapply() result itself.
damage_by_mode <- tapply(
  pre$Damage_Detail,
  pre$Pre_Crash,
  paste,
  collapse = " "
)

biss01 <- data.frame(
  Pre_Crash = names(damage_by_mode),
  # as.vector() drops the 1-d array shape so the column is a plain character
  # vector.
  Damage = as.vector(damage_by_mode),
  stringsAsFactors = FALSE
)


# Split each collapsed narrative into single-word tokens: one row per
# (Pre_Crash, word) occurrence.
biss02 <- biss01 %>%
  unnest_tokens(word, Damage)

# Load tidytext's stop-word table.
data(stop_words)

# Drop stop words. Naming the key explicitly keeps the join from ever
# matching on another shared column and avoids the "Joining, by" message.
biss03 <- biss02 %>%
  anti_join(stop_words, by = "word")

# Frequency of the remaining words, most common first.
biss03 %>%
  count(word, sort = TRUE)

## # A tibble: 1,054 x 2
##    word           n
##    <chr>      <int>
##  1 av           341
##  2 vehicle      274
##  3 cruise       266
##  4 autonomous   177
##  5 mode         117
##  6 rear         113
##  7 street       113
##  8 waymo        108
##  9 left         100
## 10 lane          97
## # ... with 1,044 more rows

# Horizontal bar chart of every word that occurs more than 50 times,
# with bars ordered by count.
biss03 %>%
  count(word, sort = TRUE) %>%
  filter(n > 50) %>%
  # reorder() turns word into a factor sorted by n so the bars plot in
  # count order rather than alphabetically.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  theme_bw() +
  xlab(NULL) +
  coord_flip()

# Tokenise the per-mode narratives into overlapping two-word sequences.
bigram01 <- unnest_tokens(
  biss01,
  bigram,
  Damage,
  token = "ngrams",
  n = 2
)

# Raw bigram frequencies; stop words are still present at this stage.
count(bigram01, bigram, sort = TRUE)

## # A tibble: 4,069 x 2
##    bigram                 n
##    <chr>              <int>
##  1 cruise av            176
##  2 the cruise           152
##  3 of the               143
##  4 autonomous vehicle    96
##  5 at the                95
##  6 autonomous mode       77
##  7 in autonomous         74
##  8 no injuries           74
##  9 av was                71
## 10 waymo av              71
## # ... with 4,059 more rows

# Split each bigram into its two words so each can be checked on its own.
bigrams_separated <- separate(
  bigram01,
  bigram,
  c("word1", "word2"),
  sep = " "
)

# Keep a bigram only when neither of its words is a stop word.
bigrams_filtered <- bigrams_separated %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word
  )

# Frequency of each remaining word pair, most common first.
bigram_counts <- count(bigrams_filtered, word1, word2, sort = TRUE)

bigram_counts

## # A tibble: 1,185 x 3
##    word1      word2          n
##    <chr>      <chr>      <int>
##  1 cruise     av           176
##  2 autonomous vehicle       96
##  3 autonomous mode          77
##  4 waymo      av            71
##  5 google     av            63
##  6 vehicle    cruise        52
##  7 cruise     autonomous    50
##  8 minor      damage        41
##  9 rear       bumper        39
## 10 av         operating     31
## # ... with 1,175 more rows

# Glue the filtered word pairs back into single "word1 word2" strings.
bigrams_united <- unite(bigrams_filtered, bigram, word1, word2, sep = " ")

# Score each bigram by tf-idf, treating each Pre_Crash value as one document,
# and list the highest scores first.
bigram_tf_idf <- bigrams_united %>%
  count(Pre_Crash, bigram) %>%
  bind_tf_idf(term = bigram, document = Pre_Crash, n = n) %>%
  arrange(desc(tf_idf))

bigram_tf_idf

## # A tibble: 1,346 x 6
##    Pre_Crash    bigram                  n      tf   idf  tf_idf
##    <fct>        <chr>               <int>   <dbl> <dbl>   <dbl>
##  1 Autonomous   manual mode            19 0.0179  0.693 0.0124 
##  2 Autonomous   aurora test             7 0.00660 0.693 0.00457
##  3 Autonomous   mason st                5 0.00471 0.693 0.00327
##  4 Autonomous   rear passenger          5 0.00471 0.693 0.00327
##  5 Autonomous   subject vehicle         5 0.00471 0.693 0.00327
##  6 Autonomous   traveling eastbound     5 0.00471 0.693 0.00327
##  7 Conventional san antonio             8 0.00455 0.693 0.00316
##  8 Conventional antonio road            7 0.00398 0.693 0.00276
##  9 Conventional cmise av                7 0.00398 0.693 0.00276
## 10 Autonomous   2 lane                  4 0.00377 0.693 0.00261
## # ... with 1,336 more rows

# Faceted bar chart of the highest tf-idf bigrams for each Pre_Crash value.
bigram_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  # Fix the factor levels in descending tf-idf order so bars plot by score.
  mutate(word = factor(bigram, levels = rev(unique(bigram)))) %>%
  group_by(Pre_Crash) %>%
  # Rank explicitly by tf_idf. Without `wt`, top_n() ranks by the last column,
  # which after the mutate() above is the new `word` factor, not the score.
  # Ties at the cut-off are kept, so a facet may show more than 15 bars.
  top_n(15, wt = tf_idf) %>%
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = Pre_Crash)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  theme_bw() +
  facet_wrap(~Pre_Crash, ncol = 2, scales = "free") +
  coord_flip()

Sentiment Analysis

# Label each non-stop word with its Bing-lexicon sentiment and count how
# often each labelled word occurs. Words absent from the lexicon are dropped
# by the inner join. The join key is left implicit, which is why the
# "Joining, by" message is printed.
# NOTE(review): general-purpose lexicons can mislabel domain vocabulary;
# check the top words before interpreting the totals.
bing_word_counts <- biss03 %>%
  inner_join(get_sentiments(lexicon = "bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()

## Joining, by = "word"

# Show the sentiment-bearing words, most frequent first.
bing_word_counts

## # A tibble: 62 x 3
##    word       sentiment     n
##    <chr>      <chr>     <int>
##  1 autonomous positive    177
##  2 damage     negative     79
##  3 struck     negative     18
##  4 damaging   negative     10
##  5 slowly     negative      6
##  6 overtake   positive      5
##  7 advanced   positive      4
##  8 fell       negative      4
##  9 lack       negative      4
## 10 stiffness  negative      4
## # ... with 52 more rows

# Side-by-side bar charts of the words contributing most to each sentiment.
bing_word_counts %>%
  group_by(sentiment) %>%
  # No `wt` given: top_n() ranks by the last column, which here is n.
  top_n(10) %>%
  ungroup() %>%
  # Order the bars by count instead of alphabetically.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  theme_bw() +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(
    x = NULL,
    y = "Contribution to sentiment"
  ) +
  coord_flip()

## Selecting by n