This is an exercise in visualizing relationships between words by analyzing text in news articles about autonomous vehicles.
##
## The downloaded binary packages are in
## /var/folders/xh/lpcnmw0s0tv272x7vrdlmvw00000gn/T//RtmpSLiuIk/downloaded_packages
##
## The downloaded binary packages are in
## /var/folders/xh/lpcnmw0s0tv272x7vrdlmvw00000gn/T//RtmpSLiuIk/downloaded_packages
##
## The downloaded binary packages are in
## /var/folders/xh/lpcnmw0s0tv272x7vrdlmvw00000gn/T//RtmpSLiuIk/downloaded_packages
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
We now read in the data: a set of three full articles, one from Forbes and two from VentureBeat.
# Read the sample articles into a data frame.
# stringsAsFactors = FALSE keeps string columns as character (not factor) on
# R < 4.0, so the article text can be tokenized as plain strings; on R >= 4.0
# this is already the default.
data2 <- read.csv("AV_NewsReader_SampleArticle.csv", stringsAsFactors = FALSE)
## Warning in read.table(file = file, header = header, sep = sep,
## quote = quote, : incomplete final line found by readTableHeader on
## 'AV_NewsReader_SampleArticle.csv'
# Keep the raw article text as a standalone vector.
# NOTE(review): this reads `data2$Text`, while unnest_tokens() below is given
# the column name `text` -- confirm the CSV's actual column name and case.
text <- data2$Text

# Tokenize the articles into bigrams (n-grams with n = 2); the tokens are
# written to a new `bigram` column.
art_bigrams <- data2 %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

# Print bigram frequencies, most common first.
art_bigrams %>%
  count(bigram, sort = TRUE)
## # A tibble: 2,301 x 2
## bigram n
## <chr> <int>
## 1 autonomous vehicles 57
## 2 self driving 45
## 3 on the 33
## 4 in the 30
## 5 autonomous vehicle 27
## 6 driving cars 27
## 7 of the 24
## 8 and the 21
## 9 of driverless 21
## 10 to be 21
## # ... with 2,291 more rows
# Split each bigram into its two component words.
bigrams_separated2 <- separate(
  art_bigrams,
  bigram,
  into = c("word1", "word2"),
  sep = " "
)

# Keep only the bigrams in which neither word is a stop word.
bigrams_filtered2 <- filter(
  bigrams_separated2,
  !word1 %in% stop_words$word,
  !word2 %in% stop_words$word
)

# Count the remaining word pairs, most frequent first.
bigram_counts2 <- count(bigrams_filtered2, word1, word2, sort = TRUE)
# Visualizing a network of bigrams with ggraph
# Install igraph only when it is missing, so re-running the script does not
# download the package again (or need network access) on every run.
if (!requireNamespace("igraph", quietly = TRUE)) {
  install.packages("igraph")
}
##
## The downloaded binary packages are in
## /var/folders/xh/lpcnmw0s0tv272x7vrdlmvw00000gn/T//RtmpSLiuIk/downloaded_packages
library(igraph)
##
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
##
## crossing
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
# Build a directed graph from the word pairs that occur more than 5 times:
# word1 -> word2 become the edges and the count `n` is kept as an edge
# attribute.
bigram_graph2 <- graph_from_data_frame(filter(bigram_counts2, n > 5))

# Print a summary of the graph.
bigram_graph2
## IGRAPH 01aa887 DN-- 292 235 --
## + attr: name (v/c), n (e/n)
## + edges from 01aa887 (vertex names):
## [1] autonomous->vehicles autonomous->vehicle
## [3] driving ->cars autonomous->cars
## [5] vehicle ->technologies driverless->cars
## [7] driverless->vehicle human ->drivers
## [9] real ->estate technology->isn’t
## [11] av ->start los ->altos
## [13] safety ->standards start ->act
## [15] autonomous->taxi car ->culture
## + ... omitted several edges
# Plotting
# Install ggraph only when it is missing, so re-running the script does not
# download the package again (or need network access) on every run.
if (!requireNamespace("ggraph", quietly = TRUE)) {
  install.packages("ggraph")
}
##
## The downloaded binary packages are in
## /var/folders/xh/lpcnmw0s0tv272x7vrdlmvw00000gn/T//RtmpSLiuIk/downloaded_packages
library(ggraph)
## Loading required package: ggplot2
# Plain network plot of the bigram graph. The "fr" (Fruchterman-Reingold)
# layout starts from random positions, so the seed is fixed to make the
# figure reproducible.
set.seed(500)
ggraph(bigram_graph2, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

# Closed arrowhead used to show edge direction (word1 -> word2).
# unit() is namespaced like arrow() so neither depends on what is attached.
a <- grid::arrow(type = "closed", length = grid::unit(.15, "inches"))

# Directed graph: edge transparency follows the bigram count `n`, and edges
# stop short of the nodes (end_cap) so the arrowheads stay visible.
# The seed is set again so this plot uses the same node layout as the plot
# above and is reproducible when run on its own.
set.seed(500)
ggraph(bigram_graph2, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, "inches")) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()