This report documents some initial exploratory data analysis of the data sets that will be used to create a prediction algorithm for the Data Science Capstone. The motivation for this project is to demonstrate familiarity with the data through summary statistics and exploratory plots, and to outline plans for the prediction algorithm and the Shiny app that will eventually be built.
#load packages used throughout this report
library(dplyr); library(tidyr); library(stringr)
library(tidytext); library(ggplot2)
library(wordcloud); library(RColorBrewer)
library(igraph); library(ggraph)
#read in data
blogs <- readLines("../final/en_US/en_US.blogs.txt")
twitter <- suppressWarnings(readLines("../final/en_US/en_US.twitter.txt"))
news <- suppressWarnings(readLines("../final/en_US/en_US.news.txt"))
summary <- data.frame(length(blogs), length(twitter), length(news))
summary <- rbind(summary, c(format(object.size(blogs), units = "Mb"),
format(object.size(twitter), units = "Mb"),
format(object.size(news), units = "Mb")))
colnames(summary) <- c("Blogs", "Twitter", "News")
rownames(summary) <- c("Lines", "Memory")
library(knitr); library(kableExtra) #kable() comes from knitr; kableExtra adds the styling
summary %>% kable() %>% kable_styling(full_width = FALSE, position = "left")
| | Blogs | Twitter | News |
|---|---|---|---|
| Lines | 899288 | 2360148 | 1010242 |
| Memory | 248.5 Mb | 301.4 Mb | 249.6 Mb |
The table above summarises the number of lines in each data set and the memory each occupies once loaded into R.
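For completeness, a rough word count per source could be added to this summary. The snippet below is a minimal sketch of my own (not part of the original table) that simply tallies whitespace-separated tokens:
#approximate word count per source (sketch): count whitespace-separated tokens per line and sum
sapply(list(Blogs = blogs, Twitter = twitter, News = news),
       function(x) sum(stringr::str_count(x, "\\S+")))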
set.seed(123456)
#draw 25,000 random line indices from each source (runif may occasionally repeat an index)
random_twitter <- ceiling(runif(n=25000, min=0, max=length(twitter)))
random_news <- ceiling(runif(n=25000, min=0, max=length(news)))
random_blogs <- ceiling(runif(n=25000, min=0, max=length(blogs)))
#subset
twitter <- twitter[random_twitter]
news <- news[random_news]
blogs <- blogs[random_blogs]
#remove all non-ASCII characters
blogs <- iconv(blogs, "latin1", "ASCII", sub="")
news <- iconv(news, "latin1", "ASCII", sub="")
twitter <- iconv(twitter, "latin1", "ASCII", sub="")
#convert to tidy format
twitter_df <- data_frame(source = "twitter", text = twitter)
blogs_df <- data_frame(source = "blogs", text = blogs)
news_df <- data_frame(source = "news", text = news)
samples_df <- bind_rows(list(twitter_df, blogs_df, news_df)) %>%
group_by(source) %>%
mutate(line = row_number()) %>%
ungroup()
tidy_samples <- samples_df %>%
unnest_tokens(word, text)
tidy_samples
## # A tibble: 2,211,714 x 3
## source line word
## <chr> <int> <chr>
## 1 twitter 1 why
## 2 twitter 1 do
## 3 twitter 1 native
## 4 twitter 1 apps
## 5 twitter 1 get
## 6 twitter 1 more
## 7 twitter 1 usage
## 8 twitter 1 than
## 9 twitter 1 mobile
## 10 twitter 1 web
## # ... with 2,211,704 more rows
25,000 lines were sampled from each data set in order to perform the exploratory data analysis. All non-ASCII characters were then removed. The sampled data sets were converted to tidy format before being combined into a single data frame, samples_df. Each line of text was then separated into tokens (individual words) for further analysis.
The next step was to remove profanity, stop words (such as “the”), numbers and the retweet token (“rt”, specific to the Twitter data). Below is an example of the data frame after these filtering steps.
# list of bad words downloaded and used to filter
url <- "http://www.cs.cmu.edu/~biglou/resources/bad-words.txt"
download.file(url, destfile = "../Resources/bad-words.txt")
badWordList <- readLines("../Resources/bad-words.txt")
data("stop_words")
tidy_samples <- tidy_samples %>%
anti_join(stop_words, by = "word") %>% #remove stop words
anti_join(data_frame(word = badWordList), by = "word") %>% #remove bad words
filter(!str_detect(word, "^\\d")) %>% #remove tokens that start with a digit
filter(!str_detect(word, "^rt$")) #remove the retweet token; anchored so words merely containing "rt" are kept
tidy_samples
## # A tibble: 849,819 x 3
## source line word
## <chr> <int> <chr>
## 1 twitter 1 native
## 2 twitter 1 apps
## 3 twitter 1 usage
## 4 twitter 1 mobile
## 5 twitter 1 web
## 6 twitter 2 pettitte
## 7 twitter 2 berkman
## 8 twitter 2 jeter
## 9 twitter 2 clutch
## 10 twitter 2 hitters
## # ... with 849,809 more rows
tidy_samples %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100, colors=brewer.pal(6,"Dark2")))
tidy_samples_count <- tidy_samples %>%
group_by(source) %>%
count(word, sort = TRUE)
tidy_samples_count %>%
group_by(source) %>%
top_n(20) %>%
ungroup %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = source)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "Word Count") +
facet_wrap(~source, ncol = 3, scales = "free") +
coord_flip() +
ggtitle("Twenty most frequent words in each source of text",
subtitle = "Sorted in order of overall frequency in the sampled dataset") +
theme_minimal()
The word “love” is frequent in the blogs and Twitter data but absent from the top-20 list for the news data, while “lol” appears only in the Twitter top 20.
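As a quick check of these observations, the per-source counts created above can be filtered for those two words; a small sketch using the existing tidy_samples_count object:
#sketch: do "love" and "lol" reach the top 20 for each source?
tidy_samples_count %>%
  group_by(source) %>%
  top_n(20, n) %>%
  filter(word %in% c("love", "lol")) %>%
  arrange(source, desc(n))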
count_bigrams <- function(dataset) { #function to count bigrams
dataset %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(!word1 %in% stop_words$word,
!word2 %in% stop_words$word,
!word1 %in% badWordList,
!word2 %in% badWordList,
!str_detect(word1, "^\\d"),
!str_detect(word2, "^\\d")) %>%
count(word1, word2, sort = TRUE)
}
visualize_bigrams <- function(bigrams) { #function to visualise bigrams
set.seed(2016)
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
bigrams %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, arrow = a) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
theme_void()
}
#apply count_bigrams to our samples_df
sample_bigrams <- samples_df %>%
count_bigrams()
# filter out rare combinations and rt/amp and visualise
sample_bigrams %>%
filter(n > 30,
!str_detect(word1, "^(rt|amp)$"), #drop the retweet token and "amp" (from "&amp;"); anchored so words containing these letters are kept
!str_detect(word2, "^(rt|amp)$")) %>%
visualize_bigrams()
The plot above shows common bigrams encountered in the sampled data set. For example, in the centre, “parking” is often followed by “lot” and “head” is often followed by “coach”. “san” can be followed by “francisco” or “diego”. Similarly, “president” can be preceded by “vice” or followed by “barack” or “obama”.
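To see the counts behind a few of these pairs, the sample_bigrams table can be filtered directly; a quick sketch (the words chosen here are just the examples mentioned above):
#sketch: counts for some of the bigram examples discussed above
sample_bigrams %>%
  filter(word1 %in% c("parking", "head", "san", "president") | word2 == "president") %>%
  arrange(desc(n))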
sample_trigrams <- samples_df %>%
unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
filter(!word1 %in% stop_words$word,
!word2 %in% stop_words$word,
!word3 %in% stop_words$word,
!word1 %in% badWordList,
!word2 %in% badWordList,
!word3 %in% badWordList,
!str_detect(word1, "^\\d"),
!str_detect(word2, "^\\d"),
!str_detect(word3, "^\\d"),
!str_detect(word1, "rt"),
!str_detect(word2, "rt"),
!str_detect(word3, "rt"),
!str_detect(word1, "amp"),
!str_detect(word2, "amp"),
!str_detect(word3, "amp")) %>%
count(word1, word2, word3, sort = TRUE)
sample_trigrams %>%
top_n(20) %>%
mutate(x = paste(word1, word2, word3, sep = " ")) %>%
mutate(x = reorder(x, n)) %>%
ggplot(aes(x, n)) + xlab("Trigram") + ylab("Frequency") +
geom_col(show.legend = FALSE, fill = "purple") + coord_flip() + theme_minimal() +
ggtitle("Twenty most frequent trigrams in the sampled data set")
The plot above shows the most frequently observed trigrams in the sampled data set.
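These counts can also be expressed as conditional probabilities, which is effectively what the prediction functions below sample from. The maximum-likelihood sketch below is my own illustration and was not computed in the original analysis:
#sketch: maximum-likelihood estimate of P(word3 | word1, word2) from the trigram counts
trigram_probs <- sample_trigrams %>%
  group_by(word1, word2) %>%
  mutate(prob = n / sum(n)) %>%
  ungroup()
trigram_probs %>% filter(word1 == "dow", word2 == "jones") #e.g. "industrial" vs "global" vs "fell"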
return_third_word <- function(woord1, woord2){
woord <- sample_trigrams %>%
filter(word1 == woord1, word2 == woord2) %>%
sample_n(1, weight = n) %>% #select a random row from the filtered data frame, weighted towards the most frequent trigram
.[["word3"]]
woord
}
return_third_word("president", "barack")
## [1] "obama"
return_third_word("jones", "industrial")
## [1] "average"
return_third_word("st", "louis") #market, district, pinnacle
## [1] "county"
return_third_word("st", "patrick's")
## [1] "day"
return_third_word("dow", "jones") #global, industrial, fell, most often industrial
## [1] "global"
return_second_word <- function(woord1){
woord <- sample_bigrams %>%
filter(word1 == woord1) %>%
sample_n(1, weight = n) %>%
.[["word2"]]
woord
}
return_second_word("happy")
## [1] "hour"
sample_bigrams %>%
filter(word1 == "happy") #174 instances where "happy" is word1 in a bigram
## # A tibble: 174 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 happy birthday 107
## 2 happy hour 36
## 3 happy mother's 25
## 4 happy mothers 24
## 5 happy friday 22
## 6 happy easter 15
## 7 happy monday 13
## 8 happy thanksgiving 12
## 9 happy bday 10
## 10 happy sunday 8
## # ... with 164 more rows
#repeated execution of return_second_word("happy") -> "birthday" and other terms that are frequent in the training set appear more often
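#a hypothetical illustration (not run in the original report): tabulating repeated draws
#shows how the sampling is weighted towards the most frequent bigrams, e.g. "birthday"
table(replicate(100, return_second_word("happy")))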
return_second_word("police")
## [1] "plan"
return_second_word("olive")
## [1] "oil"
return_second_word("hip")
## [1] "hop"
return_second_word("bottom")
## [1] "line"
return_second_word("credit")
## [1] "crisis"
return_second_word("san")
## [1] "francisco"
The n-grams observed in the training data will be used to inform the output of the return_second_word and return_third_word functions described above. A Shiny app will be developed that predicts the next word as a user inputs text.
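As a sketch of how these pieces might fit together, the trigram lookup could fall back to the bigram lookup when no matching trigram exists. The function below is a simple back-off illustration on my part; the name predict_next_word and the NA fallback are assumptions, not part of the original analysis:
#sketch: predict the next word from the last two words typed, backing off from
#trigrams to bigrams, and returning NA when neither table has a match
predict_next_word <- function(word1_in, word2_in) {
  tri <- sample_trigrams %>%
    filter(word1 == word1_in, word2 == word2_in)
  if (nrow(tri) > 0) {
    return(tri %>% sample_n(1, weight = n) %>% .[["word3"]])
  }
  bi <- sample_bigrams %>%
    filter(word1 == word2_in)
  if (nrow(bi) > 0) {
    return(bi %>% sample_n(1, weight = n) %>% .[["word2"]])
  }
  NA_character_
}
predict_next_word("president", "barack") #matched by the trigram table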
## Session Info
sessionInfo()
## R version 3.4.4 (2018-03-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.1 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_IE.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_IE.UTF-8 LC_COLLATE=en_IE.UTF-8
## [5] LC_MONETARY=en_IE.UTF-8 LC_MESSAGES=en_IE.UTF-8
## [7] LC_PAPER=en_IE.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_IE.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] bindrcpp_0.2.2 kableExtra_0.9.0 igraph_1.2.2
## [4] stringr_1.3.1 tidyr_0.8.1 ggraph_1.0.2
## [7] wordcloud_2.6 RColorBrewer_1.1-2 ggplot2_3.0.0
## [10] tidytext_0.2.0 dplyr_0.7.6
##
## loaded via a namespace (and not attached):
## [1] tidyselect_0.2.4 purrr_0.2.5 lattice_0.20-35
## [4] colorspace_1.3-2 htmltools_0.3.6 SnowballC_0.5.1
## [7] viridisLite_0.3.0 yaml_2.2.0 utf8_1.1.4
## [10] rlang_0.2.2 pillar_1.3.0 glue_1.3.0
## [13] withr_2.1.2 tweenr_1.0.0 bindr_0.1.1
## [16] plyr_1.8.4 munsell_0.5.0 gtable_0.2.0
## [19] rvest_0.3.2 evaluate_0.11 labeling_0.3
## [22] knitr_1.20 fansi_0.4.0 highr_0.7
## [25] broom_0.5.0 tokenizers_0.2.1 Rcpp_0.12.19
## [28] readr_1.1.1 scales_1.0.0 backports_1.1.2
## [31] farver_1.0 gridExtra_2.3 hms_0.4.2
## [34] ggforce_0.1.3 digest_0.6.17 stringi_1.2.4
## [37] ggrepel_0.8.0 grid_3.4.4 cli_1.0.1
## [40] tools_3.4.4 magrittr_1.5 lazyeval_0.2.1
## [43] tibble_1.4.2 janeaustenr_0.1.5 crayon_1.3.4
## [46] pkgconfig_2.0.2 MASS_7.3-49 Matrix_1.2-12
## [49] xml2_1.2.0 httr_1.3.1 rstudioapi_0.8
## [52] assertthat_0.2.0 rmarkdown_1.10.11 htmldeps_0.1.1
## [55] viridis_0.5.1 R6_2.3.0 units_0.6-1
## [58] nlme_3.1-131 compiler_3.4.4