list.of.packages <- c("tm", "wordcloud", "RColorBrewer", "SnowballC", "corpus", "ggplot2", "tidytext", "tidyr","dplyr")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
lapply(list.of.packages, require, character.only=TRUE)
library(tm)
library(SnowballC)
library(wordcloud)
library(ggplot2)
library(tidyr)
library(dplyr)
library(corpus)
# Three files in the corpus:
twitterFile <- "final/en_US/en_US.twitter.txt"
newsFile <- "final/en_US/en_US.news.txt"
blogsFile <- "final/en_US/en_US.blogs.txt"
# First read in the three files, and take a 5% random sample for practice:
# File 1 = twitter
con <- file(twitterFile, "r")
twitterFull <- readLines(con, skipNul = TRUE) # the Twitter file contains embedded nulls
close(con)
twitterLines <- length(twitterFull)
twitterSample <- twitterFull[sample(1:twitterLines, twitterLines * 0.05, replace=FALSE)]
# File 2 = news
con <- file(newsFile, "r")
newsFull <- readLines(con)
close(con)
newsLines <- length(newsFull)
newsSample <- newsFull[sample(1:newsLines, newsLines * 0.05, replace=FALSE)]
# File 3 = blogs
con <- file(blogsFile, "r")
blogsFull <- readLines(con)
close(con)
blogsLines <- length(blogsFull)
blogsSample <- blogsFull[sample(1:blogsLines, blogsLines * 0.05, replace=FALSE)]
# Get rid of these files, because they are huge:
rm(twitterFull,newsFull, blogsFull)
This is the basic code for cleaning the corpora:
twitterCorp <- Corpus(VectorSource(c(twitterSample)))
newsCorp <- Corpus(VectorSource(c(newsSample)))
blogsCorp <- Corpus(VectorSource(c(blogsSample)))
allCorp <- Corpus(VectorSource(c(twitterSample, newsSample, blogsSample)))
# Make lowercase; remove punctuation/numbers/stopwords; stem.
twitterCorp <- twitterCorp %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  # remove curly quotes, which removePunctuation leaves behind
  tm_map(content_transformer(function(x) gsub("[“”’]", "", x))) %>%
  tm_map(removeWords, stopwords("en")) %>%
  tm_map(stemDocument)
[The same cleaning steps are applied to newsCorp, blogsCorp and allCorp; a sketch is shown below.]
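For completeness, here is a minimal sketch of how those steps could be wrapped in a helper and applied to the remaining corpora. The clean_corpus() name is illustrative, not part of the code above.
clean_corpus <- function(corp) {
  corp %>%
    tm_map(content_transformer(tolower)) %>%
    tm_map(removePunctuation) %>%
    tm_map(removeNumbers) %>%
    tm_map(content_transformer(function(x) gsub("[“”’]", "", x))) %>%
    tm_map(removeWords, stopwords("en")) %>%
    tm_map(stemDocument)
}
newsCorp  <- clean_corpus(newsCorp)
blogsCorp <- clean_corpus(blogsCorp)
allCorp   <- clean_corpus(allCorp)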
twitterStats <- term_stats(twitterCorp)
newsStats <- term_stats(newsCorp)
blogsStats <- term_stats(blogsCorp)
statsTable <- cbind(twitterStats[1:30,1:2],newsStats[1:30,1:2], blogsStats[1:30,1:2])
colnames(statsTable) <- c("Twitter top words", "Count", "News top words", "Count", "Blogs top words", "Count")
statsTable
## Twitter top words Count News top words Count Blogs top words Count
## 1 im 8004 said 12540 one 6721
## 2 just 7573 year 5448 like 5502
## 3 get 7429 will 5581 will 5888
## 4 thank 6491 one 4250 time 5318
## 5 like 6577 new 3551 just 5020
## 6 go 6329 time 3247 can 5075
## 7 love 6087 also 3004 get 4653
## 8 day 5503 state 3368 go 4150
## 9 good 5034 say 3124 make 4002
## 10 will 4737 two 2917 day 3560
## 11 rt 4563 get 3003 know 3299
## 12 dont 4500 can 2942 now 2968
## 13 can 4453 like 2859 use 3241
## 14 time 4258 last 2720 im 3335
## 15 one 4285 make 2719 year 3223
## 16 know 4217 just 2699 thing 3039
## 17 now 4085 go 2911 love 3196
## 18 see 3897 first 2648 work 3044
## 19 great 3836 work 2449 even 2814
## 20 today 3735 peopl 2456 also 2778
## 21 make 3721 day 2234 see 2865
## 22 follow 3974 game 2460 think 2860
## 23 new 3421 us 2222 look 2846
## 24 think 3344 citi 2356 want 2930
## 25 come 3298 includ 2002 peopl 3049
## 26 lol 3349 play 2176 way 2748
## 27 look 3336 school 2383 dont 2753
## 28 u 3838 use 1915 new 2816
## 29 need 3258 take 1804 first 2618
## 30 work 3211 — 2482 well 2535
# Plot the 20 most frequent terms in the combined corpus
allStats <- term_stats(allCorp)
top20 <- allStats[1:20, ]
top20 <- mutate(top20, term = reorder(term, -count))
g <- ggplot(top20, aes(term, count)) + geom_col()
g
# Most frequent bigrams
term_stats(allCorp, ngrams = 2)
## term count support
## 1 right now 1295 1278
## 2 last year 1148 1119
## 3 look like 1082 1055
## 4 dont know 967 935
## 5 new york 1000 920
## 6 cant wait 903 895
## 7 feel like 884 861
## 8 year ago 846 830
## 9 look forward 787 783
## 10 last night 772 763
## 11 last week 718 708
## 12 im go 708 674
## 13 high school 713 661
## 14 thank follow 634 633
## 15 make sure 634 618
## 16 first time 595 589
## 17 im sure 552 538
## 18 can get 538 536
## 19 dont think 523 513
## 20 dont want 516 494
## ⋮ (1619143 rows total)
# Most frequent trigrams
term_stats(allCorp, ngrams = 3)
## term count support
## 1 happi mother day 181 181
## 2 cant wait see 168 168
## 3 let us know 164 164
## 4 new york citi 141 136
## 5 happi new year 103 102
## 6 presid barack obama 99 99
## 7 two year ago 81 80
## 8 im pretti sure 79 79
## 9 look forward see 79 79
## 10 feel like im 68 68
## 11 new york time 62 62
## 12 gov chris christi 61 61
## 13 dont even know 58 58
## 14 world war ii 56 53
## 15 st loui counti 53 52
## 16 st patrick day 55 51
## 17 im look forward 50 50
## 18 cinco de mayo 49 48
## 19 cant wait till 47 47
## 20 cant wait get 46 46
## ⋮ (2321360 rows total)
The n-gram model will be based on a Markov strategy.
Simplest model: a Markov matrix of word-to-word (bigram) frequencies.
The matrix records how frequently each word appears after a given word. The prediction model then suggests the next word using only the previous word. This can be applied repeatedly, with the suggestion updating every time the user types a word. Note: unlike in the exploratory work, the matrix will not use stemming for the word list, since stemmed suggestions would be grammatically nonsensical. A sketch of this idea follows below.
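As a rough illustration only, the following sketch builds a bigram frequency table from the unstemmed samples and looks up the most frequent followers of the previous word. The sampleText, bigramFreq and predictNext names are hypothetical, not part of the planned implementation.
library(tidytext)
library(tidyr)
library(dplyr)
# Combine the unstemmed samples into one data frame
sampleText <- data.frame(text = c(twitterSample, newsSample, blogsSample),
                         stringsAsFactors = FALSE)
# Bigram frequency table: how often word2 follows word1
bigramFreq <- sampleText %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram)) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  count(word1, word2, sort = TRUE)
# Suggest the n most frequent words seen after the previous word
predictNext <- function(prevWord, n = 3) {
  bigramFreq %>%
    filter(word1 == tolower(prevWord)) %>%
    head(n) %>%
    pull(word2)
}
predictNext("right") # most frequent words observed after "right"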
More involved model: add frequency tables that look two or three words back, falling back to the single-word lookup when the longer context is unseen, to suggest the next word more precisely; see the sketch below.
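Building on the sketch above, one possible shape for the longer look-back, with a simple fallback to the bigram lookup; trigramFreq and predictNext2 are again hypothetical names.
# Trigram frequency table: how often word3 follows the pair (word1, word2)
trigramFreq <- sampleText %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  filter(!is.na(trigram)) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  count(word1, word2, word3, sort = TRUE)
# Try the two-word context first; fall back to the one-word lookup if it is unseen
predictNext2 <- function(prev2, prev1, n = 3) {
  hits <- trigramFreq %>%
    filter(word1 == tolower(prev2), word2 == tolower(prev1)) %>%
    head(n) %>%
    pull(word3)
  if (length(hits) > 0) hits else predictNext(prev1, n)
}
predictNext2("new", "york") # most frequent continuations of "new york"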