library(wordcloud)
library(RWeka)
library(tm)
library(ggplot2)
library(dplyr)
library(SnowballC)
This is a report for the Coursera Data Science Capstone project, which involves next-word prediction. The prediction method is the n-gram model, which predicts the next word from the preceding n - 1 words.
This report uses three text data sets: blogs, news, and Twitter. From these sets, it counts how often each sequence of one to three words occurs (1-gram to 3-gram models).
To explore the results, I'll use two visualizations: bar plots and word clouds.
Unfortunately, reading the lines and sampling them in R is too slow on my laptop, so I used the Python GraphLab package for those steps.
Sorry for the inconvenience :(
IPython Notebook LINK
# Load the per-source summary statistics from summary.csv (presumably produced
# by the external Python step mentioned above; confirm), give the columns
# human-readable headings, and render the result as a table.
# NOTE: the variable name `summary` shadows base::summary as a data object.
summary <- read.csv("summary.csv")
names(summary) <- c(
  "type",
  "# of characters in the longest line",
  "Avg words per line",
  "Total line count",
  "Total word count"
)
knitr::kable(summary)
| type | # of characters in the longest line | Avg words per line | Total line count | Total word count |
|---|---|---|---|---|
| blog | 40836 | 232.69601 | 899288 | 209260726 |
| news | 5761 | 204.00242 | 77259 | 15761023 |
| twit | 214 | 69.80291 | 2360148 | 164745190 |
# Read the pre-sampled lines for each source. The files have no header row,
# so read.csv() names the single text column V1; strings are kept as
# character vectors rather than factors.
sample_blog <- read.csv("sample_blog.csv", header = FALSE,
                        stringsAsFactors = FALSE)
sample_news <- read.csv("sample_news.csv", header = FALSE,
                        stringsAsFactors = FALSE)
sample_twitter <- read.csv("sample_twitter.csv", header = FALSE,
                           stringsAsFactors = FALSE)

# Pool the three samples into one vector (blog, then news, then twitter).
# NOTE: the name `sample` shadows base::sample as a data object; it is kept
# because later code may refer to it.
sample <- c(sample_blog$V1, sample_news$V1, sample_twitter$V1)

# writeLines() does not create missing directories, so make sure the output
# directory exists before writing the combined sample into it.
dir.create("sample", showWarnings = FALSE)
writeLines(sample, "sample/sample.txt")
# Build the corpus from every file in the sample/ directory and normalise
# the text before tokenisation.
#
# VCorpus() is used explicitly instead of Corpus(): in tm >= 0.7, Corpus() on
# a DirSource returns a SimpleCorpus, for which DocumentTermMatrix() ignores
# custom `tokenize` functions, which would silently turn the bigram and
# trigram matrices built later into unigram matrices.
docs <- VCorpus(DirSource("sample"))

# Lower-case everything so that counts are case-insensitive.
docs <- tm_map(docs, content_transformer(tolower))

# Replace "/", "@" and "|" with a space so the words they join are not fused
# together when punctuation is removed.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
docs <- tm_map(docs, toSpace, "/|@|\\|")

docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, stripWhitespace)

# NOTE(review): stop words are removed after punctuation has been stripped.
# Stop words containing an apostrophe presumably no longer match at this
# point (the reported n-grams contain "cant" and "im"). Confirm whether this
# is intended before reordering, since moving this step changes all counts.
docs <- tm_map(docs, removeWords, stopwords("english"))

# Reduce words to their stems (e.g. "happy" is reported as "happi").
docs <- tm_map(docs, stemDocument)
# Tokenizers and document-term matrices for 1-, 2- and 3-grams.

# Returns a tokenizer function that calls RWeka's NGramTokenizer with both
# the minimum and maximum n-gram size set to `n`.
make_ngram_tokenizer <- function(n) {
  force(n)  # fix the value of n now rather than at first tokenizer call
  function(x) NGramTokenizer(x, Weka_control(min = n, max = n))
}

unitoken <- make_ngram_tokenizer(1)
bitoken <- make_ngram_tokenizer(2)
tritoken <- make_ngram_tokenizer(3)

# One document-term matrix per n-gram size, built from the cleaned corpus.
unimat <- DocumentTermMatrix(docs, control = list(tokenize = unitoken))
bimat <- DocumentTermMatrix(docs, control = list(tokenize = bitoken))
trimat <- DocumentTermMatrix(docs, control = list(tokenize = tritoken))
# Per-term totals, ordered from most to least frequent.

# Sums each term's column of a document-term matrix and returns the named
# totals sorted in decreasing order.
sorted_term_totals <- function(dtm) {
  totals <- colSums(as.matrix(dtm))
  sort(totals, decreasing = TRUE)
}

unifreq <- sorted_term_totals(unimat)
bifreq <- sorted_term_totals(bimat)
trifreq <- sorted_term_totals(trimat)
# Turn each named frequency vector into a two-column data frame.

# Builds a data frame with the n-gram text in `word` and its count in `freq`.
# The vector's names also become the row names, which is why the n-gram
# appears twice when the frame is printed.
freq_to_frame <- function(counts) {
  data.frame(word = names(counts), freq = counts)
}

uniframe <- freq_to_frame(unifreq)
biframe <- freq_to_frame(bifreq)
triframe <- freq_to_frame(trifreq)
| | word | freq |
|---|---|---|
| just | just | 25287 |
| get | get | 24352 |
| like | like | 24174 |
| will | will | 21749 |
| one | one | 21495 |
| can | can | 19068 |
| | word | freq |
|---|---|---|
| look like | look like | 1767 |
| cant wait | cant wait | 1755 |
| feel like | feel like | 1626 |
| right now | right now | 1432 |
| look forward | look forward | 1428 |
| last night | last night | 1318 |
| | word | freq |
|---|---|---|
| cant wait see | cant wait see | 371 |
| happi mother day | happi mother day | 255 |
| let us know | let us know | 249 |
| happi new year | happi new year | 155 |
| im pretti sure | im pretti sure | 151 |
| look forward see | look forward see | 150 |
# Bar chart of the unigrams that occur more than 10000 times.
# X-axis labels are rotated 90 degrees so the terms do not overlap.
unifreq_plot <- ggplot(
  data = dplyr::filter(uniframe, freq > 10000),
  mapping = aes(x = word, y = freq)
) +
  geom_bar(stat = "identity") +
  labs(
    title = "Unigrams with frequencies > 10000",
    x = "Unigrams",
    y = "Frequency"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Bar chart of the bigrams that occur more than 1000 times.
# X-axis labels are rotated 90 degrees so the terms do not overlap.
bifreq_plot <- ggplot(
  data = dplyr::filter(biframe, freq > 1000),
  mapping = aes(x = word, y = freq)
) +
  geom_bar(stat = "identity") +
  labs(
    title = "Bigrams with frequencies > 1000",
    x = "Bigrams",
    y = "Frequency"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Bar chart of the trigrams that occur more than 100 times.
# X-axis labels are rotated 90 degrees so the terms do not overlap.
trifreq_plot <- ggplot(
  data = dplyr::filter(triframe, freq > 100),
  mapping = aes(x = word, y = freq)
) +
  geom_bar(stat = "identity") +
  labs(
    title = "Trigrams with frequencies > 100",
    x = "Trigrams",
    y = "Frequency"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))