The purpose of this report is as follows:
Demonstrate that I have downloaded the data and have successfully loaded it in.
# Read data set
# Download the Coursera-SwiftKey archive into ../original_data_set (created on
# demand) and extract it there.
data_dir <- "../original_data_set"
zip_path <- file.path(data_dir, "Coursera-SwiftKey.zip")
if (!dir.exists(data_dir)) {
  dir.create(data_dir, recursive = TRUE)
}
data_set_url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
# Skip the download when the archive is already present so that re-running the
# report does not fetch it again.
if (!file.exists(zip_path)) {
  # mode = "wb": the archive is binary; a text-mode transfer corrupts it on
  # Windows.
  download.file(data_set_url, destfile = zip_path, mode = "wb")
}
# Unzip dataSet
unzip(zipfile = zip_path, exdir = data_dir)
library(stringi)
library(tm)
library(dplyr)
library(SnowballC)
library(RWeka)
library(ggplot2)
# Summary statistics about the data sets:
# word counts and line counts for each source file, collected in a basic
# data table with one row per source.
deta_file_list <- c("../original_data_set/final/en_US/en_US.blogs.txt",
                    "../original_data_set/final/en_US/en_US.news.txt",
                    "../original_data_set/final/en_US/en_US.twitter.txt")
# One name per entry of deta_file_list, in the same order; used both for the
# elements of raw_data and for the rows of the table so they cannot drift.
source_names <- c("blogs", "news", "twitter")
raw_data <- setNames(vector("list", length(deta_file_list)), source_names)
basic_data_table <- matrix(0, nrow = length(deta_file_list), ncol = 2,
                           dimnames = list(source_names,
                                           c("Word counts", "line counts")))
library(stringi)  # provides stri_count_words()
for (i in seq_along(deta_file_list)) {
  # NOTE(review): the connection is opened in binary mode ("rb"), presumably
  # to avoid text-mode translation problems in the raw files -- confirm.
  con <- file(deta_file_list[i], "rb")
  # Close the connection even when readLines() fails, so it is never leaked.
  raw_data[[i]] <- tryCatch(
    readLines(con, encoding = "UTF-8", skipNul = TRUE),
    finally = close(con)
  )
  basic_data_table[i, 1] <- sum(stri_count_words(raw_data[[i]]))
  basic_data_table[i, 2] <- length(raw_data[[i]])
}
# Print Basic Data Table
basic_data_table
##         Word counts line counts
## blogs      37546246      899288
## news       34762395     1010242
## twitter    30093410     2360148
# Build a cleaned corpus from a reproducible 1% sample of each source.
set.seed(123)
# Sample 1% of lines from each source file. floor() makes explicit the
# truncation that sample() applies to a fractional size.
sample_blog <- sample(raw_data$blogs, floor(0.01 * length(raw_data$blogs)))
sample_news <- sample(raw_data$news, floor(0.01 * length(raw_data$news)))
sample_twitter <- sample(raw_data$twitter,
                         floor(0.01 * length(raw_data$twitter)))
sample <- c(sample_blog, sample_news, sample_twitter)
# Convert to ASCII. Without `sub`, iconv() returns NA for every line that
# contains a character it cannot convert, and those NAs would end up in the
# corpus as text. sub = "" drops only the offending bytes and keeps the line.
sample <- iconv(sample, "UTF-8", "ASCII", sub = "")
# NOTE(review): wrapping the vector in a data frame appears to make
# VectorSource() treat the whole sample as a single element. The tokenizer
# calls later in this report receive `corpus` directly and may rely on that
# shape, so it is left unchanged here. Confirm before switching to
# VCorpus(VectorSource(sample)) with content_transformer(tolower).
corpus <- Corpus(VectorSource(as.data.frame(sample, stringsAsFactors = FALSE)))
# Normalisation steps: lower-case, re-wrap as plain text documents, strip
# punctuation and digits, stem, and collapse repeated whitespace.
corpus <- corpus %>%
  tm_map(tolower) %>%
  tm_map(PlainTextDocument) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(stemDocument) %>%
  tm_map(stripWhitespace)
# Unigram frequencies: tokenize the cleaned corpus into single words, count
# them, and plot the 20 most frequent.
# NGramTokenizer() works on character strings, so hand it the document text
# rather than the corpus object itself (which would be coerced to strings
# together with its metadata).
corpus_text <- unlist(lapply(corpus, as.character), use.names = FALSE)
tokenize_word_freq <- NGramTokenizer(corpus_text,
                                     Weka_control(min = 1, max = 1))
# table() names the first column after the tokenized vector
# (`tokenize_word_freq`); the counts are in `Freq`.
dist_word_freq <- data.frame(table(tokenize_word_freq))
dist_word_freq <- dist_word_freq[order(dist_word_freq$Freq,
                                       decreasing = TRUE), ]
# head() avoids NA rows when fewer than 20 distinct tokens exist; reorder()
# makes the bars appear by descending frequency instead of alphabetically.
ggplot(head(dist_word_freq, 20),
       aes(x = reorder(tokenize_word_freq, -Freq), y = Freq)) +
  geom_col() +
  xlab("Unigrams") + ylab("Frequency") +
  ggtitle("Distributions of word frequencies") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
What are the frequencies of 2-grams and 3-grams in the dataset?
2-grams
# 2-gram frequencies: tokenize the cleaned corpus into word pairs, count
# them, and plot the 20 most frequent.
# NGramTokenizer() works on character strings, so hand it the document text
# rather than the corpus object itself (which would be coerced to strings
# together with its metadata).
corpus_text <- unlist(lapply(corpus, as.character), use.names = FALSE)
tokenize_2grams_freq <- NGramTokenizer(corpus_text,
                                       Weka_control(min = 2, max = 2))
# table() names the first column after the tokenized vector
# (`tokenize_2grams_freq`); the counts are in `Freq`.
dist_2grams_freq <- data.frame(table(tokenize_2grams_freq))
dist_2grams_freq <- dist_2grams_freq[order(dist_2grams_freq$Freq,
                                           decreasing = TRUE), ]
# head() avoids NA rows when fewer than 20 distinct 2-grams exist; reorder()
# makes the bars appear by descending frequency instead of alphabetically.
ggplot(head(dist_2grams_freq, 20),
       aes(x = reorder(tokenize_2grams_freq, -Freq), y = Freq)) +
  geom_col() +
  xlab("2-grams") + ylab("Frequency") +
  ggtitle("Distributions of 2-grams frequencies") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# 3-gram frequencies: tokenize the cleaned corpus into word triples, count
# them, and plot the 20 most frequent.
# NGramTokenizer() works on character strings, so hand it the document text
# rather than the corpus object itself (which would be coerced to strings
# together with its metadata).
corpus_text <- unlist(lapply(corpus, as.character), use.names = FALSE)
tokenize_3grams_freq <- NGramTokenizer(corpus_text,
                                       Weka_control(min = 3, max = 3))
# table() names the first column after the tokenized vector
# (`tokenize_3grams_freq`); the counts are in `Freq`.
dist_3grams_freq <- data.frame(table(tokenize_3grams_freq))
dist_3grams_freq <- dist_3grams_freq[order(dist_3grams_freq$Freq,
                                           decreasing = TRUE), ]
# head() avoids NA rows when fewer than 20 distinct 3-grams exist; reorder()
# makes the bars appear by descending frequency instead of alphabetically.
ggplot(head(dist_3grams_freq, 20),
       aes(x = reorder(tokenize_3grams_freq, -Freq), y = Freq)) +
  geom_col() +
  xlab("3-grams") + ylab("Frequency") +
  ggtitle("Distributions of 3-grams frequencies") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
I’m planning to develop a Shiny application that predicts the next word or sentence from the user's text input, using n-gram model data.