The goal of this project is to create a prediction algorithm. The exploratory analysis and goals for the eventual app and algorithm are explained in this Milestone Report. Tables and plots are also used to illustrate important summaries of the data set.
The motivation for this project is to: 1. Demonstrate that the data have been successfully downloaded and loaded. 2. Create a basic report of summary statistics about the data sets. 3. Report any interesting findings that have been amassed so far. 4. Get feedback on the plans for creating a prediction algorithm and Shiny app.
# if(!file.exists("data")){
# dir.create("data")
# }
#
# fileUrl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
#
# download.file(fileUrl, destfile = "data/Coursera-SwiftKey.zip")
#
# unzip(zipfile = "data/Coursera-SwiftKey.zip", exdir = "data")
The data have been downloaded locally into the folder “data/final/en_US” and are loaded into R.
# Paths of the three en_US source files used throughout this report.
# The files are read by path with readLines() further down, so no connection
# is opened here: connections created with file(..., "r") and never closed
# stay open (a resource leak) until they are garbage-collected.
en_us_files <- c(
  blogs   = "data/final/en_US/en_US.blogs.txt",
  news    = "data/final/en_US/en_US.news.txt",
  twitter = "data/final/en_US/en_US.twitter.txt"
)
# Fail early, as opening a missing file would have, if the data are absent.
stopifnot(all(file.exists(en_us_files)))
File sizes are obtained with file.info(), line counts with length(), and the stringi package is used to count the words; the results are summarized in a basic table.
library(stringi)
library(stringr)
# Read a whole text file as UTF-8 lines through a binary-mode connection.
#
# Read by path (text mode), the news file came back with only 77259 lines and
# an "incomplete final line" warning although it is about as large as the
# blogs file. That pattern suggests the text-mode read stopped early,
# presumably at an embedded control character that is treated as end-of-file
# on some platforms. Opening the connection with "rb" reads the bytes as-is.
#
# path: path of the text file to read.
# Returns a character vector with one element per line.
read_corpus_lines <- function(path) {
  con <- file(path, open = "rb")
  # Always release the connection, even if readLines() fails.
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", warn = TRUE, skipNul = TRUE)
}

blogs <- read_corpus_lines("data/final/en_US/en_US.blogs.txt")
news <- read_corpus_lines("data/final/en_US/en_US.news.txt")
twitter <- read_corpus_lines("data/final/en_US/en_US.twitter.txt")
# Size of each source file on disk, converted from bytes to megabytes
# (1 MB = 1024^2 bytes).
blogs.size <- file.size("data/final/en_US/en_US.blogs.txt") / 1024^2
news.size <- file.size("data/final/en_US/en_US.news.txt") / 1024^2
twitter.size <- file.size("data/final/en_US/en_US.twitter.txt") / 1024^2
# Number of words in every line of each source; each result is a vector with
# one count per element of the corresponding character vector.
blogs.words <- stringi::stri_count_words(str = blogs)
news.words <- stringi::stri_count_words(str = news)
twitter.words <- stringi::stri_count_words(str = twitter)
# Summary of the data sets: one row per source giving the file size in MB,
# the number of lines read, the total number of words and the mean number of
# words per line.
# NOTE(review): in the output below, news shows far fewer lines than blogs
# despite a similar file size -- verify that the news file was read in full.
data.frame(
  source = c("blogs", "news", "twitter"),
  file.size.MB = c(blogs.size, news.size, twitter.size),
  num.lines = lengths(list(blogs, news, twitter)),
  num.words = c(sum(blogs.words), sum(news.words), sum(twitter.words)),
  mean.num.words = c(
    mean(blogs.words),
    mean(news.words),
    mean(twitter.words)
  )
)
## source file.size.MB num.lines num.words mean.num.words
## 1 blogs 200.4242 899288 37546246 41.75108
## 2 news 196.2775 77259 2674536 34.61779
## 3 twitter 159.3641 2360148 30093410 12.75065
Each file is too large for an exploratory analysis. So we draw a small sample made of the first 1000 lines from each of them.
We will combine these samples into a single character vector, data.sample, which will then be cleaned.
library(tm)
library(NLP)
# Creating a corpus from the small samples: the first 1000 lines of each
# source file.
# Nothing in this chunk draws random numbers (the samples are simply the head
# of each file); set.seed() is kept so that any later step that does use the
# random number generator stays reproducible.
set.seed(550)
sample_size <- 1000L
sampleBlogs <- readLines("data/final/en_US/en_US.blogs.txt",
                         n = sample_size, encoding = "UTF-8",
                         warn = TRUE, skipNul = TRUE)
sampleNews <- readLines("data/final/en_US/en_US.news.txt",
                        n = sample_size, encoding = "UTF-8",
                        warn = TRUE, skipNul = TRUE)
SampleTwitter <- readLines("data/final/en_US/en_US.twitter.txt",
                           n = sample_size, encoding = "UTF-8",
                           warn = TRUE, skipNul = TRUE)
data.sample <- c(sampleBlogs, sampleNews, SampleTwitter)
# One document per sampled line. The corpus is built exactly once: feeding an
# existing corpus back through VectorSource() treats the corpus object itself
# as the raw text source instead of the sampled lines.
corpus <- Corpus(VectorSource(data.sample))
# Clean text
# tolower() is a plain base-R function that returns a bare character vector.
# Wrapping it in content_transformer() makes tm_map() modify only the content
# of each document, so the corpus keeps its document structure.
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
cleanset <- tm_map(corpus, removeWords, stopwords("english"))
# Punctuation has already been stripped at this point, so what used to be an
# "http..." URL is a single alphanumeric run starting with "http"; this
# removes that whole run.
removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)
cleanset <- tm_map(cleanset, content_transformer(removeURL))
# Collapse the runs of blanks left behind by the removals above.
cleanset <- tm_map(cleanset, stripWhitespace)
# Term-document matrix of the cleaned sample (terms in rows, documents in
# columns), converted to an ordinary matrix so the rows can be summed.
dtm <- tm::TermDocumentMatrix(cleanset)
m <- as.matrix(dtm)

# Total occurrences of each term across all documents, most frequent first.
v <- sort(rowSums(m), decreasing = TRUE)

# Frequency table with one row per term; show the ten most frequent terms.
d <- data.frame(
  word = names(v),
  freq = v
)
head(d, n = 10)
## word freq
## said said 304
## one one 256
## just just 251
## like like 248
## can can 216
## time time 192
## new new 186
## get get 171
## day day 144
## know know 144
From the above data sample we calculate the most frequent words, and we will construct a barplot as well as a word cloud.
# Finding the most frequent terms.
# The sample holds only 3000 lines and the highest count in the frequency
# table above is 304, so the former threshold of 3000 could never be reached
# and the call always returned character(0). A threshold of 150 selects the
# terms at the very top of that table instead.
findFreqTerms(dtm, lowfreq = 150)
# Bar plot of the first ten rows of the frequency table, with the axis labels
# drawn perpendicular to the axis (las = 2).
top_terms <- d[1:10, ]
barplot(
  height = top_terms$freq,
  names.arg = top_terms$word,
  las = 2,
  col = "lightblue",
  main = "Most frequent words",
  ylab = "Word frequencies"
)
# Word cloud of the frequency table: at most 200 words, plotted in decreasing
# order of frequency, with 35% of the words rotated by 90 degrees.
library(wordcloud)
## Loading required package: RColorBrewer
cloud_palette <- brewer.pal(8, "Dark2")
wordcloud(
  words = d$word,
  freq = d$freq,
  min.freq = 1,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.35,
  colors = cloud_palette
)
N-gram models are a commonly used approach for predicting the next word in a string of words. Therefore, I plan to use the ngram package to build the prediction algorithm and to deploy it as a Shiny web application in my final submission.