Note: some parts of the code are commented out because, for reasons I have not been able to identify, I encountered problems publishing this work on RPubs. The code runs correctly, so feel free to copy it into RStudio and try it directly.
This report is the final assignment of the second week of the Johns Hopkins Data Science Capstone course. The goal of this task is to show that I have acquired the ability to work with the data and to create a prediction algorithm. The principal goals are: to demonstrate that the data has been downloaded and loaded correctly; to create a basic summary report of the data; to report any interesting findings; and to get feedback on the plans for creating a prediction algorithm and a Shiny app.
# Install
#install.packages("wordcloud")
#install.packages("RColorBrewer")
#install.packages("wordcloud2")
#install.packages("tm")
#install.packages("SnowballC")
#install.packages("NLP")
#install.packages("ggplot2")
# Load
library(RColorBrewer)
library(wordcloud)
library(wordcloud2)
library(tm)
library(SnowballC)
library(NLP)
library(ggplot2)
setwd("C:/Users/bmast/Desktop/Capstone")
if(!file.exists("Coursera-SwiftKey.zip")){
download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip", "Coursera-SwiftKey.zip")
unzip("Coursera-SwiftKey.zip")
}
The Coursera-SwiftKey.zip archive contains several folders, one per locale of the files. The "en_US" folder holds three files: en_US.blogs.txt, a large collection of character lines (899,288 elements) with over 39.3 million words; en_US.news.txt, a collection of character lines (77,259 elements) with over 2.8 million words; and en_US.twitter.txt, a very large collection of character lines (over 2 million elements) with over 32.8 million words.
Load the data into three character vectors.
blogs <- readLines("C:/Users/bmast/Desktop/Capstone/final/en_US/en_US.blogs.txt", warn = FALSE, encoding = "UTF-8")
news <- readLines("C:/Users/bmast/Desktop/Capstone/final/en_US/en_US.news.txt", warn = FALSE, encoding = "UTF-8")
tweets <- readLines("C:/Users/bmast/Desktop/Capstone/final/en_US/en_US.twitter.txt", warn = FALSE, encoding = "UTF-8")
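As a quick check of the word counts quoted above, the words per line can be counted with the stringi package (a minimal sketch; stringi is an assumption, it is not among the packages loaded above).
library(stringi)
sum(stri_count_words(blogs))   # expected: over 39.3 million words
sum(stri_count_words(news))    # expected: over 2.8 million words
sum(stri_count_words(tweets))  # expected: over 32.8 million words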
Create a summary data frame of the files and view their statistics.
summary <- data.frame('File' = c("Blogs", "News", "Twitter"),
                      "File Size" = sapply(list(blogs, news, tweets), function(x){format(object.size(x), "MB")}),
                      'Nentries' = sapply(list(blogs, news, tweets), function(x){length(x)}),
                      'TotalCharacters' = sapply(list(blogs, news, tweets), function(x){sum(nchar(x))}),
                      'MaxCharacters' = sapply(list(blogs, news, tweets), function(x){max(nchar(x))})
)
summary
## File File.Size Nentries TotalCharacters MaxCharacters
## 1 Blogs 248.5 Mb 899288 206824505 40833
## 2 News 19.2 Mb 77259 15639408 5760
## 3 Twitter 301.4 Mb 2360148 162096031 140
The files are huge, so I draw a random sample of roughly 1% of the lines of each one.
set.seed(1234)  # fix the seed so the samples are reproducible
blogs_sample <- blogs[rbinom(length(blogs), 1, .01) == 1]   # keep each line with probability 1%
news_sample <- news[rbinom(length(news), 1, .01) == 1]
tweets_sample <- tweets[rbinom(length(tweets), 1, .01) == 1]
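For reference, a roughly equivalent one-liner uses base R's sample function (a sketch; round keeps the sample size an integer):
blogs_sample <- sample(blogs, round(length(blogs) * 0.01))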
Create a corpus for each sample via tm's VCorpus function.
doc_blogs <- VCorpus(VectorSource(blogs_sample))
doc_news <- VCorpus(VectorSource(news_sample))
doc_tweets <- VCorpus(VectorSource(tweets_sample))
#Inspect the samples: commented out to avoid filling the document with unnecessary detail
#inspect(doc_blogs)
#inspect(doc_news)
#inspect(doc_tweets)
Create a function to "clean" the samples: convert all characters to lowercase; remove numbers, punctuation, extra whitespace, English stop words and special characters; and stem the words. Then apply the cleaning function to the samples.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
clean_docs <- function(corpus){
  # replace separators with spaces first, so that removePunctuation
  # does not glue the surrounding words together
  corpus <- tm_map(corpus, toSpace, "/")
  corpus <- tm_map(corpus, toSpace, "@")
  corpus <- tm_map(corpus, toSpace, "\\|")
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, stripWhitespace)
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, stemDocument)
  return(corpus)
}
clean_docB <- clean_docs(doc_blogs)
clean_docN <- clean_docs(doc_news)
#clean_docT <- clean_docs(doc_tweets)
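To check what the cleaning does to a single document, without inspecting the whole corpus, the raw and cleaned versions of the first sampled blog line can be compared (a minimal sketch):
writeLines(as.character(doc_blogs[[1]]))   # raw sampled line
writeLines(as.character(clean_docB[[1]]))  # lowercased, stop words removed, stemmed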
BlogMatrix <- TermDocumentMatrix(clean_docB)
matrixB <- as.matrix(BlogMatrix)
frequencyB <- rowSums(matrixB)
length(frequencyB)
## [1] 4123
findFreqTerms(BlogMatrix,lowfreq=500)
## [1] "also" "can" "come" "day" "even" "first" "get"
## [8] "great" "just" "know" "last" "life" "like" "littl"
## [15] "look" "love" "make" "mani" "may" "much" "need"
## [22] "new" "one" "peopl" "realli" "say" "thing" "time"
## [29] "two" "will" "work" "year"
wordsB <- sort(frequencyB,decreasing=TRUE)
dfB <- data.frame(wordB = names(wordsB),freqB=wordsB)
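Note that as.matrix densifies the TermDocumentMatrix, which can exhaust memory on a larger sample; tm's removeSparseTerms can shrink the matrix first (a sketch; the 0.999 sparsity threshold is an assumption):
# keep only terms that appear in at least 0.1% of the documents
BlogMatrixSmall <- removeSparseTerms(BlogMatrix, 0.999)
frequencyB_small <- rowSums(as.matrix(BlogMatrixSmall))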
NewsMatrix <- TermDocumentMatrix(clean_docN)
matrixN <- as.matrix(NewsMatrix)
frequencyN <- rowSums(matrixN)
length(frequencyN)
## [1] 1487
findFreqTerms(NewsMatrix,lowfreq=20)
## [1] "accord" "agenc" "allow" "also" "american"
## [6] "associ" "austin" "avail" "aztex" "base"
## [11] "beach" "board" "can" "cant" "case"
## [16] "charg" "chief" "chris" "close" "colleg"
## [21] "come" "crime" "day" "defend" "done"
## [26] "dont" "dunkin" "earli" "engin" "even"
## [31] "everybodi" "execut" "fight" "figur" "financ"
## [36] "finish" "fire" "forward" "game" "get"
## [41] "goal" "god" "group" "head" "heavili"
## [46] "help" "home" "huge" "involv" "job"
## [51] "johnson" "just" "kind" "know" "last"
## [56] "latest" "law" "leagu" "led" "let"
## [61] "like" "line" "littl" "long" "look"
## [66] "make" "middl" "minist" "monday" "much"
## [71] "name" "new" "next" "now" "nurs"
## [76] "offic" "one" "open" "part" "percent"
## [81] "perform" "plan" "play" "polic" "polit"
## [86] "posit" "presid" "put" "readi" "real"
## [91] "realli" "record" "releas" "report" "research"
## [96] "result" "return" "right" "run" "said"
## [101] "say" "season" "shoot" "show" "sinc"
## [106] "singl" "six" "someth" "spring" "start"
## [111] "state" "strike" "support" "tabl" "teacher"
## [116] "team" "theater" "think" "though" "three"
## [121] "time" "today" "tough" "tri" "univers"
## [126] "use" "walk" "want" "water" "will"
## [131] "win" "women" "work" "year" "yearold"
wordsN <- sort(frequencyN,decreasing=TRUE)
dfN <- data.frame(wordN = names(wordsN), freqN = wordsN)
# TweetMatrix <- TermDocumentMatrix(clean_docT)
# matrixT <- as.matrix(TweetMatrix)
# frequencyT <- rowSums(matrixT)
# length(frequencyT)
# findFreqTerms(TweetMatrix,lowfreq=100)
# wordsT <- sort(frequencyT,decreasing=TRUE)
# dfT <- data.frame(wordT = names(wordsT),freqT=wordsT)
Blog Wordcloud
wordcloud2(data=dfB, size=0.3, shape = 'pentagon', color='random-dark')
The same procedure yields the wordclouds of the data from the "News" and "Twitter" files.
# News
# wordcloud2(data=dfN, size=0.6, shape = 'pentagon', color='random-dark')
# Twitter
# wordcloud2(data=dfT, size=0.6, shape = 'pentagon', color='random-dark')
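Since wordcloud2 produces an htmlwidget, which may be behind the RPubs publishing problems mentioned at the top, the wordcloud and RColorBrewer packages loaded above offer a static alternative (a sketch; the min.freq and max.words values are assumptions):
set.seed(4321)  # fix the cloud layout
wordcloud(words = dfB$wordB, freq = dfB$freqB, min.freq = 100,
          max.words = 100, colors = brewer.pal(8, "Dark2"))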
Blog Histogram
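A sketch of the blog histogram, following the same pattern as the commented News and Twitter plots below; the freqB > 500 cutoff is an assumption, chosen to match the findFreqTerms threshold used above.
p <- ggplot(subset(dfB, freqB > 500), aes(wordB, freqB))
p <- p + geom_bar(stat = "identity", fill = "darkred", colour = "darkgreen")
p <- p + scale_x_discrete("Most frequent words") + scale_y_continuous("Word frequencies")
p <- p + theme(axis.text.x = element_text(angle = 45, hjust = 1))
p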
The same procedure yields the plots of the data from the "News" and "Twitter" files.
# News
# p <- ggplot(subset(dfN, freqN>1000), aes(wordN, freqN))
# p <- p + geom_bar(stat="identity", fill="darkred", colour="darkgreen")
# p <- p + scale_x_discrete("Most frequent words") + scale_y_continuous("Word frequencies")
# p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
# p
# Twitter
# p <- ggplot(subset(dfT, freqT>1000), aes(wordT, freqT))
# p <- p + geom_bar(stat="identity", fill="darkred", colour="darkgreen")
# p <- p + scale_x_discrete("Most frequent words") + scale_y_continuous("Word frequencies")
# p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
# p
In conclusion, this is a complex project. The next step is to understand the distribution of, and the relationships between, words, tokens and sentences in the text. After that, I plan to build an n-gram model of the relationships between words, in order to predict the next word from the previous 1, 2 or 3 words.
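As a preview of that n-gram step, bigrams can already be counted with the NLP and tm packages loaded above (a minimal sketch; the lowfreq cutoff of 50 is an assumption):
# tokenize each cleaned document into pairs of adjacent words
BigramTokenizer <- function(x) {
  unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
}
BlogBigrams <- TermDocumentMatrix(clean_docB, control = list(tokenize = BigramTokenizer))
findFreqTerms(BlogBigrams, lowfreq = 50)  # frequent word pairs in the blog sample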