Note: some parts are commented because, I don’t know for which reason, I have encountered problems in publishing my work on RPubs. The code works correctly and feel free to copy it to RStudio and try it directly.

Introdution

This report is the final assigment of the second week of the Johns Hopkins Data Science Capstone course. The goal of this task is to show that I have acquired the ability to work with data and create a prediction algorithm. The principal goals are: to demonstrate to have downloaded the data and uploaded it correctly: to create a summary report of the basic summary data: to report any interesting discoveries; to get feedback on the future plans for creating a forecasting algorithm and a Shiny app.

Install and loading libraries

# Install
#install.packages("wordcloud")
#install.packages("RColorBrewer")
#install.packages("wordcloud2")
#install.packages("tm")
#install.packages("SnowballC") 
#install.packages("NLP") 
#install.packages("ggplot2")
# Load
library(RColorBrewer)
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.4.4
library(wordcloud2)
## Warning: package 'wordcloud2' was built under R version 3.4.4
library(tm)
## Warning: package 'tm' was built under R version 3.4.4
## Loading required package: NLP
library(SnowballC)
library(NLP)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate

Downloaded the data

setwd("C:/Users/bmast/Desktop/Capstone")
if(!file.exists("Coursera-SwiftKey.zip")){
  download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip", "Coursera-SwiftKey.zip")
  unzip("Coursera-SwiftKey.zip")
}

The Coursera-SwiftKey.zip contain various folders, based on country of origin of the files itself. In the folder “en_US” there are three files: en_US.blogs.txt is a large collection of character lines (899288 elements) with over 39.3 million words. en_US.news.txt is a large collection of character lines (77259 elements) with over 2.8 million words en_US.twitter.txt is a very large collection of character lines (over 2 million elements) with over 32.8 million words.

Data Summary

Load data in three vectors

blogs <- readLines("C:/Users/bmast/Desktop/Capstone/final/en_US/en_US.blogs.txt", warn = FALSE, encoding = "UTF-8")
news <- readLines("C:/Users/bmast/Desktop/Capstone/final/en_US/en_US.news.txt", warn = FALSE, encoding = "UTF-8")
tweets <- readLines("C:/Users/bmast/Desktop/Capstone/final/en_US/en_US.twitter.txt", warn = FALSE, encoding = "UTF-8")

Create a summary of the files and view its statistics.

summary <- data.frame('File' = c("Blogs","News","Twitter"),
                      "File Size" = sapply(list(blogs, news, tweets), function(x){format(object.size(x),"MB")}),
                      'Nentries' = sapply(list(blogs, news, tweets), function(x){length(x)}),
                      'TotalCharacters' = sapply(list(blogs, news, tweets), function(x){sum(nchar(x))}),
                      'MaxCharacters' = sapply(list(blogs, news, tweets), function(x){max(unlist(lapply(x, function(y) nchar(y))))})
)
summary
##      File File.Size Nentries TotalCharacters MaxCharacters
## 1   Blogs  248.5 Mb   899288       206824505         40833
## 2    News   19.2 Mb    77259        15639408          5760
## 3 Twitter  301.4 Mb  2360148       162096031           140

Cleaning data

The files are huge so I create a sample for each of the

blogs_sample <- blogs[rbinom(length(blogs)*.01, length(blogs), .01)]
news_sample <- news[rbinom(length(news)*.01, length(news), .01)]
tweets_sample <- tweets[rbinom(length(tweets)*.01, length(tweets), .01)]

Creating a Corpus via the VCorpus function

doc_blogs <- VCorpus(VectorSource(blogs_sample))
doc_news <- VCorpus(VectorSource(news_sample))
doc_tweets <- VCorpus(VectorSource(tweets_sample))

#Inspect the samples: I commented them to avoid inserting unnecessary details into the document
#inspect(doc_blogs)
#inspect(doc_news)
#inspect(doc_tweets)

Creating a function to “clean” the samples from special characters, numbers and convert all characters to lowercase etc. Apply the clean function to all the samples.

toSpace <- content_transformer(function (x , pattern ) gsub(pattern, " ", x))
clean_docs <- function(corpus){
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, stripWhitespace)
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, toSpace, "/")
  corpus <- tm_map(corpus, toSpace, "@")
  corpus <- tm_map(corpus, toSpace, "\\|")
  corpus <- tm_map(corpus, stemDocument)
  return(corpus)
}

clean_docB <- clean_docs(doc_blogs)
clean_docN <- clean_docs(doc_news)
#clean_docT <- clean_docs(doc_tweets)

Data preparation for analysis and chart creation

Blog

BlogMatrix <- TermDocumentMatrix(clean_docB) 
matrixB <- as.matrix(BlogMatrix) 
frequencyB <- rowSums(matrixB)
length(frequencyB)
## [1] 4123
findFreqTerms(BlogMatrix,lowfreq=500)
##  [1] "also"   "can"    "come"   "day"    "even"   "first"  "get"   
##  [8] "great"  "just"   "know"   "last"   "life"   "like"   "littl" 
## [15] "look"   "love"   "make"   "mani"   "may"    "much"   "need"  
## [22] "new"    "one"    "peopl"  "realli" "say"    "thing"  "time"  
## [29] "two"    "will"   "work"   "year"
wordsB <- sort(frequencyB,decreasing=TRUE) 
dfB <- data.frame(wordB = names(wordsB),freqB=wordsB)

News

NewsMatrix <- TermDocumentMatrix(clean_docN) 
matrixN <- as.matrix(NewsMatrix) 
frequencyN <- rowSums(matrixN)
length(frequencyN)
## [1] 1487
findFreqTerms(NewsMatrix,lowfreq=20)
##   [1] "accord"    "agenc"     "allow"     "also"      "american" 
##   [6] "associ"    "austin"    "avail"     "aztex"     "base"     
##  [11] "beach"     "board"     "can"       "cant"      "case"     
##  [16] "charg"     "chief"     "chris"     "close"     "colleg"   
##  [21] "come"      "crime"     "day"       "defend"    "done"     
##  [26] "dont"      "dunkin"    "earli"     "engin"     "even"     
##  [31] "everybodi" "execut"    "fight"     "figur"     "financ"   
##  [36] "finish"    "fire"      "forward"   "game"      "get"      
##  [41] "goal"      "god"       "group"     "head"      "heavili"  
##  [46] "help"      "home"      "huge"      "involv"    "job"      
##  [51] "johnson"   "just"      "kind"      "know"      "last"     
##  [56] "latest"    "law"       "leagu"     "led"       "let"      
##  [61] "like"      "line"      "littl"     "long"      "look"     
##  [66] "make"      "middl"     "minist"    "monday"    "much"     
##  [71] "name"      "new"       "next"      "now"       "nurs"     
##  [76] "offic"     "one"       "open"      "part"      "percent"  
##  [81] "perform"   "plan"      "play"      "polic"     "polit"    
##  [86] "posit"     "presid"    "put"       "readi"     "real"     
##  [91] "realli"    "record"    "releas"    "report"    "research" 
##  [96] "result"    "return"    "right"     "run"       "said"     
## [101] "say"       "season"    "shoot"     "show"      "sinc"     
## [106] "singl"     "six"       "someth"    "spring"    "start"    
## [111] "state"     "strike"    "support"   "tabl"      "teacher"  
## [116] "team"      "theater"   "think"     "though"    "three"    
## [121] "time"      "today"     "tough"     "tri"       "univers"  
## [126] "use"       "walk"      "want"      "water"     "will"     
## [131] "win"       "women"     "work"      "year"      "yearold"
wordsN <- sort(frequencyN,decreasing=TRUE) 
dfN <- data.frame(wordN = names(wordsN),freqT=wordsN)

Tweeter

# TweetMatrix <- TermDocumentMatrix(clean_docT) 
# matrixT <- as.matrix(TweetMatrix) 
# frequencyT <- rowSums(matrixT)
# length(frequencyT)
# findFreqTerms(TweetMatrix,lowfreq=100)
# wordsT <- sort(frequencyT,decreasing=TRUE) 
# dfT <- data.frame(wordT = names(wordsT),freqT=wordsT)

Visualize finding with a wordcloud

Blog

wordcloud2(data=dfB, size=0.3, shape = 'pentagon', color='random-dark')

The same procedure to get the wordcloud of data from the “News” and “Tweeter” files

# News
# wordcloud2(data=dfN, size=0.6, shape = 'pentagon', color='random-dark')

# Twitter
# wordcloud2(data=dfT, size=0.6, shape = 'pentagon', color='random-dark')

Display it in a histogram

Blog Histogram

The same procedure to get the plot of data from the “News” and “Tweeter” files

# News
# p <- ggplot(subset(dfN, freqN>1000), aes(wordN, freqN))
# p <- p + geom_bar(stat="identity", fill="darkred", colour="darkgreen")
# p <- p + scale_x_discrete("Most frequent words") +  scale_y_continuous("Word frequencies")
# p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
# p

# Twitter
# p <- ggplot(subset(dfT, freqT>1000), aes(wordT, freqT))
# p <- p + geom_bar(stat="identity", fill="darkred", colour="darkgreen")
# p <- p + scale_x_discrete("Most frequent words") +  scale_y_continuous("Word frequencies")
# p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
# p

Conclusion and request for a feedback

In conclusion it is a complex project but I would say that the next step is to understand the distribution and the relationship between words, tokens and sentences in the text. The second could be to construct an n-gram model for the relationship between words to predict the next word based on the previous 1, 2 or 3 words.