Introduction.

The aim of this report is to describe the three text files that will be used to build the corpus for a predictive text algorithm for SwiftKey.

Getting the data.

The zip file containing the text files was downloaded from https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip

#Download the data set if it is not already present.
zipPath <- "/Users/carlosbarco/EDUCACION/COURSERA/Data Science SPECIALIZATION/10 Capstone project/Week 01/Resources/Coursera-SwiftKey.zip"
if (!file.exists(zipPath)) {
  download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
                zipPath, quiet = FALSE, mode = "wb", cacheOK = TRUE)
}
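The archive then needs to be extracted into the same Resources directory (the files end up under final/en_US, as used below); a minimal sketch, reusing zipPath from the chunk above:

#Unzip the archive next to the downloaded file if the extracted folder is missing
if (!dir.exists(file.path(dirname(zipPath), "final"))) {
  unzip(zipPath, exdir = dirname(zipPath))
}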

Load the Data.

library(tm)
## Warning: package 'tm' was built under R version 3.4.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.4.1
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
myfile <- "/Users/carlosbarco/EDUCACION/COURSERA/Data Science SPECIALIZATION/10 Capstone project/Week 01/Resources/final/en_US/en_US.blogs.txt"
en_US.blogs <- scan(file=myfile, what="character", sep="\n", quote="")
myfile <- "/Users/carlosbarco/EDUCACION/COURSERA/Data Science SPECIALIZATION/10 Capstone project/Week 01/Resources/final/en_US/en_US.news.txt"
en_US.news <- scan(file=myfile, what="character", sep="\n", quote="")
myfile <- "/Users/carlosbarco/EDUCACION/COURSERA/Data Science SPECIALIZATION/10 Capstone project/Week 01/Resources/final/en_US/en_US.twitter.txt"
en_US.twitter <- scan(file=myfile, what="character", sep="\n", quote="")
## Warning in scan(file = myfile, what = "character", sep = "\n", quote = ""):
## embedded nul(s) found in input
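The embedded-nul warning on the Twitter file could also be avoided by reading the file with readLines() and skipNul = TRUE; a minimal sketch, assuming the same path stored in myfile:

#Alternative read for the Twitter file that drops embedded nuls instead of warning
con <- file(myfile, open = "rb")
en_US.twitter <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)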

Basic Report of Summary Statistics about the data sets.

The summaries below describe the distribution of the number of characters per line in each file.

summary(nchar(en_US.blogs))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1      47     156     230     329   40833
summary(nchar(en_US.news))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0   110.0   185.0   201.2   268.0 11384.0
summary(nchar(en_US.twitter))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00   37.00   64.00   68.68  100.00  140.00
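Line counts and on-disk sizes complement the character summaries; a short sketch, assuming the directory used in the load step above:

#Number of lines per data set
sapply(list(blogs = en_US.blogs, news = en_US.news, twitter = en_US.twitter), length)
#File sizes in megabytes (directory assumed from the paths above)
en_US.dir <- "/Users/carlosbarco/EDUCACION/COURSERA/Data Science SPECIALIZATION/10 Capstone project/Week 01/Resources/final/en_US"
round(file.size(file.path(en_US.dir, c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt"))) / 1024^2, 1)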

Word Count

sum(sapply(strsplit(en_US.blogs, " "), length))
## [1] 37334131
sum(sapply(strsplit(en_US.news, " "), length))
## [1] 34372530
sum(sapply(strsplit(en_US.twitter, " "), length))
## [1] 30373543
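Splitting on a single space undercounts tokens separated by multiple spaces; a more robust count could use word boundaries via the stringi package (an extra dependency not loaded elsewhere in this report):

#Word counts based on Unicode word boundaries rather than single spaces
library(stringi)
sum(stri_count_words(en_US.blogs))
sum(stri_count_words(en_US.news))
sum(stri_count_words(en_US.twitter))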

Exploratory Analysis

Select a sample of 10% of the total documents
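For reproducibility, the random number generator can be seeded before sampling; a one-line sketch (the seed value is arbitrary):

set.seed(1234)  #arbitrary seed so the 10% samples below are reproducible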

blogs <- sample(en_US.blogs, size = ceiling(length(en_US.blogs)/10),
                      replace = FALSE)
news <- sample(en_US.news, size = ceiling(length(en_US.news)/10),
                      replace = FALSE)
twitter <- sample(en_US.twitter, size = ceiling(length(en_US.twitter)/10),
                      replace = FALSE)

rm(en_US.blogs, en_US.news, en_US.twitter)

Create and Clean Corpus

Using the tm package, a corpus is created from the sampled data.

#paste() combines the three samples element-wise; the shorter samples are recycled to the length of the longest
corp.source <- VectorSource(paste(blogs, news, twitter))
rm(blogs, news, twitter)

corpus <- VCorpus(corp.source, readerControl = list(language = "en"))
rm(corp.source)
#Clean corpus; use getTransformations() to see all available.
##every word to lower case
corpus <- tm_map(corpus, content_transformer(tolower))
##remove numbers
corpus <- tm_map(corpus, removeNumbers)
##strip whitespaces
corpus <- tm_map(corpus, stripWhitespace)
##remove stopwords
corpus <- tm_map(corpus, removeWords, stopwords("english"))
##remove own stopwords
##profanity list from http://www.cs.cmu.edu/~biglou/resources/
url <- "http://www.cs.cmu.edu/~biglou/resources/bad-words.txt"
profanity <- read.csv(url, header=FALSE)
ownStopWords <- as.character(profanity[[1]])
corpus <- tm_map(corpus, removeWords, ownStopWords)

rm(url, profanity, ownStopWords)
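A quick sanity check of the cleaning can be done by inspecting the start of one cleaned document with tm's content():

#Sanity check: look at the beginning of the first cleaned document
substr(content(corpus[[1]]), 1, 200)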

Prepare the data to construct a histogram of word counts

#Get the cleaned term matrix
dtm <- DocumentTermMatrix(corpus)
dim(dtm)
## [1] 236015 430374
#Remove sparse items
dtm2 <- removeSparseTerms(dtm, sparse=0.95)
dim(dtm2)
## [1] 236015     51
rm(dtm)
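The terms that survive the sparsity filter can also be listed directly with tm's findFreqTerms(); a minimal sketch (the frequency threshold is illustrative):

#Terms that appear at least 1000 times in the reduced document-term matrix
findFreqTerms(dtm2, lowfreq = 1000)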

The histograms below show the frequency of occurrence of the most frequent terms remaining after cleaning and sparsity reduction.

#Frequency
freq <- sort(colSums(as.matrix(dtm2)), decreasing=TRUE)
word.freq <- data.frame(word=names(freq), freq=freq)
#Plot histogram
barplot(height=word.freq$freq, names.arg=word.freq$word)

library("magrittr")
subset(word.freq, freq>250)    %>%
        ggplot(aes(word, freq)) +
        geom_bar(stat="identity", fill="darkred", colour="darkgreen") +
        theme(axis.text.x=element_text(angle=45, hjust=1))

Next Steps For Prediction Algorithm

The next steps of this capstone project are to finalize the predictive algorithm and to deploy it as a Shiny app.
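As a sketch of the direction for the prediction algorithm, bigram frequencies could be tabulated from the cleaned corpus with a custom tokenizer; the RWeka dependency and the details below are assumptions, not part of this report's pipeline:

#Illustrative bigram counts from the cleaned corpus (RWeka is an assumed extra dependency)
library(RWeka)
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
bigram.dtm <- DocumentTermMatrix(corpus, control = list(tokenize = BigramTokenizer))
#Ten most frequent bigrams (slam is installed as a dependency of tm)
head(sort(slam::col_sums(bigram.dtm), decreasing = TRUE), 10)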