The aim of this report is to describe the three text files that will be used to build the corpus on which a predictive text algorithm for SwiftKey will be modeled.
The zip file containing the text files was downloaded from https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
#Download and unzip the data set.
if (!file.exists("Coursera-SwiftKey.zip")) {
    download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
                  "/Users/carlosbarco/EDUCACION/COURSERA/Data Science SPECIALIZATION/10 Capstone project/Week 01/Resources/Coursera-SwiftKey.zip",
                  quiet = FALSE, mode = "w", cacheOK = TRUE)
}
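The unzip step itself is not shown above; a minimal sketch, assuming the archive is extracted into the same Resources directory (the resources variable is introduced here only for readability):
#Unzip into the Resources folder; this creates the final/en_US/ directory read below
resources <- "/Users/carlosbarco/EDUCACION/COURSERA/Data Science SPECIALIZATION/10 Capstone project/Week 01/Resources"
if (!dir.exists(file.path(resources, "final"))) {
    unzip(file.path(resources, "Coursera-SwiftKey.zip"), exdir = resources)
}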
library(tm)
## Warning: package 'tm' was built under R version 3.4.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.4.1
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
myfile <- "/Users/carlosbarco/EDUCACION/COURSERA/Data Science SPECIALIZATION/10 Capstone project/Week 01/Resources/final/en_US/en_US.blogs.txt"
en_US.blogs <- scan(file=myfile, what="character", sep="\n", quote="")
myfile <- "/Users/carlosbarco/EDUCACION/COURSERA/Data Science SPECIALIZATION/10 Capstone project/Week 01/Resources/final/en_US/en_US.news.txt"
en_US.news <- scan(file=myfile, what="character", sep="\n", quote="")
myfile <- "/Users/carlosbarco/EDUCACION/COURSERA/Data Science SPECIALIZATION/10 Capstone project/Week 01/Resources/final/en_US/en_US.twitter.txt"
en_US.twitter <- scan(file=myfile, what="character", sep="\n", quote="")
## Warning in scan(file = myfile, what = "character", sep = "\n", quote = ""):
## embedded nul(s) found in input
summary(nchar(en_US.blogs))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 47 156 230 329 40833
summary(nchar(en_US.news))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 110.0 185.0 201.2 268.0 11384.0
summary(nchar(en_US.twitter))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 37.00 64.00 68.68 100.00 140.00
sum(sapply(strsplit(en_US.blogs, " "), length))
## [1] 37334131
sum(sapply(strsplit(en_US.news, " "), length))
## [1] 34372530
sum(sapply(strsplit(en_US.twitter, " "), length))
## [1] 30373543
blogs <- sample(en_US.blogs, size = ceiling(length(en_US.blogs)/10),
                replace = FALSE)
news <- sample(en_US.news, size = ceiling(length(en_US.news)/10),
               replace = FALSE)
twitter <- sample(en_US.twitter, size = ceiling(length(en_US.twitter)/10),
                  replace = FALSE)
rm(en_US.blogs, en_US.news, en_US.twitter)
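Note that the sampling above is not seeded, so the 10% subsets differ between runs; a seed could be set before the calls to sample(), e.g. (the seed value is arbitrary):
#Hypothetical: fix the RNG state before sampling so the same subsets are drawn each run
set.seed(1234)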
Using the tm package, a corpus is created from the sampled data.
#Combine the samples into a single source; note that paste() joins the three
#vectors element-wise, recycling the shorter ones to the length of the longest
corp.source <- VectorSource(paste(blogs, news, twitter))
rm(blogs, news, twitter)
corpus <- VCorpus(corp.source, readerControl = list(language = "English"))
rm(corp.source)
#Clean corpus; use getTransformations() to see all available.
##every word to lower case
corpus <- tm_map(corpus, content_transformer(tolower))
##remove numbers
corpus <- tm_map(corpus, removeNumbers)
##strip whitespaces
corpus <- tm_map(corpus, stripWhitespace)
##remove stopwords
corpus <- tm_map(corpus, removeWords, stopwords("english"))
##remove own stopwords
##profanity list from http://www.cs.cmu.edu/~biglou/resources/
url <- "http://www.cs.cmu.edu/~biglou/resources/bad-words.txt"
profanity <- read.csv(url, header=FALSE)
ownStopWords <- as.character(profanity[[1]])
corpus <- tm_map(corpus, removeWords, ownStopWords)
rm(url, profanity, ownStopWords)
#Get the cleaned term matrix
dtm <- DocumentTermMatrix(corpus)
dim(dtm)
## [1] 236015 430374
#Remove sparse items
dtm2 <- removeSparseTerms(dtm, sparse=0.95)
dim(dtm2)
## [1] 236015 51
rm(dtm)
The following histogram shows the frequency of occurrence of unique words.
# Show frequent terms
#Frequency
freq <- sort(colSums(as.matrix(dtm2)), decreasing=TRUE)
word.freq <- data.frame(word=names(freq), freq=freq)
#Plot histogram
barplot(height=word.freq$freq, names.arg=word.freq$word)
library("magrittr")
subset(word.freq, freq > 250) %>%
    ggplot(aes(word, freq)) +
    geom_bar(stat="identity", fill="darkred", colour="darkgreen") +
    theme(axis.text.x=element_text(angle=45, hjust=1))
The next steps of this capstone project are to build and finalize the predictive algorithm and to deploy it as a Shiny app.
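As a first concrete step toward that algorithm, n-gram frequencies can be extracted from the cleaned corpus. The sketch below counts bigrams with base R only; the helper make.bigrams and the object bigram.freq are illustrative names and not part of the analysis above, and the final prediction model is still to be defined.
#Sketch: count bigram frequencies in the cleaned corpus (base R only)
texts <- sapply(corpus, as.character)
make.bigrams <- function(line) {
    words <- unlist(strsplit(line, "\\s+"))
    words <- words[words != ""]
    if (length(words) < 2) return(character(0))
    paste(head(words, -1), tail(words, -1))
}
bigrams <- unlist(lapply(texts, make.bigrams))
bigram.freq <- sort(table(bigrams), decreasing = TRUE)
head(bigram.freq)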