This document describes steps for downloading and analyzing the Coursera-SwiftKey data.
library(tm)
## Loading required package: NLP
library(rlang)
library(readr)
library(stringi)
library(RWeka)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(wordcloud)
## Loading required package: RColorBrewer
library(SnowballC)
library(gridExtra)
library(RColorBrewer)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Two helper functions were created: one removes internet-specific tokens (e-mail addresses, @mentions, #hashtags and URLs) and the other removes symbols.
# Replace internet-specific tokens in text with a single space.
#
# x: character vector to clean.
# Returns the cleaned character vector (same length as x).
remove_chars <- function(x) {
  # Applied in this order: tokens with an embedded "@" go first, so the
  # bare "@..." rule only sees what is left over.
  web_patterns <- c(
    "[^ ]{1,}@[^ ]{1,}",   # non-space run, "@", non-space run
    "@[^ ]{1,}",           # "@" followed by a non-space run
    "#[^ ]{1,}",           # "#" followed by a non-space run
    "[^ ]{1,}://[^ ]{1,}"  # non-space run, "://", non-space run
  )
  for (pattern in web_patterns) {
    x <- gsub(pattern, " ", x)
  }
  x
}
# Blank out everything except lower-case ASCII letters and word-internal
# apostrophes.
#
# x: character vector; expected to be lower-cased already, because any
#    character outside a-z (including upper-case letters and digits) is
#    replaced by a space.
# Returns the cleaned character vector (same length as x).
remove_symbols <- function(x) {
  # A former first rule listed a backtick plus characters that had been
  # corrupted to "?". It was dropped: the rule below already replaces every
  # character that is not a-z or an apostrophe, so the result is unchanged.
  x <- gsub("[^a-z']", " ", x)
  # Runs of two or more apostrophes are never part of a word.
  x <- gsub("'{2,}", " ", x)
  # Drop apostrophes that sit at a word edge (before/after a space, or at
  # the start/end of the string); apostrophes inside a word are kept.
  x <- gsub("' ", " ", x)
  x <- gsub(" '", " ", x)
  x <- gsub("^'", " ", x)
  x <- gsub("'$", " ", x)
  x
}
This step downloads the data from the source and unpacks it so that it can be read in R. The download is commented out so that users who already have the data can skip it. Also note that the working directory needs to be updated to match your setup.
# Location of the zipped data set.
fileurl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
# Uncomment the next two lines to fetch and unpack the archive when the data
# is not already available locally.
# download.file(fileurl, destfile ="Coursera-SwiftKey.zip")
# unzip("Coursera-SwiftKey.zip")
# No setwd() here: the file paths used below are written relative to the
# directory that contains "final/" (they start with "./final/en_US/"), so
# changing into "./final/en_US" first would make every later read fail when
# this code is run as a plain script.
In this section, the file size, number of lines, and number of words are calculated for each file.
# file n°1 = Blog
fileName <- "./final/en_US/en_US.blogs.txt"
# Read through a binary connection so that control bytes inside the file
# cannot end the read early, skip embedded nul bytes, and mark the text as
# UTF-8 so that word counting sees the intended characters.
fileConnection <- file(fileName, open = "rb")
linesInFile <- readLines(fileConnection, encoding = "UTF-8", skipNul = TRUE)
close(fileConnection)
# Note: "File Size" is the in-memory size of the character vector
# (object.size), not the size of the file on disk.
fileSize <- format(object.size(linesInFile), units = "Kb")
fileNLine <- length(linesInFile)
fileWords <- sum(stri_count_words(linesInFile))
# First row of the summary table; further rows are appended per file.
# stringsAsFactors = FALSE keeps the label column as plain text so that the
# character rows appended later do not hit factor-level problems on R < 4.0.
summaryTable <- data.frame("Blog", fileSize, fileNLine, fileWords,
                           stringsAsFactors = FALSE)
colnames(summaryTable) <- c("File", "File Size", "Number of lines", "Number of words")
#File n°2 = News
fileName <- "./final/en_US/en_US.news.txt"
# Read through a binary connection. A plain text-mode read of this file
# reported "incomplete final line found", which suggests the read stopped
# before the real end of the file (on Windows an embedded Ctrl-Z byte ends a
# text-mode read) -- TODO confirm the resulting line count against the file.
fileConnection <- file(fileName, open = "rb")
linesInFile2 <- readLines(fileConnection, encoding = "UTF-8", skipNul = TRUE)
close(fileConnection)
# Accumulate all lines read so far; the combined vector is used for the
# overall totals and for sampling.
linesInFile <- c(linesInFile, linesInFile2)
# Per-file statistics are computed on this file's lines only.
fileSize <- format(object.size(linesInFile2), units = "Kb")
fileNLine <- length(linesInFile2)
fileWords <- sum(stri_count_words(linesInFile2))
summaryTable <- rbind(summaryTable, c("News", fileSize, fileNLine, fileWords))
#File n°3 = Twitter
fileName <- "./final/en_US/en_US.twitter.txt"
# A plain readLines() on this file warned about embedded nul bytes on four
# lines (167155, 268547, 1274086, 1759032). skipNul = TRUE drops those bytes
# explicitly instead of relying on the warning path; the binary connection
# keeps other control bytes from ending the read early.
fileConnection <- file(fileName, open = "rb")
linesInFile2 <- readLines(fileConnection, encoding = "UTF-8", skipNul = TRUE)
close(fileConnection)
# Accumulate all lines read so far; the combined vector is used for the
# overall totals and for sampling.
linesInFile <- c(linesInFile, linesInFile2)
# Per-file statistics are computed on this file's lines only.
fileSize <- format(object.size(linesInFile2), units = "Kb")
fileNLine <- length(linesInFile2)
fileWords <- sum(stri_count_words(linesInFile2))
summaryTable <- rbind(summaryTable, c("Twitter", fileSize, fileNLine, fileWords))
# Overall: totals over the combined lines of every file read above.
# fileNLine is left holding the combined line count for the code that follows.
fileNLine <- length(linesInFile)
fileWords <- sum(stri_count_words(linesInFile))
fileSize <- format(object.size(linesInFile), units = "Kb")
# Append the totals as the last row, then show the finished table.
summaryTable <- rbind(
  summaryTable,
  c("Total", fileSize, fileNLine, fileWords)
)
print(summaryTable)
## File File Size Number of lines Number of words
## 1 Blog 261483 Kb 899288 37546250
## 2 News 20243.6 Kb 77259 2674536
## 3 Twitter 326645.2 Kb 2360148 30093372
## 4 Total 608286.8 Kb 3336695 70314158
Once the three files are combined into one, we run some basic transformations such as removing internet characters and symbols, converting all words to lower case, etc. To save memory and time, we work on a random sample of about 1% of the overall data (each line is kept with probability 0.01).
# Fixed seed so that the same random sample is drawn on every run.
set.seed(12345L)
# One Bernoulli(0.01) draw per line: each line is kept with probability 0.01.
trainingData <- linesInFile[rbinom(fileNLine, 1, 0.01) == 1]
corpusFeeds <- VCorpus(VectorSource(trainingData))
# Cleaning pipeline. The order matters:
#  1. lower-case first, so the later rules (which only know a-z) apply;
#  2. strip e-mail addresses, @mentions, #hashtags and URLs while the
#     "@", "#" and "://" markers are still present -- running
#     removePunctuation first would delete those markers and the rule would
#     never match;
#  3. remove stop words while apostrophes are still present, so that
#     contractions such as "don't" match the stop-word list;
#  4. only then remove punctuation and remaining symbols;
#  5. collapse the runs of spaces left behind by the replacements.
corpusFeeds <- tm_map(corpusFeeds, content_transformer(tolower)) # put in lower char
corpusFeeds <- tm_map(corpusFeeds, content_transformer(remove_chars)) # remove internet chars
corpusFeeds <- tm_map(corpusFeeds, removeWords, stopwords("english")) # remove English stop words
corpusFeeds <- tm_map(corpusFeeds, removePunctuation) # remove punctuation
corpusFeeds <- tm_map(corpusFeeds, content_transformer(remove_symbols)) # remove symbols
corpusFeeds <- tm_map(corpusFeeds, stripWhitespace) # remove extra spaces
The next step is to remove the profanities listed at http://www.bannedwordlist.com.
# Word list used for profanity filtering.
filerul2 <- "http://www.bannedwordlist.com/lists/swearWords.txt"
# Only download when no local copy exists, so repeated runs neither need
# network access nor overwrite the file.
if (!file.exists("badwords.txt")) {
  download.file(filerul2, destfile = "badwords.txt")
}
# warn = FALSE: the list has no trailing newline, which is harmless here.
badwords <- readLines("badwords.txt", warn = FALSE)
# removeWords expects a plain character vector of words (not a Source
# object). Normalise the entries to match the lower-cased corpus and drop
# blank lines so that no empty alternative ends up in the pattern.
profanity <- tolower(trimws(badwords))
profanity <- profanity[nzchar(profanity)]
corpusFeeds <- tm_map(corpusFeeds, removeWords, profanity)
I chose two ways to present interesting findings. The first is a word cloud, which shows which words are used most frequently. The second is a set of n-gram bar charts, which show which combinations of words are used most frequently.
corpusFeeds <- tm_map(corpusFeeds, PlainTextDocument)
# generate N-grams and plot histograms of top 10 occurring N-Grams.
# One row of text per document. vapply() guarantees exactly one string per
# document (sapply() would silently return a list if a document ever held
# more than one line); multi-line content is joined with single spaces.
dfForNGrams <- data.frame(
  text = vapply(
    corpusFeeds,
    function(doc) paste(as.character(doc), collapse = " "),
    character(1)
  ),
  stringsAsFactors = FALSE
)
# unigram
# Tokenise the text column, one string per document. Passing the whole data
# frame would have it coerced with as.character() into a single deparsed
# string, which adds spurious tokens and lets n-grams run across documents.
uniGramToken <- NGramTokenizer(dfForNGrams$text, Weka_control(min = 1, max = 1))
# Frequency table of the tokens, most frequent first.
unigram <- data.frame(table(uniGramToken))
colnames(unigram) <- c("Word", "Frequency")
unigram <- arrange(unigram, desc(Frequency))
# Word cloud of the 100 most frequent words, largest in the centre.
wordcloud(unigram$Word,
          unigram$Frequency,
          max.words = 100, min.freq = 1,
          random.order = FALSE,
          colors = brewer.pal(8, "Dark2"), font = 3)
# bigram
# Tokenise the text column, one string per document. Passing the whole data
# frame would have it coerced with as.character() into a single deparsed
# string, which adds spurious tokens and lets n-grams run across documents.
biGramToken <- NGramTokenizer(dfForNGrams$text, Weka_control(min = 2, max = 2))
# Frequency table of the bigrams, most frequent first.
bigram <- data.frame(table(biGramToken))
colnames(bigram) <- c("Word", "Frequency")
bigram <- arrange(bigram, desc(Frequency))
# head() instead of [1:10, ]: no all-NA filler rows when fewer than ten
# distinct bigrams exist.
subBigram <- head(bigram, 10)
# Horizontal bar chart of the top bigrams, ordered by frequency.
plotBiGram <- ggplot(subBigram, aes(x = reorder(Word, Frequency), y = Frequency)) +
  geom_bar(stat = "identity", fill = "red") +
  geom_text(aes(y = Frequency, label = Frequency), vjust = 1) +
  coord_flip() +
  labs(x = "Word", y = "Frequency", title = "Bigrams frequency")
# trigram
# Tokenise the text column, one string per document. Passing the whole data
# frame would have it coerced with as.character() into a single deparsed
# string, which adds spurious tokens and lets n-grams run across documents.
triGramToken <- NGramTokenizer(dfForNGrams$text, Weka_control(min = 3, max = 3))
# Frequency table of the trigrams, most frequent first.
trigram <- data.frame(table(triGramToken))
colnames(trigram) <- c("Word", "Frequency")
trigram <- arrange(trigram, desc(Frequency))
# head() instead of [1:10, ]: no all-NA filler rows when fewer than ten
# distinct trigrams exist.
subTrigram <- head(trigram, 10)
# Horizontal bar chart of the top trigrams, ordered by frequency.
plotTriGram <- ggplot(subTrigram, aes(x = reorder(Word, Frequency), y = Frequency)) +
  geom_bar(stat = "identity", fill = "green") +
  geom_text(aes(y = Frequency, label = Frequency), vjust = 1) +
  coord_flip() +
  labs(x = "Word", y = "Frequency", title = "Trigrams frequency")
# Show the bigram and trigram charts side by side.
grid.arrange(plotBiGram, plotTriGram, ncol = 2)