Objectives

This document describes steps for downloading and analyzing the Coursera-SwiftKey data.

Load necessary packages

library(tm)
## Loading required package: NLP
library(rlang)
library(readr)
library(stringi)
library(RWeka)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(wordcloud)
## Loading required package: RColorBrewer
library(SnowballC)
library(gridExtra)
library(RColorBrewer)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Create functions for tidying the data

Two functions were created: one removes internet-specific tokens (e-mail addresses, @mentions, hashtags and URLs), and the other removes symbols.

# Strip internet-specific tokens from a character vector.
#
# x: character vector of text.
# Returns x with every matched token replaced by a single space.
remove_chars <- function(x) {
  # Applied in this order; tokens are delimited by the space character only.
  token_patterns <- c(
    "[^ ]{1,}@[^ ]{1,}",   # text@text, e.g. e-mail addresses
    "@[^ ]{1,}",           # @mentions
    "#[^ ]{1,}",           # hashtags
    "[^ ]{1,}://[^ ]{1,}"  # scheme://rest, e.g. URLs
  )
  cleaned <- x
  for (token_pattern in token_patterns) {
    cleaned <- gsub(token_pattern, " ", cleaned)
  }
  cleaned
}

# Replace everything except lower-case letters and in-word apostrophes
# with spaces.
#
# x: character vector of text (expected to be lower-cased already, since
#    upper-case letters are treated as symbols and blanked out).
# Returns x with symbols, digits and stray apostrophes replaced by spaces.
remove_symbols <- function(x) {
  # The original first substitution was a character class whose contents had
  # been lost to an encoding problem (a back-tick followed by "?" placeholders).
  # Every character it could match is also matched by the class below, so it
  # has been dropped without changing the result.
  x <- gsub("[^a-z']", " ", x)
  # Apostrophes are kept only inside a word: drop runs of two or more, and
  # any apostrophe that touches a space or the start/end of the string.
  x <- gsub("'{2,}", " ", x)
  x <- gsub("' ", " ", x)
  x <- gsub(" '", " ", x)
  x <- gsub("^'", " ", x)
  x <- gsub("'$", " ", x)
  x
}

Download and upload the data

This step downloads the data from the source and opens it in R. The download is commented out so that users who already have the data can skip it. Note that the working directory must be set so that the relative file paths used below resolve correctly.

# Source archive for the Coursera-SwiftKey corpus.
fileurl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
# Left commented out so that users who already have the data can skip the
# download; uncomment the next two lines to fetch and unpack the archive.
# download.file(fileurl, destfile ="Coursera-SwiftKey.zip")
# unzip("Coursera-SwiftKey.zip")

# The files are read below through paths relative to the project root
# ("./final/en_US/..."). Changing the working directory to ./final/en_US here
# would make those paths resolve to ./final/en_US/final/en_US/..., so the
# working directory is left alone and the data directory is checked instead.
if (!dir.exists("./final/en_US")) {
  stop(
    "Data directory './final/en_US' not found: download and unzip the ",
    "archive, or run this script from the directory that contains 'final'.",
    call. = FALSE
  )
}

Basic summary

In this section, the file size, number of lines and number of words are calculated for each file.

# Read all lines of one corpus file.
#
# path: path to a text file.
# Returns a character vector with one element per line.
#
# The connection is opened in binary mode because a text-mode read can stop at
# an embedded control character (on Windows, Ctrl-Z / 0x1A is treated as
# end-of-file), which silently truncates the file. skipNul = TRUE drops
# embedded nul bytes instead of cutting the affected lines short and warning.
read_corpus_lines <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, skipNul = TRUE)
}

# Build one row of the summary table.
#
# label: name shown in the "File" column.
# lines: character vector of text lines.
# Returns a one-row data frame. Note that "File Size" is the in-memory size of
# `lines` as reported by object.size(), not the size of the file on disk.
summarise_lines <- function(label, lines) {
  data.frame(
    "File" = label,
    "File Size" = format(object.size(lines), units = "Kb"),
    "Number of lines" = length(lines),
    "Number of words" = sum(stri_count_words(lines)),
    check.names = FALSE,
    stringsAsFactors = FALSE
  )
}

# Files are listed in the order in which they are combined.
corpusFiles <- c(
  Blog = "./final/en_US/en_US.blogs.txt",
  News = "./final/en_US/en_US.news.txt",
  Twitter = "./final/en_US/en_US.twitter.txt"
)

corpusLines <- lapply(corpusFiles, read_corpus_lines)
perFileSummary <- Map(summarise_lines, names(corpusLines), corpusLines)

# Combined corpus (blogs, then news, then twitter), used by the later steps.
linesInFile <- unlist(corpusLines, use.names = FALSE)
rm(corpusLines)  # the per-file copies are no longer needed; free the memory

# Overall
fileSize <- format(object.size(linesInFile), units = "Kb")
fileNLine <- length(linesInFile)  # also used later to draw the random sample
fileWords <- sum(stri_count_words(linesInFile))
totalSummary <- data.frame(
  "File" = "Total",
  "File Size" = fileSize,
  "Number of lines" = fileNLine,
  "Number of words" = fileWords,
  check.names = FALSE,
  stringsAsFactors = FALSE
)

summaryTable <- do.call(rbind, c(unname(perFileSummary), list(totalSummary)))
rownames(summaryTable) <- NULL
# NOTE: the rendered table shown below was produced by a text-mode read; the
# News and Total rows will change once the document is re-knitted.
print(summaryTable)
##      File   File Size Number of lines Number of words
## 1    Blog   261483 Kb          899288        37546250
## 2    News  20243.6 Kb           77259         2674536
## 3 Twitter 326645.2 Kb         2360148        30093372
## 4   Total 608286.8 Kb         3336695        70314158

Basic transformation

Once the three files are combined into one, we run some basic transformations such as removing internet characters and symbols, converting all words to lower case, etc. To save memory and time, we work on a random sample that represents about 1% of the overall data.

# Fixed seed so that the random sample is reproducible.
set.seed(12345)
# Keep each line independently with probability 0.01 (roughly a 1% sample).
trainingData <- linesInFile[rbinom(fileNLine, 1, 0.01) == 1]
corpusFeeds <- VCorpus(VectorSource(trainingData))

# Order matters:
#  * remove_chars must run while "@", "#" and "://" are still present;
#    once punctuation has been stripped its patterns can no longer match and
#    the remains of URLs, handles and hashtags would stay in the text.
#  * stop words are removed before punctuation so that entries containing an
#    apostrophe can still match.
corpusFeeds <- tm_map(corpusFeeds, content_transformer(tolower))  # put in lower char
corpusFeeds <- tm_map(corpusFeeds, content_transformer(remove_chars)) # remove internet chars
corpusFeeds <- tm_map(corpusFeeds, removeWords, stopwords("english")) # remove English stop words
corpusFeeds <- tm_map(corpusFeeds, removePunctuation) # remove punctuation
corpusFeeds <- tm_map(corpusFeeds, content_transformer(remove_symbols)) #remove symbols
corpusFeeds <- tm_map(corpusFeeds, stripWhitespace) # remove extra spaces

Remove profanities

The next step is to remove the profanities listed at http://www.bannedwordlist.com.

filerul2 <- "http://www.bannedwordlist.com/lists/swearWords.txt"
# Only fetch the list when there is no local copy, so the analysis can be
# re-run without network access.
if (!file.exists("badwords.txt")) {
  download.file(filerul2, destfile = "badwords.txt")
}
# warn = FALSE: the list has no trailing newline, which is harmless here.
badwords <- readLines("badwords.txt", warn = FALSE)
# The corpus has been lower-cased, so normalise the list the same way and
# drop blank entries.
badwords <- tolower(trimws(badwords))
badwords <- unique(badwords[nzchar(badwords)])
# removeWords expects a plain character vector of words; wrapping the list in
# VectorSource() passes it a source object instead.
profanity <- badwords
if (length(profanity) > 0) {
  corpusFeeds <- tm_map(corpusFeeds, removeWords, profanity)
}

Word cloud and histogram

I chose two ways to present interesting findings. The first is a word cloud, which shows which words are used most frequently. The second is a set of n-gram histograms, which show which combinations of words are used most frequently.

corpusFeeds <- tm_map(corpusFeeds, PlainTextDocument)


# generate N-grams and plot histograms of top 10 occurring N-Grams.
# Extract the text of every document as exactly one string. vapply() enforces
# a character result of the same length as the corpus, whereas sapply() could
# silently return a list or matrix for unexpected input.
corpusText <- vapply(
  corpusFeeds,
  function(doc) paste(as.character(doc), collapse = " "),
  character(1),
  USE.NAMES = FALSE
)
dfForNGrams <- data.frame(text = corpusText, stringsAsFactors = FALSE)

# unigram
# Tokenise the text column rather than the data frame: the tokenizer coerces
# its input with as.character(), which turns a whole data frame into a single
# deparsed 'c("...", "...")' string and so adds spurious tokens.
uniGramToken <- NGramTokenizer(dfForNGrams$text, Weka_control(min = 1, max = 1))
# Frequency table of the tokens, most frequent first.
unigram <- data.frame(table(uniGramToken))
colnames(unigram) <- c("Word", "Frequency")
unigram <- arrange(unigram, desc(Frequency))
# Word cloud of the 100 most frequent words.
wordcloud(unigram$Word,
          unigram$Frequency,
          max.words=100, min.freq = 1,
          random.order=FALSE,
          colors=brewer.pal(8,"Dark2"), font = 3)

# bigram
# Tokenise the text column rather than the data frame: the tokenizer coerces
# its input with as.character(), which turns a whole data frame into a single
# deparsed string, so n-grams would span unrelated documents.
biGramToken <- NGramTokenizer(dfForNGrams$text, Weka_control(min = 2, max = 2))
# Frequency table of the bigrams, most frequent first.
bigram <- data.frame(table(biGramToken))
colnames(bigram) <- c("Word", "Frequency")
bigram <- arrange(bigram, desc(Frequency))
# head() returns at most 10 rows; indexing with 1:10 would pad with NA rows
# when fewer than 10 distinct bigrams exist.
subBigram <- head(bigram, 10)
# Horizontal bar chart, bars ordered by frequency and labelled with counts.
plotBiGram <- ggplot(subBigram, aes(x = reorder(Word, Frequency), y = Frequency)) +
                geom_bar(stat = "identity", fill = "red") +
                geom_text(aes(y = Frequency, label = Frequency), vjust = 1) +
                coord_flip() + labs(x = "Word", y = "Frequency", title = "Bigrams frequency")

# trigram
# Tokenise the text column rather than the data frame: the tokenizer coerces
# its input with as.character(), which turns a whole data frame into a single
# deparsed string, so n-grams would span unrelated documents.
triGramToken <- NGramTokenizer(dfForNGrams$text, Weka_control(min = 3, max = 3))
# Frequency table of the trigrams, most frequent first.
trigram <- data.frame(table(triGramToken))
colnames(trigram) <- c("Word", "Frequency")
trigram <- arrange(trigram, desc(Frequency))
# head() returns at most 10 rows; indexing with 1:10 would pad with NA rows
# when fewer than 10 distinct trigrams exist.
subTrigram <- head(trigram, 10)
# Horizontal bar chart, bars ordered by frequency and labelled with counts.
plotTriGram <- ggplot(subTrigram, aes(x = reorder(Word, Frequency), y = Frequency)) +
                geom_bar(stat = "identity", fill = "green") +
                geom_text(aes(y = Frequency, label = Frequency), vjust = 1) +
                coord_flip() + labs(x = "Word", y = "Frequency", title = "Trigrams frequency")

# Show the bigram and trigram charts side by side.
grid.arrange(plotBiGram, plotTriGram, ncol=2)