Loading data

Read the data from the files. The news file is read through a binary connection to circumvent an incomplete read; in plain text mode only 77259 of its 1010242 lines are read.

blogs <- readLines("Coursera-SwiftKey/final/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
conNews <- file("Coursera-SwiftKey/final/en_US/en_US.news.txt", open="rb")
news <- readLines(conNews, encoding = "UTF-8", skipNul = TRUE)
close(conNews)
twitter <- readLines("Coursera-SwiftKey/final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
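
As a quick sanity check (illustrative, not part of the original pipeline), the number of lines read through the binary connection can be compared against the truncated count:

# length(news) should equal 1010242, the full line count reported in the next section,
# rather than the 77259 lines returned by a plain text-mode readLines()
length(news)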

Descriptive analysis

Next we compute some descriptive statistics for our three data sources to get a sense of what they contain: the number of lines (nlines), the number of characters (ncharacter), the number of words (nwords), and the minimum, mean and maximum number of words per line (wpl_min, wpl_mean, wpl_max).

DescriptiveAnalysis <- function(data){
  # Perform descriptive analysis of the data
  nlines  <- length(data)
  ncharacter <- sum(nchar(data))
  # Count the number of words on each line (stri_count_words from the stringi package)
  wpl <- stri_count_words(data)
  # Compute the total, minimum, mean and maximum words per line
  nwords <- sum(wpl)
  wpl_min <- min(wpl)
  wpl_mean <- mean(wpl)
  wpl_max <- max(wpl)
  
  c(nlines, nwords, ncharacter, wpl_min, wpl_mean, wpl_max)
}

x1 <- c('blogs', DescriptiveAnalysis(blogs))
x2 <- c('news', DescriptiveAnalysis(news))
x3 <- c('twitter', DescriptiveAnalysis(twitter))

DescriptiveAnalysisData <- data.frame(matrix(ncol = 7, nrow = 0))
DescriptiveAnalysisData <- rbind(DescriptiveAnalysisData, x1, x2, x3)
colnames(DescriptiveAnalysisData) <- c('Datafile', 'Lines', 'Words', 'Characters', 'wpl_min', 'wpl_mean', 'wpl_max')

DescriptiveAnalysisData
##   Datafile   Lines    Words Characters wpl_min         wpl_mean wpl_max
## 1    blogs  899288 37546246  206824505       0 41.7510808550765    6726
## 2     news 1010242 34762395  203223159       1 34.4099681066517    1796
## 3  twitter 2360148 30093410  162096241       1 12.7506452985152      47

Note that the twitter set contains the most lines but has the lowest mean and maximum number of words per line, due to the character limit on twitter messages. Blog posts, by contrast, have both the highest mean and the highest maximum number of words per line.

Data cleaning and preparation

Next we clean the datasets by removing any non-ASCII characters.

blogs <- iconv(blogs, from="latin1", to="ASCII", sub="")
news <- iconv(news, from="latin1", to="ASCII", sub="")
twitter <- iconv(twitter, from="latin1", to="ASCII", sub="")

Next we create a \(2\%\) sample of the original datasets.

data <- c(sample(blogs, length(blogs)*0.02), sample(news, length(news)*0.02), sample(twitter, length(twitter)*0.02))
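
Note that sample() draws a random subset, so the exact sample differs between runs. For a reproducible sample, a seed could be set before the sampling step above; a minimal sketch (the seed value is arbitrary):

set.seed(1234)  # arbitrary seed; run before the sampling step to make the sample reproducible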

We use the \(tm\) package to create and clean the corpus: we strip multiple spaces, convert everything to lower case, and remove English stopwords, punctuation and numbers.

corpus <- VCorpus(VectorSource(data))
# Strip multiple whitespaces
corpus <- tm_map(corpus, stripWhitespace) 
# Convert all to lower cases
corpus <- tm_map(corpus, content_transformer(tolower)) 
# Remove stop words from dataset
corpus <- tm_map(corpus, removeWords, stopwords("english")) 
# Remove punctuation
corpus <- tm_map(corpus, removePunctuation) 
# Remove numbers
corpus <- tm_map(corpus, removeNumbers)

Next we define unigram, bigram and trigram tokenizers and use them to build term-document matrices, from which we then extract the most frequent terms.

UnigramTokenizer <- function(x) 
  unlist(lapply(ngrams(words(x), 1), paste, collapse = " "), use.names = FALSE)
BigramTokenizer <- function(x) 
  unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
TrigramTokenizer <- function(x) 
  unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE)

TDMUni <- TermDocumentMatrix(corpus, control=list(tokenize=UnigramTokenizer))
TDMBi <- TermDocumentMatrix(corpus, control=list(tokenize=BigramTokenizer))
TDMTri <- TermDocumentMatrix(corpus, control=list(tokenize=TrigramTokenizer))
freqUni <- findFreqTerms(TDMUni, lowfreq = 200)
freqBi <- findFreqTerms(TDMBi, lowfreq = 25)
freqTri <- findFreqTerms(TDMTri, lowfreq = 5)

FreqCorpusUni <- rowSums(as.matrix(TDMUni[freqUni,]))
FreqCorpusBi <- rowSums(as.matrix(TDMBi[freqBi,]))
FreqCorpusTri <- rowSums(as.matrix(TDMTri[freqTri,]))

FreqCorpusUni <- data.frame(word=names(FreqCorpusUni), frequency=FreqCorpusUni)
FreqCorpusBi <- data.frame(word=names(FreqCorpusBi), frequency=FreqCorpusBi)
FreqCorpusTri <- data.frame(word=names(FreqCorpusTri), frequency=FreqCorpusTri)
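
Before plotting, the frequency tables can be inspected directly. For example, the five most frequent unigrams in the sample (the exact output depends on the random sample drawn):

# Show the five most frequent unigrams in the sampled corpus (illustrative)
head(FreqCorpusUni[order(-FreqCorpusUni$frequency), ], 5)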

Plots

plotNGrams <- function(data, title, num) {
  df <- data[order(-data$frequency),][1:num,] 
  ggplot(df) +
    geom_bar(aes(x = reorder(word, -frequency), y = frequency), stat = "identity", fill = "red", colour = "black", width = 0.80) +
    coord_cartesian(xlim = c(0, num)) + labs(title = title) + xlab("Words") + ylab("Frequency") +
    theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))
}

plotNGrams(FreqCorpusUni,"Most occurring Unigrams",20)

plotNGrams(FreqCorpusBi,"Most occurring Bigrams",20)

plotNGrams(FreqCorpusTri,"Most occurring Trigrams",20)

Further plans

The next step is to construct a prediction algorithm based on frequency lookup. Using the n-gram frequencies computed above, the most likely next word will be predicted from the preceding words.
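
As a rough illustration of this idea (a sketch only, not the final algorithm; predictNextWord is a hypothetical helper), the bigram frequency table built above can already be used to look up the most frequent continuation of a single word:

# Hypothetical sketch: look up the next word in the bigram frequency table.
# Bigram terms are stored as "word1 word2"; keep those starting with the given
# word and return the second word of the most frequent match.
predictNextWord <- function(word, bigrams = FreqCorpusBi) {
  matches <- bigrams[grepl(paste0("^", word, " "), bigrams$word), ]
  if (nrow(matches) == 0) return(NA_character_)
  best <- as.character(matches$word[which.max(matches$frequency)])
  strsplit(best, " ")[[1]][2]
}

predictNextWord("new")  # might return "york", depending on the sample

A fuller version could extend this lookup to the trigram table and fall back to lower-order n-grams when no match is found.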