Objectives

This milestone report covers the collection and manipulation of the data, including an exploratory analysis of word counts. A subset of the data is sampled and converted into a corpus, and examples of word search and contextual phrase search are included.

Load libraries and data

knitr::opts_chunk$set(echo = TRUE)

library(dplyr)
library(stringi)
library(tm)
library(slam)
library(ggplot2)
library(quanteda)
library(readtext)

The raw data was downloaded and extracted.

# Check for zip file and download if necessary
if (!file.exists("C:/Users/Gary Clarke/Downloads/Coursera-SwiftKey.zip")) {
    download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip", 
        destfile = "C:/Users/Gary Clarke/Downloads/Coursera-SwiftKey.zip")
}
# Check for the extracted data folder and unzip if necessary
# (the archive already contains the final/en_US folder structure)
if (!file.exists("C:/Users/Gary Clarke/Desktop/Coursera/data/Coursera-Swiftkey/final/en_US")) {
    unzip("C:/Users/Gary Clarke/Downloads/Coursera-SwiftKey.zip", 
        exdir = "C:/Users/Gary Clarke/Desktop/Coursera/data/Coursera-Swiftkey")
}

The data files were then loaded.

conn <- file("C:/Users/Gary Clarke/Desktop/Coursera/data/Coursera-Swiftkey/final/en_US/en_US.blogs.txt")
blogs <- readLines(conn, encoding = "UTF-8")
close(conn)

conn <- file("C:/Users/Gary Clarke/Desktop/Coursera/data/Coursera-Swiftkey/final/en_US/en_US.news.txt")
news <- readLines(conn, encoding = "UTF-8")
close(conn)

conn <- file("C:/Users/Gary Clarke/Desktop/Coursera/data/Coursera-Swiftkey/final/en_US/en_US.twitter.txt")
tweets <- readLines(conn, encoding = "UTF-8")
close(conn)

rm(conn)

Exploratory data analysis

The words per line were computed and used to graph the results. The distributions for all three sources are skewed to the right, suggesting mostly short sentences and messages.

# words per line
WPL<-lapply(list(blogs,news,tweets),function(x) stri_count_words(x))

# Compute info for each data type
rawStats<-data.frame(
            File=c("blogs","news","twitter"), 
            t(rbind(sapply(list(blogs,news,tweets),stri_stats_general),
                    TotalWords=sapply(list(blogs,news,tweets),stri_stats_latex)[4,])),
            # Compute words per line summary
            WPL_computed=rbind(summary(WPL[[1]]),summary(WPL[[2]]),summary(WPL[[3]]))
            )
print(rawStats)
##      File   Lines LinesNEmpty     Chars CharsNWhite TotalWords
## 1   blogs  899288      899288 206824382   170389539   37570839
## 2    news   77259       77259  15639408    13072698    2651432
## 3 twitter 2360148     2360148 162096031   134082634   30451128
##   WPL_computed.Min. WPL_computed.1st.Qu. WPL_computed.Median WPL_computed.Mean
## 1                 0                    9                  28          41.75107
## 2                 1                   19                  32          34.61779
## 3                 1                    7                  12          12.75063
##   WPL_computed.3rd.Qu. WPL_computed.Max.
## 1                   60              6726
## 2                   46              1123
## 3                   18                47
qplot(WPL[[1]],geom="histogram",main="Histogram for US Blogs",
      xlab="Number of Words",ylab="Frequency",binwidth=10)

qplot(WPL[[2]],geom="histogram",main="Histogram for US News",
      xlab="Number of Words",ylab="Frequency",binwidth=10)

qplot(WPL[[3]],geom="histogram",main="Histogram for US Tweets",
      xlab="Number of Words",ylab="Frequency",binwidth=1)

rm(WPL);rm(rawStats)

Sampling raw data

samplesize <- 35000  # sample size
set.seed(3206)  # Ensure reproducibility 

# Create raw data and sample vectors
data <- list(blogs, news, tweets)
sample <- list()

# Iterate over each raw dataset to create a cleaned sample of each
for (i in 1:length(data)) {
    # Create sample dataset
    Filter <- sample(1:length(data[[i]]), samplesize, replace = FALSE)
    sample[[i]] <- data[[i]][Filter]
    # Clean unusual characters (iconv is vectorised over the whole sample)
    sample[[i]] <- iconv(sample[[i]], "latin1", "ASCII", sub = "")
}

rm(blogs)
rm(news)
rm(tweets)
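
The next section reads the combined sample back in from sample.txt, so the cleaned samples need to be written to that file first. The report does not show this step, so the following is a minimal sketch of the assumed write (the path matches the one read below).

conn <- file("C:/Users/Gary Clarke/Desktop/Coursera/data/Coursera-Swiftkey/final/en_US/sample.txt", open = "wt")
# Combine the three cleaned samples and write them as a single text file, one entry per line
writeLines(unlist(sample), conn)
close(conn)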

Create the Corpus

The corpus was created with the tm package and then converted into a quanteda corpus so that quanteda's functionality can be used to work with the data. The data was tokenised using the tokens() function; tokens are the building blocks of natural language processing and are used to prepare a vocabulary.

conn <- file("C:/Users/Gary Clarke/Desktop/Coursera/data/Coursera-Swiftkey/final/en_US/sample.txt")
text <- readLines(conn, encoding = "UTF-8")
## Warning in readLines(conn, encoding = "UTF-8"): incomplete final line found
## on 'C:/Users/Gary Clarke/Desktop/Coursera/data/Coursera-Swiftkey/final/en_US/
## sample.txt'
close(conn)

docs <- Corpus(VectorSource(text))
docs <- tm_map(docs, stripWhitespace)
## Warning in tm_map.SimpleCorpus(docs, stripWhitespace): transformation drops
## documents
docs <- tm_map(docs, stemDocument)
## Warning in tm_map.SimpleCorpus(docs, stemDocument): transformation drops
## documents
docs <- tm_map(docs, removeNumbers)
## Warning in tm_map.SimpleCorpus(docs, removeNumbers): transformation drops
## documents
corp_quanteda <- corpus(docs)
token_docs <- tokens(corp_quanteda, remove_punct = TRUE)
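
As a quick illustration of the word search and contextual phrase search mentioned in the objectives, quanteda's kwic() can be applied to the tokens; the search terms below are arbitrary examples rather than ones taken from the original analysis.

# Keyword-in-context search for a single word (example search term)
head(kwic(token_docs, pattern = "coffee", window = 3))

# Contextual phrase search for a multi-word pattern (example phrase)
head(kwic(token_docs, pattern = phrase("thank you"), window = 3))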

Conclusion

The plan going forward is to use quanteda to create n-grams, summarise the frequency and usage of the words (which are now tokens), and build a predictive model. The model will then be used in a Shiny app to recommend a word based on the words the user inputs.
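
A minimal sketch of how that next step might look with quanteda is shown below; the n-gram sizes and the number of top features are placeholder choices.

# Build bigrams and trigrams from the existing tokens
bigrams <- tokens_ngrams(token_docs, n = 2)
trigrams <- tokens_ngrams(token_docs, n = 3)

# Summarise the most frequent n-grams via a document-feature matrix
top_bigrams <- topfeatures(dfm(bigrams), 20)
top_trigrams <- topfeatures(dfm(trigrams), 20)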