The goal of this project is just to display that you’ve gotten used to working with the data and that you are on track to create your prediction algorithm. Please submit a report on R Pubs (http://rpubs.com/) that explains your exploratory analysis and your goals for the eventual app and algorithm. This document should be concise and explain only the major features of the data you have identified and briefly summarize your plans for creating the prediction algorithm and Shiny app in a way that would be understandable to a non-data scientist manager. You should make use of tables and plots to illustrate important summaries of the data set.

The motivation for this project is to:

Demonstrate that you’ve downloaded the data and have successfully loaded it in.

The first step is to download the file from the Coursera website. Since it is delivered as a zip archive, we also have to unzip it.

loadData <- function () {
  # Download the archive only if it is not already present
  if (!file.exists("./data/Coursera-SwiftKey.zip")) {
    dataUrl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
    download.file(dataUrl, destfile = "./data/Coursera-SwiftKey.zip", method = "curl")
  }
  
  # Extract the archive into ./data (creates the ./data/final folder)
  unzip("./data/Coursera-SwiftKey.zip", exdir = "./data")
}

if (!file.exists("./data")) {
  dir.create("./data")
}

if (!file.exists("./data/final")) {
  loadData()
}

Then, check its structure:

list.files("./data/final")
## [1] "de_DE" "en_US" "fi_FI" "ru_RU"

There are four folders after unzipping. Continue checking their structure:

list.files("./data/final/en_US")
## [1] "en_US.blogs.txt"   "en_US.news.txt"    "en_US.twitter.txt"

The en_US folder contains three files; the de_DE, fi_FI, and ru_RU folders follow the same structure.

Create a basic report of summary statistics about the data sets.

Since I do not know German, Finnish, or Russian, I will analyze the English data. First we load the three files in the en_US folder.

readFiles <- function () {
  files <- list()
  paths <- c(
    "blogs"   = "./data/final/en_US/en_US.blogs.txt",
    "news"    = "./data/final/en_US/en_US.news.txt",
    "twitter" = "./data/final/en_US/en_US.twitter.txt"
  )
  
  # Read each file as a character vector of lines, skipping embedded NULs
  for (name in names(paths)) {
    files[[name]] <- readLines(paths[name], encoding = "UTF-8", skipNul = TRUE)
  }
  
  return(files)
}

if (!exists("files")) {
  files <- readFiles()
}

Then, calculate the number of lines, characters, and words in each file using the stringi package.

library(stringi)

getFileStats <- function () {
  files_stat <- NULL
  
  for (name in names(files)) {
    file <- files[[name]]
    # Line and character counts from stringi, plus a total word count per file
    stats <- stri_stats_general(file)
    data <- data.frame(t(stats), row.names = name)
    data$Words <- sum(stri_count_words(file))

    # Append this file's row to the summary table
    if (is.null(files_stat)) {
      files_stat <- data
    } else {
      files_stat <- rbind(files_stat, data)
    }
  }
  
  return(files_stat)
}

files_stat <- getFileStats()
files_stat
##           Lines LinesNEmpty     Chars CharsNWhite    Words
## blogs    899288      899288 206824382   170389539 37541795
## news    1010242     1010242 203223154   169860866 34762303
## twitter 2360148     2360148 162096241   134082806 30092907

Using the statistics obtained, plot the line counts and word counts for each source.

barplot(files_stat[, "Lines"], names.arg = row.names(files_stat), main = "Line counts")

And the word counts:

barplot(files_stat[, "Words"], names.arg = row.names(files_stat), main = "Word counts")

These two plots show that the line counts of blogs and news are similar, while twitter has significantly more lines. However, twitter contains fewer words in total than either blogs or news, because individual tweets are short.
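A quick ratio of the two columns in files_stat makes the difference in line length explicit (values computed from the table above):

# Average words per line for each source, derived from files_stat above
round(files_stat$Words / files_stat$Lines, 1)
## [1] 41.7 34.4 12.8

So a tweet averages about 13 words, versus roughly 42 for a blog line and 34 for a news line.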

Report any interesting findings that you amassed so far.

Now we do an N-gram analysis for N = 1, 2, 3, and 4. The sample consists of 10,000 lines each from news and blogs and 50,000 lines from twitter, since twitter has more lines than the other two sources.

library("tm")
## Loading required package: NLP
set.seed(123)

createSample <- function (blogs, news, twitter) {
  sizes <- c(blogs = blogs, news = news, twitter = twitter)
  samples <- c()

  # Draw the requested number of random lines from each source
  for (name in names(files)) {
    samples <- c(samples, sample(files[[name]], sizes[name]))
  }
  
  return(samples)
}

getCorpus <- function (data) {
  # Build a corpus and apply basic cleaning: lower-case, strip punctuation and numbers.
  # content_transformer() keeps the documents valid for tm's transformations.
  corpus <- Corpus(VectorSource(data))
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)

  return(corpus)
}

getSampleCorpus <- function (blogs = 10000, news = 10000, twitter = 50000, cache = TRUE) {
  # Reuse a cached corpus if one exists; otherwise sample, clean, and cache it
  cacheFile <- "./data/sample_corpus.RDS"
  if (cache && file.exists(cacheFile)) {
    corpus <- readRDS(cacheFile)
  } else {
    samples <- createSample(blogs, news, twitter)
    corpus <- getCorpus(samples)
    saveRDS(corpus, file = cacheFile)
  }

  return(corpus)
}

corpus <- getSampleCorpus(cache = TRUE)
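As a quick sanity check (assuming the corpus was built with the default sample sizes rather than loaded from an older cache), it should contain one document per sampled line:

length(corpus)  # expected to be 70000 (10,000 + 10,000 + 50,000) with the defaults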

Calculate and plot the top 15 tokens for 1-grams through 4-grams.

library("RWeka")

printNGram <- function (corpus, n = 1, topN = 15, delim = " \\r\\n\\t.!?,;\"()") {
  label <- paste('Top ', topN, ' ', n, '-grams', sep = '')
  # Tokenize the corpus text into n-grams and keep the topN most frequent
  token <- NGramTokenizer(corpus$content, Weka_control(min = n, max = n, delimiters = delim))
  top <- as.data.frame(table(token))
  top <- head(top[order(-top$Freq), ], topN)
  
  # Horizontal bar plot with the most frequent n-gram at the top
  par(mar = c(5, 8, 2, 1))
  barplot(rev(top$Freq), names.arg = rev(top$token), main = label, xlab = "Frequency", horiz = TRUE, las = 1, cex.names = 0.9)
  
  return(top)
}

printNGrams <- function (corpus, num = 4, topN = 15) {
  for (n in 1:num) { 
    printNGram(corpus, n, topN)
  }
}

printNGrams(corpus)
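printNGrams() only plots the results. For the prediction step it will be more useful to keep the frequency tables themselves; a simple way to do that with the existing function (in practice with a much larger topN, or the full table, rather than 15 entries) would be:

# Keep the 1- to 4-gram frequency tables returned by printNGram for later reuse
ngram_tables <- lapply(1:4, function (n) printNGram(corpus, n))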

Get feedback on your plans for creating a prediction algorithm and Shiny app.

The plan is to use the observed n-grams to build a next-word prediction model (possibly a Katz back-off model, so that unseen longer contexts fall back to shorter ones), and then to wrap that model in a Shiny app that suggests the next word as the user types.
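As a rough illustration of the intended approach (not yet Katz back-off, since it applies no discounting), the sketch below assumes frequency tables shaped like the ones produced above, i.e. data frames with a token column holding the n-gram as a space-separated string and a Freq column:

# Minimal back-off lookup: try the longest n-gram table whose context matches
# the end of the phrase, then fall back to shorter ones. `ngrams` is assumed
# to be a list of frequency tables ordered from 1-grams to higher-order n-grams.
predictNext <- function (phrase, ngrams) {
  words <- unlist(strsplit(tolower(phrase), "\\s+"))

  for (n in rev(seq_along(ngrams))) {
    if (length(words) < n - 1) next  # not enough context for this table

    tab <- ngrams[[n]]
    if (n == 1) {
      hits <- tab  # no context left: fall back to the most frequent single word
    } else {
      context <- paste(tail(words, n - 1), collapse = " ")
      hits <- tab[startsWith(as.character(tab$token), paste0(context, " ")), ]
    }

    if (nrow(hits) > 0) {
      # Return the last word of the most frequent matching n-gram
      best <- as.character(hits$token[which.max(hits$Freq)])
      return(tail(unlist(strsplit(best, " ")), 1))
    }
  }

  return(NA_character_)
}

# Hypothetical usage with the tables kept above:
# predictNext("one of the", ngram_tables)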