Purpose

The purposes of this report are as follows:

  1. Demonstrate that I have downloaded the data and have successfully loaded it in.

  2. Create a basic report of summary statistics about the data sets.
  3. Report any interesting findings that I have amassed so far.
  4. Describe my plans for creating a prediction algorithm and Shiny app.

Downloading the data

# Download and unpack the Coursera-SwiftKey data set.
#
# Both steps are skipped when their result is already on disk, so re-running
# the report does not fetch or extract the large archive again.
data_set_dir <- "../original_data_set"
data_set_zip <- file.path(data_set_dir, "Coursera-SwiftKey.zip")
data_set_url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"

if (!dir.exists(data_set_dir)) {
  dir.create(data_set_dir, recursive = TRUE)
}

# mode = "wb" writes the archive in binary mode so the zip is stored
# byte-for-byte regardless of platform.
if (!file.exists(data_set_zip)) {
  download.file(data_set_url, destfile = data_set_zip, mode = "wb")
}

# Unzip the data set. The archive's contents are read from "final/" later in
# this report, so the presence of that directory marks a completed extraction.
if (!dir.exists(file.path(data_set_dir, "final"))) {
  unzip(zipfile = data_set_zip, exdir = data_set_dir)
}

Attaching required packages

library(stringi)
library(tm)
library(dplyr)
library(SnowballC)
library(RWeka)
library(ggplot2)

Creating a basic report of summary statistics about the data sets

# Summary statistics about the data sets:
# word counts and line counts per source, collected in a basic data table.

# Source name -> path of the en_US text file. The names are reused for the
# elements of `raw_data` and for the rows of `basic_data_table`, so the three
# always stay in step.
deta_file_list <- c(
  blogs   = "../original_data_set/final/en_US/en_US.blogs.txt",
  news    = "../original_data_set/final/en_US/en_US.news.txt",
  twitter = "../original_data_set/final/en_US/en_US.twitter.txt"
)

# stringi provides stri_count_words(); kept here so this chunk also runs on
# its own.
library(stringi)

# Read every line of `path` as UTF-8 text and return a character vector.
# The file is opened in binary mode and embedded NUL bytes are skipped.
# on.exit() guarantees the connection is closed even if readLines() fails.
read_source_lines <- function(path) {
  con <- file(path, "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

# Named list (blogs, news, twitter) of character vectors, one element per line.
raw_data <- lapply(deta_file_list, read_source_lines)

# One row per source. Word counts are summed as doubles so that a very large
# corpus cannot overflow R's integer range and silently become NA.
basic_data_table <- cbind(
  "Word counts" = vapply(
    raw_data,
    function(lines) sum(as.numeric(stri_count_words(lines))),
    numeric(1)
  ),
  "line counts" = vapply(raw_data, length, numeric(1))
)

# Print Basic Data Table
basic_data_table
##         Word counts line counts
## blogs      37546246      899288
## news       34762395     1010242
## twitter    30093410     2360148

Reporting interesting findings

set.seed(123)

# Sample 1% of lines from each source file.
# The sample size is floored explicitly instead of relying on sample()
# truncating a fractional size.
sample_fraction <- 0.01
sample_lines <- function(lines, fraction = sample_fraction) {
  base::sample(lines, floor(fraction * length(lines)))
}

sample_blog <- sample_lines(raw_data$blogs)
sample_news <- sample_lines(raw_data$news)
sample_twitter <- sample_lines(raw_data$twitter)
sample <- c(sample_blog, sample_news, sample_twitter)

# Drop characters that have no ASCII equivalent. Without `sub`, iconv()
# returns NA for every line containing such a character, which would discard
# the whole line rather than just the offending character.
sample <- iconv(sample, "UTF-8", "ASCII", sub = "")

# Build a corpus with one document per sampled line. Base functions such as
# tolower() must be wrapped in content_transformer() so that tm_map() keeps
# the documents as text documents; no PlainTextDocument round trip is needed.
corpus <- VCorpus(VectorSource(sample))
corpus <- corpus %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(stemDocument) %>%
  tm_map(stripWhitespace)

# NGramTokenizer() takes a character vector, so extract the cleaned text of
# each document from the corpus before tokenising.
clean_text <- vapply(
  corpus,
  function(doc) paste(as.character(doc), collapse = " "),
  character(1),
  USE.NAMES = FALSE
)

# Count how often each token occurs.
#
# tokens:    character vector of n-grams.
# token_col: name given to the token column of the result.
# Returns a data frame with columns `token_col` and `Freq`, sorted from the
# most to the least frequent token.
count_tokens <- function(tokens, token_col) {
  freq <- data.frame(table(tokens))
  names(freq)[1] <- token_col
  freq[order(freq$Freq, decreasing = TRUE), ]
}

# Bar chart of the `top_n` most frequent tokens.
#
# freq:    two-column data frame (token, Freq) sorted by decreasing Freq.
# x_label: x-axis label.
# title:   plot title.
# Returns the ggplot object; it is drawn when printed at top level.
plot_top_tokens <- function(freq, x_label, title, top_n = 20) {
  # head() returns fewer rows when fewer exist, instead of padding with NA.
  top <- head(freq, top_n)
  names(top) <- c("token", "Freq")
  # reorder() arranges the bars by frequency rather than alphabetically.
  ggplot(top, aes(x = reorder(token, -Freq), y = Freq)) +
    geom_bar(stat = "identity") +
    xlab(x_label) + ylab("Frequency") +
    ggtitle(title) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
}

# Unigrams
tokenize_word_freq <- NGramTokenizer(
  clean_text, Weka_control(min = 1, max = 1)
)
dist_word_freq <- count_tokens(tokenize_word_freq, "tokenize_word_freq")

plot_top_tokens(dist_word_freq, "Unigrams", "Distributions of word frequencies")

# 2-grams
tokenize_2grams_freq <- NGramTokenizer(
  clean_text, Weka_control(min = 2, max = 2)
)
dist_2grams_freq <- count_tokens(tokenize_2grams_freq, "tokenize_2grams_freq")

plot_top_tokens(
  dist_2grams_freq, "2-grams", "Distributions of 2-grams frequencies"
)

# 3-grams
tokenize_3grams_freq <- NGramTokenizer(
  clean_text, Weka_control(min = 3, max = 3)
)
dist_3grams_freq <- count_tokens(tokenize_3grams_freq, "tokenize_3grams_freq")

plot_top_tokens(
  dist_3grams_freq, "3-grams", "Distributions of 3-grams frequencies"
)

Plans for creating a prediction algorithm and Shiny app

I’m planning to develop a Shiny application that predicts the next word/sentence from the user's text input, using n-gram model data.