Introduction

The motivation for this project is to:

  1. Demonstrate that we’ve downloaded the data and have successfully loaded it in.
  2. Create a basic report of summary statistics about the data sets.
  3. Report some interesting findings that we have gathered so far.
  4. Get feedback on our plans for creating a prediction algorithm and Shiny app.

Getting and Loading Data

# download the data if it does not already exist
fileUrl <- "http://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!file.exists("./Coursera-SwiftKey.zip")){
        download.file(fileUrl, destfile = "./Coursera-SwiftKey.zip", method = "curl")
}

# unzip the file if the data folder does not already exist
if(!dir.exists("final")){
   unzip("./Coursera-SwiftKey.zip")
}

# load the three data sets and collect basic counts
paths <- c("./final/en_US/en_US.blogs.txt", "./final/en_US/en_US.news.txt", "./final/en_US/en_US.twitter.txt")
## approximate the number of words in a line by counting the blank characters between them
count_words <- function(str){
        # gregexpr() returns -1 for lines with no space, so treat those as one word
        spaces <- gregexpr(" ", str)
        vapply(spaces, function(m) if (m[1L] == -1L) 1L else length(m) + 1L, integer(1L))
}

ds <- list(blogs = "", news = "", twitter = "")
df_summary <- data.frame(lines.count = integer(), words.count = integer())
                 
for (i in seq_along(paths)) {
  ds[[i]] <- readLines(paths[i], skipNul = TRUE)
  df_summary[nrow(df_summary) + 1, ] <- list(length(ds[[i]]), sum(count_words(ds[[i]])))
}
row.names(df_summary) <- c("blogs", "news", "twitter")
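
For a quick look at the result, the summary table can also be printed directly. The line below is a minimal illustration using knitr::kable (assuming the knitr package is installed); print(df_summary) works just as well.

knitr::kable(df_summary, col.names = c("Lines", "Words"))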

Exploratory Analysis of the Course Data Set

library("ggplot2")
library("gridExtra")

gbar <- function(y_aes, y_lab){
        ggplot(df_summary, aes(row.names(df_summary), y_aes, fill = row.names(df_summary))) +
        geom_bar(stat="identity") + 
        labs(x = "Data sets", y = y_lab) +
        theme(legend.position = "none")
}

g1 <- gbar(df_summary$lines.count, "Number of lines")
g2 <- gbar(df_summary$words.count, "Number of words")

grid.arrange(g1, g2, nrow = 1, top = "Exploratory Analysis of the Course Data Set")

Creating A Basic Report of Summary Statistics About the Data Sets

In this section, we sample and clean the data, then examine the most common words and n-grams to highlight some interesting findings.

Taking samples of the data

The full data sets are too large to process efficiently, so we take a 1% sample of each as training data.

p <- 0.01
set.seed(100)

s_blogs <- sample(ds[[1]], round(length(ds[[1]]) * p))
s_news <- sample(ds[[2]], round(length(ds[[2]]) * p))
s_twitter <- sample(ds[[3]], round(length(ds[[3]]) * p))

# combine sample data sets
sample_ds <- c(s_blogs, s_news, s_twitter)

Data cleaning

Before analyzing the data in more depth, we perform some basic cleaning.

# convert all characters to lowercase
sample_ds <- tolower(sample_ds)
# remove URLs
sample_ds <- gsub(" ?(f|ht)tp(s?)://(.*)[.][a-z]+", "", sample_ds)
# remove email addresses
sample_ds <- gsub("\\S+@\\S+", "", sample_ds)
# remove punctuation and symbols, keeping letters, digits, apostrophes, and hyphens
sample_ds <- gsub("[^a-zA-Z0-9\\s'-]+", "", sample_ds, perl = TRUE)

Tokenizing by n-gram

We now present the most commonly used words and n-grams in the course data sets.

library(dplyr)
library(tidytext)

sample_df <- tibble(txt = sample_ds)

# tokenize the sample into n-grams and count how often each occurs
gram_n <- function(num){
  sample_df %>%
    unnest_tokens(word, txt, token = "ngrams", n = num) %>%
    count(word, sort = TRUE) %>%
    filter(!is.na(word))
}

gram_1 <- gram_n(1)
gram_2 <- gram_n(2)
gram_3 <- gram_n(3)

# bar chart of the ten most frequent n-grams in a frequency table
gbar2 <- function(df, x_labs){
  ggplot(head(df, 10), aes(word, n, fill = word)) + 
        geom_bar(stat="identity") +
        labs(x = x_labs, y = "Frequency", fill = x_labs) +
        theme(legend.position = "none",
              axis.text.x = element_text(angle = 45, hjust = 1)
              )
}

gb1 <- gbar2(gram_1, "Unigrams")
gb2 <- gbar2(gram_2, "Bigrams")
gb3 <- gbar2(gram_3, "Trigrams")

grid.arrange(gb1, gb2, gb3, nrow = 1, top = "Top ten most common n-grams")

Filtering “stop-words”

As the plots show, many of the most common unigrams, bigrams, and trigrams are built from uninteresting filler words such as “the”, “and”, “to”, and “of”, commonly called “stop-words”. We remove them below and show the new results.

library(tidyr)
data("stop_words")
gram_1_filtered <- gram_1 %>%
              filter(!word %in% stop_words$word)

gram_2_filtered <- gram_2 %>%
              separate(word, into = c("word1", "word2"), sep = " ") %>%
              filter(!word1 %in% stop_words$word) %>%
              filter(!word2 %in% stop_words$word) %>%
              unite(word, c(word1, word2), sep = " ")
gram_3_filtered <- gram_3 %>%
              separate(word, into = c("word1", "word2", "word3"), sep = " ") %>%
              filter(!word1 %in% stop_words$word) %>%
              filter(!word2 %in% stop_words$word) %>%
              filter(!word3 %in% stop_words$word) %>%
              unite(word, c(word1, word2, word3), sep = " ")

ngb1_filtered <- gbar2(gram_1_filtered, "Unigrams")
ngb2_filtered <- gbar2(gram_2_filtered, "Bigrams")
ngb3_filtered <- gbar2(gram_3_filtered, "Trigrams")

grid.arrange(ngb1_filtered, ngb2_filtered, ngb3_filtered, nrow = 1, top = "Top ten most common n-grams excluding stop-words")

Summary and Outlook

  1. We have counted the lines and words in the course data sets.
  2. Because the full data sets are very large, we took a 1% sample of each for the analysis.
  3. We have shown the top ten most common unigrams, bigrams, and trigrams (both including and excluding stop-words) in the sample data.
  4. The n-gram frequencies obtained from the sample, together with machine learning techniques, will be used to build a prediction algorithm and Shiny app; a first sketch of such a predictor is shown below.
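
As a first illustration of how these n-gram frequencies could drive next-word prediction, the sketch below implements a simple backoff lookup over the gram_2 and gram_3 tables built above. The predict_next() helper, its column names, and the fixed trigram-to-bigram-to-unigram backoff order are our own illustrative assumptions, not the final algorithm.

library(dplyr)
library(tidyr)

# split the n-gram tables once into context words plus the predicted word
tri <- gram_3 %>% separate(word, into = c("w1", "w2", "w3"), sep = " ")
bi  <- gram_2 %>% separate(word, into = c("w1", "w2"), sep = " ")

# predict the next word from the last one or two words of the input,
# falling back from trigrams to bigrams to the most frequent unigrams
predict_next <- function(phrase, top = 3){
  w <- unlist(strsplit(tolower(phrase), "\\s+"))
  len <- length(w)
  if (len >= 2) {
    hits <- tri %>% filter(w1 == w[len - 1], w2 == w[len])
    if (nrow(hits) > 0) return(head(hits$w3, top))
  }
  hits <- bi %>% filter(w1 == w[len])
  if (nrow(hits) > 0) return(head(hits$w2, top))
  head(gram_1$word, top)
}

predict_next("thanks for the")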