Introduction

This is the Milestone Report (an exploratory analysis of the data set) for the Data Science Specialization SwiftKey Capstone.
The goal of this report is to show that I have become familiar with the data and that I am on track to create my prediction algorithm.
The motivation for this project is to:
1. Demonstrate that you’ve downloaded the data and have successfully loaded it in.
2. Create a basic report of summary statistics about the data sets.
3. Report any interesting findings that you amassed so far.
4. Get feedback on your plans for creating a prediction algorithm and Shiny app.

Load required packages

library(tidytext)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(qdapRegex)
## 
## Attaching package: 'qdapRegex'
## The following object is masked from 'package:ggplot2':
## 
##     %+%
## The following object is masked from 'package:dplyr':
## 
##     explain
library(ngram)
library(RWeka)

Download and load the files

# Download the zip file (if not already present) and extract it
f <- "Coursera-SwiftKey.zip"
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!file.exists(f)) {
    download.file(url, destfile = f, method = "curl")
}
if (!dir.exists("Coursera-SwiftKey")) {
    unzip(f, exdir = "Coursera-SwiftKey")
}

blogs_file<-"Coursera-SwiftKey/final/en_US/en_US.blogs.txt"
twitter_file<-"Coursera-SwiftKey/final/en_US/en_US.twitter.txt"
news_file<-"Coursera-SwiftKey/final/en_US/en_US.news.txt"

blogs <- readLines(blogs_file, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines(twitter_file, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
news <- readLines(news_file, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)

Exploratory data analysis

Basic summaries of the three files (line counts and word counts).

line_news<-length(news)
line_twitter<-length(twitter)
line_blogs<-length(blogs)

wc_news<-wordcount(news)
wc_twitter<-wordcount(twitter)
wc_blogs<-wordcount(blogs)

lines <- rbind(line_news, line_twitter, line_blogs)
words <- rbind(wc_news, wc_twitter, wc_blogs)
summary_df <- as.data.frame(cbind(lines, words))
names(summary_df) <- c("number of lines", "number of words")
rownames(summary_df) <- c("news", "twitter", "blogs")
summary_df
##         number of lines number of words
## news              77259         2643969
## twitter         2360148        30373583
## blogs            899288        37334131

Taking a sample of the data: 1% of each dataset for exploratory analysis.

set.seed(123)
twitter_sample  <- sample(twitter, length(twitter) * 0.01, replace = FALSE)
blogs_sample    <- sample(blogs, length(blogs) * 0.01, replace = FALSE)
news_sample     <- sample(news, length(news) * 0.01, replace = FALSE)
data_sample <- c(twitter_sample, blogs_sample, news_sample)

Data Cleaning

# Remove lines containing unidentified (non-ASCII) characters
NotKnown <- grep("NotKnown", iconv(data_sample, "latin1", "ASCII", sub = "NotKnown"))
if (length(NotKnown) > 0) data_sample <- data_sample[-NotKnown]
# Clean up the text
data_sample <- gsub("&amp", "", data_sample)               # remove HTML ampersand remnants
data_sample <- gsub("RT :|@[a-z,A-Z]*: ", "", data_sample) # remove retweet markers
data_sample <- gsub("@\\w+", "", data_sample)              # remove Twitter mentions
data_sample <- gsub("[[:digit:]]", "", data_sample)        # remove digits
data_sample <- gsub(" #\\S*", "", data_sample)             # remove hashtags
data_sample <- gsub(" ?(f|ht)tp(s?)://(.*)[.][a-z]+", "", data_sample) # remove URLs
data_sample <- rm_white(data_sample) # remove extra spaces


data_sample_df <- tibble(line = 1:length(data_sample), text = data_sample)

Unigram

UnigramFreq <- data_sample_df %>%
        unnest_tokens(unigram, text, token = "ngrams", n = 1) %>%
        filter(!is.na(unigram)) %>%
        count(unigram, sort = TRUE)

ggplot(head(UnigramFreq,15), aes(reorder(unigram,n), n)) +   
        geom_bar(stat="identity") + coord_flip() + 
        xlab("Unigrams") + ylab("Frequency") +
        ggtitle("Most frequent unigrams")

The most frequent unigrams shown here are largely stop words and carry little meaning on their own, so the stop words should be filtered out.

Filtering out stop words

Stop words are words in a stop list (also called a stoplist or negative dictionary) that are filtered out (i.e. stopped) before or after processing natural language data (text).
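
For reference, the stop word list used below is the stop_words data frame bundled with tidytext; a quick look at its size and contents (a minimal check) is:

# Inspect the stop word lexicon bundled with tidytext (columns: word, lexicon)
nrow(stop_words)
head(stop_words)
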
UnigramFreq <- data_sample_df %>%
        unnest_tokens(unigram, text, token = "ngrams", n = 1) %>%
        filter(!is.na(unigram)) %>%
        filter(!unigram %in% stop_words$word) %>% # filter out stop words
        count(unigram, sort = TRUE)

ggplot(head(UnigramFreq,15), aes(reorder(unigram,n), n)) +   
        geom_bar(stat="identity") + coord_flip() + 
        xlab("Unigrams") + ylab("Frequency") +
        ggtitle("Most frequent unigrams (stop word filter out)")

Bigram

BigramFreq <- data_sample_df %>%
    unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
    separate(bigram, c("word1", "word2"), sep = " ", 
             extra = "drop", fill = "right") %>%
    filter(!is.na(word1),
           !is.na(word2)) %>%
    unite(bigram, word1, word2, sep = " ") %>%
    count(bigram, sort = TRUE)

ggplot(head(BigramFreq,15), aes(reorder(bigram,n), n)) +   
    geom_bar(stat="identity") + coord_flip() + 
    xlab("Bigrams") + ylab("Frequency") +
    ggtitle("Most frequent bigrams")

BigramFreq <- data_sample_df %>%
    unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
    separate(bigram, c("word1", "word2"), sep = " ", 
             extra = "drop", fill = "right") %>%
    filter(!word1 %in% stop_words$word,
           !word2 %in% stop_words$word) %>%
    filter(!is.na(word1),
           !is.na(word2)) %>%
    unite(bigram, word1, word2, sep = " ") %>%
    count(bigram, sort = TRUE)

ggplot(head(BigramFreq,15), aes(reorder(bigram,n), n)) +   
    geom_bar(stat="identity") + coord_flip() + 
    xlab("Bigrams") + ylab("Frequency") +
    ggtitle("Most frequent bigrams (stop word filter out)")

Trigram

TrigramFreq <- data_sample_df %>%
    unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
    separate(trigram, c("word1", "word2", "word3"), sep = " ", 
             extra = "drop", fill = "right") %>%
    filter(!word1 == "NA",
           !word2 == "NA",
           !word3 == "NA") %>%
    unite(trigram, word1, word2, word3, sep = " ") %>%
    count(trigram, sort = TRUE)

ggplot(head(TrigramFreq,15), aes(reorder(trigram,n), n)) +   
    geom_bar(stat="identity") + coord_flip() + 
    xlab("Trigrams") + ylab("Frequency") +
    ggtitle("Most frequent trigrams")

TrigramFreq <- data_sample_df %>%
    unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
    separate(trigram, c("word1", "word2", "word3"), sep = " ", 
             extra = "drop", fill = "right") %>%
    filter(!word1 %in% stop_words$word,
           !word2 %in% stop_words$word,
           !word3 %in% stop_words$word) %>%
    filter(!word1 == "NA",
           !word2 == "NA",
           !word3 == "NA") %>%
    unite(trigram, word1, word2, word3, sep = " ") %>%
    count(trigram, sort = TRUE)

ggplot(head(TrigramFreq,15), aes(reorder(trigram,n), n)) +   
    geom_bar(stat="identity") + coord_flip() + 
    xlab("Trigrams") + ylab("Frequency") +
    ggtitle("Most frequent trigrams (stop word filter out)")

Interesting findings

The purpose of this project is to predict the next word based on the user's input. Although removing the stop words gives us more meaningful terms, the stop words represent a large proportion of real input text.
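
As a rough check of this claim, the share of all sampled tokens that are stop words can be computed from an unfiltered unigram count. This is a minimal sketch: UnigramAll is a new helper table built here, since the UnigramFreq object above was last created with the stop word filter applied.

# Share of all tokens in the sample that are stop words (sketch)
UnigramAll <- data_sample_df %>%
    unnest_tokens(unigram, text, token = "ngrams", n = 1) %>%
    filter(!is.na(unigram)) %>%
    count(unigram, sort = TRUE)
sum(UnigramAll$n[UnigramAll$unigram %in% stop_words$word]) / sum(UnigramAll$n)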

Plans for creating a prediction algorithm and Shiny app

Taking these findings into consideration, I will build n-gram tables both with and without stop words in the next phase of the project, so that the predictions feel more natural.
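
As a rough sketch of the direction I have in mind (not a final implementation), a simple backoff lookup over the n-gram tables built above could already return candidate next words. The function below is illustrative only: predict_next is a hypothetical helper, and it assumes UnigramFreq, BigramFreq and TrigramFreq were built without the stop word filter.

# Illustrative sketch of a simple backoff predictor over the n-gram tables above
# (assumes UnigramFreq, BigramFreq and TrigramFreq were built without the stop word filter)
predict_next <- function(input, k = 3) {
    words <- unlist(strsplit(tolower(input), "\\s+"))
    last2 <- paste(tail(words, 2), collapse = " ")
    last1 <- tail(words, 1)
    # first try trigrams whose first two words match the end of the input
    hits <- TrigramFreq %>%
        separate(trigram, c("w1", "w2", "w3"), sep = " ") %>%
        filter(paste(w1, w2) == last2) %>%
        arrange(desc(n)) %>%
        head(k)
    if (nrow(hits) > 0) return(hits$w3)
    # back off to bigrams whose first word matches the last input word
    hits <- BigramFreq %>%
        separate(bigram, c("w1", "w2"), sep = " ") %>%
        filter(w1 == last1) %>%
        arrange(desc(n)) %>%
        head(k)
    if (nrow(hits) > 0) return(hits$w2)
    # otherwise fall back to the most frequent unigrams
    head(UnigramFreq$unigram, k)
}

predict_next("thanks for the")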