This is a Milestone Report (an exploratory analysis of the data sets) for the Data Science Specialization SwiftKey Capstone.
The goal of this report is to show that I have become comfortable working with the data and that I am on track to build the prediction algorithm.
The motivation for this project is to:
1. Demonstrate that you've downloaded the data and have successfully loaded it in.
2. Create a basic report of summary statistics about the data sets.
3. Report any interesting findings that you amassed so far.
4. Get feedback on your plans for creating a prediction algorithm and Shiny app.
library(tidytext)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(qdapRegex)
##
## Attaching package: 'qdapRegex'
## The following object is masked from 'package:ggplot2':
##
## %+%
## The following object is masked from 'package:dplyr':
##
## explain
library(ngram)
library(RWeka)
wd <- getwd()
f <- "Coursera-SwiftKey.zip"
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!dir.exists("Coursera-SwiftKey")) {
    if (!file.exists(f)) {
        download.file(url, destfile = file.path(wd, f), method = "curl")
    }
    # extract into Coursera-SwiftKey/ so the file paths below resolve
    unzip(f, exdir = "Coursera-SwiftKey")
}
blogs_file <- "Coursera-SwiftKey/final/en_US/en_US.blogs.txt"
twitter_file <- "Coursera-SwiftKey/final/en_US/en_US.twitter.txt"
news_file <- "Coursera-SwiftKey/final/en_US/en_US.news.txt"
blogs <- readLines(blogs_file, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines(twitter_file, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
news <- readLines(news_file, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
Basic summaries of the three files (line counts and word counts).
line_news <- length(news)
line_twitter <- length(twitter)
line_blogs <- length(blogs)
wc_news <- wordcount(news)
wc_twitter <- wordcount(twitter)
wc_blogs <- wordcount(blogs)
line_counts <- rbind(line_news, line_twitter, line_blogs)
word_counts <- rbind(wc_news, wc_twitter, wc_blogs)
file_summary <- as.data.frame(cbind(line_counts, word_counts))
names(file_summary) <- c("number of lines", "number of words")
rownames(file_summary) <- c("news", "twitter", "blogs")
file_summary
## number of lines number of words
## news 77259 2643969
## twitter 2360148 30373583
## blogs 899288 37334131
Take a 1% sample of each data set for exploratory analysis.
set.seed(123)
twitter_sample <- sample(twitter, length(twitter) * 0.01, replace = FALSE)
blogs_sample <- sample(blogs, length(blogs) * 0.01, replace = FALSE)
news_sample <- sample(news, length(news) * 0.01, replace = FALSE)
data_sample <- c(twitter_sample, blogs_sample, news_sample)
Data Cleaning
# Remove lines containing characters that cannot be converted to ASCII
non_ascii <- grep("NotKnown", iconv(data_sample, "latin1", "ASCII", sub = "NotKnown"))
if (length(non_ascii) > 0) data_sample <- data_sample[-non_ascii]
# Basic cleaning
data_sample <- gsub("&", "", data_sample)                              # remove ampersands
data_sample <- gsub("RT :|@[a-z,A-Z]*: ", "", data_sample)             # remove retweet markers
data_sample <- gsub("@\\w+", "", data_sample)                          # remove @mentions
data_sample <- gsub("[[:digit:]]", "", data_sample)                    # remove digits
data_sample <- gsub(" #\\S*", "", data_sample)                         # remove hashtags
data_sample <- gsub(" ?(f|ht)tp(s?)://(.*)[.][a-z]+", "", data_sample) # remove URLs
data_sample <- rm_white(data_sample)                                   # remove extra whitespace
data_sample_df <- tibble(line = seq_along(data_sample), text = data_sample)
Unigram
UnigramFreq <- data_sample_df %>%
    unnest_tokens(unigram, text, token = "ngrams", n = 1) %>%
    filter(!is.na(unigram)) %>%
    count(unigram, sort = TRUE)
ggplot(head(UnigramFreq, 15), aes(reorder(unigram, n), n)) +
    geom_bar(stat = "identity") + coord_flip() +
    xlab("Unigrams") + ylab("Frequency") +
    ggtitle("Most frequent unigrams")
The most frequent unigrams are dominated by stop words, which carry little meaning on their own, so the next step filters them out.
Filtering out stop words
Stop words are words on a stop list (also called a stoplist or negative dictionary) that are filtered out before or after processing natural language text.
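For reference, tidytext ships a stop_words table (one word per row, plus a lexicon column), and an equivalent way to drop stop words is dplyr::anti_join(). A minimal sketch, assuming the data_sample_df built above:
# Equivalent stop-word removal with anti_join() instead of %in% filters
tidy_words <- data_sample_df %>%
    unnest_tokens(word, text) %>%             # one word per row
    anti_join(stop_words, by = "word") %>%    # drop rows whose word is a stop word
    count(word, sort = TRUE)
In the n-gram pipelines below, the same idea is expressed with %in% filters so that each word of an n-gram can be checked individually.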
UnigramFreq <- data_sample_df %>%
    unnest_tokens(unigram, text, token = "ngrams", n = 1) %>%
    filter(!is.na(unigram)) %>%
    filter(!unigram %in% stop_words$word) %>%  # filter out stop words
    count(unigram, sort = TRUE)
ggplot(head(UnigramFreq, 15), aes(reorder(unigram, n), n)) +
    geom_bar(stat = "identity") + coord_flip() +
    xlab("Unigrams") + ylab("Frequency") +
    ggtitle("Most frequent unigrams (stop words removed)")
Bigram
BigramFreq <- data_sample_df %>%
    unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
    separate(bigram, c("word1", "word2"), sep = " ",
             extra = "drop", fill = "right") %>%
    filter(!is.na(word1),
           !is.na(word2)) %>%
    unite(bigram, word1, word2, sep = " ") %>%
    count(bigram, sort = TRUE)
ggplot(head(BigramFreq, 15), aes(reorder(bigram, n), n)) +
    geom_bar(stat = "identity") + coord_flip() +
    xlab("Bigrams") + ylab("Frequency") +
    ggtitle("Most frequent bigrams")
BigramFreq <- data_sample_df %>%
    unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
    separate(bigram, c("word1", "word2"), sep = " ",
             extra = "drop", fill = "right") %>%
    filter(!word1 %in% stop_words$word,
           !word2 %in% stop_words$word) %>%  # filter out stop words
    filter(!is.na(word1),
           !is.na(word2)) %>%
    unite(bigram, word1, word2, sep = " ") %>%
    count(bigram, sort = TRUE)
ggplot(head(BigramFreq, 15), aes(reorder(bigram, n), n)) +
    geom_bar(stat = "identity") + coord_flip() +
    xlab("Bigrams") + ylab("Frequency") +
    ggtitle("Most frequent bigrams (stop words removed)")
Trigram
TrigramFreq <- data_sample_df %>%
    unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
    separate(trigram, c("word1", "word2", "word3"), sep = " ",
             extra = "drop", fill = "right") %>%
    filter(!is.na(word1),
           !is.na(word2),
           !is.na(word3)) %>%
    unite(trigram, word1, word2, word3, sep = " ") %>%
    count(trigram, sort = TRUE)
ggplot(head(TrigramFreq, 15), aes(reorder(trigram, n), n)) +
    geom_bar(stat = "identity") + coord_flip() +
    xlab("Trigrams") + ylab("Frequency") +
    ggtitle("Most frequent trigrams")
TrigramFreq <- data_sample_df %>%
    unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
    separate(trigram, c("word1", "word2", "word3"), sep = " ",
             extra = "drop", fill = "right") %>%
    filter(!word1 %in% stop_words$word,
           !word2 %in% stop_words$word,
           !word3 %in% stop_words$word) %>%  # filter out stop words
    filter(!is.na(word1),
           !is.na(word2),
           !is.na(word3)) %>%
    unite(trigram, word1, word2, word3, sep = " ") %>%
    count(trigram, sort = TRUE)
ggplot(head(TrigramFreq, 15), aes(reorder(trigram, n), n)) +
    geom_bar(stat = "identity") + coord_flip() +
    xlab("Trigrams") + ylab("Frequency") +
    ggtitle("Most frequent trigrams (stop words removed)")
The goal of this project is to predict the next word from the user's input. Although removing stop words yields more meaningful terms, stop words make up a large share of real input.
Taking this into consideration, I will combine the n-gram tables built with and without stop words later in the project, to give more natural predictions.
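To illustrate the direction (a rough sketch under my current assumptions, not the final algorithm), the frequency tables built above could already drive a simple back-off lookup: find the most frequent trigram that starts with the last two words of the input, fall back to bigrams, then to the most frequent unigram. The function name and structure here are placeholders.
# Back-off sketch: assumes TrigramFreq, BigramFreq and UnigramFreq are the
# frequency tables built above, already sorted by n in decreasing order.
predict_next <- function(input, TrigramFreq, BigramFreq, UnigramFreq) {
    words <- tail(strsplit(tolower(input), "\\s+")[[1]], 2)
    last_word <- function(x) sapply(strsplit(x, " "), tail, 1)
    # 1. Try trigrams that start with the last two words of the input.
    if (length(words) == 2) {
        hits <- TrigramFreq[startsWith(TrigramFreq$trigram,
                                       paste(words[1], words[2], "")), ]
        if (nrow(hits) > 0) return(last_word(hits$trigram[1]))
    }
    # 2. Back off to bigrams that start with the last word of the input.
    hits <- BigramFreq[startsWith(BigramFreq$bigram,
                                  paste0(tail(words, 1), " ")), ]
    if (nrow(hits) > 0) return(last_word(hits$bigram[1]))
    # 3. Fall back to the single most frequent unigram.
    UnigramFreq$unigram[1]
}
# Example: predict_next("thanks for the", TrigramFreq, BigramFreq, UnigramFreq)
A refined version of this lookup, combined with the stop-word considerations above, is what I plan to expose through the Shiny app.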