Processing date: ‘mei 13, 2017’
The goal of this project is just to display that you’ve gotten used to working with the data and that you are on track to create your prediction algorithm. Please submit a report on R Pubs (http://rpubs.com/) that explains your exploratory analysis and your goals for the eventual app and algorithm. This document should be concise and explain only the major features of the data you have identified and briefly summarize your plans for creating the prediction algorithm and Shiny app in a way that would be understandable to a non-data scientist manager. You should make use of tables and plots to illustrate important summaries of the data set. The motivation for this project is to:
## Load CRAN modules
suppressMessages(if (!require("pacman")) install.packages("pacman"))
suppressMessages(pacman::p_load(downloader, plyr, dplyr, knitr, tm, stringi,RWeka, ggplot2, slam, wordcloud, RColorBrewer, gridExtra))
## Check if directory already exists?
if (!file.exists("./projectData")){
dir.create("./projectData")
}
Url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
## Check if zip has already been downloaded in projectData directory?
if (!file.exists("./projectData/Coursera-SwiftKey.zip")){
download.file(Url,destfile="./projectData/Coursera-SwiftKey.zip")
}
## Check if zip has already been unzipped?
if (!file.exists("./projectData/final")) {
dir.create("./projectData/final")
unzip(zipfile = "./projectData/Coursera-SwiftKey.zip")
}
# Once the dataset is downloaded start reading it as this a huge dataset so we'll read line by line only the amount of data needed before doing that lets first list all the files in the directory
path <- file.path("./final" , "en_US")
files <- list.files(path, recursive = TRUE)
# Lets make a file connection of the twitter data set
con <- file("./final/en_US/en_US.twitter.txt", "r")
#lineTwitter <-r eadLines(con,encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines(con, skipNul = TRUE)
# Close the connection handle when you are done
close(con)
# Lets make a file connection of the blog data set
con <- file("./final/en_US/en_US.blogs.txt", "r")
#lineBlogs<-readLines(con,encoding = "UTF-8", skipNul = TRUE)
blogs <- readLines(con, skipNul = TRUE)
# Close the connection handle when you are done
close(con)
# Lets make a file connection of the news data set
con <- file("./final/en_US/en_US.news.txt", "r")
#lineNews<-readLines(con,encoding = "UTF-8", skipNul = TRUE)
news <- readLines(con, skipNul = TRUE)
## Warning in readLines(con, skipNul = TRUE): incomplete final line found on
## './final/en_US/en_US.news.txt'
# Close the connection handle when you are done
close(con)
# Get file sizes
blogs.size <- file.info("./final/en_US/en_US.blogs.txt")$size / 1024 ^ 2
news.size <- file.info("./final/en_US/en_US.news.txt")$size / 1024 ^ 2
twitter.size <- file.info("./final/en_US/en_US.twitter.txt")$size / 1024 ^ 2
# Get words in files
blogs.words <- stri_count_words(blogs)
news.words <- stri_count_words(news)
twitter.words <- stri_count_words(twitter)
# Summary of the data sets
dataSet <- data.frame(source = c("blogs", "news", "twitter"),
file.size.MB = c(blogs.size, news.size, twitter.size),
num.lines = c(length(blogs), length(news), length(twitter)),
num.words = c(sum(blogs.words), sum(news.words), sum(twitter.words)),
mean.num.words = c(mean(blogs.words), mean(news.words), mean(twitter.words)))
dataSet
## source file.size.MB num.lines num.words mean.num.words
## 1 blogs 200.4242 899288 38154238 42.42716
## 2 news 196.2775 77259 2693898 34.86840
## 3 twitter 159.3641 2360148 30218166 12.80350
The dataset shown provides some interesting information
Here we can see the frequency for certain number of characters per item in the datasets
par(mfcol = c(1,3))
hist(nchar(twitter))
hist(nchar(blogs))
hist(nchar(news))
To continue we have to explore the natural language processing domain:
We have applied the following actions on the source data
## Cleaning and Sampling the data ## have to reload some libraries seem to be forgotten here
suppressMessages(pacman::p_load(downloader, plyr, dplyr, knitr, tm, stringi,RWeka, ggplot2, slam, wordcloud, RColorBrewer, gridExtra))
set.seed(1234)
sampleSize <- 1000 # set size of corpora to sample
cleanString <- function(x) { # Strip unwanted UTF-8 characters
x <- iconv(x,"UTF-8", "ASCII", "?") # this will leave contractions like don't, I'm, etc.
x <-gsub("(?<!\\w)[-'](?<!\\w)" , " ", x, perl = TRUE) # just leave - and ' in the middle of words #
x <- gsub("[[:space:]]+", " ", x, perl = TRUE) #consolidate spaces to one #
x <- gsub("^[[:space:]]+", "", x, perl = TRUE) # strip leading spaces
x <- x[which(x!="")] ## Remove empty lines
}
blogs <-cleanString(blogs)
news <- cleanString(news)
twitter <- cleanString(twitter)
SPC <- function(x) {
x <- sample(x,sampleSize)
x <- VCorpus(VectorSource(x))
x <- tm_map(x,removeNumbers)
x <- tm_map(x,removeWords, stopwords("english"))
x <- tm_map(x,stripWhitespace)
x <- tm_map(x, content_transformer(tolower))
x <- tm_map(x, removePunctuation,preserve_intra_word_dashes = TRUE)
}
blogs_a <- SPC(blogs)
news_a <- SPC(news)
twitter_a <- SPC(twitter)
using word clouds we like to show the different relevance of words used in the different sources.
wordcloud(blogs_a, scale=c(5,0.1), max.words=200, random.order=FALSE, rot.per=0.5, use.r.layout=FALSE, colors=brewer.pal(8,"Accent"))
wordcloud(news_a, scale=c(5,0.1), max.words=200, random.order=FALSE, rot.per=0.5, use.r.layout=FALSE, colors=brewer.pal(8,"Accent"))
wordcloud(twitter_a, scale=c(5,0.1), max.words=200, random.order=FALSE, rot.per=0.5, use.r.layout=FALSE, colors=brewer.pal(8,"Accent"))
The word clouds show rather different words (relevant words) per source type
In this section we analyze the frequencies of single verbs and groups of verbs. To analyse this we use the dataSample set and Use the N-Gram technology.
## Funcions to extract xGrams
UniToken<-function(x)NGramTokenizer(x, Weka_control(min = 1, max = 1))
BiToken<-function(x)NGramTokenizer(x, Weka_control(min = 2, max = 2))
TriToken<-function(x)NGramTokenizer(x, Weka_control(min = 3, max = 3))
unigrams<-function(x) {tdm <- TermDocumentMatrix(x, control = list(tokenize =
UniToken))
fm <- rowSums(as.matrix(tdm))
ngram<-data.frame(ngram=names(fm),freq=fm)
ngram<-ngram[order(-ngram$freq),] }
bigrams<-function(x) {tdm <- TermDocumentMatrix(x, control = list(tokenize =
BiToken))
fm <- rowSums(as.matrix(tdm))
ngram<-data.frame(ngram=names(fm),freq=fm)
ngram<-ngram[order(-ngram$freq),] }
trigrams<-function(x) {tdm <- TermDocumentMatrix(x, control = list(tokenize =
TriToken))
fm <- rowSums(as.matrix(tdm))
ngram<-data.frame(ngram=names(fm),freq=fm)
ngram<-ngram[order(-ngram$freq),] }
# Generate unigrams, bigrams and trigrams from the three samples corpora
blogs_a1<-unigrams(blogs_a)
blogs_a2<-bigrams(blogs_a)
blogs_a3<-trigrams(blogs_a)
news_a1<-unigrams(news_a)
news_a2<-bigrams(news_a)
news_a3<-trigrams(news_a)
twitter_a1<-unigrams(twitter_a)
twitter_a2<-bigrams(twitter_a)
twitter_a3<-trigrams(twitter_a)
## Start plots
plot_df <- function(df) { ggplot(df, aes(x=ngram, y=freq)) + geom_bar(stat="Identity", fill="red", colour = "pink") +coord_flip() +geom_text(aes(label=freq),vjust=-0.1) + theme(axis.text.x = element_text(angle = 90, hjust = 2)) }
## GridExtra to comine in line
maxNgrams <- 30
p1 <- plot_df(head(blogs_a1,maxNgrams))
p2 <- plot_df(head(news_a1,maxNgrams))
p3 <- plot_df(head(twitter_a1,maxNgrams))
grid.arrange(p1, p2, p3, top = "Unigram Top 30 per Source (blogs, news,twitter)", ncol = 3)
p1 <- plot_df(head(blogs_a2,maxNgrams))
p2 <- plot_df(head(news_a2,maxNgrams))
p3 <- plot_df(head(twitter_a2,maxNgrams))
grid.arrange(p1, p2, p3, top = "Bigram Top 30 per Source (blogs, news,twitter)",ncol = 3)
p1 <- plot_df(head(blogs_a3,maxNgrams))
p2 <- plot_df(head(news_a3,maxNgrams))
p3 <- plot_df(head(twitter_a3,maxNgrams))
grid.arrange(p1, p2, p3, top = "Trigram Top 30 per Source (blogs, news, twitter)", ncol = 3)
The writing style differs per source type. Twitter has short handed notes and uses the I often (this also applies for blogs). News has a more formal language
How many unique words does one need in a frequency sorted dictionary to cover a given % of all word instances in the language? To address this question, we shall create a simple function to run a loop until the desired word coverage is reached. Keep in mind that this is with the truncated, sampled data set across blogs, news and twitter.
wordcoverage <- function(x,wordcover)
#x is the unigram output sorted by frequency, y is the percent word coverage
{
nwords<-0 # initial counter
coverage <- wordcover*sum(x$freq) # number of words to hit coverage
for (i in 1:nrow(x))
{
if (nwords >= coverage) {
return (i)
}
nwords<-nwords+x$freq[i]
}
}
wc <- data.frame( source = c("blogs", "news", "twitter"),
perc50 = c(wordcoverage(blogs_a1,0.5), wordcoverage(news_a1,0.5),
wordcoverage(twitter_a1,0.5)),
perc75 = c(wordcoverage(blogs_a1,0.75), wordcoverage(news_a1,0.75), wordcoverage(twitter_a1,0.75)),
perc90 = c(wordcoverage(blogs_a1,0.9), wordcoverage(news_a1,0.9),
wordcoverage(twitter_a1,0.9)),
perc99 = c(wordcoverage(blogs_a1,0.99),wordcoverage(news_a1,0.99), wordcoverage(twitter_a1,0.99)) )
wc
## source perc50 perc75 perc90 perc99
## 1 blogs 776 2646 5536 7408
## 2 news 992 2973 5859 7641
## 3 twitter 427 1543 2638 3295
Based on the figures shown
This shows the word frequency in the sample datasets of words as log10 of the frequency
par(mfcol = c(1,3))
hist(log10(table(blogs_a1[,2])), xlab="", col = "#56B4E9", ylab="Number of words", main = "Unigram")
hist(log10(table(blogs_a2[,2])), xlab="Blogs word frequency in corpus (log10)", col = "#56B4E9", ylab="", main = "Bigrams")
hist(log10(table(blogs_a3[,2])), xlab="", col = "#56B4E9", ylab="", main = "Trigrams")
par(mfcol = c(1,3))
hist(log10(table(twitter_a1[,2])), xlab="", col = "#56B4E9", ylab="Number of words", main = "Unigram")
hist(log10(table(twitter_a2[,2])), xlab="Twitter word frequency in corpus (log10)", col = "#56B4E9", ylab="", main = "Bigrams")
hist(log10(table(twitter_a3[,2])), xlab="", col = "#56B4E9", ylab="", main = "Trigrams")
par(mfcol = c(1,3))
hist(log10(table(news_a1[,2])), xlab="", col = "#56B4E9", ylab="Number of words", main = "Unigram")
hist(log10(table(news_a2[,2])), xlab="News word frequency in corpus (log10)", col = "#56B4E9", ylab="", main = "Bigrams")
hist(log10(table(news_a3[,2])), xlab="", col = "#56B4E9", ylab="", main = "Trigrams")
Based on the data analyzed i think we can make draw some preliminary conclusions: