Week 2 Project of the course Data Science Capstone under the course track Data Science

Submitted by Olusola Afuwape

January 30th, 2020


0.0.1 Overview

The goal of this project is to undertake an exploratory data analysis of the Coursera-SwiftKey data. The aim of the project is to exhibit how comfortable is it for the data scientist to explore and make meaningful inferences from the data.


0.0.2 Set working directory and Load packages

setwd("C://Users//Olusola//Desktop//New foldercourse//Systems//Data Science//Course10 -  Data Science Capstone//Week 2")

# Unzip data file
dataZipFile <- "Coursera-SwiftKey.zip"

dataFile <- "Coursera-SwiftKey"
if (!file.exists(dataFile)) {
  unzip(dataZipFile)
}

# Load packages
library(tm)
library(stringi)
library(ggplot2)
library(dplyr)
suppressWarnings(library(wordcloud))
library(RColorBrewer)

0.0.3 Create file variables and read lines

# Create files

blog_file <- "final//en_US//en_US.blogs.txt"
news_file <- "final//en_US//en_US.news.txt"
twitter_file <- "final//en_US//en_US.twitter.txt"

# Read lines

blog_lines <- readLines(file(blog_file))
news_lines <- readLines(file(news_file))
Warning in readLines(file(news_file)): incomplete final line found on
'final//en_US//en_US.news.txt'
twitter_lines <- readLines(file(twitter_file))
Warning in readLines(file(twitter_file)): line 167155 appears to contain an
embedded nul
Warning in readLines(file(twitter_file)): line 268547 appears to contain an
embedded nul
Warning in readLines(file(twitter_file)): line 1274086 appears to contain
an embedded nul
Warning in readLines(file(twitter_file)): line 1759032 appears to contain
an embedded nul

0.0.4 Compute the number of lines and sum of words

# Words count

count.blog_lines <- stri_count_words(blog_lines)
count.news_lines <- stri_count_words(news_lines)
count.twitter_lines <- stri_count_words(twitter_lines)

sum_blog <- sum(count.blog_lines)
sum_news <- sum(count.twitter_lines)
sum_twitter <- sum(count.twitter_lines)

len_blog <- length(blog_lines)
len_news <- length(news_lines)
len_twitter <- length(twitter_lines)

lang_df <- data.frame(File = c("Blogs", "News", "Twitter"), Num_lines = c(len_blog, len_news, len_twitter), Counts = c(sum_blog, sum_news, sum_twitter))
lang_df
     File Num_lines   Counts
1   Blogs    899288 38154238
2    News     77259 30218125
3 Twitter   2360148 30218125

0.0.5 Discussion

Considering the amount of memory (RAM) required to run the model and the amount of time the algorithm will take to make a prediction, only 1% of the data will be used in this analysis.

# Create sample data, corpus and cleanse the data for each file

set.seed(1375)

#sample_data <- c(sample(blog_lines, len_blog * 0.001),
                 #sample(news_lines, len_news * 0.001),
                 #sample(twitter_lines, len_twitter * 0.001))

Blog <- c(sample(blog_lines, len_blog * 0.01, replace = FALSE))
News <- c(sample(news_lines, len_news * 0.01, replace = FALSE))
Twitter <- c(sample(twitter_lines, len_twitter * 0.01, replace = FALSE))

0.0.6 Blog sample data

# Remove special characters

Blog <- iconv(Blog, 'UTF-8', 'ASCII')
Blog_corpus <- VCorpus(VectorSource(Blog))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, "", x))
Blog_corpus <- tm_map(Blog_corpus, toSpace, "&amp") # Remove ampersand
Blog_corpus <- tm_map(Blog_corpus, toSpace, "@\\w+") # Remove @ 
Blog_corpus <- tm_map(Blog_corpus, toSpace, "#\\S*") # Remove hashtags
Blog_corpus <- tm_map(Blog_corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+") # Remove url
Blog_corpus <- tm_map(Blog_corpus, toSpace, "RT :|@[a-z,A-Z]*:") # Remove tweets
Blog_corpus <- tm_map(Blog_corpus, tolower)
Blog_corpus <- tm_map(Blog_corpus, removeNumbers)
Blog_corpus <- tm_map(Blog_corpus, removeWords, stopwords("en"))
Blog_corpus <- tm_map(Blog_corpus, removePunctuation)
Blog_corpus <- tm_map(Blog_corpus, stripWhitespace)
Blog_corpus <- tm_map(Blog_corpus, PlainTextDocument)

blog_prominence <- head(sort(rowSums(as.matrix(TermDocumentMatrix(Blog_corpus))), decreasing = TRUE), 15)

blog_prominence
   one    can   like   just   will   time    get   know   also really 
   692    628    603    602    593    541    428    384    347    341 
   now little    new    day   well 
   339    335    327    317    311 
barplot(blog_prominence, xlab = "Words", ylab = "Counts", col = "beige", main = "Most frequent words in the blog file")


wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, rot.per=0.6, use.r.layout = FALSE, colors = brewer.pal(9, "Set1"))
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
different could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
must could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
ago could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
using could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
looking could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
morning could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
quite could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
someone could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
finally could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
makes could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
minutes could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
high could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
maybe could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
children could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
everything could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
whole could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
working could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
already could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
everyone could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
please could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
black could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
idea could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
anything could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
decided could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
head could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
hand could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
write could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
couple could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
month could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
money could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
name could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
along could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
perfect could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
change could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
moment could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
nothing could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
past could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
reading could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
writing could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
others could not be fit on page. It will not be plotted.


0.0.7 News sample data

Cleanse the news sample data and then create corpus, barplot and word cloud

News <- iconv(News, 'UTF-8', 'ASCII')
News_corpus <- VCorpus(VectorSource(News))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, "", x))
News_corpus <- tm_map(News_corpus, toSpace, "&amp") # Remove ampersand
News_corpus <- tm_map(News_corpus, toSpace, "@\\w+") # Remove @ 
News_corpus <- tm_map(News_corpus, toSpace, "#\\S*") # Remove hashtags
News_corpus <- tm_map(News_corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+") # Remove url
News_corpus <- tm_map(News_corpus, toSpace, "RT :|@[a-z,A-Z]*:") # Remove tweets
News_corpus <- tm_map(News_corpus, tolower)
News_corpus <- tm_map(News_corpus, removeNumbers)
News_corpus <- tm_map(News_corpus, removeWords, stopwords("en"))
News_corpus <- tm_map(News_corpus, removePunctuation)
News_corpus <- tm_map(News_corpus, stripWhitespace)
News_corpus <- tm_map(News_corpus, PlainTextDocument)

# Create barplot for news sample data

news_prominence <- head(sort(rowSums(as.matrix(TermDocumentMatrix(News_corpus))), decreasing = TRUE), 15)

news_prominence
  said   will    one   time   just   year    can people    two   like 
   167     80     68     50     49     39     38     37     37     36 
 first   also    now  state   last 
    35     33     33     33     30 
barplot(news_prominence, xlab = "Words", ylab = "Counts", col = "lightcoral", main = "Most frequent words in the news sample file")

# Word cloud for news sample data

wordcloud(News_corpus, max.words = 200, random.order = FALSE, rot.per=0.6, use.r.layout = FALSE, colors = brewer.pal(9, "Accent"))
Warning in brewer.pal(9, "Accent"): n too large, allowed maximum for palette Accent is 8
Returning the palette you asked for with that many colors


0.0.8 Twitter sample data

Cleanse the twitter sample data and then create corpus, barplot and word cloud

Twitter <- iconv(News, 'UTF-8', 'ASCII')
Twitter_corpus <- VCorpus(VectorSource(Twitter))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, "", x))
Twitter_corpus <- tm_map(Twitter_corpus, toSpace, "&amp") # Remove ampersand
Twitter_corpus <- tm_map(Twitter_corpus, toSpace, "@\\w+") # Remove @ 
Twitter_corpus <- tm_map(Twitter_corpus, toSpace, "#\\S*") # Remove hashtags
Twitter_corpus <- tm_map(Twitter_corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+") # Remove url
Twitter_corpus <- tm_map(Twitter_corpus, toSpace, "RT :|@[a-z,A-Z]*:") # Remove tweets
Twitter_corpus <- tm_map(Twitter_corpus, tolower)
Twitter_corpus <- tm_map(Twitter_corpus, removeNumbers)
Twitter_corpus <- tm_map(Twitter_corpus, removeWords, stopwords("en"))
Twitter_corpus <- tm_map(Twitter_corpus, removePunctuation)
Twitter_corpus <- tm_map(Twitter_corpus, stripWhitespace)
Twitter_corpus <- tm_map(Twitter_corpus, PlainTextDocument)

# Create barplot for twitter sample data

twitter_prominence <- head(sort(rowSums(as.matrix(TermDocumentMatrix(Twitter_corpus))), decreasing = TRUE), 15)

twitter_prominence
  said   will    one   time   just   year    can people    two   like 
   167     80     68     50     49     39     38     37     37     36 
 first   also    now  state   last 
    35     33     33     33     30 
barplot(twitter_prominence, xlab = "Words", ylab = "Counts", col = "cadetblue4", main = "Most frequent words in the twitter sample file")

# Word cloud for twitter sample data

wordcloud(Twitter_corpus, max.words = 200, random.order = FALSE, rot.per=0.6, use.r.layout = FALSE, colors = brewer.pal(9, "Dark2"))
Warning in brewer.pal(9, "Dark2"): n too large, allowed maximum for palette Dark2 is 8
Returning the palette you asked for with that many colors