Week 2 Project of the course Data Science Capstone under the course track Data Science
Submitted by Olusola Afuwape
January 30th, 2020
The goal of this project is to undertake an exploratory data analysis of the Coursera-SwiftKey data. The aim of the project is to exhibit how comfortable is it for the data scientist to explore and make meaningful inferences from the data.
setwd("C://Users//Olusola//Desktop//New foldercourse//Systems//Data Science//Course10 - Data Science Capstone//Week 2")
# Unzip data file
dataZipFile <- "Coursera-SwiftKey.zip"
dataFile <- "Coursera-SwiftKey"
if (!file.exists(dataFile)) {
unzip(dataZipFile)
}
# Load packages
library(tm)
library(stringi)
library(ggplot2)
library(dplyr)
suppressWarnings(library(wordcloud))
library(RColorBrewer)
# Create files
blog_file <- "final//en_US//en_US.blogs.txt"
news_file <- "final//en_US//en_US.news.txt"
twitter_file <- "final//en_US//en_US.twitter.txt"
# Read lines
blog_lines <- readLines(file(blog_file))
news_lines <- readLines(file(news_file))
Warning in readLines(file(news_file)): incomplete final line found on
'final//en_US//en_US.news.txt'
twitter_lines <- readLines(file(twitter_file))
Warning in readLines(file(twitter_file)): line 167155 appears to contain an
embedded nul
Warning in readLines(file(twitter_file)): line 268547 appears to contain an
embedded nul
Warning in readLines(file(twitter_file)): line 1274086 appears to contain
an embedded nul
Warning in readLines(file(twitter_file)): line 1759032 appears to contain
an embedded nul
# Words count
count.blog_lines <- stri_count_words(blog_lines)
count.news_lines <- stri_count_words(news_lines)
count.twitter_lines <- stri_count_words(twitter_lines)
sum_blog <- sum(count.blog_lines)
sum_news <- sum(count.twitter_lines)
sum_twitter <- sum(count.twitter_lines)
len_blog <- length(blog_lines)
len_news <- length(news_lines)
len_twitter <- length(twitter_lines)
lang_df <- data.frame(File = c("Blogs", "News", "Twitter"), Num_lines = c(len_blog, len_news, len_twitter), Counts = c(sum_blog, sum_news, sum_twitter))
lang_df
File Num_lines Counts
1 Blogs 899288 38154238
2 News 77259 30218125
3 Twitter 2360148 30218125
Considering the amount of memory (RAM) required to run the model and the amount of time the algorithm will take to make a prediction, only 1% of the data will be used in this analysis.
# Create sample data, corpus and cleanse the data for each file
set.seed(1375)
#sample_data <- c(sample(blog_lines, len_blog * 0.001),
#sample(news_lines, len_news * 0.001),
#sample(twitter_lines, len_twitter * 0.001))
Blog <- c(sample(blog_lines, len_blog * 0.01, replace = FALSE))
News <- c(sample(news_lines, len_news * 0.01, replace = FALSE))
Twitter <- c(sample(twitter_lines, len_twitter * 0.01, replace = FALSE))
# Remove special characters
Blog <- iconv(Blog, 'UTF-8', 'ASCII')
Blog_corpus <- VCorpus(VectorSource(Blog))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, "", x))
Blog_corpus <- tm_map(Blog_corpus, toSpace, "&") # Remove ampersand
Blog_corpus <- tm_map(Blog_corpus, toSpace, "@\\w+") # Remove @
Blog_corpus <- tm_map(Blog_corpus, toSpace, "#\\S*") # Remove hashtags
Blog_corpus <- tm_map(Blog_corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+") # Remove url
Blog_corpus <- tm_map(Blog_corpus, toSpace, "RT :|@[a-z,A-Z]*:") # Remove tweets
Blog_corpus <- tm_map(Blog_corpus, tolower)
Blog_corpus <- tm_map(Blog_corpus, removeNumbers)
Blog_corpus <- tm_map(Blog_corpus, removeWords, stopwords("en"))
Blog_corpus <- tm_map(Blog_corpus, removePunctuation)
Blog_corpus <- tm_map(Blog_corpus, stripWhitespace)
Blog_corpus <- tm_map(Blog_corpus, PlainTextDocument)
blog_prominence <- head(sort(rowSums(as.matrix(TermDocumentMatrix(Blog_corpus))), decreasing = TRUE), 15)
blog_prominence
one can like just will time get know also really
692 628 603 602 593 541 428 384 347 341
now little new day well
339 335 327 317 311
barplot(blog_prominence, xlab = "Words", ylab = "Counts", col = "beige", main = "Most frequent words in the blog file")
wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, rot.per=0.6, use.r.layout = FALSE, colors = brewer.pal(9, "Set1"))
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
different could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
must could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
ago could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
using could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
looking could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
morning could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
quite could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
someone could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
finally could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
makes could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
minutes could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
high could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
maybe could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
children could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
everything could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
whole could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
working could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
already could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
everyone could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
please could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
black could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
idea could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
anything could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
decided could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
head could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
hand could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
write could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
couple could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
month could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
money could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
name could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
along could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
perfect could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
change could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
moment could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
nothing could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
past could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
reading could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
writing could not be fit on page. It will not be plotted.
Warning in wordcloud(Blog_corpus, max.words = 200, random.order = FALSE, :
others could not be fit on page. It will not be plotted.
Cleanse the news sample data and then create corpus, barplot and word cloud
News <- iconv(News, 'UTF-8', 'ASCII')
News_corpus <- VCorpus(VectorSource(News))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, "", x))
News_corpus <- tm_map(News_corpus, toSpace, "&") # Remove ampersand
News_corpus <- tm_map(News_corpus, toSpace, "@\\w+") # Remove @
News_corpus <- tm_map(News_corpus, toSpace, "#\\S*") # Remove hashtags
News_corpus <- tm_map(News_corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+") # Remove url
News_corpus <- tm_map(News_corpus, toSpace, "RT :|@[a-z,A-Z]*:") # Remove tweets
News_corpus <- tm_map(News_corpus, tolower)
News_corpus <- tm_map(News_corpus, removeNumbers)
News_corpus <- tm_map(News_corpus, removeWords, stopwords("en"))
News_corpus <- tm_map(News_corpus, removePunctuation)
News_corpus <- tm_map(News_corpus, stripWhitespace)
News_corpus <- tm_map(News_corpus, PlainTextDocument)
# Create barplot for news sample data
news_prominence <- head(sort(rowSums(as.matrix(TermDocumentMatrix(News_corpus))), decreasing = TRUE), 15)
news_prominence
said will one time just year can people two like
167 80 68 50 49 39 38 37 37 36
first also now state last
35 33 33 33 30
barplot(news_prominence, xlab = "Words", ylab = "Counts", col = "lightcoral", main = "Most frequent words in the news sample file")
# Word cloud for news sample data
wordcloud(News_corpus, max.words = 200, random.order = FALSE, rot.per=0.6, use.r.layout = FALSE, colors = brewer.pal(9, "Accent"))
Warning in brewer.pal(9, "Accent"): n too large, allowed maximum for palette Accent is 8
Returning the palette you asked for with that many colors
Cleanse the twitter sample data and then create corpus, barplot and word cloud
Twitter <- iconv(News, 'UTF-8', 'ASCII')
Twitter_corpus <- VCorpus(VectorSource(Twitter))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, "", x))
Twitter_corpus <- tm_map(Twitter_corpus, toSpace, "&") # Remove ampersand
Twitter_corpus <- tm_map(Twitter_corpus, toSpace, "@\\w+") # Remove @
Twitter_corpus <- tm_map(Twitter_corpus, toSpace, "#\\S*") # Remove hashtags
Twitter_corpus <- tm_map(Twitter_corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+") # Remove url
Twitter_corpus <- tm_map(Twitter_corpus, toSpace, "RT :|@[a-z,A-Z]*:") # Remove tweets
Twitter_corpus <- tm_map(Twitter_corpus, tolower)
Twitter_corpus <- tm_map(Twitter_corpus, removeNumbers)
Twitter_corpus <- tm_map(Twitter_corpus, removeWords, stopwords("en"))
Twitter_corpus <- tm_map(Twitter_corpus, removePunctuation)
Twitter_corpus <- tm_map(Twitter_corpus, stripWhitespace)
Twitter_corpus <- tm_map(Twitter_corpus, PlainTextDocument)
# Create barplot for twitter sample data
twitter_prominence <- head(sort(rowSums(as.matrix(TermDocumentMatrix(Twitter_corpus))), decreasing = TRUE), 15)
twitter_prominence
said will one time just year can people two like
167 80 68 50 49 39 38 37 37 36
first also now state last
35 33 33 33 30
barplot(twitter_prominence, xlab = "Words", ylab = "Counts", col = "cadetblue4", main = "Most frequent words in the twitter sample file")
# Word cloud for twitter sample data
wordcloud(Twitter_corpus, max.words = 200, random.order = FALSE, rot.per=0.6, use.r.layout = FALSE, colors = brewer.pal(9, "Dark2"))
Warning in brewer.pal(9, "Dark2"): n too large, allowed maximum for palette Dark2 is 8
Returning the palette you asked for with that many colors