Summary

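This report presents an exploratory analysis of the English (en_US) Coursera-SwiftKey text corpus, which consists of blog, news, and Twitter files. We summarise file sizes, word counts, and line counts, and then examine the most frequent unigrams, bigrams, and trigrams in a random sample of the data.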

suppressMessages(library(dplyr))
suppressMessages(library(ggplot2))
suppressMessages(library(tm))
## Warning: package 'tm' was built under R version 4.0.5
suppressMessages(library(wordcloud))
## Warning: package 'wordcloud' was built under R version 4.0.5
suppressMessages(library(RWeka))
## Warning: package 'RWeka' was built under R version 4.0.5
suppressMessages(library(stringi))

Loading and preprocessing the data

# setup
# NOTE(review): hard-coded, machine-specific working directory. It is kept
# because the relative paths used below (and when reading the data) are
# resolved against it; consider replacing it with a project-relative path.
setwd("C:/Users/Valor/Desktop/Data Science/Capstone")
furl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zip_path <- "./Coursera-SwiftKey.zip"

# Download the archive only when it is not already present.
# mode = "wb" keeps the zip byte-exact on Windows.
if (!file.exists(zip_path)) {
  download.file(furl, destfile = zip_path, mode = "wb")
}

# Extract independently of the download step, so that an archive that was
# downloaded earlier but never (or only partially) unpacked is still extracted.
if (!dir.exists("./final/en_US")) {
  unzip(zip_path)
}

# import data

# Read a text file as UTF-8 lines through a binary-mode connection.
#
# path: path to the text file.
# Returns a character vector with one element per line.
#
# Why not readLines(path) directly: the plain text-mode read of these files
# warned about an "incomplete final line" for the news file (which came back
# with only 77,259 lines) and about embedded nuls in the twitter file.
# That is consistent with a text-mode read stopping early at a control
# character -- TODO confirm the news line count after this change.
# Opening the file with "rb" reads the bytes as-is, and skipNul = TRUE drops
# embedded nul bytes instead of warning about them.
read_text_lines <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read_text_lines("./final/en_US/en_US.blogs.txt")
news <- read_text_lines("./final/en_US/en_US.news.txt")
twit <- read_text_lines("./final/en_US/en_US.twitter.txt")

Data Processing and Summary

# Per-source summary: file size (MiB), total word count, and line count.
#
# The file sizes are looked up through absolute paths. With the relative
# paths used previously, file.info() returned NA for every file in the
# printed table, i.e. the files were not found from this chunk -- presumably
# because the working directory set with setwd() does not carry over when the
# report is knitted (TODO confirm).
en_us_dir <- file.path(
  "C:/Users/Valor/Desktop/Data Science/Capstone", "final", "en_US"
)
source_files <- file.path(
  en_us_dir,
  c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt")
)
source_texts <- list(blogs, news, twit)

# Build the table in a single call; every column has one value per source.
# NOTE(review): the name `summary` masks base::summary() as a variable; it is
# kept so that any later reference to this object still works.
summary <- data.frame(
  textfile = c("blogs", "news", "twitter"),
  filesize = file.info(source_files)$size / 1024^2,
  wordcnt = vapply(
    source_texts,
    function(lines) sum(stri_count_words(lines)),
    integer(1)
  ),
  linecnt = lengths(source_texts)
)
summary
##   textfile filesize  wordcnt linecnt
## 1    blogs       NA 37546239  899288
## 2     news       NA  2674536   77259
## 3  twitter       NA 30093372 2360148

Sampling

Because the files are large, we draw a random sample of 10,000 lines from each source (using a fixed seed so the sample is reproducible) and run the remaining analysis on the combined sample to keep processing fast.

# Fix the RNG seed so the same lines are drawn on every run.
set.seed(123)

# Draw 10000 lines from each source, in the order blogs -> news -> twitter
# (the order matters because all three draws share one RNG stream).
drawn <- lapply(
  list(blogs = blogs, news = news, twit = twit),
  sample,
  size = 10000
)
subblogs <- drawn[["blogs"]]
subnews <- drawn[["news"]]
subtwit <- drawn[["twit"]]

# Combined sample used by the rest of the analysis.
sampleData <- c(subblogs, subnews, subtwit)

Data cleanup

To make the analysis more meaningful, we remove URLs, Twitter @-mentions, punctuation, numbers, and English stop words. We also convert the text to lower case and collapse repeated whitespace.

# Build a corpus with one document per sampled line.
corpus <- VCorpus(VectorSource(sampleData))

# Transformer that replaces every match of `pattern` with a single space.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

# Remove URLs. The match stops at the next whitespace, so text that follows a
# URL on the same line is kept (a greedy ".*" would run on to the last
# ".letters" sequence in the line and delete everything in between).
corpus <- tm_map(corpus, toSpace, "(f|ht)tps?://[^[:space:]]+")

# Remove @-mentions. The POSIX class [:space:] is used because "\\s" is not
# treated as a whitespace escape inside a bracket expression by gsub()'s
# default regex engine.
corpus <- tm_map(corpus, toSpace, "@[^[:space:]]+")

# All steps are applied eagerly (no lazy = TRUE), so the URL and mention
# patterns run before punctuation is stripped from the text.
# tolower is a plain character function, so it is wrapped in
# content_transformer() to keep each element a text document; this also makes
# a final re-wrap with PlainTextDocument unnecessary.
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, stripWhitespace)

# Word cloud of the cleaned corpus, limited to 50 words and a minimum
# frequency of 20, coloured with the 12-colour "Set3" palette.
cloud_palette <- brewer.pal(12, "Set3")
wordcloud(
  corpus,
  min.freq = 20,
  max.words = 50,
  random.order = TRUE,
  rot.per = 0.5,
  use.r.layout = FALSE,
  colors = cloud_palette
)

Unigram, Bigram, and Trigram Analysis

# Pull the cleaned text out of the corpus, one string per document.
corpus_text <- unlist(
  lapply(corpus, function(doc) doc[["content"]]),
  use.names = FALSE
)
corpus.df <- data.frame(text = corpus_text, stringsAsFactors = FALSE)

# Count n-grams of exactly `n` words.
#
# text: character vector, one element per document.
# n:    n-gram length (1 = unigram, 2 = bigram, ...).
# Returns a data frame with the n-gram in column 1 (Var1) and its count in
# column 2 (Freq), as produced by data.frame(table(...)).
count_ngrams <- function(text, n) {
  tokens <- NGramTokenizer(text, Weka_control(min = n, max = n))
  data.frame(table(tokens, dnn = NULL))
}

# The tokenizer is given the text column itself rather than the data frame.
# A data frame would be coerced with as.character(), which collapses the whole
# column into one deparsed string, so n-grams would span unrelated documents
# and pick up deparse artefacts.
uniGram <- count_ngrams(corpus.df$text, 1)
biGram <- count_ngrams(corpus.df$text, 2)
triGram <- count_ngrams(corpus.df$text, 3)

# Most frequent n-grams first.
unigram <- uniGram[order(uniGram$Freq, decreasing = TRUE), ]
bigram <- biGram[order(biGram$Freq, decreasing = TRUE), ]
trigram <- triGram[order(triGram$Freq, decreasing = TRUE), ]

Plotting Unigrams, Bigrams, and Trigrams

# Draw a bar chart of the most frequent n-grams.
#
# freq_table: data frame sorted by decreasing frequency, with the n-gram in
#             column 1 and its count in column 2.
# title:      plot title.
# bar_col:    fill colour of the bars.
# top_n:      maximum number of n-grams to show (default 15).
plot_top_ngrams <- function(freq_table, title, bar_col, top_n = 15) {
  # head() returns fewer rows when the table is short, instead of the NA rows
  # that indexing with 1:15 would produce.
  top <- head(freq_table, top_n)
  barplot(
    top[, 2],
    names.arg = as.character(top[, 1]),
    col = bar_col,
    main = title,
    las = 2,
    ylab = "Frequency"
  )
}

# Three stacked panels. The bottom and left margins are wide enough for the
# rotated (las = 2) n-gram labels and the y-axis title, which do not fit in a
# one-line margin. The previous settings are saved so they can be restored.
old_par <- par(mfrow = c(3, 1), mar = c(7, 4, 2, 1))

plot_top_ngrams(unigram, "Unigrams (Top 15)", "blue")
plot_top_ngrams(bigram, "bigrams (Top 15)", "purple")
plot_top_ngrams(trigram, "trigrams (Top 15)", "red")

# Restore the graphics parameters changed above.
par(old_par)