We perform exploratory data analysis on the dataset, looking at file sizes, word counts, line counts, and simple unigram, bigram, and trigram frequencies.
suppressMessages(library(dplyr))
suppressMessages(library(ggplot2))
suppressMessages(library(tm))
## Warning: package 'tm' was built under R version 4.0.5
suppressMessages(library(wordcloud))
## Warning: package 'wordcloud' was built under R version 4.0.5
suppressMessages(library(RWeka))
## Warning: package 'RWeka' was built under R version 4.0.5
suppressMessages(library(stringi))
# setup: download and unzip the raw data if it is not already present
setwd("C:/Users/Valor/Desktop/Data Science/Capstone")
furl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!file.exists("./Coursera-SwiftKey.zip")) {
  download.file(furl, destfile = "./Coursera-SwiftKey.zip", mode = "wb")
  unzip("Coursera-SwiftKey.zip")
}
# import the three English-language source files
blogs <- readLines("./final/en_US/en_US.blogs.txt", encoding = "UTF-8")
news <- readLines("./final/en_US/en_US.news.txt", encoding = "UTF-8")
## Warning in readLines("./final/en_US/en_US.news.txt", encoding = "UTF-8"):
## incomplete final line found on './final/en_US/en_US.news.txt'
twit <- readLines("./final/en_US/en_US.twitter.txt", encoding = "UTF-8")
## Warning in readLines("./final/en_US/en_US.twitter.txt", encoding = "UTF-8"):
## line 167155 appears to contain an embedded nul
## Warning in readLines("./final/en_US/en_US.twitter.txt", encoding = "UTF-8"):
## line 268547 appears to contain an embedded nul
## Warning in readLines("./final/en_US/en_US.twitter.txt", encoding = "UTF-8"):
## line 1274086 appears to contain an embedded nul
## Warning in readLines("./final/en_US/en_US.twitter.txt", encoding = "UTF-8"):
## line 1759032 appears to contain an embedded nul
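The embedded-nul warnings come from a few stray nul bytes in the Twitter file and are harmless here. If the warnings are a concern, the file can be re-read with skipNul = TRUE (an alternative, not run in this report):

# alternative read that silently drops embedded nul characters instead of warning
twit <- readLines("./final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)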
summary <- data.frame(textfile = c("blogs", "news", "twitter"))
# file size in MB
summary$filesize[1] <- file.info("final/en_US/en_US.blogs.txt")$size / 1024 ^ 2
summary$filesize[2] <- file.info("final/en_US/en_US.news.txt")$size / 1024 ^ 2
summary$filesize[3] <- file.info("final/en_US/en_US.twitter.txt")$size / 1024 ^ 2
# total word count per file (stringi word counter)
summary$wordcnt[1] <- sum(stri_count_words(blogs))
summary$wordcnt[2] <- sum(stri_count_words(news))
summary$wordcnt[3] <- sum(stri_count_words(twit))
# number of lines per file
summary$linecnt[1] <- length(blogs)
summary$linecnt[2] <- length(news)
summary$linecnt[3] <- length(twit)
summary
## textfile filesize wordcnt linecnt
## 1 blogs NA 37546239 899288
## 2 news NA 2674536 77259
## 3 twitter NA 30093372 2360148
The files are large, so we take a random sample of 10,000 lines from each source to keep the remaining processing fast and manageable.
set.seed(123)
subblogs <- sample(blogs, 10000)
subnews <- sample(news, 10000)
subtwit <- sample(twit, 10000)
sampleData <- c(subblogs, subnews, subtwit)
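A variant worth noting (sketched here, not the approach used above) is to sample a fixed fraction of each source so the sample size scales with file size; pct and sampleDataPct are illustrative names introduced only for this example:

# hypothetical variant: sample roughly 1% of each source instead of a fixed 10,000 lines
pct <- 0.01
sampleDataPct <- c(sample(blogs, floor(length(blogs) * pct)),
                   sample(news, floor(length(news) * pct)),
                   sample(twit, floor(length(twit) * pct)))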
Before building the corpus we remove URLs, Twitter handles (@mentions), punctuation, numbers, and English stopwords so that the n-gram analysis is more meaningful.
corpus <- VCorpus(VectorSource(sampleData))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+", lazy = TRUE) # remove URLs
corpus <- tm_map(corpus, toSpace, "@[^\\s]+", lazy = TRUE) # remove @mentions
corpus <- tm_map(corpus, content_transformer(tolower)) # lowercase; content_transformer keeps the corpus structure intact
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, PlainTextDocument)
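As a quick sanity check (not part of the original output), the first cleaned document can be printed to confirm that lowercasing, stopword removal, and whitespace stripping behaved as expected:

# print the content of the first cleaned document
writeLines(as.character(corpus[[1]]))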
wordcloud(corpus, min.freq = 20, max.words = 50, random.order = TRUE,
          rot.per = 0.5, use.r.layout = FALSE, colors = brewer.pal(12, "Set3"))
corpus.df <- data.frame(text = unlist(sapply(corpus, '[', 'content')), stringsAsFactors = FALSE)
# tokenize the cleaned text into 1-, 2-, and 3-grams and tabulate their frequencies
uniGram <- data.frame(table(NGramTokenizer(corpus.df$text, Weka_control(min = 1, max = 1))))
biGram <- data.frame(table(NGramTokenizer(corpus.df$text, Weka_control(min = 2, max = 2))))
triGram <- data.frame(table(NGramTokenizer(corpus.df$text, Weka_control(min = 3, max = 3))))
# sort each table by frequency, most frequent first
unigram <- uniGram[order(uniGram$Freq, decreasing = TRUE), ]
bigram <- biGram[order(biGram$Freq, decreasing = TRUE), ]
trigram <- triGram[order(triGram$Freq, decreasing = TRUE), ]
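The sorted tables can also be inspected directly before plotting, for example the ten most frequent bigrams (a quick check; output not shown here):

# first column is the n-gram, second its frequency in the sample
head(bigram, 10)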
par(mfrow = c(3, 1))
par(mar = c(1, 1, 1, 1))
barplot(unigram[1:15, 2],
        names.arg = unigram[1:15, 1],
        col = "blue",
        main = "Unigrams (Top 15)",
        las = 2,
        ylab = "Frequency")
barplot(bigram[1:15, 2],
        names.arg = bigram[1:15, 1],
        col = "purple",
        main = "Bigrams (Top 15)",
        las = 2,
        ylab = "Frequency")
barplot(trigram[1:15, 2],
        names.arg = trigram[1:15, 1],
        col = "red",
        main = "Trigrams (Top 15)",
        las = 2,
        ylab = "Frequency")