The Data Science Specialization Capstone requires students to demonstrate the ability to work with relatively unstructured text data. The first step is to download and read in the data. The code and methods used in this project are adapted from various parts of the specialization.
The data are available at this URL: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip

### Download the data
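The chunk below is a minimal sketch of how the archive could be downloaded and unpacked; the local file name and the check on the extracted final/ folder are assumptions for illustration, since the rest of the script assumes the data already sit under final/en_US.
## Download and unzip the corpus if it is not already present (sketch)
zipUrl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zipFile <- "Coursera-SwiftKey.zip"   ## assumed local file name
if (!dir.exists("final")) {
  download.file(zipUrl, destfile = zipFile, mode = "wb")
  unzip(zipFile)   ## extracts the final/ folder with its language subfolders
}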
## Set your working directory...
path<-paste(getwd(),"/final/en_US", sep="")
## and then load the filenames into memory.
files<-list.files(path)
## Here are the file names
print(files)
## [1] "en_US.blogs.txt" "en_US.news.txt" "en_US.twitter.txt"
## Load file paths into memory
blogPath <- paste(path,"/", files[1],sep="")
newsPath <- paste(path,"/", files[2],sep="")
twitPath <- paste(path,"/", files[3],sep="")
## First, read in all of the data, skipping embedded nul characters.
con <- file(blogPath, open="rb")
blog<-readLines(con, skipNul = TRUE, encoding = "UTF-8")
close(con)
con <- file(newsPath, open="rb")
news<-readLines(con, skipNul = TRUE, encoding = "UTF-8")
close(con)
con <- file(twitPath, open="rb")
twit<-readLines(con, skipNul = TRUE, encoding = "UTF-8")
close(con)
The next step is to obtain basic information about the data files (e.g., file sizes) and their contents (e.g., word counts).
## Get file sizes in Bytes
blogBytes <- file.info(blogPath)$size
newsBytes <- file.info(newsPath)$size
twitBytes <- file.info(twitPath)$size
## Convert bytes to mebibytes (MiB)
blogMB <- blogBytes / 1024 ^ 2
newsMB <- newsBytes / 1024 ^ 2
twitMB <- twitBytes / 1024 ^ 2
## Get the number of lines (the vectors are already in memory, so length() is
## simpler and avoids the quoting/comment quirks of count.fields())
Lines_blog <- length(blog)
Lines_news <- length(news)
Lines_twit <- length(twit)
## Get the number of words per line using sapply and gregexpr base functions
Words_blog<-sapply(gregexpr("[[:alpha:]]+", blog), function(x) sum(x > 0))
Words_news<-sapply(gregexpr("[[:alpha:]]+", news), function(x) sum(x > 0))
Words_twit<-sapply(gregexpr("[[:alpha:]]+", twit), function(x) sum(x > 0))
Words_blog_Sum<-sum(Words_blog)
Words_news_Sum<-sum(Words_news)
Words_twit_Sum<-sum(Words_twit)
## Get the character count (per line) for each data set
Char_blog<-nchar(blog, type = "chars")
Char_news<-nchar(news, type = "chars")
Char_twit<-nchar(twit, type = "chars")
## Sum the character counts to get the total number of characters
Char_blog_Sum<-sum(Char_blog)
Char_news_Sum<-sum(Char_news)
Char_twit_Sum<-sum(Char_twit)
The following data frame summarizes the information about the full datasets.
df<-data.frame(File=c("Blogs", "News", "Twitter"),
fileSize = c(blogMB, newsMB, twitMB),
lineCount = c(Lines_blog, Lines_news, Lines_twit),
wordCount = c(Words_blog_Sum, Words_news_Sum, Words_twit_Sum),
charCount = c(Char_blog_Sum,Char_news_Sum,Char_twit_Sum),
wordMean = c(mean(Words_blog), mean(Words_news), mean(Words_twit)),
charMean = c(mean(Char_blog), mean(Char_news), mean(Char_twit))
)
View(df)
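Note that View() only opens the data viewer in an interactive session; if this report is knitted, a call such as the one below (an optional alternative, not part of the original code) would render the same table in the output document.
## Optional alternative when knitting: render the summary table in the report
# knitr::kable(df, digits = 1)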
Now we compute the same statistics for a sample of the data. First, set the seed for reproducibility; then sample 10% of the lines from each dataset.
set.seed(2019)
## Sample 10% of the lines from each dataset, without replacement
blog10 <- sample(blog, size = length(blog) / 10, replace = FALSE)
news10 <- sample(news, size = length(news)/10, replace = FALSE)
twit10 <- sample(twit, size = length(twit) / 10, replace = FALSE)
The next few steps are the same as before, but this time for the samples.
blog10_MB <- format(object.size(blog10), standard = "IEC", units = "MiB")
news10_MB <- format(object.size(news10), standard = "IEC", units = "MiB")
twit10_MB <- format(object.size(twit10), standard = "IEC", units = "MiB")
## Get the number of lines
blog10Lines <- length(blog10)
news10Lines <- length(news10)
twit10Lines <- length(twit10)
## Get the number of words per line using sapply and gregexpr base functions
blog10Words<-sapply(gregexpr("[[:alpha:]]+", blog10), function(x) sum(x > 0))
news10Words<-sapply(gregexpr("[[:alpha:]]+", news10), function(x) sum(x > 0))
twit10Words<-sapply(gregexpr("[[:alpha:]]+", twit10), function(x) sum(x > 0))
## Sum the number of words in each line to get total words
blog10WordsSum<-sum(blog10Words)
news10WordsSum<-sum(news10Words)
twit10WordsSum<-sum(twit10Words)
## Get the character count (per line) for each data set
blog10Char<-nchar(blog10, type = "chars")
news10Char<-nchar(news10, type = "chars")
twit10Char<-nchar(twit10, type = "chars")
## Sum the character counts to get the total number of characters
blog10CharSum<-sum(blog10Char)
news10CharSum<-sum(news10Char)
twit10CharSum<-sum(twit10Char)
This is the second deliverable and summarizes information about the samples from the previous code chunk. It is worth checking that the line, word, and character counts are roughly one tenth of the corresponding values in the previous table, and that the per-line means are similar; if they are not, something may have gone wrong with the sampling.
df1 <- data.frame(File=c("Blogs Sample", "News Sample", "Twitter Sample"),
fileSize = c(blog10_MB, news10_MB, twit10_MB),
lineCount = c(blog10Lines, news10Lines, twit10Lines),
wordCount = c(blog10WordsSum, news10WordsSum, twit10WordsSum),
charCount = c(blog10CharSum,news10CharSum,twit10CharSum),
wordMean = c(mean(blog10Words), mean(news10Words), mean(twit10Words)),
charMean = c(mean(blog10Char), mean(news10Char), mean(twit10Char))
)
View(df1)
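As a quick sanity check (my own addition, not part of the original analysis), the sample statistics can be compared with the full-data table: the counts should be roughly one tenth of the full values and the per-line means should be close.
## Compare the sample to the full data: count ratios ~ 0.1, mean differences ~ 0
round(df1$lineCount / df$lineCount, 3)
round(df1$wordCount / df$wordCount, 3)
round(df1$wordMean - df$wordMean, 2)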
First, put all three dataset samples together. Then remove punctuation, extra whitespace, stop words, one-letter words, and symbols.

### Put all of the dataset samples together
library(tm)
## Loading required package: NLP
## Put all of the data samples together
#dat<- c(blog,news,twit)
dat10<- c(blog10,news10,twit10)
dat10NoPunc<- removePunctuation(dat10)
dat10NoWS<- stripWhitespace(dat10NoPunc)
dat10NoStop <- removeWords(dat10NoWS, stopwords("english"))
library(stringi)
dat10Lower <- stri_trans_tolower(dat10NoStop)
## Remove non-ASCII characters (the original pattern listed specific mis-encoded
## symbols and contained an unescaped "."; stripping everything outside printable
## ASCII covers the same cases)
dat10_nospecial <- gsub("[^\\x20-\\x7E]", "", dat10Lower, perl = TRUE)
dat10_nospecial2<- removePunctuation(dat10_nospecial)
dat10_nospecial3<- stripWhitespace(dat10_nospecial2)
## Remove single-letter words
dat10_nospecial4 <- removeWords(dat10_nospecial3, "\\b\\w{1}\\b")
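To confirm that the cleaning pipeline behaves as intended, it can help to compare a line before and after cleaning (an informal spot check, not part of the original script):
## Spot-check the first sampled line before and after cleaning
head(dat10, 1)
head(dat10_nospecial4, 1)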
### Tokenization

Put together lists of unigrams, bigrams, and trigrams using the ngram_tokenizer() script created by Maciej Szymkiewicz (zero323) on GitHub.
## Download the tokenizer script
download.file("https://raw.githubusercontent.com/zero323/r-snippets/master/R/ngram_tokenizer.R",
destfile = paste(getwd(),"/ngram_tokenizer.R", sep=""))
source("ngram_Tokenizer.R")
unigram_tokenizer <- ngram_tokenizer(1)
uniList <- unigram_tokenizer(dat10_nospecial4)
freqNames <- as.vector(names(table(unlist(uniList))))
freqCount <- as.numeric(table(unlist(uniList)))
dfUni <- data.frame(Word = freqNames,
Count = freqCount)
dfUniSort <- dfUni[order(-dfUni$Count), ]
bigram_tokenizer <- ngram_tokenizer(2)
biList <- bigram_tokenizer(dat10_nospecial4)
freqNames <- as.vector(names(table(unlist(biList))))
freqCount <- as.numeric(table(unlist(biList)))
dfBi <- data.frame(Word = freqNames,
Count = freqCount)
dfBiSort <- dfBi[order(-dfBi$Count), ]
trigram_tokenizer <- ngram_tokenizer(3)
triList <- trigram_tokenizer(dat10_nospecial4)
freqNames <- as.vector(names(table(unlist(triList))))
freqCount <- as.numeric(table(unlist(triList)))
dfTri <- data.frame(Word = freqNames,
Count = freqCount)
dfTriSort <- dfTri[order(-dfTri$Count), ]
After preparing the n-gram lists, let's visualize the data.

### Unigram histogram
par(mar = c(8,4,1,1) + 0.1, las = 2)
barplot(dfUniSort$Count[1:20], col = "blue",
        names.arg = dfUniSort$Word[1:20],
        space = 0.1,
        main = "Top 20 Unigrams by Frequency",
        cex.names = 1)
### Bigram histogram
par(mar = c(8,4,1,1) + 0.1, las = 2)
barplot(dfBiSort$Count[1:20], col = "green",
        names.arg = dfBiSort$Word[1:20],
        space = 0.1,
        main = "Top 20 Bigrams by Frequency",
        cex.names = 1)
### Trigram histogram
par(mar = c(8,4,1,1) + 0.1, las = 2)
barplot(dfTriSort$Count[1:20], col = "red",
        names.arg = dfTriSort$Word[1:20],
        space = 0.1,
        main = "Top 20 Trigrams by Frequency",
        cex.names = 1)
Based on the plots above, the data cleaning and tokenization steps appear effective, although they take a long time to run; their efficiency and speed still need to be improved.
For the Shiny app, I have come across some projects with functionality for hashtags in the Twitter data, predicting what may follow a hashtag. Hashtags by themselves are unigrams even when they represent multiple words (e.g., #HungryLikeAWolf), but they may be preceded by other words. The next step is to predict, interactively, the word that follows a hashtag.
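As a first step toward that predictor, the trigram table built above can be queried for the most frequent completions of a two-word prefix. The helper below is only a sketch of that idea (the function name and interface are mine, and a real app would need regex-safe matching, smoothing, and back-off):
## Sketch: most frequent third words following a two-word prefix, using dfTriSort
predictNext <- function(prefix, trigrams = dfTriSort, k = 3) {
  prefix  <- tolower(trimws(prefix))
  matches <- trigrams[grepl(paste0("^", prefix, " "), trigrams$Word), ]
  if (nrow(matches) == 0) return(character(0))
  ## take the last word of the top-k matching trigrams (already sorted by count)
  sapply(strsplit(head(as.character(matches$Word), k), " "), tail, 1)
}
## Example call: predictNext("one of") might return completions such as "the"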