We explore the following:
1. A basic report of dataset statistics: word counts, line counts and summary tables.
2. Major features of the data sets, shown with histograms and other graphs.
3. A plan for the prediction algorithm and the Shiny app.
if (!file.exists("./textData")) {
dir.create("./textData")
}
swifturl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
## if file is not already downloaded, download the file
if (!file.exists("./textData/Swiftkey Dataset.zip")) {
download.file(swifturl, destfile = "./textData/Swiftkey Dataset.zip")
}
## Unzip the file to the data directory if the directory does not yet exist
if (!file.exists("./textData/final")) {
unzip("./textData/Swiftkey Dataset.zip", exdir = "./textData")
}
rm(swifturl) # remove the URL object to free memory
library(dplyr,warn.conflicts = FALSE)
con_tweets <- file("./textData/final/en_US/en_US.twitter.txt", "r")
con_news   <- file("./textData/final/en_US/en_US.news.txt", "r")
con_blogs  <- file("./textData/final/en_US/en_US.blogs.txt", "r")
tweetdata<- readLines(con_tweets,encoding= "UTF-8",skipNul = TRUE);close(con_tweets)
newsdata <- readLines(con_news,encoding= "UTF-8",skipNul = TRUE);close(con_news)
blogdata <- readLines(con_blogs,encoding= "UTF-8",skipNul = TRUE);close(con_blogs)
library(stringi)
# number of lines in each file (in millions)
lt <-length(tweetdata)/1000000
ln <-length(newsdata)/1000000
lb <-length(blogdata)/1000000
# number of words in each file (in millions)
wt <- sum(stri_count(tweetdata,regex="\\S+"))/1000000
wn <- sum(stri_count(newsdata,regex="\\S+"))/1000000
wb <- sum(stri_count(blogdata,regex="\\S+"))/1000000
# number of characters in each file (in millions)
cht <-sum(nchar(tweetdata, type = "chars"))/1000000
chn <- sum(nchar(newsdata, type = "chars"))/1000000
chb <- sum(nchar(blogdata, type = "chars"))/1000000
# Size of the files in MB
size_MB <- c(file.info("./textData/final/en_US/en_US.twitter.txt")$size / 1024^2,
             file.info("./textData/final/en_US/en_US.news.txt")$size / 1024^2,
             file.info("./textData/final/en_US/en_US.blogs.txt")$size / 1024^2)
# Summary of the data sets
TextTable <- data.frame(source = c("en_US.twitter file", "en_US.news file", "en_US.blog file"),
                        Total_lines_in_million = c(lt, ln, lb),
                        Total_words_in_million = c(wt, wn, wb),
                        Total_characters_in_million = c(cht, chn, chb),
                        mean_num_of_words_per_line = round(c(wt/lt, wn/ln, wb/lb), 2),
                        size_MB)
print(TextTable)
## source Total_lines_in_million Total_words_in_million
## 1 en_US.twitter file 2.360148 30.37358
## 2 en_US.news file 1.010242 34.37253
## 3 en_US.blog file 0.899288 37.33413
## Total_characters_in_million mean_num_of_words_per_line size_MB
## 1 162.0962 12.87 159.3641
## 2 203.2232 34.02 196.2775
## 3 206.8245 41.52 200.4242
rm(TextTable) # remove the summary table object to free memory
# The twitter file contains more than 2 million lines and the blog file about 0.9 million lines. Lines in the blog file are considerably longer than those in the other files. Together the three files are over 550 MB, with the blog file accounting for roughly a third of the total.
# It is computationally expensive to explore the files at their full size, so we take a sample of 25,000 lines from each of the three files (75,000 lines in total) for the exploratory analysis. We use rbinom to randomly flag lines in each file and then draw the sample from the flagged lines.
set.seed(201911)
# Select 25,000 lines from each of the three files using a binomial inclusion flag
blogsample  <- rbinom(length(blogdata),  1, 0.5) == 1
newssample  <- rbinom(length(newsdata),  1, 0.5) == 1
tweetsample <- rbinom(length(tweetdata), 1, 0.5) == 1
text_directory <- c(sample(blogdata[blogsample],   25000, replace = FALSE),
                    sample(newsdata[newssample],   25000, replace = FALSE),
                    sample(tweetdata[tweetsample], 25000, replace = FALSE)) # combined text sample
# sample sentences from the blogs
text_directory[1]
## [1] "THE SHAMROCK TSUNAMI"
# sample sentences from the news
text_directory[25001]
## [1] "This is fuzzy, to say the least. If the jury doesn't believe that Bonds perjured himself, then what, pray tell, did he obstruct?"
# sample sentences from the tweets
text_directory[75000]
## [1] "All Hail the King! with an EPIC 39 Save Shutout!!!"
# Write the combined sample to disk and read it back in
con <- file("textsample.txt", "w", encoding = "UTF-8")
cat(text_directory, file = con, sep = "\n"); close(con)
con <- file("textsample.txt", encoding = "UTF-8")
textsample <- readLines(con); close(con)
set.seed(201911)
library(NLP)
library(tm)# tm package in R for text data mining
corp <- VCorpus(VectorSource(textsample), readerControl = list(reader = readPlain, language = "en"))
corp <- tm_map(corp, content_transformer(tolower))                 # lower-case all text
reg_exp <- content_transformer(function(x, pattern) gsub(pattern, replacement = " ", x))
corp <- tm_map(corp, reg_exp, "[[:alnum:]]+\\@[[:alpha:]]+\\.com") # drop e-mail addresses
corp <- tm_map(corp, reg_exp, "[^a-z\\s]+")                        # keep only letters and whitespace
corp <- tm_map(corp, reg_exp, "\\(.*?\\)")                         # drop any remaining parenthesised text
corp <- tm_map(corp, reg_exp, "[[:punct:]]+")                      # drop remaining punctuation
library(stopwords,warn.conflicts = FALSE)
set.seed(201911)
corp<- tm_map(corp,removeWords,stopwords(language = "en",source = "smart"))
con <- file("./bad-words.txt", "r") # local profanity word list
profanityWords <- readLines(con)
close(con = con)
corp<- tm_map(corp,removeWords,profanityWords)
rm(profanityWords)
options(mc.cores=1)
corp<-tm_map(corp,removeNumbers)
corp<-tm_map(corp,stripWhitespace)
corp<-tm_map(corp,PlainTextDocument)
tdm <- TermDocumentMatrix(corp)
Onegram <- removeSparseTerms(tdm, 0.99999) # unigram matrix; no custom tokenizer is needed for one-gram terms
library(RColorBrewer)
library(wordcloud)
v <- sort(rowSums(as.matrix(Onegram)),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
pal <-brewer.pal(9,'Spectral')
wordcloud(d$word, d$freq, scale = c(2, .9), max.words = 100, random.order = FALSE,
          rot.per = .25, use.r.layout = FALSE, colors = pal)
# Tokenizing words of the Term Document Matrix (a sketch of an n-gram tokenizer is shown below)
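# A minimal sketch of how higher-order n-grams could be tokenized from the same corpus,
# assuming the `corp` object built above; this is an illustration only, not necessarily
# the tokenizer used in the following analysis.
BigramTokenizer <- function(x)
  unlist(lapply(NLP::ngrams(NLP::words(x), 2), paste, collapse = " "), use.names = FALSE)
tdm2 <- TermDocumentMatrix(corp, control = list(tokenize = BigramTokenizer)) # bigram counts (illustration)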
# Here is a histogram of the 30 most common unigrams in the data sample.
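# A minimal sketch of that plot, assuming the frequency data frame `d` built above
# (columns `word` and `freq`); ggplot2 is used here for illustration.
library(ggplot2)
top30 <- head(d[order(-d$freq), ], 30)                    # 30 most frequent unigrams
ggplot(top30, aes(x = reorder(word, freq), y = freq)) +
  geom_col(fill = "steelblue") +
  coord_flip() +                                          # horizontal bars for readability
  labs(x = "Unigram", y = "Frequency", title = "Top 30 unigrams in the sample")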