This is a milestone report for the Data Science Capstone, part of the Johns Hopkins University Data Science Specialization on Coursera. In this report we perform tokenization, profanity filtering, and exploratory data analysis (EDA) on the provided English-language text corpora (blogs, news, and Twitter).
setwd("D:/R/Class/10Capstone")   # project working directory
unzip("./dataset.zip")           # extract the downloaded dataset
list.files("./final/en_US/")     # list the English-language files
## [1] "en_US.blogs.txt" "en_US.news.txt" "en_US.twitter.txt"
library(stringi)
library(tm)
## Loading required package: NLP
# Read each source file line by line; skipNul drops embedded NUL characters
conn <- file("./final/en_US/en_US.blogs.txt","r")
blogs <- readLines(conn,skipNul = TRUE)
close(conn)
# The news file is opened in binary mode ("rb"): it contains an embedded
# control character that can cut readLines() short in text mode
conn <- file("./final/en_US/en_US.news.txt","rb")
news <- readLines(conn,skipNul = TRUE)
close(conn)
conn <- file("./final/en_US/en_US.twitter.txt","r")
twitter <- readLines(conn,skipNul = TRUE)
close(conn)
# File sizes in megabytes
blogsize <- file.info("./final/en_US/en_US.blogs.txt")$size / 1024 ^ 2
blogsize
## [1] 200.4242
newssize <- file.info("./final/en_US/en_US.news.txt")$size / 1024 ^ 2
newssize
## [1] 196.2775
tweetsize <- file.info("./final/en_US/en_US.twitter.txt")$size / 1024 ^ 2
tweetsize
## [1] 159.3641
# Number of lines per source
bloglen <- length(blogs)
bloglen
## [1] 899288
newslen <- length(news)
newslen
## [1] 1010242
tweetlen <- length(twitter)
tweetlen
## [1] 2360148
# Words per line in each source, counted with stringi
wordsblog <- stri_count_words(blogs)
wordsnews <- stri_count_words(news)
wordstweets <- stri_count_words(twitter)
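The mean words per line (reported in the table below) can hide skew, so the full per-line distributions are also worth a glance. A quick sketch (output omitted):
summary(wordsblog)   # quartiles and extremes of words per line in the blogs
summary(wordstweets) # tweets are capped in length, so this distribution is narrow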
Creating a summary of the data (file size in MB, line count, word count, and mean words per line for each source):
sumryofdata <- data.frame(source = c("Blogs","NEWS","Tweets"),
                          size = c(blogsize,newssize,tweetsize),
                          NumberOfLines = c(bloglen,newslen,tweetlen),
                          NumberOfWords = c(sum(wordsblog),sum(wordsnews),sum(wordstweets)),
                          MeanOfNumberOfWordsPerLine = c(mean(wordsblog),mean(wordsnews),mean(wordstweets)))
sumryofdata
## source size NumberOfLines NumberOfWords MeanOfNumberOfWordsPerLine
## 1 Blogs 200.4242 899288 38154238 42.42716
## 2 NEWS 196.2775 1010242 35010782 34.65584
## 3 Tweets 159.3641 2360148 30218166 12.80350
The full corpora are large, so for the exploratory analysis we sample 1,000 lines from each of the three sources.
sample.data <- c(sample(blogs,1000),
                 sample(news,1000),
                 sample(twitter,1000))
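Note that sample() draws a different subset on every run. A reproducible variant of the sampling step, assuming an arbitrary seed of 1234, would be:
set.seed(1234)   # arbitrary seed; fixes the RNG so the same lines are drawn on every knit
sample.data <- c(sample(blogs,1000),
                 sample(news,1000),
                 sample(twitter,1000))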
Before performing EDA or any other processing, the data should be cleaned, and that includes filtering out profanity. We read badwords.txt to obtain the list of words to remove.
conn <- file("./badwords.txt","r")
badwords <- readLines(conn,skipNul = TRUE)
close(conn)
We build a corpus from the sample and perform the cleaning steps.
corpus <- VCorpus(VectorSource(sample.data))
print(corpus)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 3000
# Replace every character that is not a letter, digit, colon, or period with a space
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, toSpace, "[^A-Za-z0-9:\\.]")
# tolower is not a tm transformation, so wrap it in content_transformer();
# passing it bare would turn the documents into plain character vectors
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, badwords)        # profanity filtering
corpus <- tm_map(corpus, removeWords, stopwords("en"))
# Drop the short fragments left behind once apostrophes were replaced
# (e.g. "don't" became "don t")
corpus <- tm_map(corpus, removeWords, c("s","t","b","m","re"))
corpus <- tm_map(corpus, removePunctuation, preserve_intra_word_contractions = TRUE, preserve_intra_word_dashes = TRUE)
corpus <- tm_map(corpus, stripWhitespace)
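As a quick sanity check on the cleaning pipeline, the first few processed documents can be printed (a minimal sketch; output omitted):
for (i in 1:3) writeLines(as.character(corpus[[i]]))   # show three cleaned documents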
library(RWeka)
library(rJava)
library(wordcloud)
## Loading required package: RColorBrewer
One-Gram:
# Tokenize the cleaned corpus into single words and tabulate their frequencies
onegram <- NGramTokenizer(corpus,Weka_control(min = 1,max = 1))
oneGram <- data.frame(table(onegram))
head(oneGram)
## onegram Freq
## 1 aa 1
## 2 aaa 1
## 3 aaah 1
## 4 ab 1
## 5 abandoned 1
## 6 abby 1
oneGram <- oneGram[order(oneGram$Freq,decreasing = T),]   # sort by frequency
orderedonegram <- oneGram[1:60,]                          # keep the 60 most frequent terms
barplot(orderedonegram$Freq,names.arg = orderedonegram$onegram,cex.names=1,col = terrain.colors(60),las=2,main="One Gram")
wordcloud(oneGram$onegram,freq = oneGram$Freq,max.words = 50,random.order = F,colors=brewer.pal(4, "Set1"),scale = c(2,1))
Bi-Gram:
bigram <- NGramTokenizer(corpus,Weka_control(min=2,max=2))
biGram <- data.frame(table(bigram))
head(biGram)
## bigram Freq
## 1 aa min 1
## 2 aaa representative 1
## 3 aaah see 1
## 4 ab life 1
## 5 abandoned activism 1
## 6 abby wrapped 1
biGram <- biGram[order(biGram$Freq,decreasing = T),]
orderedbigram <- biGram[1:60,]
barplot(orderedbigram$Freq,names.arg = orderedbigram$bigram,col=terrain.colors(60),las=2,main="Bi-Gram")
wordcloud(biGram$bigram,freq = biGram$Freq,max.words = 50,random.order = F,colors = brewer.pal(8,"Set1"),scale = c(2,1))
Tri-Gram:
trigram <- NGramTokenizer(corpus,Weka_control(min=3,max=3))
triGram <- data.frame(table(trigram))
triGram <- triGram[order(triGram$Freq,decreasing = T),]
orderedtrigram <- triGram[1:60,]
barplot(orderedtrigram$Freq,names.arg = orderedtrigram$trigram,col=terrain.colors(60),las=2,main="Tri-Gram",cex.names = 0.7)
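For parity with the one- and bi-gram sections above, a tri-gram word cloud can be drawn with the same call pattern:
wordcloud(triGram$trigram,freq = triGram$Freq,max.words = 50,random.order = F,colors = brewer.pal(8,"Set1"),scale = c(2,1))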