## [1] "C/C/C/C/C/en_AU.UTF-8"
library(tm)
library(stringi)
library(R.utils)
library(quanteda)
library(readtext)
library(data.table)
library(dplyr)
library(SnowballC)
library(readr)
library(tidyr)
library(scales)
library(tidytext)
library(ggplot2)
library(ngram)
library(qdap)
library(ggthemes)
library(plotrix)
library(wordcloud)
library(RWeka)
library(textcat)
library(hunspell)
Create basic summary statistics about the data sets.
+ File sizes (MB):
floor(file.info("~/Documents/DS capstone/data/en_US.blogs.txt")$size/1000000)
## [1] 210
floor(file.info("~/Documents/DS capstone/data/en_US.twitter.txt")$size/1000000)
## [1] 167
floor(file.info("~/Documents/DS capstone/data/en_US.news.txt")$size/1000000)
## [1] 205
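For reference, the same size check can be gathered into one small table. This is only a sketch (the files vector name is mine; the paths are the ones used above):
# Sketch: collect the three file sizes (in MB) in a single data frame.
files <- c(blogs   = "~/Documents/DS capstone/data/en_US.blogs.txt",
           twitter = "~/Documents/DS capstone/data/en_US.twitter.txt",
           news    = "~/Documents/DS capstone/data/en_US.news.txt")
data.frame(file = names(files), size_mb = floor(file.info(files)$size / 1000000))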
blogs_lines <- readLines(con = "~/Documents/DS capstone/data/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
twitter_lines <- readLines(con = "~/Documents/DS capstone/data/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
news_lines <- readLines(con = "~/Documents/DS capstone/data/en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
+ Line counts:
length(blogs_lines)
## [1] 899288
length(twitter_lines)
## [1] 2360148
+ Combined character and word counts (sum of the stri_stats_latex statistics):
sum(stri_stats_latex(blogs_lines))
## [1] 242672204
sum(stri_stats_latex(twitter_lines))
## [1] 191984472
set.seed(250)
blogs_lines <- blogs_lines[sample(seq(length(blogs_lines)))]
twitter_lines <- twitter_lines[sample(seq(length(twitter_lines)))]
news_lines <- news_lines[sample(seq(length(news_lines)))]
b <- length(blogs_lines)
t <- length(twitter_lines)
n <- length(news_lines)
blog_sample <- blogs_lines[1:floor(b*0.01)]
twitter_sample <- twitter_lines[1:floor(t*0.01)]
news_sample <- news_lines[1:floor(n*0.01)]
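Shuffling and then slicing is equivalent to sampling directly; a more compact sketch of the same 1% draw is shown below (the exact lines selected will differ from the shuffle-then-slice result):
# Sketch: draw 1% of each file directly with sample().
set.seed(250)
blog_sample    <- sample(blogs_lines,   floor(length(blogs_lines)   * 0.01))
twitter_sample <- sample(twitter_lines, floor(length(twitter_lines) * 0.01))
news_sample    <- sample(news_lines,    floor(length(news_lines)    * 0.01))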
+ Remove non-UTF-8 characters and stray Unicode escape codes from the text (shown here for the blog sample):
blog_sample <- gsub("[^[:alnum:][:blank:]?&/\\-]", "", blog_sample) # keep letters, digits, blanks and a few punctuation characters
blog_sample <- gsub("U00..", "", blog_sample) # drop literal "U00xx" escape-code remnants
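An alternative clean-up, not used in the report but sketched here, is to let iconv() drop anything that cannot be represented in plain ASCII:
# Sketch: strip non-ASCII bytes with iconv(); sub = "" deletes unconvertible characters.
blog_sample    <- iconv(blog_sample,    from = "UTF-8", to = "ASCII", sub = "")
twitter_sample <- iconv(twitter_sample, from = "UTF-8", to = "ASCII", sub = "")
news_sample    <- iconv(news_sample,    from = "UTF-8", to = "ASCII", sub = "")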
+ Line and character counts of the 1% samples:
stri_stats_general(blog_sample)
## Lines LinesNEmpty Chars CharsNWhite
## 8992 8989 2030329 1661945
stri_stats_general(twitter_sample)
## Lines LinesNEmpty Chars CharsNWhite
## 23601 23601 1616652 1337230
stri_stats_general(news_sample)
## Lines LinesNEmpty Chars CharsNWhite
## 10102 10102 2041395 1706400
text_blogs_df <- data_frame(line = seq_along(blog_sample), text = blog_sample)
text.twitter_df <- data_frame(line = seq_along(twitter_sample), text = twitter_sample)
text_news_df <- data_frame(line = seq_along(news_sample), text = news_sample)
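The tidytext package loaded at the top is not otherwise used; as a sketch, the same data frames could also be tokenised the tidy way (blog_word_counts is a name of my choosing):
# Sketch: tidy word counts for the blog sample with tidytext.
blog_word_counts <- text_blogs_df %>%
  unnest_tokens(word, text) %>%          # one row per word
  anti_join(stop_words, by = "word") %>% # drop English stop words (tidytext's built-in list)
  count(word, sort = TRUE)
head(blog_word_counts)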
blog_words <- hunspell_find(blog_sample, format = "latex")
my.profiles <- TC_byte_profiles[names(TC_byte_profiles)]
my.profiles
## A textcat profile db of length 75.
blog_language <- textcat(blog_words,my.profiles)
sort(unique(unlist(blog_language)))
## [1] "afrikaans" "albanian" "basque"
## [4] "bosnian" "breton" "catalan"
## [7] "chinese-big5" "croatian-ascii" "czech-iso8859_2"
## [10] "danish" "dutch" "english"
## [13] "esperanto" "estonian" "finnish"
## [16] "french" "frisian" "german"
## [19] "hindi" "hungarian" "icelandic"
## [22] "indonesian" "irish" "italian"
## [25] "japanese-shift_jis" "latin" "latvian"
## [28] "lithuanian" "malay" "manx"
## [31] "middle_frisian" "mingo" "nepali"
## [34] "norwegian" "polish" "portuguese"
## [37] "quechua" "romanian" "rumantsch"
## [40] "sanskrit" "scots" "scots_gaelic"
## [43] "serbian-ascii" "slovak-ascii" "slovak-windows1250"
## [46] "slovenian-ascii" "slovenian-iso8859_2" "spanish"
## [49] "swahili" "swedish" "tagalog"
## [52] "turkish" "vietnamese" "welsh"
Using the hunspell package and its dictionaries, misspelled words can be detected and removed; however, this is a RAM-intensive exercise.
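A lighter-weight option, sketched here rather than run for this report, is to classify whole lines instead of single words and keep only the lines textcat labels as English:
# Sketch: drop lines that textcat does not classify as English.
# Whole lines give textcat more context than isolated words, so fewer false detections.
blog_lang   <- textcat(blog_sample, p = my.profiles)
blog_sample <- blog_sample[!is.na(blog_lang) & blog_lang == "english"]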
Turn the sampled text files into corpora for tm_map pre-processing.
blogs.corpus <- Corpus(VectorSource(text_blogs_df$text))     # VectorSource expects a character vector, so use the text column
twitter.corpus <- Corpus(VectorSource(text.twitter_df$text))
news.corpus <- Corpus(VectorSource(text_news_df$text))
my_custom_stopwords <- c("shit","fuck","damn","bitch","crap","dick","piss","darn","pussy","fag","cock","asshole","bastard","fag","douche","slut")
blogs.corpus <- tm_map(blogs.corpus, removePunctuation)                  # remove punctuation
blogs.corpus <- tm_map(blogs.corpus, removeNumbers)                      # remove numbers
blogs.corpus <- tm_map(blogs.corpus, content_transformer(tolower))       # convert the text to lower case
blogs.corpus <- tm_map(blogs.corpus, stripWhitespace)                    # remove extra whitespace
blogs.corpus <- tm_map(blogs.corpus, stemDocument, language = "english") # stemming: reduce words to their base or root form
blogs.corpus <- tm_map(blogs.corpus, removeWords, stopwords('english'))  # remove English stop words
blogs.corpus <- tm_map(blogs.corpus, removeWords, stopwords('danish'))   # remove stop words of other languages present in the sample
blogs.corpus <- tm_map(blogs.corpus, removeWords, stopwords('dutch'))
blogs.corpus <- tm_map(blogs.corpus, removeWords, stopwords('finnish'))
blogs.corpus <- tm_map(blogs.corpus, removeWords, stopwords('french'))
blogs.corpus <- tm_map(blogs.corpus, removeWords, stopwords('german'))
blogs.corpus <- tm_map(blogs.corpus, removeWords, stopwords('hungarian'))
blogs.corpus <- tm_map(blogs.corpus, removeWords, stopwords('italian'))
blogs.corpus <- tm_map(blogs.corpus, removeWords, stopwords('norwegian'))
blogs.corpus <- tm_map(blogs.corpus, removeWords, stopwords('portuguese'))
blogs.corpus <- tm_map(blogs.corpus, removeWords, stopwords('russian'))
blogs.corpus <- tm_map(blogs.corpus, removeWords, stopwords('spanish'))
blogs.corpus <- tm_map(blogs.corpus, removeWords, stopwords('swedish'))
blogs.corpus <- tm_map(blogs.corpus, removeWords, my_custom_stopwords)   # remove profanity
twitter.corpus <- tm_map(twitter.corpus, removePunctuation)                  # remove punctuation
twitter.corpus <- tm_map(twitter.corpus, removeNumbers)                      # remove numbers
twitter.corpus <- tm_map(twitter.corpus, content_transformer(tolower))       # convert the text to lower case
twitter.corpus <- tm_map(twitter.corpus, stripWhitespace)                    # remove extra whitespace
twitter.corpus <- tm_map(twitter.corpus, stemDocument, language = "english") # stemming: reduce words to their base or root form
twitter.corpus <- tm_map(twitter.corpus, removeWords, stopwords('english'))  # remove English stop words
twitter.corpus <- tm_map(twitter.corpus, removeWords, stopwords('danish'))   # remove stop words of other languages present in the sample
twitter.corpus <- tm_map(twitter.corpus, removeWords, stopwords('dutch'))
twitter.corpus <- tm_map(twitter.corpus, removeWords, stopwords('finnish'))
twitter.corpus <- tm_map(twitter.corpus, removeWords, stopwords('french'))
twitter.corpus <- tm_map(twitter.corpus, removeWords, stopwords('german'))
twitter.corpus <- tm_map(twitter.corpus, removeWords, stopwords('hungarian'))
twitter.corpus <- tm_map(twitter.corpus, removeWords, stopwords('italian'))
twitter.corpus <- tm_map(twitter.corpus, removeWords, stopwords('norwegian'))
twitter.corpus <- tm_map(twitter.corpus, removeWords, stopwords('portuguese'))
twitter.corpus <- tm_map(twitter.corpus, removeWords, stopwords('russian'))
twitter.corpus <- tm_map(twitter.corpus, removeWords, stopwords('spanish'))
twitter.corpus <- tm_map(twitter.corpus, removeWords, stopwords('swedish'))
twitter.corpus <- tm_map(twitter.corpus, removeWords, my_custom_stopwords)   # remove profanity
news.corpus <- tm_map(news.corpus, removePunctuation)                  # remove punctuation
news.corpus <- tm_map(news.corpus, removeNumbers)                      # remove numbers
news.corpus <- tm_map(news.corpus, content_transformer(tolower))       # convert the text to lower case
news.corpus <- tm_map(news.corpus, stripWhitespace)                    # remove extra whitespace
news.corpus <- tm_map(news.corpus, stemDocument, language = "english") # stemming: reduce words to their base or root form
news.corpus <- tm_map(news.corpus, removeWords, stopwords('english'))  # remove English stop words
news.corpus <- tm_map(news.corpus, removeWords, stopwords('danish'))   # remove stop words of other languages present in the sample
news.corpus <- tm_map(news.corpus, removeWords, stopwords('dutch'))
news.corpus <- tm_map(news.corpus, removeWords, stopwords('finnish'))
news.corpus <- tm_map(news.corpus, removeWords, stopwords('french'))
news.corpus <- tm_map(news.corpus, removeWords, stopwords('german'))
news.corpus <- tm_map(news.corpus, removeWords, stopwords('hungarian'))
news.corpus <- tm_map(news.corpus, removeWords, stopwords('italian'))
news.corpus <- tm_map(news.corpus, removeWords, stopwords('norwegian'))
news.corpus <- tm_map(news.corpus, removeWords, stopwords('portuguese'))
news.corpus <- tm_map(news.corpus, removeWords, stopwords('russian'))
news.corpus <- tm_map(news.corpus, removeWords, stopwords('spanish'))
news.corpus <- tm_map(news.corpus, removeWords, stopwords('swedish'))
news.corpus <- tm_map(news.corpus, removeWords, my_custom_stopwords)   # remove profanity
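The same cleaning pipeline is applied three times above; a small helper function, shown here only as a sketch (the name clean_corpus and its parameter names are mine), would keep the steps in one place:
# Sketch: apply the full cleaning pipeline to any corpus.
clean_corpus <- function(corpus,
                         extra_stopword_langs = c("danish", "dutch", "finnish", "french",
                                                  "german", "hungarian", "italian", "norwegian",
                                                  "portuguese", "russian", "spanish", "swedish"),
                         profanity = my_custom_stopwords) {
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, stripWhitespace)
  corpus <- tm_map(corpus, stemDocument, language = "english")
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  for (lang in extra_stopword_langs) {
    corpus <- tm_map(corpus, removeWords, stopwords(lang))
  }
  tm_map(corpus, removeWords, profanity)
}
# Usage: blogs.corpus <- clean_corpus(blogs.corpus)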
blogs_tdm<-TermDocumentMatrix(blogs.corpus,control=list(weighting=weightTf))
tdm.tblogs.m<-as.matrix(blogs_tdm)
twitter_tdm<-TermDocumentMatrix(twitter.corpus,control=list(weighting=weightTf))
tdm.twitter.m<-as.matrix(twitter_tdm)
news_tdm<-TermDocumentMatrix(news.corpus,control=list(weighting=weightTf))
tdm.news.m<-as.matrix(news_tdm)
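Converting a TermDocumentMatrix to a dense matrix with as.matrix() is memory-hungry; two lighter alternatives are sketched below, using slam (installed as a dependency of tm) and removeSparseTerms() (the object names are mine):
# Sketch: term frequencies straight from the sparse matrix.
blogs.freq.sparse <- sort(slam::row_sums(blogs_tdm), decreasing = TRUE)
head(blogs.freq.sparse, 10)
# Or drop very rare terms before densifying.
blogs_tdm_small <- removeSparseTerms(blogs_tdm, sparse = 0.999)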
blogs.term.freq<-rowSums(tdm.tblogs.m)
blogs.term.freq<-data.frame(word=names(blogs.term.freq),frequency=blogs.term.freq)
blogs.term.freq<-blogs.term.freq[order(blogs.term.freq[,2], decreasing=T),]
blogs.term.freq[1:10,]
## word frequency
## one one 1313
## like like 1091
## time time 1073
## just just 1071
## can can 1026
## get get 935
## make make 833
## year year 718
## know know 693
## day day 691
+ Visualising the top words used in the 1% sample of the blogs text file.
blogs.term.freq$word<-factor(blogs.term.freq$word,levels=unique(as.character(blogs.term.freq$word)))
ggplot(blogs.term.freq[1:10,], aes(x=word,y=frequency))+geom_bar(stat="identity",fill='darkred')+coord_flip()+theme_gdocs()+
geom_text(aes(label=frequency),
colour="white",hjust=1.25, size=5.0)
+ Visualising the top words used in the 1% sample of the Twitter text file.
twitter.term.freq<-rowSums(tdm.twitter.m)
twitter.term.freq<-data.frame(word=names(twitter.term.freq),frequency=twitter.term.freq)
twitter.term.freq<-twitter.term.freq[order(twitter.term.freq[,2], decreasing=T),]
twitter.term.freq[1:10,]
## word frequency
## get get 1518
## just just 1495
## like like 1322
## thank thank 1263
## love love 1252
## day day 1081
## good good 995
## know know 901
## dont dont 874
## can can 870
twitter.term.freq$word<-factor(twitter.term.freq$word,levels=unique(as.character(twitter.term.freq$word)))
ggplot(twitter.term.freq[1:10,], aes(x=word,y=frequency))+geom_bar(stat="identity",fill='darkgreen')+coord_flip()+theme_gdocs()+
geom_text(aes(label=frequency),
colour="white",hjust=1.25, size=5.0)
+ Top words used in the 1% sample of the news text file:
news.term.freq<-rowSums(tdm.news.m)
news.term.freq<-data.frame(word=names(news.term.freq),frequency=news.term.freq)
news.term.freq<-news.term.freq[order(news.term.freq[,2], decreasing=T),]
news.term.freq[1:10,]
## word frequency
## said said 2532
## year year 1006
## one one 855
## time time 673
## say say 670
## new new 664
## can can 632
## state state 630
## get get 588
## like like 571
+ Visualising the top words used in the 1% sample of the news text file.
news.term.freq$word<-factor(news.term.freq$word,levels=unique(as.character(news.term.freq$word)))
ggplot(news.term.freq[1:10,], aes(x=word,y=frequency))+geom_bar(stat="identity",fill='darkblue')+coord_flip()+theme_gdocs()+
geom_text(aes(label=frequency),
colour="white",hjust=1.25, size=5.0)
+ What is the distribution of word frequencies? (See the coverage sketch below.)
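This question is not answered elsewhere in the report; as a sketch, the blog frequency table already built can show how many unique (stemmed) words are needed to cover 50% and 90% of all word instances (the object names are mine):
# Sketch: cumulative coverage from the blog term-frequency table.
blog.freq.sorted <- sort(blogs.term.freq$frequency, decreasing = TRUE)
coverage <- cumsum(blog.freq.sorted) / sum(blog.freq.sorted)
which(coverage >= 0.5)[1] # unique words needed for 50% coverage
which(coverage >= 0.9)[1] # unique words needed for 90% coverage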
+ What are the frequencies of 2-grams?
+ Blogs
blogs_bigram <- NGramTokenizer(blogs.corpus, Weka_control(min = 2, max = 2, delimiters = " \\r\\n\\t.,;:\"()?!"))
blogs_bigram <- data.frame(table(blogs_bigram))
blogs_bigram <- blogs_bigram[order(blogs_bigram$Freq, decreasing = TRUE),]
names(blogs_bigram) <- c("words","freq")
head(blogs_bigram)
## words freq
## 84518 look like 73
## 39528 dont know 69
## 164487 year ago 62
## 78571 last night 53
## 78634 last year 53
## 50126 feel like 50
+ Twitter
twitter_bigram <- NGramTokenizer(twitter.corpus, Weka_control(min = 2, max = 2, delimiters = " \\r\\n\\t.,;:\"()?!"))
twitter_bigram <- data.frame(table(twitter_bigram))
twitter_bigram <- twitter_bigram[order(twitter_bigram$Freq, decreasing = TRUE),]
names(twitter_bigram) <- c("words","freq")
head(twitter_bigram)
## words freq
## 16669 cant wait 170
## 96823 right now 152
## 115244 thank follow 129
## 67459 look like 114
## 67397 look forward 112
## 62218 last night 110
+ News
news_bigram <- NGramTokenizer(news.corpus, Weka_control(min = 2, max = 2, delimiters = " \\r\\n\\t.,;:\"()?!"))
news_bigram <- data.frame(table(news_bigram))
news_bigram <- news_bigram[order(news_bigram$Freq, decreasing = TRUE),]
names(news_bigram) <- c("words","freq")
head(news_bigram)
## words freq
## 80162 last year 131
## 97848 new york 110
## 66904 high school 87
## 140958 st loui 84
## 80155 last week 76
## 168290 year ago 74
+ Visualising the frequencies of 2-grams
ggplot(head(blogs_bigram,10), aes(reorder(words,freq), freq)) +
geom_bar(stat = "identity") + coord_flip() +
xlab("Bigrams") + ylab("Frequency") +
ggtitle("Most Blogs frequent bigrams")
ggplot(head(twitter_bigram,10), aes(reorder(words,freq), freq)) +
geom_bar(stat = "identity") + coord_flip() +
xlab("Bigrams") + ylab("Frequency") +
ggtitle("Most twitterfrequent bigrams")
ggplot(head(news_bigram,10), aes(reorder(words,freq), freq)) +
geom_bar(stat = "identity") + coord_flip() +
xlab("Bigrams") + ylab("Frequency") +
ggtitle("Most News frequent bigrams")
+ What are the frequencies of 3-grams?
+ Blogs
blogs_trigram <- NGramTokenizer(blogs.corpus, Weka_control(min = 3, max = 3,delimiters = " \\r\\n\\t.,;:\"()?!"))
blogs_trigram <- data.frame(table(blogs_trigram))
blogs_trigram <- blogs_trigram[order(blogs_trigram$Freq,decreasing = TRUE),]
names(blogs_trigram) <- c("words","freq")
head(blogs_trigram)
## words freq
## 24780 carolina insur comapni 16
## 154641 south carolina insur 16
## 112837 none repeat scroll 15
## 138457 repeat scroll yellow 15
## 159942 stylebackground none repeat 15
## 17204 block block obj 8
+ Twitter
twitter_trigram <- NGramTokenizer(twitter.corpus, Weka_control(min = 3, max = 3, delimiters = " \\r\\n\\t.,;:\"()?!"))
twitter_trigram <- data.frame(table(twitter_trigram))
twitter_trigram <- twitter_trigram[order(twitter_trigram$Freq, decreasing = TRUE),]
names(twitter_trigram) <- c("words","freq")
head(twitter_trigram)
## words freq
## 59521 happi mother day 33
## 76791 let us know 25
## 19345 cant wait see 22
## 59531 happi new year 20
## 59433 happi happi happi 17
## 45143 follow follow back 13
+ News
news_trigram <- NGramTokenizer(news.corpus, Weka_control(min = 3, max = 3,delimiters = " \\r\\n\\t.,;:\"()?!"))
news_trigram <- data.frame(table(news_trigram))
news_trigram <- news_trigram[order(news_trigram$Freq,decreasing = TRUE),]
names(news_trigram) <- c("words","freq")
head(news_trigram)
## words freq
## 128439 presid barack obama 16
## 69184 gov chris christi 13
## 178005 two year ago 13
## 110017 new york citi 12
## 183518 w sunset blvd 10
## 165389 superior court judg 9
ggplot(head(blogs_trigram ,10), aes(reorder(words,freq), freq)) +
geom_bar(stat="identity") + coord_flip() +
xlab("Trigrams") + ylab("Frequency") +
ggtitle("Most frequent trigrams")
ggplot(head(twitter_trigram,10), aes(reorder(words,freq), freq)) +
geom_bar(stat="identity") + coord_flip() +
xlab("Trigrams") + ylab("Frequency") +
ggtitle("Most twitter frequent trigrams")
ggplot(head(news_trigram ,10), aes(reorder(words,freq), freq)) +
geom_bar(stat="identity") + coord_flip() +
xlab("Trigrams") + ylab("Frequency") +
ggtitle("Most news frequent trigrams")
Interesting findings
The text data sets contain a mix of languages, for example Finnish and Swedish. Removing these languages is tricky, both because I do not speak them and because stripping them out of the corpus is resource-intensive; I used the tm stopwords() lists for several languages to remove at least their most common words.
Next, I will also look at adding the top 25 nouns, verbs, adjectives and prepositions to increase the coverage of the prediction model.
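As a possible next step towards that model, sketched here only as an illustration (the helper names blogs_tri_split and suggest_next are mine), the trigram table built above can back a very simple next-word lookup:
# Sketch: naive next-word suggestion from the blog trigram table.
# Split each trigram into a two-word prefix and the word that followed it,
# then return the most frequent completions for a given prefix.
blogs_tri_split <- tidyr::separate(blogs_trigram, words, into = c("w1", "w2", "predict"), sep = " ")
suggest_next <- function(w1, w2, top = 3) {
  hits <- blogs_tri_split[blogs_tri_split$w1 == w1 & blogs_tri_split$w2 == w2, ]
  head(hits[order(hits$freq, decreasing = TRUE), "predict"], top)
}
# Usage (remember the corpus is stemmed): suggest_next("happi", "new")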