## [1] "C/C/C/C/C/en_AU.UTF-8"
library(tm)
library(stringi)
library(R.utils)
library(quanteda)
library(readtext)
library(data.table)
library(dplyr)
library(SnowballC)
library(readr)
library(tidyr)
library(scales)
library(tidytext)
library(ggplot2) 
library(ngram)
library(qdap)
library(ggthemes)
library(plotrix)
library(wordcloud)
library(RWeka)
library(textcat)
library(hunspell)

1: Introduction:

The purpose of this report is to provide an exploratory analysis of unstructured text files for the Capstone project of Coursera's Johns Hopkins Data Science Specialization. The unstructured text files are provided by SwiftKey and contain a mix of languages.
A Basic Workflow – How I am approaching this assignment:
  1. Define the problem and specific goals: We are presented with three large text files containing unstructured text data.
  • Goal: Exploratory analysis of the data, understanding the distribution of words and the relationships between words in the corpora.
  • Demonstrate that the data has been downloaded and successfully loaded – refer to 3. Organize the text (preprocessing).
  • Create basic summary statistics about the data sets.

  • Summary statistics – file sizes (MB):

floor(file.info("~/Documents/DS capstone/data/en_US.blogs.txt")$size/1000000)
## [1] 210
floor(file.info("~/Documents/DS capstone/data/en_US.twitter.txt")$size/1000000) 
## [1] 167
floor(file.info("~/Documents/DS capstone/data/en_US.news.txt")$size/1000000) 
## [1] 205
  • Line counts:
length(blogs_lines)
## [1] 899288
length(twitter_lines)
## [1] 2360148
length(news_lines)
  • Word count for each file (stringi package):
sum(stri_stats_latex(blogs_lines))
## [1] 242672204
sum(stri_stats_latex(twitter_lines))
## [1] 191984472
sum(stri_stats_latex(news_lines))
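Note that stri_stats_latex() returns several counters (CharsWord, CharsWhite, Words, Cmds, ...), so summing the whole vector mixes character and word counts and overstates the totals. A hedged alternative that reports words only:

stri_stats_latex(blogs_lines)["Words"]
stri_stats_latex(twitter_lines)["Words"]
stri_stats_latex(news_lines)["Words"]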
  2. Identify the text that needs to be collected.
  • This has been provided by SwiftKey.
  3. Organize the text (preprocessing).
  • Loading the files into R:
blogs_lines <- readLines(con = "~/Documents/DS capstone/data/en_US.blogs.txt", encoding= "UTF-8", skipNul = T)
twitter_lines <- readLines(con = "~/Documents/DS capstone/data/en_US.twitter.txt", encoding= "UTF-8", skipNul = T)
news_lines <- readLines(con = "~/Documents/DS capstone/data/en_US.news.txt", encoding= "UTF-8", skipNul = T)
  • Randomly sampling 1% of each data file:
set.seed(250)
blogs_lines <- blogs_lines[sample(seq(length(blogs_lines)))]
twitter_lines <- twitter_lines[sample(seq(length(twitter_lines)))]
news_lines <- news_lines[sample(seq(length(news_lines)))]
b <- length(blogs_lines)
t <- length(twitter_lines)
n <- length(news_lines)
blog_sample <- blogs_lines[1:floor(b*0.01)]
twitter_sample <- twitter_lines[1:floor(t*0.01)]
news_sample <- news_lines[1:floor(n*0.01)]
  • Note: I originally tried 20%, 10%, 5% and 3% samples, but each used too much RAM at various stages of creating this report; a 1% sample of each text file could be converted into a corpus. A reusable sampling helper is sketched below.
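A minimal sketch of the same sampling step as a reusable helper (the function name is my own); it draws a fraction p of the lines in one call instead of shuffling and then slicing:

sample_lines <- function(lines, p = 0.01) {
  lines[sample(length(lines), floor(length(lines) * p))]
}
# e.g. blog_sample <- sample_lines(blogs_lines, 0.01)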
blog_sample <- gsub("[^[:alnum:][:blank:]?&/\\-]", "", blog_sample) #- 
blog_sample  <- gsub("U00..", "", blog_sample)

+ Remove non-UTF-8 characters from the text.

  • Creating a data frame to organize the text for analysing word and document frequency.
## Line counts
stri_stats_general(blog_sample)
##       Lines LinesNEmpty       Chars CharsNWhite 
##        8992        8989     2030329     1661945
stri_stats_general(twitter_sample)
##       Lines LinesNEmpty       Chars CharsNWhite 
##       23601       23601     1616652     1337230
stri_stats_general(news_sample)
##       Lines LinesNEmpty       Chars CharsNWhite 
##       10102       10102     2041395     1706400
  • Note: using stri_stats_general to obtain the line counts for the data frames.
text_blogs_df <- data_frame(line = 1:8992, text = blog_sample)
text.twitter_df  <- data_frame(line = 1:23601, text = twitter_sample)
text_news_df  <- data_frame(line = 1:10102, text = news_sample)
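A hedged alternative to hard-coding the line counts above: seq_along() keeps the data frames correct if the sample fraction changes, and tibble() is the current replacement for the soft-deprecated data_frame():

text_blogs_df   <- dplyr::tibble(line = seq_along(blog_sample),    text = blog_sample)
text.twitter_df <- dplyr::tibble(line = seq_along(twitter_sample), text = twitter_sample)
text_news_df    <- dplyr::tibble(line = seq_along(news_sample),    text = news_sample)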
  • How do you evaluate how many of the words come from foreign languages?
  • Using a combination of the hunspell and textcat packages, we can obtain a list of words that are misspelt or not English.
  • There is no hard-and-fast way to accomplish this, due to substantial overlap among languages.

    We can use stopword lists to remove some of these languages; I have applied them to all of the corpora.

blog_words <- hunspell_find(blog_sample, format = "latex") # words not found in the English dictionary
my.profiles <- TC_byte_profiles[names(TC_byte_profiles)] # textcat's built-in byte profiles
my.profiles 
## A textcat profile db of length 75.
blog_language <- textcat(blog_words, my.profiles) # guess a language for each flagged word
sort(unique(unlist(blog_language)))
##  [1] "afrikaans"           "albanian"            "basque"             
##  [4] "bosnian"             "breton"              "catalan"            
##  [7] "chinese-big5"        "croatian-ascii"      "czech-iso8859_2"    
## [10] "danish"              "dutch"               "english"            
## [13] "esperanto"           "estonian"            "finnish"            
## [16] "french"              "frisian"             "german"             
## [19] "hindi"               "hungarian"           "icelandic"          
## [22] "indonesian"          "irish"               "italian"            
## [25] "japanese-shift_jis"  "latin"               "latvian"            
## [28] "lithuanian"          "malay"               "manx"               
## [31] "middle_frisian"      "mingo"               "nepali"             
## [34] "norwegian"           "polish"              "portuguese"         
## [37] "quechua"             "romanian"            "rumantsch"          
## [40] "sanskrit"            "scots"               "scots_gaelic"       
## [43] "serbian-ascii"       "slovak-ascii"        "slovak-windows1250" 
## [46] "slovenian-ascii"     "slovenian-iso8859_2" "spanish"            
## [49] "swahili"             "swedish"             "tagalog"            
## [52] "turkish"             "vietnamese"          "welsh"
  • Checking for misspelt words.
  • Using the hunspell package and its dictionaries, misspelt words can be detected and removed; however, this is a RAM-intensive exercise, so only a small spot check is sketched below.
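A minimal spot check, kept small to limit RAM (the object names are my own): run hunspell() on the first few hundred sampled blog lines and look at the most frequently flagged words.

misspelt <- unlist(hunspell(head(blog_sample, 500)))
head(sort(table(misspelt), decreasing = TRUE), 10)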

  • Turn the sampled text files into corpora for tm_map pre-processing.

blogs.corpus <- Corpus(VectorSource(text_blogs_df))
twitter.corpus <- Corpus(VectorSource(text.twitter_df))
news.corpus <- Corpus(VectorSource(text_news_df))
  • Extract the most common American swear words used on Facebook in 2013 and create a custom stopword list.
my_custom_stopwords <- c("shit","fuck","damn","bitch","crap","dick","piss","darn","pussy","fag","cock","asshole","bastard","fag","douche","slut")
  • Use the tm package:
blogs.corpus <- tm_map(blogs.corpus, removePunctuation) #Removal of punctuation
blogs.corpus <- tm_map(blogs.corpus, removeNumbers) #Removal of numbers
blogs.corpus <- tm_map(blogs.corpus, tolower) #Changing the text to lower case
blogs.corpus <- tm_map(blogs.corpus, stripWhitespace) #Removal of extra whitespaces
blogs.corpus <- tm_map(blogs.corpus, stemDocument, language = "english") #Stemming is the procedure of converting words to their base or root form
blogs.corpus <- tm_map(blogs.corpus, removeWords, stopwords('english')) #Removal of stop words
blogs.corpus <- tm_map(blogs.corpus, removeWords, stopwords('danish')) 
blogs.corpus <- tm_map(blogs.corpus, removeWords, stopwords('dutch')) 
blogs.corpus <- tm_map(blogs.corpus, removeWords, stopwords('finnish')) 
blogs.corpus <- tm_map(blogs.corpus, removeWords, stopwords('french')) 
blogs.corpus <- tm_map(blogs.corpus, removeWords, stopwords('german')) 
blogs.corpus <- tm_map(blogs.corpus, removeWords, stopwords('hungarian')) 
blogs.corpus <- tm_map(blogs.corpus, removeWords, stopwords('italian')) 
blogs.corpus <- tm_map(blogs.corpus, removeWords, stopwords('norwegian')) 
blogs.corpus <- tm_map(blogs.corpus, removeWords, stopwords('portuguese')) 
blogs.corpus <- tm_map(blogs.corpus, removeWords, stopwords('russian'))
blogs.corpus <- tm_map(blogs.corpus, removeWords, stopwords('spanish'))
blogs.corpus <- tm_map(blogs.corpus, removeWords, stopwords('swedish'))
blogs.corpus <- tm_map(blogs.corpus, removeWords, my_custom_stopwords) #Removal of the custom profanity list

twitter.corpus <- tm_map(twitter.corpus, removePunctuation) #Removal of punctuation
twitter.corpus <- tm_map(twitter.corpus, removeNumbers) #Removal of numbers
twitter.corpus <- tm_map(twitter.corpus, tolower) #Changing the text to lower case
twitter.corpus <- tm_map(twitter.corpus, stripWhitespace) #Removal of extra whitespaces
twitter.corpus <- tm_map(twitter.corpus, stemDocument, language = "english") #Stemming is the procedure of converting words to their base or root form
twitter.corpus <- tm_map(twitter.corpus, removeWords, stopwords('english')) #Removal of stop words
twitter.corpus <- tm_map(twitter.corpus, removeWords, stopwords('danish')) 
twitter.corpus <- tm_map(twitter.corpus, removeWords, stopwords('dutch')) 
twitter.corpus <- tm_map(twitter.corpus, removeWords, stopwords('finnish')) 
twitter.corpus <- tm_map(twitter.corpus, removeWords, stopwords('french')) 
twitter.corpus <- tm_map(twitter.corpus, removeWords, stopwords('german')) 
twitter.corpus <- tm_map(twitter.corpus, removeWords, stopwords('hungarian')) 
twitter.corpus <- tm_map(twitter.corpus, removeWords, stopwords('italian')) 
twitter.corpus <- tm_map(twitter.corpus, removeWords, stopwords('norwegian')) 
twitter.corpus <- tm_map(twitter.corpus, removeWords, stopwords('portuguese')) 
twitter.corpus <- tm_map(twitter.corpus, removeWords, stopwords('russian'))
twitter.corpus <- tm_map(twitter.corpus, removeWords, stopwords('spanish'))
twitter.corpus <- tm_map(twitter.corpus, removeWords, stopwords('swedish'))
twitter.corpus <- tm_map(twitter.corpus, removeWords, my_custom_stopwords) #Removal of the custom profanity list

news.corpus <- tm_map(news.corpus, removePunctuation) #Removal of punctuation
news.corpus <- tm_map(news.corpus, removeNumbers) #Removal of numbers
news.corpus <- tm_map(news.corpus, tolower) #Changing the text to lower case
news.corpus <- tm_map(news.corpus, stripWhitespace) #Removal of extra whitespaces
news.corpus <- tm_map(news.corpus, stemDocument, language = "english") #Stemming is the procedure of converting words to their base or root form
news.corpus <- tm_map(news.corpus, removeWords, stopwords('english')) #Removal of stop words
news.corpus <- tm_map(news.corpus, removeWords, stopwords('danish')) 
news.corpus <- tm_map(news.corpus, removeWords, stopwords('dutch')) 
news.corpus <- tm_map(news.corpus, removeWords, stopwords('finnish')) 
news.corpus <- tm_map(news.corpus, removeWords, stopwords('french')) 
news.corpus <- tm_map(news.corpus, removeWords, stopwords('german')) 
news.corpus <- tm_map(news.corpus, removeWords, stopwords('hungarian')) 
news.corpus <- tm_map(news.corpus, removeWords, stopwords('italian')) 
news.corpus <- tm_map(news.corpus, removeWords, stopwords('norwegian')) 
news.corpus <- tm_map(news.corpus, removeWords, stopwords('portuguese')) 
news.corpus <- tm_map(news.corpus, removeWords, stopwords('russian'))
news.corpus <- tm_map(news.corpus, removeWords, stopwords('spanish'))
news.corpus <- tm_map(news.corpus, removeWords, stopwords('swedish'))
news.corpus <- tm_map(news.corpus, removeWords, my_custom_stopwords) #Removal of the custom profanity list
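The same cleaning steps are applied to all three corpora above; here is a minimal sketch of a single helper (the function name is my own) that wraps them. It removes stopwords before stemming, since stemmed tokens such as "becaus" no longer match the unstemmed stopword lists:

clean_corpus <- function(corp,
                         languages = c("english", "danish", "dutch", "finnish",
                                       "french", "german", "hungarian", "italian",
                                       "norwegian", "portuguese", "russian",
                                       "spanish", "swedish"),
                         extra_stopwords = my_custom_stopwords) {
  corp <- tm_map(corp, removePunctuation)
  corp <- tm_map(corp, removeNumbers)
  corp <- tm_map(corp, content_transformer(tolower))
  corp <- tm_map(corp, stripWhitespace)
  for (lang in languages) corp <- tm_map(corp, removeWords, stopwords(lang))
  corp <- tm_map(corp, removeWords, extra_stopwords)
  tm_map(corp, stemDocument, language = "english")
}
# e.g. blogs.corpus <- clean_corpus(Corpus(VectorSource(text_blogs_df)))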
  • Create a Term Document Matrix for each corpus. This will be used to obtain the word and document frequencies.
blogs_tdm<-TermDocumentMatrix(blogs.corpus,control=list(weighting=weightTf))
tdm.tblogs.m<-as.matrix(blogs_tdm)
twitter_tdm<-TermDocumentMatrix(twitter.corpus,control=list(weighting=weightTf))
tdm.twitter.m<-as.matrix(twitter_tdm)
news_tdm<-TermDocumentMatrix(news.corpus,control=list(weighting=weightTf))
tdm.news.m<-as.matrix(news_tdm)
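A hedged RAM note: as.matrix() densifies the whole Term Document Matrix, which is one of the memory bottlenecks mentioned earlier. slam::row_sums() (slam is installed as a tm dependency) computes the same per-term totals directly on the sparse matrix:

head(sort(slam::row_sums(blogs_tdm), decreasing = TRUE), 10)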
  4. Extract features of the sample corpora to answer the questions below:
  • Some words are more frequent than others
blogs.term.freq<-rowSums(tdm.tblogs.m)
blogs.term.freq<-data.frame(word=names(blogs.term.freq),frequency=blogs.term.freq)
blogs.term.freq<-blogs.term.freq[order(blogs.term.freq[,2], decreasing=T),]
blogs.term.freq[1:10,]
##      word frequency
## one   one      1313
## like like      1091
## time time      1073
## just just      1071
## can   can      1026
## get   get       935
## make make       833
## year year       718
## know know       693
## day   day       691

+ Visualising the top words used in the 1% sample of the blogs text file.

blogs.term.freq$word<-factor(blogs.term.freq$word,levels=unique(as.character(blogs.term.freq$word)))
ggplot(blogs.term.freq[1:10,], aes(x=word,y=frequency))+geom_bar(stat="identity",fill='darkred')+coord_flip()+theme_gdocs()+
  geom_text(aes(label=frequency),
            colour="white",hjust=1.25, size=5.0)

+ Visualising the top words used in the 1% sample of the Twitter text file.

twitter.term.freq<-rowSums(tdm.twitter.m)
twitter.term.freq<-data.frame(word=names(twitter.term.freq),frequency=twitter.term.freq)
twitter.term.freq<-twitter.term.freq[order(twitter.term.freq[,2], decreasing=T),]
twitter.term.freq[1:10,]
##        word frequency
## get     get      1518
## just   just      1495
## like   like      1322
## thank thank      1263
## love   love      1252
## day     day      1081
## good   good       995
## know   know       901
## dont   dont       874
## can     can       870
twitter.term.freq$word<-factor(twitter.term.freq$word,levels=unique(as.character(twitter.term.freq$word)))
ggplot(twitter.term.freq[1:10,], aes(x=word,y=frequency))+geom_bar(stat="identity",fill='darkgreen')+coord_flip()+theme_gdocs()+
  geom_text(aes(label=frequency),
            colour="white",hjust=1.25, size=5.0)

+ Word frequencies in the 1% sample of the news text file.

news.term.freq<-rowSums(tdm.news.m)
news.term.freq<-data.frame(word=names(news.term.freq),frequency=news.term.freq)
news.term.freq<-news.term.freq[order(news.term.freq[,2], decreasing=T),]
news.term.freq[1:10,]
##        word frequency
## said   said      2532
## year   year      1006
## one     one       855
## time   time       673
## say     say       670
## new     new       664
## can     can       632
## state state       630
## get     get       588
## like   like       571

+ Visualising the top words used in the 1% sample of the news text file.

news.term.freq$word<-factor(news.term.freq$word,levels=unique(as.character(news.term.freq$word)))
ggplot(news.term.freq[1:10,], aes(x=word,y=frequency))+geom_bar(stat="identity",fill='darkblue')+coord_flip()+theme_gdocs()+
  geom_text(aes(label=frequency),
            colour="white",hjust=1.25, size=5.0)

+ What are the distributions of word frequencies?
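A hedged sketch of that distribution for the blogs sample, reusing blogs.term.freq from above: most words occur only a handful of times, so the x-axis is shown on a log10 scale.

ggplot(blogs.term.freq, aes(x = frequency)) +
  geom_histogram(bins = 30) +
  scale_x_log10() +
  xlab("Word frequency (log10 scale)") + ylab("Number of words") +
  ggtitle("Distribution of word frequencies, blogs 1% sample")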

  • What are the frequencies of 2-grams and 3-grams in the dataset?

+ What are the frequencies of 2-grams?

+ Blogs

blogs_bigram <- NGramTokenizer(blogs.corpus, Weka_control(min = 2, max = 2, delimiters = " \\r\\n\\t.,;:\"()?!"))
blogs_bigram <- data.frame(table(blogs_bigram))
blogs_bigram <- blogs_bigram[order(blogs_bigram$Freq, decreasing = TRUE), ]
names(blogs_bigram) <- c("words","freq")
head(blogs_bigram)
##             words freq
## 84518   look like   73
## 39528   dont know   69
## 164487   year ago   62
## 78571  last night   53
## 78634   last year   53
## 50126   feel like   50

+ Twitter

twitter_bigram <- NGramTokenizer(twitter.corpus, Weka_control(min = 2, max = 2, delimiters = " \\r\\n\\t.,;:\"()?!"))
twitter_bigram <- data.frame(table(twitter_bigram))
twitter_bigram <- twitter_bigram[order(twitter_bigram$Freq, decreasing = TRUE), ]
names(twitter_bigram) <- c("words","freq")
head(twitter_bigram)
##               words freq
## 16669     cant wait  170
## 96823     right now  152
## 115244 thank follow  129
## 67459     look like  114
## 67397  look forward  112
## 62218    last night  110

+ News

news_bigram <- NGramTokenizer(news.corpus, Weka_control(min = 2, max = 2, delimiters = " \\r\\n\\t.,;:\"()?!"))
news_bigram <- data.frame(table(news_bigram))
news_bigram <- news_bigram[order(news_bigram$Freq, decreasing = TRUE), ]
names(news_bigram) <- c("words","freq")
head(news_bigram)
##              words freq
## 80162    last year  131
## 97848     new york  110
## 66904  high school   87
## 140958     st loui   84
## 80155    last week   76
## 168290    year ago   74

+ Visualising the frequencies of 2-grams.

ggplot(head(blogs_bigram,10), aes(reorder(words,freq), freq)) +
  geom_bar(stat = "identity") + coord_flip() +
  xlab("Bigrams") + ylab("Frequency") +
  ggtitle("Most Blogs frequent bigrams")

ggplot(head(twitter_bigram,10), aes(reorder(words,freq), freq)) +
  geom_bar(stat = "identity") + coord_flip() +
  xlab("Bigrams") + ylab("Frequency") +
  ggtitle("Most twitterfrequent bigrams")

ggplot(head(news_bigram,10), aes(reorder(words,freq), freq)) +
  geom_bar(stat = "identity") + coord_flip() +
  xlab("Bigrams") + ylab("Frequency") +
  ggtitle("Most News frequent bigrams")

+ What are the frequencies of 3-grams?

+ Blogs

blogs_trigram <- NGramTokenizer(blogs.corpus, Weka_control(min = 3, max = 3,delimiters = " \\r\\n\\t.,;:\"()?!"))
blogs_trigram <- data.frame(table(blogs_trigram))
blogs_trigram <- blogs_trigram[order(blogs_trigram$Freq,decreasing = TRUE),]
names(blogs_trigram) <- c("words","freq")
head(blogs_trigram)
##                              words freq
## 24780       carolina insur comapni   16
## 154641        south carolina insur   16
## 112837          none repeat scroll   15
## 138457        repeat scroll yellow   15
## 159942 stylebackground none repeat   15
## 17204              block block obj    8

+ Twitter

twitter_trigram <- NGramTokenizer(twitter.corpus, Weka_control(min = 3, max = 3,delimiters = " \\r\\n\\t.,;:\"()?!"))
twitter_trigram <- data.frame(table(twitter_trigram))
twitter_trigram <- twitter_trigram [order(twitter_trigram$Freq,decreasing = TRUE),]
names(twitter_trigram) <- c("words","freq")
head(twitter_trigram)
##                    words freq
## 59521   happi mother day   33
## 76791        let us know   25
## 19345      cant wait see   22
## 59531     happi new year   20
## 59433  happi happi happi   17
## 45143 follow follow back   13

+ News

news_trigram <- NGramTokenizer(news.corpus, Weka_control(min = 3, max = 3,delimiters = " \\r\\n\\t.,;:\"()?!"))
news_trigram <- data.frame(table(news_trigram))
news_trigram <- news_trigram[order(news_trigram$Freq,decreasing = TRUE),]
names(news_trigram) <- c("words","freq")
head(news_trigram)
##                      words freq
## 128439 presid barack obama   16
## 69184    gov chris christi   13
## 178005        two year ago   13
## 110017       new york citi   12
## 183518       w sunset blvd   10
## 165389 superior court judg    9
ggplot(head(blogs_trigram ,10), aes(reorder(words,freq), freq)) +   
  geom_bar(stat="identity") + coord_flip() + 
  xlab("Trigrams") + ylab("Frequency") +
  ggtitle("Most frequent trigrams")

ggplot(head(twitter_trigram,10), aes(reorder(words,freq), freq)) +   
  geom_bar(stat="identity") + coord_flip() + 
  xlab("Trigrams") + ylab("Frequency") +
  ggtitle("Most twitter frequent trigrams")

ggplot(head(news_trigram ,10), aes(reorder(words,freq), freq)) +   
  geom_bar(stat="identity") + coord_flip() + 
  xlab("Trigrams") + ylab("Frequency") +
  ggtitle("Most news frequent trigrams")

  5. Analyze.
  • Interesting findings

  • The text data sets contain a mix of languages, for example Finnish and Swedish. Removing these languages is tricky, both because I do not speak them and because of the resources required to remove them from the corpus. I used the tm stopwords function for several languages to remove some of these words.

  • There are a lot of spelling mistakes in these text files; this is the nature of Twitter, blogs and news sites. I sampled a few of the spelling mistakes, but chose not to remove or correct them because of the high RAM demand.

+ Next steps:
  • We shall use bigrams and trigrams to calculate and predict the probability of the next word occurring given the previous words (a minimal sketch follows this list).
  • To increase the bigram and trigram prediction accuracy, I will look at increasing the sample size of the text corpus.
  • I will look at methods to split the text data into more manageable samples, and at other ways of increasing the sample size while limiting the impact of high RAM usage, for example other R packages such as quanteda (see the sketch above) and other functions of the tm package.
  • I will also look at adding the top 25 nouns, verbs, adjectives and prepositions to increase the coverage of the prediction model.
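As a first cut at the prediction idea in the first bullet above, here is a minimal sketch (the helper name is my own; maximum-likelihood counts, no smoothing) that ranks candidate next words using the bigram table built earlier:

predict_next <- function(word, bigrams = blogs_bigram, top = 3) {
  candidates <- bigrams[grepl(paste0("^", word, " "), bigrams$words), ]
  candidates$prob <- candidates$freq / sum(candidates$freq)
  head(candidates[order(candidates$prob, decreasing = TRUE), ], top)
}
predict_next("look")  # e.g. should surface "look like" from the blogs table above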