Introduction

This is the first phase of the project: demonstrating text mining skills. The data is the SwiftKey dataset, which contains files in English, Russian, Finnish, and German. For the purposes of this project only the English files have been considered: en_US.twitter.txt, en_US.blogs.txt, en_US.news.txt. In addition, the profanity file [https://github.com/RobertJGabriel/Google-profanity-words/blob/master/list.txt] has been included for corpus tidying purposes.

The project is split into five parts:

#List of used libraries
#library() attaches one package per call, so loop over the list
libs<-c('dplyr', 'stringr', 'tidytext', 'janeaustenr', 'ggplot2', 'tm', 'gridExtra', 'tidyverse', 'tidyr')
invisible(lapply(libs, library, character.only = TRUE))

1. Load, sample and clean data.

The functions "sample_df" and "cleanall" are used for sampling and cleaning the text respectively; see the Used Functions section for their code.
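Before the files are read in, the profanity list can be fetched from the repository linked in the Introduction. This is a minimal sketch, assuming the standard raw.githubusercontent.com URL corresponding to that link and the file name profanity.txt that readLines expects below.

#Fetch the profanity list once, if not already present
#(assumes the raw GitHub URL for the file linked in the Introduction)
if (!file.exists('profanity.txt')) {
   download.file('https://raw.githubusercontent.com/RobertJGabriel/Google-profanity-words/master/list.txt',
                 destfile = 'profanity.txt')
}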

profanity<-readLines('profanity.txt')
dir<-'./final/en_US'
tweets<-file.path(dir, 'en_US.twitter.txt')
blogs<-file.path(dir, 'en_US.blogs.txt')
news<-file.path(dir, 'en_US.news.txt')

#Sample 10% of tweets
tweets<-read.delim(tweets, header=FALSE)
tweets<-sample_df(tweets, 0.1)
tweets$text<-cleanall(tweets$text)
tweets<-tweets %>% na_if("") %>% na.omit
head(tweets, 3)
                                                                                 text
1 girl  need  hop    cuz  go harder   sweatpants period stain  kelsey sowers  rockin 
2                                                                             hey boo
3                                      sad    miss  year  looking forward   next year
#Sample 10% of news
news<-read.delim(news, header=FALSE)
news<-sample_df(news, 0.1)
news$text<-cleanall(news$text)
news<-news %>% na_if("") %>% na.omit
head(news, 3)
                                                                                                                                                                                           text
1                                        largerscale renewable projects  tougher brown seeks  megawatts      california energy commission certified nine solar thermal power plants   megawatts
2 daytons understandable chestpounding   fine example   strangely common political phenomenon    politicians  advocates send   spin statement accompanying  report    contradict   report  says
3                                                                            lineup  suttonbrowncatchingsdouglasbriann januaryerin phillips   small however rebounding  long   fever deficiency
#Sample 10% of blogs
blogs<-read.delim(blogs, header=FALSE)
blogs<-sample_df(blogs, 0.1)
blogs$text<-cleanall(blogs$text)
blogs<-blogs %>% na_if("") %>% na.omit
head(blogs,3)
                                                                                                                                                                                       text
1  started  giving   good coat  gesso   coat  brown acrylic paint   coat  black acrylic paint   sanded  mainly along  edges    brown acrylic paint showed   black  rather liked  just like 
2                                                                                                 un lun dun  originally published  april   reissued   brand new cover  may   pan macmillan
3                                                                                                                                              method  need    hexagon flower already made 

2. Data overview after data tidying

Tokenize into words

com_tweets <- tweets %>% 
   unnest_tokens(word, text)
com_news<-news %>% 
   unnest_tokens(word, text)
com_blogs<-blogs %>% 
   unnest_tokens(word, text)
overview<- cbind('Tweets'=com_tweets[1:10,], 'News'=com_news[1:10,], 'Blogs'=com_blogs[1:10,])
overview
      Tweets       News          Blogs    
 [1,] "girl"       "largerscale" "started"
 [2,] "need"       "renewable"   "giving" 
 [3,] "hop"        "projects"    "good"   
 [4,] "cuz"        "tougher"     "coat"   
 [5,] "go"         "brown"       "gesso"  
 [6,] "harder"     "seeks"       "coat"   
 [7,] "sweatpants" "megawatts"   "brown"  
 [8,] "period"     "california"  "acrylic"
 [9,] "stain"      "energy"      "paint"  
[10,] "kelsey"     "commission"  "coat"   

Number of words used for analysis

print(c('tweets'=nrow(com_tweets), 'news'=nrow(com_news), 'blogs'=nrow(com_blogs)))
 tweets    news   blogs 
1756131  170426 1773184 

Number of lines in the dataframe

print(c('tweets'=nrow(tweets), 'news'=nrow(news), 'blogs'=nrow(blogs)))
tweets   news  blogs 
119495   3963  44396 

3. Plot distribution of the 15 most common words
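The chunk that produced this figure is not echoed in the report; the following is a plausible reconstruction, assuming the plot_com helper defined in the Used Functions section and grid.arrange from gridExtra for the side-by-side layout.

#Reconstruction of the hidden plotting chunk
p_tweets<-plot_com(com_tweets)+ labs(title = "Tweets")
p_news<-plot_com(com_news)+ labs(title = "News")
p_blogs<-plot_com(com_blogs)+ labs(title = "Blogs")
grid.arrange(p_tweets, p_news, p_blogs, ncol = 3)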

[Figure: bar charts of the 15 most common words in the tweets, news, and blogs samples]

From the plot it can be observed that the text has been cleaned; however, some further tidying is required, as single letters and non-English characters are still present.
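A possible extra tidying step, sketched below on the tweets tokens, would drop single-letter tokens and any token containing a character outside a-z; tidy_words is a hypothetical helper and not part of the pipeline above.

#Hypothetical extra step: drop single letters and tokens with non a-z characters
tidy_words<-function(words_df){
   words_df %>%
      filter(nchar(word) > 1) %>%          #remove single-letter tokens
      filter(!grepl('[^a-z]', word))       #keep plain lowercase ASCII words only
}
com_tweets<-tidy_words(com_tweets)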

gc()
           used  (Mb) gc trigger  (Mb) max used  (Mb)
Ncells  2582315 138.0    5296010 282.9  5296010 282.9
Vcells 12913173  98.6   42279348 322.6 42279337 322.6

4. Plot the most frequent 2-grams in the combined dataframe

com_bigram_tweets <- tweets%>% 
   unnest_tokens(output=bigram, input=text, token = "ngrams", n = 2)

com_bigram_news <- news%>% 
   unnest_tokens(output=bigram, input=text, token = "ngrams", n = 2)

com_bigram_blogs <- blogs%>% 
   unnest_tokens(output=bigram, input=text, token = "ngrams", n = 2)

com_bigram<-rbind(com_bigram_blogs, com_bigram_news,com_bigram_tweets)
names(com_bigram)<-c('word')
com_bigram<-com_bigram %>% na_if("") %>% na.omit

p_bi<-plot_com(com_bigram)+ labs(title = "2-gram")
p_bi

5. Plot the most frequent 3-grams in the combined dataframe

com_trigram_tweets <- tweets%>% 
   unnest_tokens(output=trigram, input=text, token = "ngrams", n = 3)

com_trigram_news <- news%>% 
   unnest_tokens(output=trigram, input=text, token = "ngrams", n = 3)

com_trigram_blogs <- blogs%>% 
   unnest_tokens(output=trigram, input=text, token = "ngrams", n = 3)

com_trigram<-rbind(com_trigram_blogs, com_trigram_news,com_trigram_tweets)
names(com_trigram)<-c('word')
com_trigram<-com_trigram %>% na_if("") %>% na.omit

p_tri<-plot_com(com_trigram)+ labs(title = "3-gram")
p_tri
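Looking ahead to the model building mentioned in the Next Steps, the bigram counts can be split into their component words with tidyr::separate; the same applies to the trigrams. This is a sketch only, and bigram_counts is a hypothetical intermediate.

#Hypothetical follow-up: count bigrams and split them for a next-word lookup
bigram_counts<-com_bigram %>%
   count(word, sort = TRUE) %>%
   separate(word, into = c('word1', 'word2'), sep = ' ')
head(bigram_counts, 3)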

Used Functions

#Sample a random fraction k of the provided text lines (k=0.1 gives the 10% samples above)
sample_df<-function(df,k){
   set.seed(77)                            #reproducible sampling
   n<- nrow(df) 
   rows <- sample(n)                       #shuffle the row indices
   shuffled_df <- data.frame(df[rows, ])
   indices <- 1:round(k * n)               #keep the first k-fraction of rows
   sample_df <- data.frame(shuffled_df[indices, ]) 
   names(sample_df)=c('text')
   return(sample_df)
}

#Clean data: profanity, stop words, punctuation, numbers, special characters
cleanall<-function(col){
   
   corpus<-Corpus(VectorSource(col))
   corpus<-tm_map(corpus, removeNumbers)
   corpus<-tm_map(corpus, content_transformer(tolower))
   corpus<-tm_map(corpus, removeWords, stopwords('english'))
   corpus<-tm_map(corpus, removePunctuation)
   corpus<-tm_map(corpus, removeWords, profanity)
   removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
   corpus<- tm_map(corpus, content_transformer(removeNumPunct))
   return(unlist(as.list(corpus)))         #back to a character vector
}

#plotting 15 most common n-grams
plot_com<- function(com_df) {
   com_df %>%
      count(word, sort = TRUE) %>%
      top_n(15) %>%
      mutate(word = reorder(word, n)) %>%
      ggplot(aes(x = word, y = n)) +
      geom_col() +
      coord_flip() +                       #horizontal bars for readability
      labs(x = NULL,
           y = "Frequency")
}

Conclusion

Work on this project clearly showed that enlarging the sample size results in memory issues that halt the execution of the analysis. Therefore only 10% of each file has been read in as a sample.

Next Steps

This data mining assignment will be followed by further analysis of coverage, model building, and an app/product that allows text prediction.
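As a rough illustration of the planned prediction product, a baseline next-word predictor could return the most frequent completion of a bigram. This sketch assumes the hypothetical bigram_counts table from part 5.

#Hypothetical baseline predictor: most frequent bigram completion
predict_next<-function(w, counts){
   hits<-counts %>% filter(word1 == w) %>% arrange(desc(n))
   if (nrow(hits) == 0) return(NA_character_)
   hits$word2[1]                           #most frequent following word
}
predict_next('happy', bigram_counts)       #might return e.g. 'birthday'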