Inits and Libraries

## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
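
The library() calls are not echoed above; judging from the functions used later in this report (stri_count_words, Corpus and tm_map, NGramTokenizer, viridis), the setup chunk loads roughly the following packages (the exact list is an assumption):

library(dplyr)    # data manipulation
library(ggplot2)  # plotting
library(NLP)      # dependency of tm
library(tm)       # Corpus(), tm_map() and the cleaning transformations
library(RWeka)    # NGramTokenizer(), Weka_control()
library(stringi)  # stri_count_words()
library(viridis)  # color palette used in the barplots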

Loading the Twitter, news, and blogs English datasets

file.list = c("final/en_US/en_US.blogs.txt", "final/en_US/en_US.news.txt", "final/en_US/en_US.twitter.txt")
text <- list(blogs = "", news = "", twitter = "")
data.summary <- matrix(0, nrow = 3, ncol = 3, dimnames = list(c("blogs", "news", "twitter"),c("file size, Mb", "lines", "words")))
for (i in 1:3) {
  con <- file(file.list[i], "rb")
  text[[i]] <- readLines(con, encoding = "UTF-8",skipNul = TRUE)
  close(con)
  data.summary[i,1] <- round(file.info(file.list[i])$size / 1024^2, 2)
  data.summary[i,2] <- length(text[[i]])
  data.summary[i,3] <- sum(stri_count_words(text[[i]]))
}

Getting a sample of the Twitter data and creating a corpus

Normalizing case and spacing; removing punctuation, numbers, and standard English stopwords

# Sample 2% of the Twitter lines to keep the exploration fast
twitterSample <- sample(text$twitter, 0.02 * length(text$twitter))

# Convert to ASCII; sub = "" drops non-convertible characters (emoji, curly quotes)
# instead of turning whole tweets into NA, then build a tm corpus
dataSample <- iconv(twitterSample, 'UTF-8', 'ASCII', sub = "")
corpus <- Corpus(VectorSource(dataSample))

corpus <- tm_map(corpus,content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents
corpus <- tm_map(corpus,content_transformer(PlainTextDocument))
## Warning in tm_map.SimpleCorpus(corpus,
## content_transformer(PlainTextDocument)): transformation drops documents
corpus <- tm_map(corpus,content_transformer(removePunctuation))
corpus <- tm_map(corpus,content_transformer(removeNumbers))
corpus <- tm_map(corpus,content_transformer(stripWhitespace))
corpus <- tm_map(corpus, removeWords, stopwords("english"))

# Helper transformation: despite its name, it removes the matched pattern (gsub with '')
toSpace <- content_transformer(function(x, pattern) gsub(pattern, '', x))
corpus <- tm_map( corpus , toSpace, '=' )
## Warning in tm_map.SimpleCorpus(corpus, toSpace, "="): transformation drops
## documents
corpus <- tm_map( corpus , toSpace, "'" )
## Warning in tm_map.SimpleCorpus(corpus, toSpace, "'"): transformation drops
## documents

Word Distribution

The distribution is so concentrated that I split the words into quartiles and display them on the same chart with the same scale. There are slightly fewer than 40,000 distinct words in the sample: while the most common word appears 3,098 times, 75% of the words occur fewer than 4 times and fewer than 10% occur more than 10 times.

# Unigrams: tokenize the corpus into single words and count occurrences
unigram <- NGramTokenizer(corpus, Weka_control(min = 1, max = 1))
unigramdf <- data.frame(table(unigram))
unigramdf <- unigramdf[order(-unigramdf$Freq), ]        # sort by decreasing frequency
uniqua <- round((length(unigramdf$Freq) - 4) / 4, 0)    # approximate size of one quartile
unimean <- round(mean(unigramdf$Freq) + .5, 0)          # mean frequency, used to set the y-axis
summary(unigramdf)
##               unigram           Freq         
##  =                :    1   Min.   :   1.000  
##  aa               :    1   1st Qu.:   1.000  
##  aaa              :    1   Median :   1.000  
##  aaahhhhhhhhhhhhhh:    1   Mean   :   8.726  
##  aaayyyeee        :    1   3rd Qu.:   3.000  
##  aac              :    1   Max.   :3098.000  
##  (Other)          :38514
# Plot each quartile of the frequency-sorted unigrams as a separate line
plot(unigramdf$Freq[1:uniqua], type = "l", ylim = c(0, unimean * 2),
     xlab = "", main = "Single word distribution", col = 1)
for (ii in 1:3) {
  lines(unigramdf$Freq[(ii * uniqua):((ii * uniqua) + uniqua)], col = ii + 1)
}
legend(uniqua * .5, unimean * 2, legend = c("1st Quart", "2nd", "3rd", "4th"), col = c(1, 2, 3, 4), lty = 1)

Distribution of word pairs (bigrams)

# Bigrams: all pairs of consecutive words, counted and sorted by decreasing frequency
bigramdf <- data.frame(table(NGramTokenizer(corpus, Weka_control(min = 2, max = 2))))
bigramdf <- bigramdf[order(-bigramdf$Freq), ]
summary(bigramdf)
##             Var1             Freq        
##  = en         :     1   Min.   :  1.000  
##  aa commit    :     1   1st Qu.:  1.000  
##  aa discussion:     1   Median :  1.000  
##  aa first     :     1   Mean   :  1.261  
##  aa oh        :     1   3rd Qu.:  1.000  
##  aa way       :     1   Max.   :317.000  
##  (Other)      :266452
color_pal <- viridis(5) # palette recycled across the 25 bars

# Top 25 most frequent word pairs
barplot(bigramdf$Freq[1:25], las = 2,
        col = color_pal, cex.names = 0.9,
        names.arg = bigramdf$Var1[1:25])

Distribution of word triples (trigrams)

# Trigrams: all triples of consecutive words, counted
trigramdf <- data.frame(table(NGramTokenizer(corpus, Weka_control(min = 3, max = 3))))
summary(trigramdf)
##                            Var1             Freq       
##  aa commit street            :     1   Min.   : 1.000  
##  aa discussion obama         :     1   1st Qu.: 1.000  
##  aa first time               :     1   Median : 1.000  
##  aa oh man                   :     1   Mean   : 1.016  
##  aa way operationalizemeasure:     1   3rd Qu.: 1.000  
##  aaa open pm                 :     1   Max.   :67.000  
##  (Other)                     :330753
trigramdf <- trigramdf[order(-trigramdf$Freq), ]   # sort by decreasing frequency
density(trigramdf$Freq)                            # kernel density of the trigram counts
## 
## Call:
##  density.default(x = trigramdf$Freq)
## 
## Data: trigramdf$Freq (330759 obs.);  Bandwidth 'bw' = 0.01925
## 
##        x                 y           
##  Min.   : 0.9423   Min.   : 0.00000  
##  1st Qu.:17.4711   1st Qu.: 0.00000  
##  Median :34.0000   Median : 0.00000  
##  Mean   :34.0000   Mean   : 0.04058  
##  3rd Qu.:50.5289   3rd Qu.: 0.00000  
##  Max.   :67.0577   Max.   :11.70140
barplot(trigramdf$Freq[1:25], las = 2,
        col = color_pal, cex.names = 0.9,
        names.arg = trigramdf$Var1[1:25])

### Conclusion:

Word usage on Twitter is extremely concentrated: the most common 1% of words accounts for about 65% of all word instances.

The frequencies of 2-grams and 3-grams in the dataset are also very concentrated: the most frequent word pair in my sample occurs 317 times, yet more than 75% of the pairs occur only once.

How many unique words do you need in a frequency-sorted dictionary to cover 50% of all word instances in the language? Fewer than 1% of the unique words.
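
Both coverage figures can be checked directly from the unigram table built above; this is only a quick sketch, assuming unigramdf is still in memory (note that stopwords were already removed, so the exact percentages on the raw text will differ):

# Cumulative share of word instances covered by the frequency-sorted vocabulary
freqs <- sort(unigramdf$Freq, decreasing = TRUE)
coverage <- cumsum(freqs) / sum(freqs)

which(coverage >= 0.5)[1] / length(freqs)   # share of vocabulary covering 50% of instances
coverage[ceiling(0.01 * length(freqs))]     # share of instances covered by the top 1% of words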

We will use WordNet to evaluate predictions that come from foreign-language words.
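
To illustrate that idea, the wordnet R package can report whether a token is known to WordNet. This is only a sketch under the assumption that the package and a local WordNet installation are available; the is_english_word helper is hypothetical and not part of this report's code:

library(wordnet)   # assumes a local WordNet installation is configured (e.g. WNHOME)
initDict()

# Hypothetical helper: treat a token as English if WordNet lists it
# under at least one part of speech
is_english_word <- function(word) {
  filter <- getTermFilter("ExactMatchFilter", word, TRUE)
  any(sapply(c("NOUN", "VERB", "ADJECTIVE", "ADVERB"),
             function(pos) length(getIndexTerms(pos, 1, filter)) > 0))
}

is_english_word("house")    # TRUE
is_english_word("maison")   # FALSE - French word, unknown to WordNet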

Clearly, to increase meaningful coverage, we will need to build our own set of stop words.

### Next Step:

The next step is to build a model that predicts the next word from the words already entered. The algorithm will be trained on a much larger sample of the data than the one used for this exploratory report.
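
To make that plan concrete, here is a minimal sketch of how the bigram and trigram tables built above could drive a frequency-based lookup with back-off; the predict_next helper is illustrative only, not the final model:

# Look up the most frequent trigram starting with the last two words typed,
# backing off to bigrams (last word only) when no trigram matches
predict_next <- function(phrase, bigramdf, trigramdf) {
  words <- tolower(unlist(strsplit(phrase, "\\s+")))
  n <- length(words)
  if (n >= 2) {
    prefix  <- paste(words[n - 1], words[n])
    matches <- trigramdf[grepl(paste0("^", prefix, " "), trigramdf$Var1), ]
    if (nrow(matches) > 0) {
      best <- as.character(matches$Var1[which.max(matches$Freq)])
      return(tail(strsplit(best, " ")[[1]], 1))
    }
  }
  matches <- bigramdf[grepl(paste0("^", words[n], " "), bigramdf$Var1), ]
  if (nrow(matches) > 0) {
    best <- as.character(matches$Var1[which.max(matches$Freq)])
    return(tail(strsplit(best, " ")[[1]], 1))
  }
  NA_character_   # no match in either table
}

predict_next("happy new", bigramdf, trigramdf)   # likely suggests "year"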