Synopsis

The goal here is to build a first simple model of the relationships between words. I decided to work with the following assumptions:

- work with English text;
- use n-grams for the analysis (uni-grams, bi-grams, tri-grams);
- base the prediction on a Markov chain ("What happens next depends only on the state of affairs now."), as sketched below.
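As a minimal illustration of the Markov-chain idea for next-word prediction, the sketch below uses a small, hypothetical bi-gram frequency table (not the real data) and simply picks the most frequent word that follows the current word:

# Hypothetical bi-gram counts, purely for illustration
bigram_freq <- data.frame(first  = c("right", "right", "last", "last"),
                          second = c("now", "here", "night", "year"),
                          freq   = c(115, 20, 76, 31),
                          stringsAsFactors = FALSE)

# Under the Markov assumption the next word depends only on the current word,
# so we return its most frequent continuation
predict_next <- function(word, freq_table) {
  candidates <- freq_table[freq_table$first == word, ]
  if (nrow(candidates) == 0) return(NA_character_)
  candidates$second[which.max(candidates$freq)]
}

predict_next("right", bigram_freq)  # "now"

The actual model will use the n-gram frequency tables built from the sampled corpus below instead of this toy table.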

1. Data loading

In the first step I download the data set, load it into R, and attach all libraries needed for the further work.

library(readr, warn.conflicts = FALSE)
library(quanteda, warn.conflicts = FALSE)
## Package version: 3.2.4
## Unicode version: 13.0
## ICU version: 69.1
## Parallel computing: 8 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
library(stringr, warn.conflicts = FALSE)
library(stringi, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)

library(tm, warn.conflicts = FALSE)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following objects are masked from 'package:quanteda':
## 
##     meta, meta<-
library(ggplot2, warn.conflicts = FALSE)
library(reshape2, warn.conflicts = FALSE)
library(wordcloud, warn.conflicts = FALSE)
## Loading required package: RColorBrewer
library(RWeka, warn.conflicts = FALSE)

setwd("D:/R")
#zipF<- "Capstone_test1.zip"
#outDir<-"unzip_Capstone_test1"

#destfile  = paste(zipF,"./Capstone_test1.zip",sep="")
#if(!file.exists(destfile)){
#  url = "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
#  file <- basename(url)
#  download.file(url, file, method="curl")
#  unzip(file)
#}
dir <- "unzip_Capstone_test1/final/en_US"

twitter_file<-file(paste(dir,"/en_US.twitter.txt", sep=""))
news_file<-file(paste(dir,"/en_US.news.txt", sep=""))
blogs_file<-file(paste(dir,"/en_US.blogs.txt", sep=""))

twitter <- readLines(twitter_file, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
close(twitter_file)
news <- readLines(news_file, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
close(news_file)
blogs <- readLines(blogs_file, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
close(blogs_file)

2. Preliminary data analysis

In this step I compute summary statistics for each source file in order to compare the required resources with what is available.

library(stringi)

# Datasets Summary
data.frame(source = c("twitter", "news", "blogs"),
           file_size_MB = c(file.info(paste(dir, "/en_US.twitter.txt", sep = ""))$size / 1024^2,
                            file.info(paste(dir, "/en_US.news.txt", sep = ""))$size / 1024^2,
                            file.info(paste(dir, "/en_US.blogs.txt", sep = ""))$size / 1024^2),
           LinesNumber = c(length(twitter), length(news), length(blogs)),
           WordsNumber = c(sum(stri_count_words(twitter)), sum(stri_count_words(news)), sum(stri_count_words(blogs))),
           mean.num.words = c(mean(stri_count_words(twitter)), mean(stri_count_words(news)), mean(stri_count_words(blogs))))
##    source file_size_MB LinesNumber WordsNumber mean.num.words
## 1 twitter     159.3641     2360148    30093413       12.75065
## 2    news     196.2775       77259     2674536       34.61779
## 3   blogs     200.4242      899288    37546250       41.75109

3. Data cleaning

Since the data is too big and my computer does not have enough memory to perform exploratory data analysis on it ("cannot allocate vector of size 12.6 Gb"), I randomly sample 0.5% of the total data (5.55 Gb).

To perform the analysis the data needs to be normalized. I therefore:

- remove URLs;
- remove special characters;
- remove punctuation marks;
- remove numbers;
- remove excess whitespace;
- remove stopwords;
- convert the text to lower case.

set.seed(12-12-2023)  # note: the expression 12-12-2023 evaluates to -2023, so this is set.seed(-2023)

DataSample <- c(sample(twitter, length(twitter) * 0.005),
                 sample(news, length(news) * 0.005),
                 sample(blogs, length(blogs) * 0.005))

# Build corpus and clean the data
corpus <- VCorpus(VectorSource(DataSample))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
corpus <- tm_map(corpus, toSpace, "@[^\\s]+")
corpus <- tm_map(corpus, tolower)
corpus <- tm_map(corpus, removeWords, c(stopwords("en"),"s","ve"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, PlainTextDocument)

4. Most common n-grams

Below I calculate the frequencies of uni-grams, bi-grams and tri-grams.

# Extract sorted term frequencies from a term-document matrix
getFreq <- function(tdm) {
  freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  return(data.frame(word = names(freq), freq = freq))
}
# RWeka tokenizers for bi-grams and tri-grams
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
# Bar plot of the 30 most frequent n-grams
makePlot <- function(data, label) {
  ggplot(data[1:30,], aes(reorder(word, -freq), freq)) +
         labs(x = label, y = "Frequency") +
         theme(axis.text.x = element_text(angle = 60, size = 12, hjust = 1)) +
         geom_bar(stat = "identity", fill = I("blue"))
}


# Frequencies of most common n-grams in data sample
freq1 <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus), 0.9999))
freq2 <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = bigram)), 0.9999))
freq3 <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = trigram)), 0.9999))

a. Uni-gram

library(dplyr, warn.conflicts = FALSE)
#Uni-gram
freqwords<-freq1 %>% mutate(Share = (freq / sum(freq))*100) %>% 
  mutate(Cumulative_Share = cumsum(Share))

cs_1 <- data.frame( quantiles= c("100%","90%", "75%","50%", "25%", "10%"),                     
          NumberOfWords = c(nrow(freqwords[freqwords$Cumulative_Share>=0, ]),
                           nrow(freqwords[freqwords$Cumulative_Share<=90, ]),
                           nrow(freqwords[freqwords$Cumulative_Share<=75, ]),
                           nrow(freqwords[freqwords$Cumulative_Share<=50, ]),
                           nrow(freqwords[freqwords$Cumulative_Share<=25, ]),
                           nrow(freqwords[freqwords$Cumulative_Share<=10, ])))
cs_1<-cs_1 %>% mutate(share = round(NumberOfWords/nrow(freqwords)*100,2 ))
cs_1
##   quantiles NumberOfWords  share
## 1      100%         12106 100.00
## 2       90%          5800  47.91
## 3       75%          2355  19.45
## 4       50%           601   4.96
## 5       25%           113   0.93
## 6       10%            22   0.18

According to these results, 90% of all word occurrences in the text are covered by only about 48% of the unique words (5,800 out of 12,106). This is quite a good basis for prediction, since it shows that the same words are repeated constantly.
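The same coverage figures can be read directly off the cumulative share. A small helper along these lines (a sketch reusing the freqwords data frame built above) returns how many of the most frequent words are needed to cover a given share of all word occurrences:

# Number of most frequent words needed to cover `p` percent of all word occurrences
coverage <- function(freq_df, p) {
  sum(freq_df$Cumulative_Share <= p)
}

coverage(freqwords, 90)  # should reproduce the 90% row of cs_1 (5800 words)
coverage(freqwords, 50)  # should reproduce the 50% row of cs_1 (601 words)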

head(freqwords, 20)
##          word freq     Share Cumulative_Share
## just     just 1244 0.7659674        0.7659674
## will     will 1106 0.6809967        1.4469641
## like     like 1097 0.6754552        2.1224193
## one       one 1094 0.6736080        2.7960273
## can       can  965 0.5941789        3.3902062
## get       get  905 0.5572351        3.9474413
## time     time  817 0.5030509        4.4504923
## love     love  763 0.4698016        4.9202938
## good     good  744 0.4581027        5.3783965
## now       now  721 0.4439409        5.8223374
## day       day  710 0.4371679        6.2595053
## know     know  649 0.3996084        6.6591137
## new       new  636 0.3916039        7.0507176
## see       see  606 0.3731320        7.4238497
## people people  570 0.3509658        7.7748154
## back     back  540 0.3324939        8.1073093
## make     make  537 0.3306467        8.4379560
## think   think  508 0.3127905        8.7507466
## great   great  506 0.3115591        9.0623057
## today   today  494 0.3041703        9.3664760

The table above shows the 20 most common words; the word cloud below visualizes the 30 most frequent ones.

#brewer.pal(8, "Dark2")
pal=brewer.pal(8,"Accent")
pal=pal[-(1:3)]
wordcloud(freq1$word,freq1$freq,max.words=30,random.order = F, colors=pal,
          scale=c(4, .5))

b. Bi-gram

#Bi-gram
freqwords2<-freq2 %>% mutate(Share = (freq / sum(freq))*100) %>% 
  mutate(Cumulative_Share = cumsum(Share))

cs_2 <- data.frame( quantiles= c("100%","90%", "75%","50%", "25%", "10%"),                     
         NumberOfWords = c(nrow(freqwords2[freqwords2$Cumulative_Share>=0, ]),
                           nrow(freqwords2[freqwords2$Cumulative_Share<=90, ]),
                           nrow(freqwords2[freqwords2$Cumulative_Share<=75, ]),
                           nrow(freqwords2[freqwords2$Cumulative_Share<=50, ]),
                           nrow(freqwords2[freqwords2$Cumulative_Share<=25, ]),
                           nrow(freqwords2[freqwords2$Cumulative_Share<=10, ])))
cs_2<-cs_2 %>% mutate(share = round(NumberOfWords/nrow(freqwords2)*100,2 ))
cs_2
##   quantiles NumberOfWords  share
## 1      100%          8945 100.00
## 2       90%          7576  84.70
## 3       75%          5523  61.74
## 4       50%          2461  27.51
## 5       25%           645   7.21
## 6       10%           121   1.35

According to these results, 90% of all bi-gram occurrences are covered by about 85% of the unique bi-grams (7,576 out of 8,945), so bi-grams repeat far less often than single words.

head(freqwords2, 20)
##                            word freq     Share Cumulative_Share
## right now             right now  115 0.4201681        0.4201681
## last night           last night   76 0.2776763        0.6978444
## feel like             feel like   64 0.2338327        0.9316770
## looks like           looks like   51 0.1863354        1.1180124
## just got               just got   46 0.1680672        1.2860796
## can get                 can get   45 0.1644136        1.4504932
## good morning       good morning   44 0.1607600        1.6112532
## looking forward looking forward   43 0.1571063        1.7683595
## make sure             make sure   43 0.1571063        1.9254658
## one day                 one day   42 0.1534527        2.0789185
## follow back         follow back   37 0.1351845        2.2141030
## new york               new york   35 0.1278772        2.3419803
## even though         even though   34 0.1242236        2.4662039
## happy birthday   happy birthday   33 0.1205700        2.5867738
## sounds like         sounds like   33 0.1205700        2.7073438
## can see                 can see   32 0.1169163        2.8242601
## first time           first time   32 0.1169163        2.9411765
## thanks follow     thanks follow   32 0.1169163        3.0580928
## last year             last year   31 0.1132627        3.1713555
## will never           will never   31 0.1132627        3.2846182

The table above shows the 20 most common bi-grams.

pal=brewer.pal(8,"Dark2")
pal=pal[-(1:3)]
wordcloud(freq2$word,freq2$freq,max.words=30,random.order = F, colors=pal,scale=c(4, .5))

c. Tri-gram

freqwords3<-freq3 %>%  mutate(Share = (freq / sum(freq))*100) %>% mutate(Cumulative_Share = cumsum(Share))

cs_3 <- data.frame( quantiles= c("100%","90%", "75%","50%", "25%", "10%"),                     
                    NumberOfWords = c(nrow(freqwords3[freqwords3$Cumulative_Share>=0, ]),
                           nrow(freqwords3[freqwords3$Cumulative_Share<=90, ]),
                           nrow(freqwords3[freqwords3$Cumulative_Share<=75, ]),
                           nrow(freqwords3[freqwords3$Cumulative_Share<=50, ]),
                           nrow(freqwords3[freqwords3$Cumulative_Share<=25, ]),
                           nrow(freqwords3[freqwords3$Cumulative_Share<=10, ])))
cs_3<-cs_3 %>% mutate(share = round(NumberOfWords/nrow(freqwords3)*100,2 ))
cs_3
##   quantiles NumberOfWords  share
## 1      100%           535 100.00
## 2       90%           474  88.60
## 3       75%           383  71.59
## 4       50%           232  43.36
## 5       25%            81  15.14
## 6       10%            22   4.11

According to these results, 90% of all tri-gram occurrences are covered by about 89% of the unique tri-grams (474 out of 535), i.e. most tri-grams occur only a few times.

head(freqwords3, 20)
##                                        word freq     Share Cumulative_Share
## happy mother day           happy mother day   10 0.8250825        0.8250825
## let us know                     let us know   10 0.8250825        1.6501650
## happy new year               happy new year    9 0.7425743        2.3927393
## cake cake cake               cake cake cake    7 0.5775578        2.9702970
## come see us                     come see us    6 0.4950495        3.4653465
## happy mothers day         happy mothers day    6 0.4950495        3.9603960
## couple years ago           couple years ago    5 0.4125413        4.3729373
## good morning everyone good morning everyone    5 0.4125413        4.7854785
## just got back                 just got back    5 0.4125413        5.1980198
## love love love               love love love    5 0.4125413        5.6105611
## new york city                 new york city    5 0.4125413        6.0231023
## please follow back       please follow back    5 0.4125413        6.4356436
## spend much time             spend much time    5 0.4125413        6.8481848
## thanks following us     thanks following us    5 0.4125413        7.2607261
## can follow back             can follow back    4 0.3300330        7.5907591
## child sexual abuse       child sexual abuse    4 0.3300330        7.9207921
## cinco de mayo                 cinco de mayo    4 0.3300330        8.2508251
## couple weeks ago           couple weeks ago    4 0.3300330        8.5808581
## dreams come true           dreams come true    4 0.3300330        8.9108911
## feel much better           feel much better    4 0.3300330        9.2409241

The table above shows the 20 most common tri-grams.

pal=brewer.pal(8,"Paired")
pal=pal[-(1:3)]
wordcloud(freq3$word,freq3$freq,max.words=15,random.order = F, colors=pal,scale=c(4, .5))
## Warning in wordcloud(freq3$word, freq3$freq, max.words = 15, random.order = F,
## : happy mother day could not be fit on page. It will not be plotted.
## Warning in wordcloud(freq3$word, freq3$freq, max.words = 15, random.order = F,
## : spend much time could not be fit on page. It will not be plotted.
## Warning in wordcloud(freq3$word, freq3$freq, max.words = 15, random.order = F,
## : thanks following us could not be fit on page. It will not be plotted.
## Warning in wordcloud(freq3$word, freq3$freq, max.words = 15, random.order = F,
## : look forward seeing could not be fit on page. It will not be plotted.

Conclusions

We can see that the data set is a good source for the analysis, and the bi-gram and tri-gram frequencies are reasonably high. The prediction will therefore use the tri-gram model first; if no matching tri-gram is found, it falls back to the bi-gram model, and then to the uni-gram model if needed. The Shiny app will provide a text field for the user's input and display three suggestions for what the next word might be, as sketched below.
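A rough sketch of this back-off logic, assuming the frequency tables freqwords3, freqwords2 and freqwords built above (with n-grams stored as space-separated strings in the word column):

# Back-off next-word sketch: try tri-grams, then bi-grams, then the top uni-grams
predict_word <- function(input, n = 3) {
  tokens <- tolower(unlist(strsplit(trimws(input), "\\s+")))
  last2  <- paste(tail(tokens, 2), collapse = " ")
  last1  <- tail(tokens, 1)

  # tri-gram match: tri-grams starting with the last two words of the input
  hits <- freqwords3[startsWith(as.character(freqwords3$word), paste0(last2, " ")), ]
  if (nrow(hits) == 0) {
    # bi-gram back-off: bi-grams starting with the last word of the input
    hits <- freqwords2[startsWith(as.character(freqwords2$word), paste0(last1, " ")), ]
  }
  if (nrow(hits) == 0) {
    # uni-gram back-off: just the overall most frequent words
    return(head(as.character(freqwords$word), n))
  }
  # keep the n most frequent matches and return their final words
  candidates <- head(as.character(hits$word), n)
  sapply(strsplit(candidates, " "), function(w) w[length(w)])
}

predict_word("happy new")       # "year" should be the first suggestion
predict_word("thanks for the")  # no tri-/bi-gram match after stop-word removal, falls back to the top uni-grams

In the Shiny app this function would be called on the contents of the text field and its three suggestions shown to the user.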

Thank you!