Description

The goal of this project is just to display that you’ve gotten used to working with the data and that you are on track to create your prediction algorithm.

library(dplyr)
library(plyr)
library(quanteda)
library(tm)
library(ggplot2)
library(NLP)
library(data.table)

# Datasets ----
# Directory that holds the three en_US text files; defined once so the
# location only has to be changed in a single place.
data_dir <- "C:/Users/Yaswanth Pulavarthi/Documents/final/en_US"

# Read each file line by line (one character element per line), declaring
# the text as UTF-8 and skipping embedded NUL characters.
en_blog <- readLines(
  con = file.path(data_dir, "en_US.blogs.txt"),
  encoding = "UTF-8",
  skipNul = TRUE
)
en_news <- readLines(
  con = file.path(data_dir, "en_US.news.txt"),
  encoding = "UTF-8",
  skipNul = TRUE
)
en_twitter <- readLines(
  con = file.path(data_dir, "en_US.twitter.txt"),
  encoding = "UTF-8",
  skipNul = TRUE
)

Data Sampling

The data is too large to process comfortably on a PC, so to avoid long waiting times we sampled 10 percent of the data.

# Draw a 10 percent random sample of lines from each source. The seed is
# fixed so the sample (and every table and plot derived from it) is
# reproducible between runs.
set.seed(1234)
sample_fraction <- 0.1
sample_blog <- sample(en_blog, size = floor(length(en_blog) * sample_fraction))
sample_news <- sample(en_news, size = floor(length(en_news) * sample_fraction))
sample_twitter <- sample(
  en_twitter,
  size = floor(length(en_twitter) * sample_fraction)
)

# Pool the three samples into a single quanteda corpus.
vector <- c(sample_blog, sample_news, sample_twitter)
corpus <- corpus(vector)

# Lower-case the text and tokenise it, with every remove_* cleaning option
# switched on (punctuation, Twitter characters, numbers, hyphens, symbols,
# URLs).
# NOTE(review): remove_twitter and remove_hyphens appear to be deprecated in
# newer quanteda releases - confirm against the installed version.
cleaning <- tokens(
    x = tolower(corpus),
    remove_punct = TRUE,
    remove_twitter = TRUE,
    remove_numbers = TRUE,
    remove_hyphens = TRUE,
    remove_symbols = TRUE,
    remove_url = TRUE
)

# Stem the tokens, then build bigrams and trigrams from the stemmed tokens.
stem_words <- tokens_wordstem(cleaning, language = "english")
bi_gram <- tokens_ngrams(stem_words, n = 2)
tri_gram <- tokens_ngrams(stem_words, n = 3)

# For each n-gram order: build the document-feature matrix, trim it with a
# threshold of 3 to keep later calculations fast, then sum every column to
# get a named vector of counts per feature.

# Unigrams (stemmed words)
uni_DFM <- dfm_trim(dfm(stem_words), 3)
num_uni <- colSums(uni_DFM)

# Bigrams
bi_DFM <- dfm_trim(dfm(bi_gram), 3)
num_bi <- colSums(bi_DFM)

# Trigrams
tri_DFM <- dfm_trim(dfm(tri_gram), 3)
num_tri <- colSums(tri_DFM)

# Create data tables with the individual words of each n-gram as columns.
# The n-gram names are split on "_" into their component words.
# NOTE(review): a token that itself contains "_" would be split into extra
# pieces and shift the columns - confirm such tokens cannot occur here.

uni_words <- data.table(One_Word = names(num_uni), Frequency = num_uni)

# Split each n-gram name once and reuse the pieces for every column.
# vapply() guarantees a character vector even when there are no n-grams.
bi_parts <- strsplit(names(num_bi), "_", fixed = TRUE)
bi_words <- data.table(
  One_Word = vapply(bi_parts, `[[`, character(1), 1),
  Two_Words = vapply(bi_parts, `[[`, character(1), 2),
  Frequency = num_bi
)

tri_parts <- strsplit(names(num_tri), "_", fixed = TRUE)
tri_words <- data.table(
  One_Word = vapply(tri_parts, `[[`, character(1), 1),
  Two_Words = vapply(tri_parts, `[[`, character(1), 2),
  Three_Words = vapply(tri_parts, `[[`, character(1), 3),
  Frequency = num_tri
)
# Show the 50 most frequent unigrams; head() returns only the rows that
# exist instead of padding with NA rows when there are fewer than 50.
head(uni_words[order(uni_words$Frequency, decreasing = TRUE), ], 50)
##     One_Word Frequency
##  1:      the    293837
##  2:       to    191681
##  3:      and    160300
##  4:        a    157272
##  5:        i    150692
##  6:       of    128673
##  7:       in    102987
##  8:       it     95873
##  9:      you     85302
## 10:       is     81157
## 11:     that     79209
## 12:      for     77067
## 13:       on     57213
## 14:       my     56424
## 15:     with     48074
## 16:       be     47227
## 17:     have     43050
## 18:     this     42902
## 19:      was     41385
## 20:       at     37352
## 21:      are     36259
## 22:      but     34561
## 23:       me     34249
## 24:       so     33547
## 25:       we     32939
## 26:       as     30851
## 27:      not     30491
## 28:     your     27878
## 29:      all     27547
## 30:     just     25513
## 31:     what     25194
## 32:     like     24938
## 33:     from     24412
## 34:       he     24382
## 35:      get     24329
## 36:       do     24020
## 37:       up     23900
## 38:      out     23446
## 39:      one     23335
## 40:     will     22251
## 41:       or     22193
## 42:       if     22182
## 43:     they     21712
## 44:       go     21676
## 45:    about     21290
## 46:     time     20131
## 47:      i'm     19820
## 48:      can     19372
## 49:     when     19272
## 50:       by     19220
##     One_Word Frequency
# Show the 50 most frequent bigrams; head() returns only the rows that
# exist instead of padding with NA rows when there are fewer than 50.
head(bi_words[order(bi_words$Frequency, decreasing = TRUE), ], 50)
##     One_Word Two_Words Frequency
##  1:       of       the     25795
##  2:       in       the     24534
##  3:      for       the     13714
##  4:       to       the     13539
##  5:       on       the     12785
##  6:       to        be     11827
##  7:       at       the      8755
##  8:       go        to      8220
##  9:        i      have      8092
## 10:      and       the      7734
## 11:        i       was      7634
## 12:       in         a      7437
## 13:     want        to      7404
## 14:       is         a      7400
## 15:        i        am      7301
## 16:      and         i      7266
## 17:       it       was      7116
## 18:     have         a      6835
## 19:       it        is      6681
## 20:     with       the      6632
## 21:      for         a      6592
## 22:       if       you      6366
## 23:       is       the      5636
## 24:     will        be      5501
## 25:       to       get      5322
## 26:     from       the      5263
## 27:        i      love      5230
## 28:     with         a      5095
## 29:     have        to      4993
## 30:     that         i      4948
## 31:     need        to      4899
## 32:      one        of      4855
## 33:       to       see      4804
## 34:       of         a      4763
## 35:        i     don't      4749
## 36:        i     think      4725
## 37:     this        is      4698
## 38:    thank       for      4685
## 39:      but         i      4576
## 40:       to        do      4421
## 41:      all       the      4324
## 42:      out        of      4168
## 43:       of        my      4161
## 44:      you       are      4072
## 45:      tri        to      4064
## 46:       as         a      3990
## 47:    thank       you      3980
## 48:      you       can      3963
## 49:       be         a      3962
## 50:       in        my      3933
##     One_Word Two_Words Frequency
# Show the 50 most frequent trigrams; head() returns only the rows that
# exist instead of padding with NA rows when there are fewer than 50.
head(tri_words[order(tri_words$Frequency, decreasing = TRUE), ], 50)
##     One_Word Two_Words Three_Words Frequency
##  1:    thank       for         the      2452
##  2:      one        of         the      2125
##  3:        a       lot          of      1897
##  4:        i      want          to      1825
##  5:     look   forward          to      1392
##  6:       to        be           a      1293
##  7:       go        to          be      1224
##  8:       be       abl          to      1141
##  9:    thank       you         for      1057
## 10:        i      need          to      1048
## 11:        i      have           a      1048
## 12:        i      have          to      1038
## 13:      the       end          of      1026
## 14:       it       was           a      1020
## 15:      out        of         the       989
## 16:        i      love         you       948
## 17:        i     don't        know       927
## 18:    can't      wait          to       872
## 19:      for       the      follow       870
## 20:      i'm        go          to       842
## 21:      you      want          to       840
## 22:     some        of         the       813
## 23:       as      well          as       806
## 24:      one        of          my       773
## 25:       is        go          to       770
## 26:      the      rest          of       766
## 27:     part        of         the       764
## 28:        a     coupl          of       752
## 29:    there        is           a       737
## 30:       to        go          to       725
## 31:      the      fact        that       712
## 32:     want        to          be       710
## 33:     have         a       great       708
## 34:       go        to         the       697
## 35:       in       the       world       691
## 36:      you      have          to       684
## 37:     need        to          be       665
## 38:       to       see         you       653
## 39:       to      have           a       651
## 40:     this        is           a       650
## 41:        i      have        been       650
## 42:      end        of         the       638
## 43:       it     would          be       635
## 44:       if       you         are       634
## 45:      you      have           a       633
## 46:       it        is           a       624
## 47:        i     think           i       620
## 48:      the     first        time       618
## 49:     this        is         the       616
## 50:        i     think          it       614
##     One_Word Two_Words Three_Words Frequency
# Draw a bar chart of n-gram frequencies.
#   labels    - character vector with one display label per n-gram
#   frequency - numeric vector of counts, same length as `labels`
#   title     - plot title
#   x_label   - x-axis label
# Returns the ggplot object; bars are ordered from most to least frequent.
plot_top_ngrams <- function(labels, frequency, title, x_label) {
  plot_data <- data.frame(Ngram = labels, Frequency = frequency)
  ggplot(plot_data, aes(x = reorder(Ngram, -Frequency), y = Frequency)) +
    geom_col() +
    labs(x = x_label, y = "Frequency") +
    ggtitle(title)
}

# Top ten trigrams
top10tri <- head(tri_words[order(tri_words$Frequency, decreasing = TRUE), ], 10)
plot_top_ngrams(
  labels = paste(top10tri$One_Word, top10tri$Two_Words, top10tri$Three_Words),
  frequency = top10tri$Frequency,
  title = "Top ten tri words",
  x_label = "Trigram"
)

# Top ten bigrams
top10bi <- head(bi_words[order(bi_words$Frequency, decreasing = TRUE), ], 10)
plot_top_ngrams(
  labels = paste(top10bi$One_Word, top10bi$Two_Words),
  frequency = top10bi$Frequency,
  title = "Top ten bi words",
  x_label = "Bigram"
)

# Top ten unigrams
top10uni <- head(uni_words[order(uni_words$Frequency, decreasing = TRUE), ], 10)
plot_top_ngrams(
  labels = top10uni$One_Word,
  frequency = top10uni$Frequency,
  title = "Top ten uni words",
  x_label = "Word"
)

Summary

App: https://yesh21.shinyapps.io/WordPred/

Source code (GitHub): https://github.com/yesh21/RR-Rpubs/tree/master/Shiftkey%20Shiny