The goal of this milestone report is to show that we are comfortable working with the data and on track to build the word-prediction algorithm.
# Libraries for data manipulation, NLP, and plotting.
# Note: loading plyr after dplyr masks several dplyr verbs; the order matters if both are used.
library(dplyr)
library(plyr)
library(quanteda)   # tokenization, n-grams, document-feature matrices
library(tm)
library(ggplot2)
library(NLP)
library(data.table)
# Load the raw English datasets: blogs, news, and tweets
en_blog <- readLines(con = "C:/Users/Yaswanth Pulavarthi/Documents/final/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
en_news <- readLines(con = "C:/Users/Yaswanth Pulavarthi/Documents/final/en_US/en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
en_twitter <- readLines(con = "C:/Users/Yaswanth Pulavarthi/Documents/final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
The full files are too large to process comfortably on a PC, so to keep runtimes short we work with a 10 percent random sample of each dataset.
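The sampling code itself was omitted above; a minimal sketch, assuming a fixed seed and a simple 10 percent random sample of the lines in each file:
# Assumed sampling step (not shown in the original): a reproducible
# 10% simple random sample of the lines in each file.
set.seed(1234)
sample_blog    <- sample(en_blog,    round(0.1 * length(en_blog)))
sample_news    <- sample(en_news,    round(0.1 * length(en_news)))
sample_twitter <- sample(en_twitter, round(0.1 * length(en_twitter)))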
# Combine the three samples and build a quanteda corpus
vector <- c(sample_blog, sample_news, sample_twitter)
corpus <- corpus(vector)
# Tokenize and clean: drop punctuation, numbers, symbols, and URLs, then lowercase.
# (remove_twitter and remove_hyphens were quanteda 1.x arguments; in
# quanteda >= 2, hyphen splitting is controlled by split_hyphens.)
cleaning <- tokens(
  corpus,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE,
  remove_url = TRUE,
  split_hyphens = TRUE
)
cleaning <- tokens_tolower(cleaning)
# Stem tokens to their roots (e.g. "thanks" -> "thank", "try" -> "tri")
stem_words <- tokens_wordstem(cleaning, language = "english")
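As a quick sanity check of what the cleaning and stemming produce, here is a toy example (illustrative only, not part of the original analysis):
# A made-up sentence run through the same steps: punctuation, numbers,
# and URLs are dropped, everything is lowercased, and words are stemmed.
demo <- tokens("Running 3 miles, she visited http://example.com!",
               remove_punct = TRUE, remove_numbers = TRUE, remove_url = TRUE)
tokens_wordstem(tokens_tolower(demo), language = "english")
# expected tokens (approximately): "run" "mile" "she" "visit"
Stemming is what produces the truncated forms such as "thank", "tri", and "abl" that appear in the frequency tables below.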
# Build bigram and trigram tokens, then tabulate counts in document-feature matrices
bi_gram <- tokens_ngrams(stem_words, n = 2)
tri_gram <- tokens_ngrams(stem_words, n = 3)
uni_DFM <- dfm(stem_words)
bi_DFM <- dfm(bi_gram)
tri_DFM <- dfm(tri_gram)
# Trim the DFMs, dropping n-grams seen fewer than 3 times, to shrink
# the tables and speed up later calculations
uni_DFM <- dfm_trim(uni_DFM, min_termfreq = 3)
bi_DFM <- dfm_trim(bi_DFM, min_termfreq = 3)
tri_DFM <- dfm_trim(tri_DFM, min_termfreq = 3)
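As a quick check that trimming actually shrank the vocabulary (not in the original report), quanteda's nfeat() counts the remaining features:
# Number of n-gram features kept after trimming
nfeat(uni_DFM); nfeat(bi_DFM); nfeat(tri_DFM)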
# Create named vectors with counts of words
num_uni <- colSums(uni_DFM)
num_bi <- colSums(bi_DFM)
num_tri <- colSums(tri_DFM)
# Create data tables with one column per word position
uni_words <- data.table(One_Word = names(num_uni), Frequency = num_uni)
bi_split <- strsplit(names(num_bi), "_", fixed = TRUE)
bi_words <- data.table(
  One_Word = sapply(bi_split, '[[', 1),
  Two_Words = sapply(bi_split, '[[', 2),
  Frequency = num_bi)
tri_split <- strsplit(names(num_tri), "_", fixed = TRUE)
tri_words <- data.table(
  One_Word = sapply(tri_split, '[[', 1),
  Two_Words = sapply(tri_split, '[[', 2),
  Three_Words = sapply(tri_split, '[[', 3),
  Frequency = num_tri)
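These tables will back the eventual next-word predictor, so it is worth keying them on their prefix columns for fast lookups; a sketch of the assumed design (the keys and the example query below are not in the original):
# Key each table on its prefix word(s) so lookups use a binary search
setkey(bi_words, One_Word)
setkey(tri_words, One_Word, Two_Words)
# Example: the most frequent third words following "one of"
tri_words[.("one", "of")][order(-Frequency)][1:5]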
# The 50 most frequent unigrams (stemmed)
uni_words[order(-Frequency)][1:50]
## One_Word Frequency
## 1: the 293837
## 2: to 191681
## 3: and 160300
## 4: a 157272
## 5: i 150692
## 6: of 128673
## 7: in 102987
## 8: it 95873
## 9: you 85302
## 10: is 81157
## 11: that 79209
## 12: for 77067
## 13: on 57213
## 14: my 56424
## 15: with 48074
## 16: be 47227
## 17: have 43050
## 18: this 42902
## 19: was 41385
## 20: at 37352
## 21: are 36259
## 22: but 34561
## 23: me 34249
## 24: so 33547
## 25: we 32939
## 26: as 30851
## 27: not 30491
## 28: your 27878
## 29: all 27547
## 30: just 25513
## 31: what 25194
## 32: like 24938
## 33: from 24412
## 34: he 24382
## 35: get 24329
## 36: do 24020
## 37: up 23900
## 38: out 23446
## 39: one 23335
## 40: will 22251
## 41: or 22193
## 42: if 22182
## 43: they 21712
## 44: go 21676
## 45: about 21290
## 46: time 20131
## 47: i'm 19820
## 48: can 19372
## 49: when 19272
## 50: by 19220
## One_Word Frequency
# The 50 most frequent bigrams
bi_words[order(-Frequency)][1:50]
## One_Word Two_Words Frequency
## 1: of the 25795
## 2: in the 24534
## 3: for the 13714
## 4: to the 13539
## 5: on the 12785
## 6: to be 11827
## 7: at the 8755
## 8: go to 8220
## 9: i have 8092
## 10: and the 7734
## 11: i was 7634
## 12: in a 7437
## 13: want to 7404
## 14: is a 7400
## 15: i am 7301
## 16: and i 7266
## 17: it was 7116
## 18: have a 6835
## 19: it is 6681
## 20: with the 6632
## 21: for a 6592
## 22: if you 6366
## 23: is the 5636
## 24: will be 5501
## 25: to get 5322
## 26: from the 5263
## 27: i love 5230
## 28: with a 5095
## 29: have to 4993
## 30: that i 4948
## 31: need to 4899
## 32: one of 4855
## 33: to see 4804
## 34: of a 4763
## 35: i don't 4749
## 36: i think 4725
## 37: this is 4698
## 38: thank for 4685
## 39: but i 4576
## 40: to do 4421
## 41: all the 4324
## 42: out of 4168
## 43: of my 4161
## 44: you are 4072
## 45: tri to 4064
## 46: as a 3990
## 47: thank you 3980
## 48: you can 3963
## 49: be a 3962
## 50: in my 3933
## One_Word Two_Words Frequency
# The 50 most frequent trigrams
tri_words[order(-Frequency)][1:50]
## One_Word Two_Words Three_Words Frequency
## 1: thank for the 2452
## 2: one of the 2125
## 3: a lot of 1897
## 4: i want to 1825
## 5: look forward to 1392
## 6: to be a 1293
## 7: go to be 1224
## 8: be abl to 1141
## 9: thank you for 1057
## 10: i need to 1048
## 11: i have a 1048
## 12: i have to 1038
## 13: the end of 1026
## 14: it was a 1020
## 15: out of the 989
## 16: i love you 948
## 17: i don't know 927
## 18: can't wait to 872
## 19: for the follow 870
## 20: i'm go to 842
## 21: you want to 840
## 22: some of the 813
## 23: as well as 806
## 24: one of my 773
## 25: is go to 770
## 26: the rest of 766
## 27: part of the 764
## 28: a coupl of 752
## 29: there is a 737
## 30: to go to 725
## 31: the fact that 712
## 32: want to be 710
## 33: have a great 708
## 34: go to the 697
## 35: in the world 691
## 36: you have to 684
## 37: need to be 665
## 38: to see you 653
## 39: to have a 651
## 40: this is a 650
## 41: i have been 650
## 42: end of the 638
## 43: it would be 635
## 44: if you are 634
## 45: you have a 633
## 46: it is a 624
## 47: i think i 620
## 48: the first time 618
## 49: this is the 616
## 50: i think it 614
## One_Word Two_Words Three_Words Frequency
# Plot the ten most frequent trigrams
top10tri <- tri_words[order(-Frequency)][1:10]
top10tri[, Trigram := paste(One_Word, Two_Words, Three_Words)]
ggplot(top10tri, aes(reorder(Trigram, -Frequency), Frequency)) +
  geom_col() +
  labs(x = "Trigram", title = "Top ten trigrams")
# Plot the ten most frequent bigrams
top10bi <- bi_words[order(-Frequency)][1:10]
top10bi[, Bigram := paste(One_Word, Two_Words)]
ggplot(top10bi, aes(reorder(Bigram, -Frequency), Frequency)) +
  geom_col() +
  labs(x = "Bigram", title = "Top ten bigrams")
# Plot the ten most frequent unigrams
top10uni <- uni_words[order(-Frequency)][1:10]
ggplot(top10uni, aes(reorder(One_Word, -Frequency), Frequency)) +
  geom_col() +
  labs(x = "Word", title = "Top ten unigrams")
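Finally, as a pointer toward the prediction algorithm these tables are meant to feed, here is a minimal back-off sketch (illustrative only; predict_next, w1, and w2 are hypothetical names, and the final model may weight the back-off differently):
# Predict the next (stemmed) word from the previous one or two words,
# backing off from trigrams to bigrams to the overall most common word.
predict_next <- function(w1, w2 = NULL) {
  if (!is.null(w2)) {
    hit <- tri_words[One_Word == w1 & Two_Words == w2][order(-Frequency)]
    if (nrow(hit) > 0) return(hit$Three_Words[1])
  }
  last <- if (is.null(w2)) w1 else w2
  hit <- bi_words[One_Word == last][order(-Frequency)]
  if (nrow(hit) > 0) return(hit$Two_Words[1])
  uni_words[order(-Frequency)]$One_Word[1]  # fall back to the top unigram
}
predict_next("one", "of")  # "the", per the trigram table above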