Assumption

The data file has been unzipped and stored in a local folder.

Set Working directory

Set the local folder as working directory

# Make the folder holding the en_US corpus files the working directory, so the
# relative file names passed to readLines() resolve against it.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
setwd("/Users/ashwini/Desktop/Data Science/final/en_US")

Read Data

Read the data from each file: blogs, news, and twitter.

# Read every line of the three corpus files from the working directory.
# encoding = "UTF-8" marks the resulting strings as UTF-8 and skipNul = TRUE
# drops embedded NUL characters instead of truncating the line at them.
blogs <- readLines(
  con = "en_US.blogs.txt",
  encoding = "UTF-8",
  skipNul = TRUE
)

news <- readLines(
  con = "en_US.news.txt",
  encoding = "UTF-8",
  skipNul = TRUE
)

twitter <- readLines(
  con = "en_US.twitter.txt",
  encoding = "UTF-8",
  skipNul = TRUE
)

Load required packages

## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: NLP
## 
## This data.table install has not detected OpenMP support. It will work but slower in single threaded mode.
## quanteda version 0.9.8.5
## 
## Attaching package: 'quanteda'
## The following objects are masked from 'package:tm':
## 
##     as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:NLP':
## 
##     ngrams
## The following object is masked from 'package:base':
## 
##     sample
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate

Dataset details

Basic data information for each file: blogs, news, and twitter.

# Smallest and largest number of words on a single line, per corpus.
# The result is a 2 x 3 matrix whose rows are relabelled from summary()'s
# "Min." / "Max." to "Min" / "Max" so they make clean column names below.
Dataset_info <- sapply(
  list(blogs, news, twitter),
  function(lines) {
    words_per_line <- stri_count_words(lines)
    summary(words_per_line)[c("Min.", "Max.")]
  }
)
rownames(Dataset_info) <- c("Min", "Max")

# One row per corpus: line and character totals from stri_stats_general(),
# word totals from stri_stats_latex(), followed by the per-line extremes.
# The measures are stacked as rows and then transposed into columns.
stats <- data.frame(
  Dataset = c("blogs", "news", "twitter"),
  t(
    rbind(
      sapply(list(blogs, news, twitter), stri_stats_general)[c("Lines", "Chars"), ],
      Words = sapply(list(blogs, news, twitter), stri_stats_latex)["Words", ],
      Dataset_info
    )
  )
)

# head() changes nothing for a three-row frame; kept so the table is rendered
# exactly as before.
knitr::kable(head(stats))
Dataset Lines Chars Words Min Max
blogs 899288 206824382 37570839 0 6726
news 1010242 203223154 34494539 1 1796
twitter 2360148 162096241 30451170 1 47

Clean the data and calculate the frequency of n-grams

Prepare the sample data

# Draw a reproducible 5% sample of lines from each corpus and drop every
# character that has no ASCII representation (sub = "").
#
# The lines are sampled first and converted afterwards, so iconv() only runs
# on the retained 5%. iconv() works element by element, so with the same seed
# this keeps exactly the lines that converting the whole corpus first would.
# floor() makes the truncation of the fractional sample size explicit.
#
# NOTE(review): the text is read with encoding = "UTF-8" but converted from
# "latin1"; left as is so the sampled text does not change - confirm intent.
set.seed(1024)
sampledataset_blogs <- iconv(
  sample(blogs, floor(length(blogs) * 0.05)),
  "latin1", "ASCII", sub = ""
)
sampledataset_news <- iconv(
  sample(news, floor(length(news) * 0.05)),
  "latin1", "ASCII", sub = ""
)
sampledataset_twitter <- iconv(
  sample(twitter, floor(length(twitter) * 0.05)),
  "latin1", "ASCII", sub = ""
)

1-GRAM

1- GRAM Word and Frequency Matrix - Blog

# Word/count table for the first 15 unigram features of the blog sample.
# NOTE(review): docfreq() is quanteda's document-frequency function and
# seq_len(15) takes the first 15 features in dfm order, not the 15 most
# common ones - confirm this is what the table is meant to show.
unigram_blogs <- local({
  blog_dfm <- dfm(
    sampledataset_blogs,
    verbose = FALSE,
    toLower = TRUE,
    removePunct = TRUE,
    removeSeparators = TRUE
  )
  counts <- docfreq(blog_dfm)[seq_len(15)]
  # The named vector also supplies the row names of the data frame.
  data.frame(word = names(counts), frequency = counts)
})
knitr::kable(unigram_blogs)
word frequency
the the 27361
glass glass 168
sees sees 67
truth truth 275
and and 23453
is is 12745
creation creation 79
it it 11980
what what 4554
we we 5292
want want 1562
to to 22675
see see 2306
yourself yourself 321
not not 6575

1- GRAM Word and Frequency Matrix - News

# Word/count table for the first 15 unigram features of the news sample.
# NOTE(review): docfreq() is quanteda's document-frequency function and
# seq_len(15) takes the first 15 features in dfm order, not the 15 most
# common ones - confirm this is what the table is meant to show.
unigram_news <- local({
  news_dfm <- dfm(
    sampledataset_news,
    verbose = FALSE,
    toLower = TRUE,
    removePunct = TRUE,
    removeSeparators = TRUE
  )
  counts <- docfreq(news_dfm)[seq_len(15)]
  # The named vector also supplies the row names of the data frame.
  data.frame(word = names(counts), frequency = counts)
})

knitr::kable(unigram_news)
word frequency
plate plate 106
the the 37282
poached poached 8
pears pears 8
and and 26558
truffles truffles 7
swept swept 30
money money 834
out out 3291
an an 5503
employment employment 66
department department 597
fund fund 205
a a 25474
risk risk 183

1- GRAM Word and Frequency Matrix - Twitter

# Word/count table for the first 15 unigram features of the twitter sample.
# NOTE(review): docfreq() is quanteda's document-frequency function and
# seq_len(15) takes the first 15 features in dfm order, not the 15 most
# common ones - confirm this is what the table is meant to show.
unigram_twitter <- local({
  twitter_dfm <- dfm(
    sampledataset_twitter,
    verbose = FALSE,
    toLower = TRUE,
    removePunct = TRUE,
    removeSeparators = TRUE
  )
  counts <- docfreq(twitter_dfm)[seq_len(15)]
  # The named vector also supplies the row names of the data frame.
  data.frame(word = names(counts), frequency = counts)
})

knitr::kable(unigram_twitter)
word frequency
hey hey 1269
if if 5204
you you 22079
are are 7410
feeling feeling 474
crazy crazy 528
tk tk 6
has has 2278
all all 5925
the the 36523
equipment equipment 23
and and 19524
ability ability 49
would would 2518
need need 2411

1- GRAM Plots

1 - Gram Plot - Blog

# Plot each tabulated blog unigram against its count.
qplot(unigram_blogs$word, unigram_blogs$frequency,
      xlab = "word", ylab = "frequency", main = "1 Gram Plot - blogs")

1 - Gram Plot - News

# Plot each tabulated news unigram against its count.
qplot(unigram_news$word, unigram_news$frequency,
      xlab = "word", ylab = "frequency", main = "1 Gram - News")

1 - Gram Plot - Twitter

# Plot each tabulated twitter unigram against its count.
qplot(unigram_twitter$word, unigram_twitter$frequency,
      xlab = "word", ylab = "frequency", main = "1 Gram - Twitter")

2-GRAM

2- GRAM Word and Frequency Matrix - Blog

# Word-pair/count table for the first 15 bigram features of the blog sample;
# the two words of each pair are joined by a single space.
# NOTE(review): docfreq() is quanteda's document-frequency function and
# seq_len(15) takes the first 15 features in dfm order, not the 15 most
# common ones - confirm this is what the table is meant to show.
bigram_blogs <- local({
  blog_dfm <- dfm(
    sampledataset_blogs,
    verbose = FALSE,
    toLower = TRUE,
    removePunct = TRUE,
    removeSeparators = TRUE,
    ngrams = 2,
    concatenator = " "
  )
  counts <- docfreq(blog_dfm)[seq_len(15)]
  # The named vector also supplies the row names of the data frame.
  data.frame(word = names(counts), frequency = counts)
})
knitr::kable(bigram_blogs)
word frequency
the glass the glass 29
glass sees glass sees 1
sees truth sees truth 1
truth and truth and 35
and truth and truth 9
truth is truth is 53
is creation is creation 1
creation it creation it 3
it sees it sees 1
sees what sees what 2
what we what we 199
we want we want 70
want to want to 1072
to see to see 828
see see see see 1

2- GRAM Word and Frequency Matrix - News

# Word-pair/count table for the first 15 bigram features of the news sample;
# the two words of each pair are joined by a single space.
# NOTE(review): docfreq() is quanteda's document-frequency function and
# seq_len(15) takes the first 15 features in dfm order, not the 15 most
# common ones - confirm this is what the table is meant to show.
bigram_news <- local({
  news_dfm <- dfm(
    sampledataset_news,
    verbose = FALSE,
    toLower = TRUE,
    removePunct = TRUE,
    removeSeparators = TRUE,
    ngrams = 2,
    concatenator = " "
  )
  counts <- docfreq(news_dfm)[seq_len(15)]
  # The named vector also supplies the row names of the data frame.
  data.frame(word = names(counts), frequency = counts)
})

knitr::kable(bigram_news)
word frequency
plate the plate the 3
the poached the poached 2
poached pears poached pears 1
pears and pears and 3
and truffles and truffles 1
swept money swept money 1
money out money out 6
out an out an 19
an employment an employment 3
employment department employment department 1
department fund department fund 1
fund a fund a 6
a risk a risk 21
risk management risk management 3
management fund management fund 1

2- GRAM Word and Frequency Matrix - Twitter

# Word-pair/count table for the first 15 bigram features of the twitter
# sample; the two words of each pair are joined by a single space.
# NOTE(review): docfreq() is quanteda's document-frequency function and
# seq_len(15) takes the first 15 features in dfm order, not the 15 most
# common ones - confirm this is what the table is meant to show.
bigram_twitter <- local({
  twitter_dfm <- dfm(
    sampledataset_twitter,
    verbose = FALSE,
    toLower = TRUE,
    removePunct = TRUE,
    ngrams = 2,
    concatenator = " ",
    removeSeparators = TRUE
  )
  counts <- docfreq(twitter_dfm)[seq_len(15)]
  # The named vector also supplies the row names of the data frame.
  data.frame(word = names(counts), frequency = counts)
})

knitr::kable(bigram_twitter)
word frequency
hey if hey if 10
if you if you 1578
you are you are 1128
are feeling are feeling 8
feeling crazy feeling crazy 1
crazy tk crazy tk 1
tk has tk has 1
has all has all 6
all the all the 947
the equipment the equipment 3
equipment and equipment and 2
and ability and ability 4
ability you ability you 1
you would you would 147
would need would need 5

2- GRAM Plots

2 - Gram Plot - Blog

# Plot each tabulated blog bigram against its count.
qplot(bigram_blogs$word, bigram_blogs$frequency,
      xlab = "word", ylab = "frequency", main = "2 Gram Plot - blogs")

2 - Gram Plot - News

# Plot each tabulated news bigram against its count.
qplot(bigram_news$word, bigram_news$frequency,
      xlab = "word", ylab = "frequency", main = "2 Gram - News")

2 - Gram Plot - Twitter

# Plot each tabulated twitter bigram against its count.
qplot(bigram_twitter$word, bigram_twitter$frequency,
      xlab = "word", ylab = "frequency", main = "2 Gram - Twitter")