Assumption
The data file has been unzipped and stored in a local folder
Set Working directory
Set the local folder as working directory
# Run the rest of the script relative to the folder that holds the en_US files.
corpus_dir <- "/Users/ashwini/Desktop/Data Science/final/en_US"
setwd(corpus_dir)
Read Data
Read the data from each file: blogs, news, and twitter
# Load each corpus file as a character vector with one element per line.
# The text is declared as UTF-8 and embedded NUL bytes are skipped.
read_corpus_lines <- function(path) {
  readLines(path, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read_corpus_lines("en_US.blogs.txt")
news <- read_corpus_lines("en_US.news.txt")
twitter <- read_corpus_lines("en_US.twitter.txt")
Load required packages
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: NLP
##
## This data.table install has not detected OpenMP support. It will work but slower in single threaded mode.
## quanteda version 0.9.8.5
##
## Attaching package: 'quanteda'
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:NLP':
##
## ngrams
## The following object is masked from 'package:base':
##
## sample
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
Dataset details
Basic data information for each file: blogs, news, and twitter
# Summarise each corpus: number of lines, number of characters, number of
# words, and the smallest / largest word count found on a single line.
# The list is deliberately unnamed so the resulting data frame keeps default
# row names.
corpora <- list(blogs, news, twitter)

# Min/Max words per line. min()/max() are taken directly instead of being
# extracted from summary(): R releases before 3.4.0 rounded summary() values
# to four significant digits, which would distort large counts.
# vapply() also guarantees a 2 x 3 numeric matrix.
Dataset_info <- vapply(
  corpora,
  function(x) {
    words_per_line <- stri_count_words(x)
    c(min(words_per_line, na.rm = TRUE), max(words_per_line, na.rm = TRUE))
  },
  c(Min = 0, Max = 0)
)

# Line and character totals per corpus (rows: Lines, Chars).
general_stats <- vapply(
  corpora,
  function(x) stri_stats_general(x)[c("Lines", "Chars")],
  c(Lines = 0, Chars = 0)
)

# Total word count per corpus.
word_totals <- vapply(
  corpora,
  function(x) stri_stats_latex(x)[["Words"]],
  numeric(1)
)

# One row per corpus; columns Dataset, Lines, Chars, Words, Min, Max.
stats <- data.frame(
  Dataset = c("blogs", "news", "twitter"),
  t(rbind(general_stats, Words = word_totals, Dataset_info))
)
knitr::kable(head(stats))
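| Dataset | Lines | Chars | Words | Min | Max |
|:--------|--------:|----------:|---------:|----:|-----:|
| blogs | 899288 | 206824382 | 37570839 | 0 | 6726 |
| news | 1010242 | 203223154 | 34494539 | 1 | 1796 |
| twitter | 2360148 | 162096241 | 30451170 | 1 | 47 |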
Clean the data and calculate n-gram frequencies
Prepare the sample data
# Draw a reproducible 5% random sample of lines from each corpus and drop
# every character that cannot be represented in ASCII.
set.seed(1024)
sample_fraction <- 0.05

# Sampling happens before the conversion, so iconv() only processes the
# sampled lines instead of the whole corpus. iconv() works element by element
# and the random draws depend only on the vector length, so the sampled text
# is the same as converting first and sampling afterwards.
sample_ascii_lines <- function(lines, fraction = sample_fraction) {
  # floor() makes the truncation of the fractional sample size explicit.
  sampled <- sample(lines, floor(length(lines) * fraction))
  iconv(sampled, "latin1", "ASCII", sub = "")
}

sampledataset_blogs <- sample_ascii_lines(blogs)
sampledataset_news <- sample_ascii_lines(news)
sampledataset_twitter <- sample_ascii_lines(twitter)
1-GRAM
1- GRAM Word and Frequency Matrix - Blog
# Single-word counts from docfreq() for the blogs sample. Only the first 15
# features are kept, in the order the dfm lists them; the result is not
# sorted by frequency.
unigram_blogs <- local({
  blogs_dfm <- dfm(
    sampledataset_blogs,
    verbose = FALSE,
    toLower = TRUE,
    removePunct = TRUE,
    removeSeparators = TRUE
  )
  first_features <- docfreq(blogs_dfm)[seq_len(15)]
  data.frame(word = names(first_features), frequency = first_features)
})
knitr::kable(unigram_blogs)
| the |
the |
27361 |
| glass |
glass |
168 |
| sees |
sees |
67 |
| truth |
truth |
275 |
| and |
and |
23453 |
| is |
is |
12745 |
| creation |
creation |
79 |
| it |
it |
11980 |
| what |
what |
4554 |
| we |
we |
5292 |
| want |
want |
1562 |
| to |
to |
22675 |
| see |
see |
2306 |
| yourself |
yourself |
321 |
| not |
not |
6575 |
1- GRAM Word and Frequency Matrix - News
# Single-word counts from docfreq() for the news sample. Only the first 15
# features are kept, in the order the dfm lists them; the result is not
# sorted by frequency.
unigram_news <- local({
  news_dfm <- dfm(
    sampledataset_news,
    verbose = FALSE,
    toLower = TRUE,
    removePunct = TRUE,
    removeSeparators = TRUE
  )
  first_features <- docfreq(news_dfm)[seq_len(15)]
  data.frame(word = names(first_features), frequency = first_features)
})
knitr::kable(unigram_news)
| plate |
plate |
106 |
| the |
the |
37282 |
| poached |
poached |
8 |
| pears |
pears |
8 |
| and |
and |
26558 |
| truffles |
truffles |
7 |
| swept |
swept |
30 |
| money |
money |
834 |
| out |
out |
3291 |
| an |
an |
5503 |
| employment |
employment |
66 |
| department |
department |
597 |
| fund |
fund |
205 |
| a |
a |
25474 |
| risk |
risk |
183 |
1- GRAM Word and Frequency Matrix - Twitter
# Single-word counts from docfreq() for the twitter sample. Only the first 15
# features are kept, in the order the dfm lists them; the result is not
# sorted by frequency.
unigram_twitter <- local({
  twitter_dfm <- dfm(
    sampledataset_twitter,
    verbose = FALSE,
    toLower = TRUE,
    removePunct = TRUE,
    removeSeparators = TRUE
  )
  first_features <- docfreq(twitter_dfm)[seq_len(15)]
  data.frame(word = names(first_features), frequency = first_features)
})
knitr::kable(unigram_twitter)
| hey |
hey |
1269 |
| if |
if |
5204 |
| you |
you |
22079 |
| are |
are |
7410 |
| feeling |
feeling |
474 |
| crazy |
crazy |
528 |
| tk |
tk |
6 |
| has |
has |
2278 |
| all |
all |
5925 |
| the |
the |
36523 |
| equipment |
equipment |
23 |
| and |
and |
19524 |
| ability |
ability |
49 |
| would |
would |
2518 |
| need |
need |
2411 |
1- GRAM Plots
1 - Gram Plot - Blog
# Plot each tabulated blog unigram against its frequency.
# The y-axis label is spelled "frequency" (it was misspelled "frequenct").
qplot(
  unigram_blogs$word,
  unigram_blogs$frequency,
  xlab = "word",
  ylab = "frequency",
  main = "1 Gram Plot - blogs"
)

1 - Gram Plot - News
# Plot each tabulated news unigram against its frequency.
# The y-axis label is spelled "frequency" (it was misspelled "frequenct").
qplot(
  unigram_news$word,
  unigram_news$frequency,
  xlab = "word",
  ylab = "frequency",
  main = "1 Gram - News"
)

2-GRAM
2- GRAM Word and Frequency Matrix - Blog
# Two-word (space-joined) n-gram counts from docfreq() for the blogs sample.
# Only the first 15 features are kept, in the order the dfm lists them; the
# result is not sorted by frequency.
bigram_blogs <- local({
  blogs_dfm <- dfm(
    sampledataset_blogs,
    verbose = FALSE,
    toLower = TRUE,
    removePunct = TRUE,
    removeSeparators = TRUE,
    ngrams = 2,
    concatenator = ' '
  )
  first_features <- docfreq(blogs_dfm)[seq_len(15)]
  data.frame(word = names(first_features), frequency = first_features)
})
knitr::kable(bigram_blogs)
| the glass |
the glass |
29 |
| glass sees |
glass sees |
1 |
| sees truth |
sees truth |
1 |
| truth and |
truth and |
35 |
| and truth |
and truth |
9 |
| truth is |
truth is |
53 |
| is creation |
is creation |
1 |
| creation it |
creation it |
3 |
| it sees |
it sees |
1 |
| sees what |
sees what |
2 |
| what we |
what we |
199 |
| we want |
we want |
70 |
| want to |
want to |
1072 |
| to see |
to see |
828 |
| see see |
see see |
1 |
2- GRAM Word and Frequency Matrix - News
# Two-word (space-joined) n-gram counts from docfreq() for the news sample.
# Only the first 15 features are kept, in the order the dfm lists them; the
# result is not sorted by frequency.
bigram_news <- local({
  news_dfm <- dfm(
    sampledataset_news,
    verbose = FALSE,
    toLower = TRUE,
    removePunct = TRUE,
    removeSeparators = TRUE,
    ngrams = 2,
    concatenator = ' '
  )
  first_features <- docfreq(news_dfm)[seq_len(15)]
  data.frame(word = names(first_features), frequency = first_features)
})
knitr::kable(bigram_news)
| plate the |
plate the |
3 |
| the poached |
the poached |
2 |
| poached pears |
poached pears |
1 |
| pears and |
pears and |
3 |
| and truffles |
and truffles |
1 |
| swept money |
swept money |
1 |
| money out |
money out |
6 |
| out an |
out an |
19 |
| an employment |
an employment |
3 |
| employment department |
employment department |
1 |
| department fund |
department fund |
1 |
| fund a |
fund a |
6 |
| a risk |
a risk |
21 |
| risk management |
risk management |
3 |
| management fund |
management fund |
1 |
2- GRAM Word and Frequency Matrix - Twitter
# Two-word (space-joined) n-gram counts from docfreq() for the twitter sample.
# Only the first 15 features are kept, in the order the dfm lists them; the
# result is not sorted by frequency.
bigram_twitter <- local({
  twitter_dfm <- dfm(
    sampledataset_twitter,
    verbose = FALSE,
    toLower = TRUE,
    removePunct = TRUE,
    ngrams = 2,
    concatenator = ' ',
    removeSeparators = TRUE
  )
  first_features <- docfreq(twitter_dfm)[seq_len(15)]
  data.frame(word = names(first_features), frequency = first_features)
})
knitr::kable(bigram_twitter)
| hey if |
hey if |
10 |
| if you |
if you |
1578 |
| you are |
you are |
1128 |
| are feeling |
are feeling |
8 |
| feeling crazy |
feeling crazy |
1 |
| crazy tk |
crazy tk |
1 |
| tk has |
tk has |
1 |
| has all |
has all |
6 |
| all the |
all the |
947 |
| the equipment |
the equipment |
3 |
| equipment and |
equipment and |
2 |
| and ability |
and ability |
4 |
| ability you |
ability you |
1 |
| you would |
you would |
147 |
| would need |
would need |
5 |
2- GRAM Plots
2 - Gram Plot - Blog
# Plot each tabulated blog bigram against its frequency.
# The y-axis label is spelled "frequency" (it was misspelled "frequenct").
qplot(
  bigram_blogs$word,
  bigram_blogs$frequency,
  xlab = "word",
  ylab = "frequency",
  main = "2 Gram Plot - blogs"
)

2 - Gram Plot - News
# Plot each tabulated news bigram against its frequency.
# The y-axis label is spelled "frequency" (it was misspelled "frequenct").
qplot(
  bigram_news$word,
  bigram_news$frequency,
  xlab = "word",
  ylab = "frequency",
  main = "2 Gram - News"
)
