Assumption

The data archive has been unzipped and stored in a local folder.

Set Working directory

Set the local folder as the working directory.

# Make the folder that holds the unzipped en_US corpus files the working
# directory, so the bare file names used when reading the data resolve.
setwd(dir = "/Users/ashwini/Desktop/Data Science/final/en_US")

Read Data

Read the data from each of the three files: blogs, news, and Twitter.

# Read one corpus file into a character vector with one element per line.
#
# The file is read through an explicit binary-mode ("rb") connection rather
# than by passing the file name straight to readLines(). readLines() accepts
# LF, CRLF and CR line endings in either mode, so the lines returned are the
# same, but a binary read is not cut short by control characters embedded in
# the text.
# NOTE(review): en_US.news.txt is reported to contain an embedded SUB (0x1A)
# character that ends a text-mode read early on Windows - confirm on the
# target platform.
#
# path: name of the file to read, relative to the working directory.
# Returns a character vector marked as UTF-8, with embedded NULs skipped.
read_corpus <- function(path) {
  con <- file(path, open = "rb")
  # Close the connection even if readLines() fails part-way through.
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read_corpus("en_US.blogs.txt")

news <- read_corpus("en_US.news.txt")

twitter <- read_corpus("en_US.twitter.txt")

Load required packages

## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: NLP
## 
## This data.table install has not detected OpenMP support. It will work but slower in single threaded mode.
## quanteda version 0.9.8.5
## 
## Attaching package: 'quanteda'
## The following objects are masked from 'package:tm':
## 
##     as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:NLP':
## 
##     ngrams
## The following object is masked from 'package:base':
## 
##     sample

Dataset details

Basic information about each dataset: blogs, news, and Twitter

# Summarise each corpus: number of lines, characters and words, plus the
# smallest and largest number of words found on a single line.
corpora <- list(blogs, news, twitter)

# Per-line word-count extremes, one column per corpus.
# range() yields the exact minimum and maximum; summary() is avoided because
# older R releases round its values to a few significant digits, which would
# distort large maxima. na.rm = TRUE mirrors summary()'s handling of NAs.
Dataset_info <- vapply(
  corpora,
  function(x) range(stri_count_words(x), na.rm = TRUE),
  numeric(2)
)
rownames(Dataset_info) <- c("Min", "Max")

# Total line and character counts per corpus (rows "Lines" and "Chars").
line_char_counts <- vapply(
  corpora,
  function(x) stri_stats_general(x)[c("Lines", "Chars")],
  numeric(2)
)

# Total word count per corpus.
word_totals <- vapply(
  corpora,
  function(x) stri_stats_latex(x)[["Words"]],
  numeric(1)
)

# One row per dataset; columns: Dataset, Lines, Chars, Words, Min, Max.
stats <- data.frame(
  Dataset = c("blogs", "news", "twitter"),
  t(rbind(line_char_counts, Words = word_totals, Dataset_info))
)

knitr::kable(stats)
Dataset Lines Chars Words Min Max
blogs 899288 206824382 37570839 0 6726
news 1010242 203223154 34494539 1 1796
twitter 2360148 162096241 30451170 1 47

Clean the data and calculate the frequency of n-grams

Prepare the sample data

# Draw a reproducible 5% random sample of lines from each corpus and strip
# every non-ASCII byte from the sampled lines.
#
# Lines are sampled first and converted afterwards, so iconv() only processes
# the sampled 5% instead of the whole corpus. iconv() works element by element
# and draws no random numbers, so for a given seed the selected lines are the
# same as when the conversion is applied before sampling.
set.seed(1024)
sample_fraction <- 0.05

# lines:    character vector of text lines.
# fraction: share of lines to keep (the count is rounded down).
# Returns the sampled lines with all non-ASCII bytes removed.
sample_ascii_lines <- function(lines, fraction) {
  picked <- sample(lines, floor(length(lines) * fraction))
  iconv(picked, "latin1", "ASCII", sub = "")
}

sampledataset_blogs <- sample_ascii_lines(blogs, sample_fraction)
sampledataset_news <- sample_ascii_lines(news, sample_fraction)
sampledataset_twitter <- sample_ascii_lines(twitter, sample_fraction)

1-GRAM

# Single-word features of the blogs sample: build a lower-cased document-
# feature matrix with punctuation and separators removed, then keep the
# document frequency of its first 15 features (positional, not ranked).
unigram_blogs <- local({
  blogs_dfm <- dfm(
    sampledataset_blogs,
    verbose = FALSE,
    toLower = TRUE,
    removePunct = TRUE,
    removeSeparators = TRUE
  )
  docfreq(blogs_dfm)[seq_len(15)]
})
unigram_blogs
##      the    glass     sees    truth      and       is creation       it 
##    27361      168       67      275    23453    12745       79    11980 
##     what       we     want       to      see yourself      not 
##     4554     5292     1562    22675     2306      321     6575
# Single-word features of the news sample: build a lower-cased document-
# feature matrix with punctuation and separators removed, then keep the
# document frequency of its first 15 features (positional, not ranked).
unigram_news <- local({
  news_dfm <- dfm(
    sampledataset_news,
    verbose = FALSE,
    toLower = TRUE,
    removePunct = TRUE,
    removeSeparators = TRUE
  )
  docfreq(news_dfm)[seq_len(15)]
})
unigram_news
##      plate        the    poached      pears        and   truffles 
##        106      37282          8          8      26558          7 
##      swept      money        out         an employment department 
##         30        834       3291       5503         66        597 
##       fund          a       risk 
##        205      25474        183
# Single-word features of the Twitter sample: build a lower-cased document-
# feature matrix with punctuation and separators removed, then keep the
# document frequency of its first 15 features (positional, not ranked).
unigram_twitter <- local({
  twitter_dfm <- dfm(
    sampledataset_twitter,
    verbose = FALSE,
    toLower = TRUE,
    removePunct = TRUE,
    removeSeparators = TRUE
  )
  docfreq(twitter_dfm)[seq_len(15)]
})
unigram_twitter
##       hey        if       you       are   feeling     crazy        tk 
##      1269      5204     22079      7410       474       528         6 
##       has       all       the equipment       and   ability     would 
##      2278      5925     36523        23     19524        49      2518 
##      need 
##      2411

2-GRAM

# Two-word features of the blogs sample, joined with a single space. The
# text is lower-cased and stripped of punctuation and separators; the result
# holds the document frequency of the first 15 features (positional, not
# ranked).
bigram_blogs <- local({
  blogs_pairs_dfm <- dfm(
    sampledataset_blogs,
    verbose = FALSE,
    toLower = TRUE,
    removePunct = TRUE,
    removeSeparators = TRUE,
    ngrams = 2,
    concatenator = " "
  )
  docfreq(blogs_pairs_dfm)[seq_len(15)]
})
bigram_blogs
##   the glass  glass sees  sees truth   truth and   and truth    truth is 
##          29           1           1          35           9          53 
## is creation creation it     it sees   sees what     what we     we want 
##           1           3           1           2         199          70 
##     want to      to see     see see 
##        1072         828           1
# Two-word features of the news sample, joined with a single space. The
# text is lower-cased and stripped of punctuation and separators; the result
# holds the document frequency of the first 15 features (positional, not
# ranked).
bigram_news <- local({
  news_pairs_dfm <- dfm(
    sampledataset_news,
    verbose = FALSE,
    toLower = TRUE,
    removePunct = TRUE,
    removeSeparators = TRUE,
    ngrams = 2,
    concatenator = " "
  )
  docfreq(news_pairs_dfm)[seq_len(15)]
})
bigram_news
##             plate the           the poached         poached pears 
##                     3                     2                     1 
##             pears and          and truffles           swept money 
##                     3                     1                     1 
##             money out                out an         an employment 
##                     6                    19                     3 
## employment department       department fund                fund a 
##                     1                     1                     6 
##                a risk       risk management       management fund 
##                    21                     3                     1
# Two-word features of the Twitter sample, joined with a single space. The
# text is lower-cased and stripped of punctuation and separators; the result
# holds the document frequency of the first 15 features (positional, not
# ranked).
bigram_twitter <- local({
  twitter_pairs_dfm <- dfm(
    sampledataset_twitter,
    verbose = FALSE,
    toLower = TRUE,
    removePunct = TRUE,
    removeSeparators = TRUE,
    ngrams = 2,
    concatenator = " "
  )
  docfreq(twitter_pairs_dfm)[seq_len(15)]
})
bigram_twitter
##        hey if        if you       you are   are feeling feeling crazy 
##            10          1578          1128             8             1 
##      crazy tk        tk has       has all       all the the equipment 
##             1             1             6           947             3 
## equipment and   and ability   ability you     you would    would need 
##             2             4             1           147             5

3-GRAM

# Three-word features of the blogs sample, joined with a single space. The
# text is lower-cased and stripped of punctuation and separators; the result
# holds the document frequency of the first 15 features (positional, not
# ranked).
trigram_blogs <- local({
  blogs_triples_dfm <- dfm(
    sampledataset_blogs,
    verbose = FALSE,
    toLower = TRUE,
    removePunct = TRUE,
    removeSeparators = TRUE,
    ngrams = 3,
    concatenator = " "
  )
  docfreq(blogs_triples_dfm)[seq_len(15)]
})
trigram_blogs
##    the glass sees  glass sees truth    sees truth and   truth and truth 
##                 1                 1                 1                 1 
##      and truth is truth is creation    is creation it  creation it sees 
##                 3                 1                 1                 1 
##      it sees what      sees what we      what we want        we want to 
##                 1                 1                 7                45 
##       want to see        to see see  see see yourself 
##                44                 1                 1
# Three-word features of the news sample, joined with a single space. The
# text is lower-cased and stripped of punctuation and separators; the result
# holds the document frequency of the first 15 features (positional, not
# ranked).
trigram_news <- local({
  news_triples_dfm <- dfm(
    sampledataset_news,
    verbose = FALSE,
    toLower = TRUE,
    removePunct = TRUE,
    removeSeparators = TRUE,
    ngrams = 3,
    concatenator = " "
  )
  docfreq(news_triples_dfm)[seq_len(15)]
})
trigram_news
##          plate the poached          the poached pears 
##                          1                          1 
##          poached pears and         pears and truffles 
##                          1                          1 
##            swept money out               money out an 
##                          1                          1 
##          out an employment   an employment department 
##                          2                          1 
## employment department fund          department fund a 
##                          1                          1 
##                fund a risk          a risk management 
##                          1                          1 
##       risk management fund        management fund and 
##                          1                          1 
##                 fund and a 
##                          1
# Three-word features of the Twitter sample, joined with a single space. The
# text is lower-cased and stripped of punctuation and separators; the result
# holds the document frequency of the first 15 features (positional, not
# ranked).
trigram_twitter <- local({
  twitter_triples_dfm <- dfm(
    sampledataset_twitter,
    verbose = FALSE,
    toLower = TRUE,
    removePunct = TRUE,
    removeSeparators = TRUE,
    ngrams = 3,
    concatenator = " "
  )
  docfreq(twitter_triples_dfm)[seq_len(15)]
})
trigram_twitter
##            hey if you            if you are       you are feeling 
##                     2                   122                     6 
##     are feeling crazy      feeling crazy tk          crazy tk has 
##                     1                     1                     1 
##            tk has all           has all the     all the equipment 
##                     1                     4                     1 
##     the equipment and equipment and ability       and ability you 
##                     1                     1                     1 
##     ability you would        you would need        would need for 
##                     1                     1                     1