Number of rows per file

As a first step, let's load the data and see how many rows each file has.

blog <- read.delim("en_US.blogs.txt", sep = "\t")
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec =
## dec, : EOF within quoted string
dim(blog)
## [1] 444438      1
news <- read.delim("en_US.news.txt", sep = "\t")
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec =
## dec, : EOF within quoted string
dim(news)
## [1] 39642     1
twitter <- read.delim("en_US.twitter.txt", sep = "\t")
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec =
## dec, : embedded nul(s) found in input
dim(twitter)
## [1] 1195014       1
temp <- as.data.frame(rbind(cbind("blog", nrow(blog)),
                    cbind("news", nrow(news)),
                    cbind("twitter", nrow(twitter))))
names(temp) <- c("File", "Rows")
temp
##      File    Rows
## 1    blog  444438
## 2    news   39642
## 3 twitter 1195014
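
The EOF and embedded-nul warnings above come from read.delim's quote handling, which can also merge lines. As an alternative (a sketch, assuming the same files sit in the working directory), readLines counts the raw lines directly; its counts may differ from the table above for exactly that reason:

# sketch: read the raw lines, skipping embedded nuls and ignoring quoting
blog_raw    <- readLines("en_US.blogs.txt",   skipNul = TRUE)
news_raw    <- readLines("en_US.news.txt",    skipNul = TRUE)
twitter_raw <- readLines("en_US.twitter.txt", skipNul = TRUE)

data.frame(File = c("blog", "news", "twitter"),
           Rows = c(length(blog_raw), length(news_raw), length(twitter_raw)))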

Mean length per line, per file

# length (in characters) per row, per file
A <- nchar(as.character(blog[,]))
C1 <- data.frame(Length = A, File = "Blog")

A <- nchar(as.character(news[,]))
C2 <- data.frame(Length = A, File = "News")

A <- nchar(as.character(twitter[,]))
C3 <- data.frame(Length = A, File = "Twitter")

C4 <- rbind(C1, C2, C3)

set.seed(1)
n <- sample(1:nrow(C4), 1000)

library(ggplot2)
qplot(Length, data = C4[n,], colour = File)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

C4 <- C4[n,]
dim(C4)
## [1] 1000    2
plot(
  x = C4$Length,
  col = as.factor(C4$File),   # factor so each file maps to its own colour
  main = "Line length per row, coloured by file"
)

Size of each file in memory (MB)

# object size in MB
rbind(cbind("blog", round(as.numeric(object.size(blog)) / 1024.00 / 1024.00,1)),
      cbind("news", round(as.numeric(object.size(news)) / 1024.00 / 1024.00,1)),
      cbind("twitter", round(as.numeric(object.size(twitter)) / 1024.00 / 1024.00,1)))
##      [,1]      [,2]   
## [1,] "blog"    "228.7"
## [2,] "news"    "17.6" 
## [3,] "twitter" "243.7"

Most common words in each file

Because of my PC's limited capacity, I use only the first 100 lines of each file. I chose the wordcloud function to plot word frequencies.

# word cloud

library(wordcloud)
## Loading required package: RColorBrewer
# Blog: count word frequencies in the first 100 lines
temp <- as.character(blog[1:100,])
# collapse with a space so words at line boundaries are not glued together
temp2 <- as.data.frame(table(strsplit(paste(temp, collapse = " "), " ")))
temp3 <- temp2[order(-temp2$Freq),]
temp3[1:20,]
##      Var1 Freq
## 2663  the  335
## 2741   to  238
## 207   and  224
## 86      a  212
## 1854   of  166
## 1331    I  128
## 1368   in  116
## 2654 that   91
## 1414   is   72
## 2994 with   71
## 1866   on   69
## 2913  was   66
## 1073  for   61
## 1417   it   56
## 3061  you   49
## 2928   we   47
## 1780   my   45
## 266    as   44
## 1228 have   44
## 338    be   41
print("Blog")
## [1] "Blog"
wordcloud(words = temp3$Var1,
          freq = temp3$Freq,
          colors = brewer.pal(8, "Dark2"))

# News: same word count for the first 100 lines
temp <- as.character(news[1:100,])
temp2 <- as.data.frame(table(strsplit(paste(temp, collapse = " "), " ")))
temp3 <- temp2[order(-temp2$Freq),]
temp3[1:20,]
##      Var1 Freq
## 3298  the  407
## 3364   to  247
## 177     a  208
## 2262   of  194
## 321   and  191
## 1672   in  156
## 1343  for   87
## 1741   is   72
## 3290 that   72
## 2285   on   65
## 3669 with   57
## 3584  was   55
## 2827 said   50
## 397    at   45
## 1584  his   44
## 1753   it   43
## 1527 have   39
## 1533   he   39
## 474    be   38
## 616    by   37
print("News")
## [1] "News"
wordcloud(words = temp3$Var1,
          freq = temp3$Freq,
          colors = brewer.pal(8, "Dark2"))

# Twitter: same word count for the first 100 lines
temp <- as.character(twitter[1:100,])
temp2 <- as.data.frame(table(strsplit(paste(temp, collapse = " "), " ")))
temp3 <- temp2[order(-temp2$Freq),]
temp3[1:20,]
##     Var1 Freq
## 599  the   38
## 630   to   35
## 50     a   21
## 68   and   15
## 244  for   15
## 308    I   14
## 597 that   14
## 317   in   13
## 464   on   12
## 457   of   11
## 328   is   10
## 433   my   10
## 725 your   10
## 101   be    9
## 330   it    9
## 717  you    9
## 82    at    8
## 450  not    8
## 697 will    8
## 26     :    7
print("Twitter")
## [1] "Twitter"
wordcloud(words = temp3$Var1,
          freq = temp3$Freq,
          colors = brewer.pal(8, "Dark2"))

Average number of words per line

I omitted lines whose word count is above the 99th percentile.

# average words per line

# blog: approximate the word count by the number of non-word (\W+) runs
temp <- as.numeric(lengths(gregexpr("\\W+", as.character(blog[,]))))
blog_words <- round(mean(temp[temp <= quantile(temp, c(0.99))]),2)

temp <- (as.numeric(lengths(gregexpr("\\W+", as.character(news[,])))))
news_words <- round(mean(temp[temp <= quantile(temp, c(0.99))]),2)

temp <- (as.numeric(lengths(gregexpr("\\W+", as.character(twitter[,])))))
twitter_words <- round(mean(temp[temp <= quantile(temp, c(0.99))]),2)

rbind(cbind("Blog", blog_words),
      cbind("News", news_words),
      cbind("Twitter", twitter_words))
##                blog_words
## [1,] "Blog"    "41.26"   
## [2,] "News"    "35.32"   
## [3,] "Twitter" "12.81"

Analysis of 2-word n-grams (bigrams)

I use only the first 50 rows of each file because the full files are too large.

X <- as.character(blog[1:50,])
library(ngram)
ng <- ngram(X, n = 2, sep = " ")
X_blog <- as.data.frame(get.phrasetable(ng))
X_blog <- X_blog[order(-X_blog$freq),]
X_blog[1:20,]
##       ngrams freq         prop
## 1    in the    29 0.0053406998
## 2    of the    21 0.0038674033
## 3    on the    12 0.0022099448
## 4    to the    11 0.0020257827
## 5  with the    11 0.0020257827
## 6   and the    10 0.0018416206
## 7     and I    10 0.0018416206
## 8      of a     8 0.0014732965
## 9     I was     8 0.0014732965
## 10     in a     8 0.0014732965
## 11    I had     7 0.0012891344
## 12   I have     6 0.0011049724
## 13   can be     6 0.0011049724
## 14 that the     6 0.0011049724
## 15    a few     6 0.0011049724
## 16    to be     6 0.0011049724
## 17    but I     5 0.0009208103
## 18   with a     5 0.0009208103
## 19   have a     5 0.0009208103
## 20   of all     5 0.0009208103
library(wordcloud)
print("Blog")
## [1] "Blog"
wordcloud(words = X_blog$ngrams, freq = X_blog$freq, colors = brewer.pal(8, "Dark2"))

Y <- as.character(news[1:50,])
ng <- ngram(Y, n = 2, sep = " ")
X_news <- as.data.frame(get.phrasetable(ng))
X_news <- X_news[order(-X_news$freq),]
X_news[1:20,]
##            ngrams freq        prop
## 1         in the    11 0.006376812
## 2         of the    10 0.005797101
## 3         to the     6 0.003478261
## 4      There are     4 0.002318841
## 5         by the     3 0.001739130
## 6        for the     3 0.001739130
## 7        and the     3 0.001739130
## 8         on the     3 0.001739130
## 9         is not     3 0.001739130
## 10         up to     3 0.001739130
## 11      from the     3 0.001739130
## 12    a question     2 0.001159420
## 13    decided to     2 0.001159420
## 14 the situation     2 0.001159420
## 15        is the     2 0.001159420
## 16  for Progress     2 0.001159420
## 17   Maryland is     2 0.001159420
## 18     wants the     2 0.001159420
## 19      said the     2 0.001159420
## 20       to keep     2 0.001159420
print("News")
## [1] "News"
wordcloud(words = X_news$ngrams, freq = X_news$freq, colors = brewer.pal(8, "Dark2"))

Z <- as.character(twitter[1:50,])
ng <- ngram(Z, n = 2, sep = " ")
X_twitter <- as.data.frame(get.phrasetable(ng))
X_twitter <- X_twitter[order(-X_twitter$freq),]
X_twitter[1:20,]
print("Twitter")
## [1] "Twitter"
wordcloud(words = X_twitter$ngrams, freq = X_twitter$freq, colors = brewer.pal(8, "Dark2"))

Prediction: next steps

As a next step, for the predictive model, I don't think Random Forest or a similar classifier would be the best option, since the response variable would be a factor with a very large number of classes (one level per candidate word).

It would be very interesting to build a predictive model with Markov chains that can update their transition probabilities and keep learning after each iteration.
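
As a rough illustration of that idea, here is a minimal sketch of a first-order Markov (bigram) next-word predictor. It assumes a phrasetable like the X_blog object built above; predict_next is my own illustrative helper, not a package function:

# minimal sketch: first-order Markov (bigram) next-word predictor;
# phrasetable is a get.phrasetable() result with columns ngrams and freq
predict_next <- function(word, phrasetable) {
  # keep bigrams whose first token is `word` (assumes `word` has no regex characters)
  hits <- phrasetable[grepl(paste0("^", word, " "), phrasetable$ngrams), ]
  if (nrow(hits) == 0) return(NA_character_)
  best <- hits$ngrams[which.max(hits$freq)]  # most frequent matching bigram
  strsplit(trimws(best), " ")[[1]][2]        # return its second token
}

predict_next("in", X_blog)  # with the table above this should return "the"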

Reading about the next task, I think a great and viable solution is to use Kneser–Ney smoothing.

This is a method primarily used to calculate the probability distribution of n-grams in a document based on their histories.
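
To make the method concrete, here is a minimal sketch of the interpolated Kneser–Ney estimate for bigrams. kn_prob is my own illustrative function; it assumes every element of bigrams is exactly two space-separated tokens, and uses a fixed discount d = 0.75:

# minimal sketch of interpolated Kneser-Ney for bigrams;
# `bigrams` is a character vector with one entry per observed bigram, e.g. c("in the", "of the", ...)
kn_prob <- function(w1, w2, bigrams, d = 0.75) {
  parts <- do.call(rbind, strsplit(bigrams, " "))   # two-column matrix of (first, second) words
  # continuation probability: distinct left contexts of w2 over all distinct bigram types
  p_cont <- length(unique(parts[parts[, 2] == w2, 1])) / nrow(unique(parts))
  c_w1 <- sum(parts[, 1] == w1)                     # count of w1 as a first word
  if (c_w1 == 0) return(p_cont)                     # unseen history: fall back to continuation
  c_big <- sum(parts[, 1] == w1 & parts[, 2] == w2) # count of the bigram (w1 w2)
  lambda <- d * length(unique(parts[parts[, 1] == w1, 2])) / c_w1  # interpolation weight
  max(c_big - d, 0) / c_w1 + lambda * p_cont        # discounted MLE plus smoothed back-off
}

# expand the phrasetable back into one entry per observed bigram before calling it
kn_prob("in", "the", rep(trimws(X_blog$ngrams), X_blog$freq))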

In statistics and image processing, to smooth a data set is to create an approximating function that attempts to capture important patterns in the data, while leaving out noise or other fine-scale structures/rapid phenomena.
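
As a toy illustration of smoothing in general (unrelated to the text data), base R's lowess fits such an approximating curve through noisy points:

# toy example: recover a smooth trend from noisy observations
set.seed(2)
x <- seq(0, 10, length.out = 200)
y <- sin(x) + rnorm(200, sd = 0.3)     # underlying pattern plus noise
plot(x, y, col = "grey", main = "Smoothing a noisy series")
lines(lowess(x, y, f = 0.2), lwd = 2)  # smoothed approximating curve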