As a first step, let's load the data and see how many rows each file has.
blog <- read.delim("en_US.blogs.txt", sep = "\t")
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec =
## dec, : EOF within quoted string
dim(blog)
## [1] 444438 1
news <- read.delim("en_US.news.txt", sep = "\t")
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec =
## dec, : EOF within quoted string
dim(news)
## [1] 39642 1
twitter <- read.delim("en_US.twitter.txt", sep = "\t")
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec =
## dec, : embedded nul(s) found in input
dim(twitter)
## [1] 1195014 1
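The EOF and embedded-nul warnings above suggest that read.delim drops or truncates some lines. A minimal sketch of a more robust loading step, assuming the three files sit in the working directory and using a hypothetical read_corpus helper, would read each file as plain lines instead:
# Hypothetical helper: read a corpus file as raw lines.
# skipNul avoids the embedded-nul warning, and reading whole lines
# avoids the quote/EOF problems seen with read.delim.
read_corpus <- function(path) {
  con <- file(path, open = "r", encoding = "UTF-8")
  on.exit(close(con))
  readLines(con, skipNul = TRUE, warn = FALSE)
}
blog_lines    <- read_corpus("en_US.blogs.txt")
news_lines    <- read_corpus("en_US.news.txt")
twitter_lines <- read_corpus("en_US.twitter.txt")
length(twitter_lines)  # number of lines actually read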
temp <- as.data.frame(rbind(cbind("blog", nrow(blog)),
cbind("news", nrow(news)),
cbind("twitter", nrow(twitter))))
names(temp) <- c("File", "Rows")
temp
## File Rows
## 1 blog 444438
## 2 news 39642
## 3 twitter 1195014
# length per row, by file
A <- as.numeric(nchar(as.character(blog[,])))
B <- rep("Blog", length(A))
C1 <- as.data.frame(cbind(A,B))
names(C1) <- c("Length","File")
A <- as.numeric(nchar(as.character(news[,])))
B <- rep("News", length(A))
C2 <- as.data.frame(cbind(A,B))
names(C2) <- c("Length","File")
A <- as.numeric(nchar(as.character(twitter[,])))
B <- rep("Twitter", length(A))
C3 <- as.data.frame(cbind(A,B))
names(C3) <- c("Length","File")
C4 <- rbind(C1, C2, C3)
set.seed(1)
C4$Length <- as.numeric(C4$Length)
n <- sample(1:nrow(C4), 1000)
library(ggplot2)
qplot(Length, data = C4[n,], colour = File)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
C4 <- C4[n,]
dim(C4)
## [1] 1000 2
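To address the stat_bin message above, one could set an explicit binwidth; a sketch on the sampled rows (the value 50 is an arbitrary assumption, not tuned):
# same histogram with an explicit binwidth instead of the default 30 bins
ggplot(C4, aes(x = Length, colour = File)) +
  geom_histogram(binwidth = 50)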
plot(
x = C4$Length,
col = C4$File,
main = "File per row coloured by file"
)
# size in MB
rbind(cbind("blog", round(as.numeric(object.size(blog)) / 1024.00 / 1024.00,1)),
cbind("news", round(as.numeric(object.size(news)) / 1024.00 / 1024.00,1)),
cbind("twitter", round(as.numeric(object.size(twitter)) / 1024.00 / 1024.00,1)))
## [,1] [,2]
## [1,] "blog" "228.7"
## [2,] "news" "17.6"
## [3,] "twitter" "243.7"
Due to my PC's capacity I work with only a sample of 100 rows per file, and I use the wordcloud function to plot word frequencies.
# word cloud
library(wordcloud)
## Loading required package: RColorBrewer
# Blog
temp <- as.character(blog[1:100,])
temp2 <- as.data.frame(table(strsplit(paste(temp, collapse = ""), " ")))
temp3 <- temp2[order(-temp2$Freq),]
temp3[1:20,]
## Var1 Freq
## 2663 the 335
## 2741 to 238
## 207 and 224
## 86 a 212
## 1854 of 166
## 1331 I 128
## 1368 in 116
## 2654 that 91
## 1414 is 72
## 2994 with 71
## 1866 on 69
## 2913 was 66
## 1073 for 61
## 1417 it 56
## 3061 you 49
## 2928 we 47
## 1780 my 45
## 266 as 44
## 1228 have 44
## 338 be 41
print("Blog")
## [1] "Blog"
wordcloud(words = temp3$Var1,
freq = temp3$Freq,
colors = brewer.pal(8, "Dark2"))
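The split above is case- and punctuation-sensitive, so for example "The" and "the" are counted separately. A sketch of a slightly cleaner tokenization, using a hypothetical tokenize helper on the same 100-row blog sample:
# Hypothetical helper: lowercase, strip punctuation, split on whitespace
tokenize <- function(lines) {
  txt <- tolower(paste(lines, collapse = " "))
  txt <- gsub("[[:punct:]]+", " ", txt)
  unlist(strsplit(trimws(txt), "\\s+"))
}
head(sort(table(tokenize(as.character(blog[1:100, ]))), decreasing = TRUE), 20)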
# news
temp <- as.character(news[1:100,])
temp2 <- as.data.frame(table(strsplit(paste(temp, collapse = ""), " ")))
temp3 <- temp2[order(-temp2$Freq),]
temp3[1:20,]
## Var1 Freq
## 3298 the 407
## 3364 to 247
## 177 a 208
## 2262 of 194
## 321 and 191
## 1672 in 156
## 1343 for 87
## 1741 is 72
## 3290 that 72
## 2285 on 65
## 3669 with 57
## 3584 was 55
## 2827 said 50
## 397 at 45
## 1584 his 44
## 1753 it 43
## 1527 have 39
## 1533 he 39
## 474 be 38
## 616 by 37
print("News")
## [1] "News"
wordcloud(words = temp3$Var1,
freq = temp3$Freq,
colors = brewer.pal(8, "Dark2"))
# twitter
temp <- as.character(twitter[1:100,])
temp2 <- as.data.frame(table(strsplit(paste(temp, collapse = ""), " ")))
temp3 <- temp2[order(-temp2$Freq),]
temp3[1:20,]
## Var1 Freq
## 599 the 38
## 630 to 35
## 50 a 21
## 68 and 15
## 244 for 15
## 308 I 14
## 597 that 14
## 317 in 13
## 464 on 12
## 457 of 11
## 328 is 10
## 433 my 10
## 725 your 10
## 101 be 9
## 330 it 9
## 717 you 9
## 82 at 8
## 450 not 8
## 697 will 8
## 26 : 7
print("Twitter")
## [1] "Twitter"
wordcloud(words = temp3$Var1,
freq = temp3$Freq,
colors = brewer.pal(8, "Dark2"))
I omitted lines whose length is above the 99th percentile.
# average number of words per line
#blog
temp <- (as.numeric(lengths(gregexpr("\\W+", as.character(blog[,])))))
blog_words <- round(mean(temp[temp <= quantile(temp, c(0.99))]),2)
temp <- (as.numeric(lengths(gregexpr("\\W+", as.character(news[,])))))
news_words <- round(mean(temp[temp <= quantile(temp, c(0.99))]),2)
temp <- (as.numeric(lengths(gregexpr("\\W+", as.character(twitter[,])))))
twitter_words <- round(mean(temp[temp <= quantile(temp, c(0.99))]),2)
rbind(cbind("Blog", blog_words),
cbind("News", news_words),
cbind("Twitter", twitter_words))
## blog_words
## [1,] "Blog" "41.26"
## [2,] "News" "35.32"
## [3,] "Twitter" "12.81"
I use only a few rows because the files are too large.
X <- as.character(blog[1:50,])
library(ngram)
ng <- ngram(X, n = 2, sep = " ")
X_blog <- as.data.frame(get.phrasetable(ng))
X_blog <- X_blog[order(-X_blog$freq),]
X_blog[1:20,]
## ngrams freq prop
## 1 in the 29 0.0053406998
## 2 of the 21 0.0038674033
## 3 on the 12 0.0022099448
## 4 to the 11 0.0020257827
## 5 with the 11 0.0020257827
## 6 and the 10 0.0018416206
## 7 and I 10 0.0018416206
## 8 of a 8 0.0014732965
## 9 I was 8 0.0014732965
## 10 in a 8 0.0014732965
## 11 I had 7 0.0012891344
## 12 I have 6 0.0011049724
## 13 can be 6 0.0011049724
## 14 that the 6 0.0011049724
## 15 a few 6 0.0011049724
## 16 to be 6 0.0011049724
## 17 but I 5 0.0009208103
## 18 with a 5 0.0009208103
## 19 have a 5 0.0009208103
## 20 of all 5 0.0009208103
library(wordcloud)
print("Blog")
## [1] "Blog"
wordcloud(words = X_blog$ngrams, freq = X_blog$freq, colors = brewer.pal(8, "Dark2"))
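The same call extends directly to higher-order n-grams; a sketch of trigrams, reusing the 50-line blog sample X:
# trigrams on the same 50-line blog sample
ng3 <- ngram(X, n = 3, sep = " ")
head(get.phrasetable(ng3), 10)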
Y <- as.character(news[1:50,])
ng <- ngram(Y, n = 2, sep = " ")
X_news <- as.data.frame(get.phrasetable(ng))
X_news <- X_news[order(-X_news$freq),]
X_news[1:20,]
## ngrams freq prop
## 1 in the 11 0.006376812
## 2 of the 10 0.005797101
## 3 to the 6 0.003478261
## 4 There are 4 0.002318841
## 5 by the 3 0.001739130
## 6 for the 3 0.001739130
## 7 and the 3 0.001739130
## 8 on the 3 0.001739130
## 9 is not 3 0.001739130
## 10 up to 3 0.001739130
## 11 from the 3 0.001739130
## 12 a question 2 0.001159420
## 13 decided to 2 0.001159420
## 14 the situation 2 0.001159420
## 15 is the 2 0.001159420
## 16 for Progress 2 0.001159420
## 17 Maryland is 2 0.001159420
## 18 wants the 2 0.001159420
## 19 said the 2 0.001159420
## 20 to keep 2 0.001159420
print("News")
## [1] "News"
wordcloud(words = X_news$ngrams, freq = X_news$freq, colors = brewer.pal(8, "Dark2"))
Z <- as.character(twitter[1:50,])
ng <- ngram(Z, n = 2, sep = " ")
X_twitter <- as.data.frame(get.phrasetable(ng))
X_twitter <- X_twitter[order(-X_twitter$freq),]
X_twitter[1:20,]
## ngrams freq prop
## 1 in the 11 0.006376812
## 2 of the 10 0.005797101
## 3 to the 6 0.003478261
## 4 There are 4 0.002318841
## 5 by the 3 0.001739130
## 6 for the 3 0.001739130
## 7 and the 3 0.001739130
## 8 on the 3 0.001739130
## 9 is not 3 0.001739130
## 10 up to 3 0.001739130
## 11 from the 3 0.001739130
## 12 a question 2 0.001159420
## 13 decided to 2 0.001159420
## 14 the situation 2 0.001159420
## 15 is the 2 0.001159420
## 16 for Progress 2 0.001159420
## 17 Maryland is 2 0.001159420
## 18 wants the 2 0.001159420
## 19 said the 2 0.001159420
## 20 to keep 2 0.001159420
print("Twitter")
## [1] "Twitter"
wordcloud(words = X_twitter$ngrams, freq = X_twitter$freq, colors = brewer.pal(8, "Dark2"))
As a next step, for the predictive model I think that Random Forest or a similar classifier would not be the best option, since the response variable would be a factor with a very large number of classes (one level per candidate word).
It would be very interesting to build a predictive model based on Markov chains that could update its probabilities and keep learning after each iteration.
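As a rough illustration of that idea (a sketch only, reusing the blog bigram table X_blog computed above, with a hypothetical predict_next helper): the most frequent bigram that starts with a given word yields the predicted next word.
# Toy next-word prediction from the bigram table (Markov-chain flavour):
# pick the most frequent follower of the given word.
predict_next <- function(word, bigrams = X_blog) {
  parts  <- strsplit(trimws(bigrams$ngrams), " ")
  first  <- vapply(parts, `[`, character(1), 1)
  second <- vapply(parts, `[`, character(1), 2)
  cand <- second[first == word]
  if (length(cand) == 0) return(NA_character_)
  cand[1]  # bigrams are already sorted by decreasing frequency
}
predict_next("of")  # likely "the" in this sample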
Reading about the next task… I think that a good and viable solution is to use Kneser–Ney smoothing.
This is a method primarily used to calculate the probability distribution of n-grams in a document based on their histories.
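As a sketch, the interpolated Kneser–Ney estimate for a bigram has the form
$$P_{KN}(w_i \mid w_{i-1}) = \frac{\max\big(c(w_{i-1} w_i) - d,\ 0\big)}{c(w_{i-1})} + \lambda(w_{i-1})\, P_{cont}(w_i),$$
where $d$ is a fixed discount, $\lambda(w_{i-1})$ redistributes the discounted probability mass, and $P_{cont}(w_i)$ is proportional to the number of distinct words that precede $w_i$ in the corpus.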
In statistics and image processing, to smooth a data set is to create an approximating function that attempts to capture important patterns in the data, while leaving out noise or other fine-scale structures/rapid phenomena.