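The analysis below assumes the following packages are loaded: stringi for the word counts, tm for the corpus tools, RWeka for n-gram tokenization, viridis for the chart palette, plus dplyr and ggplot2.
library(stringi)   # stri_count_words()
library(tm)        # Corpus(), tm_map(); attaches NLP
library(RWeka)     # NGramTokenizer(), Weka_control(); requires a Java runtime
library(viridis)   # viridis() colour palette
library(dplyr)
library(ggplot2)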
# Files to summarise and a container for the raw text
file.list <- c("final/en_US/en_US.blogs.txt", "final/en_US/en_US.news.txt", "final/en_US/en_US.twitter.txt")
text <- list(blogs = "", news = "", twitter = "")
data.summary <- matrix(0, nrow = 3, ncol = 3,
                       dimnames = list(c("blogs", "news", "twitter"),
                                       c("file size, Mb", "lines", "words")))
# Read each file in binary mode (skipNul avoids problems with embedded nulls)
# and record its size, line count and word count
for (i in 1:3) {
  con <- file(file.list[i], "rb")
  text[[i]] <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
  close(con)
  data.summary[i, 1] <- round(file.info(file.list[i])$size / 1024^2, 2)
  data.summary[i, 2] <- length(text[[i]])
  data.summary[i, 3] <- sum(stri_count_words(text[[i]]))
}
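Printing the matrix gives a quick size, line and word comparison of the three files:
data.summary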
# Take a 2% sample of the Twitter lines and keep only lines that convert
# cleanly to ASCII (non-convertible tweets become NA and are dropped)
twitterSample <- sample(text$twitter, 0.02 * length(text$twitter))
dataSample <- iconv(twitterSample, 'UTF-8', 'ASCII')
dataSample <- dataSample[!is.na(dataSample)]
corpus <- Corpus(VectorSource(dataSample))
# Normalise the sampled text: lower case, strip punctuation, numbers,
# extra whitespace and English stop words
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, content_transformer(PlainTextDocument))
corpus <- tm_map(corpus, content_transformer(removePunctuation))
corpus <- tm_map(corpus, content_transformer(removeNumbers))
corpus <- tm_map(corpus, content_transformer(stripWhitespace))
corpus <- tm_map(corpus, removeWords, stopwords("english"))
# Helper that deletes every occurrence of a pattern from the text
removePattern <- content_transformer(function(x, pattern) gsub(pattern, '', x))
corpus <- tm_map(corpus, removePattern, '=')
corpus <- tm_map(corpus, removePattern, "'")
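A quick, optional spot-check of the cleaning step, assuming the corpus object built above; content() is the standard tm accessor for a corpus's text.
# First few cleaned entries: lower case, no punctuation or numbers, stop words removed
head(content(corpus), 3)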
Because the distribution is so concentrated, I split the unigram frequencies into quartiles so that they can be displayed on the same chart at the same scale. There are slightly fewer than 40,000 unique words in the sample; the most common word appears 3,098 times, yet 75% of the words occur fewer than 4 times and fewer than 10% occur more than 10 times.
# Unigram (single-word) frequencies, sorted by decreasing count
unigram <- NGramTokenizer(corpus, Weka_control(min = 1, max = 1))
unigramdf <- data.frame(table(unigram))
unigramdf <- unigramdf[order(-unigramdf$Freq), ]
uniqua <- round((length(unigramdf$Freq) - 4) / 4, 0)   # number of words in one quartile
unimean <- round(mean(unigramdf$Freq) + .5, 0)         # mean frequency, used for the y-axis limit
summary(unigramdf)
## unigram Freq
## = : 1 Min. : 1.000
## aa : 1 1st Qu.: 1.000
## aaa : 1 Median : 1.000
## aaahhhhhhhhhhhhhh: 1 Mean : 8.726
## aaayyyeee : 1 3rd Qu.: 3.000
## aac : 1 Max. :3098.000
## (Other) :38514
# Plot the four quartiles of the unigram frequency distribution on the same scale
plot(unigramdf$Freq[1:uniqua], type = "l", ylim = c(0, unimean * 2),
     xlab = "", main = "Single word distribution", col = 1)
for (ii in 1:3) {
  lines(unigramdf$Freq[(ii * uniqua):((ii * uniqua) + uniqua)], col = ii + 1)
}
legend(uniqua * .5, unimean * 2, legend = c("1st Quart", "2nd", "3rd", "4th"),
       col = c(1, 2, 3, 4), lty = 1)
# Bigram (word-pair) frequencies, sorted by decreasing count
bigramdf <- data.frame(table(NGramTokenizer(corpus, Weka_control(min = 2, max = 2))))
bigramdf <- bigramdf[order(-bigramdf$Freq), ]
summary(bigramdf)
## Var1 Freq
## = en : 1 Min. : 1.000
## aa commit : 1 1st Qu.: 1.000
## aa discussion: 1 Median : 1.000
## aa first : 1 Mean : 1.261
## aa oh : 1 3rd Qu.: 1.000
## aa way : 1 Max. :317.000
## (Other) :266452
# Top 25 bigrams
color_pal <- viridis(5)   # five-colour palette, recycled across the 25 bars
barplot(bigramdf$Freq[1:25], las = 2,
        col = color_pal, cex.names = 0.9,
        names.arg = bigramdf$Var1[1:25])
# Trigram (three-word) frequencies
trigramdf <- data.frame(table(NGramTokenizer(corpus, Weka_control(min = 3, max = 3))))
summary(trigramdf)
## Var1 Freq
## aa commit street : 1 Min. : 1.000
## aa discussion obama : 1 1st Qu.: 1.000
## aa first time : 1 Median : 1.000
## aa oh man : 1 Mean : 1.016
## aa way operationalizemeasure: 1 3rd Qu.: 1.000
## aaa open pm : 1 Max. :67.000
## (Other) :330753
trigramdf <- trigramdf[order(-trigramdf$Freq), ]
density(trigramdf$Freq)   # kernel density estimate of the trigram frequencies
##
## Call:
## density.default(x = trigramdf$Freq)
##
## Data: trigramdf$Freq (330759 obs.); Bandwidth 'bw' = 0.01925
##
## x y
## Min. : 0.9423 Min. : 0.00000
## 1st Qu.:17.4711 1st Qu.: 0.00000
## Median :34.0000 Median : 0.00000
## Mean :34.0000 Mean : 0.04058
## 3rd Qu.:50.5289 3rd Qu.: 0.00000
## Max. :67.0577 Max. :11.70140
# Top 25 trigrams
barplot(trigramdf$Freq[1:25], las = 2,
        col = color_pal, cex.names = 0.9,
        names.arg = trigramdf$Var1[1:25])
### Conclusion:
Word usage on Twitter is extremely concentrated: the most common 1% of words account for 65% of all word instances.
What are the frequencies of 2-grams and 3-grams in the dataset? Also very concentrated: the most frequent pair of words in my sample occurs 317 times, yet more than 75% of the pairs occur only once, and the trigram counts are sparser still (maximum 67, mean about 1.02).
How many unique words do you need in a frequency-sorted dictionary to cover 50% of all word instances in the language? Less than 1% of the unique words.
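A minimal sketch of how these coverage figures can be checked from the unigram table built above (it assumes unigramdf is still sorted by decreasing Freq):
# Cumulative share of word instances covered by the most frequent words
coverage <- cumsum(unigramdf$Freq) / sum(unigramdf$Freq)
# Number of unique words, and share of the vocabulary, needed for 50% and 90% coverage
n50 <- which(coverage >= 0.5)[1]
n90 <- which(coverage >= 0.9)[1]
c(words_for_50pct = n50, vocab_share = n50 / nrow(unigramdf))
c(words_for_90pct = n90, vocab_share = n90 / nrow(unigramdf))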
We will use WordNet to flag predictions that come from a foreign language.
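A rough sketch of that idea follows; the wordnet package wraps a local WordNet installation through rJava, and the dictionary path and filter calls below are assumptions to be checked against the package documentation.
library(wordnet)                        # assumes rJava and a local WordNet install
setDict("/usr/share/wordnet/dict")      # path is an assumption; adjust to your system
# TRUE if WordNet has an entry for the word under any part of speech;
# predictions failing this test are candidate foreign-language terms
inWordNet <- function(word) {
  any(sapply(c("NOUN", "VERB", "ADJECTIVE", "ADVERB"), function(pos) {
    filter <- getTermFilter("ExactMatchFilter", word, TRUE)
    length(getIndexTerms(pos, 1, filter)) > 0
  }))
}
inWordNet("house")    # expected TRUE
inWordNet("maison")   # expected FALSE for a French word absent from WordNet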
Clearly, to increase meaningful coverage we will need to build our own set of stop words.
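For illustration, a hypothetical custom stop-word list could be applied with the same tm machinery used above; the words below are placeholders, not derived from this sample.
# Hypothetical Twitter-specific filler words, removed on top of stopwords("english")
customStops <- c("rt", "u", "im", "lol", "dont")
corpus <- tm_map(corpus, removeWords, customStops)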
The next step is the creation of a model that predicts the next word from the words entered so far. The algorithm will be trained on a much larger sample of the data than the one used for this exploratory report.
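As a minimal sketch of the planned approach, the tables built above can already drive a simple frequency-based backoff: look the last two words up in trigramdf, and fall back to bigramdf on the last word alone (Var1 holds the space-separated n-gram, Freq its count). This is an illustration, not the final model.
# Simple frequency backoff: trigrams on the last two words, then bigrams on the last word
predictNext <- function(phrase, n = 3) {
  words <- unlist(strsplit(tolower(phrase), "\\s+"))
  lookup <- function(df, prefix) {
    grams <- as.character(df$Var1)
    hits  <- df[startsWith(grams, paste0(prefix, " ")), ]
    hits  <- hits[order(-hits$Freq), ]
    # keep only the word that follows the prefix
    substring(as.character(hits$Var1), nchar(prefix) + 2)
  }
  if (length(words) >= 2) {
    cand <- lookup(trigramdf, paste(tail(words, 2), collapse = " "))
    if (length(cand) > 0) return(head(cand, n))
  }
  head(lookup(bigramdf, tail(words, 1)), n)
}
predictNext("happy new")   # up to 3 candidate next words from this sample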