The goal of this project is to demonstrate an understanding of three data sets: news, blogs, and tweets.
library(NLP)
library(tm)
library(RColorBrewer)
library(wordcloud)
library(SnowballC)
library(ggplot2)
library(fpc)
We load the data from [this link][1].
# Directory holding the en_US SwiftKey text files, and the files found in it.
basePath <- '/home/mohamed/R_Coursr_Coursera/CastponeProject/week2_Assignment/Coursera-SwiftKey/final/en_US'
# The dot is escaped and the extension anchored so that only "*.txt" files whose
# name contains "en_" are matched.
flist <- list.files(path = basePath, recursive = TRUE, pattern = "en_.*\\.txt$")
fileNames <- file.path(basePath, flist)
samplePerc <- 0.001 # Sample 0.1% of the entire data sets

# Fix the RNG seed so that the same lines are sampled on every run and the
# figures produced further down are reproducible.
set.seed(1234)

# Read one text file and return a random subset of its lines.
#   path     - path of the file to read.
#   fraction - share of the lines to keep (0.001 keeps 0.1%).
# The full vector of lines only lives inside this call, so it is released as
# soon as the sample has been drawn.
sampleLines <- function(path, fraction) {
  # skipNul = TRUE drops embedded NUL bytes instead of truncating the line at
  # the NUL (with a warning); the content is declared as UTF-8.
  allLines <- readLines(path, encoding = "UTF-8", skipNul = TRUE)
  sample(allLines, round(fraction * length(allLines)), replace = FALSE)
}

# list.files() returns names in alphabetical order, so the positions below
# correspond to the blogs, news and twitter files respectively.
blogss <- sampleLines(fileNames[1], samplePerc)
newss <- sampleLines(fileNames[2], samplePerc)
twitters <- sampleLines(fileNames[3], samplePerc)

# One volatile corpus per source; every sampled line becomes one document.
blogsCorpus <- VCorpus(VectorSource(blogss))
newsCorpus <- VCorpus(VectorSource(newss))
twitterCorpus <- VCorpus(VectorSource(twitters))
# Build one summary record per input file: path, size in MB, number of lines,
# index of the longest line and number of whitespace-separated words.
l <- lapply(file.path(basePath, flist), function(f) {
  fsize <- file.size(f) / 1024 / 1024
  con <- file(f, open = "r")
  # Close the connection even when readLines() fails part-way through.
  on.exit(close(con), add = TRUE)
  lines <- readLines(con)
  # NOTE: this is the 1-based position of the longest line (which.max), not
  # its character count; use max(nchar(lines)) if the length is what is wanted.
  # An empty file has no longest line, so NA keeps the record at five fields.
  longestIdx <- if (length(lines) > 0) which.max(nchar(lines)) else NA_integer_
  nwords <- sum(lengths(strsplit(lines, "\\s+")))
  c(f, format(round(fsize, 2), nsmall = 2), length(lines), longestIdx, nwords)
})
# Every record is a character vector of five fields; stack them row by row.
df <- data.frame(matrix(unlist(l), nrow = length(l), byrow = TRUE))
colnames(df) <- c("file", "size(MB)", "num.of.lines", "longest.line", "num.of.words")
print(df)
## file
## 1 /home/mohamed/R_Coursr_Coursera/CastponeProject/week2_Assignment/Coursera-SwiftKey/final/en_US/en_US.blogs.txt
## 2 /home/mohamed/R_Coursr_Coursera/CastponeProject/week2_Assignment/Coursera-SwiftKey/final/en_US/en_US.news.txt
## 3 /home/mohamed/R_Coursr_Coursera/CastponeProject/week2_Assignment/Coursera-SwiftKey/final/en_US/en_US.twitter.txt
## size(MB) num.of.lines longest.line num.of.words
## 1 200.42 899288 483415 37334131
## 2 196.28 1010242 123628 34372530
## 3 159.36 2360148 26 30373543
# Normalise the text of every document in a tm corpus.
#
# Args:
#   x: a tm corpus (the callers in this file pass VCorpus objects).
#
# Returns:
#   The corpus with lower-cased text, English stop words, punctuation and
#   digits removed, and runs of whitespace collapsed to single blanks.
CorpusClean <- function(x) {
  # Lower-case first: the stop-word list is lower case.
  x <- tm_map(x, content_transformer(tolower))
  # Remove stop words while apostrophes are still present; the English list
  # contains contractions such as "don't" and "i'm", which would no longer
  # match once punctuation has been stripped.
  x <- tm_map(x, removeWords, stopwords("english"))
  x <- tm_map(x, removePunctuation)
  x <- tm_map(x, removeNumbers)
  # Collapse whitespace last so the gaps left by the removals above vanish too.
  x <- tm_map(x, stripWhitespace)
  # Kept from the original pipeline: re-wrap every document as a
  # PlainTextDocument before handing the corpus back.
  x <- tm_map(x, PlainTextDocument)
  x
}
# Run the shared cleaning pipeline over each sampled corpus, replacing the raw
# corpus with its cleaned counterpart. The three calls are independent.
twitterCorpus <- CorpusClean(twitterCorpus)
newsCorpus <- CorpusClean(newsCorpus)
blogsCorpus <- CorpusClean(blogsCorpus)
# Build a tokenizer that splits a document into n-grams of `n` consecutive
# words, each n-gram returned as a single space-separated string.
makeNgramTokenizer <- function(n) {
  force(n) # pin n now so each tokenizer keeps its own n-gram size
  function(x) {
    grams <- ngrams(words(x), n)
    unlist(lapply(grams, paste, collapse = " "), use.names = FALSE)
  }
}

# Tokenizers for single words, word pairs and word triples.
UnigramTokenizer <- makeNgramTokenizer(1)
BigramTokenizer <- makeNgramTokenizer(2)
TrigramTokenizer <- makeNgramTokenizer(3)
# Term-document matrix of `corpus`, with terms produced by `tokenizer`.
buildTdm <- function(corpus, tokenizer) {
  TermDocumentMatrix(corpus, control = list(tokenize = tokenizer))
}

# Blogs: unigram, bigram and trigram matrices.
btdm1 <- buildTdm(blogsCorpus, UnigramTokenizer)
btdm2 <- buildTdm(blogsCorpus, BigramTokenizer)
btdm3 <- buildTdm(blogsCorpus, TrigramTokenizer)

# News: unigram, bigram and trigram matrices.
ntdm1 <- buildTdm(newsCorpus, UnigramTokenizer)
ntdm2 <- buildTdm(newsCorpus, BigramTokenizer)
ntdm3 <- buildTdm(newsCorpus, TrigramTokenizer)

# Twitter: unigram, bigram and trigram matrices.
ttdm1 <- buildTdm(twitterCorpus, UnigramTokenizer)
ttdm2 <- buildTdm(twitterCorpus, BigramTokenizer)
ttdm3 <- buildTdm(twitterCorpus, TrigramTokenizer)
# Tabulate term frequencies and draw a bar plot of the most frequent terms.
#
# Args:
#   theCorpus: any object that as.matrix() converts into a numeric matrix with
#     one row per term; the callers in this file pass TermDocumentMatrix
#     objects.
#
# Returns:
#   A data frame with columns `word` and `freq` (total count across all
#   columns), sorted by decreasing frequency.
showCorpusInfo <- function(theCorpus) {
  m <- as.matrix(theCorpus)
  v <- sort(rowSums(m), decreasing = TRUE)
  d <- data.frame(word = names(v), freq = v)
  # head() keeps at most ten rows; indexing with 1:10 would pad the plot with
  # NA rows whenever fewer than ten terms exist.
  top <- head(d, 10)
  if (nrow(top) > 0) {
    barplot(top$freq, las = 2, names.arg = top$word,
            col = "lightblue", main = "Most frequent words",
            ylab = "Word frequencies")
  }
  d
}
# Draw a word cloud from a term-frequency table.
#
# Args:
#   d: data frame with a `word` column (terms) and a `freq` column (counts),
#     as returned by showCorpusInfo().
#   minf: terms with a frequency below this value are not plotted
#     (default 40, the value previously hard-coded).
#   maxWords: maximum number of terms to plot (default 200, the value
#     previously hard-coded).
#
# Returns:
#   Whatever wordcloud() returns; the function is called for its plot.
createAcloud <- function(d, minf = 40, maxWords = 200) {
  wordcloud(words = d$word, freq = d$freq, min.freq = minf,
            max.words = maxWords, random.order = FALSE, rot.per = 0.35,
            colors = brewer.pal(8, "Dark2"))
}
# For each source (blogs, news, twitter, in that order): plot the top unigrams,
# draw the unigram word cloud, then plot the top bigrams and trigrams.
# d1, d2 and d3 are overwritten on every pass, so after the loop they hold the
# twitter frequency tables.
tdmSets <- list(
  list(btdm1, btdm2, btdm3),
  list(ntdm1, ntdm2, ntdm3),
  list(ttdm1, ttdm2, ttdm3)
)
for (tdmSet in tdmSets) {
  d1 <- showCorpusInfo(tdmSet[[1]])
  createAcloud(d1)
  d2 <- showCorpusInfo(tdmSet[[2]])
  d3 <- showCorpusInfo(tdmSet[[3]])
}
[1]: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip