The goals of this assignment are to: 1. show that the data has been downloaded; 2. create a basic summary report; 3. report any interesting findings; 4. outline a plan for the n-gram model.
library(tm)
## Loading required package: NLP
library(stringi)
library(slam)
library(RWeka)
## Warning: package 'RWeka' was built under R version 3.4.1
# Work from the project directory so the relative paths below resolve.
# NOTE(review): this absolute path is machine-specific; adjust it when running
# on another machine.
setwd("C:\\Users\\suman\\Desktop\\datasciencecoursera\\capstone")

# Location of the corpus archive. The original script used `Url` without ever
# defining it, so the first run (no local zip yet) failed in download.file().
# NOTE(review): this is the usual course dataset address -- confirm it is the
# one intended for this assignment.
Url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"

# Fetch the archive only when no local copy exists; mode = "wb" writes the
# zip in binary mode.
if (!file.exists("./Coursera-SwiftKey.zip")) {
  download.file(Url, destfile = "./Coursera-SwiftKey.zip", mode = "wb")
}

# Extract only when the "final" directory is not there yet.
if (!file.exists("./final")) {
  unzip(zipfile = "./Coursera-SwiftKey.zip", exdir = ".")
}
# Read one corpus file into a character vector, one element per line.
#
# The connection is opened in binary mode ("rb"). The recorded run of this
# script, which used text mode ("r"), emitted an "incomplete final line"
# warning for the news file and reported only 77259 lines for it; reading in
# text mode on Windows can stop early at an embedded control character, and
# binary mode avoids that.
#
# path: path to a UTF-8 text file.
# Returns the lines of the file, with embedded NULs skipped.
readCorpusLines <- function(path) {
  con <- file(path, "rb")
  # Close the connection even if readLines() fails.
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

lineTwitter <- readCorpusLines("./final/en_US/en_US.twitter.txt")
lineBlogs <- readCorpusLines("./final/en_US/en_US.blogs.txt")
lineNews <- readCorpusLines("./final/en_US/en_US.news.txt")
# Summary table for the three corpus files: size on disk (bytes), number of
# lines read, and number of words.
#
# wordCount previously used stri_stats_general(x)[1], which is the line count,
# so the table reported the same number for lines and words. Words are now
# counted with stri_count_words() and summed over all lines.
fileSummary <- data.frame(
  fileName = c("Twitter", "Blogs", "News"),
  # file.info() is vectorised over paths; sizes are whole bytes, so no
  # rounding is needed.
  fileSize = file.info(c(
    "./final/en_US/en_US.twitter.txt",
    "./final/en_US/en_US.blogs.txt",
    "./final/en_US/en_US.news.txt"
  ))$size,
  lineCount = c(length(lineTwitter), length(lineBlogs), length(lineNews)),
  wordCount = c(
    sum(stri_count_words(lineTwitter)),
    sum(stri_count_words(lineBlogs)),
    sum(stri_count_words(lineNews))
  )
)
fileSummary
# NOTE(review): the output recorded with the earlier version showed fileSize
# as NA and wordCount identical to lineCount; it was removed as stale. Re-run
# to regenerate the table, and confirm the file paths resolve if fileSize is
# still NA.
# Draw 10000 random lines from each source and combine them into one corpus.
# A fixed seed makes the sample -- and every frequency table and plot derived
# from it -- reproducible between runs.
set.seed(12345)
sampleTwitter <- sample(lineTwitter, 10000)
sampleBlogs <- sample(lineBlogs, 10000)
sampleNews <- sample(lineNews, 10000)
testsample <- c(sampleTwitter, sampleBlogs, sampleNews)

# One document per sampled line.
cleanSample <- VCorpus(VectorSource(testsample))

# The intermediate vectors are no longer needed once the corpus exists.
rm(sampleBlogs, sampleNews, sampleTwitter, testsample)
# ---- Text-cleaning helpers -------------------------------------------------
# Each helper takes a character vector and returns it with the matched text
# deleted.

# Drop anything that starts with "http" up to the next whitespace.
urlremove <- function(x) gsub("http[^[:space:]]*", "", x)
# Drop punctuation characters. "+" instead of "*" gives the same result
# without also matching the empty string at every position.
specialcharremove <- function(x) gsub("[[:punct:]]+", "", x)
# Drop "#tag" and "@handle" tokens.
# NOTE(review): these two helpers are defined but never applied in this
# script. If they are meant to be used, they must run BEFORE punctuation is
# stripped, otherwise the "#" / "@" markers are already gone.
hashtagremove <- function(x) gsub("#\\S+", "", x)
twitterremove <- function(x) gsub("@\\S+", "", x)
# NOTE(review): defined but not used anywhere in the visible script.
mystopword <- c("a", "an", "the")

# ---- Cleaning pipeline -----------------------------------------------------
# Order matters: URLs and stop words are removed while punctuation is still
# present. The English stop-word list contains contractions such as "don't"
# and "i'm"; the earlier order stripped punctuation first, which turned them
# into "dont" / "im" and let them slip past the stop-word filter.
cleanSample <- tm_map(cleanSample, content_transformer(tolower))
cleanSample <- tm_map(cleanSample, content_transformer(urlremove))
cleanSample <- tm_map(cleanSample, removeWords, stopwords("english"))
cleanSample <- tm_map(cleanSample, removeWords, c("twitter"))
# removePunctuation / removeNumbers are already tm transformations, so they
# are passed to tm_map() directly.
cleanSample <- tm_map(cleanSample, removePunctuation)
cleanSample <- tm_map(cleanSample, content_transformer(specialcharremove))
cleanSample <- tm_map(cleanSample, removeNumbers)
# Collapse the gaps left behind by the removals above.
cleanSample <- tm_map(cleanSample, stripWhitespace)
# Unigram term-document matrix, keeping terms of 1 to 10 characters.
# The earlier version also required each term to occur in at least 5000
# documents (bounds$global); no term met that bar, so the matrix was empty and
# findFreqTerms() printed NULL. The bound has been removed.
tdm1 <- TermDocumentMatrix(cleanSample, control = list(wordLengths = c(1, 10)))

# Terms that occur at least 1000 times across the sample.
findFreqTerms(tdm1, 1000)

# Total count per term, summed over the sparse matrix without densifying it.
freq <- sort(row_sums(tdm1, na.rm = TRUE), decreasing = TRUE)
tdm1freq <- data.frame(word = names(freq), freq = freq)
# Keep the ten most frequent terms; head() avoids NA rows if fewer exist.
tdm1freq <- head(tdm1freq, 10)
# Generic tokenizer: split x into n-grams of exactly n words.
ngramTokenizer <- function(x, n) {
  NGramTokenizer(x, Weka_control(min = n, max = n))
}

# Two-word tokenizer, passed below as the `tokenize` control.
BigramTokenizer <- function(x) {
  ngramTokenizer(x, 2)
}

# Bigram term-document matrix over the cleaned sample.
tdm2 <- TermDocumentMatrix(
  cleanSample,
  control = list(tokenize = BigramTokenizer)
)

# Collapse the document dimension, then total and rank the bigrams.
freq <- sort(
  rowSums(as.matrix(rollup(tdm2, 2, FUN = sum)), na.rm = TRUE),
  decreasing = TRUE
)

# Frequency table restricted to bigrams seen more than 10 times.
tdm2freq <- data.frame(word = names(freq), freq = freq)
tdm2freq <- tdm2freq[tdm2freq$freq > 10, ]
# Three-word tokenizer, passed below as the `tokenize` control.
trigramTokenizer <- function(x) {
  NGramTokenizer(x, Weka_control(max = 3, min = 3))
}

# Trigram term-document matrix over the cleaned sample.
tdm3 <- TermDocumentMatrix(
  cleanSample,
  control = list(tokenize = trigramTokenizer)
)

# Collapse the document dimension, then total and rank the trigrams.
freq <- sort(
  rowSums(as.matrix(rollup(tdm3, 2, FUN = sum)), na.rm = TRUE),
  decreasing = TRUE
)

# Frequency table restricted to trigrams seen more than 9 times.
tdm3freq <- data.frame(word = names(freq), freq = freq)
tdm3freq <- tdm3freq[tdm3freq$freq > 9, ]
# Bar charts of the most frequent unigrams, bigrams and trigrams.
# The statements were previously fused onto a single line, used typographic
# quotes around "identity", and ended with a stray code fence, so the line
# could not be parsed as R.
library(ggplot2)

# Unigrams: the rows kept in tdm1freq.
a <- as.data.frame(tdm1freq)
ggplot(a, aes(word, freq)) +
  geom_bar(stat = "identity")

# Bigrams seen more than 200 times.
a <- as.data.frame(subset(tdm2freq, freq > 200))
ggplot(a, aes(word, freq)) +
  geom_bar(stat = "identity")

# Trigrams seen more than 50 times.
a <- as.data.frame(subset(tdm3freq, freq > 50))
ggplot(a, aes(word, freq)) +
  geom_bar(stat = "identity")