This project builds a next-word prediction model from three English text sources: Twitter, news, and blogs. As the user types, the application should suggest the most likely next word. This part of the project covers processing the input data, building phrases of words (n-grams), and an exploratory analysis.
The goals of this assignment are to: 1. show that the data has been downloaded; 2. create a basic summary report; 3. show any interesting findings; 4. create a plan for the n-gram model.
library(tm)
## Loading required package: NLP
library(stringi)
library(slam)
library(RWeka)
## Warning: package 'RWeka' was built under R version 3.4.1
setwd("C:\\Users\\suman\\Desktop\\datasciencecoursera\\capstone")
if(!file.exists("./Coursera-SwiftKey.zip")){
download.file(Url,destfile="./Coursera-SwiftKey.zip",mode = "wb")
}
if(!file.exists("./final")){
unzip(zipfile="./Coursera-SwiftKey.zip",exdir=".")
}
con<-file("./final/en_US/en_US.twitter.txt", "r")
lineTwitter<-readLines(con,skipNul = T)
close(con)
con <- file("./final/en_US/en_US.blogs.txt", "r")
lineBlogs<-readLines(con, skipNul = T)
close(con)
con <- file("./final/en_US/en_US.news.txt", "r")
lineNews<-readLines(con, skipNul = T)
## Warning in readLines(con, skipNul = T): incomplete final line found on './
## final/en_US/en_US.news.txt'
close(con)
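The "incomplete final line" warning above comes from reading en_US.news.txt through a text-mode connection; the file is known to contain a stray control character that text mode can treat as end-of-file, so the read may also stop early. A minimal workaround, assuming the same file layout, is to open the connection in binary mode:
con <- file("./final/en_US/en_US.news.txt", "rb")   # binary mode avoids text-mode EOF handling
lineNews <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)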
fileSummary <- data.frame(
  fileName = c("Twitter", "Blogs", "News"),
  fileSize = c(round(file.info("./final/en_US/en_US.twitter.txt")$size / 1024^2, digits = 2),
               round(file.info("./final/en_US/en_US.blogs.txt")$size / 1024^2, digits = 2),
               round(file.info("./final/en_US/en_US.news.txt")$size / 1024^2, digits = 2)),  # size in MB
  lineCount = c(length(lineTwitter), length(lineBlogs), length(lineNews)),
  wordCount = c(sum(stri_count_words(lineTwitter)), sum(stri_count_words(lineBlogs)),
                sum(stri_count_words(lineNews)))  # stri_count_words() counts words per line
)
fileSummary
## fileName fileSize lineCount wordCount
## 1 Twitter NA 2360148 2360148
## 2 Blogs NA 899288 899288
## 3 News NA 77259 77259
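For goal 3, one quick additional statistic (an illustrative addition, not part of the summary above) is the length of the longest line per source, which hints at how different the three registers are; tweets, for example, were limited to 140 characters when this corpus was collected, while blog entries can run far longer:
sapply(list(Twitter = lineTwitter, Blogs = lineBlogs, News = lineNews),
       function(x) max(nchar(x)))   # longest line, in characters, per source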
set.seed(1234)  # make the random sample reproducible
sampleTwitter <- lineTwitter[sample(1:length(lineTwitter), 5000)]
sampleBlogs <- lineBlogs[sample(1:length(lineBlogs), 5000)]
sampleNews <- lineNews[sample(1:length(lineNews), 5000)]
testsample <- c(sampleTwitter, sampleBlogs, sampleNews)
cleanSample <- VCorpus(VectorSource(testsample))
rm(sampleTwitter, sampleBlogs, sampleNews)  # free memory; the samples now live in the corpus
# Twitter-specific cleaners; these must run before punctuation is removed,
# otherwise the "http", "#" and "@" markers they match are already gone
urlremove <- function(x) gsub("http[^[:space:]]*", "", x)
hashtagremove <- function(x) gsub("#\\S+", "", x)
twitterremove <- function(x) gsub("@\\S+", "", x)
specialcharremove <- function(x) gsub("[[:punct:]]+", "", x)

cleanSample <- tm_map(cleanSample, content_transformer(tolower))
cleanSample <- tm_map(cleanSample, content_transformer(urlremove))
cleanSample <- tm_map(cleanSample, content_transformer(hashtagremove))
cleanSample <- tm_map(cleanSample, content_transformer(twitterremove))
cleanSample <- tm_map(cleanSample, removeWords, c("twitter"))
cleanSample <- tm_map(cleanSample, content_transformer(removePunctuation))
cleanSample <- tm_map(cleanSample, content_transformer(removeNumbers))
cleanSample <- tm_map(cleanSample, content_transformer(specialcharremove))
cleanSample <- tm_map(cleanSample, removeWords, stopwords("english"))  # includes "a", "an", "the"
cleanSample <- tm_map(cleanSample, stripWhitespace)
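A quick sanity check on the cleaning pipeline (an illustrative addition): printing one document from the corpus shows the processed text with URLs, punctuation, numbers, and stop words stripped out.
as.character(cleanSample[[1]])   # inspect the first cleaned document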
# Keep terms of 1-10 characters that appear in at least 500 of the sampled documents
tdm1 <- TermDocumentMatrix(cleanSample, control = list(wordLengths = c(1, 10),
                                                       bounds = list(global = c(500, Inf))))
findFreqTerms(tdm1,50)
## [1] "also" "can" "day" "first" "get" "good" "im"
## [8] "just" "know" "last" "like" "new" "now" "one"
## [15] "people" "said" "time" "us" "will"
# Collapse the term-document matrix over documents to get a total frequency per term
freq <- sort(rowSums(as.matrix(rollup(tdm1, 2, FUN = sum)), na.rm = TRUE), decreasing = TRUE)
tdm1freq <- data.frame(word = names(freq), freq = freq)
tdm1freq <- tdm1freq[1:10, ]  # ten most frequent unigrams
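An added illustration (not in the original analysis): how many of these frequent terms account for half of all their occurrences? Note that tdm1 only kept terms appearing in 500+ documents, so this understates the vocabulary needed over the full corpus.
coverage <- cumsum(freq) / sum(freq)   # freq is already sorted, most frequent first
which(coverage >= 0.5)[1]  # terms covering 50% of the retained occurrences
which(coverage >= 0.9)[1]  # terms covering 90%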
ngramTokenizer <- function(x, n) NGramTokenizer(x, Weka_control(min = n, max = n))
BigramTokenizer <- function(x) ngramTokenizer(x, 2)
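To illustrate what the tokenizer produces, here it is applied to a toy string (hypothetical input, not from the corpus):
BigramTokenizer("happy new year to all")   # returns: "happy new" "new year" "year to" "to all"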
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
a <- tdm1freq  # already a data frame; the top-10 unigrams
head(a)
## word freq
## said said 1461
## will will 1392
## one one 1310
## just just 1087
## like like 1041
## can can 1036
tdm2 <- TermDocumentMatrix(cleanSample, control = list(tokenize = BigramTokenizer))
freq <- sort(rowSums(as.matrix(rollup(tdm2, 2, FUN = sum)), na.rm = TRUE), decreasing = TRUE)
tdm2freq <- data.frame(word = names(freq), freq = freq)
tdm2freq <- tdm2freq[tdm2freq$freq > 10, ]  # keep bigrams seen more than 10 times
trigramTokenizer <- function(x) ngramTokenizer(x, 3)
tdm3 <- TermDocumentMatrix(cleanSample, control = list(tokenize = trigramTokenizer))
freq <- sort(rowSums(as.matrix(rollup(tdm3, 2, FUN = sum)), na.rm = TRUE), decreasing = TRUE)
tdm3freq <- data.frame(word = names(freq), freq = freq)
tdm3freq <- tdm3freq[tdm3freq$freq > 5, ]  # keep trigrams seen more than 5 times
ggplot(a, aes(reorder(word, -freq), freq)) + geom_bar(stat = "identity") +
  xlab("unigram") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
a <- subset(tdm2freq, freq > 50)
ggplot(a, aes(reorder(word, -freq), freq)) + geom_bar(stat = "identity") +
  xlab("bigram") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
a <- subset(tdm3freq, freq > 6)
ggplot(a, aes(reorder(word, -freq), freq)) + geom_bar(stat = "identity") +
  xlab("trigram") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
Next steps: the unigram, bigram, and trigram frequency tables built above will serve as the dictionaries for the prediction model. Given the last words typed, the model will return the final word of the most frequent matching n-gram, backing off from trigrams to bigrams to unigrams when no longer match is found.
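As an illustration of that plan, here is a minimal backoff sketch over the tables built above; the nextWord() function and the example phrase are assumptions for illustration, not the final model:
# Hypothetical sketch: predict the next word from the last two words typed,
# using the trigram table first, then backing off to the bigram table, then
# to the single most frequent unigram. The tables are already sorted by
# frequency, so the first matching row is the best candidate.
nextWord <- function(phrase) {
  words <- tail(strsplit(tolower(phrase), "\\s+")[[1]], 2)
  tri <- subset(tdm3freq, startsWith(as.character(word),
                                     paste0(paste(words, collapse = " "), " ")))
  if (nrow(tri) > 0)
    return(tail(strsplit(as.character(tri$word[1]), " ")[[1]], 1))
  bi <- subset(tdm2freq, startsWith(as.character(word), paste0(tail(words, 1), " ")))
  if (nrow(bi) > 0)
    return(tail(strsplit(as.character(bi$word[1]), " ")[[1]], 1))
  as.character(tdm1freq$word[1])
}
nextWord("happy new")   # might return "year" if "happy new year" survived the frequency cut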