Introduction

This project predicts the next word a user will type, using text from Twitter, news, and blogs as input. When the user types something, the model should predict the next word. This part covers processing the input data, building word phrases (n-grams), and exploratory analysis.

The goals of this assignment are: 1. show that the data has been downloaded; 2. create a basic summary report; 3. show any interesting findings; 4. create a plan for the n-gram model.

Load the libraries and the data, then read the Twitter, blogs, and news files.

library(tm)
## Loading required package: NLP
library(stringi)
library(slam)
library(RWeka)
## Warning: package 'RWeka' was built under R version 3.4.1
# Download (if needed), unpack, and read the three en_US corpus files into
# character vectors (one element per line).
#
# NOTE(review): the hard-coded, machine-specific working directory is kept so
# that the relative "./final/..." paths used here and later in the report
# still resolve; consider replacing it with a project-relative path.
setwd("C:\\Users\\suman\\Desktop\\datasciencecoursera\\capstone")

# `Url` was referenced by download.file() but never defined in this script,
# so the download branch failed on a machine without the zip file. Define it
# only when it is not already set.
# NOTE(review): presumably the Coursera SwiftKey capstone dataset -- confirm.
if (!exists("Url")) {
  Url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
}
if (!file.exists("./Coursera-SwiftKey.zip")) {
  download.file(Url, destfile = "./Coursera-SwiftKey.zip", mode = "wb")
}
if (!file.exists("./final")) {
  unzip(zipfile = "./Coursera-SwiftKey.zip", exdir = ".")
}

# Read every line of a text file, skipping embedded NUL bytes.
#
# The connection is opened in binary mode ("rb"). The earlier text-mode ("r")
# read of the news file ended early: it reported only 77259 lines together
# with an "incomplete final line" warning, likely because of an embedded
# end-of-file control character on Windows -- TODO confirm on the target OS.
# on.exit() guarantees the connection is closed even if readLines() fails.
#
# path: path of the file to read.
# Returns a character vector with one element per line.
readCorpusLines <- function(path) {
  con <- file(path, "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, skipNul = TRUE)
}

lineTwitter <- readCorpusLines("./final/en_US/en_US.twitter.txt")
lineBlogs <- readCorpusLines("./final/en_US/en_US.blogs.txt")
lineNews <- readCorpusLines("./final/en_US/en_US.news.txt")

Summary of the files: size, line count, and word count.

# Summarise each source file: size on disk, number of lines, number of words.
corpusPaths <- c(
  "./final/en_US/en_US.twitter.txt",
  "./final/en_US/en_US.blogs.txt",
  "./final/en_US/en_US.news.txt"
)
fileSummary <- data.frame(
  fileName = c("Twitter", "Blogs", "News"),
  # Size in bytes. file.info() yields NA when a path cannot be resolved from
  # the current working directory (the earlier rendered output showed NA for
  # all three files) -- make sure this runs from the project folder.
  fileSize = file.info(corpusPaths)$size,
  lineCount = c(length(lineTwitter), length(lineBlogs), length(lineNews)),
  # The first element of stri_stats_general() is the number of lines, which
  # is why wordCount used to equal lineCount. Count actual words instead.
  wordCount = c(
    sum(stri_count_words(lineTwitter)),
    sum(stri_count_words(lineBlogs)),
    sum(stri_count_words(lineNews))
  )
)
fileSummary
## (printed table omitted: the earlier rendering showed NA file sizes and
## word counts identical to the line counts; re-knit to refresh the output)

Create a sample of 5000 lines from each source and build the term-document matrix.

# Draw a random sample of lines from each source and combine the samples into
# a single corpus for cleaning and n-gram counting.

# Fix the RNG seed so the sample (and every table and plot derived from it)
# is the same on each run of the report.
set.seed(1234)
sampleSize <- 5000

# Sample up to `n` lines without replacement.
# Capping at length(lines) avoids the "cannot take a sample larger than the
# population" error for short inputs, and sampling the vector directly avoids
# the 1:length(x) pitfall on empty input.
#
# lines: character vector to sample from.
# n: maximum number of lines to draw.
# Returns a character vector of min(n, length(lines)) sampled lines.
sampleLines <- function(lines, n) {
  sample(lines, min(n, length(lines)))
}

sampleTwitter <- sampleLines(lineTwitter, sampleSize)
sampleBlogs <- sampleLines(lineBlogs, sampleSize)
sampleNews <- sampleLines(lineNews, sampleSize)
testsample <- c(sampleTwitter, sampleBlogs, sampleNews)

cleanSample <- VCorpus(VectorSource(testsample))

# The per-source samples are no longer needed once the corpus is built.
rm(sampleTwitter, sampleBlogs, sampleNews)

# Clean the sampled corpus before building term-document matrices.
#
# Order matters:
#   1. lower-case;
#   2. strip URLs, hashtags and @handles while their marker characters
#      ("http", "#", "@") are still present;
#   3. remove stop words while apostrophes are still present, so that
#      contractions such as "i'm" match the stop-word list (previously
#      punctuation was stripped first and "im" ended up among the most
#      frequent terms in the rendered output);
#   4. strip punctuation and digits;
#   5. collapse the whitespace left behind.

# Regex helpers; each takes and returns a character vector.
urlremove <- function(x) gsub("http[^[:space:]]*", "", x)
specialcharremove <- function(x) gsub("[[:punct:]]+", "", x)
hashtagremove <- function(x) gsub("#\\S+", "", x)
twitterremove <- function(x) gsub("@\\S+", "", x)
# Not applied in this block: all three words are already in
# stopwords("english"). Kept defined in case other code uses it.
mystopword <- c("a", "an", "the")

cleanSample <- tm_map(cleanSample, content_transformer(tolower))

# hashtagremove/twitterremove were defined but never applied before, and
# could not have matched once punctuation had been removed.
cleanSample <- tm_map(cleanSample, content_transformer(urlremove))
cleanSample <- tm_map(cleanSample, content_transformer(hashtagremove))
cleanSample <- tm_map(cleanSample, content_transformer(twitterremove))

cleanSample <- tm_map(cleanSample, removeWords, stopwords("english"))
cleanSample <- tm_map(cleanSample, removeWords, c("twitter"))

# removePunctuation already strips every [[:punct:]] character, so the extra
# specialcharremove pass that used to follow it is not needed.
cleanSample <- tm_map(cleanSample, removePunctuation)
cleanSample <- tm_map(cleanSample, removeNumbers)
cleanSample <- tm_map(cleanSample, stripWhitespace)
# Unigram term-document matrix: keep terms of 1 to 10 characters that occur
# in at least 500 documents of the sample.
tdm1 <- TermDocumentMatrix(
  cleanSample,
  control = list(
    wordLengths = c(1, 10),
    bounds = list(global = c(500, Inf))
  )
)

# Terms whose total count across the sample is at least 50.
findFreqTerms(tdm1, lowfreq = 50)
##  [1] "also"   "can"    "day"    "first"  "get"    "good"   "im"    
##  [8] "just"   "know"   "last"   "like"   "new"    "now"    "one"   
## [15] "people" "said"   "time"   "us"     "will"

# Total count per term over all documents, most frequent first.
freq <- sort(row_sums(tdm1, na.rm = TRUE), decreasing = TRUE)

# Frequency table restricted to the first ten rows.
tdm1freq <- data.frame(word = names(freq), freq = freq)[1:10, ]
# Tokenizer that splits `x` into n-grams whose minimum and maximum size are
# both `n`.
ngramTokenizer <- function(x, n) {
  sizeControl <- Weka_control(min = n, max = n)
  NGramTokenizer(x, sizeControl)
}

# Tokenizer that splits `x` into two-word n-grams (bigrams); used as the
# `tokenize` control when building the bigram term-document matrix.
BigramTokenizer <- function(x) {
  bigramControl <- Weka_control(min = 2, max = 2)
  NGramTokenizer(x, bigramControl)
}
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
# `a` holds the data frame read by the first bar chart: the top unigram rows.
a <- as.data.frame(tdm1freq)
head(a)
##      word freq
## said said 1461
## will will 1392
## one   one 1310
## just just 1087
## like like 1041
## can   can 1036

# Bigram counts: tokenize the cleaned corpus into two-word phrases, total each
# phrase over all documents, and keep phrases seen more than 10 times.
tdm2 <- TermDocumentMatrix(
  cleanSample,
  control = list(tokenize = BigramTokenizer)
)
freq <- sort(row_sums(tdm2, na.rm = TRUE), decreasing = TRUE)
tdm2freq <- data.frame(word = names(freq), freq = freq)
tdm2freq <- tdm2freq[tdm2freq$freq > 10, ]

# Trigram counts: same procedure with three-word phrases, keeping phrases
# seen more than 5 times.
trigramTokenizer <- function(x) {
  NGramTokenizer(x, Weka_control(min = 3, max = 3))
}
tdm3 <- TermDocumentMatrix(
  cleanSample,
  control = list(tokenize = trigramTokenizer)
)
freq <- sort(row_sums(tdm3, na.rm = TRUE), decreasing = TRUE)
tdm3freq <- data.frame(word = names(freq), freq = freq)
tdm3freq <- tdm3freq[tdm3freq$freq > 5, ]

Plot the frequency of the most common phrases (1-, 2-, and 3-word n-grams).

# Bar chart of the unigram frequency table currently stored in `a`.
ggplot(data = a, mapping = aes(x = word, y = freq)) +
  geom_bar(stat = "identity")

# Bar chart of bigrams that occur more than 50 times.
a <- as.data.frame(subset(tdm2freq, subset = freq > 50))
ggplot(data = a, mapping = aes(x = word, y = freq)) +
  geom_bar(stat = "identity")

# Bar chart of trigrams that occur more than 6 times; the x-axis labels are
# rotated 90 degrees.
a <- as.data.frame(subset(tdm3freq, subset = freq > 6))
ggplot(data = a, mapping = aes(x = word, y = freq)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Next, these n-gram dictionaries will be used to predict the next word.