The motivation for this project is to explore the SwiftKey text data and lay the groundwork for a word-prediction algorithm that will later be deployed as a Shiny app.
First, the required R packages for the text mining analysis are loaded:
knitr::opts_chunk$set(echo = TRUE)
#install.packages(c('tm', 'SnowballC', 'wordcloud', 'topicmodels', 'data.table', 'tidytext', 'RWeka'))
options(warn=-1)
library(tm)
## Loading required package: NLP
library(stats)
library(SnowballC)
library(wordcloud)
## Loading required package: RColorBrewer
library(utils)
library(data.table)
library(tidytext)
library(RWeka)
Then the working directory is set and the data is downloaded and unzipped into this directory.
#dir.create('c:/Coursera/Capstone/Assignment')
#setwd('c:/Coursera/Capstone/Assignment')
#getwd()
#download.file('https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip','c:/Coursera/Capstone/Assignment/Coursera-SwiftKey.zip')
#unzip('c:/Coursera/Capstone/Assignment/Coursera-SwiftKey.zip')
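The commented-out code above downloads and unzips the raw data. A slightly more defensive variant, sketched below under the assumption of the same local path, only downloads the archive if it is not already present:
# Hypothetical sketch: download and unzip only if the archive is missing
# (data_dir is an assumed path, adjust as needed)
data_dir <- 'c:/Coursera/Capstone/Assignment'
zip_file <- file.path(data_dir, 'Coursera-SwiftKey.zip')
if (!file.exists(zip_file)) {
  download.file('https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip', zip_file)
  unzip(zip_file, exdir = data_dir)
}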
As a first data exploration, a table showing the number of lines and characters per file is created.
setwd('c:/Coursera/Capstone/Assignment/final/en_US')
txt <- dir()              # en_US.blogs.txt, en_US.news.txt, en_US.twitter.txt
con <- file(txt[1], "r")
blogs <- readLines(con)   # read the blogs file line by line
close(con)
con <- file(txt[2], "r")
news <- readLines(con)    # read the news file line by line
close(con)
con <- file(txt[3], "r")
twitter <- readLines(con) # read the twitter file line by line
close(con)
all_data <- list(blogs, news, twitter)
#rm(blogs, news, twitter)
line_count <- sapply(all_data, length)
char_count <- sapply(all_data, function(x) sum(nchar(x)))
summ <- data.table(File_name = txt, Lines = line_count, Characters = char_count)
print(summ)
## File_name Lines Characters
## 1: en_US.blogs.txt 899288 208361438
## 2: en_US.news.txt 77259 15683765
## 3: en_US.twitter.txt 2360148 162384825
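The same list of line vectors can also be used for a rough word count per file; a minimal sketch, assuming simple whitespace tokenisation:
# Approximate word count per file (split on whitespace; a rough heuristic)
word_count <- sapply(all_data, function(x) sum(lengths(strsplit(x, "\\s+"))))
data.table(File_name = txt, Words = word_count)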
Preprocessing, i.e. applying methods for cleaning up and structuring the text for further analysis, is a core component of practical text mining. In the preprocessing the following steps are performed: conversion to lower case, removal of numbers, punctuation and English stop words, and stripping of extra whitespace.
We then build a “TermDocumentMatrix” that contains the frequency of each word per document.
Cblogs <- SimpleCorpus(VectorSource(blogs))
Cnews <- SimpleCorpus(VectorSource(news))
Ctwitter <- SimpleCorpus(VectorSource(twitter))
preprocessing <- function(x) {
  x <- tm_map(x, content_transformer(tolower))       # convert to lower case
  x <- tm_map(x, removeNumbers)                      # drop digits
  x <- tm_map(x, removePunctuation)                  # drop punctuation
  x <- tm_map(x, removeWords, stopwords("english"))  # drop English stop words ("the", "and", ...)
  x <- tm_map(x, stripWhitespace)                    # collapse extra whitespace
  return(x)
}
Cblogs <- preprocessing(Cblogs)
Cnews <- preprocessing(Cnews)
Ctwitter <- preprocessing(Ctwitter)
bDTM <- TermDocumentMatrix(Cblogs)
nDTM <- TermDocumentMatrix(Cnews)
tDTM <- TermDocumentMatrix(Ctwitter)
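As a quick sanity check that the matrices hold the expected counts, tm's findFreqTerms can list the terms whose total frequency exceeds a threshold; the 50,000 cut-off below is an arbitrary choice:
# Terms occurring at least 50,000 times in the blog corpus (threshold is arbitrary)
findFreqTerms(bDTM, lowfreq = 50000)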
After preprocessing, the frequencies of the most common words are presented.
bdtm_row_sums <- slam::row_sums(bDTM)
bdtm_row_sums <- sort(bdtm_row_sums, decreasing = TRUE)
btop_10 <- bdtm_row_sums[1:10]
btop_10
## one will just like can time get know now people
## 123697 112396 99603 97997 97835 87450 70491 59507 58801 58689
barplot(btop_10, main = "Most Frequent Words in Blogs",
        xlab = "Word", ylab = "Frequency")
ndtm_row_sums <- slam::row_sums(nDTM)
ndtm_row_sums <- sort(ndtm_row_sums, decreasing = TRUE)
ntop_10 <- ndtm_row_sums[1:10]
ntop_10
## said will one new also year two can first just
## 19166 8463 6392 5327 4515 4450 4433 4393 4148 4132
barplot(ntop_10, main = "Most Frequent Words in News",
        xlab = "Word", ylab = "Frequency")
tdtm_row_sums <- slam::row_sums(tDTM)
tdtm_row_sums <- sort(tdtm_row_sums, decreasing = TRUE)
ttop_10 <- tdtm_row_sums[1:10]
ttop_10
## just like get love good will day can dont thanks
## 149589 121282 111903 105430 99554 94260 89817 89090 88684 88597
barplot(ttop_10, main = "Most Frequent Words in Twitter",
        xlab = "Word", ylab = "Frequency")
The Shiny app will consist of a user interface that allows the user to enter a word into a single text box. This will trigger our algorithm, which then shows highly correlated words identified through associations in our text data.
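A minimal sketch of how such associations could be looked up with tm's findAssocs; the query word and correlation threshold are placeholders, and running this on the full term-document matrix may be slow:
# Words correlated with a (hypothetical) user input in the blog corpus;
# 0.2 is an arbitrary correlation threshold
findAssocs(bDTM, terms = "love", corlimit = 0.2)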