This is week 2 of the project, in which I demonstrate exploratory data analysis of text files using text mining. Several questions may be answered over the course of the project; in week 2 I focus only on the first one, which concerns word frequencies.
We start by loading the required packages and the data.
library(tidyverse)
library(readr)
library(tidyr)
library(dplyr)
library(skimr)
library(tm)
library(wordnet)
library(SnowballC)
library(wordcloud)
library(RColorBrewer)
# Read each file line by line
blogs <- readLines("./Coursera-SwiftKey/final/en_US/en_US.blogs.txt")
news <- readLines("./Coursera-SwiftKey/final/en_US/en_US.news.txt")
twitter <- readLines("./Coursera-SwiftKey/final/en_US/en_US.twitter.txt")
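One caveat, offered as a hedged aside rather than as part of the original run: en_US.news.txt is commonly reported to contain an embedded control character that can stop readLines() early when the file is read in text mode, which may explain why its line count below is much smaller than the other two files. If the complete file is needed, one workaround is to read it through a binary-mode connection and skip embedded nuls:
# Optional robustness check (not used for the counts reported below):
# read the news file through a binary-mode connection
con <- file("./Coursera-SwiftKey/final/en_US/en_US.news.txt", "rb")
news_full <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)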
Before working with the data, we gather some basic information about it. We begin by calculating the size of each of the three files in megabytes.
blogs_size <- file.info("./Coursera-SwiftKey/final/en_US/en_US.blogs.txt")
kb <- blogs_size$size/1024
mb <- kb/1024
mb
## [1] 200.4242
news_size <- file.info("./Coursera-SwiftKey/final/en_US/en_US.news.txt")
kb <- news_size$size/1024
mb <- kb/1024
mb
## [1] 196.2775
twitter_size <- file.info("./Coursera-SwiftKey/final/en_US/en_US.twitter.txt")
kb <- twitter_size$size/1024
mb <- kb/1024
mb
## [1] 159.3641
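The same conversion can be wrapped in a small helper to avoid repeating it for each file; size_mb() below is a hypothetical convenience function, not part of the original script (file.size() is base R):
# Hypothetical helper: file size in megabytes
size_mb <- function(path) file.size(path) / 1024^2
sapply(c(blogs = "./Coursera-SwiftKey/final/en_US/en_US.blogs.txt",
         news = "./Coursera-SwiftKey/final/en_US/en_US.news.txt",
         twitter = "./Coursera-SwiftKey/final/en_US/en_US.twitter.txt"),
       size_mb)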
Then we calculate the number of lines per file.
length(blogs)
## [1] 899288
length(news)
## [1] 77259
length(twitter)
## [1] 2360148
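For an overview, the sizes and line counts above can be collected into a single summary table; this is only a reorganisation of the numbers already computed:
# Combine file sizes (MB) and line counts into one data frame
data.frame(
  file = c("blogs", "news", "twitter"),
  size_mb = c(blogs_size$size, news_size$size, twitter_size$size) / 1024^2,
  lines = c(length(blogs), length(news), length(twitter))
)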
Because of the large size of the data, we sample it before any further processing, drawing 0.5% of the lines from each file.
blogs_sample <- sample(blogs, length(blogs) * 0.005)
news_sample <- sample(news, length(news) * 0.005)
twitter_sample <- sample(twitter, length(twitter) * 0.005)
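Note that sample() is random, so the exact frequencies reported below will vary between runs. Setting a seed immediately before the three sample() calls above would make the results reproducible; the value shown is arbitrary and was not part of the original run:
set.seed(1234) # arbitrary seed, for reproducibility only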
# Creating my own stopwords
mystopwords <- c("and", "for", "in", "is", "it", "not", "the", "to")
# Preprocessing blogs data
blogs_corpus <- VCorpus(VectorSource(blogs_sample))
blogs_corpus <- tm_map(blogs_corpus, content_transformer(tolower)) # Convert the text to lower case
blogs_corpus <- tm_map(blogs_corpus, removeNumbers) # Remove numbers
blogs_corpus <- tm_map(blogs_corpus, removeWords, stopwords("en")) # Remove stopwords
blogs_corpus <- tm_map(blogs_corpus, removeWords, mystopwords) # Remove my own stopwords
blogs_corpus <- tm_map(blogs_corpus, removePunctuation) # Remove punctuation
blogs_corpus <- tm_map(blogs_corpus, stripWhitespace) # Eliminate extra whitespace
# Preprocessing news data
news_corpus <- VCorpus(VectorSource(news_sample))
news_corpus <- tm_map(news_corpus, content_transformer(tolower)) # Convert the text to lower case
news_corpus <- tm_map(news_corpus, removeNumbers) # Remove numbers
news_corpus <- tm_map(news_corpus, removeWords, stopwords("en")) # Remove stopwords
news_corpus <- tm_map(news_corpus, removeWords, mystopwords) # Remove my own stopwords
news_corpus <- tm_map(news_corpus, removePunctuation) # Remove punctuation
news_corpus <- tm_map(news_corpus, stripWhitespace) # Eliminate extra whitespace
# Preprocessing twitter data
twitter_corpus <- VCorpus(VectorSource(twitter_sample))
twitter_corpus <- tm_map(twitter_corpus, content_transformer(tolower)) # Convert the text to lower case
twitter_corpus <- tm_map(twitter_corpus, removeNumbers) # Remove numbers
twitter_corpus <- tm_map(twitter_corpus, removeWords, stopwords("en")) # Remove stopwords
twitter_corpus <- tm_map(twitter_corpus, removeWords, mystopwords) # Remove my own stopwords
twitter_corpus <- tm_map(twitter_corpus, removePunctuation) # Remove punctuation
twitter_corpus <- tm_map(twitter_corpus, stripWhitespace) # Eliminate extra whitespace
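Since the same six cleaning steps are applied to all three samples, they could equally be wrapped in one helper. The clean_corpus() function below is a hypothetical refactoring (the name is mine, not from the original script) that produces the same corpora as the three blocks above:
# Hypothetical refactoring of the cleaning pipeline used above
clean_corpus <- function(text) {
  corpus <- VCorpus(VectorSource(text))
  corpus <- tm_map(corpus, content_transformer(tolower))   # lower case
  corpus <- tm_map(corpus, removeNumbers)                  # drop numbers
  corpus <- tm_map(corpus, removeWords, stopwords("en"))   # English stopwords
  corpus <- tm_map(corpus, removeWords, mystopwords)       # custom stopwords
  corpus <- tm_map(corpus, removePunctuation)              # drop punctuation
  tm_map(corpus, stripWhitespace)                          # squeeze whitespace
}
# Equivalent to the three blocks above:
# blogs_corpus   <- clean_corpus(blogs_sample)
# news_corpus    <- clean_corpus(news_sample)
# twitter_corpus <- clean_corpus(twitter_sample)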
After preprocessing, we can address the first question. Word frequencies are calculated from a term-document matrix and visualised with bar plots and word clouds.
blogs_dtm <- TermDocumentMatrix(blogs_corpus)
blogs_m <- as.matrix(blogs_dtm)
blogs_v <- sort(rowSums(blogs_m),decreasing=TRUE)
blogs_d <- data.frame(word = names(blogs_v),freq=blogs_v)
head(blogs_d, 10)
## word freq
## one one 649
## will will 579
## just just 513
## like like 501
## can can 460
## time time 448
## get get 368
## people people 324
## new new 301
## know know 297
news_dtm <- TermDocumentMatrix(news_corpus)
news_m <- as.matrix(news_dtm)
news_v <- sort(rowSums(news_m),decreasing=TRUE)
news_d <- data.frame(word = names(news_v),freq=news_v)
head(news_d, 10)
## word freq
## said said 94
## one one 55
## will will 43
## can can 32
## new new 29
## also also 27
## two two 24
## says says 23
## time time 23
## get get 21
twitter_dtm <- TermDocumentMatrix(twitter_corpus)
twitter_m <- as.matrix(twitter_dtm)
twitter_v <- sort(rowSums(twitter_m),decreasing=TRUE)
twitter_d <- data.frame(word = names(twitter_v),freq=twitter_v)
head(twitter_d, 10)
## word freq
## just just 750
## like like 629
## get get 551
## love love 539
## good good 520
## day day 481
## will will 478
## thanks thanks 457
## can can 445
## one one 438
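The frequency tables are also built the same way for each corpus, so this step could likewise be factored into a small helper; term_freq() is again a hypothetical name introduced here only for illustration:
# Hypothetical helper: term frequencies of a cleaned corpus, sorted descending
term_freq <- function(corpus) {
  tdm <- TermDocumentMatrix(corpus)
  freqs <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  data.frame(word = names(freqs), freq = freqs)
}
# head(term_freq(blogs_corpus), 10) reproduces the blogs table above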
par(mfrow=c(1,3))
barplot(blogs_d[1:10,]$freq, las = 2, names.arg = blogs_d[1:10,]$word,
col ="lightblue", main ="Most frequent words in blogs",
ylab = "Word frequencies")
barplot(news_d[1:10,]$freq, las = 2, names.arg = news_d[1:10,]$word,
col ="lightblue", main ="Most frequent words in the news",
ylab = "Word frequencies")
barplot(twitter_d[1:10,]$freq, las = 2, names.arg = twitter_d[1:10,]$word,
col ="lightblue", main ="Most frequent words in twitter",
ylab = "Word frequencies")
par(mfrow=c(1,3))
wordcloud(words = blogs_d$word, freq = blogs_d$freq, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))
wordcloud(words = news_d$word, freq = news_d$freq, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))
wordcloud(words = twitter_d$word, freq = twitter_d$freq, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))
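Since the tidyverse is already loaded, the same top-10 comparison could optionally be drawn with ggplot2 instead of base barplot(); the sketch below is an alternative presentation of the frequency tables above, not part of the original analysis:
# Optional ggplot2 version of the top-10 bar plots
top10 <- bind_rows(
  blogs_d[1:10, ] %>% mutate(source = "blogs"),
  news_d[1:10, ] %>% mutate(source = "news"),
  twitter_d[1:10, ] %>% mutate(source = "twitter")
)
ggplot(top10, aes(x = reorder(word, -freq), y = freq)) +
  geom_col(fill = "lightblue") +
  facet_wrap(~ source, scales = "free") +
  labs(x = NULL, y = "Word frequencies") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))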