The key partners for this project are SwiftKey and Coursera; the project explores the Natural Language Processing facet of Data Science.
library(dplyr)        # data manipulation
library(doParallel)   # parallel processing backend
library(stringi)      # fast string statistics
library(tm)           # text mining / corpus handling
library(SnowballC)    # stemming backend used by tm::stemDocument
library(slam)         # sparse matrix utilities (col_sums)
library(ggplot2)      # plotting
library(wordcloud)    # word cloud plots
conBlogs <- file("en_US.blogs.txt", open = "rb")
blogs <- readLines(conBlogs, encoding = "UTF-8", skipNul = TRUE)
close(conBlogs)
conNews <- file("en_US.news.txt", open = "rb")
news <- readLines(conNews, encoding = "UTF-8", skipNul = TRUE)
close(conNews)
conTwitter <- file("en_US.twitter.txt", open = "rb")
twitter <- readLines(conTwitter, encoding = "UTF-8", skipNul = TRUE)
close(conTwitter)
Basic statistics of the three data files are computed, including line, character, and word counts, together with Words Per Line (WPL) summaries; histograms of WPL are also plotted for each file.
rawP <- lapply(list(blogs, news, twitter), stri_count_words)
stat <- data.frame(
    File = c("blogs", "news", "twitter"),
    t(rbind(sapply(list(blogs, news, twitter), stri_stats_general),
            TotalWords = sapply(list(blogs, news, twitter), stri_stats_latex)[4, ])),
    WPL = rbind(summary(rawP[[1]]), summary(rawP[[2]]), summary(rawP[[3]])))
print(stat)
## File Lines LinesNEmpty Chars CharsNWhite TotalWords WPL.Min.
## 1 blogs 899288 899288 206824382 170389539 37570839 0
## 2 news 1010242 1010242 203223154 169860866 34494539 1
## 3 twitter 2360148 2360148 162096241 134082806 30451170 1
## WPL.1st.Qu. WPL.Median WPL.Mean WPL.3rd.Qu. WPL.Max.
## 1 9 28 41.75107 60 6726
## 2 19 32 34.40997 46 1796
## 3 7 12 12.75065 18 47
# Plot histogram for each data type
qplot(rawP[[1]], geom = "histogram", main = "US Blogs",
      xlab = "Word Count", ylab = "Frequency", binwidth = 10)
qplot(rawP[[2]], geom = "histogram", main = "US News",
      xlab = "Word Count", ylab = "Frequency", binwidth = 10)
qplot(rawP[[3]], geom = "histogram", main = "US Twitter",
      xlab = "Word Count", ylab = "Frequency", binwidth = 1)
rm(rawP);rm(stat)
Sample 20,000 lines from each data type, clean the samples, and then perform exploratory analysis on them.
sizeS <- 20000  # sample size per data type
set.seed(2809)
# Raw data and sample vectors
data <- list(blogs, news, twitter)
sampleVec <- list()
# Iterate over each raw data set
for (i in 1:length(data)) {
    # Create sample dataset
    Ffil <- sample(1:length(data[[i]]), sizeS, replace = FALSE)
    sampleVec[[i]] <- data[[i]][Ffil]
    # Remove unconventional/non-ASCII characters (iconv is vectorised over the sample)
    sampleVec[[i]] <- iconv(sampleVec[[i]], "latin1", "ASCII", sub = "")
}
rm(blogs)
rm(news)
rm(twitter)
# Corpus and document term matrix vectors
corpus <- list()
matrixD <- list()
# Iterate over each sample data set
for (i in 1:length(sampleVec)) {
    # Create corpus dataset
    corpus[[i]] <- Corpus(VectorSource(sampleVec[[i]]))
    # Cleaning/stemming the data
    corpus[[i]] <- tm_map(corpus[[i]], content_transformer(tolower))
    corpus[[i]] <- tm_map(corpus[[i]], removeNumbers)
    corpus[[i]] <- tm_map(corpus[[i]], removeWords, stopwords("english"))
    corpus[[i]] <- tm_map(corpus[[i]], removePunctuation)
    corpus[[i]] <- tm_map(corpus[[i]], stemDocument)
    corpus[[i]] <- tm_map(corpus[[i]], stripWhitespace)
    # Build the document term matrix for the corpus
    matrixD[[i]] <- DocumentTermMatrix(corpus[[i]],
                                       control = list(wordLengths = c(0, Inf)))
}
rm(data)
rm(sampleVec)
Word frequencies are illustrated via a word cloud plotted for each data type.
set.seed(3340)
par(mfrow = c(1, 3)) # Plot Panel
h = c("Word Cloud - Blogs", "Word Cloud - News", "Word Cloud - Twitter")
# Iterate each corpus/DTM and plot word cloud for each
for (i in 1:length(corpus)) {
wordcloud(words = colnames(matrixD[[i]]), freq = col_sums(matrixD[[i]]),
scale = c(3, 1), max.words = 100, random.order = FALSE, rot.per = 0.35,
use.r.layout = FALSE, colors = brewer.pal(8, "Dark2"))
title(h[i])
}
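As a complement to the word clouds, the most frequent terms in each sample can also be listed directly from the document term matrices. The short sketch below reuses the col_sums() totals already computed for the clouds; the top-10 cutoff is an arbitrary choice.
# List the ten most frequent terms in each document term matrix
for (i in 1:length(matrixD)) {
    freq <- sort(col_sums(matrixD[[i]]), decreasing = TRUE)
    print(head(freq, 10))
}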
The next steps are to use N-grams to generate tokens, summarize the frequency of those tokens, build a predictive model, and develop a data product (i.e. a Shiny app).
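As a rough sketch of the N-gram tokenization step, the bigram example below assumes the RWeka package (not loaded above, and requiring Java) is available; the names BigramTokenizer and bigramDTM are illustrative only.
library(RWeka)  # assumed to be installed; provides NGramTokenizer
# Tokenizer that emits pairs of adjacent words (bigrams)
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
# Bigram document term matrix for the blogs corpus
bigramDTM <- DocumentTermMatrix(corpus[[1]], control = list(tokenize = BigramTokenizer))
# Ten most frequent bigrams
head(sort(col_sums(bigramDTM), decreasing = TRUE), 10)
The same pattern extends to trigrams by setting min and max to 3, and the resulting frequency tables would feed the predictive model.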