This is the week 2 milestone report for the Coursera capstone project on analysing the data sets provided by SwiftKey and using them to build models that predict the next word(s) once a user has entered the first few words.
Download the data set from the link provided in the course. The zipped file contains blog, news, and Twitter texts in four languages (English, German, Finnish, and Russian). I have used only the English files for the analysis and prediction.
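For completeness, below is a minimal download-and-unzip sketch for a first run. The URL shown is the standard Coursera SwiftKey dataset location and is an assumption on my part, since the original link is not reproduced here.
#Download and unzip the SwiftKey data set (run once)
#URL is assumed to be the standard Coursera capstone dataset location
zipUrl  <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zipFile <- "Coursera-SwiftKey.zip"
if (!file.exists(zipFile)) {
  download.file(zipUrl, destfile = zipFile, mode = "wb")
}
if (!dir.exists("final")) {
  unzip(zipFile)  # extracts a final/ folder with en_US, de_DE, fi_FI and ru_RU subfolders
}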
The sampling needs to be done only the first time; the samples are stored as text files that can be reloaded for further analysis.
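As a small run-once guard (my own addition, not part of the original workflow), the sampling chunk below could be skipped when the sample files already exist on disk:
#Skip the sampling code below if the 10% sample files already exist
if (all(file.exists(c("s_b.txt", "s_n.txt", "s_t.txt")))) {
  message("Sample files already present; skip the sampling step.")
}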
#Run below code only for the first time
#delete all environment variables
rm(list = ls())
#load the required libraries
library(stringi)      # stri_stats_latex() for word counts
library(LaF)          # determine_nlines(), sample_lines()
library(pryr)         # mem_used()
library(quanteda)     # corpus(), tokens(), tokens_ngrams(), dfm()
library(ggplot2)      # frequency plots
library(wordcloud)    # word clouds
library(RColorBrewer) # brewer.pal() colour palettes
setwd("~/R/Coursera Word Prediction/final/en_US")
# blogs
blogsFileName <- "en_US.blogs.txt"
con <- file(blogsFileName, open = "r")
blogs <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)
# news
setwd("~/R/Coursera Word Prediction/final/en_US")
newsFileName <- "en_US.news.txt"
con <- file(newsFileName, open = "r")
news <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)
# twitter
twitterFileName <- "en_US.twitter.txt"
con <- file(twitterFileName, open = "r")
twitter <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)
rm(con)
# num lines per file
numLines <- sapply(list(blogs, news, twitter), length)
# num characters per file
numChars <- sapply(list(nchar(blogs), nchar(news), nchar(twitter)), sum)
# num words per file
numWords <- sapply(list(blogs, news, twitter), stri_stats_latex)[4,]
Basic_summ <- data.frame(
File = c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt"),
Lines = numLines,
Characters = numChars,
Words = numWords)
#Sampling for Analysis
#Take 10% sample from twitter and write to a txt file
con <- file("en_US.twitter.txt", "r")
t_l <- determine_nlines("en_US.twitter.txt")
#t_l #2360148
t_sam <- sample_lines("en_US.twitter.txt", t_l*0.1, nlines = t_l)
#length(t_sam) #236014
#class(t_sam)
fileConn<-file("s_t.txt")
writeLines(t_sam, fileConn)
close(fileConn)
close(con)
#Take 10% sample from blogs and write to a txt file
setwd("~/R/Coursera Word Prediction/final/en_US")
con <- file("en_US.blogs.txt", "r")
t_l <- determine_nlines("en_US.blogs.txt")
#t_l #899288
t_sam <- sample_lines("en_US.blogs.txt", t_l*0.1, nlines = t_l)
fileConn<-file("s_b.txt")
writeLines(t_sam, fileConn)
close(fileConn)
close(con)
#Take 10% sample from news and write to a txt file
setwd("~/R/Coursera Word Prediction/final/en_US")
con <- file("en_US.news.txt", "r")
t_l <- determine_nlines("en_US.news.txt")
#t_l #1010242
t_sam <- sample_lines("en_US.news.txt", t_l*0.1, nlines = t_l)
fileConn<-file("s_n.txt")
writeLines(t_sam, fileConn)
close(fileConn)
close(con)
mem_used()#851 MB
## 851 MB
#Load list of profanity words
setwd("~/R/Coursera Word Prediction/final/en_US")
conprofane <- file("./bad-words.txt", "r")
profanity_vector <- readLines(conprofane, skipNul = TRUE)
#length(profanity_vector) #1384
close(conprofane)
#Remove blank entries and keep only unique values
profanity_vector <- unique(profanity_vector[profanity_vector != ""])
#head(profanity_vector)
#length(profanity_vector)
rm(conprofane)
#delete the variables not needed
rm(blogs)
rm(blogsFileName)
rm(news)
rm(newsFileName)
rm(twitter)
rm(twitterFileName)
rm(numChars)
rm(numLines)
rm(numWords)
rm(con)
rm(fileConn)
rm(t_l)
rm(t_sam)
The basic summary of the three files is given below. It indicates the vastness of the data.
head(Basic_summ)
## File Lines Characters Words
## 1 en_US.blogs.txt 899288 206824505 37570839
## 2 en_US.news.txt 77259 15639408 2651432
## 3 en_US.twitter.txt 2360148 162096241 30451170
Let us read back the sample text files, perform cleansing (removal of special characters, stop words, and profane words), and form n-grams of orders 1 to 4 for further exploration.
#rm(list=ls())
#Read the sample files into a corpus
setwd("~/R/Coursera Word Prediction/final/en_US")
twit <-readLines("s_t.txt", skipNul = TRUE,encoding="latin1")
blogs <- readLines("s_b.txt", skipNul = TRUE,encoding="latin1")
news <- readLines("s_n.txt", skipNul = TRUE,encoding="latin1")
#Combine the three samples into one corpus
#my_corp <- corpus(c(twit,blogs,news))
my_corp <- corpus(c(news,blogs,twit))
rm("twit")
rm(blogs)
rm(news)
#mem_used()#317MB
#View(my_corp[1:1])
#Each document contains many sentences; reshape the corpus into sentences
my_corp <- corpus_reshape(my_corp,to="sentences")
#my_corp[1:2]
#rm(m_tokens)
#Note: tokens() was crashing RStudio when memory ran low, hence the memory checks below
mem_used()
## 411 MB
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4174006 223.0 8354811 446.2 8354811 446.2
## Vcells 22106284 168.7 76585769 584.4 95732210 730.4
m_tokens <- tokens( my_corp,remove_punct=TRUE,
remove_symbols=TRUE,
remove_numbers = TRUE,
remove_url = TRUE,
remove_separators = TRUE,
split_hyphens = TRUE,
include_docvars = TRUE,
# remove_twitter=TRUE,
padding = FALSE,
verbose = quanteda_options("verbose"))
mem_used()
## 534 MB
#m_tokens[1:4]
#class(m_tokens)
#remove stop words
m_tokens <- tokens_select(m_tokens, pattern = stopwords("en"), selection = "remove")
#remove profanity words from our list
m_tokens <- tokens_remove(m_tokens,pattern=profanity_vector)
mem_used()#512 MB
## 512 MB
format(object.size(m_tokens),units="Mb")
## [1] "232.4 Mb"
format(object.size(my_corp),units="Mb")
## [1] "253.9 Mb"
#remove one character strings
#m_tokens <- tokens_select(m_tokens, min_nchar=2, selection = "remove")
#summary(m_tokens[1:10])
#remove non breaking space https://github.com/quanteda/quanteda/issues/796
m_tokens <- tokens_remove(m_tokens, "\\p{Z}", valuetype = "regex")
library(textclean)
format(object.size(m_tokens),units="Mb")
## [1] "232.4 Mb"
#remove tokens that contain digits or hyphens, punctuation, or are only 1-2 characters long
m_tokens <- tokens_select(
  m_tokens,
  c("[\\d-]", "[[:punct:]]", "^.{1,2}$"), # digits/hyphens, punctuation marks, 1-2 character tokens
  selection = "remove",
  valuetype = "regex",
  verbose = TRUE
)
## removed 48,523 features
m_tokens[1:20]
## Tokens consisting of 20 documents.
## text1.1 :
## [1] "Brett" "Favre" "still" "able" "throw" "proper"
## [7] "pass" "practice" "Vikings" "still" "given" "hope"
## [ ... and 8 more ]
##
## text1.2 :
## [1] "Favre" "able" "minimal" "work" "practice" "Friday"
## [7] "first" "time" "week" "making" "soft" "tosses"
## [ ... and 6 more ]
##
## text1.3 :
## [1] "listed" "questionable" "injury" "report" "game"
## [6] "Giants" "game" "time" "decision" "whether"
## [11] "make" "straight"
## [ ... and 3 more ]
##
## text2.1 :
## [1] "INDIANAPOLIS" "Danny" "Granger" "scored" "points"
## [6] "help" "Indiana" "Pacers" "defeat" "Orlando"
## [11] "Magic" "Tuesday"
## [ ... and 8 more ]
##
## text3.1 :
## [1] "two" "groups" "numbering" "people" "combined" "began"
## [7] "interact" "point" "thatâ" "trouble" "started" "Conroy"
## [ ... and 1 more ]
##
## text4.1 :
## [1] "George" "Spafford" "principal" "consultant" "Pepperweed"
##
## [ reached max_ndoc ... 14 more documents ]
#remove some specific unwanted word patterns
m_tokens<- tokens_select(
m_tokens,
pattern = c("blah"),
selection = "remove",
valuetype = "regex",
verbose = TRUE
)
## removed 15 features
#N GRAMS
#generate n-grams of orders 1 to 4 from the cleaned tokens
t_1g <- tokens_ngrams(m_tokens, n = 1)
format(object.size(t_1g),units="Mb")
## [1] "227.6 Mb"
t_2g <- tokens_ngrams(m_tokens, n = 2)
format(object.size(t_2g),units="Mb")
## [1] "425.2 Mb"
t_3g <- tokens_ngrams(m_tokens, n = 3)
format(object.size(t_3g),units="Mb")
## [1] "505 Mb"
t_4g <- tokens_ngrams(m_tokens, n = 4)
format(object.size(t_4g),units="Mb")
## [1] "475.7 Mb"
rm(my_corp)
#head(t_3g)
#head(t_4g)
#Milestone report: analysis of the n-grams by plotting
#Convert all n-grams to data.tables to save memory and improve speed
library(data.table)
# convert each n-gram tokens object to a dfm, then to a data.table of feature frequencies
x_1g_dfm <- dfm(t_1g)
dt_1g <- data.frame(Content = featnames(x_1g_dfm), Frequency = colSums(x_1g_dfm),
row.names = NULL, stringsAsFactors = FALSE)
setDT(dt_1g)
dt_1g <- dt_1g[order( -Frequency)]
rm(x_1g_dfm)
format(object.size(dt_1g),units="Mb")
## [1] "11.5 Mb"
x_2g_dfm <- dfm(t_2g)
dt_2g <- data.frame(Content = featnames(x_2g_dfm), Frequency = colSums(x_2g_dfm),
row.names = NULL, stringsAsFactors = FALSE)
setDT(dt_2g)
rm(x_2g_dfm)
format(object.size(dt_2g),units="Mb")
## [1] "226.3 Mb"
#sort by Freq in Desc
dt_2g <- dt_2g[order( -Frequency)]
#dt_2g[1:10,]
x_3g_dfm <- dfm(t_3g)
dt_3g <- data.frame(Content = featnames(x_3g_dfm), Frequency = colSums(x_3g_dfm),
row.names = NULL, stringsAsFactors = FALSE)
setDT(dt_3g)
rm(x_3g_dfm)
dt_3g <- dt_3g[order( -Frequency)]
format(object.size(dt_3g),units="Mb")
## [1] "323.6 Mb"
#4 grams
x_4g_dfm <- dfm(t_4g)
dt_4g <- data.frame(Content = featnames(x_4g_dfm), Frequency = colSums(x_4g_dfm),
row.names = NULL, stringsAsFactors = FALSE)
setDT(dt_4g)
rm(x_4g_dfm)
dt_4g <- dt_4g[order( -Frequency)]
format(object.size(dt_4g),units="Mb")
## [1] "294.9 Mb"
#number of unique n-grams of each order
dt_1g[, .N]
## [1] 158555
dt_2g[, .N]
## [1] 2823752
dt_3g[, .N]
## [1] 3608018
dt_4g[, .N]
## [1] 3130270
#remove unwanted variables
rm(t_1g)
rm(t_2g)
rm(t_3g)
rm(t_4g)
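Since step 1 of the plan at the end of this report is to store these n-gram tables rather than rebuild them on every run, a minimal saveRDS sketch follows; the .rds file names are my own choice, not part of the code above.
#Persist the n-gram frequency tables for later use by the prediction model
saveRDS(dt_1g, "dt_1g.rds")
saveRDS(dt_2g, "dt_2g.rds")
saveRDS(dt_3g, "dt_3g.rds")
saveRDS(dt_4g, "dt_4g.rds")
#later: dt_1g <- readRDS("dt_1g.rds"), and so on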
Let us plot the most frequent terms for each n-gram order (1-gram to 4-gram).
#Plot of the top terms in each n-gram order, sorted by Frequency
plot_1g<- dt_1g[1:20,]
plot_1g[] <- lapply(plot_1g, gsub, pattern='_', replacement=' ')
plot_1g$Frequency <- as.numeric(plot_1g$Frequency)
ggplot(data=plot_1g, aes(x=reorder(Content, -Frequency), y=Frequency)) + geom_bar(stat="identity",fill="red") +
theme(axis.text.x = element_text(angle = 40, hjust = 1))+geom_text(aes(label=Frequency), position=position_dodge(width=0.9), vjust=-0.25)
plot_2g<- dt_2g[1:10,]
plot_2g[] <- lapply(plot_2g, gsub, pattern='_', replacement=' ')
plot_2g$Frequency <- as.numeric(plot_2g$Frequency)
ggplot(data=plot_2g, aes(x=reorder(Content, -Frequency), y=Frequency)) + geom_bar(stat="identity",fill="yellow") +
theme(axis.text.x = element_text(angle = 40, hjust = 1))+geom_text(aes(label=Frequency), position=position_dodge(width=0.9), vjust=-0.25)
plot_3g<- dt_3g[1:20,]
plot_3g[] <- lapply(plot_3g, gsub, pattern='_', replacement=' ')
plot_3g$Frequency <- as.numeric(plot_3g$Frequency)
ggplot(data=plot_3g, aes(x=reorder(Content, -Frequency), y=Frequency)) + geom_bar(stat="identity",fill="green") +
theme(axis.text.x = element_text(angle = 40, hjust = 1))+geom_text(aes(label=Frequency), position=position_dodge(width=0.9), vjust=-0.25)
plot_4g<- dt_4g[1:20,]
plot_4g[] <- lapply(plot_4g, gsub, pattern='_', replacement=' ')
plot_4g$Frequency <- as.numeric(plot_4g$Frequency)
ggplot(data=plot_4g, aes(x=reorder(Content, -Frequency), y=Frequency)) + geom_bar(stat="identity",fill="pink") +
theme(axis.text.x = element_text(angle = 40, hjust = 1))+geom_text(aes(label=Frequency), position=position_dodge(width=0.9), vjust=-0.25)
#Word clouds
suppressWarnings (
wordcloud(words = dt_1g$Content,
freq = dt_1g$Frequency,
min.freq = 1,
max.words = 300,
random.order = FALSE,
# rot.per = 0.35,
colors=brewer.pal(8, "Dark2"))
)
suppressWarnings (
wordcloud(words = dt_2g$Content,
freq = dt_2g$Frequency,
min.freq = 1,
max.words = 300,
random.order = FALSE,
# rot.per = 0.35,
colors=brewer.pal(8, "Dark2"))
)
suppressWarnings (
wordcloud(words = dt_3g$Content,
freq = dt_3g$Frequency,
min.freq = 1,
max.words = 300,
random.order = FALSE,
# rot.per = 0.35,
colors=brewer.pal(8, "Dark2"))
)
suppressWarnings (
wordcloud(words = dt_4g$Content,
freq = dt_4g$Frequency,
min.freq = 1,
max.words = 300,
random.order = FALSE,
# rot.per = 0.35,
colors=brewer.pal(8, "Dark2"))
)
I am planning to build the final prediction model based on algorithms found through my own research and the discussions in the Coursera weekly forums. The Stupid Backoff algorithm and Markov chains look particularly promising. The rough steps I intend to follow are listed below (a sketch of step 2 follows the list):
1. Store the 1- to 4-gram data tables in data files and load them, instead of rebuilding them every time.
2. Design a prediction function, based on the chosen algorithm (Stupid Backoff / Markov chains), that looks up these 1- to 4-grams.
3. Design a Shiny UI that accepts one word or a set of words as input and predicts the next possible word from the models built.
4. Check how fast and efficient the model is.
5. Design a methodology to handle situations where the next word cannot be predicted from the built n-grams.
6. Ensure the app is efficient and reliable 99% of the time.
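As an illustration of step 2, here is a minimal, simplified sketch of a backoff lookup in the spirit of Stupid Backoff over the n-gram data.tables built above (the fixed 0.4 discount factor of the full algorithm is omitted). The helper names prep_ngram and predict_next_word, the top = 3 default, and the unigram fallback are my own assumptions for this sketch, not code from this report.
#Simplified backoff lookup sketch (illustrative only, not the final model)
library(data.table)
#split each n-gram into a prefix and the word that follows it
prep_ngram <- function(dt) {
  parts <- strsplit(dt$Content, "_", fixed = TRUE)
  dt[, Prefix   := vapply(parts, function(w) paste(head(w, -1), collapse = " "), character(1))]
  dt[, NextWord := vapply(parts, function(w) tail(w, 1), character(1))]
  setkey(dt, Prefix)
  dt
}
ngram_tables <- list(`3` = prep_ngram(dt_4g),  # 3-word prefix -> 4th word
                     `2` = prep_ngram(dt_3g),  # 2-word prefix -> 3rd word
                     `1` = prep_ngram(dt_2g))  # 1-word prefix -> 2nd word
#back off from the longest prefix to the shortest until a match is found
predict_next_word <- function(phrase, top = 3) {
  words <- unlist(strsplit(phrase, "\\s+"))  # NOTE: input should be cleaned like the corpus
  for (n in 3:1) {
    if (length(words) < n) next
    prefix <- paste(tail(words, n), collapse = " ")
    hits <- ngram_tables[[as.character(n)]][Prefix == prefix][order(-Frequency)]
    if (nrow(hits) > 0) return(head(hits$NextWord, top))
  }
  head(dt_1g$Content, top)  # fall back to the most frequent single words
}
#predict_next_word("happy new")
Note that because stop words were removed during cleaning, a model built from these tables would never suggest common words such as "the"; I may revisit that choice when building the final model.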