This is the week 2 milestone report for the Coursera capstone project on analysing the data sets provided by SwiftKey and using them to build models that predict the next word(s) once a user has entered the first few words.
Download the data set from the link provided in the course. The zipped file contains blog, news, and Twitter texts in four languages (English, German, Finnish, and Russian). I have used only the English files for the analysis and prediction.
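For completeness, below is a minimal download-and-unzip sketch for a first run. The URL shown is the standard Coursera SwiftKey dataset location and is an assumption on my part, since the original link is not reproduced here.
#Download and unzip the SwiftKey data set (run once)
#URL is assumed to be the standard Coursera capstone dataset location
zipUrl  <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zipFile <- "Coursera-SwiftKey.zip"
if (!file.exists(zipFile)) {
  download.file(zipUrl, destfile = zipFile, mode = "wb")
}
if (!dir.exists("final")) {
  unzip(zipFile)  # extracts a final/ folder with en_US, de_DE, fi_FI and ru_RU subfolders
}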
The sampling needs to be done only the first time; the samples are stored as text files that can be reloaded for further analysis.
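As a small run-once guard (my own addition, not part of the original workflow), the sampling chunk below could be skipped when the sample files already exist on disk:
#Skip the sampling code below if the 10% sample files already exist
if (all(file.exists(c("s_b.txt", "s_n.txt", "s_t.txt")))) {
  message("Sample files already present; skip the sampling step.")
}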
#Run below code only for the first time
#delete all environment variables
rm(list = ls())
#load the required libraries
library(stringi)      # stri_stats_latex() for word counts
library(LaF)          # determine_nlines(), sample_lines()
library(pryr)         # mem_used()
library(quanteda)     # corpus(), tokens(), tokens_ngrams(), dfm()
library(ggplot2)      # frequency plots
library(wordcloud)    # word clouds
library(RColorBrewer) # brewer.pal() colour palettes
setwd("~/R/Coursera Word Prediction/final/en_US")
# blogs
blogsFileName <- "en_US.blogs.txt"
con <- file(blogsFileName, open = "r")
blogs <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)
# news
setwd("~/R/Coursera Word Prediction/final/en_US")
newsFileName <- "en_US.news.txt"
con <- file(newsFileName, open = "r")
news <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)
# twitter
twitterFileName <- "en_US.twitter.txt"
con <- file(twitterFileName, open = "r")
twitter <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)
rm(con)
# num lines per file
numLines <- sapply(list(blogs, news, twitter), length)
# num characters per file
numChars <- sapply(list(nchar(blogs), nchar(news), nchar(twitter)), sum)
# num words per file
numWords <- sapply(list(blogs, news, twitter), stri_stats_latex)[4,]
Basic_summ <- data.frame(
File = c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt"),
Lines = numLines,
Characters = numChars,
Words = numWords)
#Sampling for Analysis
#Take 10% sample from twitter and write to a txt file
con <- file("en_US.twitter.txt", "r")
t_l <- determine_nlines("en_US.twitter.txt")
#t_l #2360148
t_sam <- sample_lines("en_US.twitter.txt", t_l*0.1, nlines = t_l)
#length(t_sam) #236014
#class(t_sam)
fileConn<-file("s_t.txt")
writeLines(t_sam, fileConn)
close(fileConn)
close(con)
#Take 10% sample from blogs and write to a txt file
setwd("~/R/Coursera Word Prediction/final/en_US")
con <- file("en_US.blogs.txt", "r")
t_l <- determine_nlines("en_US.blogs.txt")
#t_l #899288
t_sam <- sample_lines("en_US.blogs.txt", t_l*0.1, nlines = t_l)
fileConn<-file("s_b.txt")
writeLines(t_sam, fileConn)
close(fileConn)
close(con)
#Take 10% sample from news and write to a txt file
setwd("~/R/Coursera Word Prediction/final/en_US")
con <- file("en_US.news.txt", "r")
t_l <- determine_nlines("en_US.news.txt")
#t_l #1010242
t_sam <- sample_lines("en_US.news.txt", t_l*0.1, nlines = t_l)
fileConn<-file("s_n.txt")
writeLines(t_sam, fileConn)
close(fileConn)
close(con)
mem_used()#851 MB
## 851 MB
#Load list of profanity words
setwd("~/R/Coursera Word Prediction/final/en_US")
conprofane <- file("./bad-words.txt", "r")
profanity_vector <- readLines(conprofane, skipNul = TRUE)
#length(profanity_vector) #1384
close(conprofane)
#Remove blank entries and keep only unique values
profanity_vector <- unique(profanity_vector[profanity_vector != ""])
#head(profanity_vector)
#length(profanity_vector)
rm(conprofane)
#delete the variables not needed
rm(blogs)
rm(blogsFileName)
rm(news)
rm(newsFileName)
rm(twitter)
rm(twitterFileName)
rm(numChars)
rm(numLines)
rm(numWords)
rm(con)
rm(fileConn)
rm(t_l)
rm(t_sam)
The basic summary of the three files is given below. It indicates the vastness of the data.
head(Basic_summ)
## File Lines Characters Words
## 1 en_US.blogs.txt 899288 206824505 37570839
## 2 en_US.news.txt 77259 15639408 2651432
## 3 en_US.twitter.txt 2360148 162096241 30451170
Let us read back the sample text files, perform cleansing (removal of special characters, stop words, and profane words), and form n-grams of orders 1 to 4 for further exploration.
#rm(list=ls())
#Read the sample files into a corpus
setwd("~/R/Coursera Word Prediction/final/en_US")
twit <-readLines("s_t.txt", skipNul = TRUE,encoding="latin1")
blogs <- readLines("s_b.txt", skipNul = TRUE,encoding="latin1")
news <- readLines("s_n.txt", skipNul = TRUE,encoding="latin1")
#Combine the three samples into one corpus
#my_corp <- corpus(c(twit,blogs,news))
my_corp <- corpus(c(news,blogs,twit))
rm("twit")
rm(blogs)
rm(news)
#mem_used()#317MB
#View(my_corp[1:1])
#Each document contains many sentences; reshape the corpus into sentences
my_corp <- corpus_reshape(my_corp,to="sentences")
#my_corp[1:2]
#rm(m_tokens)
#Note: tokens() was crashing RStudio when memory ran low, hence the memory checks below
mem_used()
## 411 MB
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4174006 223.0 8354811 446.2 8354811 446.2
## Vcells 22106284 168.7 76585769 584.4 95732210 730.4
m_tokens <- tokens( my_corp,remove_punct=TRUE,
remove_symbols=TRUE,
remove_numbers = TRUE,
remove_url = TRUE,
remove_separators = TRUE,
split_hyphens = TRUE,
include_docvars = TRUE,
# remove_twitter=TRUE,
padding = FALSE,
verbose = quanteda_options("verbose"))
mem_used()
## 534 MB
#m_tokens[1:4]
#class(m_tokens)
#remove stop words
m_tokens <- tokens_select(m_tokens, pattern = stopwords("en"), selection = "remove")
#remove profanity words from our list
m_tokens <- tokens_remove(m_tokens,pattern=profanity_vector)
mem_used()#512 MB
## 512 MB
format(object.size(m_tokens),units="Mb")
## [1] "232.4 Mb"
format(object.size(my_corp),units="Mb")
## [1] "253.9 Mb"
#remove one character strings
#m_tokens <- tokens_select(m_tokens, min_nchar=2, selection = "remove")
#summary(m_tokens[1:10])
#remove non breaking space https://github.com/quanteda/quanteda/issues/796
m_tokens <- tokens_remove(m_tokens, "\\p{Z}", valuetype = "regex")
library(textclean)
format(object.size(m_tokens),units="Mb")
## [1] "232.4 Mb"
#remove tokens that contain digits or hyphens, punctuation, or are only 1-2 characters long
m_tokens <- tokens_select(
  m_tokens,
  c("[\\d-]", "[[:punct:]]", "^.{1,2}$"), # digits/hyphens, punctuation marks, 1-2 character tokens
  selection = "remove",
  valuetype = "regex",
  verbose = TRUE
)
## removed 48,523 features
m_tokens[1:20]
## Tokens consisting of 20 documents.
## text1.1 :
## [1] "Brett" "Favre" "still" "able" "throw" "proper"
## [7] "pass" "practice" "Vikings" "still" "given" "hope"
## [ ... and 8 more ]
##
## text1.2 :
## [1] "Favre" "able" "minimal" "work" "practice" "Friday"
## [7] "first" "time" "week" "making" "soft" "tosses"
## [ ... and 6 more ]
##
## text1.3 :
## [1] "listed" "questionable" "injury" "report" "game"
## [6] "Giants" "game" "time" "decision" "whether"
## [11] "make" "straight"
## [ ... and 3 more ]
##
## text2.1 :
## [1] "INDIANAPOLIS" "Danny" "Granger" "scored" "points"
## [6] "help" "Indiana" "Pacers" "defeat" "Orlando"
## [11] "Magic" "Tuesday"
## [ ... and 8 more ]
##
## text3.1 :
## [1] "two" "groups" "numbering" "people" "combined" "began"
## [7] "interact" "point" "thatâ" "trouble" "started" "Conroy"
## [ ... and 1 more ]
##
## text4.1 :
## [1] "George" "Spafford" "principal" "consultant" "Pepperweed"
##
## [ reached max_ndoc ... 14 more documents ]
#remove some specific unwanted word patterns
m_tokens<- tokens_select(
m_tokens,
pattern = c("blah"),
selection = "remove",
valuetype = "regex",
verbose = TRUE
)
## removed 15 features
#N GRAMS
#generate n-grams of orders 1 to 4 from the cleaned tokens
t_1g <- tokens_ngrams(m_tokens, n = 1)
format(object.size(t_1g),units="Mb")
## [1] "227.6 Mb"
t_2g <- tokens_ngrams(m_tokens, n = 2)
format(object.size(t_2g),units="Mb")
## [1] "425.2 Mb"
t_3g <- tokens_ngrams(m_tokens, n = 3)
format(object.size(t_3g),units="Mb")
## [1] "505 Mb"
t_4g <- tokens_ngrams(m_tokens, n = 4)
format(object.size(t_4g),units="Mb")
## [1] "475.7 Mb"
rm(my_corp)
#head(t_3g)
#head(t_4g)
#Milestone report: analysis of the n-grams by plotting
#Convert all n-grams to data.tables to save memory and improve speed
library(data.table)
# convert each n-gram tokens object to a dfm, then to a data.table of feature frequencies
x_1g_dfm <- dfm(t_1g)
dt_1g <- data.frame(Content = featnames(x_1g_dfm), Frequency = colSums(x_1g_dfm),
row.names = NULL, stringsAsFactors = FALSE)
setDT(dt_1g)
dt_1g <- dt_1g[order( -Frequency)]
rm(x_1g_dfm)
format(object.size(dt_1g),units="Mb")
## [1] "11.5 Mb"
x_2g_dfm <- dfm(t_2g)
dt_2g <- data.frame(Content = featnames(x_2g_dfm), Frequency = colSums(x_2g_dfm),
row.names = NULL, stringsAsFactors = FALSE)
setDT(dt_2g)
rm(x_2g_dfm)
format(object.size(dt_2g),units="Mb")
## [1] "226.3 Mb"
#sort by Freq in Desc
dt_2g <- dt_2g[order( -Frequency)]
#dt_2g[1:10,]
x_3g_dfm <- dfm(t_3g)
dt_3g <- data.frame(Content = featnames(x_3g_dfm), Frequency = colSums(x_3g_dfm),
row.names = NULL, stringsAsFactors = FALSE)
setDT(dt_3g)
rm(x_3g_dfm)
dt_3g <- dt_3g[order( -Frequency)]
format(object.size(dt_3g),units="Mb")
## [1] "323.6 Mb"
#4 grams
x_4g_dfm <- dfm(t_4g)
dt_4g <- data.frame(Content = featnames(x_4g_dfm), Frequency = colSums(x_4g_dfm),
row.names = NULL, stringsAsFactors = FALSE)
setDT(dt_4g)
rm(x_4g_dfm)
dt_4g <- dt_4g[order( -Frequency)]
format(object.size(dt_4g),units="Mb")
## [1] "294.9 Mb"
#number of unique n-grams of each order
dt_1g[, .N]
## [1] 158555
dt_2g[, .N]
## [1] 2823752
dt_3g[, .N]
## [1] 3608018
dt_4g[, .N]
## [1] 3130270
#remove unwanted variables
rm(t_1g)
rm(t_2g)
rm(t_3g)
rm(t_4g)
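Since step 1 of the plan at the end of this report is to store these n-gram tables rather than rebuild them on every run, a minimal saveRDS sketch follows; the .rds file names are my own choice, not part of the code above.
#Persist the n-gram frequency tables for later use by the prediction model
saveRDS(dt_1g, "dt_1g.rds")
saveRDS(dt_2g, "dt_2g.rds")
saveRDS(dt_3g, "dt_3g.rds")
saveRDS(dt_4g, "dt_4g.rds")
#later: dt_1g <- readRDS("dt_1g.rds"), and so on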
Let us plot the most frequent terms for each n-gram order (1-gram to 4-gram).
#Plot of the top terms in each n-gram order, sorted by Frequency
plot_1g<- dt_1g[1:20,]
plot_1g[] <- lapply(plot_1g, gsub, pattern='_', replacement=' ')
plot_1g$Frequency <- as.numeric(plot_1g$Frequency)
ggplot(data=plot_1g, aes(x=reorder(Content, -Frequency), y=Frequency)) + geom_bar(stat="identity",fill="red") +
theme(axis.text.x = element_text(angle = 40, hjust = 1))+geom_text(aes(label=Frequency), position=position_dodge(width=0.9), vjust=-0.25)
plot_2g<- dt_2g[1:10,]
plot_2g[] <- lapply(plot_2g, gsub, pattern='_', replacement=' ')
plot_2g$Frequency <- as.numeric(plot_2g$Frequency)
ggplot(data=plot_2g, aes(x=reorder(Content, -Frequency), y=Frequency)) + geom_bar(stat="identity",fill="yellow") +
theme(axis.text.x = element_text(angle = 40, hjust = 1))+geom_text(aes(label=Frequency), position=position_dodge(width=0.9), vjust=-0.25)
plot_3g<- dt_3g[1:20,]
plot_3g[] <- lapply(plot_3g, gsub, pattern='_', replacement=' ')
plot_3g$Frequency <- as.numeric(plot_3g$Frequency)
ggplot(data=plot_3g, aes(x=reorder(Content, -Frequency), y=Frequency)) + geom_bar(stat="identity",fill="green") +
theme(axis.text.x = element_text(angle = 40, hjust = 1))+geom_text(aes(label=Frequency), position=position_dodge(width=0.9), vjust=-0.25)
plot_4g<- dt_4g[1:20,]
plot_4g[] <- lapply(plot_4g, gsub, pattern='_', replacement=' ')
plot_4g$Frequency <- as.numeric(plot_4g$Frequency)
ggplot(data=plot_4g, aes(x=reorder(Content, -Frequency), y=Frequency)) + geom_bar(stat="identity",fill="pink") +
theme(axis.text.x = element_text(angle = 40, hjust = 1))+geom_text(aes(label=Frequency), position=position_dodge(width=0.9), vjust=-0.25)
#Word clouds
suppressWarnings (
wordcloud(words = dt_1g$Content,
freq = dt_1g$Frequency,
min.freq = 1,
max.words = 300,
random.order = FALSE,
# rot.per = 0.35,
colors=brewer.pal(8, "Dark2"))
)
suppressWarnings (
wordcloud(words = dt_2g$Content,
freq = dt_2g$Frequency,
min.freq = 1,
max.words = 300,
random.order = FALSE,
# rot.per = 0.35,
colors=brewer.pal(8, "Dark2"))
)
suppressWarnings (
wordcloud(words = dt_3g$Content,
freq = dt_3g$Frequency,
min.freq = 1,
max.words = 300,
random.order = FALSE,
# rot.per = 0.35,
colors=brewer.pal(8, "Dark2"))
)
suppressWarnings (
wordcloud(words = dt_4g$Content,
freq = dt_4g$Frequency,
min.freq = 1,
max.words = 300,
random.order = FALSE,
# rot.per = 0.35,
colors=brewer.pal(8, "Dark2"))
)
I am planning to build the final prediction model based on algorithms found through my own research and the discussions in the Coursera weekly forums. The Stupid Backoff algorithm and Markov chains look particularly promising. The rough steps I intend to follow are listed below (a sketch of step 2 follows the list):
1. Store the 1- to 4-gram data tables in data files and load them, instead of rebuilding them every time.
2. Design a prediction function, based on the chosen algorithm (Stupid Backoff / Markov chains), that looks up these 1- to 4-grams.
3. Design a Shiny UI that accepts one word or a set of words as input and predicts the next possible word from the models built.
4. Check how fast and efficient the model is.
5. Design a methodology to handle situations where the next word cannot be predicted from the built n-grams.
6. Ensure the app is efficient and reliable 99% of the time.
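As an illustration of step 2, here is a minimal, simplified sketch of a backoff lookup in the spirit of Stupid Backoff over the n-gram data.tables built above (the fixed 0.4 discount factor of the full algorithm is omitted). The helper names prep_ngram and predict_next_word, the top = 3 default, and the unigram fallback are my own assumptions for this sketch, not code from this report.
#Simplified backoff lookup sketch (illustrative only, not the final model)
library(data.table)
#split each n-gram into a prefix and the word that follows it
prep_ngram <- function(dt) {
  parts <- strsplit(dt$Content, "_", fixed = TRUE)
  dt[, Prefix   := vapply(parts, function(w) paste(head(w, -1), collapse = " "), character(1))]
  dt[, NextWord := vapply(parts, function(w) tail(w, 1), character(1))]
  setkey(dt, Prefix)
  dt
}
ngram_tables <- list(`3` = prep_ngram(dt_4g),  # 3-word prefix -> 4th word
                     `2` = prep_ngram(dt_3g),  # 2-word prefix -> 3rd word
                     `1` = prep_ngram(dt_2g))  # 1-word prefix -> 2nd word
#back off from the longest prefix to the shortest until a match is found
predict_next_word <- function(phrase, top = 3) {
  words <- unlist(strsplit(phrase, "\\s+"))  # NOTE: input should be cleaned like the corpus
  for (n in 3:1) {
    if (length(words) < n) next
    prefix <- paste(tail(words, n), collapse = " ")
    hits <- ngram_tables[[as.character(n)]][Prefix == prefix][order(-Frequency)]
    if (nrow(hits) > 0) return(head(hits$NextWord, top))
  }
  head(dt_1g$Content, top)  # fall back to the most frequent single words
}
#predict_next_word("happy new")
Note that because stop words were removed during cleaning, a model built from these tables would never suggest common words such as "the"; I may revisit that choice when building the final model.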