library(tm)
library(RWeka)
library(dplyr)
library(RColorBrewer)
library(wordcloud)
library(caret)
library(ggplot2)
library("stringi")
library(gridExtra)

Milestone Submission
# Project overview
This milestone report loads and cleans the English SwiftKey/HC Corpora text data, explores word and N-gram frequencies, and outlines the next steps towards a Shiny application that predicts the next word a user will type.
Data Loading and Cleaning
Data Loading
- This data is provided as part of the Data Science Capstone course on Coursera and can be downloaded from https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip (a download-and-unzip sketch follows this list).
- The archive contains files named LOCALE.blogs.txt, where LOCALE is each of the four locales en_US, de_DE, ru_RU and fi_FI. The data comes from a corpus called HC Corpora; see the About the Corpora reading for more details. The files have been language filtered but may still contain some foreign text.
- The first step of this project is to tokenize the text and apply profanity filtering (i.e., removing bad words), as the data may contain offensive and profane language.
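The paths used in the code below assume the archive has been unzipped into the working directory. A minimal download-and-unzip sketch (the destination file name and extraction directory are assumptions chosen to match those paths):

# Download and unzip the dataset if it is not already present.
# The file name and exdir below are assumptions matching the paths used later.
zip_url  <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zip_file <- "Coursera-SwiftKey.zip"
if (!dir.exists("./Coursera-SwiftKey/final")) {
  download.file(zip_url, destfile = zip_file, mode = "wb")
  unzip(zip_file, exdir = "./Coursera-SwiftKey")
}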
twitter_url="./Coursera-SwiftKey/final/en_US/en_US.twitter.txt"
blogs_url= "./Coursera-SwiftKey/final/en_US/en_US.blogs.txt"
news_url="./Coursera-SwiftKey/final/en_US/en_US.news.txt"
twitter_file = readLines(twitter_url, encoding="UTF-8")
blogs_file = readLines(blogs_url, encoding="UTF-8")
news_file = readLines(news_url, encoding="UTF-8")

### Have a look at the data
data.frame(file = c("Twitter", "Blogs", "News"),
           size_MB = c(file.info(twitter_url)$size/1024^2,
                       file.info(blogs_url)$size/1024^2,
                       file.info(news_url)$size/1024^2),
           lines = c(length(twitter_file),
                     length(blogs_file),
                     length(news_file)),
           longest_line = c(summary(nchar(twitter_file))[6],
                            summary(nchar(blogs_file))[6],
                            summary(nchar(news_file))[6]),
           word_count = c(sum(stri_count_words(twitter_file)),
                          sum(stri_count_words(blogs_file)),
                          sum(stri_count_words(news_file))))

     file  size_MB   lines longest_line word_count
1 Twitter 159.3641 2360148          140   30093372
2   Blogs 200.4242  899288        40833   37546250
3    News 196.2775   77259         5760    2674536
Data cleaning
- To keep the following model-building steps manageable, the prediction model is built on a small subset of the data (a random-sampling alternative is sketched below).
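Reading only the first few hundred lines of each file is fast but not necessarily representative. As a hedged alternative (not used in the rest of this report), a random sample of the full files read above could be drawn instead, for example:

# Hedged alternative (not used below): draw a random 1% sample of each file
# for a more representative subset; set.seed keeps the sample reproducible.
set.seed(1234)
sample_lines <- function(lines, fraction = 0.01) {
  lines[sample(length(lines), size = ceiling(length(lines) * fraction))]
}
twitter_sample <- sample_lines(twitter_file)
blogs_sample   <- sample_lines(blogs_file)
news_sample    <- sample_lines(news_file)

The subset used in the rest of this report simply reads the first 500 lines of each file: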
twitter_subset=readLines(twitter_url,500)
blogs_subset=readLines(blogs_url,500)
news_subset=readLines(news_url,500)
# Combine the three subsets into a single character vector
all_subset=c(twitter_subset,blogs_subset,news_subset)

- The following code removes all of the non-alphabetic elements and converts the text to lowercase.
# Remove special characters from the text
all_subset=iconv(all_subset,"UTF-8", "ASCII", sub = "")
# Remove numbers
all_subset=removeNumbers(all_subset)
# Remove extra whitespace
all_subset=stripWhitespace(all_subset)
# Remove punctuation
all_subset=removePunctuation(all_subset)
# Convert to lowercase
all_subset = tolower(all_subset)

- The following code removes profane words from the data. The "bad-words" list provided by Carnegie Mellon's School of Computer Science is used to filter profanity in the working dataset.
profanity_url="http://www.cs.cmu.edu/~biglou/resources/bad-words.txt"
download.file(profanity_url,destfile = "profanity.txt")
profanity=readLines("profanity.txt",encoding="UTF-8")
all_subset=removeWords(all_subset, profanity)

Exploratory data analysis
In this section, the analysis aims to:

1. Understand the distribution of words and the relationships between words in the corpora.
2. Understand the frequencies of words and word pairs.

To achieve this, the analysis uses N-grams to examine the frequencies of single words and of groups of words that appear in the dataset. The NGramTokenizer() function from the RWeka package is used in the following code.

## Tokenizing the sample
# Build a frequency table of the n-grams of size y found in the text x,
# sorted by decreasing frequency.
tokens <- function(x, y){
  x <- NGramTokenizer(x, Weka_control(min = y, max = y))
  x <- data.frame(table(x))
  x <- x[order(x$Freq, decreasing = TRUE), ]
  x
}
unigrams_token <- tokens(all_subset, 1)
bigrams_token <- tokens(all_subset, 2)
trigrams_token <- tokens(all_subset, 3)
all_data_unigrams <- c(twitter_file, news_file, blogs_file) %>% tokens(1)

## Plot the frequencies of the top 15 words in each N-gram
p1 = unigrams_token[1:15,] %>%
  ggplot(aes(x = reorder(x, Freq), y = Freq)) +
  geom_bar(stat = 'identity',
           colour = "lightgrey",
           fill = "lightcoral") +
  geom_text(label = unigrams_token$Freq[1:15],
            size = 2.5,
            hjust = 3) +
  xlab("1-gram") +
  ylab("Frequency") +
  ggtitle("Top 15 unigrams") +
  theme(plot.title = element_text(hjust = 0.5)) +
  coord_flip()

p2 = bigrams_token[1:15,] %>%
  ggplot(aes(x = reorder(x, Freq), y = Freq)) +
  geom_bar(stat = 'identity',
           colour = "lightgrey",
           fill = "lightblue") +
  geom_text(label = bigrams_token$Freq[1:15],
            size = 2.5,
            hjust = 3) +
  xlab("2-gram") +
  ylab("Frequency") +
  ggtitle("Top 15 bigrams") +
  theme(plot.title = element_text(hjust = 0.5)) +
  coord_flip()

p3 = trigrams_token[1:15,] %>%
  ggplot(aes(x = reorder(x, Freq), y = Freq)) +
  geom_bar(stat = 'identity',
           colour = "lightgrey",
           fill = "springgreen4") +
  geom_text(label = trigrams_token$Freq[1:15],
            size = 2.5,
            hjust = 3) +
  xlab("3-gram") +
  ylab("Frequency") +
  ggtitle("Top 15 trigrams") +
  theme(plot.title = element_text(hjust = 0.5)) +
  coord_flip()

grid.arrange(p1, p2, p3, nrow = 2)

## The 100 most frequent words
# Tokenizing the words in the dataset
words <- WordTokenizer(all_subset)
wordcloud(words, scale=c(5,0.1), max.words=100, random.order=FALSE,
          rot.per=0.5, use.r.layout=FALSE, colors=brewer.pal(8,"Accent"))

Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation drops documents
Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x, tm::stopwords())): transformation drops documents
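As an additional exploratory check that is not part of the original analysis, the full-data unigram table built above (all_data_unigrams) can be used to estimate how many unique words are needed to cover a given share of all word instances; a minimal sketch:

# Hedged sketch: number of unique words needed to cover a given share of
# all word instances, based on the full-data unigram frequency table.
coverage <- function(freq_table, threshold) {
  freq <- sort(freq_table$Freq, decreasing = TRUE)
  which(cumsum(freq) / sum(freq) >= threshold)[1]
}
coverage(all_data_unigrams, 0.5)  # unique words covering 50% of instances
coverage(all_data_unigrams, 0.9)  # unique words covering 90% of instances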
Further steps
Now that we have the n-gram frequency tables and the observations required, we can proceed to build a Shiny application that predicts the next word using an N-gram model.
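As a preview of that step, the following sketch is an illustrative assumption rather than the application's actual code: a simple backoff lookup over the trigram and bigram frequency tables built above, returning the most frequent continuation of the last one or two words typed.

# Hedged sketch of a simple backoff next-word lookup using the n-gram
# frequency tables built above; illustrative only, not the final app code.
predict_next_word <- function(phrase, trigrams, bigrams) {
  words <- unlist(strsplit(tolower(phrase), "\\s+"))
  n <- length(words)
  # Try trigrams first: match on the last two words of the input
  if (n >= 2) {
    prefix <- paste(words[n - 1], words[n])
    hits <- trigrams[grepl(paste0("^", prefix, " "), trigrams$x), ]
    if (nrow(hits) > 0) {
      return(sub(paste0("^", prefix, " "), "", as.character(hits$x[1])))
    }
  }
  # Back off to bigrams: match on the last word only
  prefix <- words[n]
  hits <- bigrams[grepl(paste0("^", prefix, " "), bigrams$x), ]
  if (nrow(hits) > 0) {
    return(sub(paste0("^", prefix, " "), "", as.character(hits$x[1])))
  }
  NA_character_  # no prediction found
}

predict_next_word("thanks for", trigrams_token, bigrams_token)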