Summary

Around the world, people are spending more and more time on their mobile devices for email, social networking, banking and a host of other activities. However, typing on mobile devices can be slow and tedious. SwiftKey created a technology that makes it easier for everyone to communicate and work with their mobile devices. The company’s best-known product is the SwiftKey smart keyboard app, which learns from users as they type and helps them type faster on their phones. One of its most important features is proposing three alternative words to continue the text being typed.

In this final project we will apply natural language processing (NLP), text mining and R tools for exploratory data analysis and subsequent text modeling and prediction.

The datasets for this project can be downloaded from the Coursera-SwiftKey Datasets page.

The data originally comes from HC Corpora.

In this report, we will focus on the files that contain English data: en_US.blogs.txt, en_US.news.txt and en_US.twitter.txt.

Datasets

Install and load necessary libraries.

library(tm) # Text Mining Package
# library(readr)
library(qdap) # to assist in quantitative discourse analysis
library(tidyverse)
library(stringi) # Character String Processing Facilities
library(RWeka) # collection of ML algorithms for data mining tasks
library(ggplot2)
library(wordcloud) # functionality to create word clouds
library(SnowballC) # implements Porter's word stemming algorithm 
# library(gridExtra)
# getwd()
set.seed(2701)

Download files

# Downloaded data files are stored in the 'data' directory.
datafolder <- "data"
# url with the text data files to be analyzed
url  <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
fname <- "Coursera-SwiftKey.zip"
fpath <- paste(datafolder, fname, sep="/")

### Files are downloaded if they are not in the 'data' directory.
if (!file.exists(fpath)){
    download.file(url, destfile=fpath, method="curl")
}
unzip(zipfile=fpath, exdir=datafolder)

Basic information on datasets

# We go through the 'data' directory and create a list with the paths to 
# the files we are interested in "en_*.txt".

file_l <- list.files(path=datafolder, recursive=T, pattern=".*en_.*.txt")

# Apply a Function over a List or Vector with lapply
file_info <- lapply(paste(datafolder, file_l, sep="/"),
                    function(file_) {

            # Establish connection
            fileConnection <- file(file_, open='r')
            # Read the lines from the file
            linesInFile <- readLines(fileConnection, skipNul=TRUE)
            # size_original_(MB)
            file_Size_origin <- round(file.info(file_)$size/1024^2, 1)
            # size_in_R_(MB)            
            fileSize <- format(object.size(linesInFile), units ='Mb')
            # number of lines
            fileNoOfLines <- as.numeric(length(linesInFile))
            # number of words
            fileWords <- sum(stri_count_words(linesInFile))
            # index of the longest line
            nchars <- lapply(linesInFile, nchar)
            maxchars <- which.max(nchars)
            # Extract dataset name from file path
            name <- stri_extract_last_regex(file_, '([a-zA-Z]+)[^\\.txt]')
            close(fileConnection)
            return(c(name, file_Size_origin, fileSize, 
                     fileNoOfLines, fileWords, maxchars))

})
# unlist() flattens the list returned by lapply into a single vector of its
# atomic components; matrix() then reshapes that vector into one row per file.
file_info_df <- data.frame(matrix(unlist(file_info), 
                                  nrow=length(file_info),
                                  byrow=TRUE))
colnames(file_info_df) <- c("file", "size_original_(MB)", "size_in_R_(MB)",
                            "lines", "words", "longest_line_index")
# keep the line counts of the three files for the sampling step later on
blogs_lines <- file_info_df$lines[1]
news_lines <- file_info_df$lines[2]
twitter_lines <- file_info_df$lines[3]
knitr::kable(file_info_df)
file      size_original_(MB)   size_in_R_(MB)   lines      words       longest_line_index
blogs     200.4                255.4 Mb          899288    37546250    483415
news      196.3                257.3 Mb         1010242    34762395    123628
twitter   159.4                319 Mb           2360148    30093413    26

Pre-processing and Cleanup

Create file corpus, clean the corpus

We create the remove_internet_chars function to remove URLs, hashtags, @mentions and e-mail addresses. We also download a list of profanity words in order to remove them from the texts.

remove_internet_chars <- function(x){
    x <- gsub("[^ ]+@[^ ]+", " ", x)   # e-mail addresses
    x <- gsub(" @[^ ]+", " ", x)       # @mentions
    x <- gsub("#[^ ]+", " ", x)        # hashtags
    x <- gsub("[^ ]+://[^ ]+", " ", x) # URLs
    return(x)
}
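
A quick check on a short, made-up example (illustrative only; the string below is not taken from the datasets) shows what each pattern strips out:

# Illustrative string: the e-mail address, URL, hashtag and @mention are each
# replaced by a space
remove_internet_chars("email me at user@example.com or see https://example.com #nlp @someone")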

### File with offensive words to remove them from text files.
badWordsFileURL <- 'http://www.bannedwordlist.com/lists/swearWords.txt'
badWordsFileName <- 'swearWords.txt'
fpath_badwords <- paste(datafolder, badWordsFileName, sep="/")
### Files are downloaded if they are not in the 'data' directory.
if (!file.exists(fpath_badwords)){
    download.file(badWordsFileURL, destfile=fpath_badwords, method="curl")
}
badwords <- readLines(fpath_badwords)
## Warning in readLines(fpath_badwords): incomplete final line found on 'data/
## swearWords.txt'
profanity <- badwords # removeWords() expects a character vector, not a VectorSource
import_file <- function(file_, lines_){
    # Read roughly 1/1000 of the lines of each file so that the corpus
    # stays manageable for this exploratory analysis
    fileConnection <- file(paste0('./data/final/en_US/', file_))
    linesInFile <- readLines(fileConnection, n = as.numeric(lines_)/1000, skipNul=TRUE)
    close(fileConnection)
    return(linesInFile)
}
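
As a quick illustrative check based on the line counts computed above, the 1/1000 sampling amounts to roughly 900 to 2,400 lines per file:

# Approximate number of lines read from each file under the 1/1000 sampling
sapply(c(blogs = blogs_lines, news = news_lines, twitter = twitter_lines),
       function(n) floor(as.numeric(n) / 1000))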


blogs <- import_file('en_US.blogs.txt', blogs_lines)
news <- import_file('en_US.news.txt', news_lines)
twitters <- import_file('en_US.twitter.txt', twitter_lines)

# combine the three samples into a single character vector
text_all <- c(blogs, news, twitters)

corpusFeeds <- VCorpus(VectorSource(text_all))


# normalize the different apostrophe characters to a plain apostrophe
corpusFeeds <- tm_map(corpusFeeds, content_transformer(gsub),
                      pattern="’|'|`|'", replacement="'")
# convert contraction
corpusFeeds <- tm_map(corpusFeeds,
                      content_transformer(replace_contraction))
# convert abbreviation
corpusFeeds <- tm_map(corpusFeeds,
                      content_transformer(replace_abbreviation))
# convert to lower case
corpusFeeds <- tm_map(corpusFeeds, content_transformer(tolower))
# remove URLs
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
corpusFeeds <- tm_map(corpusFeeds, content_transformer(removeURL))
corpusFeeds <- tm_map(corpusFeeds,
                  content_transformer(remove_internet_chars))
# remove non-word characters (mis-encoded symbols and what would be emojis)
corpusFeeds <- tm_map(corpusFeeds, content_transformer(gsub),
                      pattern="\\W", replacement=" ")
# remove anything other than English letters or space
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
corpusFeeds <- tm_map(corpusFeeds, content_transformer(removeNumPunct))
# remove stop words
corpusFeeds <- tm_map(corpusFeeds, removeWords, stopwords("english"))
# remove extra whitespace
corpusFeeds <- tm_map(corpusFeeds, stripWhitespace)
# Remove numbers
corpusFeeds <- tm_map(corpusFeeds, removeNumbers)
# Remove punctuations
corpusFeeds <- tm_map(corpusFeeds, removePunctuation)
# Remove bad words
corpusFeeds <- tm_map(corpusFeeds, removeWords, profanity)
# convert to plain text
corpusFeeds <- tm_map(corpusFeeds, PlainTextDocument)
# size of corpus
# summary(corpusFeeds)
# convert to Stemmed
corpusFeedsStemmed <- tm_map(corpusFeeds, stemDocument)
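
To see what stemming does, SnowballC's wordStem can be applied to a few example words (chosen for illustration, not taken from the corpus):

# Porter stemming collapses inflected forms onto a common stem,
# e.g. "running" and "runs" both map to "run"
wordStem(c("running", "runs", "runner", "easily", "connection"))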

Word clouds are used to visualize the frequency of words in the corpus. The figures below show word clouds of the corpus with and without stemming.

corpus_to_matrix <- function(corpus_){
    # Build a term-document matrix and return a data frame of word frequencies
    # sorted in decreasing order
    dtmCorpus <- TermDocumentMatrix(corpus_)
    corpusMatrix <- as.matrix(dtmCorpus)
    sortedMatrix <- sort(rowSums(corpusMatrix), decreasing = TRUE)
    corpus_df <- data.frame(word = names(sortedMatrix), freq = sortedMatrix)
    return(corpus_df)
}


corpus_df <- corpus_to_matrix(corpusFeeds)
corpus_stemmed_df <- corpus_to_matrix(corpusFeedsStemmed)
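
As a quick illustrative check, comparing the number of distinct terms in the two frequency tables shows how much stemming shrinks the vocabulary:

# Vocabulary size with and without stemming (values depend on the sampled lines)
c(unstemmed = nrow(corpus_df), stemmed = nrow(corpus_stemmed_df))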

Word clouds of the corpus without stemming.

corpus_to_wordcloud <- function(corpus_df_){
    wordcloud(words = corpus_df_$word,
          freq = corpus_df_$freq,
          min.freq = 1,
          max.words = 50,
          random.order = FALSE,
          rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
}
corpus_to_wordcloud(corpus_df)

Word clouds of the corpus with stemming.

corpus_to_wordcloud(corpus_stemmed_df)

N-Gram

N-grams (uni-, bi- and tri-grams) have been generated, and bar charts of the 20 most frequent n-grams are plotted below.

corpus_to_gram <- function(corpus_, number_gram_){
    # Tokenize the corpus into n-grams of the requested size and return a
    # frequency table sorted in decreasing order
    df_for_Grams <- data.frame(text = sapply(corpus_, as.character), 
                               stringsAsFactors = FALSE)
    GramToken <- NGramTokenizer(df_for_Grams,
                                Weka_control(min=number_gram_,
                                             max=number_gram_))
    Grams <- data.frame(table(GramToken))
    Grams <- Grams[order(Grams$Freq, decreasing = TRUE),]
    colnames(Grams) <- c("Word", "Frequency")
    return(Grams)
}

uniGrams <- corpus_to_gram(corpusFeeds, 1)
knitr::kable(uniGrams[1:20,])
         Word      Frequency
9654     s              1424
12402    will           1134
7773     one            1030
6536     like            974
1608     can             972
6128     just            897
11444    time            804
4741     get             597
11977    us              562
6244     know            549
7640     now             507
2837     day             501
7520     new             496
4836     good            472
7344     much            460
9085     really          448
8185     people          444
347      also            415
6770     make            399
4288     first           398

biGrams <- corpus_to_gram(corpusFeeds, 2)
knitr::kable(biGrams[1:20,])
         Word           Frequency
24143    let us                81
20681    hyun suk              60
6176     can see               51
30598    one day               51
26285    make sure             49
21915    jae ha                48
29472    new york              48
19322    haven t               47
14862    feel like             43
6890     chang min             42
23521    last week             42
19794    high school           40
46483    u s                   40
13627    every day             39
37372    s day                 38
50489    year old              38
36729    right now             37
6099     can get               34
15535    first time            33
23502    last night            33

triGrams <- corpus_to_gram(corpusFeeds, 3)
knitr::kable(triGrams[1:20,])
         Word                    Frequency
10672    cricket world cup              20
23104    hyun suk asks                  18
49256    team leader han                18
28944    love spending time             15
41569    rock paper scissors            15
56459    world cup dvd                  15
56973    year old daughter              15
4351     believers insist can           14
5403     books get chores               14
8166     children s books               14
15188    every single day               14
19071    get chores done                14
22055    hide books get                 14
31841    mother s day                   14
33149    new york city                  14
52216    two edged sword                14
24674    joe o reilly                   13
26368    late last night                13
41293    right around corner            13
6395     cab will run                   12

plot_gram <- function(xgram, title_){
    plot_ <- ggplot(xgram, aes(reorder(Word, Frequency), y = Frequency)) +
        geom_bar(stat = "identity", fill="lightskyblue4") +
        geom_text(aes(label = Frequency), hjust = +1.3, color="white") +
        coord_flip() +
        theme_minimal() +
        theme(panel.grid.major.x = element_blank(),
              panel.grid.minor.x = element_blank(),
              panel.grid.major.y = element_blank(),
              axis.ticks.x = element_blank(),
              axis.text.x = element_blank()) +
        labs(x="", title=paste("20 most frequent", paste0(title_, "s")))

    return(plot_)
}
uniGrams_plot <- plot_gram(uniGrams[1:20, ], "1-Gram")
uniGrams_plot

biGrams_plot <- plot_gram(biGrams[1:20, ], "2-Gram")
biGrams_plot

triGrams_plot <- plot_gram(triGrams[1:20, ], "3-Gram")
triGrams_plot
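
Looking ahead to the prediction task described in the Summary, the bigram table can already be used to suggest likely continuations of a given word, in the spirit of SwiftKey's three suggestions. The sketch below is only an illustration that reuses the biGrams table from this report; the actual prediction model will be developed in the next stage of the project.

# Minimal sketch: given a single word, return the n_ most frequent words that
# follow it according to the bigram frequency table (biGrams is already sorted
# by decreasing frequency).
suggest_next <- function(word_, bigrams_ = biGrams, n_ = 3){
    words_ <- as.character(bigrams_$Word)
    candidates <- words_[startsWith(words_, paste0(tolower(word_), " "))]
    # keep only the second token of each matching bigram
    head(sub("^\\S+\\s+", "", candidates), n_)
}

# e.g. suggest_next("last") might return continuations such as "week" and "night"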