Summary

Around the world, people are spending more and more time on their mobile devices for email, social networking, banking and a host of other activities. However, typing on mobile devices can be slow and tedious. SwiftKey created a technology that makes it easier for everyone to communicate and work with their mobile devices. The company’s best-known product is the SwiftKey smart keyboard app, which learns from users as they type and helps them type faster on their phones. One of its most important features is proposing three alternative words to continue the text being typed.

In this final project we will apply natural language processing (NLP), text mining and R tools for exploratory data analysis and subsequent text modeling and prediction.

The datasets for this project can be downloaded from the Coursera-SwiftKey Datasets page.

The data originally comes from HC Corpora.

In this report, we will focus on the files that contain English data: en_US.blogs.txt, en_US.news.txt and en_US.twitter.txt.

Datasets

Install and load necessary libraries.

library(tm) # Text Mining Package
# library(readr)
library(qdap) # to assist in quantitative discourse analysis
library(tidyverse)
library(stringi) # Character String Processing Facilities
library(RWeka) # collection of ML algorithms for data mining tasks
library(ggplot2)
library(wordcloud) # functionality to create word clouds
library(SnowballC) # implements Porter's word stemming algorithm 
# library(gridExtra)
# getwd()
set.seed(2701)

Download files

# Downloaded data files are stored in the 'data' directory.
datafolder <- "data"
# url with the text data files to be analyzed
url  <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
fname <- "Coursera-SwiftKey.zip"
fpath <- paste(datafolder, fname, sep="/")

### Files are downloaded if they are not in the 'data' directory.
if (!file.exists(fpath)){
    download.file(url, destfile=fpath, method="curl")
}
unzip(zipfile=fpath, exdir=datafolder)

Basic information on datasets

# We go through the 'data' directory and create a list with the paths to 
# the files we are interested in "en_*.txt".

file_l <- list.files(path=datafolder, recursive=T, pattern=".*en_.*.txt")

# Apply a Function over a List or Vector with lapply
file_info <- lapply(paste(datafolder, file_l, sep="/"),
                    function(file_) {

            # Establish connection
            fileConnection <- file(file_, open='r')
            # Read the lines from the file
            linesInFile <- readLines(fileConnection, skipNul=TRUE)
            # size_original_(MB)
            file_Size_origin <- round(file.info(file_)$size/1024^2, 1)
            # size_in_R_(MB)            
            fileSize <- format(object.size(linesInFile), units ='Mb')
            # number of lines
            fileNoOfLines <- as.numeric(length(linesInFile))
            # number of words
            fileWords <- sum(stri_count_words(linesInFile))
            # index of the longest line
            nchars <- lapply(linesInFile, nchar)
            maxchars <- which.max(nchars)
            # Extract dataset name from file path
            name <- stri_extract_last_regex(file_, '([a-zA-Z]+)[^\\.txt]')
            close(fileConnection)
            return(c(name, file_Size_origin, fileSize, 
                     fileNoOfLines, fileWords, maxchars))

})
# unlist() flattens the list returned by lapply into a single vector of its
# atomic components; matrix() then reshapes that vector into one row per file.
file_info_df <- data.frame(matrix(unlist(file_info), 
                                  nrow=length(file_info),
                                  byrow=TRUE))
colnames(file_info_df) <- c("file", "size_original_(MB)", "size_in_R_(MB)",
                            "lines", "words", "longest_line_index")
# keep the line counts of the three files for the sampling step later on
blogs_lines <- file_info_df$lines[1]
news_lines <- file_info_df$lines[2]
twitter_lines <- file_info_df$lines[3]
knitr::kable(file_info_df)
file      size_original_(MB)   size_in_R_(MB)   lines      words       longest_line_index
blogs     200.4                255.4 Mb          899288    37546250    483415
news      196.3                257.3 Mb         1010242    34762395    123628
twitter   159.4                319 Mb           2360148    30093413    26

Pre-processing and Cleanup

Create file corpus, clean the corpus

We create the remove_internet_chars function to remove URLs, hashtags, @mentions and e-mail addresses. We also download a list of profanity words in order to remove them from the texts.

remove_internet_chars <- function(x){
    x <- gsub("[^ ]+@[^ ]+", " ", x)   # e-mail addresses
    x <- gsub(" @[^ ]+", " ", x)       # @mentions
    x <- gsub("#[^ ]+", " ", x)        # hashtags
    x <- gsub("[^ ]+://[^ ]+", " ", x) # URLs
    return(x)
}
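
A quick check on a short, made-up example (illustrative only; the string below is not taken from the datasets) shows what each pattern strips out:

# Illustrative string: the e-mail address, URL, hashtag and @mention are each
# replaced by a space
remove_internet_chars("email me at user@example.com or see https://example.com #nlp @someone")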

### File with offensive words to remove them from text files.
badWordsFileURL <- 'http://www.bannedwordlist.com/lists/swearWords.txt'
badWordsFileName <- 'swearWords.txt'
fpath_badwords <- paste(datafolder, badWordsFileName, sep="/")
### Files are downloaded if they are not in the 'data' directory.
if (!file.exists(fpath_badwords)){
    download.file(badWordsFileURL, destfile=fpath_badwords, method="curl")
}
badwords <- readLines(fpath_badwords)
## Warning in readLines(fpath_badwords): incomplete final line found on 'data/
## swearWords.txt'
profanity <- badwords # removeWords() expects a character vector, not a VectorSource
import_file <- function(file_, lines_){
    # Read roughly 1/1000 of the lines of each file so that the corpus
    # stays manageable for this exploratory analysis
    fileConnection <- file(paste0('./data/final/en_US/', file_))
    linesInFile <- readLines(fileConnection, n = as.numeric(lines_)/1000, skipNul=TRUE)
    close(fileConnection)
    return(linesInFile)
}
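
As a quick illustrative check based on the line counts computed above, the 1/1000 sampling amounts to roughly 900 to 2,400 lines per file:

# Approximate number of lines read from each file under the 1/1000 sampling
sapply(c(blogs = blogs_lines, news = news_lines, twitter = twitter_lines),
       function(n) floor(as.numeric(n) / 1000))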


blogs <- import_file('en_US.blogs.txt', blogs_lines)
news <- import_file('en_US.news.txt', news_lines)
twitters <- import_file('en_US.twitter.txt', twitter_lines)

# combine the three samples into a single character vector
text_all <- c(blogs, news, twitters)

corpusFeeds <- VCorpus(VectorSource(text_all))


# normalize the different apostrophe characters to a plain apostrophe
corpusFeeds <- tm_map(corpusFeeds, content_transformer(gsub),
                      pattern="’|'|`|'", replacement="'")
# convert contraction
corpusFeeds <- tm_map(corpusFeeds,
                      content_transformer(replace_contraction))
# convert abbreviation
corpusFeeds <- tm_map(corpusFeeds,
                      content_transformer(replace_abbreviation))
# convert to lower case
corpusFeeds <- tm_map(corpusFeeds, content_transformer(tolower))
# remove URLs
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
corpusFeeds <- tm_map(corpusFeeds, content_transformer(removeURL))
corpusFeeds <- tm_map(corpusFeeds,
                  content_transformer(remove_internet_chars))
# remove non-word characters (mis-encoded symbols and what would be emojis)
corpusFeeds <- tm_map(corpusFeeds, content_transformer(gsub),
                      pattern="\\W", replacement=" ")
# remove anything other than English letters or space
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
corpusFeeds <- tm_map(corpusFeeds, content_transformer(removeNumPunct))
# remove stop words
corpusFeeds <- tm_map(corpusFeeds, removeWords, stopwords("english"))
# remove extra whitespace
corpusFeeds <- tm_map(corpusFeeds, stripWhitespace)
# Remove numbers
corpusFeeds <- tm_map(corpusFeeds, removeNumbers)
# Remove punctuations
corpusFeeds <- tm_map(corpusFeeds, removePunctuation)
# Remove bad words
corpusFeeds <- tm_map(corpusFeeds, removeWords, profanity)
# convert to plain text
corpusFeeds <- tm_map(corpusFeeds, PlainTextDocument)
# size of corpus
# summary(corpusFeeds)
# convert to Stemmed
corpusFeedsStemmed <- tm_map(corpusFeeds, stemDocument)
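
To see what stemming does, SnowballC's wordStem can be applied to a few example words (chosen for illustration, not taken from the corpus):

# Porter stemming collapses inflected forms onto a common stem,
# e.g. "running" and "runs" both map to "run"
wordStem(c("running", "runs", "runner", "easily", "connection"))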

Word clouds are used to visualize the frequency of words in the corpus. The figures below show word clouds of the corpus with and without stemming.

corpus_to_matrix <- function(corpus_){
    # Build a term-document matrix and return a data frame of word frequencies
    # sorted in decreasing order
    dtmCorpus <- TermDocumentMatrix(corpus_)
    corpusMatrix <- as.matrix(dtmCorpus)
    sortedMatrix <- sort(rowSums(corpusMatrix), decreasing = TRUE)
    corpus_df <- data.frame(word = names(sortedMatrix), freq = sortedMatrix)
    return(corpus_df)
}


corpus_df <- corpus_to_matrix(corpusFeeds)
corpus_stemmed_df <- corpus_to_matrix(corpusFeedsStemmed)
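
As a quick illustrative check, comparing the number of distinct terms in the two frequency tables shows how much stemming shrinks the vocabulary:

# Vocabulary size with and without stemming (values depend on the sampled lines)
c(unstemmed = nrow(corpus_df), stemmed = nrow(corpus_stemmed_df))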

Word clouds of the corpus without stemming.

corpus_to_wordcloud <- function(corpus_df_){
    wordcloud(words = corpus_df_$word,
          freq = corpus_df_$freq,
          min.freq = 1,
          max.words = 50,
          random.order = FALSE,
          rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
}
corpus_to_wordcloud(corpus_df)

Word clouds of the corpus with stemming.

corpus_to_wordcloud(corpus_stemmed_df)

N-Gram

N-grams (uni-, bi- and tri-grams) have been generated, and bar charts of the 20 most frequent n-grams are plotted below.

corpus_to_gram <- function(corpus_, number_gram_){
    # Tokenize the corpus into n-grams of the requested size and return a
    # frequency table sorted in decreasing order
    df_for_Grams <- data.frame(text = sapply(corpus_, as.character), 
                               stringsAsFactors = FALSE)
    GramToken <- NGramTokenizer(df_for_Grams,
                                Weka_control(min=number_gram_,
                                             max=number_gram_))
    Grams <- data.frame(table(GramToken))
    Grams <- Grams[order(Grams$Freq, decreasing = TRUE),]
    colnames(Grams) <- c("Word", "Frequency")
    return(Grams)
}

uniGrams <- corpus_to_gram(corpusFeeds, 1)
knitr::kable(uniGrams[1:20,])
         Word      Frequency
9654     s              1424
12402    will           1134
7773     one            1030
6536     like            974
1608     can             972
6128     just            897
11444    time            804
4741     get             597
11977    us              562
6244     know            549
7640     now             507
2837     day             501
7520     new             496
4836     good            472
7344     much            460
9085     really          448
8185     people          444
347      also            415
6770     make            399
4288     first           398

biGrams <- corpus_to_gram(corpusFeeds, 2)
knitr::kable(biGrams[1:20,])
         Word           Frequency
24143    let us                81
20681    hyun suk              60
6176     can see               51
30598    one day               51
26285    make sure             49
21915    jae ha                48
29472    new york              48
19322    haven t               47
14862    feel like             43
6890     chang min             42
23521    last week             42
19794    high school           40
46483    u s                   40
13627    every day             39
37372    s day                 38
50489    year old              38
36729    right now             37
6099     can get               34
15535    first time            33
23502    last night            33

triGrams <- corpus_to_gram(corpusFeeds, 3)
knitr::kable(triGrams[1:20,])
         Word                    Frequency
10672    cricket world cup              20
23104    hyun suk asks                  18
49256    team leader han                18
28944    love spending time             15
41569    rock paper scissors            15
56459    world cup dvd                  15
56973    year old daughter              15
4351     believers insist can           14
5403     books get chores               14
8166     children s books               14
15188    every single day               14
19071    get chores done                14
22055    hide books get                 14
31841    mother s day                   14
33149    new york city                  14
52216    two edged sword                14
24674    joe o reilly                   13
26368    late last night                13
41293    right around corner            13
6395     cab will run                   12

plot_gram <- function(xgram, title_){
    plot_ <- ggplot(xgram, aes(reorder(Word, Frequency), y = Frequency)) +
        geom_bar(stat = "identity", fill="lightskyblue4") +
        geom_text(aes(label = Frequency), hjust = +1.3, color="white") +
        coord_flip() +
        theme_minimal() +
        theme(panel.grid.major.x = element_blank(),
              panel.grid.minor.x = element_blank(),
              panel.grid.major.y = element_blank(),
              axis.ticks.x = element_blank(),
              axis.text.x = element_blank()) +
        labs(x="", title=paste("20 most frequent", paste0(title_, "s")))

    return(plot_)
}
uniGrams_plot <- plot_gram(uniGrams[1:20, ], "1-Gram")
uniGrams_plot

biGrams_plot <- plot_gram(biGrams[1:20, ], "2-Gram")
biGrams_plot

triGrams_plot <- plot_gram(triGrams[1:20, ], "3-Gram")
triGrams_plot
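
Looking ahead to the prediction task described in the Summary, the bigram table can already be used to suggest likely continuations of a given word, in the spirit of SwiftKey's three suggestions. The sketch below is only an illustration that reuses the biGrams table from this report; the actual prediction model will be developed in the next stage of the project.

# Minimal sketch: given a single word, return the n_ most frequent words that
# follow it according to the bigram frequency table (biGrams is already sorted
# by decreasing frequency).
suggest_next <- function(word_, bigrams_ = biGrams, n_ = 3){
    words_ <- as.character(bigrams_$Word)
    candidates <- words_[startsWith(words_, paste0(tolower(word_), " "))]
    # keep only the second token of each matching bigram
    head(sub("^\\S+\\s+", "", candidates), n_)
}

# e.g. suggest_next("last") might return continuations such as "week" and "night"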