Around the world, people spend more and more time on their mobile devices for email, social networking, banking, and a host of other activities. However, typing on mobile devices can be slow and tedious. SwiftKey builds technology that makes it easier for people to communicate and work on their mobile devices. The company's best-known product is the SwiftKey smart keyboard app, which learns from users as they type and helps them type faster on their phones. One of its most important features is proposing three alternatives to continue the text being typed.
In this final project we apply natural language processing (NLP), text mining, and R tools for exploratory data analysis and subsequent text modeling and prediction.
The datasets for this project can be downloaded from the Coursera-SwiftKey Datasets page (https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip). The data originally comes from HC Corpora.
In this report, we focus on the files that contain English data: en_US.blogs.txt, en_US.news.txt, and en_US.twitter.txt.
library(tm)        # text mining framework
library(qdap)      # quantitative discourse analysis (contractions, abbreviations)
library(tidyverse)
library(stringi)   # character string processing
library(RWeka)     # NGramTokenizer for n-gram extraction
library(ggplot2)
library(wordcloud) # word cloud plots
library(SnowballC) # implements Porter's word stemming algorithm
set.seed(2701)
# Directory where the data files are stored.
datafolder <- "data"
# url with the text data files to be analyzed
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
fname <- "Coursera-SwiftKey.zip"
fpath <- paste(datafolder, fname, sep="/")
### The zip is downloaded if it is not already in the 'data' directory.
if (!file.exists(fpath)){
    download.file(url, destfile=fpath, method="curl")
}
### Unzip only if the files have not been extracted yet.
if (!dir.exists(file.path(datafolder, "final"))){
    unzip(zipfile=fpath, exdir=datafolder)
}
# We go through the 'data' directory and list the paths to the
# English files we are interested in ("en_US.*.txt").
file_l <- list.files(path=datafolder, recursive=TRUE, pattern="en_US.*\\.txt$")
# Collect per-file statistics with lapply.
file_info <- lapply(paste(datafolder, file_l, sep="/"),
                    function(file_) {
                        # Open a read connection to the file
                        fileConnection <- file(file_, open='r')
                        # Read all lines from the file
                        linesInFile <- readLines(fileConnection, skipNul=TRUE)
                        # Size on disk (MB)
                        file_Size_origin <- round(file.info(file_)$size/1024^2, 1)
                        # Object size in R (MB)
                        fileSize <- format(object.size(linesInFile), units='Mb')
                        # Number of lines
                        fileNoOfLines <- length(linesInFile)
                        # Number of words
                        fileWords <- sum(stri_count_words(linesInFile))
                        # Length (in characters) of the longest line
                        maxchars <- max(nchar(linesInFile))
                        # Dataset name ("blogs", "news", "twitter") from the file path
                        name <- stri_extract_last_regex(file_, '[a-z]+(?=\\.txt$)')
                        close(fileConnection)
                        return(c(name, file_Size_origin, fileSize,
                                 fileNoOfLines, fileWords, maxchars))
                    })
# Flatten the per-file stats with unlist() and reshape them into
# a matrix with one row per file.
file_info_df <- data.frame(matrix(unlist(file_info),
                                  nrow=length(file_info),
                                  byrow=TRUE))
colnames(file_info_df) <- c("file", "size_original_(MB)", "size_in_R_(MB)",
"lines", "words", "longest_line" )
# Keep the line counts of the three text files for later sampling.
blogs_lines <- file_info_df$lines[1]
news_lines <- file_info_df$lines[2]
twitter_lines <- file_info_df$lines[3]
knitr::kable(file_info_df)
| file | size_original_(MB) | size_in_R_(MB) | lines | words | longest_line |
|---|---|---|---|---|---|
| blogs | 200.4 | 255.4 Mb | 899288 | 37546250 | 483415 |
| news | 196.3 | 257.3 Mb | 1010242 | 34762395 | 123628 |
| twitter | 159.4 | 319 Mb | 2360148 | 30093413 | 26 |
We create the remove_internet_chars function to remove URLs, hashtags, @-mentions, and email addresses, and we download a list of profanity words in order to filter them out of the texts.
remove_internet_chars <- function(x){
    x <- gsub("[^ ]+@[^ ]+", " ", x)   # email addresses
    x <- gsub(" @[^ ]+", " ", x)       # @-mentions
    x <- gsub("#[^ ]+", " ", x)        # hashtags
    x <- gsub("[^ ]+://[^ ]+", " ", x) # URLs
    return(x)
}
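As a quick sanity check, the function can be applied to an invented sample line; email addresses, @-mentions, hashtags and URLs all collapse to spaces:
# Invented sample line, used only to illustrate the cleaning patterns
remove_internet_chars("Mail me at jo@example.com, see #rstats or @user, docs at https://example.com")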
### Download a list of offensive words to filter out of the texts.
badWordsFileURL <- 'http://www.bannedwordlist.com/lists/swearWords.txt'
badWordsFileName <- 'swearWords.txt'
fpath_badwords <- paste(datafolder, badWordsFileName, sep="/")
### Files are downloaded if they are not in the 'data' directory.
if (!file.exists(fpath_badwords)){
download.file(badWordsFileURL, destfile=fpath_badwords, method="curl")
}
badwords <- readLines(fpath_badwords)
## Warning in readLines(fpath_badwords): incomplete final line found on 'data/
## swearWords.txt'
# removeWords() expects a plain character vector, not a Source object.
profanity <- badwords
# Read the first 0.1% of each file's lines as a sample (for speed).
import_file <- function(file_, lines_){
    fileConnection <- file(paste0('./data/final/en_US/', file_))
    linesInFile <- readLines(fileConnection, n=round(as.numeric(lines_)/1000),
                             skipNul=TRUE)
    close(fileConnection)
    return(linesInFile)
}
blogs <- import_file('en_US.blogs.txt', blogs_lines)
news <- import_file('en_US.news.txt', news_lines)
twitters <- import_file('en_US.twitter.txt', twitter_lines)
# combine the three samples into a single character vector
text_all <- c(blogs, news, twitters)
corpusFeeds <- VCorpus(VectorSource(text_all))
# normalize typographic apostrophes and backticks to a straight quote
corpusFeeds <- tm_map(corpusFeeds, content_transformer(gsub),
                      pattern="’|‘|`", replacement="'")
# expand contractions (e.g. "isn't" -> "is not")
corpusFeeds <- tm_map(corpusFeeds,
content_transformer(replace_contraction))
# expand common abbreviations
corpusFeeds <- tm_map(corpusFeeds,
content_transformer(replace_abbreviation))
# convert to lower case
corpusFeeds <- tm_map(corpusFeeds, content_transformer(tolower))
# remove URLs
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
corpusFeeds <- tm_map(corpusFeeds, content_transformer(removeURL))
corpusFeeds <- tm_map(corpusFeeds,
content_transformer(remove_internet_chars))
# replace non-word characters (this also strips emoji and other symbols)
corpusFeeds <- tm_map(corpusFeeds, content_transformer(gsub),
                      pattern="\\W", replacement=" ")
# remove anything other than English letters or space
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
corpusFeeds <- tm_map(corpusFeeds, content_transformer(removeNumPunct))
# remove stop words
corpusFeeds <- tm_map(corpusFeeds, removeWords, stopwords("english"))
# remove extra whitespace
corpusFeeds <- tm_map(corpusFeeds, stripWhitespace)
# Remove numbers
corpusFeeds <- tm_map(corpusFeeds, removeNumbers)
# Remove punctuations
corpusFeeds <- tm_map(corpusFeeds, removePunctuation)
# Remove bad words
corpusFeeds <- tm_map(corpusFeeds, removeWords, profanity)
# convert to plain text
corpusFeeds <- tm_map(corpusFeeds, PlainTextDocument)
# create a stemmed copy of the corpus (Porter stemmer)
corpusFeedsStemmed <- tm_map(corpusFeeds, stemDocument)
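To see what stemming does, SnowballC's wordStem() can be applied directly to a few sample tokens (the words below are chosen purely for illustration):
# Illustration only: "running" and "runs" both stem to "run"
wordStem(c("running", "runs", "runner"), language = "english")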
Word clouds are used to visualize the most frequent words in the corpus. The figures below show the word clouds of the corpus with and without stemming.
corpus_to_matrix <- function(corpus_){
    # build a term-document matrix and convert it to a plain matrix
    dtmCorpus <- TermDocumentMatrix(corpus_)
    corpusMatrix <- as.matrix(dtmCorpus)
    # total frequency of each term, in decreasing order
    sortedMatrix <- sort(rowSums(corpusMatrix), decreasing = TRUE)
    corpus_df <- data.frame(word = names(sortedMatrix), freq = sortedMatrix)
    return(corpus_df)
}
corpus_df <- corpus_to_matrix(corpusFeeds)
corpus_stemmed_df <- corpus_to_matrix(corpusFeedsStemmed)
corpus_to_wordcloud <- function(corpus_df_){
wordcloud(words = corpus_df_$word,
freq = corpus_df_$freq,
min.freq = 1,
max.words = 50,
random.order = FALSE,
rot.per = 0.35,
colors = brewer.pal(8, "Dark2"))
}
corpus_to_wordcloud(corpus_df)
corpus_to_wordcloud(corpus_stemmed_df)
N-grams (uni-, bi-, and tri-grams) are generated, and the 20 most frequent of each are listed and plotted as bar charts.
corpus_to_gram <- function(corpus_, number_gram_){
    # flatten the corpus into a character vector
    df_for_Grams <- data.frame(text = sapply(corpus_, as.character),
                               stringsAsFactors = FALSE)
    # tokenize into n-grams of the requested order
    GramToken <- NGramTokenizer(df_for_Grams$text,
                                Weka_control(min=number_gram_,
                                             max=number_gram_))
    # count the n-grams and sort by frequency
    Grams <- data.frame(table(GramToken))
    Grams <- Grams[order(Grams$Freq, decreasing = TRUE),]
    colnames(Grams) <- c("Word", "Frequency")
    return(Grams)
}
uniGrams <- corpus_to_gram(corpusFeeds, 1)
knitr::kable(uniGrams[1:20,])
|   | Word | Frequency |
|---|---|---|
| 9654 | s | 1424 |
| 12402 | will | 1134 |
| 7773 | one | 1030 |
| 6536 | like | 974 |
| 1608 | can | 972 |
| 6128 | just | 897 |
| 11444 | time | 804 |
| 4741 | get | 597 |
| 11977 | us | 562 |
| 6244 | know | 549 |
| 7640 | now | 507 |
| 2837 | day | 501 |
| 7520 | new | 496 |
| 4836 | good | 472 |
| 7344 | much | 460 |
| 9085 | really | 448 |
| 8185 | people | 444 |
| 347 | also | 415 |
| 6770 | make | 399 |
| 4288 | first | 398 |
biGrams <- corpus_to_gram(corpusFeeds, 2)
knitr::kable(biGrams[1:20,])
|   | Word | Frequency |
|---|---|---|
| 24143 | let us | 81 |
| 20681 | hyun suk | 60 |
| 6176 | can see | 51 |
| 30598 | one day | 51 |
| 26285 | make sure | 49 |
| 21915 | jae ha | 48 |
| 29472 | new york | 48 |
| 19322 | haven t | 47 |
| 14862 | feel like | 43 |
| 6890 | chang min | 42 |
| 23521 | last week | 42 |
| 19794 | high school | 40 |
| 46483 | u s | 40 |
| 13627 | every day | 39 |
| 37372 | s day | 38 |
| 50489 | year old | 38 |
| 36729 | right now | 37 |
| 6099 | can get | 34 |
| 15535 | first time | 33 |
| 23502 | last night | 33 |
triGrams <- corpus_to_gram(corpusFeeds, 3)
knitr::kable(triGrams[1:20,])
|   | Word | Frequency |
|---|---|---|
| 10672 | cricket world cup | 20 |
| 23104 | hyun suk asks | 18 |
| 49256 | team leader han | 18 |
| 28944 | love spending time | 15 |
| 41569 | rock paper scissors | 15 |
| 56459 | world cup dvd | 15 |
| 56973 | year old daughter | 15 |
| 4351 | believers insist can | 14 |
| 5403 | books get chores | 14 |
| 8166 | children s books | 14 |
| 15188 | every single day | 14 |
| 19071 | get chores done | 14 |
| 22055 | hide books get | 14 |
| 31841 | mother s day | 14 |
| 33149 | new york city | 14 |
| 52216 | two edged sword | 14 |
| 24674 | joe o reilly | 13 |
| 26368 | late last night | 13 |
| 41293 | right around corner | 13 |
| 6395 | cab will run | 12 |
plot_gram <- function(xgram, title_){
    plot_ <- ggplot(xgram, aes(x = reorder(Word, Frequency), y = Frequency)) +
        geom_col(fill="lightskyblue4") +
        geom_text(aes(label = Frequency), hjust = 1.3, color="white") +
        coord_flip() +
        theme_minimal() +
        theme(panel.grid.major.x = element_blank(),
              panel.grid.minor.x = element_blank(),
              panel.grid.major.y = element_blank(),
              axis.ticks.x = element_blank(),
              axis.text.x = element_blank()) +
        labs(x="", title=paste0("20 most frequent ", title_, "s"))
    return(plot_)
}
uniGrams_plot <- plot_gram(uniGrams[1:20, ], "1-Gram")
uniGrams_plot
biGrams_plot <- plot_gram(biGrams[1:20, ], "2-Gram")
biGrams_plot
triGrams_plot <- plot_gram(triGrams[1:20, ], "3-Gram")
triGrams_plot
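Looking ahead to the prediction stage, the bigram table already supports a naive next-word lookup in the spirit of SwiftKey's three suggestions: filter the bigrams whose first token matches the last typed word and return the three most frequent continuations. The helper below, predict_next_word(), is a hypothetical sketch rather than part of the analysis above; it only assumes that biGrams holds space-separated bigrams sorted by frequency.
# Hypothetical sketch of a bigram-based next-word suggester
predict_next_word <- function(last_word, bigrams_df, n = 3) {
    # split each bigram "w1 w2" into its two tokens
    parts <- strsplit(as.character(bigrams_df$Word), " ")
    first_token  <- sapply(parts, `[`, 1)
    second_token <- sapply(parts, `[`, 2)
    # keep continuations of last_word; rows are already sorted by frequency
    head(second_token[first_token == tolower(last_word)], n)
}
# e.g. predict_next_word("right", biGrams) should include "now"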