This project uses natural language processing and text mining to build a model that predicts the next word a person intends to type, for example on their smartphone. For this capstone project, Johns Hopkins University and Coursera partnered with SwiftKey, a company that built a virtual keyboard app which learns from previously typed text and offers predictions based on the current input and what it has learned: https://en.wikipedia.org/wiki/Microsoft_SwiftKey.
To build this predictive model, we will use a large corpus of text documents, provided by SwiftKey and derived from blogs, news, and Twitter (tweets). Although the text documents are also provided in German, Finnish, and Russian, we will use the English collection to complete this project. Once the predictive text model is built, it will be turned into a Shiny app that accepts a phrase as input and predicts the next word. For example, the app might receive the input phrase “better late than” and output the word “never”.
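As a rough preview of the end product, here is a minimal sketch of the Shiny interface only; the prediction shown is a hard-coded placeholder until the model is built later in the project.
#Minimal sketch of the planned Shiny interface (placeholder prediction only)
library(shiny)
ui <- fluidPage(
  textInput("phrase", "Type a phrase:", value = "better late than"),
  textOutput("prediction")
)
server <- function(input, output) {
  output$prediction <- renderText({
    "never"  #placeholder; the real next-word prediction will replace this
  })
}
#shinyApp(ui, server)  #uncomment to launch the app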
library(stringi)
library(stringr)
library(tm)
library(LaF)
library(dplyr)
library(tidytext)
library(ggplot2)
library(gridExtra)
library(scales)
library(data.table)
library(kableExtra)
library(wordcloud)
#set working directory
setwd("C:/Users/yxj4/OneDrive - CDC/+My_Documents/CDC/1 DDT/Data Modernization/Coursera/Hopkins Data Science Specialization/10 Data Science Capstone/0 Dataset/Coursera-SwiftKey/final/en_US")
#The following code reads the first three lines of the English blogs data set:
con <- file("en_US.blogs.txt", "r")
## Read in the first 3 lines of text
readLines(con, 3)
## [1] "In the years thereafter, most of the Oil fields and platforms were named after pagan “gods”."
## [2] "We love you Mr. Brown."
## [3] "Chad has been awesome with the kids and holding down the fort while I work later than usual! The kids have been busy together playing Skylander on the XBox together, after Kyan cashed in his $$$ from his piggy bank. He wanted that game so bad and used his gift card from his birthday he has been saving and the money to get it (he never taps into that thing either, that is how we know he wanted it so bad). We made him count all of his money to make sure that he had enough! It was very cute to watch his reaction when he realized he did! He also does a very good job of letting Lola feel like she is playing too, by letting her switch out the characters! She loves it almost as much as him."
##Close the connection when you are done
close(con)
#set working directory
setwd("C:/Users/yxj4/OneDrive - CDC/+My_Documents/CDC/1 DDT/Data Modernization/Coursera/Hopkins Data Science Specialization/10 Data Science Capstone/0 Dataset/Coursera-SwiftKey/final/en_US")
#The following code reads the first three lines of the English news data set:
con <- file("en_US.news.txt", "r")
## Read in the first 3 lines of text
readLines(con, 3)
## [1] "He wasn't home alone, apparently."
## [2] "The St. Louis plant had to close. It would die of old age. Workers had been making cars there since the onset of mass automotive production in the 1920s."
## [3] "WSU's plans quickly became a hot topic on local online sites. Though most people applauded plans for the new biomedical center, many deplored the potential loss of the building."
##Close the connection when you are done
close(con)
#set working directory
setwd("C:/Users/yxj4/OneDrive - CDC/+My_Documents/CDC/1 DDT/Data Modernization/Coursera/Hopkins Data Science Specialization/10 Data Science Capstone/0 Dataset/Coursera-SwiftKey/final/en_US")
#The following code reads the first three lines of the English Twitter data set:
con <- file("en_US.twitter.txt", "r")
## Read in the first 3 lines of text
readLines(con, 3)
## [1] "How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long."
## [2] "When you meet someone special... you'll know. Your heart will beat more rapidly and you'll smile for no reason."
## [3] "they've decided its more fun if I don't."
##Close the connection when you are done
close(con)
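The same connection pattern is repeated for each of the three files above. A small helper function (shown only as a possible refactor, not used in this report) would reduce the repetition and guarantee the connection is closed:
#Possible refactor (not used above): read the first n lines of a file and
#always close the connection, even if readLines() fails
peek_lines <- function(path, n = 3) {
  con <- file(path, "r")
  on.exit(close(con))
  readLines(con, n)
}
#peek_lines("en_US.blogs.txt")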
#set working directory
setwd("C:/Users/yxj4/OneDrive - CDC/+My_Documents/CDC/1 DDT/Data Modernization/Coursera/Hopkins Data Science Specialization/10 Data Science Capstone/0 Dataset/Coursera-SwiftKey/final/en_US")
#set seed for reproducibility
set.seed(42)
#List the English data files
en_files <- list.files()
#Create separate objects for the blogs, news, and twitter data
blogs <- readLines(en_files[1], encoding = "UTF-8", skipNul = TRUE)
news <- readLines(en_files[2], encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines(en_files[3], encoding = "UTF-8", skipNul = TRUE)
#Produce descriptive statistics for each dataset: blogs, news, twitter
blogsStats <- stri_stats_general(blogs)
blogsStats
## Lines LinesNEmpty Chars CharsNWhite
## 899288 899288 206824382 170389539
newsStats <- stri_stats_general(news)
newsStats
## Lines LinesNEmpty Chars CharsNWhite
## 77259 77259 15639408 13072698
twitterStats <- stri_stats_general(twitter)
twitterStats
## Lines LinesNEmpty Chars CharsNWhite
## 2360148 2360148 162096241 134082806
#Describe the average number of words per line [BLOGS]
blogsWords <- stringi::stri_count_words(blogs)
mean(blogsWords)
## [1] 41.75109
#Describe the average number of words per line [NEWS]
newsWords <- stringi::stri_count_words(news)
mean(newsWords)
## [1] 34.61779
#Describe the average number of words per line [TWITTER]
twitterWords <- stringi::stri_count_words(twitter)
mean(twitterWords)
## [1] 12.75065
#Further description of the data
blogsChars <- stringi::stri_count_boundaries(blogs, type = "character")
newsChars <- stringi::stri_count_boundaries(news, type = "character")
twitterChars <- stringi::stri_count_boundaries(twitter, type = "character")
fileOverview <- data.frame(file = c("blogs", "news", "twitter"),
                           totalLines = c(blogsStats[1], newsStats[1], twitterStats[1]),
                           totalWords = c(sum(blogsWords), sum(newsWords), sum(twitterWords)),
                           totalChars = c(blogsStats[3], newsStats[3], twitterStats[3]),
                           averageWords = c(mean(blogsWords), mean(newsWords), mean(twitterWords)),
                           minWords = c(min(blogsWords), min(newsWords), min(twitterWords)),
                           maxWords = c(max(blogsWords), max(newsWords), max(twitterWords)),
                           averageChars = c(mean(blogsChars), mean(newsChars), mean(twitterChars)),
                           minChars = c(min(blogsChars), min(newsChars), min(twitterChars)),
                           maxChars = c(max(blogsChars), max(newsChars), max(twitterChars)))
fileOverview
## file totalLines totalWords totalChars averageWords minWords maxWords
## 1 blogs 899288 37546250 206824382 41.75109 0 6726
## 2 news 77259 2674536 15639408 34.61779 1 1123
## 3 twitter 2360148 30093413 162096241 12.75065 1 47
## averageChars minChars maxChars
## 1 229.98666 1 40833
## 2 202.42830 2 5760
## 3 68.68043 2 140
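For additional context, the size of each file on disk can be reported as well; a minimal sketch, assuming the working directory still points at the en_US folder so that en_files holds the three file names:
#File sizes on disk, in megabytes
round(file.size(en_files) / 1024^2, 1)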
In this step, I remove profane words and other words that I do not want the model to predict. The list of profane words comes from Google’s banned words list, and the words are removed with the removeWords function from the tm package.
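As a quick, toy illustration of how removeWords behaves (the example sentence and placeholder word are made up and not part of the cleaning pipeline):
#Toy example: removeWords() deletes whole-word matches in place,
#which can leave extra spaces behind
removeWords("this sentence contains a badword in the middle", "badword")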
#######################################################
##Removing profanity in blogs, news, and twitter files
#######################################################
#Set working directory
setwd("/Users/elizabethlundeen/Desktop/Mac/Data")
#Set seed for reproducibility
set.seed(42)
#List the English data files
en_files <- list.files()
#Create separate objects for the blogs, news, and twitter data
blogs <- readLines(en_files[1], encoding = "UTF-8", skipNul = TRUE)
news <- readLines(en_files[2], encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines(en_files[3], encoding = "UTF-8", skipNul = TRUE)
#Load the file of profane words
setwd("/Users/elizabethlundeen/Desktop/Mac")
profanity <- read.table("bad_words_list.txt")
#Use the removeWords function from the tm library to remove all profane words
clean_blogs <- removeWords(str_to_lower(blogs), profanity[,1])
clean_news <- removeWords(str_to_lower(news), profanity[,1])
clean_twitter <- removeWords(str_to_lower(twitter), profanity[,1])
#Save the cleaned text data
write.table(clean_blogs, "/Users/elizabethlundeen/Desktop/Mac/clean_blogs.txt")
write.table(clean_news, "/Users/elizabethlundeen/Desktop/Mac/clean_news.txt")
write.table(clean_twitter, "/Users/elizabethlundeen/Desktop/Mac/clean_twitter.txt")
To build models, it is not necessary to load and use all of the data; a relatively small number of randomly selected lines is often enough to approximate the results that the full data set would give. For this milestone report, I analyze a random 10% sample of lines from each file.
#Set working directory
setwd("C:/Users/yxj4/OneDrive - CDC/+My_Documents/CDC/1 DDT/Data Modernization/Coursera/Hopkins Data Science Specialization/10 Data Science Capstone/Mac/Clean")
#Set seed for reproducibility
set.seed(42)
#List the cleaned English data files
en_files <- list.files()
#Create separate objects for the cleaned blogs, news, and twitter data
blogs_clean <- readLines(en_files[1], encoding = "UTF-8", skipNul = TRUE)
news_clean <- readLines(en_files[2], encoding = "UTF-8", skipNul = TRUE)
twitter_clean <- readLines(en_files[3], encoding = "UTF-8", skipNul = TRUE)
#Sample a random 10% of lines from each file
mini_blogs <- sample(blogs_clean, length(blogs_clean)*.10)
mini_news <- sample(news_clean, length(news_clean)*.10)
mini_twitter <- sample(twitter_clean, length(twitter_clean)*.10)
#Create data frames
clean_blogs_df <- data.frame(line = 1:length(mini_blogs),
                             text = mini_blogs, stringsAsFactors = FALSE)
clean_news_df <- data.frame(line = 1:length(mini_news),
                            text = mini_news, stringsAsFactors = FALSE)
clean_twitter_df <- data.frame(line = 1:length(mini_twitter),
                               text = mini_twitter, stringsAsFactors = FALSE)
#Save the cleaned and sampled datasets
data.table::fwrite(clean_blogs_df, "clean_blogs_df.csv")
data.table::fwrite(clean_news_df, "clean_news_df.csv")
data.table::fwrite(clean_twitter_df, "clean_twitter_df.csv")
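The LaF package is loaded at the top of this report but not used above; as an alternative to reading the full cleaned files into memory, lines can be sampled straight from disk. A minimal sketch, assuming the cleaned files are plain text with one document per line:
#Alternative sampling with LaF: draw roughly 10% of lines directly from disk
n_blog_lines <- LaF::determine_nlines("clean_blogs.txt")
mini_blogs_alt <- LaF::sample_lines("clean_blogs.txt",
                                    n = round(0.10 * n_blog_lines),
                                    nlines = n_blog_lines)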
#Set Working Directory
setwd("C:/Users/yxj4/OneDrive - CDC/+My_Documents/CDC/1 DDT/Data Modernization/Coursera/Hopkins Data Science Specialization/10 Data Science Capstone/Mac/Clean_Sampled")
#set seed for reproducibility
set.seed(42)
#Note: the 'GettingAndCleaning.R' script must be run first
#Load in the sampled, profanity-free data frames
clean_blogs_df <- fread("clean_blogs_df.csv")
clean_news_df <- fread("clean_news_df.csv")
clean_twitter_df <- fread("clean_twitter_df.csv")
tokenized_blogs <- clean_blogs_df %>%
unnest_tokens(output = word, input = text) %>%
anti_join(get_stopwords())
## Joining, by = "word"
tokenized_news <- clean_news_df %>%
unnest_tokens(output = word, input = text) %>%
anti_join(get_stopwords())
## Joining, by = "word"
tokenized_twitter <- clean_twitter_df %>%
unnest_tokens(output = word, input = text) %>%
anti_join(get_stopwords())
## Joining, by = "word"
This section answers the question: what is the distribution of word frequencies? In other words, which words are used far more often than others?
#Examine the top 10 most commonly used words [BLOGS]
CommonWords_blogs <- tokenized_blogs %>%
count(word, sort = TRUE) %>%
mutate(file = "Blogs") %>%
top_n(n = 10, wt = n)
CommonWords_blogs
## word n file
## 1: one 12915 Blogs
## 2: like 10169 Blogs
## 3: just 10029 Blogs
## 4: can 9893 Blogs
## 5: time 9132 Blogs
## 6: get 7074 Blogs
## 7: people 6145 Blogs
## 8: now 6063 Blogs
## 9: know 6010 Blogs
## 10: new 5535 Blogs
#Examine the top 10 most commonly used words [NEWS]
CommonWords_news <- tokenized_news %>%
count(word, sort = TRUE) %>%
mutate(file = "News") %>%
top_n(n = 10, wt = n)
CommonWords_news
## word n file
## 1: said 1891 News
## 2: one 731 News
## 3: year 587 News
## 4: new 549 News
## 5: two 512 News
## 6: can 488 News
## 7: state 470 News
## 8: time 468 News
## 9: also 466 News
## 10: first 437 News
#Examine the top 10 most commonly used words [TWITTER]
CommonWords_twitter <- tokenized_twitter %>%
count(word, sort = TRUE) %>%
mutate(file = "Twitter") %>%
top_n(n = 10, wt = n)
CommonWords_twitter
## word n file
## 1: just 14934 Twitter
## 2: like 12253 Twitter
## 3: get 11191 Twitter
## 4: love 10796 Twitter
## 5: good 10037 Twitter
## 6: thanks 9136 Twitter
## 7: rt 9052 Twitter
## 8: day 9024 Twitter
## 9: can 8972 Twitter
## 10: one 8443 Twitter
CommonWords_all <- rbind.data.frame(CommonWords_blogs, CommonWords_news, CommonWords_twitter)
ggplot(CommonWords_all, aes(x = reorder(word,desc(n)), y = n, fill = n)) +
geom_bar(stat = "identity", alpha = .95) +
scale_fill_gradient(low = "gray75", high = "darkslategray4") +
labs(y = "", x = "", title = "The 10 Most Commonly Used Words in Each File Type") +
facet_grid(.~file, scales = "free_x") +
coord_flip() +
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
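The top-10 counts above show only the head of the distribution. The sketch below (using the tokenized blogs sample, so after stop-word removal) estimates how many unique words are needed to cover 50% and 90% of all word instances:
#Cumulative coverage: how many unique words account for 50% and 90% of instances?
blogs_coverage <- tokenized_blogs %>%
  count(word, sort = TRUE) %>%
  mutate(coverage = cumsum(n) / sum(n))
sum(blogs_coverage$coverage <= 0.5)  #unique words for ~50% coverage
sum(blogs_coverage$coverage <= 0.9)  #unique words for ~90% coverage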
#set working directory
setwd("C:/Users/yxj4/OneDrive - CDC/+My_Documents/CDC/1 DDT/Data Modernization/Coursera/Hopkins Data Science Specialization/10 Data Science Capstone/0 Dataset/Coursera-SwiftKey/final/en_US")
#set seed for reproducibility
set.seed(42)
#List the English data files
en_files <- list.files()
#Create separate objects for the blogs, news, and twitter data
blogs <- readLines(en_files[1], encoding = "UTF-8", skipNul = TRUE)
news <- readLines(en_files[2], encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines(en_files[3], encoding = "UTF-8", skipNul = TRUE)
en_US_Blogs_plain <- Corpus(VectorSource(blogs))
# Convert the text to lower case
en_US_Blogs_plain <- tm_map(en_US_Blogs_plain, content_transformer(tolower))
# Remove numbers
en_US_Blogs_plain <- tm_map(en_US_Blogs_plain, removeNumbers)
# Remove punctuation
en_US_Blogs_plain <- tm_map(en_US_Blogs_plain, removePunctuation, ucp= TRUE)
# Eliminate extra white space
en_US_Blogs_plain <- tm_map(en_US_Blogs_plain, stripWhitespace)
# Remove common stop words
en_US_Blogs_plain <- tm_map(en_US_Blogs_plain, removeWords, stopwords("english"))
wordcloud(en_US_Blogs_plain, max.words = 80, random.order = F, colors = brewer.pal(name = "Dark2", n = 8))
en_US_news_plain <- Corpus(VectorSource(news))
# Convert the text to lower case
en_US_news_plain <- tm_map(en_US_news_plain, content_transformer(tolower))
# Remove numbers
en_US_news_plain <- tm_map(en_US_news_plain, removeNumbers)
# Remove punctuation
en_US_news_plain <- tm_map(en_US_news_plain, removePunctuation, ucp= TRUE)
# Eliminate extra white space
en_US_news_plain <- tm_map(en_US_news_plain, stripWhitespace)
# Remove common stop words
en_US_news_plain <- tm_map(en_US_news_plain, removeWords, stopwords("english"))
wordcloud(en_US_news_plain, max.words = 80, random.order = F, colors = brewer.pal(name = "Dark2", n = 8))
en_US_twitter_plain <- Corpus(VectorSource(twitter))
# Convert the text to lower case
en_US_twitter_plain <- tm_map(en_US_twitter_plain, content_transformer(tolower))
# Remove numbers
en_US_twitter_plain <- tm_map(en_US_twitter_plain, removeNumbers)
# Remove punctuation
en_US_twitter_plain <- tm_map(en_US_twitter_plain, removePunctuation, ucp= TRUE)
# Eliminate extra white space
en_US_twitter_plain <- tm_map(en_US_twitter_plain, stripWhitespace)
# Remove common stop words
en_US_twitter_plain <- tm_map(en_US_twitter_plain, removeWords, stopwords("english"))
wordcloud(en_US_twitter_plain, max.words = 80, random.order = F, colors = brewer.pal(name = "Dark2", n = 8))
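The same cleaning steps are applied three times above. A small helper (shown only as a possible refactor, not used in this report) would avoid the repetition:
#Possible refactor (not used above): apply the same tm cleaning steps to any text vector
clean_corpus <- function(x) {
  corp <- Corpus(VectorSource(x))
  corp <- tm_map(corp, content_transformer(tolower))
  corp <- tm_map(corp, removeNumbers)
  corp <- tm_map(corp, removePunctuation, ucp = TRUE)
  corp <- tm_map(corp, stripWhitespace)
  tm_map(corp, removeWords, stopwords("english"))
}
#wordcloud(clean_corpus(blogs), max.words = 80, random.order = F,
#          colors = brewer.pal(name = "Dark2", n = 8))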
This section answers the question: what are the frequencies of the most common 2-grams (bigrams) and 3-grams (trigrams) in the data set?
#n-gram analysis
#bigrams
bigram_Analysis <- function(text, filetype) {
text %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
tidyr::separate(bigram, c("word1", "word2"), sep = " ") %>%
na.omit() %>%
filter(!word1 %in% stop_words$word,
!word2 %in% stop_words$word) %>%
count(word1, word2, sort = TRUE) %>%
top_n(n = 10, wt = n) %>%
slice(row_number(1:10)) %>%
mutate(bigram = paste(word1, word2, sep = " ")) %>%
mutate(file = filetype)
}
blogs_bigram <- bigram_Analysis(clean_blogs_df, "Blogs")
news_bigram <- bigram_Analysis(clean_news_df, "News")
twitter_bigram <- bigram_Analysis(clean_twitter_df, "Twitter")
bigram_all <- as.data.frame(rbind.data.frame(blogs_bigram, news_bigram, twitter_bigram))
#trigrams
trigram_Analysis <- function(text, filetype) {
text %>%
unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
tidyr::separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
na.omit() %>%
filter(!word1 %in% stop_words$word,
!word2 %in% stop_words$word,
!word3 %in% stop_words$word) %>%
count(word1, word2, word3, sort = TRUE) %>%
top_n(n = 10, wt = n) %>%
slice(row_number(1:10)) %>%
mutate(trigram = paste(word1, word2, word3, sep = " ")) %>%
mutate(file = filetype)
}
blogs_trigram <- trigram_Analysis(clean_blogs_df, "Blogs")
news_trigram <- trigram_Analysis(clean_news_df, "News")
twitter_trigram <- trigram_Analysis(clean_twitter_df, "Twitter")
trigram_all <- as.data.frame(rbind.data.frame(blogs_trigram, news_trigram, twitter_trigram))
#visualize the bigrams & trigrams
ngram_plot <- function(data, num_gram) {
label <- as.character(str_to_title(num_gram))
ggplot(data, aes_string(x = num_gram)) +
aes(y = n, fill = as.factor(file)) +
geom_bar(stat = "identity") +
facet_grid(file~., scales = "free_y") +
coord_flip() +
labs(title = paste("Most Common", label), x = label, y = "Occurrences", fill = "File") +
theme(axis.text.y = element_text(size = 7))
}
top_bigrams <- ngram_plot(bigram_all, "bigram")
top_trigrams <- ngram_plot(trigram_all, "trigram")
##Top Bigrams
top_bigrams
##Top Trigrams
top_trigrams
Now that I have explored the data and produced basic descriptive statistics, I am ready to begin building the predictive model.
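As a rough sketch of one possible next step (not implemented in this report), the function below predicts the next word from the last two words typed, backing off from trigram counts to bigram counts. It assumes complete frequency tables with the same columns as the tables built above (word1, word2, word3, n), but constructed without the top_n() step:
#Rough sketch of a backoff lookup; trigrams and bigrams are assumed to be
#full frequency tables, not the top-10 tables shown in this report
predict_next_word <- function(w1, w2, trigrams, bigrams) {
  hit <- trigrams %>% filter(word1 == w1, word2 == w2) %>% arrange(desc(n))
  if (nrow(hit) > 0) return(hit$word3[1])
  hit <- bigrams %>% filter(word1 == w2) %>% arrange(desc(n))
  if (nrow(hit) > 0) return(hit$word2[1])
  "the"  #fallback: a very common unigram
}
The Shiny app would then lower-case and tokenize the user's input, take its last two words, and pass them to a function like this.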