The Capstone project for the Coursera Data Science Specialization involves using the three datasets provided by SwiftKey. The goal of this project is to build a Shiny application with text prediction capabilities. This report outlines the exploratory analysis of the datasets and the current plan for implementing the text prediction algorithm.
The data come from three sources: US news sites, blogs and Twitter. The corpus contains these three files for each of four languages (Russian, Finnish, German and English). This project focuses exclusively on the English-language datasets. The names of the data files are as follows:
en_US.blogs.txt
en_US.twitter.txt
en_US.news.txt
These sets will be imported into R, then cleaned and summarised during the creation of the web application.
Below are the packages used in this analysis.
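As a minimal sketch (the package-to-function mapping is inferred from the code in this report, not shown in the original chunk), the implied library() calls are:
library(readr)     # read_lines
library(tibble)    # as_tibble
library(dplyr)     # mutate, anti_join, sample_frac, bind_rows, ...
library(stringr)   # str_to_lower, str_trim, str_detect, str_replace_all
library(tidytext)  # unnest_tokens, stop_words, bind_tf_idf
library(tm)        # removeNumbers, removePunctuation
library(textclean) # replace_url, replace_white
library(ggplot2)   # tf-idf bar charts
library(wordcloud) # word clouds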
The code below imports the blog, news and Twitter datasets and strips unrecognised characters. It also imports the Facebook bad-words list (version 2018-07-29).
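Note that the chunks below reference data_folder and prof_folder without defining them; they are assumed to point to the local directories holding the corpus files and the profanity list. A hypothetical setup:
data_folder <- "data/final/en_US/" # hypothetical path to the SwiftKey corpus
prof_folder <- "data/"             # hypothetical path to the bad-words list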
# Import data from folder
source_file_1<-"en_US.blogs.txt"
source_file_2<-"en_US.news.txt"
source_file_3<-"en_US.twitter.txt"
dataset_raw_1<- read_lines(paste0(data_folder,source_file_1))
dataset_raw_2<- read_lines(paste0(data_folder,source_file_2))
dataset_raw_3<- read_lines(paste0(data_folder,source_file_3))
# Change data encoding
dataset_raw_1 <- iconv(dataset_raw_1, "latin1", "ASCII", sub="")
dataset_raw_2 <- iconv(dataset_raw_2, "latin1", "ASCII", sub="")
dataset_raw_3 <- iconv(dataset_raw_3, "latin1", "ASCII", sub="")
# Data as tibble
dataset_1<-as_tibble(dataset_raw_1)
dataset_2<-as_tibble(dataset_raw_2)
dataset_3<-as_tibble(dataset_raw_3)
# import profanity lexicon
prof_file<-"facebook-bad-words-list_comma-separated-text-file_2018_07_29.txt"
prof_raw<- as_tibble(t(read.delim(paste0(prof_folder,prof_file),skip=15,sep=",",header=F)))
colnames(prof_raw)<-"word"
prof_raw$word<-str_trim(prof_raw$word,side="both")
# import stopword lexicon
stopwords_smart<-subset(stop_words,lexicon=="SMART")
# Count the number of rows in each dataset
n1<-nrow(dataset_1)
n2<-nrow(dataset_2)
n3<-nrow(dataset_3)
# Display data
head(dataset_1)
## # A tibble: 6 x 1
## value
## <chr>
## 1 In the years thereafter, most of the Oil fields and platforms were named~
## 2 We love you Mr. Brown.
## 3 Chad has been awesome with the kids and holding down the fort while I wo~
## 4 so anyways, i am going to share some home decor inspiration that i have ~
## 5 With graduation season right around the corner, Nancy has whipped up a f~
## 6 If you have an alternative argument, let's hear it! :)
head(dataset_2)
## # A tibble: 6 x 1
## value
## <chr>
## 1 He wasn't home alone, apparently.
## 2 The St. Louis plant had to close. It would die of old age. Workers had b~
## 3 WSU's plans quickly became a hot topic on local online sites. Though mos~
## 4 The Alaimo Group of Mount Holly was up for a contract last fall to evalu~
## 5 And when it's often difficult to predict a law's impact, legislators sho~
## 6 There was a certain amount of scoffing going around a few years ago when~
head(dataset_3)
## # A tibble: 6 x 1
## value
## <chr>
## 1 How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Lov~
## 2 When you meet someone special... you'll know. Your heart will beat more ~
## 3 they've decided its more fun if I don't.
## 4 So Tired D; Played Lazer Tag & Ran A LOT D; Ughh Going To Sleep Like In ~
## 5 Words from a complete stranger! Made my birthday even better :)
## 6 First Cubs game ever! Wrigley field is gorgeous. This is perfect. Go Cub~
head(prof_raw)
## # A tibble: 6 x 1
## word
## <chr>
## 1 2 girls 1 cup
## 2 2g1c
## 3 4r5e
## 4 5h1t
## 5 5hit
## 6 a$$
head(stopwords_smart)
## # A tibble: 6 x 2
## word lexicon
## <chr> <chr>
## 1 a SMART
## 2 a's SMART
## 3 able SMART
## 4 about SMART
## 5 above SMART
## 6 according SMART
n1 #Number of rows in dataset 1
## [1] 899288
n2 #Number of rows in dataset 2
## [1] 1010242
n3 #Number of rows in dataset 3
## [1] 2360148
# Rename the column headers
colnames(dataset_1)<-"text_raw"
colnames(dataset_2)<-"text_raw"
colnames(dataset_3)<-"text_raw"
# Cleaning of datasets
# Note: replace_url is called after removePunctuation, so most URL markers
# ("http://", ".") are already stripped by then and the call has little effect;
# calling replace_url before the punctuation step would catch URLs intact.
# data1_cln
data1_cln<-dataset_1%>%
mutate(text=str_to_lower(text_raw))%>%
mutate(text=removeNumbers(text))%>%
mutate(text=removePunctuation(text, preserve_intra_word_contractions = TRUE))%>%
mutate(text=replace_url(text)) %>%
mutate(text=replace_white(text))%>%
mutate(lines=1:nrow(dataset_1))
# data2_cln
data2_cln<-dataset_2 %>%
mutate(text=str_to_lower(text_raw))%>%
mutate(text=removeNumbers(text))%>%
mutate(text=removePunctuation(text, preserve_intra_word_contractions = TRUE))%>%
mutate(text=replace_url(text)) %>%
mutate(text=replace_white(text))%>%
mutate(lines=1:nrow(dataset_2))
# data3_cln
data3_cln<-dataset_3%>%
mutate(text=str_to_lower(text_raw))%>%
mutate(text=removeNumbers(text))%>%
mutate(text=removePunctuation(text, preserve_intra_word_contractions = TRUE))%>%
mutate(text=replace_url(text)) %>%
mutate(text=replace_white(text))%>%
mutate(lines=1:nrow(dataset_3))
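The same cleaning steps are applied to each dataset, so they could be wrapped in a single helper. A sketch, using a hypothetical clean_text() that reproduces the steps above unchanged:
clean_text <- function(df) {
  df %>%
    mutate(text = str_to_lower(text_raw)) %>%  # lower-case all text
    mutate(text = removeNumbers(text)) %>%     # strip digits
    mutate(text = removePunctuation(text, preserve_intra_word_contractions = TRUE)) %>%
    mutate(text = replace_url(text)) %>%       # strip any remaining URLs
    mutate(text = replace_white(text)) %>%     # collapse repeated whitespace
    mutate(lines = row_number())               # line id, used later to regroup words
}
# e.g. data1_cln <- clean_text(dataset_1)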
# Creating 5% training subsets of the source data
# (calling set.seed() before sampling would make these subsets reproducible)
data1_cln_train<-sample_frac(data1_cln,0.05,replace=FALSE)
data2_cln_train<-sample_frac(data2_cln,0.05,replace=FALSE)
data3_cln_train<-sample_frac(data3_cln,0.05,replace=FALSE)
# Filtering dataset of unused words
# Filtering dataset 1
data_token_1<-data1_cln_train %>%
unnest_tokens(word, text) %>%
anti_join(prof_raw)%>%
anti_join(stopwords_smart)%>%
filter(!str_detect(word, "[:digit:]"))%>%
mutate(word2=gsub("([[:alpha:]])\\1{2,}", "\\1",word ))%>%
filter(!nchar(word2)<3)%>%
mutate_at("word2", str_replace_all, "[:space:]+"," ")
n1_word<-nrow(data_token_1)
data_cln_1<- data_token_1 %>%
group_by(lines)%>%
summarise(text = paste(word2, collapse =" "))
# Filtering dataset 2
data_token_2<-data2_cln_train %>%
unnest_tokens(word, text) %>%
anti_join(prof_raw)%>%
anti_join(stopwords_smart)%>%
filter(!str_detect(word, "[:digit:]"))%>%
mutate(word2=gsub("([[:alpha:]])\\1{2,}", "\\1",word ))%>%
filter(!nchar(word2)<3)%>%
mutate_at("word2", str_replace_all, "[:space:]+"," ")
n2_word<-nrow(data_token_2)
data_cln_2<- data_token_2 %>%
group_by(lines)%>%
summarise(text = paste(word2, collapse =" "))
# Filtering dataset 3
data_token_3<-data3_cln_train %>%
unnest_tokens(word, text) %>%
anti_join(prof_raw)%>%
anti_join(stopwords_smart)%>%
filter(!str_detect(word, "[:digit:]"))%>%
mutate(word2=gsub("([[:alpha:]])\\1{2,}", "\\1",word ))%>%
filter(!nchar(word2)<3)%>%
mutate_at("word2", str_replace_all, "[:space:]+"," ")
n3_word<-nrow(data_token_3)
data_cln_3<- data_token_3 %>%
group_by(lines)%>%
summarise(text = paste(word2, collapse =" "))
n1_word #Number of words in cleaned 5% sample of dataset 1
## [1] 746650
n2_word #Number of words in cleaned 5% sample of dataset 2
## [1] 812037
n3_word #Number of words in cleaned 5% sample of dataset 3
## [1] 613258
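Since the same filtering pipeline is applied to all three samples, it too could be expressed as a helper. A sketch with a hypothetical filter_tokens(), mirroring the chunks above:
filter_tokens <- function(df) {
  df %>%
    unnest_tokens(word, text) %>%               # one word per row
    anti_join(prof_raw, by = "word") %>%        # remove profanity
    anti_join(stopwords_smart, by = "word") %>% # remove SMART stopwords
    filter(!str_detect(word, "[:digit:]")) %>%  # drop tokens containing digits
    mutate(word2 = gsub("([[:alpha:]])\\1{2,}", "\\1", word)) %>% # collapse letters repeated 3+ times
    filter(nchar(word2) >= 3)                   # keep words of at least 3 characters
}
# e.g. data_token_1 <- filter_tokens(data1_cln_train)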
Code chunk that creates the 1-, 2-, 3- and 4-grams and calculates their term frequencies.
# Create ngrams
# Set 1
dataset_set1_ngrm1<-data_cln_1 %>%
unnest_tokens(word, text)%>%
count(word)%>%
rename(ngram=word)%>%
rename(tot_ngram=n)%>%
na.omit()
dataset_set1_ngrm2 <- data_cln_1 %>%
unnest_tokens(ngram, text,token = "ngrams", n = 2,collapse = FALSE)%>%
group_by(ngram)%>%
summarise(tot_ngram=n())%>%
na.omit()
dataset_set1_ngrm3 <- data_cln_1%>%
unnest_tokens(ngram, text,token = "ngrams", n = 3,collapse = FALSE)%>%
group_by(ngram)%>%
summarise(tot_ngram=n())%>%
na.omit()
dataset_set1_ngrm4<- data_cln_1 %>%
unnest_tokens(ngram, text,token = "ngrams", n = 4,collapse = FALSE)%>%
group_by(ngram)%>%
summarise(tot_ngram=n())%>%
na.omit()
#set 2
dataset_set2_ngrm1<-data_cln_2 %>%
unnest_tokens(word, text)%>%
count(word)%>%
rename(ngram=word)%>%
rename(tot_ngram=n)%>%
na.omit()
dataset_set2_ngrm2 <- data_cln_2 %>%
unnest_tokens(ngram, text,token = "ngrams", n = 2,collapse = FALSE)%>%
group_by(ngram)%>%
summarise(tot_ngram=n())%>%
na.omit()
dataset_set2_ngrm3 <- data_cln_2 %>%
unnest_tokens(ngram, text,token = "ngrams", n = 3,collapse = FALSE)%>%
group_by(ngram)%>%
summarise(tot_ngram=n())%>%
na.omit()
dataset_set2_ngrm4<- data_cln_2 %>%
unnest_tokens(ngram, text,token = "ngrams", n = 4,collapse = FALSE)%>%
group_by(ngram)%>%
summarise(tot_ngram=n())%>%
na.omit()
#set3
dataset_set3_ngrm1<-data_cln_3 %>%
unnest_tokens(word, text)%>%
count(word)%>%
rename(ngram=word)%>%
rename(tot_ngram=n)%>%
na.omit()
dataset_set3_ngrm2 <- data_cln_3 %>%
unnest_tokens(ngram, text,token = "ngrams", n = 2,collapse = FALSE)%>%
group_by(ngram)%>%
summarise(tot_ngram=n())%>%
na.omit()
dataset_set3_ngrm3 <- data_cln_3 %>%
unnest_tokens(ngram, text,token = "ngrams", n = 3,collapse = FALSE)%>%
group_by(ngram)%>%
summarise(tot_ngram=n())%>%
na.omit()
dataset_set3_ngrm4<- data_cln_3 %>%
unnest_tokens(ngram, text,token = "ngrams", n = 4,collapse = FALSE)%>%
group_by(ngram)%>%
summarise(tot_ngram=n())%>%
na.omit()
# example datasets
head(dataset_set1_ngrm1)
## # A tibble: 6 x 2
## ngram tot_ngram
## <chr> <int>
## 1 a's 1
## 2 aabid 1
## 3 aacc 1
## 4 aaclass 1
## 5 aadhar 1
## 6 aagaard 1
head(dataset_set1_ngrm2)
## # A tibble: 6 x 2
## ngram tot_ngram
## <chr> <int>
## 1 a's remotes 1
## 2 aabid surtis 1
## 3 aacc december 1
## 4 aaclass minor 1
## 5 aadhar data 1
## 6 aagaard farms 1
head(dataset_set1_ngrm3)
## # A tibble: 6 x 2
## ngram tot_ngram
## <chr> <int>
## 1 a's remotes work 1
## 2 aabid surtis book 1
## 3 aacc december economic 1
## 4 aaclass minor team 1
## 5 aagaard farms dont 1
## 6 aah decided whip 1
head(dataset_set1_ngrm4)
## # A tibble: 6 x 2
## ngram tot_ngram
## <chr> <int>
## 1 a's remotes work smack 1
## 2 aabid surtis book decision 1
## 3 aacc december economic justice 1
## 4 aaclass minor team started 1
## 5 aagaard farms dont mom 1
## 6 aah decided whip bannock 1
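The twelve n-gram chunks above differ only in the input set and the value of n, so they could likewise be generated by one helper. A sketch with a hypothetical count_ngrams() (for n = 1 this is equivalent to the unigram chunks):
count_ngrams <- function(df, n) {
  df %>%
    unnest_tokens(ngram, text, token = "ngrams", n = n, collapse = FALSE) %>%
    group_by(ngram) %>%
    summarise(tot_ngram = n()) %>% # frequency of each n-gram
    na.omit()                      # drop rows from lines shorter than n words
}
# e.g. dataset_set1_ngrm2 <- count_ngrams(data_cln_1, 2)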
Code chunk for creating word clouds per data source
par(mfrow = c(1, 3))
set1_wordcloud<-wordcloud (dataset_set1_ngrm1$ngram, dataset_set1_ngrm1$tot_ngram,scale=c(3,0.5), max.words=25, random.order=FALSE, rot.per=0.35, use.r.layout=FALSE,colors=palette("default"))
set2_wordcloud<-wordcloud (dataset_set2_ngrm1$ngram, dataset_set2_ngrm1$tot_ngram,scale=c(3,0.5), max.words=25, random.order=FALSE, rot.per=0.35, use.r.layout=FALSE,colors=palette("default"))
set3_wordcloud<-wordcloud (dataset_set3_ngrm1$ngram, dataset_set3_ngrm1$tot_ngram,scale=c(3,0.5), max.words=25, random.order=FALSE, rot.per=0.35, use.r.layout=FALSE,colors=palette("default"))
Code chunk that combines the three source sample sets, calculates tf-idf per source and sorts the n-grams by tf-idf.
#1grams
data_ngrm1_train<-bind_rows(mutate(dataset_set1_ngrm1, text_source ="Blog"),
mutate(dataset_set2_ngrm1, text_source ="News"),
mutate(dataset_set3_ngrm1, text_source ="Twitter"))
ngrm1_tf_idf <- data_ngrm1_train %>%
bind_tf_idf(ngram,text_source,tot_ngram) %>%
arrange(desc(tf_idf))
#2grams
data_ngrm2_train<-bind_rows(mutate(dataset_set1_ngrm2, text_source ="Blog"),
mutate(dataset_set2_ngrm2, text_source ="News"),
mutate(dataset_set3_ngrm2, text_source ="Twitter"))
ngrm2_tf_idf <- data_ngrm2_train %>%
bind_tf_idf(ngram,text_source,tot_ngram) %>%
arrange(desc(tf_idf))
#3grams
data_ngrm3_train<-bind_rows(mutate(dataset_set1_ngrm3, text_source ="Blog"),
mutate(dataset_set2_ngrm3, text_source ="News"),
mutate(dataset_set3_ngrm3, text_source ="Twitter"))
ngrm3_tf_idf <- data_ngrm3_train %>%
bind_tf_idf(ngram,text_source,tot_ngram) %>%
arrange(desc(tf_idf))
#4grams
data_ngrm4_train<-bind_rows(mutate(dataset_set1_ngrm4, text_source ="Blog"),
mutate(dataset_set2_ngrm4, text_source ="News"),
mutate(dataset_set3_ngrm4, text_source ="Twitter"))
ngrm4_tf_idf <- data_ngrm4_train %>%
bind_tf_idf(ngram,text_source,tot_ngram) %>%
arrange(desc(tf_idf))
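For reference: bind_tf_idf computes tf-idf as term frequency times inverse document frequency, where tf is an n-gram's count divided by the total count of its source, and idf = ln(number of sources / number of sources containing the n-gram). With three sources, an n-gram that appears in all three gets idf = 0, so the high tf-idf n-grams are those characteristic of a single source.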
Code chunk to calculate the tf-idf values used for plotting. First, the n-grams are filtered to remove all n-grams with a frequency of less than 2. Second, the n-grams are converted to factors so that they can be ordered in the ggplot2 plots.
#1grams
ngrm1_tf_idf <- data_ngrm1_train %>%
bind_tf_idf(ngram,text_source,tot_ngram) %>%
arrange(desc(tf_idf))%>%
filter(tot_ngram>1) %>%
mutate(ngram = factor(ngram, levels = rev(unique(ngram)))) %>%
mutate(text_source = factor(text_source, levels = c("Blog","News","Twitter")))
ngrm1_tf_idf_plot<-ngrm1_tf_idf%>%
group_by(text_source) %>%
top_n(25, tf_idf) %>%
ungroup() %>%
mutate(ngram = reorder(ngram, tf_idf))
#2grams
ngrm2_tf_idf <- data_ngrm2_train %>%
bind_tf_idf(ngram,text_source,tot_ngram) %>%
arrange(desc(tf_idf))%>%
filter(tot_ngram>1) %>%
mutate(ngram = factor(ngram, levels = rev(unique(ngram)))) %>%
mutate(text_source = factor(text_source, levels = c("Blog","News","Twitter")))
ngrm2_tf_idf_plot<-ngrm2_tf_idf%>%
group_by(text_source) %>%
top_n(25, tf_idf) %>%
ungroup() %>%
mutate(ngram = reorder(ngram, tf_idf))
#3grams
ngrm3_tf_idf <- data_ngrm3_train %>%
bind_tf_idf(ngram,text_source,tot_ngram) %>%
arrange(desc(tf_idf))%>%
filter(tot_ngram>1) %>%
mutate(ngram = factor(ngram, levels = rev(unique(ngram)))) %>%
mutate(text_source = factor(text_source, levels = c("Blog","News","Twitter")))
ngrm3_tf_idf_plot<-ngrm3_tf_idf%>%
group_by(text_source) %>%
top_n(25, tf_idf) %>%
ungroup() %>%
mutate(ngram = reorder(ngram, tf_idf))
#4grams
ngrm4_tf_idf <- data_ngrm4_train %>%
bind_tf_idf(ngram,text_source,tot_ngram) %>%
arrange(desc(tf_idf))%>%
filter(tot_ngram>1) %>%
mutate(ngram = factor(ngram, levels = rev(unique(ngram)))) %>%
mutate(text_source = factor(text_source, levels = c("Blog","News","Twitter")))
ngrm4_tf_idf_plot<-ngrm4_tf_idf%>%
group_by(text_source) %>%
top_n(25, tf_idf) %>%
ungroup() %>%
mutate(ngram = reorder(ngram, tf_idf))
ggplot(ngrm1_tf_idf_plot,aes(ngram, tf_idf, fill = text_source)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~text_source, ncol = 3, scales = "free") +
coord_flip()
ggplot(ngrm2_tf_idf_plot,aes(ngram, tf_idf, fill = text_source)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~text_source, ncol = 3, scales = "free") +
coord_flip()
ggplot(ngrm3_tf_idf_plot,aes(ngram, tf_idf, fill = text_source)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~text_source, ncol = 3, scales = "free") +
coord_flip()
ggplot(ngrm4_tf_idf_plot,aes(ngram, tf_idf, fill = text_source)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~text_source, ncol = 3, scales = "free") +
coord_flip()
From here on, the plan is to create an online text prediction application using the n-grams created above. The application will let the user enter a phrase, which will be matched against the n-grams, and the most likely candidate word will be suggested.
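As an illustration of that matching step, a minimal lookup against one of the bigram tables could look as follows. This hypothetical predict_next() only handles bigrams; the planned application would consult the 3- and 4-gram tables first and back off to shorter histories when no match is found:
library(tidyr) # for separate()
predict_next <- function(phrase, bigrams, k = 3) {
  last_word <- word(str_to_lower(phrase), -1)     # last word of the user's input
  bigrams %>%
    separate(ngram, c("w1", "w2"), sep = " ") %>% # split "history nextword"
    filter(w1 == last_word) %>%                   # bigrams whose first word matches
    top_n(k, tot_ngram) %>%                       # k most frequent continuations
    pull(w2)
}
# e.g. predict_next("happy new", dataset_set1_ngrm2)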