Summary

The capstone project for the Coursera Data Science Specialization uses the three datasets provided by SwiftKey. The goal of the project is to build a Shiny application with text prediction capabilities. This report outlines the exploratory analysis of the datasets and the current plan for implementing the text prediction algorithm.

Description of Data

The datasets come from three sources: US news, blogs and twitter. The corpus contains these three files in each of four languages (Russian, Finnish, German and English). This project will focus exclusively on the English-language datasets. The names of the data files are as follows:

en_US.blogs.txt
en_US.twitter.txt
en_US.news.txt

These sets will be imported into R, then cleaned and summarised during the creation of the web application.

System setup

Below is the code for loading the various packages used in this analysis.
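A minimal setup, assuming the package list is inferred correctly from the functions called in the chunks below:

# Load packages (list inferred from the functions used in the chunks below)
  library(readr)      # read_lines()
  library(tibble)     # as_tibble()
  library(dplyr)      # pipes, mutate(), anti_join(), bind_rows()
  library(stringr)    # str_to_lower(), str_trim(), str_detect()
  library(tidytext)   # unnest_tokens(), stop_words, bind_tf_idf()
  library(tm)         # removeNumbers(), removePunctuation()
  library(textclean)  # replace_url(), replace_white()
  library(wordcloud)  # wordcloud()
  library(ggplot2)    # bar charts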

Importing data

Code for importing the blog, news and twitter datasets and stripping unrecognised characters. It also imports the Facebook bad-words list (version 2018-07-29) and the SMART stopword lexicon.

# Import data from folder (data_folder is assumed to point at the local en_US files)
  data_folder<-"data/"
  source_file_1<-"en_US.blogs.txt"
  source_file_2<-"en_US.news.txt"
  source_file_3<-"en_US.twitter.txt"

  dataset_raw_1<- read_lines(paste0(data_folder,source_file_1))
  dataset_raw_2<- read_lines(paste0(data_folder,source_file_2))
  dataset_raw_3<- read_lines(paste0(data_folder,source_file_3))

# Change data encoding: drop characters that cannot be mapped to ASCII
  dataset_raw_1 <- iconv(dataset_raw_1, "latin1", "ASCII", sub="")
  dataset_raw_2 <- iconv(dataset_raw_2, "latin1", "ASCII", sub="")
  dataset_raw_3 <- iconv(dataset_raw_3, "latin1", "ASCII", sub="")

# Data as tibble
  
  dataset_1<-as_tibble(dataset_raw_1)
  dataset_2<-as_tibble(dataset_raw_2)
  dataset_3<-as_tibble(dataset_raw_3)

# Import profanity lexicon (prof_folder is assumed to point at the lexicon file)
  
  prof_folder<-"data/"
  prof_file<-"facebook-bad-words-list_comma-separated-text-file_2018_07_29.txt"
  
  # The list is a single comma-separated row, so it is transposed into a one-column tibble
  prof_raw<- as_tibble(t(read.delim(paste0(prof_folder,prof_file),skip=15,sep=",",header=F)))
  colnames(prof_raw)<-"word"
  prof_raw$word<-str_trim(prof_raw$word,side="both")
  
# Import stopword lexicon (stop_words ships with tidytext)
  
  stopwords_smart<-subset(stop_words,lexicon=="SMART")

# Count the number of rows per dataset
  
  n1<-nrow(dataset_1)
  n2<-nrow(dataset_2)
  n3<-nrow(dataset_3)
  
# Display data
  
  head(dataset_1)
## # A tibble: 6 x 1
##   value                                                                    
##   <chr>                                                                    
## 1 In the years thereafter, most of the Oil fields and platforms were named~
## 2 We love you Mr. Brown.                                                   
## 3 Chad has been awesome with the kids and holding down the fort while I wo~
## 4 so anyways, i am going to share some home decor inspiration that i have ~
## 5 With graduation season right around the corner, Nancy has whipped up a f~
## 6 If you have an alternative argument, let's hear it! :)
  head(dataset_2)
## # A tibble: 6 x 1
##   value                                                                    
##   <chr>                                                                    
## 1 He wasn't home alone, apparently.                                        
## 2 The St. Louis plant had to close. It would die of old age. Workers had b~
## 3 WSU's plans quickly became a hot topic on local online sites. Though mos~
## 4 The Alaimo Group of Mount Holly was up for a contract last fall to evalu~
## 5 And when it's often difficult to predict a law's impact, legislators sho~
## 6 There was a certain amount of scoffing going around a few years ago when~
  head(dataset_3)
## # A tibble: 6 x 1
##   value                                                                    
##   <chr>                                                                    
## 1 How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Lov~
## 2 When you meet someone special... you'll know. Your heart will beat more ~
## 3 they've decided its more fun if I don't.                                 
## 4 So Tired D; Played Lazer Tag & Ran A LOT D; Ughh Going To Sleep Like In ~
## 5 Words from a complete stranger! Made my birthday even better :)          
## 6 First Cubs game ever! Wrigley field is gorgeous. This is perfect. Go Cub~
  head(prof_raw)  
## # A tibble: 6 x 1
##   word         
##   <chr>        
## 1 2 girls 1 cup
## 2 2g1c         
## 3 4r5e         
## 4 5h1t         
## 5 5hit         
## 6 a$$
  head(stopwords_smart)
## # A tibble: 6 x 2
##   word      lexicon
##   <chr>     <chr>  
## 1 a         SMART  
## 2 a's       SMART  
## 3 able      SMART  
## 4 about     SMART  
## 5 above     SMART  
## 6 according SMART
  n1 #Number of rows in dataset 1
## [1] 899288
  n2 #Number of rows in dataset 2
## [1] 1010242
  n3 #Number of rows in dataset 3
## [1] 2360148

Cleaning the datasets
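Each dataset is converted to lower case; numbers, punctuation (preserving intra-word contractions) and URLs are removed; repeated whitespace is collapsed; and a line index is added so the cleaned words can later be reassembled into lines.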

# Rename column headers

    colnames(dataset_1)<-"text_raw"
    colnames(dataset_2)<-"text_raw"
    colnames(dataset_3)<-"text_raw"

# Cleaning of datasets

  # data1_cln
  data1_cln<-dataset_1%>%
    mutate(text=str_to_lower(text_raw))%>%                                            # lower case
    mutate(text=removeNumbers(text))%>%                                               # strip digits
    mutate(text=removePunctuation(text, preserve_intra_word_contractions = TRUE))%>%  # strip punctuation, keep contractions
    mutate(text=replace_url(text)) %>%                                                # strip URLs
    mutate(text=replace_white(text))%>%                                               # collapse whitespace
    mutate(lines=1:nrow(dataset_1))                                                   # line index

  # data2_cln
  data2_cln<-dataset_2 %>%
    mutate(text=str_to_lower(text_raw))%>%
    mutate(text=removeNumbers(text))%>%
    mutate(text=removePunctuation(text, preserve_intra_word_contractions = TRUE))%>%
    mutate(text=replace_url(text)) %>%
    mutate(text=replace_white(text))%>%
    mutate(lines=1:nrow(dataset_2))
  
  # data3_cln
  data3_cln<-dataset_3%>%
    mutate(text=str_to_lower(text_raw))%>%
    mutate(text=removeNumbers(text))%>%
    mutate(text=removePunctuation(text, preserve_intra_word_contractions = TRUE))%>%
    mutate(text=replace_url(text)) %>%
    mutate(text=replace_white(text))%>%
    mutate(lines=1:nrow(dataset_3))

# Creating training subsets (5% random sample) of the source data
  
    data1_cln_train<-sample_frac(data1_cln,0.05,replace=FALSE)
    data2_cln_train<-sample_frac(data2_cln,0.05,replace=FALSE)
    data3_cln_train<-sample_frac(data3_cln,0.05,replace=FALSE) 
  
# Filtering the datasets for unwanted words

    # Filtering dataset 1
    data_token_1<-data1_cln_train %>% 
      unnest_tokens(word, text) %>%                                 # one word per row
      anti_join(prof_raw)%>%                                        # drop profanity
      anti_join(stopwords_smart)%>%                                 # drop SMART stop words
      filter(!str_detect(word, "[:digit:]"))%>%                     # drop tokens containing digits
      mutate(word2=gsub("([[:alpha:]])\\1{2,}", "\\1",word ))%>%    # collapse letters repeated 3+ times
      filter(!nchar(word2)<3)%>%                                    # drop tokens shorter than 3 characters
      mutate_at("word2", str_replace_all, "[:space:]+"," ")         # squeeze whitespace

    n1_word<-nrow(data_token_1)
        
    data_cln_1<- data_token_1 %>% 
      group_by(lines)%>%
      summarise(text = paste(word2, collapse =" "))

  # Filtering dataset 2 
    data_token_2<-data2_cln_train %>% 
      unnest_tokens(word, text) %>%
      anti_join(prof_raw)%>%
      anti_join(stopwords_smart)%>%
      filter(!str_detect(word, "[:digit:]"))%>%
      mutate(word2=gsub("([[:alpha:]])\\1{2,}", "\\1",word ))%>%
      filter(!nchar(word2)<3)%>%
      mutate_at("word2", str_replace_all, "[:space:]+"," ")     

  n2_word<-nrow(data_token_2)
        
    data_cln_2<- data_token_2 %>% 
      group_by(lines)%>%
      summarise(text = paste(word2, collapse =" "))

  # Filtering dataset 3
    data_token_3<-data3_cln_train %>% 
      unnest_tokens(word, text) %>%
      anti_join(prof_raw)%>%
      anti_join(stopwords_smart)%>%
      filter(!str_detect(word, "[:digit:]"))%>%
      mutate(word2=gsub("([[:alpha:]])\\1{2,}", "\\1",word ))%>%
      filter(!nchar(word2)<3)%>%
      mutate_at("word2", str_replace_all, "[:space:]+"," ")       
 
  n3_word<-nrow(data_token_3)
       
    data_cln_3<- data_token_3 %>% 
      group_by(lines)%>%
      summarise(text = paste(word2, collapse =" "))

  n1_word #Number of words in cleaned 5% sample of dataset 1
## [1] 746650
  n2_word #Number of words in cleaned 5% sample of dataset 2
## [1] 812037
  n3_word #Number of words in cleaned 5% sample of dataset 3
## [1] 613258

Creating ngrams

Code chunk that creates 1-, 2-, 3- and 4-grams and calculates term frequencies. With collapse = FALSE, the n-grams are built within each line and never span two lines of text.

# Create ngrams
  
  # Set 1
  
   dataset_set1_ngrm1<-data_cln_1 %>%
    unnest_tokens(word, text)%>%
    count(word)%>%
    rename(ngram=word)%>%
    rename(tot_ngram=n)%>%
    na.omit()

  dataset_set1_ngrm2 <- data_cln_1 %>%
    unnest_tokens(ngram, text,token = "ngrams", n = 2,collapse = FALSE)%>%
    group_by(ngram)%>%
    summarise(tot_ngram=n())%>%
    na.omit()
  
  dataset_set1_ngrm3 <- data_cln_1%>%
    unnest_tokens(ngram, text,token = "ngrams", n = 3,collapse = FALSE)%>%
    group_by(ngram)%>%
    summarise(tot_ngram=n())%>%
    na.omit()
  
  dataset_set1_ngrm4<- data_cln_1 %>%
    unnest_tokens(ngram, text,token = "ngrams", n = 4,collapse = FALSE)%>%
    group_by(ngram)%>%
    summarise(tot_ngram=n())%>%
    na.omit()

  #set 2
  
  dataset_set2_ngrm1<-data_cln_2 %>%
    unnest_tokens(word, text)%>%
    count(word)%>%
    rename(ngram=word)%>%
    rename(tot_ngram=n)%>%
    na.omit()
  
  dataset_set2_ngrm2 <- data_cln_2 %>%
    unnest_tokens(ngram, text,token = "ngrams", n = 2,collapse = FALSE)%>%
    group_by(ngram)%>%
    summarise(tot_ngram=n())%>%
    na.omit()
  
  dataset_set2_ngrm3 <- data_cln_2 %>%
    unnest_tokens(ngram, text,token = "ngrams", n = 3,collapse = FALSE)%>%
    group_by(ngram)%>%
    summarise(tot_ngram=n())%>%
    na.omit()
  
  dataset_set2_ngrm4<- data_cln_2 %>%
    unnest_tokens(ngram, text,token = "ngrams", n = 4,collapse = FALSE)%>%
    group_by(ngram)%>%
    summarise(tot_ngram=n())%>%
    na.omit()

  #set3
  
  dataset_set3_ngrm1<-data_cln_3 %>%
    unnest_tokens(word, text)%>%
    count(word)%>%
    rename(ngram=word)%>%
    rename(tot_ngram=n)%>%
    na.omit()
  
  dataset_set3_ngrm2 <- data_cln_3  %>%
    unnest_tokens(ngram, text,token = "ngrams", n = 2,collapse = FALSE)%>%
    group_by(ngram)%>%
    summarise(tot_ngram=n())%>%
    na.omit()
  
  dataset_set3_ngrm3 <- data_cln_3 %>%
    unnest_tokens(ngram, text,token = "ngrams", n = 3,collapse = FALSE)%>%
    group_by(ngram)%>%
    summarise(tot_ngram=n())%>%
    na.omit()
  
  dataset_set3_ngrm4<- data_cln_3 %>%
    unnest_tokens(ngram, text,token = "ngrams", n = 4,collapse = FALSE)%>%
    group_by(ngram)%>%
    summarise(tot_ngram=n())%>%
    na.omit()

# Example rows of the resulting ngram sets
    
  head(dataset_set1_ngrm1)
## # A tibble: 6 x 2
##   ngram   tot_ngram
##   <chr>       <int>
## 1 a's             1
## 2 aabid           1
## 3 aacc            1
## 4 aaclass         1
## 5 aadhar          1
## 6 aagaard         1
  head(dataset_set1_ngrm2)
## # A tibble: 6 x 2
##   ngram         tot_ngram
##   <chr>             <int>
## 1 a's remotes           1
## 2 aabid surtis          1
## 3 aacc december         1
## 4 aaclass minor         1
## 5 aadhar data           1
## 6 aagaard farms         1
  head(dataset_set1_ngrm3)
## # A tibble: 6 x 2
##   ngram                  tot_ngram
##   <chr>                      <int>
## 1 a's remotes work               1
## 2 aabid surtis book              1
## 3 aacc december economic         1
## 4 aaclass minor team             1
## 5 aagaard farms dont             1
## 6 aah decided whip               1
  head(dataset_set1_ngrm4)
## # A tibble: 6 x 2
##   ngram                          tot_ngram
##   <chr>                              <int>
## 1 a's remotes work smack                 1
## 2 aabid surtis book decision             1
## 3 aacc december economic justice         1
## 4 aaclass minor team started             1
## 5 aagaard farms dont mom                 1
## 6 aah decided whip bannock               1

Creating word clouds

Code chunk for creating a word cloud per data source.

par(mfrow = c(1, 3))

set1_wordcloud<-wordcloud (dataset_set1_ngrm1$ngram, dataset_set1_ngrm1$tot_ngram,scale=c(3,0.5), max.words=25, random.order=FALSE, rot.per=0.35, use.r.layout=FALSE,colors=palette("default"))

set2_wordcloud<-wordcloud (dataset_set2_ngrm1$ngram, dataset_set2_ngrm1$tot_ngram,scale=c(3,0.5), max.words=25, random.order=FALSE, rot.per=0.35, use.r.layout=FALSE,colors=palette("default"))

set3_wordcloud<-wordcloud (dataset_set3_ngrm1$ngram, dataset_set3_ngrm1$tot_ngram,scale=c(3,0.5), max.words=25, random.order=FALSE, rot.per=0.35, use.r.layout=FALSE,colors=palette("default"))

Calculate tf-idf

Code chunk that combines the three source sample sets, calculates tf-idf per source and sorts the n-grams by descending tf-idf.
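For reference, bind_tf_idf treats each of the three sources as one document. A sketch of the scores it attaches, with illustrative variable names that do not appear in the chunk:

# How bind_tf_idf scores an ngram (variable names are illustrative)
  tf     <- tot_ngram / total_ngrams_in_source  # term frequency within a source
  idf    <- log(3 / n_sources_with_ngram)       # the 3 sources act as the document set
  tf_idf <- tf * idf                            # ngrams unique to one source score highest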

#1grams
  data_ngrm1_train<-bind_rows(mutate(dataset_set1_ngrm1, text_source ="Blog"),
                              mutate(dataset_set2_ngrm1, text_source ="News"), 
                              mutate(dataset_set3_ngrm1, text_source ="Twitter"))

  ngrm1_tf_idf <- data_ngrm1_train %>%
    bind_tf_idf(ngram,text_source,tot_ngram) %>%
    arrange(desc(tf_idf))

#2grams
  data_ngrm2_train<-bind_rows(mutate(dataset_set1_ngrm2, text_source ="Blog"),
                              mutate(dataset_set2_ngrm2, text_source ="News"), 
                              mutate(dataset_set3_ngrm2, text_source ="Twitter"))

  ngrm2_tf_idf <- data_ngrm2_train %>%
    bind_tf_idf(ngram,text_source,tot_ngram) %>%
    arrange(desc(tf_idf))

#3grams

  data_ngrm3_train<-bind_rows(mutate(dataset_set1_ngrm3, text_source ="Blog"),
                              mutate(dataset_set2_ngrm3, text_source ="News"), 
                              mutate(dataset_set3_ngrm3, text_source ="Twitter"))

  ngrm3_tf_idf <- data_ngrm3_train %>%
    bind_tf_idf(ngram,text_source,tot_ngram) %>%
    arrange(desc(tf_idf))

#4grams

  data_ngrm4_train<-bind_rows(mutate(dataset_set1_ngrm4, text_source ="Blog"),
                              mutate(dataset_set2_ngrm4, text_source ="News"), 
                              mutate(dataset_set3_ngrm4, text_source ="Twitter"))

  ngrm4_tf_idf <- data_ngrm4_train %>%
    bind_tf_idf(ngram,text_source,tot_ngram) %>%
    arrange(desc(tf_idf))

Data exploration

Code chunk that prepares the tf-idf values for plotting. First, the n-grams are filtered to remove all n-grams with a frequency below 2. Second, the n-grams are transformed to factors so they can be ordered by ggplot2, and the top 25 per source are selected for plotting.

#1grams
  ngrm1_tf_idf <- data_ngrm1_train %>%
    bind_tf_idf(ngram,text_source,tot_ngram) %>%
    arrange(desc(tf_idf))%>%
    filter(tot_ngram>1) %>%
    mutate(ngram = factor(ngram, levels = rev(unique(ngram)))) %>%
    mutate(text_source = factor(text_source, levels = c("Blog","News","Twitter")))

 ngrm1_tf_idf_plot<-ngrm1_tf_idf%>% 
  group_by(text_source) %>% 
  top_n(25, tf_idf) %>% 
  ungroup() %>%
  mutate(ngram = reorder(ngram, tf_idf))

#2grams
  ngrm2_tf_idf <- data_ngrm2_train %>%
    bind_tf_idf(ngram,text_source,tot_ngram) %>%
    arrange(desc(tf_idf))%>%
    filter(tot_ngram>1) %>%
    mutate(ngram = factor(ngram, levels = rev(unique(ngram)))) %>%
    mutate(text_source = factor(text_source, levels = c("Blog","News","Twitter")))

 ngrm2_tf_idf_plot<-ngrm2_tf_idf%>% 
  group_by(text_source) %>% 
  top_n(25, tf_idf) %>% 
  ungroup() %>%
  mutate(ngram = reorder(ngram, tf_idf))

#3grams
  ngrm3_tf_idf <- data_ngrm3_train %>%
    bind_tf_idf(ngram,text_source,tot_ngram) %>%
    arrange(desc(tf_idf))%>%
    filter(tot_ngram>1) %>%
    mutate(ngram = factor(ngram, levels = rev(unique(ngram)))) %>%
    mutate(text_source = factor(text_source, levels = c("Blog","News","Twitter")))

 ngrm3_tf_idf_plot<-ngrm3_tf_idf%>% 
  group_by(text_source) %>% 
  top_n(25, tf_idf) %>% 
  ungroup() %>%
  mutate(ngram = reorder(ngram, tf_idf))

#4grams
  ngrm4_tf_idf <- data_ngrm4_train %>%
    bind_tf_idf(ngram,text_source,tot_ngram) %>%
    arrange(desc(tf_idf))%>%
    filter(tot_ngram>1) %>%
    mutate(ngram = factor(ngram, levels = rev(unique(ngram)))) %>%
    mutate(text_source = factor(text_source, levels = c("Blog","News","Twitter")))

 ngrm4_tf_idf_plot<-ngrm4_tf_idf%>% 
  group_by(text_source) %>% 
  top_n(25, tf_idf) %>% 
  ungroup() %>%
  mutate(ngram = reorder(ngram, tf_idf))

Bar chart with the highest tf-idf values for 1-grams

  ggplot(ngrm1_tf_idf_plot,aes(ngram, tf_idf, fill = text_source)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~text_source, ncol = 3, scales = "free") +
  coord_flip()

Bar chart with the highest tf-idf values for 2-grams

  ggplot(ngrm2_tf_idf_plot,aes(ngram, tf_idf, fill = text_source)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~text_source, ncol = 3, scales = "free") +
  coord_flip()

Bar chart with the highest tf-idf values for 3-grams

  ggplot(ngrm3_tf_idf_plot,aes(ngram, tf_idf, fill = text_source)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~text_source, ncol = 3, scales = "free") +
  coord_flip()   

Bar chart with the highest tf-idf values for 4-grams

  ggplot(ngrm4_tf_idf_plot,aes(ngram, tf_idf, fill = text_source)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~text_source, ncol = 3, scales = "free") +
  coord_flip()   

Plan

From here on, the plan is to create an online text prediction application using the n-grams created above. The application will let the user enter a sentence, match its final words against the n-gram tables, and suggest the most likely candidate as the next word; a sketch of such a lookup follows.
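The sketch below illustrates one possible lookup, falling back from 4-grams to 2-grams when no match is found. The function predict_word and the backoff order are assumptions about the final design, not something implemented in this report.

# Sketch of a backoff lookup (predict_word and the backoff order are assumed, not final)
  predict_word <- function(sentence) {
    # the input would need the same cleaning steps as the training data
    words  <- str_split(str_to_lower(sentence), "[:space:]+")[[1]]
    tables <- list(data_ngrm4_train, data_ngrm3_train, data_ngrm2_train)
    sizes  <- c(4, 3, 2)
    for (i in seq_along(tables)) {
      # match the last (n-1) words of the sentence against the stored ngrams
      context <- paste(tail(words, sizes[i] - 1), collapse = " ")
      hits <- tables[[i]] %>%
        filter(str_detect(ngram, paste0("^", context, " "))) %>%
        arrange(desc(tot_ngram))
      if (nrow(hits) > 0) return(word(hits$ngram[1], -1))  # last word of best match
    }
    NA_character_  # no candidate found in any table
  }

In the application this lookup could be extended with a 1-gram fallback and the most frequent candidates could be precomputed for speed.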