Loading in and Cleaning Data

Let’s load in the three files and take a look at their basic statistics.

con <- file("F:/Capstone/final/en_US/en_US.blogs.txt", "r")
sample <- readLines(con, 3000)
close(con)
data_blogs <- as.data.frame(sample)
con <- file("F:/Capstone/final/en_US/en_US.twitter.txt", "r")
sample <- readLines(con, 3000)
close(con)
data_twitter <- as.data.frame(sample)
con <- file("F:/Capstone/final/en_US/en_US.news.txt", "r")
sample <- readLines(con, 3000)
close(con)
data_news <- as.data.frame(sample)
data<- rbind(data_twitter, data_blogs, data_news)
data$ID <- seq.int(nrow(data))
data$sample <- as.character(data$sample)
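
As a quick look at basic statistics, we can count the lines and characters in each sample. This is a minimal sketch using only base R and the sample objects created above.

# Line and character counts for each 3,000-line sample
sapply(list(blogs = data_blogs, twitter = data_twitter, news = data_news),
       function(d) c(lines = nrow(d), chars = sum(nchar(as.character(d$sample)))))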

Let’s use the udpipe package for word tokenization.

library(udpipe)
word_tokens <- strsplit.data.frame(data, term = "sample", group = "ID", split = " ")

Now, let’s strip all punctuation from the tokens, except for those that are emoticons. We’ll use the lexicon package’s hash_emoticons dataset to identify emoticons, and filter out profanity with its profanity_zac_anger word list.
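
To see what these datasets contain, we can peek at a few entries directly (a quick inspection, assuming the lexicon package is installed):

head(lexicon::hash_emoticons)
head(lexicon::profanity_zac_anger)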

library(lexicon)
# Strip punctuation from non-emoticon tokens and lower-case them
for (i in seq_along(word_tokens$sample)) {
  if (!(word_tokens$sample[i] %in% hash_emoticons$x)) {
    word_tokens$sample[i] <- tolower(word_tokens$sample[i])
    # Keep the apostrophe in known contractions such as "you'll"
    if (!(word_tokens$sample[i] %in% key_contractions$contraction)) {
      word_tokens$sample[i] <- gsub('[[:punct:] ]+', ' ', word_tokens$sample[i])
    }
  }
}

#Remove Profanity
word_tokens <- word_tokens[!(word_tokens$sample %in% profanity_zac_anger),]
#Regroup into another variable for later use
data_clean <- paste.data.frame(word_tokens, term = "sample", group = "ID", collapse = " ")
head(data_clean)
##   ID
## 1  1
## 2  2
## 3  3
## 4  4
## 5  5
## 6  6
##                                                                                                          sample
## 1 how are you  btw thanks for the rt  you gonna be in dc anytime soon  love to see you  been way  way too long 
## 2 when you meet someone special  you'll know  your heart will beat more rapidly and you'll smile for no reason 
## 3                                                                      they've decided its more fun if i don t 
## 4                          so tired D; played lazer tag   ran a lot D; ughh going to sleep like in 5 minutes ;)
## 5                                               words from a complete stranger  made my birthday even better :)
## 6                                 first cubs game ever  wrigley field is gorgeous  this is perfect  go cubs go

Frequencies

Let’s observe the frequencies of words, 2-grams and 3-grams.

Function

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(tokenizers)
library(reshape)
## 
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
## 
##     rename
ngram_freqs <- function(m){
  # Tokenize each cleaned line into m-grams and flatten into a single data frame
  ngrams <- melt(tokenize_ngrams(data_clean$sample, n = m))
  # Drop NA tokens and count how often each m-gram occurs
  ngrams <- ngrams[!is.na(ngrams$value), ]
  ngrams %>% count(value, sort = TRUE)
}
plotter <- function(x, m){
  # Plot the ten most frequent m-grams
  top <- top_n(x, 10)
  top$value <- reorder(top$value, -top$n)
  label <- paste(as.character(m), "-grams", sep = "")
  title <- paste("Frequency of usage of", label, sep = " ")
  ggplot(data = top, aes(x = value, y = n)) +
    geom_col() +
    xlab(label) + ylab("Frequency") + ggtitle(title) + theme(axis.text.x = element_text(angle = 90))
}

Plots

library(ggpubr)
one_gram <- ngram_freqs(1)
two_gram <- ngram_freqs(2)
three_gram <- ngram_freqs(3)
ggarrange(plotter(one_gram, 1), plotter(two_gram, 2), plotter(three_gram, 3), nrow = 2, ncol = 2)
## Selecting by n
## Selecting by n
## Selecting by n

Percentage Coverage of Total Words

Function

# Proportion of all word occurrences accounted for by each unique word
word_freq <- one_gram
word_freq$proportion <- word_freq$n/sum(word_freq$n)
# Walk down the frequency-sorted words, accumulating each word's share of all
# tokens, until the target proportion p is covered
coverage <- function(p){
  l <- 0
  i <- 1
  while (l < p) {
    l <- l + word_freq$proportion[i]
    i <- i + 1
  }
  i
}
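
The same idea can be expressed without a loop by taking the cumulative sum of the sorted proportions. This is just an alternative sketch; because it counts the words themselves rather than the loop index, its result can differ by one from the function above.

# Index of the first word at which the cumulative proportion reaches p
coverage_cumsum <- function(p){
  which(cumsum(word_freq$proportion) >= p)[1]
}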

Now let’s see how many of the most frequent words are needed to cover a given proportion of all word occurrences.

props <- seq(0.1, 0.9, by = 0.1)
d <- data.frame(props, sapply(props, coverage))
colnames(d) <- c("Proportion", "Words Needed")
d
##   Proportion Words Needed
## 1        0.1            4
## 2        0.2           11
## 3        0.3           27
## 4        0.4           63
## 5        0.5          146
## 6        0.6          378
## 7        0.7          950
## 8        0.8         2362
## 9        0.9         6760

Language Detection

We can use the cld3 package, which provides Google’s Compact Language Detector 3 neural-network model, for language identification.

library(cld3)
data_clean$language <- detect_language(data_clean$sample)
head(data_clean)
##   ID
## 1  1
## 2  2
## 3  3
## 4  4
## 5  5
## 6  6
##                                                                                                          sample
## 1 how are you  btw thanks for the rt  you gonna be in dc anytime soon  love to see you  been way  way too long 
## 2 when you meet someone special  you'll know  your heart will beat more rapidly and you'll smile for no reason 
## 3                                                                      they've decided its more fun if i don t 
## 4                          so tired D; played lazer tag   ran a lot D; ughh going to sleep like in 5 minutes ;)
## 5                                               words from a complete stranger  made my birthday even better :)
## 6                                 first cubs game ever  wrigley field is gorgeous  this is perfect  go cubs go 
##   language
## 1       en
## 2       en
## 3       en
## 4       en
## 5       en
## 6       en

This model isn’t completely accurate, but it performs quite well on our sample.
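
For a quick sanity check, detect_language can also be called directly on a few short strings; cld3 tends to be reliable on longer text but can mislabel very short or noisy lines (an illustrative call, output not shown):

cld3::detect_language(c("this is clearly english",
                        "ceci est clairement du français",
                        "lol idk tbh"))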

Increasing Word Coverage

We could supply synonyms of the predicted word to cover more possible cases. A preliminary search turns up the wordnet package, which provides a getSynonyms function.
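
A minimal sketch of how that lookup might work, assuming the wordnet package and a local WordNet dictionary are installed (setDict() may be needed to point at the dictionary directory):

library(wordnet)
# Look up adjective synonyms for "good"; requires a local WordNet installation
filter <- getTermFilter("ExactMatchFilter", "good", TRUE)
terms <- getIndexTerms("ADJECTIVE", 1, filter)
getSynonyms(terms[[1]])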

Plans for Modeling

I want to predict the next word, and also use sentiment analysis to suggest emojis or emoticons the user might want to insert into their message or tweet.
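
As a rough illustration of the emoji idea (not part of the pipeline above), sentence-level sentiment could be scored with a package such as syuzhet and mapped to a candidate emoticon; the positive/negative thresholds here are arbitrary placeholders.

library(syuzhet)
# Score each cleaned line and suggest a placeholder emoticon based on the sign
scores <- get_sentiment(data_clean$sample, method = "syuzhet")
suggestion <- ifelse(scores > 0, ":)", ifelse(scores < 0, ":(", ""))
head(data.frame(sample = data_clean$sample, suggestion), 3)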