Let’s read a 3,000-line sample from each of the three files and take a look at their basic statistics.
con <- file("F:/Capstone/final/en_US/en_US.blogs.txt", "r")
sample <- readLines(con, 3000)
close(con)
data_blogs <- as.data.frame(sample)
con <- file("F:/Capstone/final/en_US/en_US.twitter.txt", "r")
sample <- readLines(con, 3000)
close(con)
data_twitter <- as.data.frame(sample)
con <- file("F:/Capstone/final/en_US/en_US.news.txt", "r")
sample <- readLines(con, 3000)
close(con)
data_news <- as.data.frame(sample)
data<- rbind(data_twitter, data_blogs, data_news)
data$ID <- seq.int(nrow(data))
data$sample <- as.character(data$sample)
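As a quick sanity check, here is a minimal base-R sketch of some basic statistics for the three samples; the word counts are simple whitespace splits rather than proper tokenization:
# Per-source summary of the 3,000-line samples: line count, rough word count
# (whitespace-separated), and average characters per line
summary_stats <- data.frame(
  source = c("twitter", "blogs", "news"),
  lines = c(nrow(data_twitter), nrow(data_blogs), nrow(data_news)),
  words = c(sum(lengths(strsplit(as.character(data_twitter$sample), " "))),
            sum(lengths(strsplit(as.character(data_blogs$sample), " "))),
            sum(lengths(strsplit(as.character(data_news$sample), " ")))),
  chars_per_line = c(mean(nchar(as.character(data_twitter$sample))),
                     mean(nchar(as.character(data_blogs$sample))),
                     mean(nchar(as.character(data_news$sample))))
)
summary_stats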
Let’s use the udpipe package for word tokenization.
library(udpipe)
word_tokens <- strsplit.data.frame(data, term = "sample", group = "ID", split = " ")
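To make the shape of the result concrete, here is a toy illustration (not part of the pipeline): strsplit.data.frame returns one row per token while keeping the grouping ID, which is what lets us regroup the tokens later with paste.data.frame.
# Toy example: one short sentence should split into one row per token
toy <- data.frame(ID = 1, sample = "so tired :)", stringsAsFactors = FALSE)
strsplit.data.frame(toy, term = "sample", group = "ID", split = " ")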
Now, let’s lower-case the tokens and strip all punctuation from every word EXCEPT emoticons. We’ll use the lexicon package’s hash_emoticons dataset to identify emoticons, and its key_contractions dataset so that apostrophes inside contractions are preserved. We can then filter out profanity with the profanity_zac_anger word list.
library(lexicon)
# Strip punctuation from non-emoticon tokens; leave emoticons and contractions intact
for (i in seq_along(word_tokens$sample)) {
  if (!(word_tokens$sample[i] %in% hash_emoticons$x)) {
    word_tokens$sample[i] <- tolower(word_tokens$sample[i])
    if (!(word_tokens$sample[i] %in% key_contractions$contraction)) {
      word_tokens$sample[i] <- gsub('[[:punct:] ]+', ' ', word_tokens$sample[i])
    }
  }
}
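For reference, the same cleaning step can be written without an explicit loop. This vectorized sketch mirrors the logic above and uses the same lexicon datasets:
# Vectorized equivalent of the loop: lower-case non-emoticon tokens, then strip
# punctuation from tokens that are neither emoticons nor contractions
is_emoticon <- word_tokens$sample %in% hash_emoticons$x
lowered <- ifelse(is_emoticon, word_tokens$sample, tolower(word_tokens$sample))
keep_punct <- is_emoticon | lowered %in% key_contractions$contraction
word_tokens$sample <- ifelse(keep_punct, lowered, gsub('[[:punct:] ]+', ' ', lowered))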
# Remove profanity using the profanity_zac_anger word list
word_tokens <- word_tokens[!(word_tokens$sample %in% profanity_zac_anger), ]
# Regroup tokens back into one string per document for later use
data_clean <- paste.data.frame(word_tokens, term = "sample", group = "ID", collapse = " ")
head(data_clean)
## ID
## 1 1
## 2 2
## 3 3
## 4 4
## 5 5
## 6 6
## sample
## 1 how are you btw thanks for the rt you gonna be in dc anytime soon love to see you been way way too long
## 2 when you meet someone special you'll know your heart will beat more rapidly and you'll smile for no reason
## 3 they've decided its more fun if i don t
## 4 so tired D; played lazer tag ran a lot D; ughh going to sleep like in 5 minutes ;)
## 5 words from a complete stranger made my birthday even better :)
## 6 first cubs game ever wrigley field is gorgeous this is perfect go cubs go
Let’s observe the frequencies of single words, 2-grams and 3-grams.

### Function
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tokenizers)
library(reshape)
##
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
##
## rename
ngram_freqs <- function(m){
  # Tokenize into m-grams, flatten the list into a data frame, drop NAs,
  # and count occurrences of each m-gram (sorted by frequency)
  ngrams <- melt(tokenize_ngrams(data_clean$sample, n = m))
  ngrams <- ngrams[!is.na(ngrams$value), ]
  ngrams %>% count(ngrams$value, sort = TRUE)
}
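Note that because the expression ngrams$value is passed to count(), the grouping column in the result is literally named ngrams$value, and the count column is n; the plotting function below relies on both names. A quick check:
# Expected column names: "ngrams$value" and "n"
names(ngram_freqs(1))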
plotter <- function(x, m){
  # Plot the 10 most frequent m-grams as a bar chart
  top <- top_n(x, 10)
  top$`ngrams$value` <- reorder(top$`ngrams$value`, -top$n)
  label <- paste(as.character(m), "-grams", sep = "")
  title <- paste("Frequency of usage of", label, sep = " ")
  ggplot(data = top, aes(x = `ngrams$value`, y = n)) +
    geom_col() +
    xlab(label) + ylab("Frequency") + ggtitle(title) +
    theme(axis.text.x = element_text(angle = 90))
}
library(ggpubr)
one_gram <- ngram_freqs(1)
two_gram <- ngram_freqs(2)
three_gram <- ngram_freqs(3)
ggarrange(plotter(one_gram, 1), plotter(two_gram, 2), plotter(three_gram, 3), nrow = 2, ncol = 2)
## Selecting by n
## Selecting by n
## Selecting by n
## Percentage Coverage of Total Words

### Function
word_freq <- one_gram
word_freq$proportion <- word_freq$n / sum(word_freq$n)

coverage <- function(p){
  # Walk down the frequency-sorted word list, accumulating proportions
  # until coverage p is reached; i ends one past the last word added
  l <- 0
  i <- 1
  while (l < p) {
    l <- l + word_freq$proportion[i]
    i <- i + 1
  }
  i
}
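Since word_freq is already sorted by descending frequency, the same value can be computed with a cumulative sum; the + 1 mirrors the loop’s final increment of i:
# One-line equivalent of coverage(): index of the first word whose cumulative
# proportion reaches p, plus one to match the loop's trailing increment
coverage_cumsum <- function(p) which(cumsum(word_freq$proportion) >= p)[1] + 1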
Now let’s see how many of the most frequent words are needed to cover a given proportion of all word occurrences.
# Tabulate the number of top words needed to cover 10%, 20%, ..., 90% of all word occurrences
s <- c()
t <- c()
for (i in 1:9) {
  s <- append(s, coverage(i/10), after = length(s))
  t <- append(t, i/10, after = length(t))
}
d <- as.data.frame(cbind(t, s))
colnames(d) <- c("Proportion", "Words Needed")
d
## Proportion Words Needed
## 1 0.1 4
## 2 0.2 11
## 3 0.3 27
## 4 0.4 63
## 5 0.5 146
## 6 0.6 378
## 7 0.7 950
## 8 0.8 2362
## 9 0.9 6760
We can use the cld3 package, which wraps Google’s Compact Language Detector 3 neural network model, for language identification.
library(cld3)
data_clean$language <- detect_language(data_clean$sample)
head(data_clean)
## ID
## 1 1
## 2 2
## 3 3
## 4 4
## 5 5
## 6 6
## sample
## 1 how are you btw thanks for the rt you gonna be in dc anytime soon love to see you been way way too long
## 2 when you meet someone special you'll know your heart will beat more rapidly and you'll smile for no reason
## 3 they've decided its more fun if i don t
## 4 so tired D; played lazer tag ran a lot D; ughh going to sleep like in 5 minutes ;)
## 5 words from a complete stranger made my birthday even better :)
## 6 first cubs game ever wrigley field is gorgeous this is perfect go cubs go
## language
## 1 en
## 2 en
## 3 en
## 4 en
## 5 en
## 6 en
Evidently, this model isn’t completely accurate, but it’s pretty good!
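If we wanted to keep only text detected as English before building the prediction tables, a simple filter on the new column would do; note that detect_language() returns NA when it cannot decide, so those rows are dropped as well:
# Keep only rows confidently detected as English
data_en <- data_clean[!is.na(data_clean$language) & data_clean$language == "en", ]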
We could also provide synonyms of our predicted output to cover more possible cases. A preliminary search yields a package named wordnet, which provides a getSynonyms function.
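A minimal sketch of what that might look like, assuming a local WordNet dictionary is installed and configured (for example via setDict()); the query word here is just an example:
library(wordnet)
# synonyms() is a convenience wrapper around getIndexTerms()/getSynonyms();
# it requires a working WordNet installation on the machine
synonyms("happy", "ADJECTIVE")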
I want to predict likely next words, and also suggest potential emoticons the user might want to insert into their message or tweet, using sentiment analysis.
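As a rough sketch of that idea, we could score the sentiment of the message typed so far and map the sign of the score to an emoticon. The syuzhet package’s get_sentiment() is one possible scorer, and the mapping below is purely illustrative:
library(syuzhet)
# Hypothetical helper: suggest a smiley for positive sentiment, a frown for
# negative sentiment, and nothing when the score is neutral
suggest_emoticon <- function(text){
  score <- get_sentiment(text)
  if (score > 0) ":)" else if (score < 0) ":(" else ""
}
suggest_emoticon("words from a complete stranger made my birthday even better")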