library(ggplot2)
library(tm)
library(stringi)
library(magrittr)
library(tidyverse)
library(corpus)
if (!file.exists("./data")) {
  dir.create("./data")
}
setwd("./data")
fileUrl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
download.file(fileUrl, "Coursera-SwiftKey.zip", method="curl")
unzip("Coursera-SwiftKey.zip")
setwd("./final/en_US/")
con_twitter <- file("en_US.twitter.txt", "r")
twitter <- readLines(con_twitter)
close(con_twitter)
con_news <- file("en_US.news.txt", "r")
news <- readLines(con_news)
close(con_news)
con_blogs <- file("en_US.blogs.txt", "r")
blogs <- readLines(con_blogs)
close(con_blogs)
stri_stats_general(twitter)
## Lines LinesNEmpty Chars CharsNWhite
## 2360148 2360148 162096031 134082634
stri_stats_general(news)
## Lines LinesNEmpty Chars CharsNWhite
## 1010242 1010242 203223154 169860866
stri_stats_general(blogs)
## Lines LinesNEmpty Chars CharsNWhite
## 899288 899288 206824382 170389539
For the sake of the analysis, and given the memory limitations of my laptop, I'll extract a sample from the downloaded data.
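Because sample() draws at random, the exact samples (and therefore the counts reported further down) will differ between runs; setting a seed first, with an arbitrary value such as 1234, makes the sampling reproducible.
set.seed(1234)  # arbitrary seed, only to make the sampling reproducible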
twitter_sample <- sample(twitter, 8000)
news_sample <- sample(news, 8000)
blogs_sample <- sample(blogs, 8000)
# Define a function to remove special characters (keeps only letters and spaces)
remove_special_chars <- function(x) {
  x <- gsub("[^A-Za-z ]", "", x)
  return(x)
}
# Define a function to remove duplicated words within each line
remove_double_words <- function(x) {
  words <- unlist(strsplit(x, " "))
  words_unique <- unique(words)
  return(paste(words_unique, collapse = " "))
}
# Define a function to create and clean a corpus
preprocess_corpus <- function(x) {
  x <- VCorpus(VectorSource(x)) %>%
    tm_map(content_transformer(remove_double_words)) %>%   # drop repeated words
    tm_map(content_transformer(remove_special_chars)) %>%  # keep letters and spaces only
    tm_map(content_transformer(tolower)) %>%               # lowercase
    tm_map(content_transformer(stripWhitespace)) %>%       # collapse extra spaces
    tm_map(removeWords, stopwords("en")) %>%               # remove English stop words
    tm_map(content_transformer(stemDocument))              # stem words to their roots
  return(x)
}
twitter_corpus <- preprocess_corpus(twitter_sample)
blogs_corpus <- preprocess_corpus(blogs_sample)
news_corpus <- preprocess_corpus(news_sample)
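As a quick sanity check (not part of the original write-up), one cleaned document can be printed to confirm the transformations behaved as expected.
# Print the first cleaned tweet to verify the preprocessing (sanity check)
as.character(twitter_corpus[[1]])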
# Top 20 single words, bigrams and trigrams (term_stats() is from the corpus package)
stat_word <- function(x) {
  y <- term_stats(x)
  return(head(y, 20))
}
stat_2gram <- function(x) {
  y <- term_stats(x, ngrams = 2)
  return(head(y, 20))
}
stat_3gram <- function(x) {
  y <- term_stats(x, ngrams = 3)
  return(head(y, 20))
}
graph_single_word <- function(x) {
  ggplot(stat_word(x), aes(x = count, y = reorder(term, count))) +
    geom_col(color = "black", fill = "orange") +
    geom_text(aes(label = count), hjust = 1.5) +
    ylab("1-Gram") +
    xlab("Frequency") +
    ggtitle(paste("Top 20 words in", deparse(substitute(x))))
}
graph_single_word(twitter_corpus)
graph_single_word(blogs_corpus)
graph_single_word(news_corpus)
graph_bigrams <- function(x) {
  ggplot(stat_2gram(x), aes(x = count, y = reorder(term, count))) +
    geom_col(color = "black", fill = "lightblue") +
    geom_text(aes(label = count), hjust = 1.5) +
    ylab("2-Gram") +
    xlab("Frequency") +
    ggtitle(paste("Top 20 2-grams in", deparse(substitute(x))))
}
graph_bigrams(twitter_corpus)
graph_bigrams(news_corpus)
graph_bigrams(blogs_corpus)
graph_trigrams <- function(x) {
  ggplot(stat_3gram(x), aes(x = count, y = reorder(term, count))) +
    geom_col(color = "black", fill = "lightblue") +
    geom_text(aes(label = count), hjust = 1.5) +
    ylab("3-Gram") +
    xlab("Frequency") +
    ggtitle(paste("Top 20 3-grams in", deparse(substitute(x))))
}
graph_trigrams(twitter_corpus)
graph_trigrams(news_corpus)
graph_trigrams(blogs_corpus)
# Number of unique words needed to cover a fraction y of all word instances
unique_words <- function(x, y = 0.5) {
  stat <- term_stats(x)
  stat <- mutate(stat, cumulative_ratio = cumsum(count) / sum(count))
  return(which(stat$cumulative_ratio > y)[1])
}
unique_words(twitter_corpus)
## [1] 339
unique_words(twitter_corpus, 0.9)
## [1] 4487
unique_words(news_corpus)
## [1] 671
unique_words(news_corpus, 0.9)
## [1] 6632
unique_words(blogs_corpus)
## [1] 564
unique_words(blogs_corpus, 0.9)
## [1] 6133
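The same cumulative ratio used inside unique_words() can also be plotted as a full coverage curve, which makes the gap between 50% and 90% coverage visible at a glance. This is an added illustration (not one of the original figures), reusing term_stats() and ggplot2.
coverage_curve <- function(x, label) {
  stat <- term_stats(x)                       # terms sorted by descending frequency
  stat <- mutate(stat,
                 rank = seq_len(nrow(stat)),
                 cumulative_ratio = cumsum(count) / sum(count))
  ggplot(stat, aes(x = rank, y = cumulative_ratio)) +
    geom_line(color = "darkgreen") +
    xlab("Number of unique words (most frequent first)") +
    ylab("Fraction of word instances covered") +
    ggtitle(paste("Coverage curve:", label))
}
coverage_curve(twitter_corpus, "twitter_corpus")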
Accurately identifying non-English words in a vector can be challenging, since some words contain non-English characters yet are still legitimate English words.
One approach is to compare the words in the vector against a dictionary of English words: any word not found in the dictionary is treated as non-English. In this sample there are very few non-English words, so the check adds little value here.
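As a sketch of the dictionary approach, assuming the hunspell package (not loaded above) and its bundled en_US dictionary are available, the words from a sample can be checked against the dictionary and the failures flagged as candidate non-English words.
library(hunspell)
# Flag words from a text sample that are unknown to the en_US spell-checking dictionary
flag_non_english <- function(text_sample) {
  words <- unique(unlist(strsplit(tolower(text_sample), "[^a-z]+")))
  words <- words[words != ""]
  words[!hunspell_check(words)]   # FALSE means the word is not in the dictionary
}
head(flag_non_english(twitter_sample), 20)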
Can you think of a way to increase the coverage, for example by identifying words that may not be in the corpora, or by using a smaller number of dictionary words to cover the same number of phrases? There are several ways to increase coverage in natural language processing tasks such as language modeling and text classification. Some of the most common methods are described below.
Out-of-vocabulary (OOV) handling: When the model encounters a word that is not in the vocabulary, it can either ignore the word or replace it with a special token, such as “UNK”. To handle OOV words effectively, you can use techniques such as subword modeling, where you segment words into smaller units (e.g., subwords or characters), so that even if the model has not seen a specific word, it can still make a prediction based on the subwords that make up the word.
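A minimal R version of the UNK idea, written here only as an illustration (the "<UNK>" token and the min_count threshold are arbitrary choices, not part of the analysis above), builds a frequency table from the sample and replaces rare words with the placeholder token.
# Replace rare words (seen fewer than min_count times) with an "<UNK>" token
replace_oov <- function(lines, min_count = 2) {
  words <- unlist(strsplit(lines, " "))
  counts <- table(words)
  vocab <- names(counts[counts >= min_count])
  sapply(strsplit(lines, " "), function(w) {
    paste(ifelse(w %in% vocab, w, "<UNK>"), collapse = " ")
  })
}
head(replace_oov(twitter_sample), 3)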
Data augmentation: Data augmentation refers to generating new training examples from the existing data. For example, you can replace words with synonyms, add random noise to the text, or translate the text into another language and back to the original language. This can help the model learn a more robust representation of the text and increase its coverage.
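A toy example of synonym-based augmentation is sketched below; the synonym table is entirely hand-made for illustration, and a real pipeline would use a proper thesaurus or back-translation instead.
# Hypothetical synonym table, for illustration only
synonyms <- c("good" = "great", "big" = "large", "happy" = "glad")
augment_with_synonyms <- function(lines, syn = synonyms) {
  for (w in names(syn)) {
    lines <- gsub(paste0("\\b", w, "\\b"), syn[[w]], lines)  # whole-word replacement
  }
  lines
}
augmented_sample <- augment_with_synonyms(twitter_sample)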
Transfer learning: Transfer learning refers to using a pre-trained model on a related task and fine-tuning it on a new task. For example, you can use a pre-trained language model like BERT and fine-tune it on a text classification task. This can save time and computation resources and often leads to better performance.
Domain adaptation: Domain adaptation refers to adjusting the model to a specific domain. For example, if you have a pre-trained model that was trained on general-domain text, you can fine-tune it on a specific domain (e.g., medical text, legal text) to increase its coverage of that domain.