####Loading the necessary libraries
library(stringi)
library(tm)
## Warning: package 'tm' was built under R version 4.3.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 4.3.3
library(SnowballC)
library(RWeka)
## Warning: package 'RWeka' was built under R version 4.3.3
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.3.3
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.3.3
## Loading required package: RColorBrewer
##Synopsis This milestone report is part of the Coursera Data Science Specialization Capstone Project. The primary objective is to conduct exploratory data analysis on datasets in four languages: German, English (US), Finnish and Russian. The analysis focuses on the English dataset, comprising text from Twitter, blogs and news sources.
The capstone project's goal is to develop a predictive text model using a large corpus of text data, enabling next-word prediction. Ultimately, the model will be implemented as a Shiny web application.
##Getting the Data
####Downloading the Data
The dataset is downloaded from the following URL: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip (Capstone Dataset).
if (!file.exists("Coursera-SwiftKey.zip(1)")) {
download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip","Coursera-SwiftKey.zip")
unzip("Coursera-SwiftKey.zip")
}
download.file("http://www.cs.cmu.edu/~biglou/resources/bad-words.txt","bad-words.txt")
getwd()
## [1] "C:/Users/Lenovo/Desktop/final/en_US"
twitter.url <- "./en_US.twitter.txt"
blog.url <- "./en_US.blogs.txt"
news.url <- "./en_US.news.txt"
twitter <- readLines(twitter.url, skipNul = TRUE, encoding = "UTF-8")
blog <- readLines(blog.url, skipNul = TRUE, encoding = "UTF-8")
news.file <- file(news.url,"rb")
news <- readLines(news.file, skipNul = TRUE, encoding = "UTF-8")
close(news.file)
##Basic Summary of Data
Once the data are loaded into R, a basic summary of each dataset's characteristics is computed to get a high-level view of the data.
create_summary_table <- function(twitter,blog,news){
stats <- data.frame(source = c("twitter","blog","news"),
arraySizeMB = c(object.size(twitter)/1024^2,object.size(blog)/1024^2,object.size(news)/1024^2),
fileSizeMB = c(file.info(twitter.url)$size/1024^2,file.info(blog.url)$size/1024^2,file.info(news.url)$size/1024^2),
lineCount = c(length(twitter),length(blog),length(news)),
wordCount = c(sum(stri_count_words(twitter)),sum(stri_count_words(blog)),sum(stri_count_words(news))),
charCount = c(stri_stats_general(twitter)[3],stri_stats_general(blog)[3],stri_stats_general(news)[3])
)
print(stats)
}
create_summary_table(twitter,blog,news)
## source arraySizeMB fileSizeMB lineCount wordCount charCount
## 1 twitter 318.9897 159.3641 2360148 30096690 162096241
## 2 blog 255.3545 200.4242 899288 37546806 206824382
## 3 news 257.3404 196.2775 1010242 34762658 203223154
##Sampling the data
The datasets are quite large, so 10,000 lines are sampled from each dataset and combined into a single dataset.
set.seed(1805)
sampleData <- c(sample(twitter,10000),sample(blog,10000),sample(news,10000))
str(sampleData)
## chr [1:30000] "So true, this would be a good quote :)" ...
##Cleaning the Data
The raw data is converted into a corpus, a data structure suitable for natural language processing (NLP). The cleaning process involves several steps to prepare the text for meaningful analysis. This includes removing non-ASCII characters and applying transformations to standardize the content.
For exploratory analysis, stopwords are removed to focus on more relevant terms. However, in the final predictive model, stopwords will be retained to ensure the model can predict all types of words accurately.
corpus <- VCorpus(VectorSource(sampleData))
toSpace <- content_transformer(function(x, pattern) {return (gsub(pattern," ",x))})
#Cleaning all non ASCII characters
corpus <- tm_map(corpus,toSpace,"[^[:graph:]]")
#Transforming all data to lower case
corpus <- tm_map(corpus,content_transformer(tolower))
#Deleting all English stopwords and any stray letters left my the non-ASCII removal
corpus <- tm_map(corpus,removeWords,c(stopwords("english"),letters))
#Removing Punctuation
corpus <- tm_map(corpus,removePunctuation)
#Removing Numbers
corpus <- tm_map(corpus,removeNumbers)
#Removing Profanities
profanities = readLines('bad-words.txt')
corpus <- tm_map(corpus, removeWords, profanities)
#Removing all stray letters left by the last two calls
corpus <- tm_map(corpus,removeWords,letters)
#Striping all extra whitespace
corpus <- tm_map(corpus,stripWhitespace)
print(corpus)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 30000
##Exploratory Analysis
Exploratory data analysis is now performed on the cleaned data. First, the n-gram matrices are created for n = 1, 2, 3.
####Creating N-grams
#Creating a unigram, bigram, trigram document text matrix
unigramTokenizer <- function(x) {NGramTokenizer(x, Weka_control(min = 1, max = 1))}
unigrams <- DocumentTermMatrix(corpus, control = list(tokenize = unigramTokenizer))
BigramTokenizer <- function(x) {NGramTokenizer(x, Weka_control(min = 2, max = 2))}
bigrams <- DocumentTermMatrix(corpus, control = list(tokenize = BigramTokenizer))
TrigramTokenizer <- function(x) {NGramTokenizer(x, Weka_control(min = 3, max = 3))}
trigrams <- DocumentTermMatrix(corpus, control = list(tokenize = TrigramTokenizer))
####Most Frequent Terms per N-gram
The most frequent n-grams for n = 1, 2, 3 are shown below.
freqTerms <- findFreqTerms(unigrams,lowfreq = 1000)
unigrams_frequency <- sort(colSums(as.matrix(unigrams[,freqTerms])),decreasing = TRUE)
unigrams_freq_df <- data.frame(word = names(unigrams_frequency), frequency = unigrams_frequency)
wordcloud(unigrams_freq_df$word,unigrams_freq_df$frequency,scale=c(3,.1), colors = brewer.pal(7, "Dark2"), random.order = TRUE, random.color = TRUE, rot.per = 0.35)
freqTerms <- findFreqTerms(bigrams,lowfreq = 75)
bigrams_frequency <- sort(colSums(as.matrix(bigrams[,freqTerms])),decreasing = TRUE)
bigrams_freq_df <- data.frame(word = names(bigrams_frequency), frequency = bigrams_frequency)
wordcloud(bigrams_freq_df$word,bigrams_freq_df$frequency,scale=c(3,.1), colors = brewer.pal(7, "Dark2"), random.order = TRUE, random.color = TRUE, rot.per = 0.35)
freqTerms <- findFreqTerms(trigrams,lowfreq = 10)
trigrams_frequency <- sort(colSums(as.matrix(trigrams[,freqTerms])),decreasing = TRUE)
trigrams_freq_df <- data.frame(word = names(trigrams_frequency), frequency = trigrams_frequency)
wordcloud(trigrams_freq_df$word,trigrams_freq_df$frequency,scale=c(3,.1), colors = brewer.pal(7, "Dark2"), random.order = TRUE, random.color = TRUE, rot.per = 0.35)
##Graphs
The graphs below show the most common n-grams.
g <- ggplot(unigrams_freq_df,aes(x=reorder(word,-frequency),y=frequency))+geom_bar(stat="identity",fill="blue") + xlab("Unigram") + ylab("Frequency") +labs(title="Most common unigrams") + theme(axis.text.x=element_text(angle=55, hjust=1))
g
g <- ggplot(bigrams_freq_df,aes(x=reorder(word,-frequency),y=frequency))+geom_bar(stat="identity",fill="blue") + xlab("Bigram") + ylab("Frequency") +labs(title="Most common bigrams") + theme(axis.text.x=element_text(angle=55, hjust=1))
g
g <- ggplot(trigrams_freq_df,aes(x=reorder(word,-frequency),y=frequency))+geom_bar(stat="identity",fill="blue") + xlab("Trigram") + ylab("Frequency") +labs(title="Most common trigrams") + theme(axis.text.x=element_text(angle=55, hjust=1))
g
##Prediction Algorithm and Shiny App
After completing the exploratory analysis, the project will progress to finalizing a predictive algorithm. This model will be deployed as an interactive Shiny application, accompanied by a presentation deck summarizing the results.
The predictive model will use an n-gram backoff algorithm. It will prioritize the most common 3-grams or 4-grams containing the provided text and make predictions based on frequency. If no suitable match is found, the model will revert to progressively smaller n-grams, down to unigrams. The training dataset for the model will be larger than the one used in the exploratory analysis. Additionally, the model will include fallback suggestions based on the most common unigrams, with smoothed probabilities, in cases where larger n-grams fail to provide predictions.
The Shiny app will allow users to input text and instantly receive a prediction for the next word, providing an interactive experience.