Data Source: https://github.com/MrJay10/banking-faq-bot/blob/master/BankFAQs.csv
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
# Chatbot for Bank FAQs
# Methodology
# 1. Convert training questions into document term matrix (sparse matrix with 1s and 0s)
# 2. Match the matrix of each training question with its corresponding answer to form a training matrix
# 3. Train SVM model with the training matrix
# 4. Propose a testing question
# 5. Convert the testing question into document term matrix (sparse matrix with 1s and 0s)
# 6. Align the testing DTM with the training vocabulary: terms that appear in the testing question keep their values, and every other training term is set to 0
# 7. Predict the answer with the trained SVM model
# Load the FAQ table from GitHub. The script later uses its `Question` and
# `Answer` columns; text columns are read in as factors.
data <- read.csv(
  url("https://raw.githubusercontent.com/MrJay10/banking-faq-bot/master/BankFAQs.csv"),
  stringsAsFactors = TRUE
)
# 1. Convert training questions into document term matrix (sparse matrix with 1s and 0s)
#clean the text
library(SnowballC)
library(tm)
## Warning: package 'tm' was built under R version 4.2.3
## Loading required package: NLP
# Normalise every training question: lower-case it, drop digits and
# punctuation, stem each word, then collapse repeated whitespace.
# Stop-word removal is left disabled:
#   tm_map(removeWords, stopwords())
corpus <- VCorpus(VectorSource(data$Question)) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stemDocument) %>%
  tm_map(stripWhitespace)

# Document-term matrix: one row per question, one column per term.
dtm <- DocumentTermMatrix(corpus)

# Dense data-frame copy of the DTM; this is the feature table used for training.
dataset <- dtm %>%
  as.matrix() %>%
  as.data.frame()
# 2. Match the matrix of each training question with its corresponding answer to form a training matrix
# Training matrix: the answer label followed by the term columns of each question.
data_train <- cbind(data["Answer"], dataset)
# Check size of training data
dim(data_train)
## [1] 1764 1040
# Sample 30% of the questions (rows) for training. The size is derived from
# the row count rather than a hard-coded number, so it stays valid if the
# remote CSV gains or loses rows.
train_fraction <- 0.3
train_rows <- sample(
  seq_len(nrow(data_train)),
  size = floor(nrow(data_train) * train_fraction)
)
train_dt <- data_train[train_rows, ]
# 3. Train SVM model with the training matrix
library("e1071")
# Linear-kernel C-classification on the raw (unscaled) term counts.
svmfit <- svm(
  Answer ~ .,
  data = train_dt,
  kernel = "linear",
  cost = 100,
  scale = FALSE,
  type = "C-classification"
)
# 4. Propose a testing question and build the prediction function
# Predict the FAQ answer for one or more free-text questions.
#
# Args:
#   x: Question text; a character vector (or anything as.character() accepts).
#      Each element is treated as a separate question.
#
# Returns:
#   A character vector with one "Answer: <text>" string per question.
#
# Uses two objects created earlier in this script: `dataset` (the training
# document-term data frame, which supplies the vocabulary) and `svmfit`
# (the trained SVM).
pred <- function(x) {
  # 5. Convert the testing question into a document term matrix, applying the
  # same cleaning steps as the training questions.
  corpus <- VCorpus(VectorSource(as.character(x)))
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removePunctuation)
  # corpus <- tm_map(corpus, removeWords, stopwords())
  corpus <- tm_map(corpus, stemDocument)
  corpus <- tm_map(corpus, stripWhitespace)
  question_terms <- as.matrix(DocumentTermMatrix(corpus))

  # 6. Align the question with the training vocabulary. Start from an all-zero
  # row per question so no counts from a training row can leak in, then copy
  # the question's counts for the terms the model knows. Terms outside the
  # training vocabulary are dropped; they are not model features.
  vocabulary <- colnames(dataset)
  features <- matrix(
    0,
    nrow = nrow(question_terms),
    ncol = length(vocabulary),
    dimnames = list(NULL, vocabulary)
  )
  shared <- intersect(colnames(question_terms), vocabulary)
  if (length(shared) > 0) {
    features[, shared] <- question_terms[, shared, drop = FALSE]
  }
  data_test <- as.data.frame(features)

  # 7. Predict the answer with the trained SVM model
  answer <- as.character(predict(svmfit, data_test))
  paste("Answer:", answer)
}
# Example call: a blank question contains no terms of its own; the line below
# shows what the model returned for it in this run.
pred(" ")
## [1] "Answer: In case your card gets lost or stolen, all you have to do is call HDFC Bank PhoneBanking immediately and report the loss of your card. This facility is available 24 hrs. on all days. In case if you have been issued a back-up card in the kit, you can activate the back-up card by calling HDFC Bank Phone Banking or through Prepaid NetBanking. Once the back-up card is activated, all the funds from primary card will be transferred automatically to the back up card."