# NOTE(review): this removes every object that ls() lists in the current
# environment, including anything unrelated to this analysis when the script
# is source()d into an interactive session. Consider running the script in a
# fresh R session instead of clearing the workspace here.
rm(list = ls())
library(ggplot2) 
library(readr) 
library(stringr)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(syuzhet)
library(SnowballC)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(e1071)
library(klaR)
## Loading required package: MASS
library(bookdown)
# ---- Load the question-pair data -------------------------------------------
# Both CSV files live in one directory; keeping it in a single variable means
# the script can be pointed at another location by editing one line.
data_dir <- "C:/Users/Jonathan/Desktop"
train <- read.csv(file.path(data_dir, "train.csv"), stringsAsFactors = FALSE)
test <- read.csv(file.path(data_dir, "test.csv"), stringsAsFactors = FALSE)
dim(train)
## [1] 404290      6
dim(test)
## [1] 2345796       3
str(train)
## 'data.frame':    404290 obs. of  6 variables:
##  $ id          : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ qid1        : int  1 3 5 7 9 11 13 15 17 19 ...
##  $ qid2        : int  2 4 6 8 10 12 14 16 18 20 ...
##  $ question1   : chr  "What is the step by step guide to invest in share market in india?" "What is the story of Kohinoor (Koh-i-Noor) Diamond?" "How can I increase the speed of my internet connection while using a VPN?" "Why am I mentally very lonely? How can I solve it?" ...
##  $ question2   : chr  "What is the step by step guide to invest in share market?" "What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?" "How can Internet speed be increased by hacking through DNS?" "Find the remainder when [math]23^{24}[/math] is divided by 24,23?" ...
##  $ is_duplicate: int  0 0 0 0 0 1 0 1 0 0 ...
str(test)
## 'data.frame':    2345796 obs. of  3 variables:
##  $ test_id  : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ question1: chr  "How does the Surface Pro himself 4 compare with iPad Pro?" "Should I have a hair transplant at age 24? How much would it cost?" "What but is the best way to send money from China to the US?" "Which food not emulsifiers?" ...
##  $ question2: chr  "Why did Microsoft choose core m3 and not core i3 home Surface Pro 4?" "How much cost does hair transplant require?" "What you send money to China?" "What foods fibre?" ...
# Work on the first 10000 rows of each file only. head() returns at most
# max_rows rows, whereas indexing with 1:10000 would pad the result with
# all-NA rows if a file ever held fewer rows than that.
max_rows <- 10000
train <- head(train, max_rows)
test <- head(test, max_rows)
# ---- Feature engineering ---------------------------------------------------
# For every question pair in `train` this section builds five features and
# stores them, one row per pair, in the data frame `df`:
#   match_count : 2 * (terms of question1 also found in question2) /
#                 (number of question1 terms + number of question2 terms)
#   p1, p2      : NRC "positive" score of question1 / question2
#   n1, n2      : NRC "negative" score of question1 / question2

# Stop-word list is the same for every question, so build it once.
english_stopwords <- stopwords("english")

# Clean a single question string and return the terms of its
# term-document matrix.
#
# Args:
#   text: one question (character string).
# Returns:
#   The `Terms` dimnames of the TermDocumentMatrix built from the cleaned text.
question_terms <- function(text) {
  corpus <- Corpus(VectorSource(text))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  # content_transformer() is the documented way to apply a plain
  # character function such as tolower() through tm_map().
  corpus <- tm_map(corpus, content_transformer(tolower))
  # NOTE(review): stemming runs before stop-word removal, so stop words whose
  # stem differs from the listed form may survive. Order kept as in the
  # original analysis so the features stay comparable - confirm intent.
  corpus <- tm_map(corpus, stemDocument)
  corpus <- tm_map(corpus, stripWhitespace)
  corpus <- tm_map(corpus, removeWords, english_stopwords)
  corpus <- tm_map(corpus, PlainTextDocument)
  tdm <- TermDocumentMatrix(corpus)
  tdm$dimnames$Terms
}

# Term-overlap score for one pair of questions.
#
# Args:
#   question_a, question_b: the two question strings.
# Returns:
#   A single number. It is NaN (0 / 0) when neither question keeps any term
#   after cleaning; such rows are removed later by na.omit().
term_overlap <- function(question_a, question_b) {
  terms_a <- question_terms(question_a)
  terms_b <- question_terms(question_b)
  shared <- sum(terms_a %in% terms_b)
  (2 * shared) / (length(terms_a) + length(terms_b))
}

# seq_len() iterates zero times for an empty `train` (1:nrow(train) would
# visit rows 1 and 0), and vapply() fills a preallocated numeric vector
# instead of growing a data frame with rbind() on every iteration.
match_count <- vapply(
  seq_len(nrow(train)),
  function(i) term_overlap(train$question1[i], train$question2[i]),
  numeric(1)
)

# get_nrc_sentiment() accepts a character vector and returns one row per
# element, so each question column is scored in a single call.
sentiment_q1 <- get_nrc_sentiment(train$question1)
sentiment_q2 <- get_nrc_sentiment(train$question2)

df <- data.frame(
  match_count = match_count,
  p1 = sentiment_q1$positive,
  p2 = sentiment_q2$positive,
  n1 = sentiment_q1$negative,
  n2 = sentiment_q2$negative
)
# Assemble the modelling frame: columns 6-11 of cbind(train, df) are the
# label column of `train` (is_duplicate, per the str(train) output) followed
# by the five feature columns of `df`.
tr <- cbind(train, df)[, 6:11]
# Columns 1 and 3-6 (the label and the four sentiment scores) become
# factors; column 2 (match_count) stays numeric.
categorical_cols <- c(1, 3:6)
tr[categorical_cols] <- lapply(tr[categorical_cols], as.factor)
# Drop every row that contains a missing value.
tr <- na.omit(tr)
# Fit a random forest that predicts is_duplicate from all other columns of
# `tr`. The fit is stochastic, so the seed is fixed first to make the
# printed error rates repeatable from run to run.
set.seed(42)
model <- randomForest(is_duplicate ~ ., data = tr)
model
## 
## Call:
##  randomForest(formula = is_duplicate ~ ., data = tr) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 31.32%
## Confusion matrix:
##      0    1 class.error
## 0 5079 1207   0.1920140
## 1 1924 1787   0.5184586
# Class probabilities from the random forest, one column per class.
# NOTE(review): these predictions are made on `tr`, the same rows the model
# was fitted on, so any score computed from them is a training-set score.
pred <- predict(model, newdata = tr, type = "prob")
str(pred)
##  matrix [1:9997, 1:2] 0.598 0.944 0.566 1 0.984 0.158 1 0.996 0.944 0.998 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:9997] "1" "2" "3" "4" ...
##   ..$ : chr [1:2] "0" "1"
# Clamp every probability into [1e-15, 1 - 1e-15] so that log() stays finite.
# pmin()/pmax() are vectorised over the whole matrix and keep its dim and
# dimnames; unclass() makes the result a plain numeric matrix.
pred <- pmin(pmax(unclass(pred), 1e-15), 1 - 1e-15)
# Multi-class logarithmic loss.
#
# Args:
#   pred:   numeric matrix of class probabilities, one row per observation
#           and one column per class. When the matrix has column names they
#           are matched against as.character(actual); otherwise columns are
#           taken to follow the level order of as.factor(actual).
#   actual: true class of each observation (factor, character or numeric),
#           with one entry per row of `pred`.
#   eps:    probabilities are clamped into [eps, 1 - eps] before taking the
#           logarithm so the result is always finite.
# Returns:
#   -mean(log(p)), where p is the clamped probability assigned to the true
#   class of each observation.
# Raises:
#   An error if the lengths disagree or a label has no matching column.
logLoss <- function(pred, actual, eps = 1e-15) {
  pred <- as.matrix(pred)
  if (nrow(pred) != length(actual)) {
    stop("`pred` must have one row per element of `actual`", call. = FALSE)
  }

  class_names <- colnames(pred)
  if (is.null(class_names)) {
    true_col <- as.integer(as.factor(actual))
  } else {
    true_col <- match(as.character(actual), class_names)
  }
  if (anyNA(true_col) || any(true_col > ncol(pred))) {
    stop("every value of `actual` needs a matching column in `pred`",
         call. = FALSE)
  }

  # Pick the true-class probability of each row by (row, column) index.
  # Selecting with a "one-hot minus pred > 0" mask would silently drop rows
  # whose true-class probability is exactly 1 and shrink the denominator.
  p_true <- pred[cbind(seq_len(nrow(pred)), true_col)]
  p_true <- pmin(pmax(p_true, eps), 1 - eps)
  -mean(log(p_true))
}
# Log loss of the clamped random-forest probabilities against the labels
# stored in `tr`.
print(logLoss(pred = pred, actual = tr$is_duplicate))
## [1] 1.086917
# Naive Bayes classifier for is_duplicate on the same frame, using a
# Laplace smoothing value of 3; the fitted model is printed below.
Naive_Bayes_model <- naiveBayes(
  is_duplicate ~ .,
  data = tr,
  laplace = 3
)
print(Naive_Bayes_model)
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##         0         1 
## 0.6287886 0.3712114 
## 
## Conditional probabilities:
##    match_count
## Y        [,1]      [,2]
##   0 0.4619241 0.2819891
##   1 0.6945732 0.1914599
## 
##    p1
## Y              0            1            2            3            4
##   0 0.5649278579 0.3175836372 0.0887902331 0.0204534644 0.0057079436
##   1 0.5592175777 0.3419078242 0.0777063237 0.0176848875 0.0018756699
##    p1
## Y              5            6
##   0 0.0017440939 0.0007927699
##   1 0.0008038585 0.0008038585
## 
##    p2
## Y              0            1            2            3            4
##   0 0.5570522979 0.3190174326 0.0900158479 0.0248811410 0.0060221870
##   1 0.5483266399 0.3515394913 0.0784471218 0.0174029451 0.0018741633
##    p2
## Y              5            6            8
##   0 0.0012678288 0.0009508716 0.0007923930
##   1 0.0008032129 0.0008032129 0.0008032129
## 
##    n1
## Y              0            1            2            3            4
##   0 0.7723667513 0.1863895939 0.0334708122 0.0057106599 0.0014276650
##   1 0.7645481362 0.1922767498 0.0362027353 0.0050951998 0.0010726736
##    n1
## Y              5
##   0 0.0006345178
##   1 0.0008045052
## 
##    n2
## Y              0            1            2            3            4
##   0 0.7729001585 0.1816164818 0.0340729002 0.0083993661 0.0011093502
##   1 0.7659973226 0.1892904953 0.0374832664 0.0037483266 0.0010709505
##    n2
## Y              5            6            8
##   0 0.0006339144 0.0006339144 0.0006339144
##   1 0.0008032129 0.0008032129 0.0008032129
# Predicted classes from the naive Bayes model for the rows of `tr`
# (this overwrites the earlier `pred`), cross-tabulated against the true
# labels: predictions in rows, actual values in columns.
pred <- predict(Naive_Bayes_model, newdata = tr)
print(table(pred, tr$is_duplicate))
##     
## pred    0    1
##    0 4307 1360
##    1 1979 2351