This dataset was taken from Kaggle and is used to predict whether a product will be recommended. A corpus was built from the Title variable with the tm package, the most frequent words were analysed and used as predictors, and a few new variables introduced after EDA produced better results.
set.seed(1)
setwd("C:\\R Programming\\Women's Clothing Reviews")
data<-read.csv("data.csv",stringsAsFactors = FALSE,na.strings = c(""," "))
library(tm)
## Loading required package: NLP
library(SnowballC)
library(caTools)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(textstem)
## Loading required package: koRpus.lang.en
## Loading required package: koRpus
## Loading required package: data.table
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(ngram)
Sys.setenv(JAVA_HOME='C:\\Program Files\\Java\\jre-10.0.2')
library(RWeka)
##
## Attaching package: 'RWeka'
## The following object is masked from 'package:caTools':
##
## LogitBoost
data$notNull <- as.factor(ifelse(is.na(data$Title), 0, 1))  # flag: did the review have a title at all?
data$Title[is.na(data$Title)] <- "Missing"
corpus = VCorpus(VectorSource(data$Title))
corpus = tm_map(corpus, content_transformer(tolower))
corpus = tm_map(corpus, removePunctuation)
exceptions <- c("not","too","bad","just","no","but")
my_stopwords <- setdiff(stopwords("en"), exceptions)
corpus = tm_map(corpus, removeWords, my_stopwords)
corpus <- tm_map(corpus, lemmatize_strings)  # textstem returns plain character vectors...
corpus <- tm_map(corpus, PlainTextDocument)  # ...so rewrap them as PlainTextDocuments for tm
corpus = tm_map(corpus, stemDocument)        # stem on top of lemmatization to shrink the vocabulary
#BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 3))
#frequencies <- TermDocumentMatrix(corpus, control = list(tokenize = BigramTokenizer))
#frequencies = DocumentTermMatrix(corpus)
frequencies = DocumentTermMatrix(corpus,control = list(weighting = function(x) weightTfIdf(x, normalize = TRUE)))
## Warning in weightTfIdf(x, normalize = TRUE): empty document(s):
## character(0) character(0) character(0) ... (one per title emptied by cleaning)
sparse = removeSparseTerms(frequencies, 0.98)
TitleSparse = as.data.frame(as.matrix(sparse))
colnames(TitleSparse) = make.names(colnames(TitleSparse))
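Before attaching the labels, it is worth sanity-checking the retained vocabulary. A minimal sketch (column sums of the tf-idf weights as a rough importance proxy; not part of the original output):
head(sort(colSums(TitleSparse), decreasing = TRUE), 10)  # top-weighted terms surviving the sparsity cut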
TitleSparse$Rating = as.factor(data$Rating)
TitleSparse$Recommended.IND = as.factor(data$Recommended.IND)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
## # A tibble: 5 x 2
## Rating `median(Age)`
## <int> <dbl>
## 1 1 42
## 2 2 41
## 3 3 40
## 4 4 41
## 5 5 41
## # A tibble: 2 x 2
## Recommended.IND `median(Age)`
## <int> <dbl>
## 1 0 40
## 2 1 41
## # A tibble: 4 x 2
## Division.Name `median(Age)`
## <chr> <dbl>
## 1 General 41
## 2 General Petite 41
## 3 Initmates 39
## 4 <NA> 38
## # A tibble: 7 x 2
## Department.Name `median(Age)`
## <chr> <dbl>
## 1 Bottoms 41
## 2 Dresses 40
## 3 Intimate 39
## 4 Jackets 42
## 5 Tops 42
## 6 Trend 43
## 7 <NA> 38
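The median-age summaries above come from a hidden chunk; a sketch of the kind of dplyr call that likely produced them (an assumption, since the chunk itself is not shown):
data %>% group_by(Rating) %>% summarise(median(Age))
data %>% group_by(Division.Name) %>% summarise(median(Age))  # likewise for Recommended.IND and Department.Name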
Now only the terms from the Title corpus, plus a few engineered variables, are used to build the models.
TitleSparse$Positive.Feedback.Count<-NULL
TitleSparse$Recommended.IND<-as.factor(data$Recommended.IND)
TitleSparse$containsdots <- ifelse(grepl("\\.\\.\\.", data$Title), 1, 0)  # title contains an ellipsis
TitleSparse$containsexclamation <- ifelse(grepl("!", data$Title, fixed = TRUE), 1, 0)
TitleSparse$containsquestionmark <- ifelse(grepl("?", data$Title, fixed = TRUE), 1, 0)
explore<-TitleSparse %>% group_by(containsdots) %>% summarise(mean(as.numeric(Recommended.IND)-1))
print(explore)
## # A tibble: 2 x 2
## containsdots `mean(as.numeric(Recommended.IND) - 1)`
## <dbl> <dbl>
## 1 0 0.829
## 2 1 0.605
explore<-TitleSparse %>% group_by(containsexclamation) %>% summarise(mean(as.numeric(Recommended.IND)-1))
print(explore)
## # A tibble: 2 x 2
## containsexclamation `mean(as.numeric(Recommended.IND) - 1)`
## <dbl> <dbl>
## 1 0 0.800
## 2 1 0.924
explore<-TitleSparse %>% group_by(containsquestionmark) %>% summarise(mean(as.numeric(Recommended.IND)-1))
print(explore)
## # A tibble: 2 x 2
## containsquestionmark `mean(as.numeric(Recommended.IND) - 1)`
## <dbl> <dbl>
## 1 0 0.823
## 2 1 0.669
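As a reference point for these group means, the overall recommendation rate gives the baseline (a quick check, not in the original output):
mean(as.numeric(TitleSparse$Recommended.IND) - 1)  # overall rate, roughly 0.82 judging by the tables above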
splits <- sample.split(TitleSparse$Recommended.IND, SplitRatio = 0.8)  # stratified split on the outcome
train <- subset(TitleSparse, splits == TRUE)
test <- subset(TitleSparse, splits == FALSE)
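Since sample.split stratifies on the outcome, both partitions should keep roughly the same recommendation rate; a quick verification (optional):
prop.table(table(train$Recommended.IND))  # proportions in the training set...
prop.table(table(test$Recommended.IND))   # ...should closely match the test set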
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
library(caret)
## Loading required package: lattice
output.tree <- train(factor(Recommended.IND)~.,data=train,method="ctree",trControl=trainControl(method="cv",number = 5))
predictionsctree<-predict(output.tree,test,type="prob")[,2]
predictionsctreetrain<-predict(output.tree,train,type="prob")[,2]
plot(output.tree)
print(output.tree)
## Conditional Inference Tree
##
## 18381 samples
## 22 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 14704, 14704, 14705, 14706, 14705
## Resampling results across tuning parameters:
##
## mincriterion Accuracy Kappa
## 0.01 0.8373859 0.2525123
## 0.50 0.8373315 0.2477633
## 0.99 0.8369510 0.2203518
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mincriterion = 0.01.
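For comparability with the models below, the ctree confusion table on the test set can be printed the same way (not run in the original):
print(table(predictionsctree > 0.5, test$Recommended.IND))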
modelglm<-train(factor(Recommended.IND)~.,data=train,method="glm",trControl=trainControl(method="cv",number = 5))
predictionsGLM<-data.frame(predict(modelglm,test,type="prob"))
predictionsGLM<-predictionsGLM[,2]
predictionsGLMtrain<-data.frame(predict(modelglm,train,type="prob"))
predictionsGLMtrain<-predictionsGLMtrain[,2]
print(table(predictionsGLM>0.5,test$Recommended.IND))
##
## 0 1
## FALSE 145 96
## TRUE 748 4116
print(modelglm)
## Generalized Linear Model
##
## 18381 samples
## 22 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 14704, 14704, 14706, 14705, 14705
## Resampling results:
##
## Accuracy Kappa
## 0.8339044 0.1954241
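From the confusion table above, test accuracy works out to (145 + 4116) / 5105 ≈ 0.835. Since caTools is already loaded, its colAUC gives a threshold-free view of the same probabilities (a supplementary check, not in the original run):
(145 + 4116) / (145 + 96 + 748 + 4116)        # test accuracy from the table above
colAUC(predictionsGLM, test$Recommended.IND)  # AUC of the GLM probabilities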
modelgbm <- train(factor(Recommended.IND)~., data = train, method = "gbm", trControl = trainControl(method = "cv", number = 5), verbose = FALSE)  # verbose = FALSE suppresses gbm's per-iteration log
predictionsGBM<-predict(modelgbm,test,type="prob")[,2]
predictionsGBMtrain<-predict(modelgbm,train,type="prob")[,2]
plot(modelgbm)
print(modelgbm)
## Stochastic Gradient Boosting
##
## 18381 samples
## 22 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 14706, 14704, 14705, 14705, 14704
## Resampling results across tuning parameters:
##
## interaction.depth n.trees Accuracy Kappa
## 1 50 0.8328709 0.1778144
## 1 100 0.8337414 0.1940634
## 1 150 0.8336326 0.1967619
## 2 50 0.8347205 0.1999170
## 2 100 0.8341767 0.1987011
## 2 150 0.8338504 0.2111095
## 3 50 0.8361895 0.2093033
## 3 100 0.8348840 0.2173995
## 3 150 0.8360809 0.2295391
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 50, interaction.depth
## = 3, shrinkage = 0.1 and n.minobsinnode = 10.
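To see which terms drive the boosted model, caret's varImp can be applied to the fitted object (a quick inspection, not shown in the original output):
print(varImp(modelgbm))  # relative influence of each title term and engineered flag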
library("xgboost")
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
library("Matrix")
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
train$Recommended.IND <- as.numeric(train$Recommended.IND) - 1  # factor levels "0"/"1" -> numeric 0/1
data_variables <- as.matrix(train %>% select(-Recommended.IND))
data_label <- train[, "Recommended.IND"]
data_matrix <- xgb.DMatrix(data = data_variables, label = data_label)
xgb_params <- list(booster = "gbtree", objective = "binary:logistic", eta = 0.1, gamma = 5, max_depth = 10)
xgbcv <- xgb.cv(params = xgb_params, data = data_matrix, nrounds = 100, nfold = 10,
                showsd = TRUE, stratified = TRUE, print_every_n = 10,
                early_stopping_rounds = 20, maximize = FALSE)
## [1] train-error:0.159108+0.001024 test-error:0.162288+0.006938
## Multiple eval metrics are present. Will use test_error for early stopping.
## Will train until test_error hasn't improved in 20 rounds.
##
## [11] train-error:0.160347+0.000871 test-error:0.163104+0.008183
## [21] train-error:0.159960+0.000995 test-error:0.162179+0.007848
## [31] train-error:0.159489+0.001176 test-error:0.162071+0.007700
## [41] train-error:0.158237+0.000882 test-error:0.160819+0.007373
## [51] train-error:0.158080+0.000961 test-error:0.160874+0.007085
## [61] train-error:0.158080+0.000849 test-error:0.160819+0.007199
## Stopping. Best iteration:
## [44] train-error:0.158159+0.000906 test-error:0.160547+0.007129
nround <- xgbcv$best_iteration  # number of boosting rounds chosen by cross-validation
# Refit a single XGBoost model on the full training matrix with that many rounds
bst_model <- xgb.train(params = xgb_params,
                       data = data_matrix,
                       nrounds = nround)
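A similar inspection is available for the booster via xgb.importance (optional, not part of the original run):
importance <- xgb.importance(feature_names = colnames(data_variables), model = bst_model)
head(importance)  # gain, cover and frequency per feature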
test$Recommended.IND<-as.numeric(test$Recommended.IND)-1
test_matrix<-xgb.DMatrix(data = as.matrix(test %>% select(-Recommended.IND)))
predictionsXGBoost<-predict(bst_model,newdata=test_matrix)
predictionsXGBoosttrain<-predict(bst_model,newdata=data_matrix)
print(table(predictionsXGBoost>0.5,test$Recommended.IND))
##
## 0 1
## FALSE 165 104
## TRUE 728 4108
# Stacking: combine the GLM, GBM, XGBoost and ctree predictions with the original
# features and fit a linear model on top
df <- data.frame(train, a = predictionsGLMtrain, b = predictionsGBMtrain, c = predictionsXGBoosttrain, d = predictionsctreetrain)
testdf <- data.frame(test, a = predictionsGLM, b = predictionsGBM, c = predictionsXGBoost, d = predictionsctree)
model <- lm(Recommended.IND ~ ., data = df)
predictions<-predict(model,testdf)
table(predictions>0.5,test$Recommended.IND)
##
## 0 1
## FALSE 195 130
## TRUE 698 4082
## Accuracy increased by stacking the models
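From the confusion tables, the gain is small but consistent: the GLM scores (145 + 4116) / 5105 ≈ 0.835, XGBoost (165 + 4108) / 5105 ≈ 0.837, and the stacked model (195 + 4082) / 5105 ≈ 0.838.
(195 + 4082) / (195 + 130 + 698 + 4082)  # stacked test accuracy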
Using more terms from the corpus might increase accuracy, but the memory footprint grows quickly. Encoding the categorical variables by mean or frequency could raise accuracy as well, as could stacking more models or bagging. N-grams are another option, though they take considerably more space and time to build models with.
Here I achieved an accuracy of ~84% using 22 variables (including 3 engineered ones), which is quite good considering the CPU time and memory spent.