This dataset was taken from Kaggle and is used to predict whether a product will be recommended. A corpus was built from the Title variable with the tm package, the most frequent words were analysed and used as predictors, and a few new variables introduced after EDA produced better results.
set.seed(1)
setwd("C:\\R Programming\\Women's Clothing Reviews")
data<-read.csv("data.csv",stringsAsFactors = FALSE,na.strings = c(""," "))
library(tm)
## Loading required package: NLP
library(SnowballC)
library(caTools)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(textstem)
## Loading required package: koRpus.lang.en
## Loading required package: koRpus
## Loading required package: data.table
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(ngram)
Sys.setenv(JAVA_HOME='C:\\Program Files\\Java\\jre-10.0.2')
library(RWeka)
##
## Attaching package: 'RWeka'
## The following object is masked from 'package:caTools':
##
## LogitBoost
data$notNull <- as.factor(ifelse(is.na(data$Title), 0, 1))  # flag: did the review have a title at all?
data$Title[is.na(data$Title)] <- "Missing"
corpus = VCorpus(VectorSource(data$Title))
corpus = tm_map(corpus, content_transformer(tolower))
corpus = tm_map(corpus, removePunctuation)
exceptions <- c("not","too","bad","just","no","but")
my_stopwords <- setdiff(stopwords("en"), exceptions)
corpus = tm_map(corpus, removeWords, my_stopwords)
corpus <- tm_map(corpus, lemmatize_strings)  # textstem returns plain character vectors...
corpus <- tm_map(corpus, PlainTextDocument)  # ...so rewrap them as PlainTextDocuments for tm
corpus = tm_map(corpus, stemDocument)        # stem on top of lemmatization to shrink the vocabulary
#BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 3))
#frequencies <- TermDocumentMatrix(corpus, control = list(tokenize = BigramTokenizer))
#frequencies = DocumentTermMatrix(corpus)
frequencies = DocumentTermMatrix(corpus,control = list(weighting = function(x) weightTfIdf(x, normalize = TRUE)))
## Warning in weightTfIdf(x, normalize = TRUE): empty document(s):
## character(0) character(0) character(0) ... (one per title emptied by cleaning)
sparse = removeSparseTerms(frequencies, 0.98)
TitleSparse = as.data.frame(as.matrix(sparse))
colnames(TitleSparse) = make.names(colnames(TitleSparse))
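Before attaching the labels, it is worth sanity-checking the retained vocabulary. A minimal sketch (column sums of the tf-idf weights as a rough importance proxy; not part of the original output):
head(sort(colSums(TitleSparse), decreasing = TRUE), 10)  # top-weighted terms surviving the sparsity cut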
TitleSparse$Rating = as.factor(data$Rating)
TitleSparse$Recommended.IND = as.factor(data$Recommended.IND)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
## # A tibble: 5 x 2
## Rating `median(Age)`
## <int> <dbl>
## 1 1 42
## 2 2 41
## 3 3 40
## 4 4 41
## 5 5 41
## # A tibble: 2 x 2
## Recommended.IND `median(Age)`
## <int> <dbl>
## 1 0 40
## 2 1 41
## # A tibble: 4 x 2
## Division.Name `median(Age)`
## <chr> <dbl>
## 1 General 41
## 2 General Petite 41
## 3 Initmates 39
## 4 <NA> 38
## # A tibble: 7 x 2
## Department.Name `median(Age)`
## <chr> <dbl>
## 1 Bottoms 41
## 2 Dresses 40
## 3 Intimate 39
## 4 Jackets 42
## 5 Tops 42
## 6 Trend 43
## 7 <NA> 38
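The median-age summaries above come from a hidden chunk; a sketch of the kind of dplyr call that likely produced them (an assumption, since the chunk itself is not shown):
data %>% group_by(Rating) %>% summarise(median(Age))
data %>% group_by(Division.Name) %>% summarise(median(Age))  # likewise for Recommended.IND and Department.Name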
Now only the terms from the Title corpus, plus a few engineered variables, are used to build the models.
TitleSparse$Positive.Feedback.Count<-NULL
TitleSparse$Recommended.IND<-as.factor(data$Recommended.IND)
TitleSparse$containsdots <- ifelse(grepl("\\.\\.\\.", data$Title), 1, 0)  # title contains an ellipsis
TitleSparse$containsexclamation <- ifelse(grepl("!", data$Title, fixed = TRUE), 1, 0)
TitleSparse$containsquestionmark <- ifelse(grepl("?", data$Title, fixed = TRUE), 1, 0)
explore<-TitleSparse %>% group_by(containsdots) %>% summarise(mean(as.numeric(Recommended.IND)-1))
print(explore)
## # A tibble: 2 x 2
## containsdots `mean(as.numeric(Recommended.IND) - 1)`
## <dbl> <dbl>
## 1 0 0.829
## 2 1 0.605
explore<-TitleSparse %>% group_by(containsexclamation) %>% summarise(mean(as.numeric(Recommended.IND)-1))
print(explore)
## # A tibble: 2 x 2
## containsexclamation `mean(as.numeric(Recommended.IND) - 1)`
## <dbl> <dbl>
## 1 0 0.800
## 2 1 0.924
explore<-TitleSparse %>% group_by(containsquestionmark) %>% summarise(mean(as.numeric(Recommended.IND)-1))
print(explore)
## # A tibble: 2 x 2
## containsquestionmark `mean(as.numeric(Recommended.IND) - 1)`
## <dbl> <dbl>
## 1 0 0.823
## 2 1 0.669
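As a reference point for these group means, the overall recommendation rate gives the baseline (a quick check, not in the original output):
mean(as.numeric(TitleSparse$Recommended.IND) - 1)  # overall rate, roughly 0.82 judging by the tables above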
splits <- sample.split(TitleSparse$Recommended.IND, SplitRatio = 0.8)  # stratified split on the outcome
train <- subset(TitleSparse, splits == TRUE)
test <- subset(TitleSparse, splits == FALSE)
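Since sample.split stratifies on the outcome, both partitions should keep roughly the same recommendation rate; a quick verification (optional):
prop.table(table(train$Recommended.IND))  # proportions in the training set...
prop.table(table(test$Recommended.IND))   # ...should closely match the test set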
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
library(caret)
## Loading required package: lattice
output.tree <- train(factor(Recommended.IND)~.,data=train,method="ctree",trControl=trainControl(method="cv",number = 5))
predictionsctree<-predict(output.tree,test,type="prob")[,2]
predictionsctreetrain<-predict(output.tree,train,type="prob")[,2]
plot(output.tree)
print(output.tree)
## Conditional Inference Tree
##
## 18381 samples
## 22 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 14704, 14704, 14705, 14706, 14705
## Resampling results across tuning parameters:
##
## mincriterion Accuracy Kappa
## 0.01 0.8373859 0.2525123
## 0.50 0.8373315 0.2477633
## 0.99 0.8369510 0.2203518
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mincriterion = 0.01.
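For comparability with the models below, the ctree confusion table on the test set can be printed the same way (not run in the original):
print(table(predictionsctree > 0.5, test$Recommended.IND))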
modelglm<-train(factor(Recommended.IND)~.,data=train,method="glm",trControl=trainControl(method="cv",number = 5))
predictionsGLM<-data.frame(predict(modelglm,test,type="prob"))
predictionsGLM<-predictionsGLM[,2]
predictionsGLMtrain<-data.frame(predict(modelglm,train,type="prob"))
predictionsGLMtrain<-predictionsGLMtrain[,2]
print(table(predictionsGLM>0.5,test$Recommended.IND))
##
## 0 1
## FALSE 145 96
## TRUE 748 4116
print(modelglm)
## Generalized Linear Model
##
## 18381 samples
## 22 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 14704, 14704, 14706, 14705, 14705
## Resampling results:
##
## Accuracy Kappa
## 0.8339044 0.1954241
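From the confusion table above, test accuracy works out to (145 + 4116) / 5105 ≈ 0.835. Since caTools is already loaded, its colAUC gives a threshold-free view of the same probabilities (a supplementary check, not in the original run):
(145 + 4116) / (145 + 96 + 748 + 4116)        # test accuracy from the table above
colAUC(predictionsGLM, test$Recommended.IND)  # AUC of the GLM probabilities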
modelgbm <- train(factor(Recommended.IND)~., data = train, method = "gbm", trControl = trainControl(method = "cv", number = 5), verbose = FALSE)  # verbose = FALSE suppresses gbm's per-iteration log
predictionsGBM<-predict(modelgbm,test,type="prob")[,2]
predictionsGBMtrain<-predict(modelgbm,train,type="prob")[,2]
plot(modelgbm)
print(modelgbm)
## Stochastic Gradient Boosting
##
## 18381 samples
## 22 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 14706, 14704, 14705, 14705, 14704
## Resampling results across tuning parameters:
##
## interaction.depth n.trees Accuracy Kappa
## 1 50 0.8328709 0.1778144
## 1 100 0.8337414 0.1940634
## 1 150 0.8336326 0.1967619
## 2 50 0.8347205 0.1999170
## 2 100 0.8341767 0.1987011
## 2 150 0.8338504 0.2111095
## 3 50 0.8361895 0.2093033
## 3 100 0.8348840 0.2173995
## 3 150 0.8360809 0.2295391
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 50, interaction.depth
## = 3, shrinkage = 0.1 and n.minobsinnode = 10.
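To see which terms drive the boosted model, caret's varImp can be applied to the fitted object (a quick inspection, not shown in the original output):
print(varImp(modelgbm))  # relative influence of each title term and engineered flag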
library("xgboost")
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
library("Matrix")
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
train$Recommended.IND <- as.numeric(train$Recommended.IND) - 1  # factor levels "0"/"1" -> numeric 0/1
data_variables <- as.matrix(train %>% select(-Recommended.IND))
data_label <- train[, "Recommended.IND"]
data_matrix <- xgb.DMatrix(data = data_variables, label = data_label)
xgb_params <- list(booster = "gbtree", objective = "binary:logistic", eta = 0.1, gamma = 5, max_depth = 10)
xgbcv <- xgb.cv(params = xgb_params, data = data_matrix, nrounds = 100, nfold = 10,
                showsd = TRUE, stratified = TRUE, print_every_n = 10,
                early_stopping_rounds = 20, maximize = FALSE)
## [1] train-error:0.159108+0.001024 test-error:0.162288+0.006938
## Multiple eval metrics are present. Will use test_error for early stopping.
## Will train until test_error hasn't improved in 20 rounds.
##
## [11] train-error:0.160347+0.000871 test-error:0.163104+0.008183
## [21] train-error:0.159960+0.000995 test-error:0.162179+0.007848
## [31] train-error:0.159489+0.001176 test-error:0.162071+0.007700
## [41] train-error:0.158237+0.000882 test-error:0.160819+0.007373
## [51] train-error:0.158080+0.000961 test-error:0.160874+0.007085
## [61] train-error:0.158080+0.000849 test-error:0.160819+0.007199
## Stopping. Best iteration:
## [44] train-error:0.158159+0.000906 test-error:0.160547+0.007129
nround <- xgbcv$best_iteration  # number of boosting rounds chosen by cross-validation
# Refit a single XGBoost model on the full training matrix with that many rounds
bst_model <- xgb.train(params = xgb_params,
                       data = data_matrix,
                       nrounds = nround)
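A similar inspection is available for the booster via xgb.importance (optional, not part of the original run):
importance <- xgb.importance(feature_names = colnames(data_variables), model = bst_model)
head(importance)  # gain, cover and frequency per feature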
test$Recommended.IND<-as.numeric(test$Recommended.IND)-1
test_matrix<-xgb.DMatrix(data = as.matrix(test %>% select(-Recommended.IND)))
predictionsXGBoost<-predict(bst_model,newdata=test_matrix)
predictionsXGBoosttrain<-predict(bst_model,newdata=data_matrix)
print(table(predictionsXGBoost>0.5,test$Recommended.IND))
##
## 0 1
## FALSE 165 104
## TRUE 728 4108
# Stacking: combine the GLM, GBM, XGBoost and ctree predictions with the original
# features and fit a linear model on top
df <- data.frame(train, a = predictionsGLMtrain, b = predictionsGBMtrain, c = predictionsXGBoosttrain, d = predictionsctreetrain)
testdf <- data.frame(test, a = predictionsGLM, b = predictionsGBM, c = predictionsXGBoost, d = predictionsctree)
model <- lm(Recommended.IND ~ ., data = df)
predictions<-predict(model,testdf)
table(predictions>0.5,test$Recommended.IND)
##
## 0 1
## FALSE 195 130
## TRUE 698 4082
## Accuracy increased by stacking the models
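From the confusion tables, the gain is small but consistent: the GLM scores (145 + 4116) / 5105 ≈ 0.835, XGBoost (165 + 4108) / 5105 ≈ 0.837, and the stacked model (195 + 4082) / 5105 ≈ 0.838.
(195 + 4082) / (195 + 130 + 698 + 4082)  # stacked test accuracy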
Using more terms from the corpus might increase accuracy, but the memory footprint grows quickly. Encoding the categorical variables by mean or frequency could raise accuracy as well, as could stacking more models or bagging. N-grams are another option, though they take considerably more space and time to build models with.
Here I achieved an accuracy of ~84% using 22 variables (including 3 engineered ones), which is quite good considering the CPU time and memory spent.