The following analysis was done on a Kaggle dataset of SMS messages labelled as spam or ham. Each message was turned into a corpus using the tm package in R, with the required cleaning applied (punctuation removal, stop-word removal, lemmatization, stemming, etc.).
# Setup: locate the data set, read it, and attach the text-mining packages.
# The data directory is held in a variable and joined with file.path() instead
# of calling setwd(), so the script no longer changes the session's working
# directory as a side effect. Point data_dir at the folder holding spam.csv.
data_dir <- "C:\\R Programming\\Spam SMS"
# Further down, column v1 is used as the class label and v2 as the message text.
data <- read.csv(file.path(data_dir, "spam.csv"), stringsAsFactors = FALSE)
library(tm)
library(SnowballC)
library(caTools)
library(dplyr)
library(textstem)
library(ngram)
# JAVA_HOME is set immediately before RWeka is attached -- presumably because
# RWeka needs to find a Java runtime when it loads; adjust the path per machine.
Sys.setenv(JAVA_HOME = "C:\\Program Files\\Java\\jre-10.0.2")
library(RWeka)
# Build a corpus from the raw message text (data$v2) and clean it step by step.
corpus <- VCorpus(VectorSource(data$v2))
corpus <- tm_map(corpus, content_transformer(tolower))
# Stop words are removed BEFORE punctuation is stripped. The English stop-word
# list contains contractions written with apostrophes (e.g. "i'll"); once
# removePunctuation has turned them into "ill" they no longer match and leak
# into the vocabulary.
#exceptions <- c("not","too","bad","just","no","but")
#my_stopwords <- setdiff(stopwords("en"), exceptions)
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removePunctuation)
# content_transformer() makes lemmatize_strings work on each document's text
# and hands back a proper text document, so the former PlainTextDocument
# re-wrapping step is no longer needed.
corpus <- tm_map(corpus, content_transformer(lemmatize_strings))
corpus <- tm_map(corpus, stemDocument)
# NOTE: despite its name, this tokenizer emits 1- to 3-grams (min = 1, max = 3).
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 3))
# Document-term matrix with normalised TF-IDF weights over those n-grams.
frequencies <- DocumentTermMatrix(
  corpus,
  control = list(
    weighting = function(x) weightTfIdf(x, normalize = TRUE),
    tokenize = BigramTokenizer
  )
)
# Drop the sparsest terms (sparsity threshold 0.97) to keep the predictor
# count manageable.
sparse <- removeSparseTerms(frequencies, 0.97)
smsSparse <- as.data.frame(as.matrix(sparse))
# Make the term names syntactically valid column names for the model formulas.
colnames(smsSparse) <- make.names(colnames(smsSparse))
## The run recorded in this report kept 29 term predictors; re-check
## ncol(smsSparse) after any change to the preprocessing above.
# Attach the label column (data$v1) as a factor, then draw a 70/30 split on it.
smsSparse[["Class"]] <- as.factor(data$v1)
set.seed(1)
library(caTools)
splits <- sample.split(smsSparse[["Class"]], SplitRatio = 0.7)
# Rows flagged TRUE form the training set; the remaining rows form the test set.
train <- smsSparse[splits, , drop = FALSE]
test <- smsSparse[!splits, , drop = FALSE]
Per-class summary (one row per class) of the 29 term predictors:
## # A tibble: 2 x 30
## Class call can come day free get good ill
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ham 0.0146 0.0139 0.0180 0.00823 0.00336 0.0197 0.0176 1.61e-2
## 2 spam 0.0348 0.00313 0.00117 0.00406 0.0250 0.00699 0.00272 1.44e-4
## # ... with 21 more variables: just <dbl>, know <dbl>, late <dbl>,
## # like <dbl>, love <dbl>, ltgt <dbl>, need <dbl>, now <dbl>, okay <dbl>,
## # say <dbl>, see <dbl>, send <dbl>, take <dbl>, tell <dbl>, text <dbl>,
## # think <dbl>, time <dbl>, today <dbl>, want <dbl>, will <dbl>,
## # yes <dbl>
##Feature Engineering
library(stringr)

# Rescale a numeric vector to [0, 1]. A constant vector maps to all zeros
# instead of the NaN values a 0/0 division would produce.
min_max_scale <- function(x) {
  rng <- range(x, na.rm = TRUE)
  if (isTRUE(diff(rng) == 0)) {
    return(rep(0, length(x)))
  }
  (x - rng[1]) / (rng[2] - rng[1])
}

# Hand-crafted signals computed from the raw message text (data$v2).
noOfExclamation <- str_count(data$v2, "!")
# fixed() matches the literal text "www."; as a regular expression the "."
# would match any character (e.g. "wwwx"). Note this is a count, not a 0/1 flag.
containsWebsite <- str_count(data$v2, fixed("www."))
noOfDigits <- str_count(data$v2, "[0-9]")
# 1 when the message contains a run of at least 11 consecutive digits.
# (The misspelled name is kept because later code refers to this column.)
containgsPhoneNumber <- ifelse(grepl("[0-9]{11}", data$v2), 1, 0)
##Normalizing noOfExclamation variable
noOfExclamation <- min_max_scale(noOfExclamation)
##Let's see if there's any relation between my engineered variables and outcome(Class)
smsSparse$noOfExclamation <- noOfExclamation
smsSparse$containsWebsite <- containsWebsite
smsSparse$containgsPhoneNumber <- containgsPhoneNumber
# Only the data-frame column is rescaled; the noOfDigits vector keeps raw counts.
smsSparse$noOfDigits <- min_max_scale(noOfDigits)
# Draw the 70/30 split again now that the engineered columns are part of
# smsSparse. No new set.seed() call is made here, so the draw continues from
# the current state of the random number generator.
splits <- sample.split(smsSparse[["Class"]], SplitRatio = 0.7)
train <- smsSparse[splits, , drop = FALSE]
test <- smsSparse[!splits, , drop = FALSE]
# Mean (scaled) exclamation-mark count for each class.
by_class <- group_by(smsSparse, Class)
explore <- summarise(by_class, mean(noOfExclamation))
print(explore)
## # A tibble: 2 x 2
## Class `mean(noOfExclamation)`
## <fct> <dbl>
## 1 ham 0.0158
## 2 spam 0.0660
# Mean number of "www." occurrences for each class.
by_class <- group_by(smsSparse, Class)
explore <- summarise(by_class, mean(containsWebsite))
print(explore)
## # A tibble: 2 x 2
## Class `mean(containsWebsite)`
## <fct> <dbl>
## 1 ham 0.000622
## 2 spam 0.129
# Share of messages in each class that carry the phone-number flag.
by_class <- group_by(smsSparse, Class)
explore <- summarise(by_class, mean(containgsPhoneNumber))
print(explore)
## # A tibble: 2 x 2
## Class `mean(containgsPhoneNumber)`
## <fct> <dbl>
## 1 ham 0
## 2 spam 0.537
# Mean (scaled) digit count for each class.
by_class <- group_by(smsSparse, Class)
explore <- summarise(by_class, mean(noOfDigits))
print(explore)
## # A tibble: 2 x 2
## Class `mean(noOfDigits)`
## <fct> <dbl>
## 1 ham 0.00637
## 2 spam 0.335
##The class means above differ markedly between ham and spam for every engineered feature.
##Data Modelling
library(caret)
library(nnet)
# 5-fold cross-validation settings, reused by the caret::train() calls in this file.
trControl <- trainControl(method = "cv", number = 5)
# Base learner 1: decision tree (rpart).
modelrpart <- train(
  Class ~ .,
  data = train,
  method = "rpart",
  trControl = trControl
)
# Keep the second column of the class-probability output (the second class
# level -- spam, going by the ham/spam levels printed elsewhere in the report)
# for the training rows and for the held-out rows.
predictionrpartTrain <- predict(modelrpart, newdata = train, type = "prob")[, 2]
predictionrpart <- predict(modelrpart, newdata = test, type = "prob")[, 2]
plot(modelrpart)
# Base learner 2: random forest, tuned on accuracy with the shared CV control.
modelrf <- train(
  Class ~ .,
  data = train,
  method = "rf",
  metric = "Accuracy",
  trControl = trControl
)
# Second probability column for the training and test rows.
predictionrfTrain <- predict(modelrf, newdata = train, type = "prob")[, 2]
predictionrf <- predict(modelrf, newdata = test, type = "prob")[, 2]
plot(modelrf)
library(party)
# Base learner 3: conditional inference tree (ctree), tuned on accuracy.
modelctree <- train(
  Class ~ .,
  data = train,
  method = "ctree",
  metric = "Accuracy",
  trControl = trControl
)
# Second probability column for the training and test rows.
predictionctreeTrain <- predict(modelctree, newdata = train, type = "prob")[, 2]
predictionctree <- predict(modelctree, newdata = test, type = "prob")[, 2]
plot(modelctree)
# Base learner 4: gradient boosting (gbm), tuned on accuracy with the shared
# CV control. verbose = FALSE is passed through train() to the gbm fit and
# silences the per-iteration deviance table that is otherwise printed for
# every resample and tuning candidate.
modelgbm <- train(
  Class ~ .,
  data = train,
  method = "gbm",
  metric = "Accuracy",
  trControl = trControl,
  verbose = FALSE
)
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 0.6387 nan 0.1000 0.0715
## 2 0.5578 nan 0.1000 0.0386
## 3 0.4991 nan 0.1000 0.0288
## 4 0.4528 nan 0.1000 0.0229
## 5 0.4171 nan 0.1000 0.0179
## 6 0.3866 nan 0.1000 0.0153
## 7 0.3621 nan 0.1000 0.0126
## 8 0.3369 nan 0.1000 0.0122
## 9 0.3191 nan 0.1000 0.0089
## 10 0.3008 nan 0.1000 0.0088
## 20 0.2086 nan 0.1000 0.0021
## 40 0.1600 nan 0.1000 0.0006
## 60 0.1395 nan 0.1000 -0.0001
## 80 0.1272 nan 0.1000 0.0000
## 100 0.1213 nan 0.1000 0.0001
## 120 0.1154 nan 0.1000 -0.0000
## 140 0.1119 nan 0.1000 -0.0002
## 150 0.1111 nan 0.1000 -0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 0.6342 nan 0.1000 0.0752
## 2 0.5505 nan 0.1000 0.0429
## 3 0.4895 nan 0.1000 0.0301
## 4 0.4424 nan 0.1000 0.0235
## 5 0.4061 nan 0.1000 0.0187
## 6 0.3739 nan 0.1000 0.0157
## 7 0.3473 nan 0.1000 0.0132
## 8 0.3243 nan 0.1000 0.0111
## 9 0.3038 nan 0.1000 0.0089
## 10 0.2866 nan 0.1000 0.0081
## 20 0.1886 nan 0.1000 0.0026
## 40 0.1365 nan 0.1000 0.0005
## 60 0.1207 nan 0.1000 -0.0000
## 80 0.1113 nan 0.1000 0.0000
## 100 0.1057 nan 0.1000 -0.0002
## 120 0.1015 nan 0.1000 -0.0000
## 140 0.0971 nan 0.1000 -0.0003
## 150 0.0953 nan 0.1000 -0.0003
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 0.6332 nan 0.1000 0.0753
## 2 0.5501 nan 0.1000 0.0453
## 3 0.4863 nan 0.1000 0.0298
## 4 0.4396 nan 0.1000 0.0219
## 5 0.4007 nan 0.1000 0.0191
## 6 0.3681 nan 0.1000 0.0151
## 7 0.3400 nan 0.1000 0.0141
## 8 0.3166 nan 0.1000 0.0116
## 9 0.2952 nan 0.1000 0.0104
## 10 0.2775 nan 0.1000 0.0088
## 20 0.1816 nan 0.1000 0.0025
## 40 0.1294 nan 0.1000 0.0003
## 60 0.1115 nan 0.1000 -0.0001
## 80 0.1024 nan 0.1000 -0.0001
## 100 0.0963 nan 0.1000 -0.0000
## 120 0.0907 nan 0.1000 -0.0002
## 140 0.0862 nan 0.1000 -0.0003
## 150 0.0847 nan 0.1000 -0.0002
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 0.6357 nan 0.1000 0.0697
## 2 0.5552 nan 0.1000 0.0400
## 3 0.4949 nan 0.1000 0.0282
## 4 0.4499 nan 0.1000 0.0232
## 5 0.4134 nan 0.1000 0.0187
## 6 0.3831 nan 0.1000 0.0151
## 7 0.3584 nan 0.1000 0.0126
## 8 0.3359 nan 0.1000 0.0113
## 9 0.3171 nan 0.1000 0.0092
## 10 0.3016 nan 0.1000 0.0069
## 20 0.2143 nan 0.1000 0.0029
## 40 0.1650 nan 0.1000 0.0004
## 60 0.1458 nan 0.1000 0.0002
## 80 0.1357 nan 0.1000 -0.0001
## 100 0.1296 nan 0.1000 -0.0001
## 120 0.1252 nan 0.1000 -0.0001
## 140 0.1210 nan 0.1000 -0.0002
## 150 0.1198 nan 0.1000 -0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 0.6316 nan 0.1000 0.0801
## 2 0.5470 nan 0.1000 0.0399
## 3 0.4867 nan 0.1000 0.0297
## 4 0.4409 nan 0.1000 0.0218
## 5 0.4044 nan 0.1000 0.0160
## 6 0.3729 nan 0.1000 0.0154
## 7 0.3475 nan 0.1000 0.0126
## 8 0.3251 nan 0.1000 0.0109
## 9 0.3045 nan 0.1000 0.0104
## 10 0.2864 nan 0.1000 0.0085
## 20 0.1902 nan 0.1000 0.0023
## 40 0.1445 nan 0.1000 0.0004
## 60 0.1278 nan 0.1000 0.0000
## 80 0.1152 nan 0.1000 -0.0001
## 100 0.1109 nan 0.1000 -0.0001
## 120 0.1064 nan 0.1000 -0.0002
## 140 0.1021 nan 0.1000 -0.0002
## 150 0.1011 nan 0.1000 -0.0002
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 0.6331 nan 0.1000 0.0761
## 2 0.5485 nan 0.1000 0.0415
## 3 0.4886 nan 0.1000 0.0302
## 4 0.4410 nan 0.1000 0.0225
## 5 0.4031 nan 0.1000 0.0194
## 6 0.3707 nan 0.1000 0.0155
## 7 0.3422 nan 0.1000 0.0133
## 8 0.3192 nan 0.1000 0.0112
## 9 0.2984 nan 0.1000 0.0101
## 10 0.2820 nan 0.1000 0.0082
## 20 0.1878 nan 0.1000 0.0026
## 40 0.1353 nan 0.1000 0.0002
## 60 0.1189 nan 0.1000 0.0001
## 80 0.1112 nan 0.1000 -0.0001
## 100 0.1031 nan 0.1000 -0.0001
## 120 0.0987 nan 0.1000 -0.0003
## 140 0.0940 nan 0.1000 -0.0002
## 150 0.0926 nan 0.1000 -0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 0.6426 nan 0.1000 0.0785
## 2 0.5622 nan 0.1000 0.0412
## 3 0.5047 nan 0.1000 0.0278
## 4 0.4618 nan 0.1000 0.0205
## 5 0.4273 nan 0.1000 0.0164
## 6 0.3965 nan 0.1000 0.0153
## 7 0.3724 nan 0.1000 0.0122
## 8 0.3490 nan 0.1000 0.0113
## 9 0.3318 nan 0.1000 0.0083
## 10 0.3135 nan 0.1000 0.0092
## 20 0.2264 nan 0.1000 0.0027
## 40 0.1787 nan 0.1000 0.0004
## 60 0.1550 nan 0.1000 0.0004
## 80 0.1418 nan 0.1000 0.0001
## 100 0.1333 nan 0.1000 -0.0000
## 120 0.1293 nan 0.1000 0.0000
## 140 0.1263 nan 0.1000 -0.0002
## 150 0.1252 nan 0.1000 -0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 0.6359 nan 0.1000 0.0727
## 2 0.5538 nan 0.1000 0.0417
## 3 0.4969 nan 0.1000 0.0272
## 4 0.4514 nan 0.1000 0.0231
## 5 0.4131 nan 0.1000 0.0180
## 6 0.3827 nan 0.1000 0.0144
## 7 0.3569 nan 0.1000 0.0127
## 8 0.3345 nan 0.1000 0.0112
## 9 0.3147 nan 0.1000 0.0095
## 10 0.2979 nan 0.1000 0.0075
## 20 0.2047 nan 0.1000 0.0023
## 40 0.1544 nan 0.1000 0.0005
## 60 0.1389 nan 0.1000 -0.0001
## 80 0.1258 nan 0.1000 -0.0002
## 100 0.1178 nan 0.1000 -0.0002
## 120 0.1134 nan 0.1000 -0.0000
## 140 0.1104 nan 0.1000 -0.0003
## 150 0.1076 nan 0.1000 -0.0002
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 0.6377 nan 0.1000 0.0736
## 2 0.5536 nan 0.1000 0.0410
## 3 0.4937 nan 0.1000 0.0287
## 4 0.4478 nan 0.1000 0.0235
## 5 0.4100 nan 0.1000 0.0180
## 6 0.3792 nan 0.1000 0.0157
## 7 0.3523 nan 0.1000 0.0133
## 8 0.3291 nan 0.1000 0.0106
## 9 0.3099 nan 0.1000 0.0092
## 10 0.2939 nan 0.1000 0.0079
## 20 0.1993 nan 0.1000 0.0028
## 40 0.1427 nan 0.1000 0.0003
## 60 0.1235 nan 0.1000 -0.0003
## 80 0.1144 nan 0.1000 -0.0002
## 100 0.1099 nan 0.1000 -0.0004
## 120 0.1051 nan 0.1000 -0.0003
## 140 0.1007 nan 0.1000 -0.0001
## 150 0.0988 nan 0.1000 -0.0002
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 0.6425 nan 0.1000 0.0737
## 2 0.5650 nan 0.1000 0.0396
## 3 0.5099 nan 0.1000 0.0270
## 4 0.4680 nan 0.1000 0.0183
## 5 0.4302 nan 0.1000 0.0193
## 6 0.4000 nan 0.1000 0.0145
## 7 0.3761 nan 0.1000 0.0118
## 8 0.3517 nan 0.1000 0.0121
## 9 0.3320 nan 0.1000 0.0098
## 10 0.3137 nan 0.1000 0.0087
## 20 0.2243 nan 0.1000 0.0025
## 40 0.1743 nan 0.1000 0.0001
## 60 0.1535 nan 0.1000 0.0001
## 80 0.1402 nan 0.1000 0.0001
## 100 0.1344 nan 0.1000 -0.0001
## 120 0.1289 nan 0.1000 -0.0001
## 140 0.1261 nan 0.1000 -0.0002
## 150 0.1238 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 0.6388 nan 0.1000 0.0786
## 2 0.5558 nan 0.1000 0.0408
## 3 0.4978 nan 0.1000 0.0282
## 4 0.4522 nan 0.1000 0.0226
## 5 0.4169 nan 0.1000 0.0187
## 6 0.3862 nan 0.1000 0.0158
## 7 0.3590 nan 0.1000 0.0123
## 8 0.3371 nan 0.1000 0.0103
## 9 0.3177 nan 0.1000 0.0096
## 10 0.2997 nan 0.1000 0.0078
## 20 0.2054 nan 0.1000 0.0030
## 40 0.1503 nan 0.1000 0.0007
## 60 0.1318 nan 0.1000 -0.0003
## 80 0.1212 nan 0.1000 -0.0000
## 100 0.1152 nan 0.1000 -0.0001
## 120 0.1104 nan 0.1000 -0.0001
## 140 0.1072 nan 0.1000 0.0000
## 150 0.1051 nan 0.1000 -0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 0.6361 nan 0.1000 0.0746
## 2 0.5540 nan 0.1000 0.0404
## 3 0.4939 nan 0.1000 0.0280
## 4 0.4462 nan 0.1000 0.0238
## 5 0.4073 nan 0.1000 0.0179
## 6 0.3763 nan 0.1000 0.0151
## 7 0.3508 nan 0.1000 0.0131
## 8 0.3282 nan 0.1000 0.0104
## 9 0.3087 nan 0.1000 0.0095
## 10 0.2914 nan 0.1000 0.0078
## 20 0.1937 nan 0.1000 0.0027
## 40 0.1399 nan 0.1000 0.0004
## 60 0.1233 nan 0.1000 -0.0000
## 80 0.1131 nan 0.1000 -0.0002
## 100 0.1059 nan 0.1000 -0.0001
## 120 0.0997 nan 0.1000 -0.0005
## 140 0.0958 nan 0.1000 -0.0001
## 150 0.0941 nan 0.1000 -0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 0.6366 nan 0.1000 0.0764
## 2 0.5562 nan 0.1000 0.0391
## 3 0.4984 nan 0.1000 0.0294
## 4 0.4505 nan 0.1000 0.0230
## 5 0.4161 nan 0.1000 0.0170
## 6 0.3882 nan 0.1000 0.0139
## 7 0.3624 nan 0.1000 0.0129
## 8 0.3379 nan 0.1000 0.0113
## 9 0.3203 nan 0.1000 0.0091
## 10 0.3032 nan 0.1000 0.0081
## 20 0.2164 nan 0.1000 0.0018
## 40 0.1678 nan 0.1000 0.0006
## 60 0.1466 nan 0.1000 0.0000
## 80 0.1352 nan 0.1000 0.0001
## 100 0.1288 nan 0.1000 -0.0001
## 120 0.1255 nan 0.1000 -0.0001
## 140 0.1225 nan 0.1000 -0.0000
## 150 0.1212 nan 0.1000 -0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 0.6359 nan 0.1000 0.0714
## 2 0.5531 nan 0.1000 0.0405
## 3 0.4928 nan 0.1000 0.0305
## 4 0.4485 nan 0.1000 0.0225
## 5 0.4106 nan 0.1000 0.0192
## 6 0.3804 nan 0.1000 0.0152
## 7 0.3536 nan 0.1000 0.0127
## 8 0.3309 nan 0.1000 0.0112
## 9 0.3117 nan 0.1000 0.0093
## 10 0.2934 nan 0.1000 0.0090
## 20 0.1986 nan 0.1000 0.0024
## 40 0.1474 nan 0.1000 0.0006
## 60 0.1318 nan 0.1000 0.0001
## 80 0.1223 nan 0.1000 -0.0001
## 100 0.1166 nan 0.1000 -0.0003
## 120 0.1125 nan 0.1000 0.0000
## 140 0.1081 nan 0.1000 0.0000
## 150 0.1065 nan 0.1000 -0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 0.6342 nan 0.1000 0.0783
## 2 0.5507 nan 0.1000 0.0405
## 3 0.4907 nan 0.1000 0.0291
## 4 0.4442 nan 0.1000 0.0236
## 5 0.4055 nan 0.1000 0.0181
## 6 0.3730 nan 0.1000 0.0165
## 7 0.3459 nan 0.1000 0.0137
## 8 0.3228 nan 0.1000 0.0113
## 9 0.3013 nan 0.1000 0.0105
## 10 0.2844 nan 0.1000 0.0085
## 20 0.1862 nan 0.1000 0.0019
## 40 0.1372 nan 0.1000 0.0000
## 60 0.1204 nan 0.1000 -0.0002
## 80 0.1120 nan 0.1000 -0.0000
## 100 0.1048 nan 0.1000 -0.0001
## 120 0.0999 nan 0.1000 -0.0000
## 140 0.0962 nan 0.1000 -0.0001
## 150 0.0946 nan 0.1000 -0.0002
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 0.6364 nan 0.1000 0.0765
## 2 0.5530 nan 0.1000 0.0412
## 3 0.4922 nan 0.1000 0.0316
## 4 0.4472 nan 0.1000 0.0222
## 5 0.4116 nan 0.1000 0.0177
## 6 0.3802 nan 0.1000 0.0156
## 7 0.3520 nan 0.1000 0.0134
## 8 0.3294 nan 0.1000 0.0111
## 9 0.3103 nan 0.1000 0.0089
## 10 0.2925 nan 0.1000 0.0085
## 20 0.1968 nan 0.1000 0.0025
## 40 0.1470 nan 0.1000 0.0006
## 60 0.1302 nan 0.1000 -0.0002
## 80 0.1209 nan 0.1000 0.0000
## 100 0.1152 nan 0.1000 0.0000
## 120 0.1109 nan 0.1000 -0.0001
## 140 0.1088 nan 0.1000 -0.0004
## 150 0.1071 nan 0.1000 -0.0002
# Second probability column from the gbm model for the training and test rows.
predictiongbmTrain <- predict(modelgbm, newdata = train, type = "prob")[, 2]
predictiongbm <- predict(modelgbm, newdata = test, type = "prob")[, 2]
plot(modelgbm)
# Extra learner: boosted trees (xgbTree), tuned on accuracy.
# NOTE(review): these predictions are not included in stackdf / stackdftest
# further down, so the stack uses five models -- confirm that is intended.
modelxgbtree <- train(
  Class ~ .,
  data = train,
  method = "xgbTree",
  metric = "Accuracy",
  trControl = trControl
)
# Second probability column for the training and test rows.
predictionxgbtreeTrain <- predict(modelxgbtree, newdata = train, type = "prob")[, 2]
predictionxgbtree <- predict(modelxgbtree, newdata = test, type = "prob")[, 2]
plot(modelxgbtree)
# Base learner 5: k-nearest neighbours, tuned on accuracy.
modelknn <- train(
  Class ~ .,
  data = train,
  method = "knn",
  metric = "Accuracy",
  trControl = trControl
)
# Second probability column for the training and test rows.
predictionknnTrain <- predict(modelknn, newdata = train, type = "prob")[, 2]
predictionknn <- predict(modelknn, newdata = test, type = "prob")[, 2]
plot(modelknn)
# Stacking: the base models' probabilities (rpart, rf, ctree, gbm, knn) become
# the inputs a-e of a meta-learner.
# NOTE(review): the training-side inputs are the base models' predictions on
# the same rows they were fitted on; out-of-fold predictions would give a less
# optimistic stack -- consider switching.
stackdf <- data.frame(
  a = predictionrpartTrain,
  b = predictionrfTrain,
  c = predictionctreeTrain,
  d = predictiongbmTrain,
  e = predictionknnTrain,
  Class = train$Class
)
stackdftest <- data.frame(
  a = predictionrpart,
  b = predictionrf,
  c = predictionctree,
  d = predictiongbm,
  e = predictionknn
)
# Neural-network meta-learner. trace = FALSE is passed through train() to nnet
# and suppresses the optimisation log it prints for every resample and tuning
# candidate.
model <- train(
  Class ~ .,
  data = stackdf,
  method = "nnet",
  trControl = trControl,
  trace = FALSE
)
## # weights: 8
## initial value 1654.468585
## iter 10 value 158.609580
## iter 20 value 124.212118
## iter 30 value 109.250120
## iter 40 value 108.086963
## iter 50 value 107.036164
## iter 60 value 106.853873
## iter 70 value 106.571163
## iter 80 value 106.462988
## iter 90 value 106.352925
## iter 100 value 106.291958
## final value 106.291958
## stopped after 100 iterations
## # weights: 22
## initial value 4050.864630
## iter 10 value 173.570834
## iter 20 value 130.346197
## iter 30 value 110.255406
## iter 40 value 102.512652
## iter 50 value 94.029664
## iter 60 value 90.979446
## iter 70 value 90.447063
## iter 80 value 89.649382
## iter 90 value 88.793997
## iter 100 value 87.985714
## final value 87.985714
## stopped after 100 iterations
## # weights: 36
## initial value 2164.508845
## iter 10 value 154.185099
## iter 20 value 96.526624
## iter 30 value 91.394432
## iter 40 value 89.453173
## iter 50 value 88.869616
## iter 60 value 88.717318
## iter 70 value 88.292286
## iter 80 value 88.079918
## iter 90 value 88.031573
## iter 100 value 87.879798
## final value 87.879798
## stopped after 100 iterations
## # weights: 8
## initial value 2784.590463
## iter 10 value 231.823012
## iter 20 value 146.580392
## iter 30 value 136.176419
## final value 136.176259
## converged
## # weights: 22
## initial value 1530.746096
## iter 10 value 155.696535
## iter 20 value 128.963400
## iter 30 value 124.293519
## iter 40 value 123.897019
## iter 50 value 123.482133
## iter 60 value 123.417554
## final value 123.416766
## converged
## # weights: 36
## initial value 3148.370669
## iter 10 value 254.634227
## iter 20 value 178.212136
## iter 30 value 136.946245
## iter 40 value 132.461008
## iter 50 value 126.991197
## iter 60 value 125.176990
## iter 70 value 123.904455
## iter 80 value 122.974158
## iter 90 value 122.639679
## iter 100 value 122.552635
## final value 122.552635
## stopped after 100 iterations
## # weights: 8
## initial value 2267.504556
## iter 10 value 282.253401
## iter 20 value 272.217112
## iter 30 value 261.323420
## iter 40 value 260.045527
## iter 50 value 257.281232
## iter 60 value 203.206055
## iter 70 value 143.712683
## iter 80 value 141.573951
## iter 90 value 135.583805
## iter 100 value 116.448452
## final value 116.448452
## stopped after 100 iterations
## # weights: 22
## initial value 2439.600702
## iter 10 value 183.819744
## iter 20 value 107.285281
## iter 30 value 95.686774
## iter 40 value 92.669489
## iter 50 value 92.071032
## iter 60 value 91.897937
## iter 70 value 91.829057
## iter 80 value 91.603308
## iter 90 value 91.375363
## iter 100 value 91.251946
## final value 91.251946
## stopped after 100 iterations
## # weights: 36
## initial value 2203.977234
## iter 10 value 128.871838
## iter 20 value 92.251468
## iter 30 value 90.407431
## iter 40 value 90.036127
## iter 50 value 89.715283
## iter 60 value 89.338074
## iter 70 value 89.317172
## iter 80 value 89.309644
## iter 90 value 88.858895
## iter 100 value 85.072038
## final value 85.072038
## stopped after 100 iterations
## # weights: 8
## initial value 2461.057484
## iter 10 value 222.953387
## iter 20 value 191.491544
## iter 30 value 187.841480
## iter 40 value 187.774882
## iter 40 value 187.774881
## iter 40 value 187.774881
## final value 187.774881
## converged
## # weights: 22
## initial value 1843.576803
## iter 10 value 148.913196
## iter 20 value 107.149133
## iter 30 value 100.350074
## iter 40 value 97.465668
## iter 50 value 95.693344
## iter 60 value 95.206151
## iter 70 value 94.583856
## iter 80 value 94.488588
## iter 90 value 94.471463
## iter 100 value 94.456566
## final value 94.456566
## stopped after 100 iterations
## # weights: 36
## initial value 2682.148489
## iter 10 value 168.387508
## iter 20 value 112.973749
## iter 30 value 106.591794
## iter 40 value 104.498333
## iter 50 value 103.108555
## iter 60 value 101.689800
## iter 70 value 97.788881
## iter 80 value 94.097662
## iter 90 value 93.969829
## iter 100 value 93.950521
## final value 93.950521
## stopped after 100 iterations
## # weights: 8
## initial value 3136.257811
## iter 10 value 220.444032
## iter 20 value 151.239331
## final value 151.040209
## converged
## # weights: 22
## initial value 2132.587978
## iter 10 value 261.625955
## iter 20 value 147.008230
## iter 30 value 140.966037
## iter 40 value 138.626955
## iter 50 value 136.835991
## iter 60 value 134.092006
## iter 70 value 133.916899
## iter 80 value 132.972074
## final value 132.967750
## converged
## # weights: 36
## initial value 4010.038960
## iter 10 value 273.162601
## iter 20 value 185.871229
## iter 30 value 145.403483
## iter 40 value 138.195016
## iter 50 value 134.804456
## iter 60 value 133.900728
## iter 70 value 133.623315
## iter 80 value 133.007308
## iter 90 value 132.800507
## iter 100 value 132.800340
## final value 132.800340
## stopped after 100 iterations
## # weights: 8
## initial value 2953.323947
## iter 10 value 667.316012
## iter 20 value 223.485060
## iter 30 value 219.655162
## iter 40 value 218.688099
## iter 50 value 218.048338
## iter 60 value 206.440878
## iter 70 value 175.917053
## iter 80 value 144.768378
## iter 90 value 130.448934
## iter 100 value 125.924450
## final value 125.924450
## stopped after 100 iterations
## # weights: 22
## initial value 1590.656442
## iter 10 value 144.703565
## iter 20 value 106.868091
## iter 30 value 103.470887
## iter 40 value 102.774075
## iter 50 value 102.298848
## iter 60 value 102.076027
## iter 70 value 101.547785
## iter 80 value 99.568883
## iter 90 value 99.084532
## iter 100 value 99.012632
## final value 99.012632
## stopped after 100 iterations
## # weights: 36
## initial value 2598.445391
## iter 10 value 145.846462
## iter 20 value 107.627307
## iter 30 value 103.150740
## iter 40 value 102.789035
## iter 50 value 101.294742
## iter 60 value 100.689397
## iter 70 value 100.346777
## iter 80 value 99.916000
## iter 90 value 99.242876
## iter 100 value 99.117987
## final value 99.117987
## stopped after 100 iterations
## # weights: 8
## initial value 2060.197874
## iter 10 value 238.989846
## iter 20 value 157.303855
## iter 30 value 144.318659
## iter 40 value 135.358336
## iter 50 value 134.225918
## iter 60 value 133.482879
## iter 70 value 133.271535
## iter 80 value 132.997467
## iter 90 value 132.954530
## iter 100 value 132.788424
## final value 132.788424
## stopped after 100 iterations
## # weights: 22
## initial value 3303.317394
## iter 10 value 213.263844
## iter 20 value 126.099076
## iter 30 value 101.301736
## iter 40 value 97.180847
## iter 50 value 94.486722
## iter 60 value 93.835913
## iter 70 value 93.689402
## iter 80 value 93.506775
## iter 90 value 93.464172
## iter 100 value 93.382292
## final value 93.382292
## stopped after 100 iterations
## # weights: 36
## initial value 2509.153833
## iter 10 value 244.055660
## iter 20 value 106.247485
## iter 30 value 102.025460
## iter 40 value 101.071388
## iter 50 value 97.334782
## iter 60 value 88.815946
## iter 70 value 87.165440
## iter 80 value 85.518238
## iter 90 value 84.621882
## iter 100 value 82.972454
## final value 82.972454
## stopped after 100 iterations
## # weights: 8
## initial value 1871.428542
## iter 10 value 197.578514
## iter 20 value 163.958686
## final value 163.472703
## converged
## # weights: 22
## initial value 1846.586522
## iter 10 value 224.360957
## iter 20 value 156.450116
## iter 30 value 148.615965
## iter 40 value 146.474580
## iter 50 value 144.608749
## iter 60 value 142.752871
## iter 70 value 142.439049
## iter 80 value 142.412779
## final value 142.412690
## converged
## # weights: 36
## initial value 3558.586084
## iter 10 value 360.964841
## iter 20 value 157.316979
## iter 30 value 146.002164
## iter 40 value 145.540370
## iter 50 value 143.138534
## iter 60 value 142.011737
## iter 70 value 141.422586
## iter 80 value 141.249862
## iter 90 value 141.110475
## iter 100 value 140.991732
## final value 140.991732
## stopped after 100 iterations
## # weights: 8
## initial value 1832.249767
## iter 10 value 198.377040
## iter 20 value 170.265932
## iter 30 value 154.761779
## iter 40 value 138.414324
## iter 50 value 134.979848
## iter 60 value 134.184066
## iter 70 value 133.435649
## iter 80 value 133.327231
## iter 90 value 133.203363
## iter 100 value 133.168083
## final value 133.168083
## stopped after 100 iterations
## # weights: 22
## initial value 3266.963186
## iter 10 value 238.720111
## iter 20 value 125.062625
## iter 30 value 115.889701
## iter 40 value 113.969787
## iter 50 value 113.440762
## iter 60 value 113.221703
## iter 70 value 112.804830
## iter 80 value 112.371427
## iter 90 value 111.970484
## iter 100 value 110.111932
## final value 110.111932
## stopped after 100 iterations
## # weights: 36
## initial value 1282.556695
## iter 10 value 178.078493
## iter 20 value 128.677332
## iter 30 value 113.418989
## iter 40 value 107.911352
## iter 50 value 101.626040
## iter 60 value 98.593675
## iter 70 value 98.317022
## iter 80 value 98.087509
## iter 90 value 97.805095
## iter 100 value 95.216103
## final value 95.216103
## stopped after 100 iterations
## # weights: 8
## initial value 1668.010656
## iter 10 value 304.264712
## iter 20 value 198.375496
## iter 30 value 162.158535
## iter 40 value 145.934984
## iter 50 value 123.238659
## iter 60 value 118.549687
## iter 70 value 116.912825
## iter 80 value 116.713975
## iter 90 value 116.544746
## iter 100 value 116.483801
## final value 116.483801
## stopped after 100 iterations
## # weights: 22
## initial value 2409.584335
## iter 10 value 157.769808
## iter 20 value 106.360022
## iter 30 value 99.412863
## iter 40 value 98.675490
## iter 50 value 97.940246
## iter 60 value 96.154627
## iter 70 value 94.428527
## iter 80 value 94.232924
## iter 90 value 94.088825
## iter 100 value 93.870155
## final value 93.870155
## stopped after 100 iterations
## # weights: 36
## initial value 2429.227839
## iter 10 value 1230.822737
## iter 20 value 247.053267
## iter 30 value 228.716540
## iter 40 value 199.319120
## iter 50 value 160.703610
## iter 60 value 138.253801
## iter 70 value 120.521154
## iter 80 value 117.751888
## iter 90 value 116.915867
## iter 100 value 116.473264
## final value 116.473264
## stopped after 100 iterations
## # weights: 8
## initial value 2418.050334
## iter 10 value 174.379284
## iter 20 value 145.542367
## iter 30 value 145.526425
## iter 30 value 145.526425
## iter 30 value 145.526425
## final value 145.526425
## converged
## # weights: 22
## initial value 3235.426790
## iter 10 value 318.563521
## iter 20 value 143.054619
## iter 30 value 136.340000
## iter 40 value 134.699116
## iter 50 value 134.514434
## iter 60 value 134.509395
## iter 70 value 134.508913
## final value 134.508895
## converged
## # weights: 36
## initial value 3840.533917
## iter 10 value 340.234204
## iter 20 value 195.701200
## iter 30 value 142.107517
## iter 40 value 135.781560
## iter 50 value 135.117579
## iter 60 value 133.187522
## iter 70 value 131.067350
## iter 80 value 130.298403
## iter 90 value 130.046283
## iter 100 value 130.019507
## final value 130.019507
## stopped after 100 iterations
## # weights: 8
## initial value 1500.234939
## iter 10 value 233.374697
## iter 20 value 146.506042
## iter 30 value 126.776281
## iter 40 value 118.035362
## iter 50 value 117.223233
## iter 60 value 116.889178
## iter 70 value 116.782694
## iter 80 value 116.665703
## iter 90 value 116.615058
## iter 100 value 116.574070
## final value 116.574070
## stopped after 100 iterations
## # weights: 22
## initial value 3417.151473
## iter 10 value 157.052151
## iter 20 value 100.373898
## iter 30 value 94.585810
## iter 40 value 90.526258
## iter 50 value 89.013793
## iter 60 value 88.819343
## iter 70 value 88.798795
## iter 80 value 88.719891
## iter 90 value 88.572504
## iter 100 value 88.084978
## final value 88.084978
## stopped after 100 iterations
## # weights: 36
## initial value 1575.166993
## iter 10 value 187.783279
## iter 20 value 107.352354
## iter 30 value 97.224988
## iter 40 value 87.203090
## iter 50 value 80.587894
## iter 60 value 77.668074
## iter 70 value 76.446597
## iter 80 value 76.037463
## iter 90 value 75.279368
## iter 100 value 73.872189
## final value 73.872189
## stopped after 100 iterations
## # weights: 8
## initial value 1451.578705
## iter 10 value 287.263731
## iter 20 value 192.505554
## iter 30 value 162.068269
## iter 40 value 135.855500
## iter 50 value 130.537029
## iter 60 value 129.434877
## iter 70 value 128.434369
## iter 80 value 128.213105
## iter 90 value 128.003000
## iter 100 value 127.969879
## final value 127.969879
## stopped after 100 iterations
## # weights: 22
## initial value 3411.235848
## iter 10 value 198.787093
## iter 20 value 105.818124
## iter 30 value 100.867733
## iter 40 value 98.058844
## iter 50 value 95.777261
## iter 60 value 95.483778
## iter 70 value 95.162181
## iter 80 value 94.914659
## iter 90 value 94.650144
## iter 100 value 94.599758
## final value 94.599758
## stopped after 100 iterations
## # weights: 36
## initial value 1631.674585
## iter 10 value 177.689307
## iter 20 value 114.952164
## iter 30 value 96.999924
## iter 40 value 94.022111
## iter 50 value 91.537567
## iter 60 value 89.240289
## iter 70 value 87.415384
## iter 80 value 86.550450
## iter 90 value 85.094823
## iter 100 value 84.380640
## final value 84.380640
## stopped after 100 iterations
## # weights: 8
## initial value 1818.886441
## iter 10 value 264.303685
## iter 20 value 158.623058
## iter 30 value 157.679959
## iter 30 value 157.679959
## final value 157.679959
## converged
## # weights: 22
## initial value 2023.735071
## iter 10 value 272.618289
## iter 20 value 158.468502
## iter 30 value 142.629324
## iter 40 value 139.113420
## iter 50 value 138.505034
## iter 60 value 138.255975
## iter 70 value 138.250056
## final value 138.249902
## converged
## # weights: 36
## initial value 2465.105822
## iter 10 value 209.053274
## iter 20 value 158.429242
## iter 30 value 141.472800
## iter 40 value 139.933480
## iter 50 value 138.988663
## iter 60 value 138.185762
## iter 70 value 137.247848
## iter 80 value 136.645623
## iter 90 value 136.364256
## iter 100 value 136.334239
## final value 136.334239
## stopped after 100 iterations
## # weights: 8
## initial value 1782.637750
## iter 10 value 178.131236
## iter 20 value 137.061034
## iter 30 value 130.151375
## iter 40 value 129.546030
## iter 50 value 128.909204
## iter 60 value 128.545727
## iter 70 value 128.341662
## iter 80 value 128.322183
## iter 90 value 128.267854
## iter 100 value 128.262056
## final value 128.262056
## stopped after 100 iterations
## # weights: 22
## initial value 1784.371310
## iter 10 value 134.156966
## iter 20 value 100.300531
## iter 30 value 96.365231
## iter 40 value 92.908570
## iter 50 value 89.922580
## iter 60 value 88.160679
## iter 70 value 86.912152
## iter 80 value 86.378051
## iter 90 value 85.498582
## iter 100 value 85.107404
## final value 85.107404
## stopped after 100 iterations
## # weights: 36
## initial value 1293.548704
## iter 10 value 150.563683
## iter 20 value 98.140500
## iter 30 value 92.605203
## iter 40 value 90.229821
## iter 50 value 89.391293
## iter 60 value 87.936112
## iter 70 value 86.496572
## iter 80 value 86.326279
## iter 90 value 86.182188
## iter 100 value 85.952905
## final value 85.952905
## stopped after 100 iterations
## # weights: 36
## initial value 2328.101179
## iter 10 value 180.666626
## iter 20 value 127.643929
## iter 30 value 123.371569
## iter 40 value 119.076808
## iter 50 value 110.421357
## iter 60 value 106.268232
## iter 70 value 105.573992
## iter 80 value 105.174042
## iter 90 value 105.026504
## iter 100 value 104.406661
## final value 104.406661
## stopped after 100 iterations
# Class predictions of the stacked nnet model for the held-out rows, followed
# by caret's tuning plot for that model.
predictions <- predict(object = model, newdata = stackdftest)
plot(model)
##Accuracy of ~97.6% on the test set when stacking the 5 base models
library(h2o)
# h2o runs on a JVM; JAVA_HOME is (re)set here before the cluster is started.
Sys.setenv(JAVA_HOME = "C:\\Program Files\\Java\\jre-10.0.2")
h2o.init()
# Turn off h2o's text progress bars so they do not clutter the rendered report.
h2o.no_progress()
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## C:\Users\Pulkit\AppData\Local\Temp\RtmpsPOTGV/h2o_Pulkit_started_from_r.out
## C:\Users\Pulkit\AppData\Local\Temp\RtmpsPOTGV/h2o_Pulkit_started_from_r.err
##
##
## Starting H2O JVM and connecting: . Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 6 seconds 708 milliseconds
## H2O cluster timezone: Asia/Kolkata
## H2O data parsing timezone: UTC
## H2O cluster version: 3.20.0.2
## H2O cluster version age: 2 months and 14 days
## H2O cluster name: H2O_started_from_R_Pulkit_vkg937
## H2O cluster total nodes: 1
## H2O cluster total memory: 1.98 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Algos, AutoML, Core V3, Core V4
## R Version: R version 3.5.1 (2018-07-02)
# Upload the stacked training frame (inputs a-e plus Class) to the h2o cluster.
h2otrain <- as.h2o(x = stackdf)
##
|
| | 0%
|
|=================================================================| 100%
# Upload the stacked test frame (inputs a-e only, no label) to the h2o cluster.
h2otest <- as.h2o(x = stackdftest)
##
|
| | 0%
|
|=================================================================| 100%
# Deep-learning meta-model on the stacked frame: the first five columns are the
# predictors and "Class" is the response. It uses three hidden layers of 400
# units, a rectifier-with-dropout activation, light L1/L2 penalties, and a
# fixed seed.
modeldeeplearning <- h2o.deeplearning(
  x = seq_len(5),
  y = "Class",
  training_frame = h2otrain,
  activation = "RectifierWithDropout",
  l1 = 1e-5,
  l2 = 1e-5,
  hidden = rep(400, 3),
  epochs = 200,
  seed = 3.656455e+18
)
##
|
| | 0%
|
|= | 2%
|
|== | 3%
|
|=== | 5%
|
|==== | 7%
|
|====== | 9%
|
|======= | 10%
|
|======== | 12%
|
|========= | 14%
|
|========== | 16%
|
|=========== | 17%
|
|============ | 19%
|
|============= | 21%
|
|=============== | 22%
|
|================ | 24%
|
|================= | 26%
|
|================== | 28%
|
|=================== | 29%
|
|==================== | 31%
|
|===================== | 33%
|
|====================== | 35%
|
|======================== | 36%
|
|=================================================================| 100%
# Score the test frame with the deep-learning model and pull the result back
# into an ordinary R data frame.
h2opredictions <- as.data.frame(
  h2o.predict(object = modeldeeplearning, newdata = h2otest)
)
##
|
| | 0%
|
|=================================================================| 100%
# Keep the third column of the prediction frame (presumably the probability of
# the second class, spam -- confirm against the frame's column names), then
# cross-tabulate "probability above 0.5" against the true test labels.
h2opredictions <- h2opredictions[[3]]
table(h2opredictions > 0.5, test$Class)
##
## ham spam
## FALSE 1436 25
## TRUE 11 199
##Accuracy of ~97.8% on the test set ((1436 + 199) / 1671 correct)
More predictors could be used beyond the current 29, but this would make modelling slower. Selected n-grams could also be combined with word2vec embeddings.