symbolic_similarity_features <- read.csv("D:/Documents/Large-Scale Product Matching/symbolic_similarity_features.csv")
SHORT_TEXT_FEATURES = c('gtin', 'mpn', 'sku', 'identifier', 'brand', 'manufacturer')
MEDIUM_LONG_TEXT_FEATURES = c('name', 'description')
OTHER_FEATURES = c('price') # 'category',
ALL_FEATURES = c(SHORT_TEXT_FEATURES, MEDIUM_LONG_TEXT_FEATURES, OTHER_FEATURES)
symbolic_similarity_features_only <- symbolic_similarity_features[, ALL_FEATURES]
all_y <- as.factor(make.names(as.character(symbolic_similarity_features$label)))
train_indices <- symbolic_similarity_features$dataset == "train"
str(symbolic_similarity_features_only)
## 'data.frame': 84503 obs. of 9 variables:
## $ gtin : num 0 0 0 0 0 0 0 0 0 0 ...
## $ mpn : num 0.444 0.444 0.444 0.444 0.222 ...
## $ sku : num 0 0 0 0 0 ...
## $ identifier : num 0 0 0 0 0 0 0 0 0 0 ...
## $ brand : num 0 0 0 0 0 0 0 0 0 1 ...
## $ manufacturer: num 0 0 0 0 0 0 0 0 0 0 ...
## $ name : num 0.4 0.492 0.493 0.474 0.355 ...
## $ description : num 0 0.857 0.865 0 0 ...
## $ price : num 0 0 0 0 0 0 0 0 0 0 ...
(preprocess_features <- preProcess(symbolic_similarity_features_only,
method = c("nzv", "corr", "center", "scale")))
## Warning in preProcess.default(symbolic_similarity_features_only, method = c("nzv", : correlation matrix could not be computed:
## 1correlation matrix could not be computed:
## -0.0206245283262564correlation matrix could not be computed:
## 0.427263954386876correlation matrix could not be computed:
## 0.570220930815773correlation matrix could not be computed:
## -0.245583195038904correlation matrix could not be computed:
## -0.0206245283262564correlation matrix could not be computed:
## 1correlation matrix could not be computed:
## 0.184388178142601correlation matrix could not be computed:
## 0.285476367681447correlation matrix could not be computed:
## -0.0979283455193642correlation matrix could not be computed:
## 0.427263954386876correlation matrix could not be computed:
## 0.184388178142601correlation matrix could not be computed:
## 1correlation matrix could not be computed:
## 0.484829331875313correlation matrix could not be computed:
## 0.0394430693841353correlation matrix could not be computed:
## 0.570220930815773correlation matrix could not be computed:
## 0.285476367681447correlation matrix could not be computed:
## 0.484829331875313correlation matrix could not be computed:
## 1correlation matrix could not be computed:
## -0.335576442098219correlation matrix could not be computed:
## -0.245583195038904correlation matrix could not be computed:
## -0.0979283455193642correlation matrix could not be computed:
## 0.0394430693841353correlation matrix could not be computed:
## -0.335576442098219correlation matrix could not be computed:
## 1
## Created from 84503 samples and 9 variables
##
## Pre-processing:
## - centered (5)
## - ignored (0)
## - removed (4)
## - scaled (5)
processed_features <- predict(preprocess_features, symbolic_similarity_features_only)
highly_correlated <- findCorrelation(cor(symbolic_similarity_features_only))
low_var_features <- nearZeroVar(symbolic_similarity_features_only, names = T)
print(preprocess_features$method$remove)
## [1] "gtin" "identifier" "manufacturer" "price"
train_features <- processed_features[train_indices, ]
test_features <- processed_features[!train_indices, ]
train_y <- all_y[train_indices]
test_y <- all_y[!train_indices]
train_set <- cbind(train_features, train_y)
set.seed(5)
lm_control <- trainControl(
method = "repeatedcv",
number = 5, #num_cvs,
repeats = 3,
classProbs = T,
allowParallel = T,
verboseIter = T,
savePredictions = "final",
summaryFunction = prSummary
)
class_freq <- table(train_y)
more_weighting <- 1
model_class_weights <- ifelse(train_y == names(class_freq[1]),
1 / class_freq[1] * (1 / more_weighting),
1 / class_freq[2] * more_weighting)
print(max(model_class_weights) / min(model_class_weights))
## [1] 1.236773
lm_grid <- expand.grid(alpha=10^c(-3:3),
lambda=10^c(-3:3))
(lmod <- train(train_y ~ .,
data = train_set,
method = "glmnet",
weights = model_class_weights,
trControl = lm_control,
tuneGrid = lm_grid))
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. AUC will be used instead.
## + Fold1.Rep1: alpha=1e-03, lambda=1000
## - Fold1.Rep1: alpha=1e-03, lambda=1000
## + Fold1.Rep1: alpha=1e-02, lambda=1000
## - Fold1.Rep1: alpha=1e-02, lambda=1000
## + Fold1.Rep1: alpha=1e-01, lambda=1000
## - Fold1.Rep1: alpha=1e-01, lambda=1000
## + Fold1.Rep1: alpha=1e+00, lambda=1000
## - Fold1.Rep1: alpha=1e+00, lambda=1000
## + Fold1.Rep1: alpha=1e+01, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold1.Rep1: alpha=1e+01, lambda=1000
## + Fold1.Rep1: alpha=1e+02, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold1.Rep1: alpha=1e+02, lambda=1000
## + Fold1.Rep1: alpha=1e+03, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold1.Rep1: alpha=1e+03, lambda=1000
## + Fold2.Rep1: alpha=1e-03, lambda=1000
## - Fold2.Rep1: alpha=1e-03, lambda=1000
## + Fold2.Rep1: alpha=1e-02, lambda=1000
## - Fold2.Rep1: alpha=1e-02, lambda=1000
## + Fold2.Rep1: alpha=1e-01, lambda=1000
## - Fold2.Rep1: alpha=1e-01, lambda=1000
## + Fold2.Rep1: alpha=1e+00, lambda=1000
## - Fold2.Rep1: alpha=1e+00, lambda=1000
## + Fold2.Rep1: alpha=1e+01, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold2.Rep1: alpha=1e+01, lambda=1000
## + Fold2.Rep1: alpha=1e+02, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold2.Rep1: alpha=1e+02, lambda=1000
## + Fold2.Rep1: alpha=1e+03, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold2.Rep1: alpha=1e+03, lambda=1000
## + Fold3.Rep1: alpha=1e-03, lambda=1000
## - Fold3.Rep1: alpha=1e-03, lambda=1000
## + Fold3.Rep1: alpha=1e-02, lambda=1000
## - Fold3.Rep1: alpha=1e-02, lambda=1000
## + Fold3.Rep1: alpha=1e-01, lambda=1000
## - Fold3.Rep1: alpha=1e-01, lambda=1000
## + Fold3.Rep1: alpha=1e+00, lambda=1000
## - Fold3.Rep1: alpha=1e+00, lambda=1000
## + Fold3.Rep1: alpha=1e+01, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold3.Rep1: alpha=1e+01, lambda=1000
## + Fold3.Rep1: alpha=1e+02, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold3.Rep1: alpha=1e+02, lambda=1000
## + Fold3.Rep1: alpha=1e+03, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold3.Rep1: alpha=1e+03, lambda=1000
## + Fold4.Rep1: alpha=1e-03, lambda=1000
## - Fold4.Rep1: alpha=1e-03, lambda=1000
## + Fold4.Rep1: alpha=1e-02, lambda=1000
## - Fold4.Rep1: alpha=1e-02, lambda=1000
## + Fold4.Rep1: alpha=1e-01, lambda=1000
## - Fold4.Rep1: alpha=1e-01, lambda=1000
## + Fold4.Rep1: alpha=1e+00, lambda=1000
## - Fold4.Rep1: alpha=1e+00, lambda=1000
## + Fold4.Rep1: alpha=1e+01, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold4.Rep1: alpha=1e+01, lambda=1000
## + Fold4.Rep1: alpha=1e+02, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold4.Rep1: alpha=1e+02, lambda=1000
## + Fold4.Rep1: alpha=1e+03, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold4.Rep1: alpha=1e+03, lambda=1000
## + Fold5.Rep1: alpha=1e-03, lambda=1000
## - Fold5.Rep1: alpha=1e-03, lambda=1000
## + Fold5.Rep1: alpha=1e-02, lambda=1000
## - Fold5.Rep1: alpha=1e-02, lambda=1000
## + Fold5.Rep1: alpha=1e-01, lambda=1000
## - Fold5.Rep1: alpha=1e-01, lambda=1000
## + Fold5.Rep1: alpha=1e+00, lambda=1000
## - Fold5.Rep1: alpha=1e+00, lambda=1000
## + Fold5.Rep1: alpha=1e+01, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold5.Rep1: alpha=1e+01, lambda=1000
## + Fold5.Rep1: alpha=1e+02, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold5.Rep1: alpha=1e+02, lambda=1000
## + Fold5.Rep1: alpha=1e+03, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold5.Rep1: alpha=1e+03, lambda=1000
## + Fold1.Rep2: alpha=1e-03, lambda=1000
## - Fold1.Rep2: alpha=1e-03, lambda=1000
## + Fold1.Rep2: alpha=1e-02, lambda=1000
## - Fold1.Rep2: alpha=1e-02, lambda=1000
## + Fold1.Rep2: alpha=1e-01, lambda=1000
## - Fold1.Rep2: alpha=1e-01, lambda=1000
## + Fold1.Rep2: alpha=1e+00, lambda=1000
## - Fold1.Rep2: alpha=1e+00, lambda=1000
## + Fold1.Rep2: alpha=1e+01, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold1.Rep2: alpha=1e+01, lambda=1000
## + Fold1.Rep2: alpha=1e+02, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold1.Rep2: alpha=1e+02, lambda=1000
## + Fold1.Rep2: alpha=1e+03, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold1.Rep2: alpha=1e+03, lambda=1000
## + Fold2.Rep2: alpha=1e-03, lambda=1000
## - Fold2.Rep2: alpha=1e-03, lambda=1000
## + Fold2.Rep2: alpha=1e-02, lambda=1000
## - Fold2.Rep2: alpha=1e-02, lambda=1000
## + Fold2.Rep2: alpha=1e-01, lambda=1000
## - Fold2.Rep2: alpha=1e-01, lambda=1000
## + Fold2.Rep2: alpha=1e+00, lambda=1000
## - Fold2.Rep2: alpha=1e+00, lambda=1000
## + Fold2.Rep2: alpha=1e+01, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold2.Rep2: alpha=1e+01, lambda=1000
## + Fold2.Rep2: alpha=1e+02, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold2.Rep2: alpha=1e+02, lambda=1000
## + Fold2.Rep2: alpha=1e+03, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold2.Rep2: alpha=1e+03, lambda=1000
## + Fold3.Rep2: alpha=1e-03, lambda=1000
## - Fold3.Rep2: alpha=1e-03, lambda=1000
## + Fold3.Rep2: alpha=1e-02, lambda=1000
## - Fold3.Rep2: alpha=1e-02, lambda=1000
## + Fold3.Rep2: alpha=1e-01, lambda=1000
## - Fold3.Rep2: alpha=1e-01, lambda=1000
## + Fold3.Rep2: alpha=1e+00, lambda=1000
## - Fold3.Rep2: alpha=1e+00, lambda=1000
## + Fold3.Rep2: alpha=1e+01, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold3.Rep2: alpha=1e+01, lambda=1000
## + Fold3.Rep2: alpha=1e+02, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold3.Rep2: alpha=1e+02, lambda=1000
## + Fold3.Rep2: alpha=1e+03, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold3.Rep2: alpha=1e+03, lambda=1000
## + Fold4.Rep2: alpha=1e-03, lambda=1000
## - Fold4.Rep2: alpha=1e-03, lambda=1000
## + Fold4.Rep2: alpha=1e-02, lambda=1000
## - Fold4.Rep2: alpha=1e-02, lambda=1000
## + Fold4.Rep2: alpha=1e-01, lambda=1000
## - Fold4.Rep2: alpha=1e-01, lambda=1000
## + Fold4.Rep2: alpha=1e+00, lambda=1000
## - Fold4.Rep2: alpha=1e+00, lambda=1000
## + Fold4.Rep2: alpha=1e+01, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold4.Rep2: alpha=1e+01, lambda=1000
## + Fold4.Rep2: alpha=1e+02, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold4.Rep2: alpha=1e+02, lambda=1000
## + Fold4.Rep2: alpha=1e+03, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold4.Rep2: alpha=1e+03, lambda=1000
## + Fold5.Rep2: alpha=1e-03, lambda=1000
## - Fold5.Rep2: alpha=1e-03, lambda=1000
## + Fold5.Rep2: alpha=1e-02, lambda=1000
## - Fold5.Rep2: alpha=1e-02, lambda=1000
## + Fold5.Rep2: alpha=1e-01, lambda=1000
## - Fold5.Rep2: alpha=1e-01, lambda=1000
## + Fold5.Rep2: alpha=1e+00, lambda=1000
## - Fold5.Rep2: alpha=1e+00, lambda=1000
## + Fold5.Rep2: alpha=1e+01, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold5.Rep2: alpha=1e+01, lambda=1000
## + Fold5.Rep2: alpha=1e+02, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold5.Rep2: alpha=1e+02, lambda=1000
## + Fold5.Rep2: alpha=1e+03, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold5.Rep2: alpha=1e+03, lambda=1000
## + Fold1.Rep3: alpha=1e-03, lambda=1000
## - Fold1.Rep3: alpha=1e-03, lambda=1000
## + Fold1.Rep3: alpha=1e-02, lambda=1000
## - Fold1.Rep3: alpha=1e-02, lambda=1000
## + Fold1.Rep3: alpha=1e-01, lambda=1000
## - Fold1.Rep3: alpha=1e-01, lambda=1000
## + Fold1.Rep3: alpha=1e+00, lambda=1000
## - Fold1.Rep3: alpha=1e+00, lambda=1000
## + Fold1.Rep3: alpha=1e+01, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold1.Rep3: alpha=1e+01, lambda=1000
## + Fold1.Rep3: alpha=1e+02, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold1.Rep3: alpha=1e+02, lambda=1000
## + Fold1.Rep3: alpha=1e+03, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold1.Rep3: alpha=1e+03, lambda=1000
## + Fold2.Rep3: alpha=1e-03, lambda=1000
## - Fold2.Rep3: alpha=1e-03, lambda=1000
## + Fold2.Rep3: alpha=1e-02, lambda=1000
## - Fold2.Rep3: alpha=1e-02, lambda=1000
## + Fold2.Rep3: alpha=1e-01, lambda=1000
## - Fold2.Rep3: alpha=1e-01, lambda=1000
## + Fold2.Rep3: alpha=1e+00, lambda=1000
## - Fold2.Rep3: alpha=1e+00, lambda=1000
## + Fold2.Rep3: alpha=1e+01, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold2.Rep3: alpha=1e+01, lambda=1000
## + Fold2.Rep3: alpha=1e+02, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold2.Rep3: alpha=1e+02, lambda=1000
## + Fold2.Rep3: alpha=1e+03, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold2.Rep3: alpha=1e+03, lambda=1000
## + Fold3.Rep3: alpha=1e-03, lambda=1000
## - Fold3.Rep3: alpha=1e-03, lambda=1000
## + Fold3.Rep3: alpha=1e-02, lambda=1000
## - Fold3.Rep3: alpha=1e-02, lambda=1000
## + Fold3.Rep3: alpha=1e-01, lambda=1000
## - Fold3.Rep3: alpha=1e-01, lambda=1000
## + Fold3.Rep3: alpha=1e+00, lambda=1000
## - Fold3.Rep3: alpha=1e+00, lambda=1000
## + Fold3.Rep3: alpha=1e+01, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold3.Rep3: alpha=1e+01, lambda=1000
## + Fold3.Rep3: alpha=1e+02, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold3.Rep3: alpha=1e+02, lambda=1000
## + Fold3.Rep3: alpha=1e+03, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold3.Rep3: alpha=1e+03, lambda=1000
## + Fold4.Rep3: alpha=1e-03, lambda=1000
## - Fold4.Rep3: alpha=1e-03, lambda=1000
## + Fold4.Rep3: alpha=1e-02, lambda=1000
## - Fold4.Rep3: alpha=1e-02, lambda=1000
## + Fold4.Rep3: alpha=1e-01, lambda=1000
## - Fold4.Rep3: alpha=1e-01, lambda=1000
## + Fold4.Rep3: alpha=1e+00, lambda=1000
## - Fold4.Rep3: alpha=1e+00, lambda=1000
## + Fold4.Rep3: alpha=1e+01, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold4.Rep3: alpha=1e+01, lambda=1000
## + Fold4.Rep3: alpha=1e+02, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold4.Rep3: alpha=1e+02, lambda=1000
## + Fold4.Rep3: alpha=1e+03, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold4.Rep3: alpha=1e+03, lambda=1000
## + Fold5.Rep3: alpha=1e-03, lambda=1000
## - Fold5.Rep3: alpha=1e-03, lambda=1000
## + Fold5.Rep3: alpha=1e-02, lambda=1000
## - Fold5.Rep3: alpha=1e-02, lambda=1000
## + Fold5.Rep3: alpha=1e-01, lambda=1000
## - Fold5.Rep3: alpha=1e-01, lambda=1000
## + Fold5.Rep3: alpha=1e+00, lambda=1000
## - Fold5.Rep3: alpha=1e+00, lambda=1000
## + Fold5.Rep3: alpha=1e+01, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold5.Rep3: alpha=1e+01, lambda=1000
## + Fold5.Rep3: alpha=1e+02, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold5.Rep3: alpha=1e+02, lambda=1000
## + Fold5.Rep3: alpha=1e+03, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold5.Rep3: alpha=1e+03, lambda=1000
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 1, lambda = 0.001 on full training set
## glmnet
##
## 82311 samples
## 5 predictor
## 2 classes: 'X0', 'X1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 65848, 65849, 65849, 65850, 65848, 65849, ...
## Resampling results across tuning parameters:
##
## alpha lambda AUC Precision Recall F
## 1e-03 1e-03 0.8614288 0.8060006 0.8679396 0.8358208
## 1e-03 1e-02 0.8614288 0.8060006 0.8679396 0.8358208
## 1e-03 1e-01 0.8474832 0.7973607 0.8774536 0.8354878
## 1e-03 1e+00 0.8220325 0.7853674 0.8869895 0.8330865
## 1e-03 1e+01 0.8157200 0.7812838 0.8859934 0.8303464
## 1e-03 1e+02 0.8209812 0.7856607 0.8835399 0.8317266
## 1e-03 1e+03 0.0000000 0.5529097 0.4666667 0.7120951
## 1e-02 1e-03 0.8698132 0.8116686 0.8596268 0.8349540
## 1e-02 1e-02 0.8674663 0.8093901 0.8625125 0.8351020
## 1e-02 1e-01 0.8471700 0.7971651 0.8776880 0.8354868
## 1e-02 1e+00 0.8236007 0.7860227 0.8876267 0.8337367
## 1e-02 1e+01 0.8207496 0.7855453 0.8836204 0.8316985
## 1e-02 1e+02 0.0000000 0.5529097 0.4666667 0.7120951
## 1e-02 1e+03 0.0000000 0.5529097 0.4666667 0.7120951
## 1e-01 1e-03 0.8704439 0.8125947 0.8587259 0.8350181
## 1e-01 1e-02 0.8674341 0.8094027 0.8625857 0.8351430
## 1e-01 1e-01 0.8437170 0.7944304 0.8782446 0.8342344
## 1e-01 1e+00 0.8180077 0.7845709 0.8818114 0.8303501
## 1e-01 1e+01 0.0000000 0.5529097 0.4666667 0.7120951
## 1e-01 1e+02 0.0000000 0.5529097 0.4666667 0.7120951
## 1e-01 1e+03 0.0000000 0.5529097 0.4666667 0.7120951
## 1e+00 1e-03 0.8704508 0.8126176 0.8589163 0.8351201
## 1e+00 1e-02 0.8667924 0.8083617 0.8645779 0.8355205
## 1e+00 1e-01 0.8128153 0.7605660 0.8581033 0.8063918
## 1e+00 1e+00 0.0000000 0.5529097 0.4666667 0.7120951
## 1e+00 1e+01 0.0000000 0.5529097 0.4666667 0.7120951
## 1e+00 1e+02 0.0000000 0.5529097 0.4666667 0.7120951
## 1e+00 1e+03 0.0000000 0.5529097 0.4666667 0.7120951
## 1e+01 1e-03 0.8704508 0.8126176 0.8589163 0.8351201
## 1e+01 1e-02 0.8667924 0.8083617 0.8645779 0.8355205
## 1e+01 1e-01 0.8128153 0.7605660 0.8581033 0.8063918
## 1e+01 1e+00 0.0000000 0.5529097 0.4666667 0.7120951
## 1e+01 1e+01 0.0000000 0.5529097 0.4666667 0.7120951
## 1e+01 1e+02 0.0000000 0.5529097 0.4666667 0.7120951
## 1e+01 1e+03 0.0000000 0.5529097 0.4666667 0.7120951
## 1e+02 1e-03 0.8704508 0.8126176 0.8589163 0.8351201
## 1e+02 1e-02 0.8667924 0.8083617 0.8645779 0.8355205
## 1e+02 1e-01 0.8128153 0.7605660 0.8581033 0.8063918
## 1e+02 1e+00 0.0000000 0.5529097 0.4666667 0.7120951
## 1e+02 1e+01 0.0000000 0.5529097 0.4666667 0.7120951
## 1e+02 1e+02 0.0000000 0.5529097 0.4666667 0.7120951
## 1e+02 1e+03 0.0000000 0.5529097 0.4666667 0.7120951
## 1e+03 1e-03 0.8704508 0.8126176 0.8589163 0.8351201
## 1e+03 1e-02 0.8667924 0.8083617 0.8645779 0.8355205
## 1e+03 1e-01 0.8128153 0.7605660 0.8581033 0.8063918
## 1e+03 1e+00 0.0000000 0.5529097 0.4666667 0.7120951
## 1e+03 1e+01 0.0000000 0.5529097 0.4666667 0.7120951
## 1e+03 1e+02 0.0000000 0.5529097 0.4666667 0.7120951
## 1e+03 1e+03 0.0000000 0.5529097 0.4666667 0.7120951
##
## AUC was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 1 and lambda = 0.001.
print(lmod$bestTune)
## alpha lambda
## 22 1 0.001
# https://stats.stackexchange.com/questions/67827/caret-and-coefficients-glmnet
y_test_pred <- predict(lmod, test_features)
confusionMatrix(y_test_pred, test_y, mode = "prec_recall", positive = "X1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction X0 X1
## X0 1028 223
## X1 566 375
##
## Accuracy : 0.6401
## 95% CI : (0.6196, 0.6602)
## No Information Rate : 0.7272
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2307
## Mcnemar's Test P-Value : <2e-16
##
## Precision : 0.3985
## Recall : 0.6271
## F1 : 0.4873
## Prevalence : 0.2728
## Detection Rate : 0.1711
## Detection Prevalence : 0.4293
## Balanced Accuracy : 0.6360
##
## 'Positive' Class : X1
##
(lmod_var_imp <- varImp(lmod))
## glmnet variable importance
##
## Overall
## name 100.00
## brand 41.81
## mpn 41.55
## sku 16.32
## description 0.00
plot(lmod_var_imp)

data.frame(Coefficient = coef(lmod$finalModel, lmod$bestTune$lambda)[, 1]) %>%
tibble::rownames_to_column(., "Feature") %>%
arrange(-Coefficient)
## Feature Coefficient
## 1 name 1.42073902
## 2 mpn 0.79160021
## 3 sku 0.52004883
## 4 (Intercept) 0.04029649
## 5 description -0.34436016
## 6 brand -0.79437893
get_more_granular <- function(param, multiplier = 3){
lesser_seq <- seq(param, multiplier * param, param)
greater_seq <- seq((param / multiplier), param, param / multiplier)
sort(unique(c(lesser_seq, greater_seq)))
}
new_lm_grid <- expand.grid(alpha=get_more_granular(lmod$bestTune$alpha),
lambda=get_more_granular(lmod$bestTune$lambda))
(new_lmod <- train(train_y ~ .,
data = train_set,
method = "glmnet",
weights = model_class_weights,
trControl = lm_control,
tuneGrid = new_lm_grid))
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. AUC will be used instead.
## + Fold1.Rep1: alpha=0.3333, lambda=0.003
## - Fold1.Rep1: alpha=0.3333, lambda=0.003
## + Fold1.Rep1: alpha=0.6667, lambda=0.003
## - Fold1.Rep1: alpha=0.6667, lambda=0.003
## + Fold1.Rep1: alpha=1.0000, lambda=0.003
## - Fold1.Rep1: alpha=1.0000, lambda=0.003
## + Fold1.Rep1: alpha=2.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold1.Rep1: alpha=2.0000, lambda=0.003
## + Fold1.Rep1: alpha=3.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold1.Rep1: alpha=3.0000, lambda=0.003
## + Fold2.Rep1: alpha=0.3333, lambda=0.003
## - Fold2.Rep1: alpha=0.3333, lambda=0.003
## + Fold2.Rep1: alpha=0.6667, lambda=0.003
## - Fold2.Rep1: alpha=0.6667, lambda=0.003
## + Fold2.Rep1: alpha=1.0000, lambda=0.003
## - Fold2.Rep1: alpha=1.0000, lambda=0.003
## + Fold2.Rep1: alpha=2.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold2.Rep1: alpha=2.0000, lambda=0.003
## + Fold2.Rep1: alpha=3.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold2.Rep1: alpha=3.0000, lambda=0.003
## + Fold3.Rep1: alpha=0.3333, lambda=0.003
## - Fold3.Rep1: alpha=0.3333, lambda=0.003
## + Fold3.Rep1: alpha=0.6667, lambda=0.003
## - Fold3.Rep1: alpha=0.6667, lambda=0.003
## + Fold3.Rep1: alpha=1.0000, lambda=0.003
## - Fold3.Rep1: alpha=1.0000, lambda=0.003
## + Fold3.Rep1: alpha=2.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold3.Rep1: alpha=2.0000, lambda=0.003
## + Fold3.Rep1: alpha=3.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold3.Rep1: alpha=3.0000, lambda=0.003
## + Fold4.Rep1: alpha=0.3333, lambda=0.003
## - Fold4.Rep1: alpha=0.3333, lambda=0.003
## + Fold4.Rep1: alpha=0.6667, lambda=0.003
## - Fold4.Rep1: alpha=0.6667, lambda=0.003
## + Fold4.Rep1: alpha=1.0000, lambda=0.003
## - Fold4.Rep1: alpha=1.0000, lambda=0.003
## + Fold4.Rep1: alpha=2.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold4.Rep1: alpha=2.0000, lambda=0.003
## + Fold4.Rep1: alpha=3.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold4.Rep1: alpha=3.0000, lambda=0.003
## + Fold5.Rep1: alpha=0.3333, lambda=0.003
## - Fold5.Rep1: alpha=0.3333, lambda=0.003
## + Fold5.Rep1: alpha=0.6667, lambda=0.003
## - Fold5.Rep1: alpha=0.6667, lambda=0.003
## + Fold5.Rep1: alpha=1.0000, lambda=0.003
## - Fold5.Rep1: alpha=1.0000, lambda=0.003
## + Fold5.Rep1: alpha=2.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold5.Rep1: alpha=2.0000, lambda=0.003
## + Fold5.Rep1: alpha=3.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold5.Rep1: alpha=3.0000, lambda=0.003
## + Fold1.Rep2: alpha=0.3333, lambda=0.003
## - Fold1.Rep2: alpha=0.3333, lambda=0.003
## + Fold1.Rep2: alpha=0.6667, lambda=0.003
## - Fold1.Rep2: alpha=0.6667, lambda=0.003
## + Fold1.Rep2: alpha=1.0000, lambda=0.003
## - Fold1.Rep2: alpha=1.0000, lambda=0.003
## + Fold1.Rep2: alpha=2.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold1.Rep2: alpha=2.0000, lambda=0.003
## + Fold1.Rep2: alpha=3.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold1.Rep2: alpha=3.0000, lambda=0.003
## + Fold2.Rep2: alpha=0.3333, lambda=0.003
## - Fold2.Rep2: alpha=0.3333, lambda=0.003
## + Fold2.Rep2: alpha=0.6667, lambda=0.003
## - Fold2.Rep2: alpha=0.6667, lambda=0.003
## + Fold2.Rep2: alpha=1.0000, lambda=0.003
## - Fold2.Rep2: alpha=1.0000, lambda=0.003
## + Fold2.Rep2: alpha=2.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold2.Rep2: alpha=2.0000, lambda=0.003
## + Fold2.Rep2: alpha=3.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold2.Rep2: alpha=3.0000, lambda=0.003
## + Fold3.Rep2: alpha=0.3333, lambda=0.003
## - Fold3.Rep2: alpha=0.3333, lambda=0.003
## + Fold3.Rep2: alpha=0.6667, lambda=0.003
## - Fold3.Rep2: alpha=0.6667, lambda=0.003
## + Fold3.Rep2: alpha=1.0000, lambda=0.003
## - Fold3.Rep2: alpha=1.0000, lambda=0.003
## + Fold3.Rep2: alpha=2.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold3.Rep2: alpha=2.0000, lambda=0.003
## + Fold3.Rep2: alpha=3.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold3.Rep2: alpha=3.0000, lambda=0.003
## + Fold4.Rep2: alpha=0.3333, lambda=0.003
## - Fold4.Rep2: alpha=0.3333, lambda=0.003
## + Fold4.Rep2: alpha=0.6667, lambda=0.003
## - Fold4.Rep2: alpha=0.6667, lambda=0.003
## + Fold4.Rep2: alpha=1.0000, lambda=0.003
## - Fold4.Rep2: alpha=1.0000, lambda=0.003
## + Fold4.Rep2: alpha=2.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold4.Rep2: alpha=2.0000, lambda=0.003
## + Fold4.Rep2: alpha=3.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold4.Rep2: alpha=3.0000, lambda=0.003
## + Fold5.Rep2: alpha=0.3333, lambda=0.003
## - Fold5.Rep2: alpha=0.3333, lambda=0.003
## + Fold5.Rep2: alpha=0.6667, lambda=0.003
## - Fold5.Rep2: alpha=0.6667, lambda=0.003
## + Fold5.Rep2: alpha=1.0000, lambda=0.003
## - Fold5.Rep2: alpha=1.0000, lambda=0.003
## + Fold5.Rep2: alpha=2.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold5.Rep2: alpha=2.0000, lambda=0.003
## + Fold5.Rep2: alpha=3.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold5.Rep2: alpha=3.0000, lambda=0.003
## + Fold1.Rep3: alpha=0.3333, lambda=0.003
## - Fold1.Rep3: alpha=0.3333, lambda=0.003
## + Fold1.Rep3: alpha=0.6667, lambda=0.003
## - Fold1.Rep3: alpha=0.6667, lambda=0.003
## + Fold1.Rep3: alpha=1.0000, lambda=0.003
## - Fold1.Rep3: alpha=1.0000, lambda=0.003
## + Fold1.Rep3: alpha=2.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold1.Rep3: alpha=2.0000, lambda=0.003
## + Fold1.Rep3: alpha=3.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold1.Rep3: alpha=3.0000, lambda=0.003
## + Fold2.Rep3: alpha=0.3333, lambda=0.003
## - Fold2.Rep3: alpha=0.3333, lambda=0.003
## + Fold2.Rep3: alpha=0.6667, lambda=0.003
## - Fold2.Rep3: alpha=0.6667, lambda=0.003
## + Fold2.Rep3: alpha=1.0000, lambda=0.003
## - Fold2.Rep3: alpha=1.0000, lambda=0.003
## + Fold2.Rep3: alpha=2.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold2.Rep3: alpha=2.0000, lambda=0.003
## + Fold2.Rep3: alpha=3.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold2.Rep3: alpha=3.0000, lambda=0.003
## + Fold3.Rep3: alpha=0.3333, lambda=0.003
## - Fold3.Rep3: alpha=0.3333, lambda=0.003
## + Fold3.Rep3: alpha=0.6667, lambda=0.003
## - Fold3.Rep3: alpha=0.6667, lambda=0.003
## + Fold3.Rep3: alpha=1.0000, lambda=0.003
## - Fold3.Rep3: alpha=1.0000, lambda=0.003
## + Fold3.Rep3: alpha=2.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold3.Rep3: alpha=2.0000, lambda=0.003
## + Fold3.Rep3: alpha=3.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold3.Rep3: alpha=3.0000, lambda=0.003
## + Fold4.Rep3: alpha=0.3333, lambda=0.003
## - Fold4.Rep3: alpha=0.3333, lambda=0.003
## + Fold4.Rep3: alpha=0.6667, lambda=0.003
## - Fold4.Rep3: alpha=0.6667, lambda=0.003
## + Fold4.Rep3: alpha=1.0000, lambda=0.003
## - Fold4.Rep3: alpha=1.0000, lambda=0.003
## + Fold4.Rep3: alpha=2.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold4.Rep3: alpha=2.0000, lambda=0.003
## + Fold4.Rep3: alpha=3.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold4.Rep3: alpha=3.0000, lambda=0.003
## + Fold5.Rep3: alpha=0.3333, lambda=0.003
## - Fold5.Rep3: alpha=0.3333, lambda=0.003
## + Fold5.Rep3: alpha=0.6667, lambda=0.003
## - Fold5.Rep3: alpha=0.6667, lambda=0.003
## + Fold5.Rep3: alpha=1.0000, lambda=0.003
## - Fold5.Rep3: alpha=1.0000, lambda=0.003
## + Fold5.Rep3: alpha=2.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold5.Rep3: alpha=2.0000, lambda=0.003
## + Fold5.Rep3: alpha=3.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold5.Rep3: alpha=3.0000, lambda=0.003
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 1, lambda = 0.000667 on full training set
## glmnet
##
## 82311 samples
## 5 predictor
## 2 classes: 'X0', 'X1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 65849, 65848, 65849, 65849, 65849, 65850, ...
## Resampling results across tuning parameters:
##
## alpha lambda AUC Precision Recall F
## 0.3333333 0.0003333333 0.8704721 0.8127019 0.8588503 0.8351334
## 0.3333333 0.0006666667 0.8704721 0.8127019 0.8588503 0.8351334
## 0.3333333 0.0010000000 0.8704380 0.8126824 0.8588357 0.8351161
## 0.3333333 0.0020000000 0.8701167 0.8120109 0.8594290 0.8350420
## 0.3333333 0.0030000000 0.8697743 0.8115535 0.8598244 0.8349865
## 0.6666667 0.0003333333 0.8704840 0.8127384 0.8588357 0.8351460
## 0.6666667 0.0006666667 0.8704840 0.8127384 0.8588357 0.8351460
## 0.6666667 0.0010000000 0.8704416 0.8126727 0.8589749 0.8351770
## 0.6666667 0.0020000000 0.8701193 0.8119546 0.8594290 0.8350120
## 0.6666667 0.0030000000 0.8697732 0.8113505 0.8601028 0.8350104
## 1.0000000 0.0003333333 0.8704884 0.8126965 0.8589529 0.8351793
## 1.0000000 0.0006666667 0.8704884 0.8126965 0.8589529 0.8351793
## 1.0000000 0.0010000000 0.8704442 0.8125997 0.8590408 0.8351698
## 1.0000000 0.0020000000 0.8701153 0.8119203 0.8595535 0.8350529
## 1.0000000 0.0030000000 0.8697664 0.8112376 0.8601614 0.8349784
## 2.0000000 0.0003333333 0.8704884 0.8126965 0.8589529 0.8351793
## 2.0000000 0.0006666667 0.8704884 0.8126965 0.8589529 0.8351793
## 2.0000000 0.0010000000 0.8704442 0.8125997 0.8590408 0.8351698
## 2.0000000 0.0020000000 0.8701153 0.8119203 0.8595535 0.8350529
## 2.0000000 0.0030000000 0.8697664 0.8112376 0.8601614 0.8349784
## 3.0000000 0.0003333333 0.8704884 0.8126965 0.8589529 0.8351793
## 3.0000000 0.0006666667 0.8704884 0.8126965 0.8589529 0.8351793
## 3.0000000 0.0010000000 0.8704442 0.8125997 0.8590408 0.8351698
## 3.0000000 0.0020000000 0.8701153 0.8119203 0.8595535 0.8350529
## 3.0000000 0.0030000000 0.8697664 0.8112376 0.8601614 0.8349784
##
## AUC was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 1 and lambda
## = 0.0006666667.
(new_lmod$bestTune)
## alpha lambda
## 12 1 0.0006666667
print(new_lmod$bestTune)
## alpha lambda
## 12 1 0.0006666667
# https://stats.stackexchange.com/questions/67827/caret-and-coefficients-glmnet
y_test_pred <- predict(new_lmod, test_features)
confusionMatrix(y_test_pred, test_y, mode = "prec_recall", positive = "X1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction X0 X1
## X0 1028 223
## X1 566 375
##
## Accuracy : 0.6401
## 95% CI : (0.6196, 0.6602)
## No Information Rate : 0.7272
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2307
## Mcnemar's Test P-Value : <2e-16
##
## Precision : 0.3985
## Recall : 0.6271
## F1 : 0.4873
## Prevalence : 0.2728
## Detection Rate : 0.1711
## Detection Prevalence : 0.4293
## Balanced Accuracy : 0.6360
##
## 'Positive' Class : X1
##
(lmod_var_imp <- varImp(lmod))
## glmnet variable importance
##
## Overall
## name 100.00
## brand 41.81
## mpn 41.55
## sku 16.32
## description 0.00
plot(lmod_var_imp)

data.frame(Coefficient = coef(new_lmod$finalModel, new_lmod$bestTune$lambda)[, 1]) %>%
tibble::rownames_to_column(., "Feature") %>%
arrange(-Coefficient)
## Feature Coefficient
## 1 name 1.42411835
## 2 mpn 0.79397556
## 3 sku 0.52253055
## 4 (Intercept) 0.04069206
## 5 description -0.34489731
## 6 brand -0.79873749
glmod <- glm(train_y ~ ., family = "binomial", data = train_set)
BIC_glmod <- step(glmod, k = log(nrow(train_set)), trace = 0)
summary(BIC_glmod)
##
## Call:
## glm(formula = train_y ~ mpn + sku + brand + name + description,
## family = "binomial", data = train_set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.1818 -0.6487 -0.3556 0.4481 3.1050
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.164894 0.009838 -16.76 <2e-16 ***
## mpn 0.820504 0.011818 69.43 <2e-16 ***
## sku 0.540610 0.011521 46.93 <2e-16 ***
## brand -0.799630 0.013642 -58.62 <2e-16 ***
## name 1.437188 0.014593 98.49 <2e-16 ***
## description -0.354340 0.010059 -35.23 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 113183 on 82310 degrees of freedom
## Residual deviance: 68797 on 82305 degrees of freedom
## AIC: 68809
##
## Number of Fisher Scoring iterations: 5
BIC_glmod_y_test_pred <- as.factor(ifelse(predict(BIC_glmod, test_features, type = "response") > .5, "X0", "X1"))
summary(BIC_glmod_y_test_pred)
## X0 X1
## 848 1344
confusionMatrix(BIC_glmod_y_test_pred, test_y, mode = "prec_recall", positive = "X1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction X0 X1
## X0 502 346
## X1 1092 252
##
## Accuracy : 0.344
## 95% CI : (0.3241, 0.3643)
## No Information Rate : 0.7272
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.1897
## Mcnemar's Test P-Value : <2e-16
##
## Precision : 0.1875
## Recall : 0.4214
## F1 : 0.2595
## Prevalence : 0.2728
## Detection Rate : 0.1150
## Detection Prevalence : 0.6131
## Balanced Accuracy : 0.3682
##
## 'Positive' Class : X1
##
car::vif(BIC_glmod)
## mpn sku brand name description
## 1.299525 1.086496 1.760883 1.638336 1.058042