symbolic_similarity_features <- read.csv("D:/Documents/Large-Scale Product Matching/symbolic_similarity_features.csv")

SHORT_TEXT_FEATURES = c('gtin', 'mpn', 'sku', 'identifier', 'brand', 'manufacturer')
MEDIUM_LONG_TEXT_FEATURES = c('name', 'description')
OTHER_FEATURES = c('price')  # 'category', 
ALL_FEATURES = c(SHORT_TEXT_FEATURES, MEDIUM_LONG_TEXT_FEATURES, OTHER_FEATURES)


symbolic_similarity_features_only <- symbolic_similarity_features[, ALL_FEATURES]

all_y <- as.factor(make.names(as.character(symbolic_similarity_features$label)))
train_indices <- symbolic_similarity_features$dataset == "train"
str(symbolic_similarity_features_only)
## 'data.frame':    84503 obs. of  9 variables:
##  $ gtin        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ mpn         : num  0.444 0.444 0.444 0.444 0.222 ...
##  $ sku         : num  0 0 0 0 0 ...
##  $ identifier  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ brand       : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ manufacturer: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ name        : num  0.4 0.492 0.493 0.474 0.355 ...
##  $ description : num  0 0.857 0.865 0 0 ...
##  $ price       : num  0 0 0 0 0 0 0 0 0 0 ...
(preprocess_features <- preProcess(symbolic_similarity_features_only,
                                   method = c("nzv", "corr", "center", "scale")))
## Warning in preProcess.default(symbolic_similarity_features_only, method = c("nzv", : correlation matrix could not be computed:
##  1correlation matrix could not be computed:
##  -0.0206245283262564correlation matrix could not be computed:
##  0.427263954386876correlation matrix could not be computed:
##  0.570220930815773correlation matrix could not be computed:
##  -0.245583195038904correlation matrix could not be computed:
##  -0.0206245283262564correlation matrix could not be computed:
##  1correlation matrix could not be computed:
##  0.184388178142601correlation matrix could not be computed:
##  0.285476367681447correlation matrix could not be computed:
##  -0.0979283455193642correlation matrix could not be computed:
##  0.427263954386876correlation matrix could not be computed:
##  0.184388178142601correlation matrix could not be computed:
##  1correlation matrix could not be computed:
##  0.484829331875313correlation matrix could not be computed:
##  0.0394430693841353correlation matrix could not be computed:
##  0.570220930815773correlation matrix could not be computed:
##  0.285476367681447correlation matrix could not be computed:
##  0.484829331875313correlation matrix could not be computed:
##  1correlation matrix could not be computed:
##  -0.335576442098219correlation matrix could not be computed:
##  -0.245583195038904correlation matrix could not be computed:
##  -0.0979283455193642correlation matrix could not be computed:
##  0.0394430693841353correlation matrix could not be computed:
##  -0.335576442098219correlation matrix could not be computed:
##  1
## Created from 84503 samples and 9 variables
## 
## Pre-processing:
##   - centered (5)
##   - ignored (0)
##   - removed (4)
##   - scaled (5)
processed_features <- predict(preprocess_features, symbolic_similarity_features_only)

highly_correlated <- findCorrelation(cor(symbolic_similarity_features_only))
low_var_features <- nearZeroVar(symbolic_similarity_features_only, names = T)

print(preprocess_features$method$remove)
## [1] "gtin"         "identifier"   "manufacturer" "price"
train_features <- processed_features[train_indices, ]
test_features <- processed_features[!train_indices, ]

train_y <- all_y[train_indices]
test_y <- all_y[!train_indices]

train_set <- cbind(train_features, train_y)
set.seed(5)
lm_control <- trainControl(
  method = "repeatedcv",
  number = 5, #num_cvs,
  repeats = 3,
  classProbs = T,
  allowParallel = T,
  verboseIter = T,
  savePredictions = "final",
  summaryFunction = prSummary
)


class_freq <- table(train_y)
more_weighting <- 1
model_class_weights <- ifelse(train_y == names(class_freq[1]),
                           1 / class_freq[1] * (1 / more_weighting),
                           1 / class_freq[2] * more_weighting)

print(max(model_class_weights) / min(model_class_weights))
## [1] 1.236773
lm_grid <-  expand.grid(alpha=10^c(-3:3),
                        lambda=10^c(-3:3))


(lmod <- train(train_y ~ .,
                   data = train_set,
                   method = "glmnet",
                   weights = model_class_weights,
                   trControl = lm_control,
                   tuneGrid = lm_grid))
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. AUC will be used instead.
## + Fold1.Rep1: alpha=1e-03, lambda=1000 
## - Fold1.Rep1: alpha=1e-03, lambda=1000 
## + Fold1.Rep1: alpha=1e-02, lambda=1000 
## - Fold1.Rep1: alpha=1e-02, lambda=1000 
## + Fold1.Rep1: alpha=1e-01, lambda=1000 
## - Fold1.Rep1: alpha=1e-01, lambda=1000 
## + Fold1.Rep1: alpha=1e+00, lambda=1000 
## - Fold1.Rep1: alpha=1e+00, lambda=1000 
## + Fold1.Rep1: alpha=1e+01, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold1.Rep1: alpha=1e+01, lambda=1000 
## + Fold1.Rep1: alpha=1e+02, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold1.Rep1: alpha=1e+02, lambda=1000 
## + Fold1.Rep1: alpha=1e+03, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold1.Rep1: alpha=1e+03, lambda=1000 
## + Fold2.Rep1: alpha=1e-03, lambda=1000 
## - Fold2.Rep1: alpha=1e-03, lambda=1000 
## + Fold2.Rep1: alpha=1e-02, lambda=1000 
## - Fold2.Rep1: alpha=1e-02, lambda=1000 
## + Fold2.Rep1: alpha=1e-01, lambda=1000 
## - Fold2.Rep1: alpha=1e-01, lambda=1000 
## + Fold2.Rep1: alpha=1e+00, lambda=1000 
## - Fold2.Rep1: alpha=1e+00, lambda=1000 
## + Fold2.Rep1: alpha=1e+01, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold2.Rep1: alpha=1e+01, lambda=1000 
## + Fold2.Rep1: alpha=1e+02, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold2.Rep1: alpha=1e+02, lambda=1000 
## + Fold2.Rep1: alpha=1e+03, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold2.Rep1: alpha=1e+03, lambda=1000 
## + Fold3.Rep1: alpha=1e-03, lambda=1000 
## - Fold3.Rep1: alpha=1e-03, lambda=1000 
## + Fold3.Rep1: alpha=1e-02, lambda=1000 
## - Fold3.Rep1: alpha=1e-02, lambda=1000 
## + Fold3.Rep1: alpha=1e-01, lambda=1000 
## - Fold3.Rep1: alpha=1e-01, lambda=1000 
## + Fold3.Rep1: alpha=1e+00, lambda=1000 
## - Fold3.Rep1: alpha=1e+00, lambda=1000 
## + Fold3.Rep1: alpha=1e+01, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold3.Rep1: alpha=1e+01, lambda=1000 
## + Fold3.Rep1: alpha=1e+02, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold3.Rep1: alpha=1e+02, lambda=1000 
## + Fold3.Rep1: alpha=1e+03, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold3.Rep1: alpha=1e+03, lambda=1000 
## + Fold4.Rep1: alpha=1e-03, lambda=1000 
## - Fold4.Rep1: alpha=1e-03, lambda=1000 
## + Fold4.Rep1: alpha=1e-02, lambda=1000 
## - Fold4.Rep1: alpha=1e-02, lambda=1000 
## + Fold4.Rep1: alpha=1e-01, lambda=1000 
## - Fold4.Rep1: alpha=1e-01, lambda=1000 
## + Fold4.Rep1: alpha=1e+00, lambda=1000 
## - Fold4.Rep1: alpha=1e+00, lambda=1000 
## + Fold4.Rep1: alpha=1e+01, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold4.Rep1: alpha=1e+01, lambda=1000 
## + Fold4.Rep1: alpha=1e+02, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold4.Rep1: alpha=1e+02, lambda=1000 
## + Fold4.Rep1: alpha=1e+03, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold4.Rep1: alpha=1e+03, lambda=1000 
## + Fold5.Rep1: alpha=1e-03, lambda=1000 
## - Fold5.Rep1: alpha=1e-03, lambda=1000 
## + Fold5.Rep1: alpha=1e-02, lambda=1000 
## - Fold5.Rep1: alpha=1e-02, lambda=1000 
## + Fold5.Rep1: alpha=1e-01, lambda=1000 
## - Fold5.Rep1: alpha=1e-01, lambda=1000 
## + Fold5.Rep1: alpha=1e+00, lambda=1000 
## - Fold5.Rep1: alpha=1e+00, lambda=1000 
## + Fold5.Rep1: alpha=1e+01, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold5.Rep1: alpha=1e+01, lambda=1000 
## + Fold5.Rep1: alpha=1e+02, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold5.Rep1: alpha=1e+02, lambda=1000 
## + Fold5.Rep1: alpha=1e+03, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold5.Rep1: alpha=1e+03, lambda=1000 
## + Fold1.Rep2: alpha=1e-03, lambda=1000 
## - Fold1.Rep2: alpha=1e-03, lambda=1000 
## + Fold1.Rep2: alpha=1e-02, lambda=1000 
## - Fold1.Rep2: alpha=1e-02, lambda=1000 
## + Fold1.Rep2: alpha=1e-01, lambda=1000 
## - Fold1.Rep2: alpha=1e-01, lambda=1000 
## + Fold1.Rep2: alpha=1e+00, lambda=1000 
## - Fold1.Rep2: alpha=1e+00, lambda=1000 
## + Fold1.Rep2: alpha=1e+01, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold1.Rep2: alpha=1e+01, lambda=1000 
## + Fold1.Rep2: alpha=1e+02, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold1.Rep2: alpha=1e+02, lambda=1000 
## + Fold1.Rep2: alpha=1e+03, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold1.Rep2: alpha=1e+03, lambda=1000 
## + Fold2.Rep2: alpha=1e-03, lambda=1000 
## - Fold2.Rep2: alpha=1e-03, lambda=1000 
## + Fold2.Rep2: alpha=1e-02, lambda=1000 
## - Fold2.Rep2: alpha=1e-02, lambda=1000 
## + Fold2.Rep2: alpha=1e-01, lambda=1000 
## - Fold2.Rep2: alpha=1e-01, lambda=1000 
## + Fold2.Rep2: alpha=1e+00, lambda=1000 
## - Fold2.Rep2: alpha=1e+00, lambda=1000 
## + Fold2.Rep2: alpha=1e+01, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold2.Rep2: alpha=1e+01, lambda=1000 
## + Fold2.Rep2: alpha=1e+02, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold2.Rep2: alpha=1e+02, lambda=1000 
## + Fold2.Rep2: alpha=1e+03, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold2.Rep2: alpha=1e+03, lambda=1000 
## + Fold3.Rep2: alpha=1e-03, lambda=1000 
## - Fold3.Rep2: alpha=1e-03, lambda=1000 
## + Fold3.Rep2: alpha=1e-02, lambda=1000 
## - Fold3.Rep2: alpha=1e-02, lambda=1000 
## + Fold3.Rep2: alpha=1e-01, lambda=1000 
## - Fold3.Rep2: alpha=1e-01, lambda=1000 
## + Fold3.Rep2: alpha=1e+00, lambda=1000 
## - Fold3.Rep2: alpha=1e+00, lambda=1000 
## + Fold3.Rep2: alpha=1e+01, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold3.Rep2: alpha=1e+01, lambda=1000 
## + Fold3.Rep2: alpha=1e+02, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold3.Rep2: alpha=1e+02, lambda=1000 
## + Fold3.Rep2: alpha=1e+03, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold3.Rep2: alpha=1e+03, lambda=1000 
## + Fold4.Rep2: alpha=1e-03, lambda=1000 
## - Fold4.Rep2: alpha=1e-03, lambda=1000 
## + Fold4.Rep2: alpha=1e-02, lambda=1000 
## - Fold4.Rep2: alpha=1e-02, lambda=1000 
## + Fold4.Rep2: alpha=1e-01, lambda=1000 
## - Fold4.Rep2: alpha=1e-01, lambda=1000 
## + Fold4.Rep2: alpha=1e+00, lambda=1000 
## - Fold4.Rep2: alpha=1e+00, lambda=1000 
## + Fold4.Rep2: alpha=1e+01, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold4.Rep2: alpha=1e+01, lambda=1000 
## + Fold4.Rep2: alpha=1e+02, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold4.Rep2: alpha=1e+02, lambda=1000 
## + Fold4.Rep2: alpha=1e+03, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold4.Rep2: alpha=1e+03, lambda=1000 
## + Fold5.Rep2: alpha=1e-03, lambda=1000 
## - Fold5.Rep2: alpha=1e-03, lambda=1000 
## + Fold5.Rep2: alpha=1e-02, lambda=1000 
## - Fold5.Rep2: alpha=1e-02, lambda=1000 
## + Fold5.Rep2: alpha=1e-01, lambda=1000 
## - Fold5.Rep2: alpha=1e-01, lambda=1000 
## + Fold5.Rep2: alpha=1e+00, lambda=1000 
## - Fold5.Rep2: alpha=1e+00, lambda=1000 
## + Fold5.Rep2: alpha=1e+01, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold5.Rep2: alpha=1e+01, lambda=1000 
## + Fold5.Rep2: alpha=1e+02, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold5.Rep2: alpha=1e+02, lambda=1000 
## + Fold5.Rep2: alpha=1e+03, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold5.Rep2: alpha=1e+03, lambda=1000 
## + Fold1.Rep3: alpha=1e-03, lambda=1000 
## - Fold1.Rep3: alpha=1e-03, lambda=1000 
## + Fold1.Rep3: alpha=1e-02, lambda=1000 
## - Fold1.Rep3: alpha=1e-02, lambda=1000 
## + Fold1.Rep3: alpha=1e-01, lambda=1000 
## - Fold1.Rep3: alpha=1e-01, lambda=1000 
## + Fold1.Rep3: alpha=1e+00, lambda=1000 
## - Fold1.Rep3: alpha=1e+00, lambda=1000 
## + Fold1.Rep3: alpha=1e+01, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold1.Rep3: alpha=1e+01, lambda=1000 
## + Fold1.Rep3: alpha=1e+02, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold1.Rep3: alpha=1e+02, lambda=1000 
## + Fold1.Rep3: alpha=1e+03, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold1.Rep3: alpha=1e+03, lambda=1000 
## + Fold2.Rep3: alpha=1e-03, lambda=1000 
## - Fold2.Rep3: alpha=1e-03, lambda=1000 
## + Fold2.Rep3: alpha=1e-02, lambda=1000 
## - Fold2.Rep3: alpha=1e-02, lambda=1000 
## + Fold2.Rep3: alpha=1e-01, lambda=1000 
## - Fold2.Rep3: alpha=1e-01, lambda=1000 
## + Fold2.Rep3: alpha=1e+00, lambda=1000 
## - Fold2.Rep3: alpha=1e+00, lambda=1000 
## + Fold2.Rep3: alpha=1e+01, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold2.Rep3: alpha=1e+01, lambda=1000 
## + Fold2.Rep3: alpha=1e+02, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold2.Rep3: alpha=1e+02, lambda=1000 
## + Fold2.Rep3: alpha=1e+03, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold2.Rep3: alpha=1e+03, lambda=1000 
## + Fold3.Rep3: alpha=1e-03, lambda=1000 
## - Fold3.Rep3: alpha=1e-03, lambda=1000 
## + Fold3.Rep3: alpha=1e-02, lambda=1000 
## - Fold3.Rep3: alpha=1e-02, lambda=1000 
## + Fold3.Rep3: alpha=1e-01, lambda=1000 
## - Fold3.Rep3: alpha=1e-01, lambda=1000 
## + Fold3.Rep3: alpha=1e+00, lambda=1000 
## - Fold3.Rep3: alpha=1e+00, lambda=1000 
## + Fold3.Rep3: alpha=1e+01, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold3.Rep3: alpha=1e+01, lambda=1000 
## + Fold3.Rep3: alpha=1e+02, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold3.Rep3: alpha=1e+02, lambda=1000 
## + Fold3.Rep3: alpha=1e+03, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold3.Rep3: alpha=1e+03, lambda=1000 
## + Fold4.Rep3: alpha=1e-03, lambda=1000 
## - Fold4.Rep3: alpha=1e-03, lambda=1000 
## + Fold4.Rep3: alpha=1e-02, lambda=1000 
## - Fold4.Rep3: alpha=1e-02, lambda=1000 
## + Fold4.Rep3: alpha=1e-01, lambda=1000 
## - Fold4.Rep3: alpha=1e-01, lambda=1000 
## + Fold4.Rep3: alpha=1e+00, lambda=1000 
## - Fold4.Rep3: alpha=1e+00, lambda=1000 
## + Fold4.Rep3: alpha=1e+01, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold4.Rep3: alpha=1e+01, lambda=1000 
## + Fold4.Rep3: alpha=1e+02, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold4.Rep3: alpha=1e+02, lambda=1000 
## + Fold4.Rep3: alpha=1e+03, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold4.Rep3: alpha=1e+03, lambda=1000 
## + Fold5.Rep3: alpha=1e-03, lambda=1000 
## - Fold5.Rep3: alpha=1e-03, lambda=1000 
## + Fold5.Rep3: alpha=1e-02, lambda=1000 
## - Fold5.Rep3: alpha=1e-02, lambda=1000 
## + Fold5.Rep3: alpha=1e-01, lambda=1000 
## - Fold5.Rep3: alpha=1e-01, lambda=1000 
## + Fold5.Rep3: alpha=1e+00, lambda=1000 
## - Fold5.Rep3: alpha=1e+00, lambda=1000 
## + Fold5.Rep3: alpha=1e+01, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold5.Rep3: alpha=1e+01, lambda=1000 
## + Fold5.Rep3: alpha=1e+02, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold5.Rep3: alpha=1e+02, lambda=1000 
## + Fold5.Rep3: alpha=1e+03, lambda=1000
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold5.Rep3: alpha=1e+03, lambda=1000
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 1, lambda = 0.001 on full training set
## glmnet 
## 
## 82311 samples
##     5 predictor
##     2 classes: 'X0', 'X1' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times) 
## Summary of sample sizes: 65848, 65849, 65849, 65850, 65848, 65849, ... 
## Resampling results across tuning parameters:
## 
##   alpha  lambda  AUC        Precision  Recall     F        
##   1e-03  1e-03   0.8614288  0.8060006  0.8679396  0.8358208
##   1e-03  1e-02   0.8614288  0.8060006  0.8679396  0.8358208
##   1e-03  1e-01   0.8474832  0.7973607  0.8774536  0.8354878
##   1e-03  1e+00   0.8220325  0.7853674  0.8869895  0.8330865
##   1e-03  1e+01   0.8157200  0.7812838  0.8859934  0.8303464
##   1e-03  1e+02   0.8209812  0.7856607  0.8835399  0.8317266
##   1e-03  1e+03   0.0000000  0.5529097  0.4666667  0.7120951
##   1e-02  1e-03   0.8698132  0.8116686  0.8596268  0.8349540
##   1e-02  1e-02   0.8674663  0.8093901  0.8625125  0.8351020
##   1e-02  1e-01   0.8471700  0.7971651  0.8776880  0.8354868
##   1e-02  1e+00   0.8236007  0.7860227  0.8876267  0.8337367
##   1e-02  1e+01   0.8207496  0.7855453  0.8836204  0.8316985
##   1e-02  1e+02   0.0000000  0.5529097  0.4666667  0.7120951
##   1e-02  1e+03   0.0000000  0.5529097  0.4666667  0.7120951
##   1e-01  1e-03   0.8704439  0.8125947  0.8587259  0.8350181
##   1e-01  1e-02   0.8674341  0.8094027  0.8625857  0.8351430
##   1e-01  1e-01   0.8437170  0.7944304  0.8782446  0.8342344
##   1e-01  1e+00   0.8180077  0.7845709  0.8818114  0.8303501
##   1e-01  1e+01   0.0000000  0.5529097  0.4666667  0.7120951
##   1e-01  1e+02   0.0000000  0.5529097  0.4666667  0.7120951
##   1e-01  1e+03   0.0000000  0.5529097  0.4666667  0.7120951
##   1e+00  1e-03   0.8704508  0.8126176  0.8589163  0.8351201
##   1e+00  1e-02   0.8667924  0.8083617  0.8645779  0.8355205
##   1e+00  1e-01   0.8128153  0.7605660  0.8581033  0.8063918
##   1e+00  1e+00   0.0000000  0.5529097  0.4666667  0.7120951
##   1e+00  1e+01   0.0000000  0.5529097  0.4666667  0.7120951
##   1e+00  1e+02   0.0000000  0.5529097  0.4666667  0.7120951
##   1e+00  1e+03   0.0000000  0.5529097  0.4666667  0.7120951
##   1e+01  1e-03   0.8704508  0.8126176  0.8589163  0.8351201
##   1e+01  1e-02   0.8667924  0.8083617  0.8645779  0.8355205
##   1e+01  1e-01   0.8128153  0.7605660  0.8581033  0.8063918
##   1e+01  1e+00   0.0000000  0.5529097  0.4666667  0.7120951
##   1e+01  1e+01   0.0000000  0.5529097  0.4666667  0.7120951
##   1e+01  1e+02   0.0000000  0.5529097  0.4666667  0.7120951
##   1e+01  1e+03   0.0000000  0.5529097  0.4666667  0.7120951
##   1e+02  1e-03   0.8704508  0.8126176  0.8589163  0.8351201
##   1e+02  1e-02   0.8667924  0.8083617  0.8645779  0.8355205
##   1e+02  1e-01   0.8128153  0.7605660  0.8581033  0.8063918
##   1e+02  1e+00   0.0000000  0.5529097  0.4666667  0.7120951
##   1e+02  1e+01   0.0000000  0.5529097  0.4666667  0.7120951
##   1e+02  1e+02   0.0000000  0.5529097  0.4666667  0.7120951
##   1e+02  1e+03   0.0000000  0.5529097  0.4666667  0.7120951
##   1e+03  1e-03   0.8704508  0.8126176  0.8589163  0.8351201
##   1e+03  1e-02   0.8667924  0.8083617  0.8645779  0.8355205
##   1e+03  1e-01   0.8128153  0.7605660  0.8581033  0.8063918
##   1e+03  1e+00   0.0000000  0.5529097  0.4666667  0.7120951
##   1e+03  1e+01   0.0000000  0.5529097  0.4666667  0.7120951
##   1e+03  1e+02   0.0000000  0.5529097  0.4666667  0.7120951
##   1e+03  1e+03   0.0000000  0.5529097  0.4666667  0.7120951
## 
## AUC was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 1 and lambda = 0.001.
print(lmod$bestTune)
##    alpha lambda
## 22     1  0.001
# https://stats.stackexchange.com/questions/67827/caret-and-coefficients-glmnet
y_test_pred <- predict(lmod, test_features)

confusionMatrix(y_test_pred, test_y, mode = "prec_recall", positive = "X1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   X0   X1
##         X0 1028  223
##         X1  566  375
##                                           
##                Accuracy : 0.6401          
##                  95% CI : (0.6196, 0.6602)
##     No Information Rate : 0.7272          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2307          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##               Precision : 0.3985          
##                  Recall : 0.6271          
##                      F1 : 0.4873          
##              Prevalence : 0.2728          
##          Detection Rate : 0.1711          
##    Detection Prevalence : 0.4293          
##       Balanced Accuracy : 0.6360          
##                                           
##        'Positive' Class : X1              
## 
(lmod_var_imp <- varImp(lmod))
## glmnet variable importance
## 
##             Overall
## name         100.00
## brand         41.81
## mpn           41.55
## sku           16.32
## description    0.00
plot(lmod_var_imp)

data.frame(Coefficient = coef(lmod$finalModel, lmod$bestTune$lambda)[, 1]) %>% 
  tibble::rownames_to_column(., "Feature") %>% 
  arrange(-Coefficient) 
##       Feature Coefficient
## 1        name  1.42073902
## 2         mpn  0.79160021
## 3         sku  0.52004883
## 4 (Intercept)  0.04029649
## 5 description -0.34436016
## 6       brand -0.79437893
get_more_granular <- function(param, multiplier = 3){
  
  lesser_seq <- seq(param, multiplier * param, param)
  greater_seq <- seq((param / multiplier), param, param / multiplier)                
  sort(unique(c(lesser_seq, greater_seq)))

}


new_lm_grid <-  expand.grid(alpha=get_more_granular(lmod$bestTune$alpha),
                        lambda=get_more_granular(lmod$bestTune$lambda))

(new_lmod <- train(train_y ~ .,
                   data = train_set,
                   method = "glmnet",
                   weights = model_class_weights,
                   trControl = lm_control,
                   tuneGrid = new_lm_grid))
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. AUC will be used instead.
## + Fold1.Rep1: alpha=0.3333, lambda=0.003 
## - Fold1.Rep1: alpha=0.3333, lambda=0.003 
## + Fold1.Rep1: alpha=0.6667, lambda=0.003 
## - Fold1.Rep1: alpha=0.6667, lambda=0.003 
## + Fold1.Rep1: alpha=1.0000, lambda=0.003 
## - Fold1.Rep1: alpha=1.0000, lambda=0.003 
## + Fold1.Rep1: alpha=2.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold1.Rep1: alpha=2.0000, lambda=0.003 
## + Fold1.Rep1: alpha=3.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold1.Rep1: alpha=3.0000, lambda=0.003 
## + Fold2.Rep1: alpha=0.3333, lambda=0.003 
## - Fold2.Rep1: alpha=0.3333, lambda=0.003 
## + Fold2.Rep1: alpha=0.6667, lambda=0.003 
## - Fold2.Rep1: alpha=0.6667, lambda=0.003 
## + Fold2.Rep1: alpha=1.0000, lambda=0.003 
## - Fold2.Rep1: alpha=1.0000, lambda=0.003 
## + Fold2.Rep1: alpha=2.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold2.Rep1: alpha=2.0000, lambda=0.003 
## + Fold2.Rep1: alpha=3.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold2.Rep1: alpha=3.0000, lambda=0.003 
## + Fold3.Rep1: alpha=0.3333, lambda=0.003 
## - Fold3.Rep1: alpha=0.3333, lambda=0.003 
## + Fold3.Rep1: alpha=0.6667, lambda=0.003 
## - Fold3.Rep1: alpha=0.6667, lambda=0.003 
## + Fold3.Rep1: alpha=1.0000, lambda=0.003 
## - Fold3.Rep1: alpha=1.0000, lambda=0.003 
## + Fold3.Rep1: alpha=2.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold3.Rep1: alpha=2.0000, lambda=0.003 
## + Fold3.Rep1: alpha=3.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold3.Rep1: alpha=3.0000, lambda=0.003 
## + Fold4.Rep1: alpha=0.3333, lambda=0.003 
## - Fold4.Rep1: alpha=0.3333, lambda=0.003 
## + Fold4.Rep1: alpha=0.6667, lambda=0.003 
## - Fold4.Rep1: alpha=0.6667, lambda=0.003 
## + Fold4.Rep1: alpha=1.0000, lambda=0.003 
## - Fold4.Rep1: alpha=1.0000, lambda=0.003 
## + Fold4.Rep1: alpha=2.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold4.Rep1: alpha=2.0000, lambda=0.003 
## + Fold4.Rep1: alpha=3.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold4.Rep1: alpha=3.0000, lambda=0.003 
## + Fold5.Rep1: alpha=0.3333, lambda=0.003 
## - Fold5.Rep1: alpha=0.3333, lambda=0.003 
## + Fold5.Rep1: alpha=0.6667, lambda=0.003 
## - Fold5.Rep1: alpha=0.6667, lambda=0.003 
## + Fold5.Rep1: alpha=1.0000, lambda=0.003 
## - Fold5.Rep1: alpha=1.0000, lambda=0.003 
## + Fold5.Rep1: alpha=2.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold5.Rep1: alpha=2.0000, lambda=0.003 
## + Fold5.Rep1: alpha=3.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold5.Rep1: alpha=3.0000, lambda=0.003 
## + Fold1.Rep2: alpha=0.3333, lambda=0.003 
## - Fold1.Rep2: alpha=0.3333, lambda=0.003 
## + Fold1.Rep2: alpha=0.6667, lambda=0.003 
## - Fold1.Rep2: alpha=0.6667, lambda=0.003 
## + Fold1.Rep2: alpha=1.0000, lambda=0.003 
## - Fold1.Rep2: alpha=1.0000, lambda=0.003 
## + Fold1.Rep2: alpha=2.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold1.Rep2: alpha=2.0000, lambda=0.003 
## + Fold1.Rep2: alpha=3.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold1.Rep2: alpha=3.0000, lambda=0.003 
## + Fold2.Rep2: alpha=0.3333, lambda=0.003 
## - Fold2.Rep2: alpha=0.3333, lambda=0.003 
## + Fold2.Rep2: alpha=0.6667, lambda=0.003 
## - Fold2.Rep2: alpha=0.6667, lambda=0.003 
## + Fold2.Rep2: alpha=1.0000, lambda=0.003 
## - Fold2.Rep2: alpha=1.0000, lambda=0.003 
## + Fold2.Rep2: alpha=2.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold2.Rep2: alpha=2.0000, lambda=0.003 
## + Fold2.Rep2: alpha=3.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold2.Rep2: alpha=3.0000, lambda=0.003 
## + Fold3.Rep2: alpha=0.3333, lambda=0.003 
## - Fold3.Rep2: alpha=0.3333, lambda=0.003 
## + Fold3.Rep2: alpha=0.6667, lambda=0.003 
## - Fold3.Rep2: alpha=0.6667, lambda=0.003 
## + Fold3.Rep2: alpha=1.0000, lambda=0.003 
## - Fold3.Rep2: alpha=1.0000, lambda=0.003 
## + Fold3.Rep2: alpha=2.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold3.Rep2: alpha=2.0000, lambda=0.003 
## + Fold3.Rep2: alpha=3.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold3.Rep2: alpha=3.0000, lambda=0.003 
## + Fold4.Rep2: alpha=0.3333, lambda=0.003 
## - Fold4.Rep2: alpha=0.3333, lambda=0.003 
## + Fold4.Rep2: alpha=0.6667, lambda=0.003 
## - Fold4.Rep2: alpha=0.6667, lambda=0.003 
## + Fold4.Rep2: alpha=1.0000, lambda=0.003 
## - Fold4.Rep2: alpha=1.0000, lambda=0.003 
## + Fold4.Rep2: alpha=2.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold4.Rep2: alpha=2.0000, lambda=0.003 
## + Fold4.Rep2: alpha=3.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold4.Rep2: alpha=3.0000, lambda=0.003 
## + Fold5.Rep2: alpha=0.3333, lambda=0.003 
## - Fold5.Rep2: alpha=0.3333, lambda=0.003 
## + Fold5.Rep2: alpha=0.6667, lambda=0.003 
## - Fold5.Rep2: alpha=0.6667, lambda=0.003 
## + Fold5.Rep2: alpha=1.0000, lambda=0.003 
## - Fold5.Rep2: alpha=1.0000, lambda=0.003 
## + Fold5.Rep2: alpha=2.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold5.Rep2: alpha=2.0000, lambda=0.003 
## + Fold5.Rep2: alpha=3.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold5.Rep2: alpha=3.0000, lambda=0.003 
## + Fold1.Rep3: alpha=0.3333, lambda=0.003 
## - Fold1.Rep3: alpha=0.3333, lambda=0.003 
## + Fold1.Rep3: alpha=0.6667, lambda=0.003 
## - Fold1.Rep3: alpha=0.6667, lambda=0.003 
## + Fold1.Rep3: alpha=1.0000, lambda=0.003 
## - Fold1.Rep3: alpha=1.0000, lambda=0.003 
## + Fold1.Rep3: alpha=2.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold1.Rep3: alpha=2.0000, lambda=0.003 
## + Fold1.Rep3: alpha=3.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold1.Rep3: alpha=3.0000, lambda=0.003 
## + Fold2.Rep3: alpha=0.3333, lambda=0.003 
## - Fold2.Rep3: alpha=0.3333, lambda=0.003 
## + Fold2.Rep3: alpha=0.6667, lambda=0.003 
## - Fold2.Rep3: alpha=0.6667, lambda=0.003 
## + Fold2.Rep3: alpha=1.0000, lambda=0.003 
## - Fold2.Rep3: alpha=1.0000, lambda=0.003 
## + Fold2.Rep3: alpha=2.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold2.Rep3: alpha=2.0000, lambda=0.003 
## + Fold2.Rep3: alpha=3.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold2.Rep3: alpha=3.0000, lambda=0.003 
## + Fold3.Rep3: alpha=0.3333, lambda=0.003 
## - Fold3.Rep3: alpha=0.3333, lambda=0.003 
## + Fold3.Rep3: alpha=0.6667, lambda=0.003 
## - Fold3.Rep3: alpha=0.6667, lambda=0.003 
## + Fold3.Rep3: alpha=1.0000, lambda=0.003 
## - Fold3.Rep3: alpha=1.0000, lambda=0.003 
## + Fold3.Rep3: alpha=2.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold3.Rep3: alpha=2.0000, lambda=0.003 
## + Fold3.Rep3: alpha=3.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold3.Rep3: alpha=3.0000, lambda=0.003 
## + Fold4.Rep3: alpha=0.3333, lambda=0.003 
## - Fold4.Rep3: alpha=0.3333, lambda=0.003 
## + Fold4.Rep3: alpha=0.6667, lambda=0.003 
## - Fold4.Rep3: alpha=0.6667, lambda=0.003 
## + Fold4.Rep3: alpha=1.0000, lambda=0.003 
## - Fold4.Rep3: alpha=1.0000, lambda=0.003 
## + Fold4.Rep3: alpha=2.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold4.Rep3: alpha=2.0000, lambda=0.003 
## + Fold4.Rep3: alpha=3.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold4.Rep3: alpha=3.0000, lambda=0.003 
## + Fold5.Rep3: alpha=0.3333, lambda=0.003 
## - Fold5.Rep3: alpha=0.3333, lambda=0.003 
## + Fold5.Rep3: alpha=0.6667, lambda=0.003 
## - Fold5.Rep3: alpha=0.6667, lambda=0.003 
## + Fold5.Rep3: alpha=1.0000, lambda=0.003 
## - Fold5.Rep3: alpha=1.0000, lambda=0.003 
## + Fold5.Rep3: alpha=2.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold5.Rep3: alpha=2.0000, lambda=0.003 
## + Fold5.Rep3: alpha=3.0000, lambda=0.003
## Warning in (function (x, y, family = c("gaussian", "binomial", "poisson", :
## alpha >1; set to 1
## - Fold5.Rep3: alpha=3.0000, lambda=0.003 
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 1, lambda = 0.000667 on full training set
## glmnet 
## 
## 82311 samples
##     5 predictor
##     2 classes: 'X0', 'X1' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times) 
## Summary of sample sizes: 65849, 65848, 65849, 65849, 65849, 65850, ... 
## Resampling results across tuning parameters:
## 
##   alpha      lambda        AUC        Precision  Recall     F        
##   0.3333333  0.0003333333  0.8704721  0.8127019  0.8588503  0.8351334
##   0.3333333  0.0006666667  0.8704721  0.8127019  0.8588503  0.8351334
##   0.3333333  0.0010000000  0.8704380  0.8126824  0.8588357  0.8351161
##   0.3333333  0.0020000000  0.8701167  0.8120109  0.8594290  0.8350420
##   0.3333333  0.0030000000  0.8697743  0.8115535  0.8598244  0.8349865
##   0.6666667  0.0003333333  0.8704840  0.8127384  0.8588357  0.8351460
##   0.6666667  0.0006666667  0.8704840  0.8127384  0.8588357  0.8351460
##   0.6666667  0.0010000000  0.8704416  0.8126727  0.8589749  0.8351770
##   0.6666667  0.0020000000  0.8701193  0.8119546  0.8594290  0.8350120
##   0.6666667  0.0030000000  0.8697732  0.8113505  0.8601028  0.8350104
##   1.0000000  0.0003333333  0.8704884  0.8126965  0.8589529  0.8351793
##   1.0000000  0.0006666667  0.8704884  0.8126965  0.8589529  0.8351793
##   1.0000000  0.0010000000  0.8704442  0.8125997  0.8590408  0.8351698
##   1.0000000  0.0020000000  0.8701153  0.8119203  0.8595535  0.8350529
##   1.0000000  0.0030000000  0.8697664  0.8112376  0.8601614  0.8349784
##   2.0000000  0.0003333333  0.8704884  0.8126965  0.8589529  0.8351793
##   2.0000000  0.0006666667  0.8704884  0.8126965  0.8589529  0.8351793
##   2.0000000  0.0010000000  0.8704442  0.8125997  0.8590408  0.8351698
##   2.0000000  0.0020000000  0.8701153  0.8119203  0.8595535  0.8350529
##   2.0000000  0.0030000000  0.8697664  0.8112376  0.8601614  0.8349784
##   3.0000000  0.0003333333  0.8704884  0.8126965  0.8589529  0.8351793
##   3.0000000  0.0006666667  0.8704884  0.8126965  0.8589529  0.8351793
##   3.0000000  0.0010000000  0.8704442  0.8125997  0.8590408  0.8351698
##   3.0000000  0.0020000000  0.8701153  0.8119203  0.8595535  0.8350529
##   3.0000000  0.0030000000  0.8697664  0.8112376  0.8601614  0.8349784
## 
## AUC was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 1 and lambda
##  = 0.0006666667.
(new_lmod$bestTune)
##    alpha       lambda
## 12     1 0.0006666667
print(new_lmod$bestTune)
##    alpha       lambda
## 12     1 0.0006666667
# https://stats.stackexchange.com/questions/67827/caret-and-coefficients-glmnet
y_test_pred <- predict(new_lmod, test_features)

confusionMatrix(y_test_pred, test_y, mode = "prec_recall", positive = "X1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   X0   X1
##         X0 1028  223
##         X1  566  375
##                                           
##                Accuracy : 0.6401          
##                  95% CI : (0.6196, 0.6602)
##     No Information Rate : 0.7272          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2307          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##               Precision : 0.3985          
##                  Recall : 0.6271          
##                      F1 : 0.4873          
##              Prevalence : 0.2728          
##          Detection Rate : 0.1711          
##    Detection Prevalence : 0.4293          
##       Balanced Accuracy : 0.6360          
##                                           
##        'Positive' Class : X1              
## 
(lmod_var_imp <- varImp(lmod))
## glmnet variable importance
## 
##             Overall
## name         100.00
## brand         41.81
## mpn           41.55
## sku           16.32
## description    0.00
plot(lmod_var_imp)

data.frame(Coefficient = coef(new_lmod$finalModel, new_lmod$bestTune$lambda)[, 1]) %>% 
  tibble::rownames_to_column(., "Feature") %>% 
  arrange(-Coefficient) 
##       Feature Coefficient
## 1        name  1.42411835
## 2         mpn  0.79397556
## 3         sku  0.52253055
## 4 (Intercept)  0.04069206
## 5 description -0.34489731
## 6       brand -0.79873749
glmod <- glm(train_y ~ ., family = "binomial", data = train_set)
BIC_glmod <- step(glmod, k = log(nrow(train_set)), trace = 0)

summary(BIC_glmod)
## 
## Call:
## glm(formula = train_y ~ mpn + sku + brand + name + description, 
##     family = "binomial", data = train_set)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.1818  -0.6487  -0.3556   0.4481   3.1050  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.164894   0.009838  -16.76   <2e-16 ***
## mpn          0.820504   0.011818   69.43   <2e-16 ***
## sku          0.540610   0.011521   46.93   <2e-16 ***
## brand       -0.799630   0.013642  -58.62   <2e-16 ***
## name         1.437188   0.014593   98.49   <2e-16 ***
## description -0.354340   0.010059  -35.23   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 113183  on 82310  degrees of freedom
## Residual deviance:  68797  on 82305  degrees of freedom
## AIC: 68809
## 
## Number of Fisher Scoring iterations: 5
BIC_glmod_y_test_pred <- as.factor(ifelse(predict(BIC_glmod, test_features, type = "response") > .5, "X0", "X1"))
summary(BIC_glmod_y_test_pred)
##   X0   X1 
##  848 1344
confusionMatrix(BIC_glmod_y_test_pred, test_y, mode = "prec_recall", positive = "X1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   X0   X1
##         X0  502  346
##         X1 1092  252
##                                           
##                Accuracy : 0.344           
##                  95% CI : (0.3241, 0.3643)
##     No Information Rate : 0.7272          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : -0.1897         
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##               Precision : 0.1875          
##                  Recall : 0.4214          
##                      F1 : 0.2595          
##              Prevalence : 0.2728          
##          Detection Rate : 0.1150          
##    Detection Prevalence : 0.6131          
##       Balanced Accuracy : 0.3682          
##                                           
##        'Positive' Class : X1              
## 
car::vif(BIC_glmod)
##         mpn         sku       brand        name description 
##    1.299525    1.086496    1.760883    1.638336    1.058042