Loading the tidyverse
```{r}
library(tidyverse)
```
Loading the dataset into our environment
```{r}
password <- read.csv(file.choose())
```
I'll create 10 variables that I think will help classify the passwords. Among them: dummy variables flagging the presence of upper-case letters, digits, punctuation and vowels; ratio variables for the share of upper-case letters, lower-case letters, digits, punctuation and vowels in each password; and lastly, a numeric variable with the number of characters per password.
```{r}
password <- password %>%
  mutate(
    # Dummy variables: does the password contain this character class?
    Upper  = ifelse(str_detect(password, "[A-Z]"), 1, 0),
    Digits = ifelse(str_detect(password, "[0-9]"), 1, 0),
    punct  = ifelse(str_detect(password, "[:punct:]"), 1, 0),
    vowels = ifelse(str_detect(password, "[aeiouAEIOU]"), 1, 0),
    # Total number of characters per password
    length = str_count(password),
    # Share of each character class relative to the total length
    ratio_upper_tot  = str_count(password, "[A-Z]") / length,
    ratio_lower_tot  = str_count(password, "[a-z]") / length,
    ratio_digits_tot = str_count(password, "[0-9]") / length,
    ratio_punct_tot  = str_count(password, "[:punct:]") / length,
    ratio_vowels_tot = str_count(password, "[aeiouAEIOU]") / length
  )
```
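A quick spot check of a few of the engineered columns (an added sanity check; any handful of columns works) helps confirm the features behave as intended:
```{r}
# Inspect a few of the new features alongside the raw passwords
password %>%
  select(password, length, Upper, Digits, punct, ratio_vowels_tot) %>%
  head()
```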
Let's count the number of passwords in each strength class.
```{r}
password %>% count(strength, sort = TRUE)
```
Deleting the rows with NAs.
```{r}
password <- password %>% drop_na()
```
Converting all character columns to factors.
```{r}
password <- password %>% mutate(across(where(is_character), as_factor))
```
Deleting the raw password variable so it isn't used as a predictor.
```{r}
password <- password[, -1]
```
I'll start by creating train and test sets, stratified by the outcome. I'll also use k-fold cross-validation with k = 5.
```{r}
library(tidymodels)
library(finetune)

set.seed(2021)
spl <- initial_split(password, strata = strength)
train <- training(spl)
test <- testing(spl)

mset <- metric_set(roc_auc)

grid_control <- control_grid(save_pred = TRUE,
                             save_workflow = TRUE,
                             extract = extract_fit_engine)

set.seed(2021)
train_fold5 <- train %>%
  vfold_cv(5)
```
Creating the model specification for an XGBoost model; then I'll put the preprocessing and model together in a workflow().
```{r}
xg_rec <- recipe(strength ~ ., data = train)

xg_mod <- boost_tree("classification",
                     mtry = tune(),
                     trees = tune(),
                     learn_rate = tune()) %>%
  set_engine("xgboost")

xg_wf <- workflow() %>%
  add_recipe(xg_rec) %>%
  add_model(xg_mod)
```
Tuning the model
```{r}
doParallel::registerDoParallel()

xg_tune <- xg_wf %>%
  tune_grid(train_fold5,
            metrics = mset,
            control = grid_control,
            grid = crossing(mtry = c(7),
                            trees = seq(250, 800, 50),
                            learn_rate = c(.008, .01)))
```
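As an aside, finetune is loaded above but never used. A minimal sketch of a racing alternative, assuming the same folds and grid: tune_race_anova() drops clearly losing parameter combinations after a few resamples, which can shorten a long grid search considerably.
```{r}
# Sketch of a racing alternative to tune_grid() (not what was run here)
xg_race <- xg_wf %>%
  tune_race_anova(train_fold5,
                  metrics = mset,
                  control = control_race(save_pred = TRUE),
                  grid = crossing(mtry = c(7),
                                  trees = seq(250, 800, 50),
                                  learn_rate = c(.008, .01)))
```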
The grid search took two hours to train! Time to look at the results.
```{r}
autoplot(xg_tune)
```
It seems like the models with the 0.010 learning rate perform better.
```{r}
xg_tune %>%
  collect_metrics() %>%
  arrange(desc(mean))  # best ROC AUC first
```
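show_best() is a convenient shortcut for the same question, returning the top configurations for a chosen metric:
```{r}
# Top parameter combinations ranked by ROC AUC
show_best(xg_tune, metric = "roc_auc")
```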
Picking the best model.
```{r}
best_roc_auc <- select_best(xg_tune, metric = "roc_auc")
best_roc_auc
```
Putting the best parameters into the workflow.
```{r}
final_xgb <- finalize_workflow(
  xg_wf,
  best_roc_auc
)

final_xgb
```
== Workflow ==========================================================================================
Preprocessor: Recipe
Model: boost_tree()

-- Preprocessor --------------------------------------------------------------------------------------
0 Recipe Steps

-- Model ---------------------------------------------------------------------------------------------
Boosted Tree Model Specification (classification)

Main Arguments:
  mtry = 7
  trees = 800
  learn_rate = 0.01

Computational engine: xgboost
Looking at the most important variables in our model. This took approximately 30 minutes.
```{r}
library(vip)

final_xgb %>%
  fit(data = train) %>%
  extract_fit_parsnip() %>%
  vip(geom = "point", num_features = 20)
```
Performance of the final model
```{r}
final_res <- last_fit(final_xgb, spl)
collect_metrics(final_res)
```
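Beyond the headline metrics, per-class ROC curves can be plotted from the held-out predictions. A sketch, assuming yardstick's default .pred_<level> naming for the three class probability columns:
```{r}
# One-vs-all ROC curve for each strength class on the test set
final_res %>%
  collect_predictions() %>%
  roc_curve(strength, .pred_moderate, .pred_strong, .pred_weak) %>%
  autoplot()
```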
Looking at the difference in performance across classes with a confusion matrix.
```{r}
xg_conf <- final_res %>%
  unnest(.predictions) %>%
  conf_mat(strength, .pred_class)

xg_conf
```
           Truth
Prediction moderate strong  weak
  moderate   124187      2    12
  strong          7  20782     0
  weak            6      0 22413
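yardstick's summary() method also turns the confusion matrix into a tibble of overall and per-class statistics:
```{r}
# Accuracy, kappa, sensitivity, specificity, etc., derived from xg_conf
summary(xg_conf)
```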
The model seems to be performing quite well. Only a handful of observations are misclassified: 13 moderate, 2 strong and 12 weak passwords. XGBoost is computationally expensive, but the results are really good.