Overview

You’ve been hired by CarVana to look at used cars and identify used cars that are likely going to be bad (isBadBuy) i.e. you don’t want them on the lot for customers to purchase because it’s going to cause you all sorts of headaches. Your job is to identify the crappy cars before the auction buyer purchases them, in a nutshell you are going to build a model to predict how likely a car is going to be a bad buy - i.e the variable isBadBuy.

You will be building a logistic regression model and evaluating it. You can EITHER use glm() function aka the manual approach OR the tidy models framework. I recommend tidy models but if you are pressed for time glm will work too. keep it simple!

Load Libraries

load the typical libraries you are going to be using, if you dump them in a single block it makes it easy to find. Refer back to module 13 examples.

options(scipen=999)

library(tidyverse)  
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.1.1     ✓ dplyr   1.0.5
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidymodels)  
## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.3 ──
## ✓ broom        0.7.6      ✓ rsample      0.0.9 
## ✓ dials        0.0.9      ✓ tune         0.1.5 
## ✓ infer        0.5.4      ✓ workflows    0.2.2 
## ✓ modeldata    0.1.0      ✓ workflowsets 0.0.2 
## ✓ parsnip      0.1.5      ✓ yardstick    0.0.8 
## ✓ recipes      0.1.16
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x scales::discard() masks purrr::discard()
## x dplyr::filter()   masks stats::filter()
## x recipes::fixed()  masks stringr::fixed()
## x dplyr::lag()      masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step()   masks stats::step()
## ● Use tidymodels_prefer() to resolve common conflicts.
library(janitor)    
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(skimr)      
library(vip)        
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi

Stage

read this csv files into R, maybe clean up the column names. - kicked_cars.csv: this is the data set you will build your models with

cars <- read_csv("kicked_cars.csv") %>%
  clean_names()
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   PurchDate = col_character(),
##   Auction = col_character(),
##   Make = col_character(),
##   Model = col_character(),
##   Trim = col_character(),
##   SubModel = col_character(),
##   Color = col_character(),
##   Transmission = col_character(),
##   WheelTypeID = col_character(),
##   WheelType = col_character(),
##   Nationality = col_character(),
##   Size = col_character(),
##   TopThreeAmericanName = col_character(),
##   PRIMEUNIT = col_character(),
##   AUCGUART = col_character(),
##   VNST = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
head(cars,10)
## # A tibble: 10 x 34
##    ref_id is_bad_buy purch_date auction veh_year vehicle_age make   model  trim 
##     <dbl>      <dbl> <chr>      <chr>      <dbl>       <dbl> <chr>  <chr>  <chr>
##  1      1          0 12/7/2009  ADESA       2006           3 MAZDA  MAZDA3 i    
##  2      3          0 12/7/2009  ADESA       2005           4 DODGE  STRAT… SXT  
##  3      6          0 12/7/2009  ADESA       2004           5 MITSU… GALAN… ES   
##  4      7          0 12/7/2009  ADESA       2004           5 KIA    SPECT… EX   
##  5      9          0 12/7/2009  ADESA       2007           2 KIA    SPECT… EX   
##  6     10          0 12/7/2009  ADESA       2007           2 FORD   FIVE … SEL  
##  7     12          0 12/14/2009 ADESA       2001           8 FORD   F150 … XL   
##  8     13          1 12/14/2009 ADESA       2005           4 DODGE  CARAV… SE   
##  9     14          0 12/14/2009 ADESA       2005           4 NISSAN ALTIMA Bas  
## 10     15          0 12/14/2009 ADESA       2006           3 DODGE  CARAV… SXT  
## # … with 25 more variables: sub_model <chr>, color <chr>, transmission <chr>,
## #   wheel_type_id <chr>, wheel_type <chr>, veh_odo <dbl>, nationality <chr>,
## #   size <chr>, top_three_american_name <chr>,
## #   mmr_acquisition_auction_average_price <dbl>,
## #   mmr_acquisition_auction_clean_price <dbl>,
## #   mmr_acquisition_retail_average_price <dbl>,
## #   mmr_acquisiton_retail_clean_price <dbl>,
## #   mmr_current_auction_average_price <dbl>,
## #   mmr_current_auction_clean_price <dbl>,
## #   mmr_current_retail_average_price <dbl>,
## #   mmr_current_retail_clean_price <dbl>, primeunit <chr>, aucguart <chr>,
## #   byrno <dbl>, vnzip1 <dbl>, vnst <chr>, veh_b_cost <dbl>,
## #   is_online_sale <dbl>, warranty_cost <dbl>

Explore

the target variable isBadBuy - hint histogram or geom_col will work nicely. what is the % of cars that are bad buys?

cars %>%
  count(is_bad_buy) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(is_bad_buy, pct, label=sprintf("%1.1f%%",pct*100))) +
  geom_col() +
  geom_text( size = 3, color = "white", position = position_stack(vjust = 0.5)) + 
  labs(title = "% of cars that are bad buys",
       x = "Bad Buy",
       y = "%")

Skim your data

what variables are likely to be problems? - any variable with missing values? - any categorical variable with lots of levels? remember categorical data turns each level into a 0/1 column so if there are 100 n_distinct levels that equates to 100 new columns when we built a model. - any id variables?

cars %>%
  skim_without_charts() %>%
  select(skim_type, skim_variable, n_missing, numeric.mean)
Data summary
Name Piped data
Number of rows 51285
Number of columns 34
_______________________
Column type frequency:
character 16
numeric 18
________________________
Group variables None

Variable type: character

skim_variable n_missing
purch_date 0
auction 0
make 0
model 0
trim 1664
sub_model 0
color 0
transmission 1
wheel_type_id 0
wheel_type 0
nationality 0
size 0
top_three_american_name 0
primeunit 0
aucguart 0
vnst 0

Variable type: numeric

skim_variable n_missing mean
ref_id 0 36498.85
is_bad_buy 0 0.12
veh_year 0 2005.34
vehicle_age 0 4.18
veh_odo 0 71509.42
mmr_acquisition_auction_average_price 585 6207.81
mmr_acquisition_auction_clean_price 496 7453.57
mmr_acquisition_retail_average_price 585 8603.50
mmr_acquisiton_retail_clean_price 585 9974.01
mmr_current_auction_average_price 594 6183.66
mmr_current_auction_clean_price 508 7438.82
mmr_current_retail_average_price 594 8844.72
mmr_current_retail_clean_price 594 10224.88
byrno 0 26337.53
vnzip1 0 58095.45
veh_b_cost 46 6733.14
is_online_sale 0 0.03
warranty_cost 0 1278.84

Explore Categorical Variables

I recommend using cross tabulations, frequency plots or simply “stretched” bar charts. what you are looking to do is understand what categorical “levels” are likely predictive of your target as well as understanding nulls and their potential impact.

Use the following variables to create categorical explorations of variable by target:

  • make
  • color
  • transmission
  • wheel_type
  • size
cars %>%
  mutate(is_bad_buy = as.factor(is_bad_buy)) %>%
  group_by(make, is_bad_buy) %>%
  summarise(n = n()) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(y = pct, x = make, label=sprintf("%1.1f%%",pct*100), fill = is_bad_buy)) +
  geom_col() +
  geom_text( size = 3, position = position_stack(vjust = 0.5)) + 
  labs(title = "Bad Buy by Make") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
## `summarise()` has grouped output by 'make'. You can override using the `.groups` argument.

cars %>%
  mutate(is_bad_buy = as.factor(is_bad_buy)) %>%
  group_by(color, is_bad_buy) %>%
  summarise(n = n()) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(y = pct, x = color, label=sprintf("%1.1f%%",pct*100), fill = is_bad_buy)) +
  geom_col() +
  geom_text( size = 3, position = position_stack(vjust = 0.5)) + 
  labs(title = "Bad Buy by Color") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
## `summarise()` has grouped output by 'color'. You can override using the `.groups` argument.

cars %>%
  mutate(is_bad_buy = as.factor(is_bad_buy)) %>%
  group_by(transmission, is_bad_buy) %>%
  summarise(n = n()) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(y = pct, x = transmission, label=sprintf("%1.1f%%",pct*100), fill = is_bad_buy)) +
  geom_col() +
  geom_text( size = 3, position = position_stack(vjust = 0.5)) + 
  labs(title = "Bad Buy by Transmission") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
## `summarise()` has grouped output by 'transmission'. You can override using the `.groups` argument.

cars %>%
  mutate(is_bad_buy = as.factor(is_bad_buy)) %>%
  group_by(wheel_type, is_bad_buy) %>%
  summarise(n = n()) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(y = pct, x = wheel_type, label=sprintf("%1.1f%%",pct*100), fill = is_bad_buy)) +
  geom_col() +
  geom_text( size = 3, position = position_stack(vjust = 0.5)) + 
  labs(title = "Bad Buy by Wheel Type") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
## `summarise()` has grouped output by 'wheel_type'. You can override using the `.groups` argument.

cars %>%
  mutate(is_bad_buy = as.factor(is_bad_buy)) %>%
  group_by(size, is_bad_buy) %>%
  summarise(n = n()) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(y = pct, x = size, label=sprintf("%1.1f%%",pct*100), fill = is_bad_buy)) +
  geom_col() +
  geom_text( size = 3, position = position_stack(vjust = 0.5)) + 
  labs(title = "Bad Buy by Size") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
## `summarise()` has grouped output by 'size'. You can override using the `.groups` argument.

Numeric Exploration

here we ask questions like what is the relationship of a numeric variable to the target variable. we do this via histograms, box plots and the like.

Use the following variables to create Numeric explorations of variable by target:

  • veh_year
  • vehicle_age
  • veh_odo
  • veh_b_cost
  • warranty_cost
cars %>%
  mutate(is_bad_buy = as.factor(is_bad_buy)) %>%
  ggplot(aes(veh_year, fill = is_bad_buy)) +
  geom_histogram( bins=35, ) + 
  labs(title = "No Fill: Does vehicle year impact a bad buy? ") +
  ylab("count")+ xlab("veh_year")

cars %>%
  mutate(is_bad_buy = as.factor(is_bad_buy)) %>%
  ggplot(aes(vehicle_age, fill=is_bad_buy)) +
  geom_histogram( bins=35, 
                  ) + 
  labs(title = "No Fill: Does vehicle age impact a bad buy? ") +
  ylab("count")+ xlab("Vehicle Age ")

cars %>%
  mutate(is_bad_buy = as.factor(is_bad_buy)) %>%
  ggplot(aes(veh_odo, fill=is_bad_buy)) +
  geom_histogram( bins=35, 
                  ) + 
  labs(title = "No Fill: Does vehicle odo impact a bad buy? ") +
  ylab("count")+ xlab("Vehicle Odo ")

cars %>%
  mutate(is_bad_buy = as.factor(is_bad_buy)) %>%
  ggplot(aes(veh_b_cost, fill=is_bad_buy)) +
  geom_histogram( bins=35,
                  ) + 
  labs(title = "No Fill: Does vehicle cost impact a bad buy? ") +
  ylab("count")+ xlab("Vehicle Cost ")
## Warning: Removed 46 rows containing non-finite values (stat_bin).

cars %>%
  mutate(is_bad_buy = as.factor(is_bad_buy)) %>%
  ggplot(aes(warranty_cost, fill=is_bad_buy)) +
  geom_histogram( bins=35,
                  ) + 
  labs(title = "No Fill: Does warranty cost impact a bad buy? ") +
  ylab("count")+ xlab("Warranty Cost ")

Correlation

We need to understand if there are any pairs of numeric variables that are highly correlated to each other - this can be a problem(don’t worry about it for now), and we need to understand how correlated numeric variables are to the target. Logistic regression has an assumption of a relationship between numeric predictors and the target.IsBadBuy is a numeric variable What numeric variables have the highest correlation to the target

use the following variables which ones are likely not useful? “ref_id”, “is_bad_buy”, “veh_year”, “vehicle_age”, “veh_odo”, “mmr_acquisition_auction_average_price” “mmr_acquisition_auction_clean_price” , “mmr_acquisition_retail_average_price” “mmr_acquisiton_retail_clean_price”, “mmr_current_auction_average_price”,
“mmr_current_auction_clean_price” , “mmr_current_retail_average_price”
“mmr_current_retail_clean_price”
“veh_b_cost”
“is_online_sale”
“warranty_cost”

library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
 cars_cor <- cars %>%
  na.omit() %>%
  select_if(is.numeric) %>%
  cor() %>%
  melt()
 
 cars_cor %>%
  ggplot(aes(x=Var1, y=Var2, fill=value)) +
  geom_tile() +
  scale_fill_gradient2(mid="#FBFEF9",low="#0C6291",high="#A63446") +
  geom_text(aes(label=round(value,2)), color="black") +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(title = "Correlation to bad buy",
       subtitle = "exclude varibles w. abs(correlation) < .2?",
       y = "",
       x = "")

 cars_cor %>%
  filter(Var1 == "is_bad_buy") %>%
  ggplot(aes(x=Var1, y=reorder(Var2,value), fill=value)) +
  geom_tile() +
  scale_fill_gradient2(mid="#FBFEF9",low="#0C6291",high="#A63446") +
  geom_text(aes(label=round(value,2)), color="black") +
  theme(axis.text.x = element_text(angle = 60)) +
  labs(title = "Correlation to is_bad_buy",
       subtitle = "exclude varibles w. abs(correlation) < .2?",
       y = "",
       x = "")

Transformation

convert character values to a factor

how do you want to handle vinzip1, this is the zip code of the car? byrno - this is the buyer number? drop any high carnality character values (think sub model, model, vinzip1, purchase date)

cars_prep <- cars %>%
  mutate(age = if_else(is.na(veh_b_cost), mean(veh_b_cost, na.rm=TRUE), veh_b_cost)) %>%
  mutate_if(is.character, as.factor) %>%
  select( -vnzip1, -byrno, -sub_model, -model, -purch_date)
 
cars_prep %>%
  skim_without_charts()
Data summary
Name Piped data
Number of rows 51285
Number of columns 30
_______________________
Column type frequency:
factor 13
numeric 17
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
auction 0 1.00 FALSE 3 MAN: 28872, OTH: 12262, ADE: 10151
make 0 1.00 FALSE 32 CHE: 12107, DOD: 9034, FOR: 7877, CHR: 6185
trim 1664 0.97 FALSE 130 Bas: 9779, LS: 7134, SE: 6500, SXT: 2700
color 0 1.00 FALSE 17 SIL: 10486, WHI: 8455, BLU: 7177, GRE: 5573
transmission 1 1.00 FALSE 3 AUT: 49465, MAN: 1811, NUL: 8
wheel_type_id 0 1.00 FALSE 5 1: 25359, 2: 23182, NUL: 2215, 3: 526
wheel_type 0 1.00 FALSE 4 All: 25359, Cov: 23182, NUL: 2218, Spe: 526
nationality 0 1.00 FALSE 5 AME: 42760, OTH: 5738, TOP: 2653, OTH: 130
size 0 1.00 FALSE 13 MED: 21639, LAR: 6260, MED: 5704, COM: 5013
top_three_american_name 0 1.00 FALSE 5 GM: 17832, CHR: 16357, FOR: 8571, OTH: 8521
primeunit 0 1.00 FALSE 3 NUL: 48873, NO: 2365, YES: 47
aucguart 0 1.00 FALSE 3 NUL: 48873, GRE: 2357, RED: 55
vnst 0 1.00 FALSE 37 TX: 9491, FL: 7298, CA: 5025, NC: 4992

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
ref_id 0 1.00 36498.85 21078.79 1 18270 36417 54697 73014
is_bad_buy 0 1.00 0.12 0.33 0 0 0 0 1
veh_year 0 1.00 2005.34 1.73 2001 2004 2005 2007 2009
vehicle_age 0 1.00 4.18 1.71 1 3 4 5 9
veh_odo 0 1.00 71509.42 14564.30 5368 61883 73326 82419 115717
mmr_acquisition_auction_average_price 585 0.99 6207.81 2389.63 884 4351 6132 7788 35722
mmr_acquisition_auction_clean_price 496 0.99 7453.57 2638.60 1 5482 7341 9044 36859
mmr_acquisition_retail_average_price 585 0.99 8603.50 3041.34 1455 6369 8491 10679 39080
mmr_acquisiton_retail_clean_price 585 0.99 9974.01 3236.75 1662 7569 9851 12127 41482
mmr_current_auction_average_price 594 0.99 6183.66 2392.27 369 4321 6088 7750 35722
mmr_current_auction_clean_price 508 0.99 7438.82 2643.48 1 5468 7329 9022 36859
mmr_current_retail_average_price 594 0.99 8844.72 3012.27 899 6599 8756 10924 39080
mmr_current_retail_clean_price 594 0.99 10224.88 3211.04 1034 7846 10135 12328 41062
veh_b_cost 46 1.00 6733.14 1764.16 1 5440 6710 7900 38785
is_online_sale 0 1.00 0.03 0.16 0 0 0 0 1
warranty_cost 0 1.00 1278.84 602.17 462 837 1169 1623 7498
age 0 1.00 6733.14 1763.37 1 5440 6715 7900 38785

Partition 70/30

partition cars data into a 70/30, train / test split

set.seed(123)

train_test_split <- initial_split(cars_prep, prop = 0.7)

train <- training(train_test_split)

test <- testing(train_test_split)

sprintf("Train PCT : %1.2f%%", nrow(train)/ nrow(cars_prep) * 100)
## [1] "Train PCT : 70.00%"
sprintf("Test  PCT : %1.2f%%", nrow(test)/ nrow(cars_prep) * 100)
## [1] "Test  PCT : 30.00%"

Logistic Model

glm() model approach

create a logistic model using glm pick 5 variables you believe will be most useful to predict is bad buy.

car_glm_model <- glm(is_bad_buy ~ veh_odo + vehicle_age, data = train)

– eyeball model fit, don’t worry about it– glance(car_glm_model)

– look at model coefficients – tidy(car_glm_model) %>% mutate_at(c(“estimate”, “std.error”, “statistic”, “p.value”),round, 4)

What does this tell you about your model?

car_glm_model <- glm(is_bad_buy ~ veh_b_cost + warranty_cost + vehicle_age + veh_year + veh_odo, data = train)

glance(car_glm_model)
## # A tibble: 1 x 8
##   null.deviance df.null  logLik    AIC    BIC deviance df.residual  nobs
##           <dbl>   <int>   <dbl>  <dbl>  <dbl>    <dbl>       <int> <int>
## 1         3874.   35867 -10359. 20732. 20791.    3742.       35862 35868
tidy(car_glm_model) %>%
    mutate_at(c("estimate", "std.error", "statistic", "p.value"),round, 4) 
## # A tibble: 6 x 5
##   term          estimate std.error statistic p.value
##   <chr>            <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)   -25.5       7.10      -3.60   0.0003
## 2 veh_b_cost      0         0        -11.2    0     
## 3 warranty_cost   0         0          0.692  0.489 
## 4 vehicle_age     0.0392    0.0035    11.1    0     
## 5 veh_year        0.0127    0.0035     3.60   0.0003
## 6 veh_odo         0         0          5.81   0
#is_bad_buy ~ veh_b_cost + warranty_cost + vehicle_age + byrno + ref_id

glm() evaluate model

here you will need to

  1. make predictions on the test data set ex: test$.pred <- predict(car_glm_model, test, type = “response”)
  2. assign a predicted class ex: mutate(.pred_class = as.factor(if_else(.pred >= 0.5, “1”,“0”)))
 test$.pred <- predict(car_glm_model, test, type = "response")

train$.pred <- predict(car_glm_model, train, type = "response")

test <- test %>% 
  mutate(is_bad_buy = as.factor(is_bad_buy)) %>%
  mutate(.pred_class = as.factor(if_else(.pred >= 0.2, "1","0"))) 

train <- train %>% 
  mutate(is_bad_buy = as.factor(is_bad_buy)) %>%
  mutate(.pred_class = as.factor(if_else(.pred >= 0.2, "1","0")))

head(test,10)
## # A tibble: 10 x 32
##    ref_id is_bad_buy auction veh_year vehicle_age make  trim  color transmission
##     <dbl> <fct>      <fct>      <dbl>       <dbl> <fct> <fct> <fct> <fct>       
##  1      3 0          ADESA       2005           4 DODGE SXT   MARO… AUTO        
##  2      9 0          ADESA       2007           2 KIA   EX    BLACK AUTO        
##  3     10 0          ADESA       2007           2 FORD  SEL   RED   AUTO        
##  4     12 0          ADESA       2001           8 FORD  XL    WHITE MANUAL      
##  5     13 1          ADESA       2005           4 DODGE SE    RED   AUTO        
##  6     15 0          ADESA       2006           3 DODGE SXT   GOLD  AUTO        
##  7     25 0          ADESA       2007           2 CHEV… LS    GREY  AUTO        
##  8     27 0          ADESA       2002           7 GMC   SLE   GOLD  AUTO        
##  9     28 0          ADESA       2004           5 DODGE ST    WHITE AUTO        
## 10     29 0          ADESA       2004           5 DODGE SLT   RED   AUTO        
## # … with 23 more variables: wheel_type_id <fct>, wheel_type <fct>,
## #   veh_odo <dbl>, nationality <fct>, size <fct>,
## #   top_three_american_name <fct>, mmr_acquisition_auction_average_price <dbl>,
## #   mmr_acquisition_auction_clean_price <dbl>,
## #   mmr_acquisition_retail_average_price <dbl>,
## #   mmr_acquisiton_retail_clean_price <dbl>,
## #   mmr_current_auction_average_price <dbl>,
## #   mmr_current_auction_clean_price <dbl>,
## #   mmr_current_retail_average_price <dbl>,
## #   mmr_current_retail_clean_price <dbl>, primeunit <fct>, aucguart <fct>,
## #   vnst <fct>, veh_b_cost <dbl>, is_online_sale <dbl>, warranty_cost <dbl>,
## #   age <dbl>, .pred <dbl>, .pred_class <fct>

Evaluate glm()

  1. calculate accuracy
  2. calculate precision
  3. make a confusion matrix
  4. make a score distribution
  5. make a variable importance plot

hint: look at 13_logistic_reg_pt1,rmd

accuracy(test, truth=is_bad_buy, estimate = .pred_class, na_rm = TRUE)
## # A tibble: 1 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.815
precision(test, truth=is_bad_buy, estimate = .pred_class, na_rm = TRUE)
## # A tibble: 1 x 3
##   .metric   .estimator .estimate
##   <chr>     <chr>          <dbl>
## 1 precision binary         0.892
test %>%
  conf_mat(is_bad_buy, .pred_class) %>%
  autoplot(type = "heatmap") +
  coord_flip() +
  labs(title="Confusion Matrix") 

test %>%
  ggplot(aes(x=.pred, fill=is_bad_buy)) +
  geom_histogram(bins=100) +
  geom_vline(xintercept = 0.2) +
  annotate(geom="text", x=0.6, y=1000, label="True Positive & False Positives",
              color="red") +
  annotate(geom="text", x=0.05, y=1000, label="True Negatives & False Negatives",
              color="blue") + 
  labs(title="Test score distribution",
       x = "predicted probability",
       y = "count")
## Warning: Removed 14 rows containing non-finite values (stat_bin).

vip(car_glm_model, 20) +
  labs(title="Variable Importance")

tidymodel model approach

Instead of the old way use tidymdoels to define a recipe

Define a Recipe

here simply find 5 - 10 variables (or all of them) that you want to build a model with here is an example

car_recipe <- recipe(is_bad_buy ~ make + wheel_type + vnst + vehicle_age + veh_odo + veh_b_cost + is_online_sale, data=cars) %>% step_modeimpute(all_nominal(), -all_numeric()) %>% step_meanimpute(all_numeric(), -all_nominal()) %>% step_nzv( unique_cut = 3 ,all_nominal()) %>% step_novel(all_nominal(), -all_outcomes()) %>% step_dummy(all_nominal(), -all_outcomes()) %>% prep()

car_recipe <-recipe(is_bad_buy ~ veh_b_cost + warranty_cost + vehicle_age + veh_year + veh_odo
                       , data=cars) %>%
  step_impute_mode(all_nominal(), -all_numeric()) %>%
  step_impute_mean(all_numeric(), -all_nominal()) %>%
  step_nzv( unique_cut = 3 ,all_nominal()) %>%
  step_novel(all_nominal(), -all_outcomes()) %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%
  prep()

Bake Recipe

use the bake() function to apply your recipe to training and test

bake_train <- bake(car_recipe, train)

print out the before and after column count!

bake_train <- bake(car_recipe, train)

bake_test <- bake(car_recipe, test)

sprintf("Column count before baking : %2d ", ncol(train))
## [1] "Column count before baking : 32 "
sprintf("Column count before baking : %2d ", ncol(test))
## [1] "Column count before baking : 32 "
sprintf("Column count after  baking : %2d", ncol(bake_train))
## [1] "Column count after  baking :  6"
sprintf("Column count after  baking : %2d", ncol(bake_test))
## [1] "Column count after  baking :  6"

Create and Fit Model

your model should look something like this:

car_glm_model <- logistic_reg() %>% set_mode(“classification”) %>% set_engine(“glm”) %>% fit(is_bad_buy ~ ., data= bake_train)

for fun you can create a random forest like this:

car_rf_model <- rand_forest(trees=15) %>% set_mode(“classification”) %>% set_engine(“ranger”, importance = “permutation”) %>% fit(is_bad_buy ~ ., data= bake_train)

car_glm_model <- logistic_reg() %>%
  set_mode("classification") %>%
  set_engine("glm") %>%
  fit(is_bad_buy ~ veh_b_cost + warranty_cost + vehicle_age + veh_year + veh_odo , data= bake_test)

car_glm_model <- logistic_reg() %>%
  set_mode("classification") %>%
  set_engine("glm") %>%
  fit(is_bad_buy ~ veh_b_cost + warranty_cost + vehicle_age + veh_year + veh_odo , data= bake_train)

Evaluate Model

Make Predictions

something like the following for both training and test

train_scored <- predict(car_glm_model, bake_train, type = “prob”) %>% bind_cols(predict(car_glm_model, bake_train, type = “class”)) %>% bind_cols(train)

train_scored <- predict(car_glm_model, bake_train, type = "prob") %>%
  bind_cols(predict(car_glm_model, bake_train, type = "class"))  %>%
  bind_cols(train)
## New names:
## * .pred_class -> .pred_class...3
## * .pred_class -> .pred_class...35
train_scored_test <- predict(car_glm_model, bake_test, type = "prob") %>%
  bind_cols(predict(car_glm_model, bake_test, type = "class"))  %>%
  bind_cols(test)
## New names:
## * .pred_class -> .pred_class...3
## * .pred_class -> .pred_class...35
train_scored %>%
   mutate(is_bad_buy = as.factor(is_bad_buy)) 
## # A tibble: 35,900 x 35
##    .pred_0 .pred_1 .pred_class...3 ref_id is_bad_buy auction veh_year
##      <dbl>   <dbl> <fct>            <dbl> <fct>      <fct>      <dbl>
##  1   0.908  0.0918 0                    1 0          ADESA       2006
##  2   0.849  0.151  0                    6 0          ADESA       2004
##  3   0.848  0.152  0                    7 0          ADESA       2004
##  4   0.907  0.0925 0                   14 0          ADESA       2005
##  5   0.891  0.109  0                   17 0          ADESA       2005
##  6   0.814  0.186  0                   18 0          ADESA       2003
##  7   0.892  0.108  0                   19 0          ADESA       2005
##  8   0.805  0.195  0                   22 0          ADESA       2002
##  9   0.811  0.189  0                   23 0          ADESA       2004
## 10   0.879  0.121  0                   30 0          ADESA       2005
## # … with 35,890 more rows, and 28 more variables: vehicle_age <dbl>,
## #   make <fct>, trim <fct>, color <fct>, transmission <fct>,
## #   wheel_type_id <fct>, wheel_type <fct>, veh_odo <dbl>, nationality <fct>,
## #   size <fct>, top_three_american_name <fct>,
## #   mmr_acquisition_auction_average_price <dbl>,
## #   mmr_acquisition_auction_clean_price <dbl>,
## #   mmr_acquisition_retail_average_price <dbl>,
## #   mmr_acquisiton_retail_clean_price <dbl>,
## #   mmr_current_auction_average_price <dbl>,
## #   mmr_current_auction_clean_price <dbl>,
## #   mmr_current_retail_average_price <dbl>,
## #   mmr_current_retail_clean_price <dbl>, primeunit <fct>, aucguart <fct>,
## #   vnst <fct>, veh_b_cost <dbl>, is_online_sale <dbl>, warranty_cost <dbl>,
## #   age <dbl>, .pred <dbl>, .pred_class...35 <fct>
train_scored_test %>%
   mutate(is_bad_buy = as.factor(is_bad_buy)) 
## # A tibble: 15,385 x 35
##    .pred_0 .pred_1 .pred_class...3 ref_id is_bad_buy auction veh_year
##      <dbl>   <dbl> <fct>            <dbl> <fct>      <fct>      <dbl>
##  1   0.876  0.124  0                    3 0          ADESA       2005
##  2   0.937  0.0629 0                    9 0          ADESA       2007
##  3   0.933  0.0672 0                   10 0          ADESA       2007
##  4   0.739  0.261  0                   12 0          ADESA       2001
##  5   0.877  0.123  0                   13 1          ADESA       2005
##  6   0.917  0.0833 0                   15 0          ADESA       2006
##  7   0.938  0.0620 0                   25 0          ADESA       2007
##  8   0.811  0.189  0                   27 0          ADESA       2002
##  9   0.892  0.108  0                   28 0          ADESA       2004
## 10   0.897  0.103  0                   29 0          ADESA       2004
## # … with 15,375 more rows, and 28 more variables: vehicle_age <dbl>,
## #   make <fct>, trim <fct>, color <fct>, transmission <fct>,
## #   wheel_type_id <fct>, wheel_type <fct>, veh_odo <dbl>, nationality <fct>,
## #   size <fct>, top_three_american_name <fct>,
## #   mmr_acquisition_auction_average_price <dbl>,
## #   mmr_acquisition_auction_clean_price <dbl>,
## #   mmr_acquisition_retail_average_price <dbl>,
## #   mmr_acquisiton_retail_clean_price <dbl>,
## #   mmr_current_auction_average_price <dbl>,
## #   mmr_current_auction_clean_price <dbl>,
## #   mmr_current_retail_average_price <dbl>,
## #   mmr_current_retail_clean_price <dbl>, primeunit <fct>, aucguart <fct>,
## #   vnst <fct>, veh_b_cost <dbl>, is_online_sale <dbl>, warranty_cost <dbl>,
## #   age <dbl>, .pred <dbl>, .pred_class...35 <fct>
skim(train_scored)
Data summary
Name train_scored
Number of rows 35900
Number of columns 35
_______________________
Column type frequency:
factor 16
numeric 19
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
.pred_class…3 0 1.00 FALSE 1 0: 35900, 1: 0
is_bad_buy 0 1.00 FALSE 2 0: 31472, 1: 4428
auction 0 1.00 FALSE 3 MAN: 20133, OTH: 8675, ADE: 7092
make 0 1.00 FALSE 30 CHE: 8465, DOD: 6331, FOR: 5510, CHR: 4359
trim 1129 0.97 FALSE 125 Bas: 6895, LS: 5016, SE: 4543, SXT: 1919
color 0 1.00 FALSE 17 SIL: 7381, WHI: 5948, BLU: 5057, GRE: 3884
transmission 1 1.00 FALSE 3 AUT: 34612, MAN: 1282, NUL: 5
wheel_type_id 0 1.00 FALSE 4 1: 17733, 2: 16237, NUL: 1558, 3: 372
wheel_type 0 1.00 FALSE 4 All: 17733, Cov: 16237, NUL: 1558, Spe: 372
nationality 0 1.00 FALSE 5 AME: 29952, OTH: 3979, TOP: 1875, OTH: 90
size 0 1.00 FALSE 13 MED: 15135, LAR: 4359, MED: 3973, COM: 3502
top_three_american_name 0 1.00 FALSE 5 GM: 12447, CHR: 11497, FOR: 6008, OTH: 5944
primeunit 0 1.00 FALSE 3 NUL: 34193, NO: 1671, YES: 36
aucguart 0 1.00 FALSE 3 NUL: 34193, GRE: 1675, RED: 32
vnst 0 1.00 FALSE 37 TX: 6616, FL: 5056, NC: 3525, CA: 3511
.pred_class…35 32 1.00 FALSE 2 0: 31735, 1: 4133

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
.pred_0 0 1.00 0.88 0.06 0.53 0.85 0.89 0.92 1.00 ▁▁▂▇▇
.pred_1 0 1.00 0.12 0.06 0.00 0.08 0.11 0.15 0.47 ▇▇▂▁▁
ref_id 0 1.00 36486.28 21089.72 1.00 18287.75 36333.50 54713.75 73013.00 ▇▇▇▇▇
veh_year 0 1.00 2005.35 1.73 2001.00 2004.00 2005.00 2007.00 2009.00 ▂▅▅▇▂
vehicle_age 0 1.00 4.17 1.72 1.00 3.00 4.00 5.00 9.00 ▃▇▃▃▁
veh_odo 0 1.00 71515.13 14531.70 5368.00 61885.00 73343.50 82438.25 115026.00 ▁▁▆▇▁
mmr_acquisition_auction_average_price 420 0.99 6222.02 2391.99 884.00 4368.00 6155.00 7790.00 35722.00 ▇▂▁▁▁
mmr_acquisition_auction_clean_price 354 0.99 7467.86 2643.17 1.00 5499.00 7357.50 9045.00 36859.00 ▇▇▁▁▁
mmr_acquisition_retail_average_price 420 0.99 8618.35 3036.05 1455.00 6387.00 8505.00 10681.25 39080.00 ▇▆▁▁▁
mmr_acquisiton_retail_clean_price 420 0.99 9989.35 3231.50 1662.00 7597.00 9855.00 12135.00 40308.00 ▆▇▁▁▁
mmr_current_auction_average_price 409 0.99 6196.11 2395.74 430.00 4334.00 6105.00 7763.00 35722.00 ▇▃▁▁▁
mmr_current_auction_clean_price 344 0.99 7451.26 2649.91 1.00 5480.00 7339.00 9039.00 36859.00 ▇▇▁▁▁
mmr_current_retail_average_price 409 0.99 8861.33 3008.36 964.00 6618.00 8787.00 10937.50 39080.00 ▇▇▁▁▁
mmr_current_retail_clean_price 409 0.99 10242.70 3208.07 1203.00 7867.00 10153.00 12341.00 40308.00 ▅▇▁▁▁
veh_b_cost 32 1.00 6741.34 1762.49 1620.00 5450.00 6730.00 7900.00 38785.00 ▇▁▁▁▁
is_online_sale 0 1.00 0.03 0.16 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
warranty_cost 0 1.00 1276.03 594.96 462.00 837.00 1169.00 1623.00 7198.00 ▇▂▁▁▁
age 0 1.00 6741.33 1761.71 1620.00 5455.00 6730.00 7900.00 38785.00 ▇▁▁▁▁
.pred 32 1.00 0.12 0.06 -0.36 0.08 0.12 0.16 0.33 ▁▁▂▇▂
skim(train_scored_test)
Data summary
Name train_scored_test
Number of rows 15385
Number of columns 35
_______________________
Column type frequency:
factor 16
numeric 19
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
.pred_class…3 0 1.00 FALSE 1 0: 15385, 1: 0
is_bad_buy 0 1.00 FALSE 2 0: 13475, 1: 1910
auction 0 1.00 FALSE 3 MAN: 8739, OTH: 3587, ADE: 3059
make 0 1.00 FALSE 32 CHE: 3642, DOD: 2703, FOR: 2367, CHR: 1826
trim 535 0.97 FALSE 113 Bas: 2884, LS: 2118, SE: 1957, SXT: 781
color 0 1.00 FALSE 17 SIL: 3105, WHI: 2507, BLU: 2120, GRE: 1689
transmission 0 1.00 FALSE 3 AUT: 14853, MAN: 529, NUL: 3
wheel_type_id 0 1.00 FALSE 5 1: 7626, 2: 6945, NUL: 657, 3: 154
wheel_type 0 1.00 FALSE 4 All: 7626, Cov: 6945, NUL: 660, Spe: 154
nationality 0 1.00 FALSE 4 AME: 12808, OTH: 1759, TOP: 778, OTH: 40
size 0 1.00 FALSE 12 MED: 6504, LAR: 1901, MED: 1731, COM: 1511
top_three_american_name 0 1.00 FALSE 4 GM: 5385, CHR: 4860, OTH: 2577, FOR: 2563
primeunit 0 1.00 FALSE 3 NUL: 14680, NO: 694, YES: 11
aucguart 0 1.00 FALSE 3 NUL: 14680, GRE: 682, RED: 23
vnst 0 1.00 FALSE 37 TX: 2875, FL: 2242, CA: 1514, NC: 1467
.pred_class…35 14 1.00 FALSE 2 0: 13565, 1: 1806

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
.pred_0 0 1.00 0.88 0.06 0.51 0.85 0.89 0.92 1.00 ▁▁▂▇▇
.pred_1 0 1.00 0.12 0.06 0.00 0.08 0.11 0.15 0.49 ▇▇▂▁▁
ref_id 0 1.00 36528.19 21053.91 3.00 18230.00 36601.00 54660.00 73014.00 ▇▇▇▇▇
veh_year 0 1.00 2005.33 1.74 2001.00 2004.00 2005.00 2007.00 2009.00 ▂▅▅▇▂
vehicle_age 0 1.00 4.19 1.71 1.00 3.00 4.00 5.00 9.00 ▃▇▃▃▁
veh_odo 0 1.00 71496.10 14640.56 8706.00 61875.00 73274.00 82395.00 115717.00 ▁▂▆▇▁
mmr_acquisition_auction_average_price 165 0.99 6174.68 2383.85 966.00 4324.00 6083.00 7776.00 32063.00 ▇▃▁▁▁
mmr_acquisition_auction_clean_price 142 0.99 7420.25 2627.68 1.00 5434.00 7306.00 9032.00 35108.00 ▇▇▁▁▁
mmr_acquisition_retail_average_price 165 0.99 8568.88 3053.43 1543.00 6311.00 8472.50 10675.25 37885.00 ▇▇▁▁▁
mmr_acquisiton_retail_clean_price 165 0.99 9938.25 3248.78 2274.00 7515.75 9848.00 12111.00 41482.00 ▇▇▁▁▁
mmr_current_auction_average_price 185 0.99 6154.59 2383.97 369.00 4293.50 6056.00 7728.00 31127.00 ▇▆▁▁▁
mmr_current_auction_clean_price 164 0.99 7409.77 2628.25 1.00 5433.00 7308.00 8995.00 34798.00 ▇▇▁▁▁
mmr_current_retail_average_price 185 0.99 8805.94 3021.12 899.00 6556.75 8700.50 10905.00 38151.00 ▇▇▁▁▁
mmr_current_retail_clean_price 185 0.99 10183.27 3217.67 1034.00 7798.00 10090.00 12308.00 41062.00 ▅▇▁▁▁
veh_b_cost 14 1.00 6714.00 1767.95 1.00 5410.00 6690.00 7900.00 36485.00 ▇▅▁▁▁
is_online_sale 0 1.00 0.03 0.16 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
warranty_cost 0 1.00 1285.38 618.64 462.00 853.00 1169.00 1623.00 7498.00 ▇▂▁▁▁
age 0 1.00 6714.02 1767.15 1.00 5410.00 6695.00 7900.00 36485.00 ▇▅▁▁▁
.pred 14 1.00 0.12 0.06 -0.32 0.08 0.12 0.16 0.35 ▁▁▃▇▁

Evaluate

for both train and test

  1. calculate accuracy()
  2. calculate precision()
  3. make a confusion matrix
  4. make a score distribution
  5. make a variable importance plot

What does precision mean? can you compare it to the do assign everyone to the minority case?

accuracy(train, truth=is_bad_buy, estimate = .pred_class, na_rm = TRUE)
## # A tibble: 1 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.821
accuracy(test, truth=is_bad_buy, estimate = .pred_class, na_rm = TRUE)
## # A tibble: 1 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.815
precision(train, truth=is_bad_buy, estimate = .pred_class, na_rm = TRUE)
## # A tibble: 1 x 3
##   .metric   .estimator .estimate
##   <chr>     <chr>          <dbl>
## 1 precision binary         0.894
precision(test, truth=is_bad_buy, estimate = .pred_class, na_rm = TRUE)
## # A tibble: 1 x 3
##   .metric   .estimator .estimate
##   <chr>     <chr>          <dbl>
## 1 precision binary         0.892
train %>%
  conf_mat(is_bad_buy, .pred_class) %>%
  autoplot(type = "heatmap") +
  coord_flip() +
  labs(title="Train Confusion Matrix") 

test %>%
  conf_mat(is_bad_buy, .pred_class) %>%
  autoplot(type = "heatmap") +
  coord_flip() +
  labs(title="Test Confusion Matrix") 

train%>%
  ggplot(aes(x=.pred, fill=is_bad_buy)) +
  geom_histogram(bins=100) +
  geom_vline(xintercept = 0.2) +
  annotate(geom="text", x=0.6, y=1000, label="True Positive & False Positives",
              color="red") +
  annotate(geom="text", x=0.05, y=1000, label="True Negatives & False Negatives",
              color="blue") + 
  labs(title="Train score distribution",
       x = "predicted probability",
       y = "count")
## Warning: Removed 32 rows containing non-finite values (stat_bin).

test %>%
  ggplot(aes(x=.pred, fill=is_bad_buy)) +
  geom_histogram(bins=100) +
  geom_vline(xintercept = 0.2) +
  annotate(geom="text", x=0.6, y=1000, label="True Positive & False Positives",
              color="red") +
  annotate(geom="text", x=0.05, y=1000, label="True Negatives & False Negatives",
              color="blue") + 
  labs(title="Test score distribution",
       x = "predicted probability",
       y = "count")
## Warning: Removed 14 rows containing non-finite values (stat_bin).

vip(car_glm_model, 20) +
  labs(title="Variable Importance")