Overview

You’ve been hired by CarVana to look at used cars and identify used cars that are likely going to be bad (isBadBuy) i.e. you don’t want them on the lot for customers to purchase because it’s going to cause you all sorts of headaches. Your job is to identify the crappy cars before the auction buyer purchases them, in a nutshell you are going to build a model to predict how likely a car is going to be a bad buy - i.e the variable isBadBuy.

You will be building a logistic regression model and evaluating it. You can EITHER use glm() function aka the manual approach OR the tidy models framework. I recommend tidy models but if you are pressed for time glm will work too. keep it simple!

Load Libraries

load the typical libraries you are going to be using, if you dump them in a single block it makes it easy to find. Refer back to module 13 examples.

options(scipen=999)

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.1.1     ✓ dplyr   1.0.5
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.3 ──

## ✓ broom        0.7.6      ✓ rsample      0.0.9 
## ✓ dials        0.0.9      ✓ tune         0.1.5 
## ✓ infer        0.5.4      ✓ workflows    0.2.2 
## ✓ modeldata    0.1.0      ✓ workflowsets 0.0.2 
## ✓ parsnip      0.1.5      ✓ yardstick    0.0.8 
## ✓ recipes      0.1.16

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x scales::discard() masks purrr::discard()
## x dplyr::filter()   masks stats::filter()
## x recipes::fixed()  masks stringr::fixed()
## x dplyr::lag()      masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step()   masks stats::step()
## ● Use tidymodels_prefer() to resolve common conflicts.

library(janitor)

## 
## Attaching package: 'janitor'

## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(skimr)      
library(vip)

## 
## Attaching package: 'vip'

## The following object is masked from 'package:utils':
## 
##     vi

Stage

read this csv files into R, maybe clean up the column names. - kicked_cars.csv: this is the data set you will build your models with

cars <- read_csv("kicked_cars.csv") %>%
  clean_names()

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   PurchDate = col_character(),
##   Auction = col_character(),
##   Make = col_character(),
##   Model = col_character(),
##   Trim = col_character(),
##   SubModel = col_character(),
##   Color = col_character(),
##   Transmission = col_character(),
##   WheelTypeID = col_character(),
##   WheelType = col_character(),
##   Nationality = col_character(),
##   Size = col_character(),
##   TopThreeAmericanName = col_character(),
##   PRIMEUNIT = col_character(),
##   AUCGUART = col_character(),
##   VNST = col_character()
## )
## ℹ Use `spec()` for the full column specifications.

head(cars,10)

## # A tibble: 10 x 34
##    ref_id is_bad_buy purch_date auction veh_year vehicle_age make   model  trim 
##     <dbl>      <dbl> <chr>      <chr>      <dbl>       <dbl> <chr>  <chr>  <chr>
##  1      1          0 12/7/2009  ADESA       2006           3 MAZDA  MAZDA3 i    
##  2      3          0 12/7/2009  ADESA       2005           4 DODGE  STRAT… SXT  
##  3      6          0 12/7/2009  ADESA       2004           5 MITSU… GALAN… ES   
##  4      7          0 12/7/2009  ADESA       2004           5 KIA    SPECT… EX   
##  5      9          0 12/7/2009  ADESA       2007           2 KIA    SPECT… EX   
##  6     10          0 12/7/2009  ADESA       2007           2 FORD   FIVE … SEL  
##  7     12          0 12/14/2009 ADESA       2001           8 FORD   F150 … XL   
##  8     13          1 12/14/2009 ADESA       2005           4 DODGE  CARAV… SE   
##  9     14          0 12/14/2009 ADESA       2005           4 NISSAN ALTIMA Bas  
## 10     15          0 12/14/2009 ADESA       2006           3 DODGE  CARAV… SXT  
## # … with 25 more variables: sub_model <chr>, color <chr>, transmission <chr>,
## #   wheel_type_id <chr>, wheel_type <chr>, veh_odo <dbl>, nationality <chr>,
## #   size <chr>, top_three_american_name <chr>,
## #   mmr_acquisition_auction_average_price <dbl>,
## #   mmr_acquisition_auction_clean_price <dbl>,
## #   mmr_acquisition_retail_average_price <dbl>,
## #   mmr_acquisiton_retail_clean_price <dbl>,
## #   mmr_current_auction_average_price <dbl>,
## #   mmr_current_auction_clean_price <dbl>,
## #   mmr_current_retail_average_price <dbl>,
## #   mmr_current_retail_clean_price <dbl>, primeunit <chr>, aucguart <chr>,
## #   byrno <dbl>, vnzip1 <dbl>, vnst <chr>, veh_b_cost <dbl>,
## #   is_online_sale <dbl>, warranty_cost <dbl>

Explore

the target variable isBadBuy - hint histogram or geom_col will work nicely. what is the % of cars that are bad buys?

cars %>%
  count(is_bad_buy) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(is_bad_buy, pct, label=sprintf("%1.1f%%",pct*100))) +
  geom_col() +
  geom_text( size = 3, color = "white", position = position_stack(vjust = 0.5)) + 
  labs(title = "% of cars that are bad buys",
       x = "Bad Buy",
       y = "%")

Skim your data

what variables are likely to be problems? - any variable with missing values? - any categorical variable with lots of levels? remember categorical data turns each level into a 0/1 column so if there are 100 n_distinct levels that equates to 100 new columns when we built a model. - any id variables?

cars %>%
  skim_without_charts() %>%
  select(skim_type, skim_variable, n_missing, numeric.mean)

Data summary
Name	Piped data
Number of rows	51285
Number of columns	34
_______________________
Column type frequency:
character	16
numeric	18
________________________
Group variables	None

Variable type: character

skim_variable	n_missing
purch_date	0
auction	0
make	0
model	0
trim	1664
sub_model	0
color	0
transmission	1
wheel_type_id	0
wheel_type	0
nationality	0
size	0
top_three_american_name	0
primeunit	0
aucguart	0
vnst	0

Variable type: numeric

skim_variable	n_missing	mean
ref_id	0	36498.85
is_bad_buy	0	0.12
veh_year	0	2005.34
vehicle_age	0	4.18
veh_odo	0	71509.42
mmr_acquisition_auction_average_price	585	6207.81
mmr_acquisition_auction_clean_price	496	7453.57
mmr_acquisition_retail_average_price	585	8603.50
mmr_acquisiton_retail_clean_price	585	9974.01
mmr_current_auction_average_price	594	6183.66
mmr_current_auction_clean_price	508	7438.82
mmr_current_retail_average_price	594	8844.72
mmr_current_retail_clean_price	594	10224.88
byrno	0	26337.53
vnzip1	0	58095.45
veh_b_cost	46	6733.14
is_online_sale	0	0.03
warranty_cost	0	1278.84

Explore Categorical Variables

I recommend using cross tabulations, frequency plots or simply “stretched” bar charts. what you are looking to do is understand what categorical “levels” are likely predictive of your target as well as understanding nulls and their potential impact.

Use the following variables to create categorical explorations of variable by target:

make
color
transmission
wheel_type
size

cars %>%
  mutate(is_bad_buy = as.factor(is_bad_buy)) %>%
  group_by(make, is_bad_buy) %>%
  summarise(n = n()) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(y = pct, x = make, label=sprintf("%1.1f%%",pct*100), fill = is_bad_buy)) +
  geom_col() +
  geom_text( size = 3, position = position_stack(vjust = 0.5)) + 
  labs(title = "Bad Buy by Make") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

## `summarise()` has grouped output by 'make'. You can override using the `.groups` argument.

cars %>%
  mutate(is_bad_buy = as.factor(is_bad_buy)) %>%
  group_by(color, is_bad_buy) %>%
  summarise(n = n()) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(y = pct, x = color, label=sprintf("%1.1f%%",pct*100), fill = is_bad_buy)) +
  geom_col() +
  geom_text( size = 3, position = position_stack(vjust = 0.5)) + 
  labs(title = "Bad Buy by Color") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

## `summarise()` has grouped output by 'color'. You can override using the `.groups` argument.

cars %>%
  mutate(is_bad_buy = as.factor(is_bad_buy)) %>%
  group_by(transmission, is_bad_buy) %>%
  summarise(n = n()) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(y = pct, x = transmission, label=sprintf("%1.1f%%",pct*100), fill = is_bad_buy)) +
  geom_col() +
  geom_text( size = 3, position = position_stack(vjust = 0.5)) + 
  labs(title = "Bad Buy by Transmission") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

## `summarise()` has grouped output by 'transmission'. You can override using the `.groups` argument.

cars %>%
  mutate(is_bad_buy = as.factor(is_bad_buy)) %>%
  group_by(wheel_type, is_bad_buy) %>%
  summarise(n = n()) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(y = pct, x = wheel_type, label=sprintf("%1.1f%%",pct*100), fill = is_bad_buy)) +
  geom_col() +
  geom_text( size = 3, position = position_stack(vjust = 0.5)) + 
  labs(title = "Bad Buy by Wheel Type") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

## `summarise()` has grouped output by 'wheel_type'. You can override using the `.groups` argument.

cars %>%
  mutate(is_bad_buy = as.factor(is_bad_buy)) %>%
  group_by(size, is_bad_buy) %>%
  summarise(n = n()) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(y = pct, x = size, label=sprintf("%1.1f%%",pct*100), fill = is_bad_buy)) +
  geom_col() +
  geom_text( size = 3, position = position_stack(vjust = 0.5)) + 
  labs(title = "Bad Buy by Size") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

## `summarise()` has grouped output by 'size'. You can override using the `.groups` argument.

Numeric Exploration

here we ask questions like what is the relationship of a numeric variable to the target variable. we do this via histograms, box plots and the like.

Use the following variables to create Numeric explorations of variable by target:

veh_year
vehicle_age
veh_odo
veh_b_cost
warranty_cost

cars %>%
  mutate(is_bad_buy = as.factor(is_bad_buy)) %>%
  ggplot(aes(veh_year, fill = is_bad_buy)) +
  geom_histogram( bins=35, ) + 
  labs(title = "No Fill: Does vehicle year impact a bad buy? ") +
  ylab("count")+ xlab("veh_year")

cars %>%
  mutate(is_bad_buy = as.factor(is_bad_buy)) %>%
  ggplot(aes(vehicle_age, fill=is_bad_buy)) +
  geom_histogram( bins=35, 
                  ) + 
  labs(title = "No Fill: Does vehicle age impact a bad buy? ") +
  ylab("count")+ xlab("Vehicle Age ")

cars %>%
  mutate(is_bad_buy = as.factor(is_bad_buy)) %>%
  ggplot(aes(veh_odo, fill=is_bad_buy)) +
  geom_histogram( bins=35, 
                  ) + 
  labs(title = "No Fill: Does vehicle odo impact a bad buy? ") +
  ylab("count")+ xlab("Vehicle Odo ")

cars %>%
  mutate(is_bad_buy = as.factor(is_bad_buy)) %>%
  ggplot(aes(veh_b_cost, fill=is_bad_buy)) +
  geom_histogram( bins=35,
                  ) + 
  labs(title = "No Fill: Does vehicle cost impact a bad buy? ") +
  ylab("count")+ xlab("Vehicle Cost ")

## Warning: Removed 46 rows containing non-finite values (stat_bin).

cars %>%
  mutate(is_bad_buy = as.factor(is_bad_buy)) %>%
  ggplot(aes(warranty_cost, fill=is_bad_buy)) +
  geom_histogram( bins=35,
                  ) + 
  labs(title = "No Fill: Does warranty cost impact a bad buy? ") +
  ylab("count")+ xlab("Warranty Cost ")

Correlation

We need to understand if there are any pairs of numeric variables that are highly correlated to each other - this can be a problem(don’t worry about it for now), and we need to understand how correlated numeric variables are to the target. Logistic regression has an assumption of a relationship between numeric predictors and the target.IsBadBuy is a numeric variable What numeric variables have the highest correlation to the target

use the following variables which ones are likely not useful? “ref_id”, “is_bad_buy”, “veh_year”, “vehicle_age”, “veh_odo”, “mmr_acquisition_auction_average_price” “mmr_acquisition_auction_clean_price” , “mmr_acquisition_retail_average_price” “mmr_acquisiton_retail_clean_price”, “mmr_current_auction_average_price”,
“mmr_current_auction_clean_price” , “mmr_current_retail_average_price”
“mmr_current_retail_clean_price”
“veh_b_cost”
“is_online_sale”
“warranty_cost”

library(reshape2)

## 
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':
## 
##     smiths

 cars_cor <- cars %>%
  na.omit() %>%
  select_if(is.numeric) %>%
  cor() %>%
  melt()
 
 cars_cor %>%
  ggplot(aes(x=Var1, y=Var2, fill=value)) +
  geom_tile() +
  scale_fill_gradient2(mid="#FBFEF9",low="#0C6291",high="#A63446") +
  geom_text(aes(label=round(value,2)), color="black") +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(title = "Correlation to bad buy",
       subtitle = "exclude varibles w. abs(correlation) < .2?",
       y = "",
       x = "")

 cars_cor %>%
  filter(Var1 == "is_bad_buy") %>%
  ggplot(aes(x=Var1, y=reorder(Var2,value), fill=value)) +
  geom_tile() +
  scale_fill_gradient2(mid="#FBFEF9",low="#0C6291",high="#A63446") +
  geom_text(aes(label=round(value,2)), color="black") +
  theme(axis.text.x = element_text(angle = 60)) +
  labs(title = "Correlation to is_bad_buy",
       subtitle = "exclude varibles w. abs(correlation) < .2?",
       y = "",
       x = "")

Transformation

convert character values to a factor

how do you want to handle vinzip1, this is the zip code of the car? byrno - this is the buyer number? drop any high carnality character values (think sub model, model, vinzip1, purchase date)

cars_prep <- cars %>%
  mutate(age = if_else(is.na(veh_b_cost), mean(veh_b_cost, na.rm=TRUE), veh_b_cost)) %>%
  mutate_if(is.character, as.factor) %>%
  select( -vnzip1, -byrno, -sub_model, -model, -purch_date)
 
cars_prep %>%
  skim_without_charts()

Data summary
Name	Piped data
Number of rows	51285
Number of columns	30
_______________________
Column type frequency:
factor	13
numeric	17
________________________
Group variables	None

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
auction	0	1.00	FALSE	3	MAN: 28872, OTH: 12262, ADE: 10151
make	0	1.00	FALSE	32	CHE: 12107, DOD: 9034, FOR: 7877, CHR: 6185
trim	1664	0.97	FALSE	130	Bas: 9779, LS: 7134, SE: 6500, SXT: 2700
color	0	1.00	FALSE	17	SIL: 10486, WHI: 8455, BLU: 7177, GRE: 5573
transmission	1	1.00	FALSE	3	AUT: 49465, MAN: 1811, NUL: 8
wheel_type_id	0	1.00	FALSE	5	1: 25359, 2: 23182, NUL: 2215, 3: 526
wheel_type	0	1.00	FALSE	4	All: 25359, Cov: 23182, NUL: 2218, Spe: 526
nationality	0	1.00	FALSE	5	AME: 42760, OTH: 5738, TOP: 2653, OTH: 130
size	0	1.00	FALSE	13	MED: 21639, LAR: 6260, MED: 5704, COM: 5013
top_three_american_name	0	1.00	FALSE	5	GM: 17832, CHR: 16357, FOR: 8571, OTH: 8521
primeunit	0	1.00	FALSE	3	NUL: 48873, NO: 2365, YES: 47
aucguart	0	1.00	FALSE	3	NUL: 48873, GRE: 2357, RED: 55
vnst	0	1.00	FALSE	37	TX: 9491, FL: 7298, CA: 5025, NC: 4992

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100
ref_id	0	1.00	36498.85	21078.79	1	18270	36417	54697	73014
is_bad_buy	0	1.00	0.12	0.33	0	0	0	0	1
veh_year	0	1.00	2005.34	1.73	2001	2004	2005	2007	2009
vehicle_age	0	1.00	4.18	1.71	1	3	4	5	9
veh_odo	0	1.00	71509.42	14564.30	5368	61883	73326	82419	115717
mmr_acquisition_auction_average_price	585	0.99	6207.81	2389.63	884	4351	6132	7788	35722
mmr_acquisition_auction_clean_price	496	0.99	7453.57	2638.60	1	5482	7341	9044	36859
mmr_acquisition_retail_average_price	585	0.99	8603.50	3041.34	1455	6369	8491	10679	39080
mmr_acquisiton_retail_clean_price	585	0.99	9974.01	3236.75	1662	7569	9851	12127	41482
mmr_current_auction_average_price	594	0.99	6183.66	2392.27	369	4321	6088	7750	35722
mmr_current_auction_clean_price	508	0.99	7438.82	2643.48	1	5468	7329	9022	36859
mmr_current_retail_average_price	594	0.99	8844.72	3012.27	899	6599	8756	10924	39080
mmr_current_retail_clean_price	594	0.99	10224.88	3211.04	1034	7846	10135	12328	41062
veh_b_cost	46	1.00	6733.14	1764.16	1	5440	6710	7900	38785
is_online_sale	0	1.00	0.03	0.16	0	0	0	0	1
warranty_cost	0	1.00	1278.84	602.17	462	837	1169	1623	7498
age	0	1.00	6733.14	1763.37	1	5440	6715	7900	38785

Partition 70/30

partition cars data into a 70/30, train / test split

set.seed(123)

train_test_split <- initial_split(cars_prep, prop = 0.7)

train <- training(train_test_split)

test <- testing(train_test_split)

sprintf("Train PCT : %1.2f%%", nrow(train)/ nrow(cars_prep) * 100)

## [1] "Train PCT : 70.00%"

sprintf("Test  PCT : %1.2f%%", nrow(test)/ nrow(cars_prep) * 100)

## [1] "Test  PCT : 30.00%"

Logistic Model

glm() model approach

create a logistic model using glm pick 5 variables you believe will be most useful to predict is bad buy.

car_glm_model <- glm(is_bad_buy ~ veh_odo + vehicle_age, data = train)

– eyeball model fit, don’t worry about it– glance(car_glm_model)

– look at model coefficients – tidy(car_glm_model) %>% mutate_at(c(“estimate”, “std.error”, “statistic”, “p.value”),round, 4)

What does this tell you about your model?

car_glm_model <- glm(is_bad_buy ~ veh_b_cost + warranty_cost + vehicle_age + veh_year + veh_odo, data = train)

glance(car_glm_model)

## # A tibble: 1 x 8
##   null.deviance df.null  logLik    AIC    BIC deviance df.residual  nobs
##           <dbl>   <int>   <dbl>  <dbl>  <dbl>    <dbl>       <int> <int>
## 1         3874.   35867 -10359. 20732. 20791.    3742.       35862 35868

tidy(car_glm_model) %>%
    mutate_at(c("estimate", "std.error", "statistic", "p.value"),round, 4)

## # A tibble: 6 x 5
##   term          estimate std.error statistic p.value
##   <chr>            <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)   -25.5       7.10      -3.60   0.0003
## 2 veh_b_cost      0         0        -11.2    0     
## 3 warranty_cost   0         0          0.692  0.489 
## 4 vehicle_age     0.0392    0.0035    11.1    0     
## 5 veh_year        0.0127    0.0035     3.60   0.0003
## 6 veh_odo         0         0          5.81   0

#is_bad_buy ~ veh_b_cost + warranty_cost + vehicle_age + byrno + ref_id

glm() evaluate model

here you will need to

make predictions on the test data set ex: test$.pred <- predict(car_glm_model, test, type = “response”)
assign a predicted class ex: mutate(.pred_class = as.factor(if_else(.pred >= 0.5, “1”,“0”)))

 test$.pred <- predict(car_glm_model, test, type = "response")

train$.pred <- predict(car_glm_model, train, type = "response")

test <- test %>% 
  mutate(is_bad_buy = as.factor(is_bad_buy)) %>%
  mutate(.pred_class = as.factor(if_else(.pred >= 0.2, "1","0"))) 

train <- train %>% 
  mutate(is_bad_buy = as.factor(is_bad_buy)) %>%
  mutate(.pred_class = as.factor(if_else(.pred >= 0.2, "1","0")))

head(test,10)

## # A tibble: 10 x 32
##    ref_id is_bad_buy auction veh_year vehicle_age make  trim  color transmission
##     <dbl> <fct>      <fct>      <dbl>       <dbl> <fct> <fct> <fct> <fct>       
##  1      3 0          ADESA       2005           4 DODGE SXT   MARO… AUTO        
##  2      9 0          ADESA       2007           2 KIA   EX    BLACK AUTO        
##  3     10 0          ADESA       2007           2 FORD  SEL   RED   AUTO        
##  4     12 0          ADESA       2001           8 FORD  XL    WHITE MANUAL      
##  5     13 1          ADESA       2005           4 DODGE SE    RED   AUTO        
##  6     15 0          ADESA       2006           3 DODGE SXT   GOLD  AUTO        
##  7     25 0          ADESA       2007           2 CHEV… LS    GREY  AUTO        
##  8     27 0          ADESA       2002           7 GMC   SLE   GOLD  AUTO        
##  9     28 0          ADESA       2004           5 DODGE ST    WHITE AUTO        
## 10     29 0          ADESA       2004           5 DODGE SLT   RED   AUTO        
## # … with 23 more variables: wheel_type_id <fct>, wheel_type <fct>,
## #   veh_odo <dbl>, nationality <fct>, size <fct>,
## #   top_three_american_name <fct>, mmr_acquisition_auction_average_price <dbl>,
## #   mmr_acquisition_auction_clean_price <dbl>,
## #   mmr_acquisition_retail_average_price <dbl>,
## #   mmr_acquisiton_retail_clean_price <dbl>,
## #   mmr_current_auction_average_price <dbl>,
## #   mmr_current_auction_clean_price <dbl>,
## #   mmr_current_retail_average_price <dbl>,
## #   mmr_current_retail_clean_price <dbl>, primeunit <fct>, aucguart <fct>,
## #   vnst <fct>, veh_b_cost <dbl>, is_online_sale <dbl>, warranty_cost <dbl>,
## #   age <dbl>, .pred <dbl>, .pred_class <fct>

Evaluate glm()

calculate accuracy
calculate precision
make a confusion matrix
make a score distribution
make a variable importance plot

hint: look at 13_logistic_reg_pt1,rmd

accuracy(test, truth=is_bad_buy, estimate = .pred_class, na_rm = TRUE)

## # A tibble: 1 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.815

precision(test, truth=is_bad_buy, estimate = .pred_class, na_rm = TRUE)

## # A tibble: 1 x 3
##   .metric   .estimator .estimate
##   <chr>     <chr>          <dbl>
## 1 precision binary         0.892

test %>%
  conf_mat(is_bad_buy, .pred_class) %>%
  autoplot(type = "heatmap") +
  coord_flip() +
  labs(title="Confusion Matrix")

test %>%
  ggplot(aes(x=.pred, fill=is_bad_buy)) +
  geom_histogram(bins=100) +
  geom_vline(xintercept = 0.2) +
  annotate(geom="text", x=0.6, y=1000, label="True Positive & False Positives",
              color="red") +
  annotate(geom="text", x=0.05, y=1000, label="True Negatives & False Negatives",
              color="blue") + 
  labs(title="Test score distribution",
       x = "predicted probability",
       y = "count")

## Warning: Removed 14 rows containing non-finite values (stat_bin).

vip(car_glm_model, 20) +
  labs(title="Variable Importance")

tidymodel model approach

Instead of the old way use tidymdoels to define a recipe

Define a Recipe

here simply find 5 - 10 variables (or all of them) that you want to build a model with here is an example

car_recipe <- recipe(is_bad_buy ~ make + wheel_type + vnst + vehicle_age + veh_odo + veh_b_cost + is_online_sale, data=cars) %>% step_modeimpute(all_nominal(), -all_numeric()) %>% step_meanimpute(all_numeric(), -all_nominal()) %>% step_nzv( unique_cut = 3 ,all_nominal()) %>% step_novel(all_nominal(), -all_outcomes()) %>% step_dummy(all_nominal(), -all_outcomes()) %>% prep()

car_recipe <-recipe(is_bad_buy ~ veh_b_cost + warranty_cost + vehicle_age + veh_year + veh_odo
                       , data=cars) %>%
  step_impute_mode(all_nominal(), -all_numeric()) %>%
  step_impute_mean(all_numeric(), -all_nominal()) %>%
  step_nzv( unique_cut = 3 ,all_nominal()) %>%
  step_novel(all_nominal(), -all_outcomes()) %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%
  prep()

Bake Recipe

use the bake() function to apply your recipe to training and test

bake_train <- bake(car_recipe, train)

print out the before and after column count!

bake_train <- bake(car_recipe, train)

bake_test <- bake(car_recipe, test)

sprintf("Column count before baking : %2d ", ncol(train))

## [1] "Column count before baking : 32 "

sprintf("Column count before baking : %2d ", ncol(test))

## [1] "Column count before baking : 32 "

sprintf("Column count after  baking : %2d", ncol(bake_train))

## [1] "Column count after  baking :  6"

sprintf("Column count after  baking : %2d", ncol(bake_test))

## [1] "Column count after  baking :  6"

Create and Fit Model

your model should look something like this:

car_glm_model <- logistic_reg() %>% set_mode(“classification”) %>% set_engine(“glm”) %>% fit(is_bad_buy ~ ., data= bake_train)

for fun you can create a random forest like this:

car_rf_model <- rand_forest(trees=15) %>% set_mode(“classification”) %>% set_engine(“ranger”, importance = “permutation”) %>% fit(is_bad_buy ~ ., data= bake_train)

car_glm_model <- logistic_reg() %>%
  set_mode("classification") %>%
  set_engine("glm") %>%
  fit(is_bad_buy ~ veh_b_cost + warranty_cost + vehicle_age + veh_year + veh_odo , data= bake_test)

car_glm_model <- logistic_reg() %>%
  set_mode("classification") %>%
  set_engine("glm") %>%
  fit(is_bad_buy ~ veh_b_cost + warranty_cost + vehicle_age + veh_year + veh_odo , data= bake_train)

Evaluate Model

Make Predictions

something like the following for both training and test

train_scored <- predict(car_glm_model, bake_train, type = “prob”) %>% bind_cols(predict(car_glm_model, bake_train, type = “class”)) %>% bind_cols(train)

train_scored <- predict(car_glm_model, bake_train, type = "prob") %>%
  bind_cols(predict(car_glm_model, bake_train, type = "class"))  %>%
  bind_cols(train)

## New names:
## * .pred_class -> .pred_class...3
## * .pred_class -> .pred_class...35

train_scored_test <- predict(car_glm_model, bake_test, type = "prob") %>%
  bind_cols(predict(car_glm_model, bake_test, type = "class"))  %>%
  bind_cols(test)

## New names:
## * .pred_class -> .pred_class...3
## * .pred_class -> .pred_class...35

train_scored %>%
   mutate(is_bad_buy = as.factor(is_bad_buy))

## # A tibble: 35,900 x 35
##    .pred_0 .pred_1 .pred_class...3 ref_id is_bad_buy auction veh_year
##      <dbl>   <dbl> <fct>            <dbl> <fct>      <fct>      <dbl>
##  1   0.908  0.0918 0                    1 0          ADESA       2006
##  2   0.849  0.151  0                    6 0          ADESA       2004
##  3   0.848  0.152  0                    7 0          ADESA       2004
##  4   0.907  0.0925 0                   14 0          ADESA       2005
##  5   0.891  0.109  0                   17 0          ADESA       2005
##  6   0.814  0.186  0                   18 0          ADESA       2003
##  7   0.892  0.108  0                   19 0          ADESA       2005
##  8   0.805  0.195  0                   22 0          ADESA       2002
##  9   0.811  0.189  0                   23 0          ADESA       2004
## 10   0.879  0.121  0                   30 0          ADESA       2005
## # … with 35,890 more rows, and 28 more variables: vehicle_age <dbl>,
## #   make <fct>, trim <fct>, color <fct>, transmission <fct>,
## #   wheel_type_id <fct>, wheel_type <fct>, veh_odo <dbl>, nationality <fct>,
## #   size <fct>, top_three_american_name <fct>,
## #   mmr_acquisition_auction_average_price <dbl>,
## #   mmr_acquisition_auction_clean_price <dbl>,
## #   mmr_acquisition_retail_average_price <dbl>,
## #   mmr_acquisiton_retail_clean_price <dbl>,
## #   mmr_current_auction_average_price <dbl>,
## #   mmr_current_auction_clean_price <dbl>,
## #   mmr_current_retail_average_price <dbl>,
## #   mmr_current_retail_clean_price <dbl>, primeunit <fct>, aucguart <fct>,
## #   vnst <fct>, veh_b_cost <dbl>, is_online_sale <dbl>, warranty_cost <dbl>,
## #   age <dbl>, .pred <dbl>, .pred_class...35 <fct>

train_scored_test %>%
   mutate(is_bad_buy = as.factor(is_bad_buy))

## # A tibble: 15,385 x 35
##    .pred_0 .pred_1 .pred_class...3 ref_id is_bad_buy auction veh_year
##      <dbl>   <dbl> <fct>            <dbl> <fct>      <fct>      <dbl>
##  1   0.876  0.124  0                    3 0          ADESA       2005
##  2   0.937  0.0629 0                    9 0          ADESA       2007
##  3   0.933  0.0672 0                   10 0          ADESA       2007
##  4   0.739  0.261  0                   12 0          ADESA       2001
##  5   0.877  0.123  0                   13 1          ADESA       2005
##  6   0.917  0.0833 0                   15 0          ADESA       2006
##  7   0.938  0.0620 0                   25 0          ADESA       2007
##  8   0.811  0.189  0                   27 0          ADESA       2002
##  9   0.892  0.108  0                   28 0          ADESA       2004
## 10   0.897  0.103  0                   29 0          ADESA       2004
## # … with 15,375 more rows, and 28 more variables: vehicle_age <dbl>,
## #   make <fct>, trim <fct>, color <fct>, transmission <fct>,
## #   wheel_type_id <fct>, wheel_type <fct>, veh_odo <dbl>, nationality <fct>,
## #   size <fct>, top_three_american_name <fct>,
## #   mmr_acquisition_auction_average_price <dbl>,
## #   mmr_acquisition_auction_clean_price <dbl>,
## #   mmr_acquisition_retail_average_price <dbl>,
## #   mmr_acquisiton_retail_clean_price <dbl>,
## #   mmr_current_auction_average_price <dbl>,
## #   mmr_current_auction_clean_price <dbl>,
## #   mmr_current_retail_average_price <dbl>,
## #   mmr_current_retail_clean_price <dbl>, primeunit <fct>, aucguart <fct>,
## #   vnst <fct>, veh_b_cost <dbl>, is_online_sale <dbl>, warranty_cost <dbl>,
## #   age <dbl>, .pred <dbl>, .pred_class...35 <fct>

skim(train_scored)

Data summary
Name	train_scored
Number of rows	35900
Number of columns	35
_______________________
Column type frequency:
factor	16
numeric	19
________________________
Group variables	None

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
.pred_class…3	0	1.00	FALSE	1	0: 35900, 1: 0
is_bad_buy	0	1.00	FALSE	2	0: 31472, 1: 4428
auction	0	1.00	FALSE	3	MAN: 20133, OTH: 8675, ADE: 7092
make	0	1.00	FALSE	30	CHE: 8465, DOD: 6331, FOR: 5510, CHR: 4359
trim	1129	0.97	FALSE	125	Bas: 6895, LS: 5016, SE: 4543, SXT: 1919
color	0	1.00	FALSE	17	SIL: 7381, WHI: 5948, BLU: 5057, GRE: 3884
transmission	1	1.00	FALSE	3	AUT: 34612, MAN: 1282, NUL: 5
wheel_type_id	0	1.00	FALSE	4	1: 17733, 2: 16237, NUL: 1558, 3: 372
wheel_type	0	1.00	FALSE	4	All: 17733, Cov: 16237, NUL: 1558, Spe: 372
nationality	0	1.00	FALSE	5	AME: 29952, OTH: 3979, TOP: 1875, OTH: 90
size	0	1.00	FALSE	13	MED: 15135, LAR: 4359, MED: 3973, COM: 3502
top_three_american_name	0	1.00	FALSE	5	GM: 12447, CHR: 11497, FOR: 6008, OTH: 5944
primeunit	0	1.00	FALSE	3	NUL: 34193, NO: 1671, YES: 36
aucguart	0	1.00	FALSE	3	NUL: 34193, GRE: 1675, RED: 32
vnst	0	1.00	FALSE	37	TX: 6616, FL: 5056, NC: 3525, CA: 3511
.pred_class…35	32	1.00	FALSE	2	0: 31735, 1: 4133

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
.pred_0	0	1.00	0.88	0.06	0.53	0.85	0.89	0.92	1.00	▁▁▂▇▇
.pred_1	0	1.00	0.12	0.06	0.00	0.08	0.11	0.15	0.47	▇▇▂▁▁
ref_id	0	1.00	36486.28	21089.72	1.00	18287.75	36333.50	54713.75	73013.00	▇▇▇▇▇
veh_year	0	1.00	2005.35	1.73	2001.00	2004.00	2005.00	2007.00	2009.00	▂▅▅▇▂
vehicle_age	0	1.00	4.17	1.72	1.00	3.00	4.00	5.00	9.00	▃▇▃▃▁
veh_odo	0	1.00	71515.13	14531.70	5368.00	61885.00	73343.50	82438.25	115026.00	▁▁▆▇▁
mmr_acquisition_auction_average_price	420	0.99	6222.02	2391.99	884.00	4368.00	6155.00	7790.00	35722.00	▇▂▁▁▁
mmr_acquisition_auction_clean_price	354	0.99	7467.86	2643.17	1.00	5499.00	7357.50	9045.00	36859.00	▇▇▁▁▁
mmr_acquisition_retail_average_price	420	0.99	8618.35	3036.05	1455.00	6387.00	8505.00	10681.25	39080.00	▇▆▁▁▁
mmr_acquisiton_retail_clean_price	420	0.99	9989.35	3231.50	1662.00	7597.00	9855.00	12135.00	40308.00	▆▇▁▁▁
mmr_current_auction_average_price	409	0.99	6196.11	2395.74	430.00	4334.00	6105.00	7763.00	35722.00	▇▃▁▁▁
mmr_current_auction_clean_price	344	0.99	7451.26	2649.91	1.00	5480.00	7339.00	9039.00	36859.00	▇▇▁▁▁
mmr_current_retail_average_price	409	0.99	8861.33	3008.36	964.00	6618.00	8787.00	10937.50	39080.00	▇▇▁▁▁
mmr_current_retail_clean_price	409	0.99	10242.70	3208.07	1203.00	7867.00	10153.00	12341.00	40308.00	▅▇▁▁▁
veh_b_cost	32	1.00	6741.34	1762.49	1620.00	5450.00	6730.00	7900.00	38785.00	▇▁▁▁▁
is_online_sale	0	1.00	0.03	0.16	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
warranty_cost	0	1.00	1276.03	594.96	462.00	837.00	1169.00	1623.00	7198.00	▇▂▁▁▁
age	0	1.00	6741.33	1761.71	1620.00	5455.00	6730.00	7900.00	38785.00	▇▁▁▁▁
.pred	32	1.00	0.12	0.06	-0.36	0.08	0.12	0.16	0.33	▁▁▂▇▂

skim(train_scored_test)

Data summary
Name	train_scored_test
Number of rows	15385
Number of columns	35
_______________________
Column type frequency:
factor	16
numeric	19
________________________
Group variables	None

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
.pred_class…3	0	1.00	FALSE	1	0: 15385, 1: 0
is_bad_buy	0	1.00	FALSE	2	0: 13475, 1: 1910
auction	0	1.00	FALSE	3	MAN: 8739, OTH: 3587, ADE: 3059
make	0	1.00	FALSE	32	CHE: 3642, DOD: 2703, FOR: 2367, CHR: 1826
trim	535	0.97	FALSE	113	Bas: 2884, LS: 2118, SE: 1957, SXT: 781
color	0	1.00	FALSE	17	SIL: 3105, WHI: 2507, BLU: 2120, GRE: 1689
transmission	0	1.00	FALSE	3	AUT: 14853, MAN: 529, NUL: 3
wheel_type_id	0	1.00	FALSE	5	1: 7626, 2: 6945, NUL: 657, 3: 154
wheel_type	0	1.00	FALSE	4	All: 7626, Cov: 6945, NUL: 660, Spe: 154
nationality	0	1.00	FALSE	4	AME: 12808, OTH: 1759, TOP: 778, OTH: 40
size	0	1.00	FALSE	12	MED: 6504, LAR: 1901, MED: 1731, COM: 1511
top_three_american_name	0	1.00	FALSE	4	GM: 5385, CHR: 4860, OTH: 2577, FOR: 2563
primeunit	0	1.00	FALSE	3	NUL: 14680, NO: 694, YES: 11
aucguart	0	1.00	FALSE	3	NUL: 14680, GRE: 682, RED: 23
vnst	0	1.00	FALSE	37	TX: 2875, FL: 2242, CA: 1514, NC: 1467
.pred_class…35	14	1.00	FALSE	2	0: 13565, 1: 1806

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
.pred_0	0	1.00	0.88	0.06	0.51	0.85	0.89	0.92	1.00	▁▁▂▇▇
.pred_1	0	1.00	0.12	0.06	0.00	0.08	0.11	0.15	0.49	▇▇▂▁▁
ref_id	0	1.00	36528.19	21053.91	3.00	18230.00	36601.00	54660.00	73014.00	▇▇▇▇▇
veh_year	0	1.00	2005.33	1.74	2001.00	2004.00	2005.00	2007.00	2009.00	▂▅▅▇▂
vehicle_age	0	1.00	4.19	1.71	1.00	3.00	4.00	5.00	9.00	▃▇▃▃▁
veh_odo	0	1.00	71496.10	14640.56	8706.00	61875.00	73274.00	82395.00	115717.00	▁▂▆▇▁
mmr_acquisition_auction_average_price	165	0.99	6174.68	2383.85	966.00	4324.00	6083.00	7776.00	32063.00	▇▃▁▁▁
mmr_acquisition_auction_clean_price	142	0.99	7420.25	2627.68	1.00	5434.00	7306.00	9032.00	35108.00	▇▇▁▁▁
mmr_acquisition_retail_average_price	165	0.99	8568.88	3053.43	1543.00	6311.00	8472.50	10675.25	37885.00	▇▇▁▁▁
mmr_acquisiton_retail_clean_price	165	0.99	9938.25	3248.78	2274.00	7515.75	9848.00	12111.00	41482.00	▇▇▁▁▁
mmr_current_auction_average_price	185	0.99	6154.59	2383.97	369.00	4293.50	6056.00	7728.00	31127.00	▇▆▁▁▁
mmr_current_auction_clean_price	164	0.99	7409.77	2628.25	1.00	5433.00	7308.00	8995.00	34798.00	▇▇▁▁▁
mmr_current_retail_average_price	185	0.99	8805.94	3021.12	899.00	6556.75	8700.50	10905.00	38151.00	▇▇▁▁▁
mmr_current_retail_clean_price	185	0.99	10183.27	3217.67	1034.00	7798.00	10090.00	12308.00	41062.00	▅▇▁▁▁
veh_b_cost	14	1.00	6714.00	1767.95	1.00	5410.00	6690.00	7900.00	36485.00	▇▅▁▁▁
is_online_sale	0	1.00	0.03	0.16	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
warranty_cost	0	1.00	1285.38	618.64	462.00	853.00	1169.00	1623.00	7498.00	▇▂▁▁▁
age	0	1.00	6714.02	1767.15	1.00	5410.00	6695.00	7900.00	36485.00	▇▅▁▁▁
.pred	14	1.00	0.12	0.06	-0.32	0.08	0.12	0.16	0.35	▁▁▃▇▁

Evaluate

for both train and test

calculate accuracy()
calculate precision()
make a confusion matrix
make a score distribution
make a variable importance plot

What does precision mean? can you compare it to the do assign everyone to the minority case?

accuracy(train, truth=is_bad_buy, estimate = .pred_class, na_rm = TRUE)

## # A tibble: 1 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.821

accuracy(test, truth=is_bad_buy, estimate = .pred_class, na_rm = TRUE)

## # A tibble: 1 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.815

precision(train, truth=is_bad_buy, estimate = .pred_class, na_rm = TRUE)

## # A tibble: 1 x 3
##   .metric   .estimator .estimate
##   <chr>     <chr>          <dbl>
## 1 precision binary         0.894

precision(test, truth=is_bad_buy, estimate = .pred_class, na_rm = TRUE)

## # A tibble: 1 x 3
##   .metric   .estimator .estimate
##   <chr>     <chr>          <dbl>
## 1 precision binary         0.892

train %>%
  conf_mat(is_bad_buy, .pred_class) %>%
  autoplot(type = "heatmap") +
  coord_flip() +
  labs(title="Train Confusion Matrix")

test %>%
  conf_mat(is_bad_buy, .pred_class) %>%
  autoplot(type = "heatmap") +
  coord_flip() +
  labs(title="Test Confusion Matrix")

train%>%
  ggplot(aes(x=.pred, fill=is_bad_buy)) +
  geom_histogram(bins=100) +
  geom_vline(xintercept = 0.2) +
  annotate(geom="text", x=0.6, y=1000, label="True Positive & False Positives",
              color="red") +
  annotate(geom="text", x=0.05, y=1000, label="True Negatives & False Negatives",
              color="blue") + 
  labs(title="Train score distribution",
       x = "predicted probability",
       y = "count")

## Warning: Removed 32 rows containing non-finite values (stat_bin).

test %>%
  ggplot(aes(x=.pred, fill=is_bad_buy)) +
  geom_histogram(bins=100) +
  geom_vline(xintercept = 0.2) +
  annotate(geom="text", x=0.6, y=1000, label="True Positive & False Positives",
              color="red") +
  annotate(geom="text", x=0.05, y=1000, label="True Negatives & False Negatives",
              color="blue") + 
  labs(title="Test score distribution",
       x = "predicted probability",
       y = "count")

## Warning: Removed 14 rows containing non-finite values (stat_bin).

vip(car_glm_model, 20) +
  labs(title="Variable Importance")

Week 14 logistic regression

April 1, 2021

Overview

Load Libraries

Stage

Explore

Skim your data

Explore Categorical Variables

Numeric Exploration

Correlation

Transformation

Partition 70/30

Logistic Model

glm() model approach

glm() evaluate model

Evaluate glm()

tidymodel model approach

Define a Recipe

Bake Recipe

Create and Fit Model

Evaluate Model

Make Predictions

Evaluate