Interpreting coefficients of categorical variables

Techniques involved: logistic regression, interpretting coefficients, AUC, ROCR curve, cross validation

Individuals selling used items are often faced with a difficult choice – should they try to sell the items through a yard/estate sale, a consignment shop, an auction, or some other means? Often, this choice will come down to the convenience of selling the items, the price the items can fetch, and the speed with which the items can be sold.

To determine whether analytics can be used to help make this choice, we will look at whether data from previous auctions on eBay, a major online auction and shopping site, can be used to predict whether a new item will be sold at some target price. We will limit our attention to Christian Louboutin shoes, using data from nearly 4,000 auctions from late 2014. In this analysis, the dependent variable will be the binary outcome variable sold, which takes value 1 if the item was sold and 0 if it was not sold. We also include saleprice, which is the price the shoe sold at (NA for shoes that did not sell). For each item, the file ebay.csv contains the following independent variables:

biddable: Whether this is an auction (biddable=1) or a sale with a fixed price (biddable=0)

startprice: The start price (in US Dollars) for the auction (if biddable=1) or the sale price (if biddable=0)

condition: The condition of the shoe (New with box, New with defects, New without box, or Pre-owned)

size: The size of the shoe (converted to US shoe sizes)

heel: The size of the heel (Flat, Low, Medium, High)

style: The style of the shoe (Open Toe, Platform, Pump, Slingback, Stiletto, or Other/Missing)

color: The color of the shoe (Beige, Black, Brown, Red, or Other/Missing)

material: The material of the shoe (Leather, Patent Leather, Satin, Snakeskin, Suede, or Other/Missing)

snippit: A short snippit of text describing the shoe

description: A long text description describing the shoe

Load the data

setwd("C:/Users/jzchen/Documents/Courses/Analytics Edge/Final")
eBay <- read.csv("ebay.csv", stringsAsFactors = F)
table(eBay$sold)
## 
##    0    1 
## 2997  799
summary(eBay)
##     biddable           sold          startprice       saleprice     
##  Min.   :0.0000   Min.   :0.0000   Min.   :   0.0   Min.   :   0.0  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.: 280.0   1st Qu.: 200.0  
##  Median :1.0000   Median :0.0000   Median : 449.0   Median : 325.0  
##  Mean   :0.5911   Mean   :0.2105   Mean   : 472.3   Mean   : 372.7  
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.: 600.0   3rd Qu.: 500.0  
##  Max.   :1.0000   Max.   :1.0000   Max.   :4500.0   Max.   :3995.0  
##                                                     NA's   :2997    
##   condition              size            heel              style          
##  Length:3796        Min.   : 4.000   Length:3796        Length:3796       
##  Class :character   1st Qu.: 7.000   Class :character   Class :character  
##  Mode  :character   Median : 8.000   Mode  :character   Mode  :character  
##                     Mean   : 7.933                                        
##                     3rd Qu.: 9.000                                        
##                     Max.   :12.000                                        
##                     NA's   :68                                            
##     color             material           snippit         
##  Length:3796        Length:3796        Length:3796       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  description       
##  Length:3796       
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
sort(table(eBay$size))
## 
##    4   12 11.5  4.5   11    5 10.5  5.5   10    6  9.5  6.5  8.5    9    7 
##   14   14   21   28   79   87  139  154  235  264  298  356  385  397  402 
##  7.5    8 
##  413  442

CONVERTING VARIABLES TO FACTORS

eBay$sold <- as.factor(eBay$sold)
eBay$condition <- as.factor(eBay$condition)
eBay$heel <- as.factor(eBay$heel)
eBay$style <- as.factor(eBay$style)
eBay$color <- as.factor(eBay$color)
eBay$material <- as.factor(eBay$material)

Split the data

library(caTools)
set.seed(144)
spl <- sample.split(eBay$sold, SplitRatio = 0.7)
training <- subset(eBay, spl == T)
testing <- subset(eBay, spl == F)

TRAINING A LOGISTIC REGRESSION MODEL

LR <- glm(sold ~ biddable + startprice + condition + heel + style + color + material, data = training, family = binomial)
summary(LR)
## 
## Call:
## glm(formula = sold ~ biddable + startprice + condition + heel + 
##     style + color + material, family = binomial, data = training)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5805  -0.7022  -0.5002  -0.2166   5.9322  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                0.5990788  0.3285428   1.823 0.068236 .  
## biddable                   0.0113984  0.1114610   0.102 0.918547    
## startprice                -0.0044423  0.0003041 -14.607  < 2e-16 ***
## conditionNew with defects -0.2451855  0.3727966  -0.658 0.510736    
## conditionNew without box  -0.2145965  0.2290351  -0.937 0.348780    
## conditionPre-owned        -0.4952981  0.1374505  -3.603 0.000314 ***
## heelFlat                   0.1431346  0.6387994   0.224 0.822704    
## heelHigh                   0.1224260  0.1340119   0.914 0.360955    
## heelLow                   -2.5549302  1.0411255  -2.454 0.014127 *  
## heelMedium                -0.5830418  0.2674958  -2.180 0.029285 *  
## styleOther/Missing         0.5268920  0.2127852   2.476 0.013280 *  
## stylePlatform             -0.1712048  0.2102085  -0.814 0.415386    
## stylePump                  0.4683107  0.1817995   2.576 0.009996 ** 
## styleSlingback            -0.2294999  0.2535765  -0.905 0.365438    
## styleStiletto              0.8325406  0.2606786   3.194 0.001404 ** 
## colorBlack                 0.2226547  0.1766847   1.260 0.207604    
## colorBrown                -0.5252811  0.2982060  -1.761 0.078159 .  
## colorOther/Missing        -0.2051389  0.1793759  -1.144 0.252779    
## colorRed                  -0.1261035  0.2705234  -0.466 0.641111    
## materialOther/Missing     -0.2192565  0.1531385  -1.432 0.152214    
## materialPatent Leather     0.0809572  0.1431549   0.566 0.571719    
## materialSatin             -1.1078098  0.3153264  -3.513 0.000443 ***
## materialSnakeskin          0.1562727  0.3444677   0.454 0.650070    
## materialSuede             -0.0713244  0.1789439  -0.399 0.690199    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2733.9  on 2656  degrees of freedom
## Residual deviance: 2372.7  on 2633  degrees of freedom
## AIC: 2420.7
## 
## Number of Fisher Scoring iterations: 5

What is the meaning of the coefficient labeled “styleStiletto” in the logistic regression summary output? The coefficients of the model are the log odds associated with that variable; so we see that the odds of being sold are exp(0.8325406)=2.299153 those of an otherwise identical shoe in the baseline category for the style variable (which is “Open Toe”). This means the stiletto is predicted to have 129.9% higher odds of being sold.

PREDICTING USING A LOGISTIC REGRESSION MODEL

Consider a shoe that is not for auction (biddable=0), that has start price $100, that is in condition “Pre-owned”, that has “High” heels, that has style “Open Toe”, that has color “Black”, and that has material “Satin”. What is the predicted probability that this shoe will be sold according to the logistic regression model?

LRpredict <- predict(LR, newdata = data.frame(biddable = 0, startprice = 100, condition = "Pre-owned", heel = "High", style = "Open Toe", color = "Black", material = "Satin"), type = "response")

OBTAINING TEST SET PREDICTIONS

LRpred <- predict(LR, newdata = testing, type = "response")
table(LRpred >= 0.5)
## 
## FALSE  TRUE 
##  1059    80
table(training$sold)
## 
##    0    1 
## 2098  559

COMPUTING TEST-SET AUC

library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:stats':
## 
##     lowess
ROCRpred <- prediction(LRpred, testing$sold)
as.numeric(performance(ROCRpred, "auc")@y.values)
## [1] 0.7444244
perfROCR <- performance(ROCRpred, "tpr", "fpr")
plot(perfROCR, colorize = T)

CROSS-VALIDATION TO SELECT PARAMETERS

In 10-fold cross validation, the model with each parameter setting will be trained on 10 90% subsets of the training set. Hence, a total of 30 models will be trained. The models are evaluated in each case on the last 10% of the training set (not on the testing set).

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(e1071)
set.seed(144)
numFolds <- trainControl(method = "cv", number = 10)
cpGrid <- expand.grid(.cp = seq(0.001, 0.05, 0.001))
train(sold ~ biddable + startprice + condition + heel + style + color + material, data = training, method = "rpart", trControl = numFolds, tuneGrid = cpGrid)
## Loading required package: rpart
## CART 
## 
## 2657 samples
##   11 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## 
## Summary of sample sizes: 2391, 2391, 2392, 2391, 2392, 2391, ... 
## 
## Resampling results across tuning parameters:
## 
##   cp     Accuracy   Kappa      Accuracy SD  Kappa SD  
##   0.001  0.8091928  0.3009632  0.025965542  0.10279441
##   0.002  0.8106994  0.2830072  0.021504296  0.08698237
##   0.003  0.8152135  0.2785765  0.022484082  0.09425898
##   0.004  0.8148291  0.2777788  0.023007816  0.10510122
##   0.005  0.8174606  0.2792412  0.021719248  0.10717739
##   0.006  0.8170833  0.2756554  0.023126922  0.11405847
##   0.007  0.8163314  0.2705521  0.022331590  0.10935238
##   0.008  0.8163314  0.2707547  0.021544196  0.10553452
##   0.009  0.8163314  0.2707547  0.021544196  0.10553452
##   0.010  0.8167073  0.2674603  0.021927526  0.10898871
##   0.011  0.8152022  0.2602950  0.023073901  0.10434024
##   0.012  0.8140743  0.2590317  0.023316362  0.10430422
##   0.013  0.8148262  0.2592311  0.022682228  0.10416590
##   0.014  0.8140729  0.2676487  0.023127717  0.09776566
##   0.015  0.8140729  0.2676487  0.023127717  0.09776566
##   0.016  0.8140729  0.2676487  0.023127717  0.09776566
##   0.017  0.8140729  0.2676487  0.023127717  0.09776566
##   0.018  0.8140729  0.2676487  0.023127717  0.09776566
##   0.019  0.8140729  0.2676487  0.023127717  0.09776566
##   0.020  0.8140729  0.2676487  0.023127717  0.09776566
##   0.021  0.8140729  0.2676487  0.023127717  0.09776566
##   0.022  0.8140729  0.2676487  0.023127717  0.09776566
##   0.023  0.8140729  0.2676487  0.023127717  0.09776566
##   0.024  0.8140729  0.2676487  0.023127717  0.09776566
##   0.025  0.8121932  0.2511626  0.022012193  0.09643119
##   0.026  0.8125692  0.2531311  0.022394577  0.09886132
##   0.027  0.8125692  0.2531311  0.022394577  0.09886132
##   0.028  0.8076720  0.2147947  0.016193693  0.06488761
##   0.029  0.8076720  0.2147947  0.016193693  0.06488761
##   0.030  0.8057923  0.1998072  0.013016926  0.04098977
##   0.031  0.8057923  0.1998072  0.013016926  0.04098977
##   0.032  0.8057923  0.1998072  0.013016926  0.04098977
##   0.033  0.8057923  0.1998072  0.013016926  0.04098977
##   0.034  0.8057923  0.1998072  0.013016926  0.04098977
##   0.035  0.8057923  0.1998072  0.013016926  0.04098977
##   0.036  0.8057923  0.1998072  0.013016926  0.04098977
##   0.037  0.8057923  0.1998072  0.013016926  0.04098977
##   0.038  0.8057923  0.1998072  0.013016926  0.04098977
##   0.039  0.8057923  0.1998072  0.013016926  0.04098977
##   0.040  0.8057923  0.1998072  0.013016926  0.04098977
##   0.041  0.8057923  0.1998072  0.013016926  0.04098977
##   0.042  0.8057923  0.1998072  0.013016926  0.04098977
##   0.043  0.8057923  0.1998072  0.013016926  0.04098977
##   0.044  0.8057923  0.1998072  0.013016926  0.04098977
##   0.045  0.8057923  0.1998072  0.013016926  0.04098977
##   0.046  0.8020187  0.1742727  0.011785686  0.07105536
##   0.047  0.8020187  0.1742727  0.011785686  0.07105536
##   0.048  0.7990112  0.1534393  0.010592145  0.08838699
##   0.049  0.7963796  0.1350344  0.009128368  0.09973832
##   0.050  0.7941240  0.1191052  0.007467116  0.10782603
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was cp = 0.005.

Train CART model

library(rpart)
library(rpart.plot)
CARTmodel <- rpart(sold ~ biddable + startprice + condition + heel + style + color + material, data = training, method = "class", cp = 0.005)
prp(CARTmodel)

BUILDING A CORPUS

library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## 
## The following object is masked from 'package:ggplot2':
## 
##     annotate
corpus <- Corpus(VectorSource(eBay$description))

Pre-process the corpus

corpus <- tm_map(corpus, tolower)
corpus <- tm_map(corpus, PlainTextDocument)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, stemDocument)
dtm <- DocumentTermMatrix(corpus)
spdtm <- removeSparseTerms(dtm, 0.90)
descriptionText <- as.data.frame(as.matrix(spdtm))

Which word stem appears the most frequently across all descriptions?

sort(colSums(descriptionText))
##   dustbag      take     money     upper      also     cover     never 
##       402       479       481       483       505       510       511 
##      just   perfect       tri     right      feel      sold    preown 
##       523       523       536       543       544       548       562 
##       run    unless    bottom     happi      line  deliveri       due 
##       563       576       579       590       608       609       613 
##     photo      mark    seller      peep    beauti     insur      high 
##       614       615       618       623       632       643       665 
##       buy     excel      sure      need      sued       pay     offer 
##       675       679       681       686       703       722       731 
##    height     style  prioriti   confirm      hour       one       fee 
##       736       737       742       745       767       780       788 
##    refund       tax      mail      like       amp       may      pair 
##       789       806       809       816       833       838       845 
##       end    retail     small  descript  feedback     great       usp 
##       852       861       862       865       870       877       878 
##      cost    detail      busi     check     email   respons       can 
##       887       891       892       892       893       923       928 
##       fit      back     final      know     itali     prior     thank 
##       931       933       938       960       961       962       979 
##      good      inch       see     addit    combin     brand      look 
##       988      1003      1003      1024      1026      1036      1051 
##    pictur    receiv      make       use      sell     price  guarante 
##      1061      1066      1090      1108      1120      1160      1165 
##     insol      note  signatur      must    measur      list     color 
##      1170      1180      1199      1206      1213      1215      1225 
##   address    custom      dust      time    design    includ      wear 
##      1231      1239      1262      1282      1319      1374      1381 
##      free       ask   auction      come    patent   contact    accept 
##      1399      1410      1411      1415      1425      1460      1477 
##     charg      made       bag     black      worn     store    paypal 
##      1515      1520      1577      1582      1603      1640      1651 
##      sale     buyer  platform      ebay       toe    origin   purchas 
##      1652      1666      1717      1736      1749      1809      1879 
##       red    intern  question      pump       100       bid      sole 
##      1882      1932      1967      2015      2151      2244      2315 
##    within       day       new       box    condit   payment    return 
##      2333      2411      2481      3068      3068      3227      3602 
##   leather   authent      shoe christian      size      heel      will 
##      3612      3724      4283      4320      4545      5011      5097 
## louboutin     pleas      item      ship 
##      5686      6428      8218      8353

Column names

names(descriptionText) <- paste0("D", names(descriptionText))

Adding variables from original data

descriptionText$sold <- eBay$sold
descriptionText$biddable <- eBay$biddable
descriptionText$startprice <- eBay$startprice
descriptionText$condition <- eBay$condition
descriptionText$heel <- eBay$heel
descriptionText$style <- eBay$style
descriptionText$color <- eBay$color
descriptionText$material <- eBay$material

Split the data

trainText <- subset(descriptionText, spl == T)
testText <- subset(descriptionText, spl == F)

TRAINING ANOTHER LOGISTIC REGRESSION MODEL

glmText <- glm(sold~., data = trainText, family = binomial)
glmText_pred <- predict(glmText, newdata = testText, type = "response")

Evaluate the model

predROCR <- prediction(predictions = glmText_pred, labels = testText$sold)
as.numeric(performance(predROCR, "auc")@y.values)
## [1] 0.7337875

on the training set

glmText_pred_train <- predict(glmText, type = "response")
predROCR_train <- prediction(predictions = glmText_pred_train, labels = trainText$sold)
as.numeric(performance(predROCR_train, "auc")@y.values)
## [1] 0.8190665