Synopsis

Sellers on online auction websites need to understand the characteristics of a successful item listing to maximize their revenue. Buyers might also be interested in understanding which listings are less attractive so as to score a good deal. In this competition, we challenge you to develop an analytics model that will help buyers and sellers predict the sales success of a set of eBay listings for Apple iPads from spring 2015.

Data fields

The dependent variable in this problem is sold, which indicates whether an iPad listed on eBay was sold (1 if it was, 0 if it was not). It is provided in the training set but not in the test set. The independent variables are:

  • description = The text description of the product provided by the seller.
  • biddable = Whether this is an auction (biddable=1) or a sale with a fixed price (biddable=0).
  • startprice = The start price (in US Dollars) for the auction (if biddable=1) or the sale price (if biddable=0).
  • condition = The condition of the product (new, used, etc.)
  • cellular = Whether the iPad has cellular connectivity (cellular=1) or not (cellular=0).
  • carrier = The cellular carrier for which the iPad is equipped (if cellular=1); listed as “None” if cellular=0.
  • color = The color of the iPad.
  • storage = The iPad’s storage capacity (in gigabytes).
  • productline = The name of the product being sold.

We start by loading the data. Once it is loaded, we apply the cleaning procedures dictated by the business rules.

# Loading the libraries

library(tm)             # Text analytics: building the bag-of-words features
library(StatMatch)      # Gower distance for clustering mixed-type data
library(caret)          # Predictive-modelling utilities (e.g. dummyVars)
library(xgboost)        # The xgboost model
library(ROCR)           # ROC curves and AUC computation
library(Metrics)        # Evaluation metrics
library(ggplot2)        # Plotting


#setwd("E:/Kaggle/MITx")
setwd("C:/Users/rruj/Downloads")
set.seed(144)

#Reading the data
train <- read.csv("eBayiPadTrain.csv")
test <- read.csv("eBayiPadTest.csv")

#Converting the description column to character as by default columns are read as factors
train$description <- as.character(train$description)
test$description <- as.character(test$description)

#Creating a new variable that flags whether the listing has a non-empty description
train$Desc <- ifelse(nchar(train$description) > 0, 1, 0)
test$Desc <- ifelse(nchar(test$description) > 0, 1, 0)

#Removing the rows with productline "iPad 5" or "iPad mini Retina"
train <- train[-which(train$productline=="iPad 5"),]
train <- train[-which(train$productline=="iPad mini Retina"),]

#Changing 'Other' in carrier to 'None'
train$carrier[train$carrier == "Other"] <- "None"
test$carrier[test$carrier == "Other"] <- "None"

#Combining Sprint and T-Mobile
train$carrier <- as.character(train$carrier)
test$carrier <- as.character(test$carrier)
train$carrier[train$carrier == "Sprint" | train$carrier == "T-Mobile"] <- "T-Sprint"
test$carrier[test$carrier == "Sprint" | test$carrier == "T-Mobile"] <- "T-Sprint"
train$carrier <- as.factor(train$carrier)
test$carrier <- as.factor(test$carrier)
train <- droplevels(train)
test <- droplevels(test)

train$condition <- as.character(train$condition)
test$condition <- as.character(test$condition)
train$condition[train$condition == "Manufacturer refurbished" | train$condition == "Seller refurbished"] <- "Refurbished"
test$condition[test$condition == "Manufacturer refurbished" | test$condition == "Seller refurbished"] <- "Refurbished"
train$condition <- as.factor(train$condition)
test$condition <- as.factor(test$condition)
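
As a quick sanity check (a suggested sketch, not part of the original script), we can confirm the recoded factor levels and look at the class balance of the dependent variable:

# Sanity checks on the cleaned training data
table(train$sold)       # class balance of the dependent variable
table(train$carrier)    # Sprint and T-Mobile merged into "T-Sprint", "Other" mapped to "None"
table(train$condition)  # the two refurbished categories collapsed into "Refurbished"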

Let’s check the price column. We observe that the start price is right-skewed, i.e., most observations have a low price. Such a skewed distribution can make it harder for the learning algorithm to predict well, so we apply a log transformation.

hist(train$startprice, col = "tomato", xlab = "Start Price", main = "Frequency Distribution of startprice")

train$startprice <- log(train$startprice)

hist(train$startprice, col = "tomato", xlab = "Start Price", main = "Frequency Distribution of startprice", xlim = c(2,8), breaks = 25)

test$startprice <- log(test$startprice)
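
One caveat worth noting (not handled in the original code): log() maps a start price of 0 to -Inf, which would break the models downstream. A quick defensive check, assuming the transformation above has already been applied:

# Count listings whose log start price became -Inf (i.e. the original price was 0)
sum(is.infinite(train$startprice))
sum(is.infinite(test$startprice))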

Let’s now work on the description variable. We apply a bag-of-words approach to the description column, and then create a new variable called negetive, which counts the negative words (word stems such as “crack”, “damag”, “scratch”) appearing in the description.

After building the corpus and the document-term matrix, we remove sparse terms with a threshold of 0.99, which keeps only the terms that appear in at least 1% of the descriptions; with roughly 2,650 listings, that means words occurring in about 27 or more of them.

CorpusDescription = Corpus(VectorSource(c(train$description, test$description)))
CorpusDescription = tm_map(CorpusDescription, tolower)
CorpusDescription = tm_map(CorpusDescription, PlainTextDocument)
CorpusDescription = tm_map(CorpusDescription, removePunctuation)
CorpusDescription = tm_map(CorpusDescription, removeWords, stopwords("english"))
CorpusDescription = tm_map(CorpusDescription, stemDocument)
dtm = DocumentTermMatrix(CorpusDescription)

sparse = removeSparseTerms(dtm, 0.99)  # Keep only the terms that appear in at least 1% of the documents

DescriptionWords = as.data.frame(as.matrix(sparse))
colnames(DescriptionWords) = make.names(colnames(DescriptionWords))


negetiveWords <- c("blemish","crack","damag","dent","scratch","wear","tear","lock")

negetive_count <- apply(
        as.matrix(DescriptionWords[,names(DescriptionWords) %in% negetiveWords]),
        1, sum)


# Flag descriptions containing the standalone word 'no' (e.g. "no scratches");
# such listings have their negative-word count zeroed out below
no <- grepl(" no ", c(train$description, test$description))
NO <- grepl(" NO ", c(train$description, test$description))
No <- grepl(" No ", c(train$description, test$description))

word_no <- No | NO | no

negetive_count <- negetive_count * (!word_no)

train$negetive <- negetive_count[1:nrow(train)]
test$negetive <- negetive_count[(nrow(train)+1):(nrow(train) + nrow(test))]
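
As a quick check on the engineered feature (a suggested sketch, not part of the original pipeline), we can look at the distribution of negative-word counts and inspect a listing that was flagged:

# Distribution of negative-word counts in the training set
table(train$negetive)
# Example: the first description flagged with two or more negative words
head(train$description[train$negetive >= 2], 1)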

Now that we are done creating new variables and analyzing the description column, let’s cluster the data. We use the gower.dist function from the StatMatch package, which handles non-numeric variables, and then apply hierarchical clustering. The number of clusters can be chosen to suit the business need; in this case I choose 4 clusters.

# Clustering the data using Gower distance from the StatMatch package
clustdata <- rbind(train[,-c(1,10:11)], test[,-c(1,10)])
distance <- as.dist(gower.dist(clustdata))
clusteritems <- hclust(distance, method="ward.D")
plot(clusteritems)
rect.hclust(clusteritems, 4)

## Cut the tree into 4 groups
clusterGroups <- cutree(clusteritems, k=4)
trainG <- clusterGroups[1:nrow(train)]                            # first 1852 entries correspond to train
testG <- clusterGroups[(nrow(train)+1):(nrow(train)+nrow(test))]  # remaining 798 correspond to test
train1 <- train[trainG ==1,]
train2 <- train[trainG ==2,]
train3 <- train[trainG ==3,]
train4 <- train[trainG ==4,]
test1 <- test[testG ==1,]
test2 <- test[testG ==2,]
test3 <- test[testG ==3,]
test4 <- test[testG ==4,]
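
Before fitting any models, it is worth checking the cluster sizes and how the sold rate varies across the training clusters (a suggested check, not in the original write-up); clusters with noticeably different sold rates are a good sign that cluster-specific models can help:

# Cluster sizes in the train and test sets
table(trainG)
table(testG)
# Proportion of sold listings within each training cluster
tapply(train$sold, trainG, mean)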

Let’s start by fitting a logistic regression model on each of the clusters we have created.

# Model-1
Mod1 <- glm(sold ~ ., data = train1[,-c(1,11)], family = binomial)
summary(Mod1)  # AIC: 358.11
## 
## Call:
## glm(formula = sold ~ ., family = binomial, data = train1[, -c(1, 
##     11)])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.8579  -0.6333  -0.2955   0.3901   2.9605  
## 
## Coefficients: (1 not defined because of singularities)
##                                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                       14.0470     2.9235   4.805 1.55e-06 ***
## biddable                           1.0263     0.3474   2.955 0.003131 ** 
## startprice                        -3.3620     0.6037  -5.569 2.56e-08 ***
## conditionNew other (see details)   1.6187     0.8842   1.831 0.067155 .  
## conditionRefurbished               1.1497     0.7853   1.464 0.143186    
## conditionUsed                      1.7055     0.6298   2.708 0.006764 ** 
## cellularUnknown                  -13.6136   882.7445  -0.015 0.987696    
## carrierUnknown                         NA         NA      NA       NA    
## colorGold                          0.1551     1.0341   0.150 0.880802    
## colorSpace Gray                   -0.2966     0.7309  -0.406 0.684876    
## colorUnknown                      -0.3065     0.3935  -0.779 0.435992    
## colorWhite                        -0.2420     0.4042  -0.599 0.549263    
## storage16                         -0.6486     0.8966  -0.723 0.469414    
## storage32                         -1.4312     1.0162  -1.408 0.159023    
## storage64                          0.2721     0.9053   0.301 0.763785    
## storageUnknown                    -1.0308     1.4675  -0.702 0.482415    
## productlineiPad 2                  1.6320     0.6329   2.579 0.009923 ** 
## productlineiPad 3                  2.5381     0.7856   3.231 0.001235 ** 
## productlineiPad 4                  3.1863     0.8844   3.603 0.000315 ***
## productlineiPad Air                3.7456     0.9755   3.840 0.000123 ***
## productlineiPad Air 2              5.0313     1.1746   4.283 1.84e-05 ***
## productlineiPad mini               2.0542     0.7161   2.869 0.004120 ** 
## productlineiPad mini 2             3.0251     0.9798   3.087 0.002019 ** 
## productlineiPad mini 3             4.2137     1.7775   2.371 0.017763 *  
## productlineUnknown                 0.8344     1.0601   0.787 0.431227    
## Desc                              -0.8252     0.9713  -0.850 0.395603    
## negetive                           0.1579     0.2070   0.763 0.445725    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 543.91  on 393  degrees of freedom
## Residual deviance: 306.11  on 368  degrees of freedom
## AIC: 358.11
## 
## Number of Fisher Scoring iterations: 13
# We will select the variables using the AIC score. The step function determines the optimum number of variables to use for the model

step(Mod1) # AIC: 346.6
## Start:  AIC=358.11
## sold ~ biddable + startprice + condition + cellular + carrier + 
##     color + storage + productline + Desc + negetive
## 
## 
## Step:  AIC=358.11
## sold ~ biddable + startprice + condition + cellular + color + 
##     storage + productline + Desc + negetive
## 
##               Df Deviance    AIC
## - color        4   306.95 350.95
## - cellular     1   306.65 356.65
## - negetive     1   306.69 356.69
## - Desc         1   306.83 356.83
## - storage      4   313.85 357.85
## <none>             306.11 358.11
## - condition    3   315.05 361.05
## - biddable     1   314.81 364.81
## - productline  9   332.57 366.57
## - startprice   1   389.96 439.96
## 
## Step:  AIC=350.95
## sold ~ biddable + startprice + condition + cellular + storage + 
##     productline + Desc + negetive
## 
##               Df Deviance    AIC
## - cellular     1   307.39 349.39
## - negetive     1   307.59 349.59
## - Desc         1   307.62 349.62
## - storage      4   314.93 350.93
## <none>             306.95 350.95
## - condition    3   316.70 354.70
## - biddable     1   315.29 357.29
## - productline  9   335.27 361.27
## - startprice   1   393.45 435.45
## 
## Step:  AIC=349.39
## sold ~ biddable + startprice + condition + storage + productline + 
##     Desc + negetive
## 
##               Df Deviance    AIC
## - negetive     1   308.03 348.03
## - Desc         1   308.13 348.13
## <none>             307.39 349.39
## - storage      4   315.63 349.63
## - condition    3   317.32 353.32
## - biddable     1   315.61 355.61
## - productline  9   336.13 360.13
## - startprice   1   395.48 435.48
## 
## Step:  AIC=348.03
## sold ~ biddable + startprice + condition + storage + productline + 
##     Desc
## 
##               Df Deviance    AIC
## - Desc         1   308.65 346.65
## <none>             308.03 348.03
## - storage      4   316.49 348.49
## - condition    3   317.86 351.86
## - biddable     1   315.90 353.90
## - productline  9   337.87 359.87
## - startprice   1   398.39 436.39
## 
## Step:  AIC=346.65
## sold ~ biddable + startprice + condition + storage + productline
## 
##               Df Deviance    AIC
## <none>             308.65 346.65
## - storage      4   316.80 346.80
## - condition    3   318.28 350.28
## - biddable     1   316.50 352.50
## - productline  9   338.80 358.80
## - startprice   1   400.35 436.35
## 
## Call:  glm(formula = sold ~ biddable + startprice + condition + storage + 
##     productline, family = binomial, data = train1[, -c(1, 11)])
## 
## Coefficients:
##                      (Intercept)                          biddable  
##                          13.6631                            0.9550  
##                       startprice  conditionNew other (see details)  
##                          -3.5019                            1.6369  
##             conditionRefurbished                     conditionUsed  
##                           1.4225                            1.8178  
##                        storage16                         storage32  
##                          -0.6818                           -1.4028  
##                        storage64                    storageUnknown  
##                           0.2493                           -0.9083  
##                productlineiPad 2                 productlineiPad 3  
##                           1.7587                            2.6579  
##                productlineiPad 4               productlineiPad Air  
##                           3.3657                            3.8487  
##            productlineiPad Air 2              productlineiPad mini  
##                           5.2621                            2.1456  
##           productlineiPad mini 2            productlineiPad mini 3  
##                           3.1238                            4.4439  
##               productlineUnknown  
##                           0.8791  
## 
## Degrees of Freedom: 393 Total (i.e. Null);  375 Residual
## Null Deviance:       543.9 
## Residual Deviance: 308.6     AIC: 346.6
M1 <- glm(formula = sold ~ biddable + startprice + condition + storage + 
                  productline, family = binomial, data = train1[, -c(1, 11)])

Similarly, we fit logistic regression models on the remaining three clusters.

# Model -2
Mod2 <- glm(sold ~ ., data = train2[,-c(1,11)], family = binomial)
summary(Mod2) # AIC: 469.26
step(Mod2) # AIC: 456.1
M2 <- glm(formula = sold ~ biddable + startprice + storage + productline + 
                  Desc, family = binomial, data = train2[, -c(1, 11)])

# Model -3
Mod3 <- glm(sold ~ ., data = train3[,-c(1,5,6,11)], family = binomial)
summary(Mod3) # AIC: 693.3
step(Mod3) # AIC: 680.6
M3 <- glm(formula = sold ~ biddable + startprice + condition + productline, 
          family = binomial, data = train3[, -c(1, 5, 6, 11)])

# Model - 4
Mod4 <- glm(sold ~ ., data = train4[,-c(1,5,6,11)], family = binomial)
summary(Mod4) # AIC: 263.22
step(Mod4) # AIC: 237.5
M4 <- glm(formula = sold ~ biddable + startprice, family = binomial, 
          data = train4[, -c(1, 5, 6, 11)])

Evaluating the AUC on the training set

#AUC on training set
p1 <- predict(M1, type = "response")
p2 <- predict(M2, type = "response")
p3 <- predict(M3, type = "response")
p4 <- predict(M4, type = "response")

sold <- c(train1$sold, train2$sold, train3$sold, train4$sold)

ROCR <- prediction(c(p1,p2,p3,p4), sold)
as.numeric(performance(ROCR,"auc")@y.values) # 0.8805123
## [1] 0.8805123
ROCRplot <- performance(ROCR, "tpr", "fpr")
plot(ROCRplot, colorize = T)
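
# (Optional check, not in the original script) training AUC within each cluster,
# using the same ROCR calls as above, to see which cluster-specific model is weakest
cluster_auc <- function(p, s) as.numeric(performance(prediction(p, s), "auc")@y.values)
cluster_auc(p1, train1$sold)
cluster_auc(p2, train2$sold)
cluster_auc(p3, train3$sold)
cluster_auc(p4, train4$sold)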

#Making predictions on the test set
predict1 <- predict(M1, newdata = test1[,-c(1,10)], type = "response")
predict2 <- predict(M2, newdata = test2[,-c(1,10)], type = "response")
predict3 <- predict(M3, newdata = test3[, -c(1, 5, 6, 10)], type = "response")
predict4 <- predict(M4, newdata = test4[, -c(1, 5, 6, 10)], type = "response")

ID <- c(test1$UniqueID, test2$UniqueID, test3$UniqueID, test4$UniqueID)
output_GLM <- data.frame(UniqueID = ID, Probability_LM = c(predict1, predict2, predict3, predict4))

Now let’s build a second model using XGBoost.

set.seed(144)

train <- train[,-c(1,11)]
test <- test[,-c(1,10)]

train <- train[,-c(5:6)]
test <- test[,-c(5:6)]

dmy <- dummyVars(" ~ .", data = train)
trainTrsf <- data.frame(predict(dmy, newdata = train))
dmytest <- dummyVars(" ~ .", data = test)
testTrsf <- data.frame(predict(dmytest, newdata = test))
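
# (Suggested check, not in the original script) dummyVars is fit separately on train
# and test, so the encoded columns should be verified to line up; ideally the only
# difference is the label column "sold", and any other mismatch would indicate
# factor levels present in only one of the two sets.
setdiff(names(trainTrsf), names(testTrsf))
setdiff(names(testTrsf), names(trainTrsf))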


set.seed(123)
res.cv <- xgb.cv(data = as.matrix(trainTrsf[,-26]),
                 label = as.matrix(trainTrsf[,26]),
                 nfold = 3,
                 nrounds = 1000,
                 verbose = TRUE,
                 objective = "binary:logistic",
                 eval_metric = "auc",
                 eta = 0.1,
                 subsample = 0.9,
                 max.depth = 2)

Let’s check the res.cv output, which contains the results of each boosting iteration. Columns 1 and 2 are the mean and standard deviation of the training-set AUC across folds; columns 3 and 4 are the same statistics for the held-out (test) folds. We will select the number of rounds that maximizes test.auc.mean minus test.auc.std, a conservative one-standard-deviation estimate of the cross-validated AUC.

##    train.auc.mean train.auc.std test.auc.mean test.auc.std
## 1:       0.829263      0.008611      0.821688     0.004125
## 2:       0.830874      0.009257      0.822635     0.004941
## 3:       0.833422      0.004876      0.825316     0.009580
## 4:       0.833982      0.005194      0.826568     0.008387
## 5:       0.834204      0.005376      0.826959     0.008025
## 6:       0.838384      0.005574      0.831394     0.008596

We will now train the model using the optimal number of rounds.

with(res.cv, max(test.auc.mean - test.auc.std))
round <- with(res.cv, which.max(test.auc.mean - test.auc.std))
round #532 rounds

set.seed(123)

bst <- xgboost(data = as.matrix(trainTrsf[,-26]),
               label = trainTrsf[,26],
               nround=round, eta = 0.1, subsample = 0.9,
               objective = "binary:logistic", verbose=TRUE, max.depth = 2)
importance_Matrix <- xgb.importance(feature_names = colnames(trainTrsf)[-26], model = bst)  # exclude the label column so the names match the training matrix
xgb.plot.importance(importance_Matrix)

predictions_xgboost <- predict(bst, as.matrix(testTrsf))
test1 <- read.csv("eBayiPadTest.csv")
output_XGBoost <- data.frame(UniqueID = test1$UniqueID, Probability_XG = predictions_xgboost)
output <- merge(output_GLM, output_XGBoost, by.x = "UniqueID", by.y = "UniqueID")
output$Probability1 <- with(output, (Probability_LM + Probability_XG)/2)
output <- output[,-c(2,3)]
write.csv(output, "Stacked_LM_XGBoost.csv", row.names = FALSE)
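
The final submission is an unweighted 50/50 blend of the logistic regression and XGBoost probabilities. A quick look at the resulting data frame (a suggested sketch) confirms it has one probability per UniqueID:

head(output)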

This model gave me a rank of 12th out of 1884 participants.