Sellers on online auction websites need to understand the characteristics of a successful item listing to maximize their revenue. Buyers might also be interested in understanding which listings are less attractive so as to score a good deal. In this competition, we challenge you to develop an analytics model that will help buyers and sellers predict the sales success of a set of eBay listings for Apple iPads from spring 2015.
The dependent variable in this problem is the variable sold, which labels if an iPad listed on the eBay site was sold (equal to 1 if it did, and 0 if it did not). The dependent variable is provided in the training data set, but not the testing dataset.
# Loading the libraries
library(tm)        # Text mining: building the bag-of-words features
library(StatMatch) # Provides gower.dist() for clustering mixed-type data
library(caret)     # Modelling utilities, including dummyVars() for one-hot encoding
library(xgboost)   # For fitting the xgboost model
library(ROCR)      # ROC curves and AUC calculations
library(Metrics)   # Evaluation metrics such as auc()
library(ggplot2)   # Plotting
#setwd("E:/Kaggle/MITx")
setwd("C:/Users/rruj/Downloads")
set.seed(144)
#Reading the data
train <- read.csv("eBayiPadTrain.csv")
test <- read.csv("eBayiPadTest.csv")
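Before any cleaning, it is worth a quick look at how balanced the dependent variable is; this is only a sanity check and does not feed into the model.
# Quick sanity check: how many listings sold vs. did not sell?
table(train$sold)
prop.table(table(train$sold))  # proportion sold / unsold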
#Converting the description column to character (read.csv reads strings in as factors by default)
train$description <- as.character(train$description)
test$description <- as.character(test$description)
#Creating an indicator variable for whether the listing has a non-empty description
train$Desc <- ifelse(nchar(train$description) > 0, 1, 0)
test$Desc <- ifelse(nchar(test$description) > 0, 1, 0)
#Removing the rows with 'iPad 5' and 'iPad mini Retina' in the 'productline' column
train <- train[-which(train$productline=="iPad 5"),]
train <- train[-which(train$productline=="iPad mini Retina"),]
#Changing 'Other' in carrier to 'None'
train$carrier[train$carrier == "Other"] <- "None"
test$carrier[test$carrier == "Other"] <- "None"
#Combining Sprint and T-Mobile
train$carrier <- as.character(train$carrier)
test$carrier <- as.character(test$carrier)
train$carrier[train$carrier == "Sprint" | train$carrier == "T-Mobile"] <- "T-Sprint"
test$carrier[test$carrier == "Sprint" | test$carrier == "T-Mobile"] <- "T-Sprint"
train$carrier <- as.factor(train$carrier)
test$carrier <- as.factor(test$carrier)
train <- droplevels(train)
test <- droplevels(test)
train$condition <- as.character(train$condition)
test$condition <- as.character(test$condition)
train$condition[train$condition == "Manufacturer refurbished" | train$condition == "Seller refurbished"] <- "Refurbished"
test$condition[test$condition == "Manufacturer refurbished" | test$condition == "Seller refurbished"] <- "Refurbished"
train$condition <- as.factor(train$condition)
test$condition <- as.factor(test$condition)
Let’s check the price column. The start price is right-skewed: most listings have a low price. Such a skewed distribution can make it harder for the learning algorithms to fit well, so we apply a log transformation.
hist(train$startprice, col = "tomato", xlab = "Start Price", main = "Frequency Distribution of startprice")
train$startprice <- log(train$startprice)
hist(train$startprice, col = "tomato", xlab = "Start Price", main = "Frequency Distribution of startprice", xlim = c(2,8), breaks = 25)
test$startprice <- log(test$startprice)
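One caveat worth noting: a start price of 0 would be mapped to -Inf by log(). A quick check, assuming prices should be positive; log1p() would be a safer alternative if zero-priced listings exist.
# Flag any -Inf values introduced by log() on zero prices
any(is.infinite(train$startprice))
any(is.infinite(test$startprice))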
Let’s work on the description variable. We apply the ‘bag-of-words’ approach to the description column and also create a new variable, negetive, which counts the negative words appearing in each description.
When building the document-term matrix we remove sparse terms with a threshold of 0.99, keeping only the words that appear in at least 1% of the listings.
CorpusDescription = Corpus(VectorSource(c(train$description, test$description)))
CorpusDescription = tm_map(CorpusDescription, tolower)
CorpusDescription = tm_map(CorpusDescription, PlainTextDocument)
CorpusDescription = tm_map(CorpusDescription, removePunctuation)
CorpusDescription = tm_map(CorpusDescription, removeWords, stopwords("english"))
CorpusDescription = tm_map(CorpusDescription, stemDocument)
dtm = DocumentTermMatrix(CorpusDescription)
sparse = removeSparseTerms(dtm, 0.99) # Keep only the terms that appear in at least 1% of the documents
DescriptionWords = as.data.frame(as.matrix(sparse))
colnames(DescriptionWords) = make.names(colnames(DescriptionWords))
# Stemmed terms that indicate damage or other negative attributes
negetiveWords <- c("blemish","crack","damag","dent","scratch","wear","tear","lock")
# Count how many of these terms appear in each description
negetive_count <- apply(
as.matrix(DescriptionWords[,names(DescriptionWords) %in% negetiveWords]),
1, sum)
# Check whether the original description contains the standalone word 'no'
# (any capitalisation), e.g. "no scratches"
no <- grepl(" no ", c(train$description, test$description))
NO <- grepl(" NO ", c(train$description, test$description))
No <- grepl(" No ", c(train$description, test$description))
word_no <- No | NO | no
negetive_count <- negetive_count * (!word_no)  # zero the count when 'no' is present
train$negetive <- negetive_count[1:nrow(train)]
test$negetive <- negetive_count[(nrow(train)+1):(nrow(train) + nrow(test))]
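A quick look at how the negative-word counts are distributed and whether they relate to the outcome; this is a descriptive check only.
# Distribution of negative-word counts in the training data
table(train$negetive)
# Sold rate for listings with vs. without negative words
tapply(train$sold, train$negetive > 0, mean)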
Now that we are done creating new variables and analysing the description column, let’s cluster the data. We use the StatMatch package, whose Gower distance can handle non-numeric variables as well. The number of clusters can be chosen to fit the business need; in this case I will use 4 clusters.
# Clustering the data using the statmatch package
# Drop the text, outcome and ID columns so the distance uses only the shared predictors
clustdata <- rbind(train[,-c(1,10:11)], test[,-c(1,10)])
distance <- as.dist(gower.dist(clustdata))  # Gower distance handles mixed variable types
clusteritems <- hclust(distance, method="ward.D")
plot(clusteritems)
rect.hclust(clusteritems, 4)
## we cut the dendrogram into 4 groups
clusterGroups <- cutree(clusteritems, k=4)
trainG <- clusterGroups[1:1852]      # rows 1:nrow(train) are the training listings
testG <- clusterGroups[1853:2650]    # the remaining rows are the test listings
train1 <- train[trainG ==1,]
train2 <- train[trainG ==2,]
train3 <- train[trainG ==3,]
train4 <- train[trainG ==4,]
test1 <- test[testG ==1,]
test2 <- test[testG ==2,]
test3 <- test[testG ==3,]
test4 <- test[testG ==4,]
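Before fitting the models, a short sketch to inspect the cluster sizes and how the sold rate varies across clusters.
# Cluster sizes in the training and test sets
table(trainG)
table(testG)
# Proportion of sold listings within each training cluster
tapply(train$sold, trainG, mean)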
Let’s start by fitting a logistic regression model to each of the clusters we have created.
# Model-1
Mod1 <- glm(sold ~ ., data = train1[,-c(1,11)], family = binomial)
summary(Mod1) # AIC: 358.11
##
## Call:
## glm(formula = sold ~ ., family = binomial, data = train1[, -c(1,
## 11)])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.8579 -0.6333 -0.2955 0.3901 2.9605
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 14.0470 2.9235 4.805 1.55e-06 ***
## biddable 1.0263 0.3474 2.955 0.003131 **
## startprice -3.3620 0.6037 -5.569 2.56e-08 ***
## conditionNew other (see details) 1.6187 0.8842 1.831 0.067155 .
## conditionRefurbished 1.1497 0.7853 1.464 0.143186
## conditionUsed 1.7055 0.6298 2.708 0.006764 **
## cellularUnknown -13.6136 882.7445 -0.015 0.987696
## carrierUnknown NA NA NA NA
## colorGold 0.1551 1.0341 0.150 0.880802
## colorSpace Gray -0.2966 0.7309 -0.406 0.684876
## colorUnknown -0.3065 0.3935 -0.779 0.435992
## colorWhite -0.2420 0.4042 -0.599 0.549263
## storage16 -0.6486 0.8966 -0.723 0.469414
## storage32 -1.4312 1.0162 -1.408 0.159023
## storage64 0.2721 0.9053 0.301 0.763785
## storageUnknown -1.0308 1.4675 -0.702 0.482415
## productlineiPad 2 1.6320 0.6329 2.579 0.009923 **
## productlineiPad 3 2.5381 0.7856 3.231 0.001235 **
## productlineiPad 4 3.1863 0.8844 3.603 0.000315 ***
## productlineiPad Air 3.7456 0.9755 3.840 0.000123 ***
## productlineiPad Air 2 5.0313 1.1746 4.283 1.84e-05 ***
## productlineiPad mini 2.0542 0.7161 2.869 0.004120 **
## productlineiPad mini 2 3.0251 0.9798 3.087 0.002019 **
## productlineiPad mini 3 4.2137 1.7775 2.371 0.017763 *
## productlineUnknown 0.8344 1.0601 0.787 0.431227
## Desc -0.8252 0.9713 -0.850 0.395603
## negetive 0.1579 0.2070 0.763 0.445725
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 543.91 on 393 degrees of freedom
## Residual deviance: 306.11 on 368 degrees of freedom
## AIC: 358.11
##
## Number of Fisher Scoring iterations: 13
# We select variables by AIC: the step function searches for the subset of predictors that minimises the AIC
step(Mod1) # AIC: 346.6
## Start: AIC=358.11
## sold ~ biddable + startprice + condition + cellular + carrier +
## color + storage + productline + Desc + negetive
##
##
## Step: AIC=358.11
## sold ~ biddable + startprice + condition + cellular + color +
## storage + productline + Desc + negetive
##
## Df Deviance AIC
## - color 4 306.95 350.95
## - cellular 1 306.65 356.65
## - negetive 1 306.69 356.69
## - Desc 1 306.83 356.83
## - storage 4 313.85 357.85
## <none> 306.11 358.11
## - condition 3 315.05 361.05
## - biddable 1 314.81 364.81
## - productline 9 332.57 366.57
## - startprice 1 389.96 439.96
##
## Step: AIC=350.95
## sold ~ biddable + startprice + condition + cellular + storage +
## productline + Desc + negetive
##
## Df Deviance AIC
## - cellular 1 307.39 349.39
## - negetive 1 307.59 349.59
## - Desc 1 307.62 349.62
## - storage 4 314.93 350.93
## <none> 306.95 350.95
## - condition 3 316.70 354.70
## - biddable 1 315.29 357.29
## - productline 9 335.27 361.27
## - startprice 1 393.45 435.45
##
## Step: AIC=349.39
## sold ~ biddable + startprice + condition + storage + productline +
## Desc + negetive
##
## Df Deviance AIC
## - negetive 1 308.03 348.03
## - Desc 1 308.13 348.13
## <none> 307.39 349.39
## - storage 4 315.63 349.63
## - condition 3 317.32 353.32
## - biddable 1 315.61 355.61
## - productline 9 336.13 360.13
## - startprice 1 395.48 435.48
##
## Step: AIC=348.03
## sold ~ biddable + startprice + condition + storage + productline +
## Desc
##
## Df Deviance AIC
## - Desc 1 308.65 346.65
## <none> 308.03 348.03
## - storage 4 316.49 348.49
## - condition 3 317.86 351.86
## - biddable 1 315.90 353.90
## - productline 9 337.87 359.87
## - startprice 1 398.39 436.39
##
## Step: AIC=346.65
## sold ~ biddable + startprice + condition + storage + productline
##
## Df Deviance AIC
## <none> 308.65 346.65
## - storage 4 316.80 346.80
## - condition 3 318.28 350.28
## - biddable 1 316.50 352.50
## - productline 9 338.80 358.80
## - startprice 1 400.35 436.35
##
## Call: glm(formula = sold ~ biddable + startprice + condition + storage +
## productline, family = binomial, data = train1[, -c(1, 11)])
##
## Coefficients:
## (Intercept) biddable
## 13.6631 0.9550
## startprice conditionNew other (see details)
## -3.5019 1.6369
## conditionRefurbished conditionUsed
## 1.4225 1.8178
## storage16 storage32
## -0.6818 -1.4028
## storage64 storageUnknown
## 0.2493 -0.9083
## productlineiPad 2 productlineiPad 3
## 1.7587 2.6579
## productlineiPad 4 productlineiPad Air
## 3.3657 3.8487
## productlineiPad Air 2 productlineiPad mini
## 5.2621 2.1456
## productlineiPad mini 2 productlineiPad mini 3
## 3.1238 4.4439
## productlineUnknown
## 0.8791
##
## Degrees of Freedom: 393 Total (i.e. Null); 375 Residual
## Null Deviance: 543.9
## Residual Deviance: 308.6 AIC: 346.6
M1 <- glm(formula = sold ~ biddable + startprice + condition + storage +
productline, family = binomial, data = train1[, -c(1, 11)])
Similarly, we fit logistic regression models on the remaining three clusters.
# Model -2
Mod2 <- glm(sold ~ ., data = train2[,-c(1,11)], family = binomial)
summary(Mod2) # AIC: 469.26
step(Mod2) # AIC: 456.1
M2 <- glm(formula = sold ~ biddable + startprice + storage + productline +
Desc, family = binomial, data = train2[, -c(1, 11)])
# Model -3
Mod3 <- glm(sold ~ ., data = train3[,-c(1,5,6,11)], family = binomial)
summary(Mod3) # AIC: 693.3
step(Mod3) # AIC: 680.6
M3 <- glm(formula = sold ~ biddable + startprice + condition + productline,
family = binomial, data = train3[, -c(1, 5, 6, 11)])
# Model - 4
Mod4 <- glm(sold ~ ., data = train4[,-c(1,5,6,11)], family = binomial)
summary(Mod4) # AIC: 263.22
step(Mod4) # AIC: 237.5
M4 <- glm(formula = sold ~ biddable + startprice, family = binomial,
data = train4[, -c(1, 5, 6, 11)])
Computing the AUC on the training set.
#AUC on training set
p1 <- predict(M1, type = "response")
p2 <- predict(M2, type = "response")
p3 <- predict(M3, type = "response")
p4 <- predict(M4, type = "response")
sold <- c(train1$sold, train2$sold, train3$sold, train4$sold)
ROCR <- prediction(c(p1,p2,p3,p4), sold)
as.numeric(performance(ROCR,"auc")@y.values) # 0.8805123
## [1] 0.8805123
ROCRplot <- performance(ROCR, "tpr", "fpr")
plot(ROCRplot, colorize = T)
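It can also be useful to look at the training AUC of each cluster model separately; a minimal sketch reusing the same ROCR calls (the helper function is just for illustration).
# Per-cluster training AUC, using the same ROCR machinery as above
cluster_auc <- function(pred, actual) {
  as.numeric(performance(prediction(pred, actual), "auc")@y.values)
}
c(cluster1 = cluster_auc(p1, train1$sold),
  cluster2 = cluster_auc(p2, train2$sold),
  cluster3 = cluster_auc(p3, train3$sold),
  cluster4 = cluster_auc(p4, train4$sold))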
#Making predictions on the test set
predict1 <- predict(M1, newdata = test1[,-c(1,10)], type = "response")
predict2 <- predict(M2, newdata = test2[,-c(1,10)], type = "response")
predict3 <- predict(M3, newdata = test3[, -c(1, 5, 6, 10)], type = "response")
predict4 <- predict(M4, newdata = test4[, -c(1, 5, 6, 10)], type = "response")
ID <- c(test1$UniqueID, test2$UniqueID, test3$UniqueID, test4$UniqueID)
output_GLM <- data.frame(UniqueID = ID, Probability_LM = c(predict1, predict2, predict3, predict4))
Now let’s build a second model using XGBoost.
set.seed(144)
train <- train[,-c(1,11)]
test <- test[,-c(1,10)]
train <- train[,-c(5:6)]
test <- test[,-c(5:6)]
dmy <- dummyVars(" ~ .", data = train)
trainTrsf <- data.frame(predict(dmy, newdata = train))
dmytest <- dummyVars(" ~ .", data = test)
testTrsf <- data.frame(predict(dmytest, newdata = test))
set.seed(123)
res.cv <- xgb.cv(data = as.matrix(trainTrsf[,-26]),
label = as.matrix(trainTrsf[,26]),
nfold = 3,
nrounds = 1000,
verbose = TRUE,
objective = "binary:logistic",
eval_metric = "auc",
eta = 0.1,
subsample = 0.9,
max.depth = 2)
Let’s check the res.cv output, which contains the results of each xgboost iteration. Columns 1 and 2 are the statistics for the training folds; columns 3 and 4 are the statistics for the held-out folds of the cross-validation. We will select the number of rounds that maximises the mean test AUC minus one standard deviation, a conservative lower estimate of the cross-validated AUC.
## train.auc.mean train.auc.std test.auc.mean test.auc.std
## 1: 0.829263 0.008611 0.821688 0.004125
## 2: 0.830874 0.009257 0.822635 0.004941
## 3: 0.833422 0.004876 0.825316 0.009580
## 4: 0.833982 0.005194 0.826568 0.008387
## 5: 0.834204 0.005376 0.826959 0.008025
## 6: 0.838384 0.005574 0.831394 0.008596
We now train the model using the optimal number of rounds.
with(res.cv, max(test.auc.mean - test.auc.std))
round <- with(res.cv, which.max(test.auc.mean - test.auc.std))
round #532 rounds
set.seed(123)
bst <- xgboost(data = as.matrix(trainTrsf[,-26]),
label = trainTrsf[,26],
nround=round, eta = 0.1, subsample = 0.9,
objective = "binary:logistic", verbose=TRUE, max.depth = 2)
importance_Matrix <- xgb.importance(names(trainTrsf), model=bst)
xgb.plot.importance(importance_Matrix)
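For a quick textual view of the importance matrix, which should list the most influential features first:
head(importance_Matrix)  # most important features at the top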
predictions_xgboost <- predict(bst, as.matrix(testTrsf))
test1 <- read.csv("eBayiPadTest.csv") # re-read the raw test file to recover UniqueID (this overwrites the cluster-1 subset)
output_XGBoost <- data.frame(UniqueID = test1$UniqueID, Probability_XG = predictions_xgboost)
output <- merge(output_GLM, output_XGBoost, by.x = "UniqueID", by.y = "UniqueID")
output$Probability1 <- with(output, (Probability_LM + Probability_XG)/2)
output <- output[,-c(2,3)]
write.csv(output, "Stacked_LM_XGBoost.csv", row.names = FALSE)
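The submission blends the two models with a simple average of their predicted probabilities. A weighted average is one natural variation to try; the weight below is illustrative only, not tuned on a validation set.
# Illustrative weighted blend (the 0.6 weight is an assumption, not validated)
w <- 0.6
blend <- merge(output_GLM, output_XGBoost, by = "UniqueID")
blend$Probability1 <- w * blend$Probability_LM + (1 - w) * blend$Probability_XG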