Home Depot Product Search Relevance is a Kaggle competition that aims to improve Home Depot customers’ shopping experience by developing a model that can accurately predict the relevance of search results.
Search relevancy is an implicit measure Home Depot uses to gauge how quickly they can get customers to the right products. Currently, human raters evaluate the impact of potential changes to their search algorithms, which is a slow and subjective process. The aim is to remove or minimize human input in search relevance evaluation: the task is to predict the relevance for each search-term/product pair listed in the test set, given that the test set contains both seen and unseen search terms.
This data set contains a number of products and real customer search terms from Home Depot’s website. The challenge is to predict a relevance score for the provided combinations of search terms and products.
The relevance is a number between 1 (not relevant) and 3 (highly relevant). For example, a search for “AA battery” would be considered highly relevant to a pack of size AA batteries (relevance = 3), mildly relevant to a cordless drill battery (relevance = 2), and not relevant to a snow shovel (relevance = 1). Let us explore the data together.

## Main product data
The main product data includes the following files:
train.csv –> the training set, containing products, search terms, and relevance scores.
test.csv –> the test set, whose pairs we will score and submit to the Kaggle competition.
product_descriptions.csv –> a text description of each product.
attributes.csv –> extended name/value attributes for a subset of the products.
library(dplyr)
library(tidyr)
library(ggplot2)
library(gridExtra)
# Read the training data first
products_training <- tbl_df(read.csv("data/train.csv", stringsAsFactors = FALSE))
# Have a quick glimpse at it
glimpse(products_training)
## Observations: 74,067
## Variables: 5
## $ id (int) 2, 3, 9, 16, 17, 18, 20, 21, 23, 27, 34, 35, 37,...
## $ product_uid (int) 100001, 100001, 100002, 100005, 100005, 100006, ...
## $ product_title (chr) "Simpson Strong-Tie 12-Gauge Angle", "Simpson St...
## $ search_term (chr) "angle bracket", "l bracket", "deck over", "rain...
## $ relevance (dbl) 3.00, 2.50, 3.00, 2.33, 2.67, 3.00, 2.67, 3.00, ...
# Count the distribution of relevance scores
relevance_dist <- products_training %>% count(as.factor(relevance)) %>% arrange(desc(n))
relevance_dist
## Source: local data frame [13 x 2]
##
## as.factor(relevance) n
## (fctr) (int)
## 1 3 19125
## 2 2.33 16060
## 3 2.67 15202
## 4 2 11730
## 5 1.67 6780
## 6 1.33 3006
## 7 1 2105
## 8 2.5 19
## 9 2.25 11
## 10 2.75 11
## 11 1.75 9
## 12 1.5 5
## 13 1.25 4
# Let us explore the distribution
ggplot(products_training, aes(x = relevance)) +
geom_histogram(color = "black", fill = "DarkOrange") +
scale_x_continuous(breaks = seq(0, 3, 0.2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Density function
ggplot(products_training, aes(x = relevance)) +
geom_density(color = "black") +
scale_x_continuous(breaks = seq(0, 3, 0.2))
It looks like a relevance score of 3 is by far the most common, while scores like 1.25 are very rare.
# Now we can read test data
products_test <- tbl_df(read.csv("data/test.csv", stringsAsFactors = FALSE))
# Combine the training and test data; joining by: c("id", "product_uid", "product_title", "search_term")
suppressMessages(product_training_test <- full_join(products_training, products_test))
# Read product description
product_description <- tbl_df(read.csv("data/product_descriptions.csv",stringsAsFactors = FALSE))
glimpse(product_description)
## Observations: 124,428
## Variables: 2
## $ product_uid (int) 100001, 100002, 100003, 100004, 100005, 10...
## $ product_description (chr) "Not only do angles make joints stronger, ...
# Combine the product descriptions with the data combined in the previous step; joining by: c("product_uid")
suppressMessages(product_all <- right_join(product_training_test, product_description))
# The data after merging the training, test, and description datasets
glimpse(product_all)
## Observations: 240,760
## Variables: 6
## $ id (int) 2, 3, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...
## $ product_uid (int) 100001, 100001, 100001, 100001, 100001, 10...
## $ product_title (chr) "Simpson Strong-Tie 12-Gauge Angle", "Simp...
## $ search_term (chr) "angle bracket", "l bracket", "90 degree b...
## $ relevance (dbl) 3.00, 2.50, NA, NA, NA, NA, NA, NA, 3.00, ...
## $ product_description (chr) "Not only do angles make joints stronger, ...
# Now let us find the most searched terms
product_all %>% count(search_term) %>% arrange(desc(n))
## Source: local data frame [24,601 x 2]
##
## search_term n
## (chr) (int)
## 1 patio chair cushions 36
## 2 1x4 23
## 3 24 inch vanity 23
## 4 40 gal gas water heater 23
## 5 4x6 23
## 6 acrylic 23
## 7 air conditioner portable 23
## 8 air conditioner with heat 23
## 9 allure plank flooring 23
## 10 allure vinyl flooring 23
## .. ... ...
# And which products appear in the most search-term pairs
product_all %>% count(product_uid) %>% arrange(desc(n))
## Source: local data frame [124,428 x 2]
##
## product_uid n
## (int) (int)
## 1 101892 70
## 2 101442 49
## 3 102456 48
## 4 101959 47
## 5 101280 45
## 6 102162 44
## 7 104691 44
## 8 101148 43
## 9 100898 42
## 10 109594 41
## .. ... ...
# Read the product attributes
products_attributes <- tbl_df(read.csv("data/attributes.csv", stringsAsFactors = FALSE, na.strings = 'N/A'))
glimpse(products_attributes)
## Observations: 2,044,803
## Variables: 3
## $ product_uid (int) 100001, 100001, 100001, 100001, 100001, 100001, 10...
## $ name (chr) "Bullet01", "Bullet02", "Bullet03", "Bullet04", "B...
## $ value (chr) "Versatile connector for various 90° connections ...
products_attributes <- products_attributes %>%
  filter(!is.na(product_uid)) %>% # remove rows with a missing product_uid
  unite(property, c(name, value), sep = ';;') # combine the name and value columns
glimpse(products_attributes)
## Observations: 2,044,648
## Variables: 2
## $ product_uid (int) 100001, 100001, 100001, 100001, 100001, 100001, 10...
## $ property (chr) "Bullet01;;Versatile connector for various 90° co...
# Group rows with the same product_uid and concatenate their properties
products_attributes <- aggregate(products_attributes$property, by = list(products_attributes$product_uid), FUN = paste, collapse = "@@@@")
glimpse(products_attributes)
## Observations: 86,263
## Variables: 2
## $ Group.1 (int) 100001, 100002, 100003, 100004, 1...
## $ products_attributes$property (chr) "Bullet01;;Versatile connector fo...
# Restore meaningful column names
products_attributes <- products_attributes %>% rename(product_uid = Group.1, property = `products_attributes$property`)
glimpse(products_attributes)
## Observations: 86,263
## Variables: 2
## $ product_uid (int) 100001, 100002, 100003, 100004, 100005, 100006, 10...
## $ property (chr) "Bullet01;;Versatile connector for various 90° co...
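As a side note, the same grouping could be done with dplyr, which would keep meaningful column names and make the rename step unnecessary (shown here as a sketch only, not what was run above):
# Equivalent dplyr formulation (not run here):
# products_attributes %>%
#   group_by(product_uid) %>%
#   summarise(property = paste(property, collapse = "@@@@"))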
source('attributesParser.R')
products_attributes <- mutate(products_attributes, bullets = sapply(property, FUN = bulletsParser),
yeses = sapply(property, FUN = yesesParser),
nos = sapply(property, FUN = nosParser),
keys = sapply(property, FUN = keysParser),
values = sapply(property, FUN = valuesParser))
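attributesParser.R is an external helper script and is not shown here. As an illustration only, here is a minimal sketch of what bulletsParser might look like, assuming the combined property string holds "name;;value" pairs joined with "@@@@" (the other parsers would follow the same pattern; the actual implementation may differ):
# Hypothetical sketch of bulletsParser: extract the bullet texts from a
# combined property string of "name;;value" pairs joined with "@@@@"
bulletsParser <- function(property) {
  pairs <- strsplit(property, "@@@@", fixed = TRUE)[[1]]
  bullets <- pairs[grepl("^Bullet", pairs)]
  # Keep only the value part after the ";;" separator
  paste(sub("^[^;]*;;", "", bullets), collapse = " ")
}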
# Merge the attributes with the main dataset
product_all <- full_join(product_all, products_attributes)
## Joining by: "product_uid"
source('matchScorer.R')
## Loading required package: KernSmooth
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
product_all <- mutate(product_all, bulletsScore = mapply(phrasesMatchScore, search_term, bullets),
yesesScore = mapply(phrasesMatchScore, search_term, yeses),
nosScore = mapply(phrasesMatchScore, search_term, nos),
keysScore = mapply(phrasesMatchScore, search_term, keys),
valuesScore = mapply(phrasesMatchScore, search_term, values))
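matchScorer.R is also an external script. A plausible minimal sketch of phrasesMatchScore, assuming it returns the fraction of search-term words that appear in the attribute text (the real scorer may well be more sophisticated):
# Hypothetical sketch of phrasesMatchScore: the fraction of search-term words
# that also occur in the given text (0 when either input is missing)
phrasesMatchScore <- function(search_term, text) {
  if (is.na(search_term) || is.na(text)) return(0)
  words <- unique(strsplit(tolower(search_term), "\\s+")[[1]])
  words <- words[nchar(words) > 0]
  if (length(words) == 0) return(0)
  mean(vapply(words, function(w) grepl(w, tolower(text), fixed = TRUE), logical(1)))
}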
We split the combined dataset back into its training and test parts so we can fit the model and generate predictions.
product_all_train <- subset(product_all, !is.na(relevance))
product_all_test <- subset(product_all, is.na(relevance))
RegModel = lm(relevance ~ bulletsScore + yesesScore + nosScore + keysScore + valuesScore, data = product_all_train)
TestPredictions = predict(RegModel, newdata = product_all_test)
summary(RegModel)
##
## Call:
## lm(formula = relevance ~ bulletsScore + yesesScore + nosScore +
## keysScore + valuesScore, data = product_all_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.54892 -0.35275 0.01231 0.53109 0.77971
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.468907 0.003862 639.280 < 2e-16 ***
## bulletsScore -0.278528 0.012414 -22.436 < 2e-16 ***
## yesesScore 0.035908 0.011058 3.247 0.001166 **
## nosScore 0.040682 0.010579 3.846 0.000120 ***
## keysScore 0.056294 0.016768 3.357 0.000788 ***
## valuesScore 0.096021 0.012965 7.406 1.31e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5303 on 74061 degrees of freedom
## Multiple R-squared: 0.01389, Adjusted R-squared: 0.01382
## F-statistic: 208.6 on 5 and 74061 DF, p-value: < 2.2e-16
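The competition is scored by RMSE, so a quick sanity check is the in-sample RMSE of the fitted model, which will be close to the residual standard error above (and optimistic compared to a proper held-out score):
# In-sample RMSE of the linear model
sqrt(mean(residuals(RegModel)^2))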
# Attach the predictions and count the predicted relevance distribution
product_all_test$relevance <- as.numeric(TestPredictions)
relevance_dist_final <- product_all_test %>% count(as.factor(relevance)) %>% arrange(desc(n))
relevance_dist_final
## Source: local data frame [100,818 x 2]
##
## as.factor(relevance) n
## (fctr) (int)
## 1 2.46890706875534 28316
## 2 2.26653655293876 143
## 3 2.29054188230833 78
## 4 2.40580055196752 59
## 5 2.30461529405922 55
## 6 2.2919223803524 53
## 7 2.31454721167789 48
## 8 2.28557592349899 45
## 9 2.25053300002571 39
## 10 2.34269403517969 39
## .. ... ...
Note that the most frequent prediction (2.4689...) is exactly the model intercept: those are pairs where all five match scores come out as zero (e.g. products with no attribute data).
# Let us explore the distribution of the predictions
ggplot(product_all_test, aes(x = relevance)) +
geom_histogram(color = "black", fill = "DarkOrange", bins = 12) +
scale_x_continuous(breaks = seq(2, 3, 0.1))
Let us see another view.
# Density function
relevance_graph <- ggplot(product_all_test, aes(x = relevance)) +
geom_density(color = "black") +
scale_x_continuous(breaks = seq(2, 3, 0.2))
This confirms our previous observation.
Let us now investigate the features used in the model to see their effect on the result.
bullets_graph <- ggplot(product_all_test, aes(x = bulletsScore))+
geom_density(color = "black") +
scale_x_continuous()
yeses_graph <- ggplot(product_all_test, aes(x = yesesScore))+
geom_density(color = "black") +
scale_x_continuous()
nos_graph <- ggplot(product_all_test, aes(x = nosScore))+
geom_density(color = "black") +
scale_x_continuous()
keys_graph <- ggplot(product_all_test, aes(x = keysScore))+
geom_density(color = "black") +
scale_x_continuous()
values_graph <- ggplot(product_all_test, aes(x = valuesScore))+
geom_density(color = "black") +
scale_x_continuous()
grid.arrange(bullets_graph, yeses_graph, nos_graph, keys_graph, values_graph, relevance_graph)
It seems that the bullets score is the one driving the outlier.
Now let us investigate further by seeing how each feature behaves against relevance.
bullets_rel <- ggplot(product_all_test, aes(x = relevance, y = bulletsScore))+
geom_point()
yeses_rel <- ggplot(product_all_test, aes(x = relevance, y = yesesScore))+
geom_point()
nos_rel <- ggplot(product_all_test, aes(x = relevance, y = nosScore))+
geom_point()
keys_rel <- ggplot(product_all_test, aes(x = relevance, y = keysScore))+
geom_point()
values_rel <- ggplot(product_all_test, aes(x = relevance, y = valuesScore))+
geom_point()
grid.arrange(bullets_rel, yeses_rel, nos_rel, keys_rel, values_rel)
And it seems the values score is the one that drives the relevance.
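Finally, to submit to Kaggle we would write the predictions out in the expected format. A hedged example, assuming the submission file needs the columns id and relevance and that predictions should be clamped to the valid [1, 3] range:
# Hypothetical submission step: keep test rows, clamp predictions to [1, 3],
# and write the CSV expected by Kaggle
submission <- product_all_test %>%
  filter(!is.na(id)) %>%
  mutate(relevance = pmin(pmax(relevance, 1), 3)) %>%
  select(id, relevance)
write.csv(submission, "submission.csv", row.names = FALSE)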