How to build BigQuery ML classification
1 Objectives
- The tasks are to perform:
- Use BigQuery to find public datasets
- Query and explore the ecommerce dataset
- Create a training and evaluation dataset to be used for batch prediction
- Create a classification (logistic regression) model in BQML
- Evaluate the performance of your machine learning model
- Predict and rank the probability that a visitor will make a purchase
1.1 Google Cloud Platform Console
1.2 Explore online google cloud ecommerce data
#standardSQLWITH visitors AS(
SELECT
COUNT(DISTINCT fullVisitorId) AS total_visitors
FROM `data-to-insights.ecommerce.web_analytics`
),
AS(
purchasers SELECT
COUNT(DISTINCT fullVisitorId) AS total_purchasers
FROM `data-to-insights.ecommerce.web_analytics`
WHERE totals.transactions IS NOT NULL
)
SELECT
total_visitors,
total_purchasers,/ total_visitors AS conversion_rate
total_purchasers FROM visitors, purchasers
SELECT
p.v2ProductName,
p.v2ProductCategory,SUM(p.productQuantity) AS units_sold,
ROUND(SUM(p.localProductRevenue/1000000),2) AS revenue
FROM `data-to-insights.ecommerce.web_analytics`,
AS h,
UNNEST(hits) AS p
UNNEST(h.product) GROUP BY 1, 2
ORDER BY revenue DESC
LIMIT 5;
on a return visit (could have bought on first as well
# visitors who bought WITH all_visitor_stats AS (
SELECT
741,721 unique visitors
fullvisitorid, # IF(COUNTIF(totals.transactions > 0 AND totals.newVisits IS NULL) > 0, 1, 0) AS will_buy_on_return_visit
FROM `data-to-insights.ecommerce.web_analytics`
GROUP BY fullvisitorid
)
SELECT
COUNT(DISTINCT fullvisitorid) AS total_visitors,
will_buy_on_return_visitFROM all_visitor_stats
GROUP BY will_buy_on_return_visit
SELECT
* EXCEPT(fullVisitorId)
FROM
# featuresSELECT
(
fullVisitorId,0) AS bounces,
IFNULL(totals.bounces, 0) AS time_on_site
IFNULL(totals.timeOnSite, FROM
-to-insights.ecommerce.web_analytics`
`dataWHERE
= 1)
totals.newVisits JOIN
SELECT
(
fullvisitorid,IF(COUNTIF(totals.transactions > 0 AND totals.newVisits IS NULL) > 0, 1, 0) AS will_buy_on_return_visit
FROM
-to-insights.ecommerce.web_analytics`
`dataGROUP BY fullvisitorid)
USING (fullVisitorId)
ORDER BY time_on_site DESC
LIMIT 10;
1.3 Create a BigQuery dataset to store the final model
- to create a model and specify model
CREATE OR REPLACE MODEL `ecommerce.classification_model`
OPTIONS
(='logistic_reg',
model_type= ['will_buy_on_return_visit']
labels
)AS
#standardSQLSELECT
* EXCEPT(fullVisitorId)
FROM
# featuresSELECT
(
fullVisitorId,0) AS bounces,
IFNULL(totals.bounces, 0) AS time_on_site
IFNULL(totals.timeOnSite, FROM
-to-insights.ecommerce.web_analytics`
`dataWHERE
= 1
totals.newVisits AND date BETWEEN '20160801' AND '20170430') # train on first 9 months
JOIN
SELECT
(
fullvisitorid,IF(COUNTIF(totals.transactions > 0 AND totals.newVisits IS NULL) > 0, 1, 0) AS will_buy_on_return_visit
FROM
-to-insights.ecommerce.web_analytics`
`dataGROUP BY fullvisitorid)
USING (fullVisitorId)
;
1.4 Evaluate classification model performance with ROC curve
- In BQML, roc_auc is simply a queryable field when evaluating your trained ML model.
- When training is completed, then it evaluates how well the model performs with this query using ML.EVALUATE:
SELECT
roc_auc,CASE
WHEN roc_auc > .9 THEN 'good'
WHEN roc_auc > .8 THEN 'fair'
WHEN roc_auc > .7 THEN 'not great'
ELSE 'poor' END AS model_quality
FROM
ML.EVALUATE(MODEL ecommerce.classification_model, (
SELECT
* EXCEPT(fullVisitorId)
FROM
# featuresSELECT
(
fullVisitorId,0) AS bounces,
IFNULL(totals.bounces, 0) AS time_on_site
IFNULL(totals.timeOnSite, FROM
-to-insights.ecommerce.web_analytics`
`dataWHERE
= 1
totals.newVisits AND date BETWEEN '20170501' AND '20170630') # eval on 2 months
JOIN
SELECT
(
fullvisitorid,IF(COUNTIF(totals.transactions > 0 AND totals.newVisits IS NULL) > 0, 1, 0) AS will_buy_on_return_visit
FROM
-to-insights.ecommerce.web_analytics`
`dataGROUP BY fullvisitorid)
USING (fullVisitorId)
));
1.5 Improve model performance with Feature Engineering
create a second machine learning model called classification_model_2:
- How far the visitor got in the checkout process on their first visit
- Where the visitor came from (traffic source: organic search, referring site etc..)
- Device category (mobile, tablet, desktop)
- Geographic information (country)
CREATE OR REPLACE MODEL `ecommerce.classification_model_2`
OPTIONS='logistic_reg', labels = ['will_buy_on_return_visit']) AS
(model_type
WITH all_visitor_stats AS (
SELECT
fullvisitorid,IF(COUNTIF(totals.transactions > 0 AND totals.newVisits IS NULL) > 0, 1, 0) AS will_buy_on_return_visit
FROM `data-to-insights.ecommerce.web_analytics`
GROUP BY fullvisitorid
)
add in new features
# SELECT * EXCEPT(unique_session_id) FROM (
SELECT
CONCAT(fullvisitorid, CAST(visitId AS STRING)) AS unique_session_id,
# labels
will_buy_on_return_visit,
MAX(CAST(h.eCommerceAction.action_type AS INT64)) AS latest_ecommerce_progress,
on the site
# behavior 0) AS bounces,
IFNULL(totals.bounces, 0) AS time_on_site,
IFNULL(totals.timeOnSite,
totals.pageviews,
where the visitor came from
# source,
trafficSource.
trafficSource.medium,
channelGrouping,
or desktop
# mobile
device.deviceCategory,
# geographic"") AS country
IFNULL(geoNetwork.country,
FROM `data-to-insights.ecommerce.web_analytics`,
AS h
UNNEST(hits)
JOIN all_visitor_stats USING(fullvisitorid)
WHERE 1=1
only predict for new visits
# AND totals.newVisits = 1
AND date BETWEEN '20160801' AND '20170430' # train 9 months
GROUP BY
unique_session_id,
will_buy_on_return_visit,
bounces,
time_on_site,
totals.pageviews,source,
trafficSource.
trafficSource.medium,
channelGrouping,
device.deviceCategory,
country );
- Evaluate this new model for predictive power:
#standardSQLSELECT
roc_auc,CASE
WHEN roc_auc > .9 THEN 'good'
WHEN roc_auc > .8 THEN 'fair'
WHEN roc_auc > .7 THEN 'not great'
ELSE 'poor' END AS model_quality
FROM
ML.EVALUATE(MODEL ecommerce.classification_model_2, (
WITH all_visitor_stats AS (
SELECT
fullvisitorid,IF(COUNTIF(totals.transactions > 0 AND totals.newVisits IS NULL) > 0, 1, 0) AS will_buy_on_return_visit
FROM `data-to-insights.ecommerce.web_analytics`
GROUP BY fullvisitorid
)
add in new features
# SELECT * EXCEPT(unique_session_id) FROM (
SELECT
CONCAT(fullvisitorid, CAST(visitId AS STRING)) AS unique_session_id,
# labels
will_buy_on_return_visit,
MAX(CAST(h.eCommerceAction.action_type AS INT64)) AS latest_ecommerce_progress,
on the site
# behavior 0) AS bounces,
IFNULL(totals.bounces, 0) AS time_on_site,
IFNULL(totals.timeOnSite,
totals.pageviews,
where the visitor came from
# source,
trafficSource.
trafficSource.medium,
channelGrouping,
or desktop
# mobile
device.deviceCategory,
# geographic"") AS country
IFNULL(geoNetwork.country,
FROM `data-to-insights.ecommerce.web_analytics`,
AS h
UNNEST(hits)
JOIN all_visitor_stats USING(fullvisitorid)
WHERE 1=1
only predict for new visits
# AND totals.newVisits = 1
AND date BETWEEN '20170501' AND '20170630' # eval 2 months
GROUP BY
unique_session_id,
will_buy_on_return_visit,
bounces,
time_on_site,
totals.pageviews,source,
trafficSource.
trafficSource.medium,
channelGrouping,
device.deviceCategory,
country
) ));
1.6 Predict which new visitors will come back and purchase
- The prediction query below uses the improved classification model to predict the probability that a first-time visitor to the Google Merchandise Store will make a purchase in a later visit:
SELECT
*
FROM
ml.PREDICT(MODEL `ecommerce.classification_model_2`,
(
WITH all_visitor_stats AS (
SELECT
fullvisitorid,IF(COUNTIF(totals.transactions > 0 AND totals.newVisits IS NULL) > 0, 1, 0) AS will_buy_on_return_visit
FROM `data-to-insights.ecommerce.web_analytics`
GROUP BY fullvisitorid
)
SELECT
CONCAT(fullvisitorid, '-',CAST(visitId AS STRING)) AS unique_session_id,
# labels
will_buy_on_return_visit,
MAX(CAST(h.eCommerceAction.action_type AS INT64)) AS latest_ecommerce_progress,
on the site
# behavior 0) AS bounces,
IFNULL(totals.bounces, 0) AS time_on_site,
IFNULL(totals.timeOnSite,
totals.pageviews,
where the visitor came from
# source,
trafficSource.
trafficSource.medium,
channelGrouping,
or desktop
# mobile
device.deviceCategory,
# geographic"") AS country
IFNULL(geoNetwork.country,
FROM `data-to-insights.ecommerce.web_analytics`,
AS h
UNNEST(hits)
JOIN all_visitor_stats USING(fullvisitorid)
WHERE
only predict for new visits
# = 1
totals.newVisits AND date BETWEEN '20170701' AND '20170801' # test 1 month
GROUP BY
unique_session_id,
will_buy_on_return_visit,
bounces,
time_on_site,
totals.pageviews,source,
trafficSource.
trafficSource.medium,
channelGrouping,
device.deviceCategory,
country
)
)
ORDER BY
DESC; predicted_will_buy_on_return_visit
2 Conclusions:
- More than 6% make a purchase in a later visit.
- These users represent nearly 50% of all first-time visitors who make a purchase in a later visit.
- Only 0.7% of first-time visitors make a purchase in a later visit.
- Targeting the top 6% of first-time increases marketing ROI by 9 times instead of targeting them all!