Cohort Analysis

#https://campus.datacamp.com/courses/customer-segmentation-in-python/
# Python environment
library(reticulate)
#use_python("C:/Users/info/AppData/Roaming/Python/Python36",required=TRUE)
#use_python('C:\\ProgramData\\Anaconda3\\envs\\r-reticulate\\python.exe')
#use_condaenv(condaenv = "r-reticulate", conda = "C:/ProgramData/Anaconda3/envs/r-reticulate/python.exe")
use_condaenv(condaenv = "r-reticulate")
#use_virtualenv("r-reticulate")

#TO INSTALL PYTHON PACKAGES:
# START ANACONDA PROMPT AS ADMIN
# RUN 'activate r-reticulate'
# RUN 'python -m pip install numpy' (replace numpy with whatever library)


#https://github.com/rstudio/reticulate/blob/master/vignettes/versions.Rmd



#https://archive.ics.uci.edu/ml/datasets/Online%20Retail
data <- read.csv("Online Retail.csv",stringsAsFactors = F)
data$InvoiceDate <- as.Date(data$InvoiceDate, "%m/%d/%Y")
data <- data.frame(data)

library(DT)

#https://campus.datacamp.com/courses/customer-segmentation-in-python/cohort-analysis?ex=3
import pandas as pd
online = r.data

datatable(head(py$online))

#How many rows?
nrow(py$online)

## [1] 541909

Assign acquisition month cohort

#https://campus.datacamp.com/courses/customer-segmentation-in-python/cohort-analysis?ex=3
import datetime as dt
def get_month(x): return dt.datetime(x.year,x.month,1)
online['InvoiceMonth'] = online['InvoiceDate'].apply(get_month)
grouping = online.groupby('CustomerID')['InvoiceMonth']
online['CohortMonth'] = grouping.transform('min')

Assign time offset value

#https://campus.datacamp.com/courses/customer-segmentation-in-python/cohort-analysis?ex=3
#Extract integer values from data function
def get_date_int(df, column):
  year = df[column].dt.year
  month = df[column].dt.month
  day = df[column].dt.day
  return year, month, day
invoice_year, invoice_month, _ = get_date_int(online, 'InvoiceMonth')
cohort_year, cohort_month, _ = get_date_int(online, 'CohortMonth')
years_diff = invoice_year - cohort_year
months_diff = invoice_month - cohort_month
online['CohortIndex'] = years_diff * 12 + months_diff + 1

Count monthly active customers from each cohort

#https://campus.datacamp.com/courses/customer-segmentation-in-python/cohort-analysis?ex=3
grouping = online.groupby(['CohortMonth','CohortIndex'])
cohort_data = grouping['CustomerID'].apply(pd.Series.nunique)
cohort_data = cohort_data.reset_index()
cohort_counts = cohort_data.pivot(index = 'CohortMonth',
                                 columns = 'CohortIndex',
                                 values = 'CustomerID')

Assign daily acquisition cohort

# Define a function that will parse the date
def get_day(x): return dt.datetime(x.year, x.month, x.day) 
# Create InvoiceDay column
online['InvoiceDay'] = online['InvoiceDate'].apply(get_day) 
# Group by CustomerID and select the InvoiceDay value
grouping = online.groupby('CustomerID')['InvoiceDay'] 
# Assign a minimum InvoiceDay value to the dataset
online['CohortDay'] = grouping.transform('min')

datatable(head(py$online))

Calculate the time offset in days

# Get the integers for date parts from the InvoiceDaycolumn
invoice_year, invoice_month, invoice_day = get_date_int(online, 'InvoiceDay')
# Get the integers for date parts from the CohortDay column
cohort_year, cohort_month, cohort_day = get_date_int(online, 'CohortDay')
# Calculate difference in years
years_diff = invoice_year - cohort_year
# Calculate difference in months
months_diff = invoice_month - cohort_month
# Calculate difference in days
days_diff = invoice_day - cohort_day
# Extract the difference in days from all previous values
online['CohortIndex'] = years_diff * 365 + months_diff * 30 + days_diff + 1

Calculate Retention rate

#https://campus.datacamp.com/courses/customer-segmentation-in-python/cohort-analysis?ex=7
#Store the first column as cohort_sizes
cohort_sizes = cohort_counts.iloc[:,0]
#Divide all values in the cohort_countss table by cohort_sizes
retention = cohort_counts.divide(cohort_sizes, axis = 0)

Build retention heatmap

import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
plt.figure(figsize=(10,8))
plt.title('Retention rates')
sns.heatmap(data = retention,
            annot = True,
            fmt = '.0%',
            vmin = 0.0,
            vmax = 0.5,
            cmap = 'BuGn',
            yticklabels=['2010-12','2011-01','2011-02','2011-03','2011-04','2011-05','2011-06',
                         '2011-07','2011-08','2011-09','2011-10','2011-11','2011-12'])
plt.show()