Data Mining - Apriori & Association Rules

Author: Quan Nguyen


1 Introduction

This report walks through frequent-itemset mining with the Apriori algorithm and association-rule generation on three transaction datasets of increasing size: a small grocery dataset, a medium-sized store dataset, and the Online Retail dataset.

2 Methodology

Each dataset is first one-hot encoded into a transaction-by-item matrix. Frequent itemsets are then mined with apriori from mlxtend, using a min_support suited to the dataset's size, and rules are derived with association_rules and filtered by confidence, where confidence(A -> B) = support(A and B) / support(A) and lift(A -> B) = confidence(A -> B) / support(B). Results are visualized as support-confidence scatter plots and as directed graphs linking antecedents, rules, and consequents.

3 Small Data

# Import Libraries
import warnings
warnings.filterwarnings('ignore')            # keep the report output clean
import io
import requests
import pandas as pd
pd.options.mode.chained_assignment = None    # silence SettingWithCopyWarning
from mlxtend.frequent_patterns import apriori, association_rules
import matplotlib.pyplot as plt
import numpy as np
import networkx as nx
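# (Assumed environment: the third-party packages above are installed,
# e.g. via pip install pandas mlxtend networkx requests matplotlib)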
# Import Data
url = "https://raw.githubusercontent.com/QuanNguyenIU/Dataset/main/dataset.csv"
s = requests.get(url).content
df = pd.read_csv(io.StringIO(s.decode('utf-8')), sep = ',')
df
##           0       1       2       3       4       5       6
## 0     Bread    Wine    Eggs    Meat  Cheese  Pencil  Diaper
## 1     Bread  Cheese    Meat  Diaper    Wine    Milk  Pencil
## 2    Cheese    Meat    Eggs    Milk    Wine     NaN     NaN
## 3    Cheese    Meat    Eggs    Milk    Wine     NaN     NaN
## 4      Meat  Pencil    Wine     NaN     NaN     NaN     NaN
## ..      ...     ...     ...     ...     ...     ...     ...
## 310   Bread    Eggs  Cheese     NaN     NaN     NaN     NaN
## 311    Meat    Milk  Pencil     NaN     NaN     NaN     NaN
## 312   Bread  Cheese    Eggs    Meat  Pencil  Diaper    Wine
## 313    Meat  Cheese     NaN     NaN     NaN     NaN     NaN
## 314    Eggs    Wine   Bagel   Bread    Meat     NaN     NaN
## 
## [315 rows x 7 columns]
# Find all unique items (drop the NaN padding)
all_values = df.values.ravel()
unique_values = pd.unique(all_values)
items = [x for x in unique_values if str(x) != 'nan']
print('All Unique Items:\n', items)
## All Unique Items:
##  ['Bread', 'Wine', 'Eggs', 'Meat', 'Cheese', 'Pencil', 'Diaper', 'Milk', 'Bagel']
# One Hot Encoding
itemset = set(items)
envals = []
for idx, row in df.iterrows():
    rowset = set(row)
    labels = {}
    uncomms = list(itemset - rowset)
    commons = list(itemset.intersection(rowset))
    for uc in uncomms:
        labels[uc] = 0
    for cm in commons:
        labels[cm] = 1
    envals.append(labels)
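# Alternative (a sketch): mlxtend's TransactionEncoder builds the same
# boolean matrix once the NaN padding is stripped from each row
from mlxtend.preprocessing import TransactionEncoder
transactions = [[x for x in row if str(x) != 'nan'] for row in df.values]
te = TransactionEncoder()
ohet_te = pd.DataFrame(te.fit(transactions).transform(transactions),
                       columns = te.columns_)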
# Apriori
ohet = pd.DataFrame(envals)
fitems = apriori(ohet, min_support = 0.2, use_colnames = True)
fitems.head(10)
##     support       itemsets
## 0  0.501587         (Milk)
## 1  0.425397        (Bagel)
## 2  0.504762        (Bread)
## 3  0.476190         (Meat)
## 4  0.406349       (Diaper)
## 5  0.361905       (Pencil)
## 6  0.438095         (Eggs)
## 7  0.438095         (Wine)
## 8  0.501587       (Cheese)
## 9  0.225397  (Milk, Bagel)
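# Sanity check (a sketch): on one-hot data, support is just the column
# mean, so this should reproduce the 0.501587 reported for (Milk) above
print(ohet['Milk'].mean())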
# Association Rules
rules = association_rules(fitems, metric = "confidence", min_threshold = 0.6)
rules.head(10)
##       antecedents consequents  ...  leverage  conviction
## 0        (Cheese)      (Milk)  ...  0.053172    1.270148
## 1          (Milk)    (Cheese)  ...  0.053172    1.270148
## 2         (Bagel)     (Bread)  ...  0.064641    1.442650
## 3          (Eggs)      (Meat)  ...  0.058050    1.338624
## 4        (Cheese)      (Meat)  ...  0.084958    1.477891
## 5          (Meat)    (Cheese)  ...  0.084958    1.557540
## 6          (Eggs)    (Cheese)  ...  0.078670    1.563203
## 7          (Wine)    (Cheese)  ...  0.050098    1.297754
## 8  (Cheese, Milk)      (Meat)  ...  0.058050    1.571429
## 9  (Cheese, Meat)      (Milk)  ...  0.040756    1.337845
## 
## [10 rows x 9 columns]
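# The truncated metric columns can be recomputed from first principles,
# e.g. for rule 0 (Cheese -> Milk):
#   confidence(A -> B) = support(A & B) / support(A)
#   lift(A -> B)       = confidence(A -> B) / support(B)
support_both = ((ohet['Cheese'] == 1) & (ohet['Milk'] == 1)).mean()
print('confidence:', support_both / ohet['Cheese'].mean())
print('lift:', support_both / (ohet['Cheese'].mean() * ohet['Milk'].mean()))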
# Plotting Results
plt.scatter(rules['support'], rules['confidence'], alpha = 0.5)
plt.xlabel('support')
plt.ylabel('confidence')
plt.title('Support vs Confidence')
plt.show()

# Drawing Directed Graph from the Top 10 Rules
def draw_graph(rules, rules_to_show):
    G1 = nx.DiGraph()
    # One random edge color per rule, one 'R<i>' hub node per rule
    colors = np.random.rand(rules_to_show)
    rule_nodes = ['R' + str(i) for i in range(rules_to_show)]
    for i in range(rules_to_show):
        G1.add_node(rule_nodes[i])
        for a in rules.iloc[i]['antecedents']:
            G1.add_edge(a, rule_nodes[i], color = colors[i], weight = 1)
        for c in rules.iloc[i]['consequents']:
            G1.add_edge(rule_nodes[i], c, color = colors[i], weight = 1)
    # Rule hubs in yellow, items in cyan
    color_map = ['yellow' if node in rule_nodes else 'cyan' for node in G1]
    edgelist = G1.edges()
    edge_color = [G1[u][v]['color'] for u, v in edgelist]
    weights = [G1[u][v]['weight'] for u, v in edgelist]
    pos = nx.spring_layout(G1, k = 16, scale = 1)
    nx.draw(G1, pos, edgelist = edgelist, node_color = color_map,
            edge_color = edge_color, width = weights, font_size = 10,
            with_labels = True)
    plt.show()

draw_graph(rules, 10)

To read the network:

  1. Bagel -> R0 -> Bread.
  2. Cheese -> R1 -> Milk.
  3. Milk -> R2 -> Cheese.
  4. and so on.

Note that Cheese, Meat, and Milk each appear in multiple rules, which suggests these items are frequently purchased together.
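This observation can be checked programmatically: the antecedents and consequents columns hold frozensets, so the rules table can be filtered by membership. A minimal sketch using the rules DataFrame from above:

# Rules that involve Cheese on either side
cheese_rules = rules[rules['antecedents'].apply(lambda a: 'Cheese' in a) |
                     rules['consequents'].apply(lambda c: 'Cheese' in c)]
print(len(cheese_rules), 'of', len(rules), 'rules involve Cheese')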

4 Medium Data

# Import Data
url = "https://raw.githubusercontent.com/QuanNguyenIU/Dataset/main/store.csv"
s = requests.get(url).content
df = pd.read_csv(io.StringIO(s.decode('utf-8')), sep = ',')
df
##                   0                  1  ...       18         19
## 0            shrimp            almonds  ...  spinach  olive oil
## 1           burgers          meatballs  ...      NaN        NaN
## 2           chutney                NaN  ...      NaN        NaN
## 3            turkey            avocado  ...      NaN        NaN
## 4     mineral water               milk  ...      NaN        NaN
## ...             ...                ...  ...      ...        ...
## 7496         butter         light mayo  ...      NaN        NaN
## 7497        burgers  frozen vegetables  ...      NaN        NaN
## 7498        chicken                NaN  ...      NaN        NaN
## 7499       escalope          green tea  ...      NaN        NaN
## 7500           eggs    frozen smoothie  ...      NaN        NaN
## 
## [7501 rows x 20 columns]
# Collect unique items across all columns; scanning only column '0'
# would silently drop any item that never appears first in a transaction
all_values = df.values.ravel()
items = pd.unique(all_values[~pd.isnull(all_values)])
print('All Unique Items:\n', items)
## All Unique Items:
##  ['shrimp' 'burgers' 'chutney' 'turkey' 'mineral water' 'low fat yogurt'
##  'whole wheat pasta' 'soup' 'frozen vegetables' 'french fries' 'eggs'
##  'cookies' 'spaghetti' 'meatballs' 'red wine' 'rice' 'parmesan cheese'
##  'ground beef' 'sparkling water' 'herb & pepper' 'pickles' 'energy bar'
##  'fresh tuna' 'escalope' 'avocado' 'tomato sauce' 'clothes accessories'
##  'energy drink' 'chocolate' 'grated cheese' 'yogurt cake' 'mint'
##  'asparagus' 'champagne' 'ham' 'muffins' 'french wine' 'chicken' 'pasta'
##  'tomatoes' 'pancakes' 'frozen smoothie' 'carrots' 'yams' 'shallot'
##  'butter' 'light mayo' 'pepper' 'candy bars' 'cooking oil' 'milk'
##  'green tea' 'bug spray' 'oil' 'olive oil' 'salmon' 'cake' 'almonds'
##  'salt' 'strong cheese' 'hot dogs' 'pet food' 'whole wheat rice'
##  'antioxydant juice' 'honey' 'sandwich' 'salad' 'magazines' 'protein bar'
##  'mayonnaise' 'cider' 'burger sauce' 'green grapes' 'vegetables mix'
##  'bramble' 'nonfat milk' 'tomato juice' 'green beans' 'strawberries'
##  'eggplant' 'mushroom cream sauce' 'gums' 'cereals' 'flax seed' 'spinach'
##  'soda' 'dessert wine' 'corn' 'fresh bread' 'brownies' 'fromage blanc'
##  'chocolate bread' 'mashed potato' 'gluten free bar' 'cottage cheese'
##  'whole weat flour' 'chili' 'barbecue sauce' 'light cream'
##  'mint green tea' 'black tea' 'bacon' 'shampoo' 'blueberries'
##  'cauliflower' 'extra dark chocolate' 'white wine' 'babies food'
##  'toothpaste' 'melons' 'ketchup' 'cream' 'hand protein bar' 'body spray'
##  'oatmeal']
# One Hot Encoding
itemset = set(items)
envals = []
for idx, row in df.iterrows():
    rowset = set(row)
    labels = {}
    uncomms = list(itemset - rowset)
    commons = list(itemset.intersection(rowset))
    for uc in uncomms:
        labels[uc] = 0
    for cm in commons:
        labels[cm] = 1
    envals.append(labels)
# Apriori
ohet = pd.DataFrame(envals)
# For this larger dataset, min_support (and later the confidence
# min_threshold) is lowered to 0.01
fitems = apriori(ohet, min_support = 0.01, use_colnames = True)
fitems.head(10)
##     support          itemsets
## 0  0.018531     (protein bar)
## 1  0.062525          (turkey)
## 2  0.011998          (melons)
## 3  0.028130        (red wine)
## 4  0.027063      (energy bar)
## 5  0.087188         (burgers)
## 6  0.052393   (grated cheese)
## 7  0.050527            (soup)
## 8  0.010799  (barbecue sauce)
## 9  0.080389         (cookies)
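# head() above shows only singletons; multi-item sets can be pulled out
# by itemset length (a quick inspection sketch)
multi = fitems[fitems['itemsets'].apply(len) >= 2]
print(len(fitems), 'frequent itemsets in total,', len(multi), 'with 2+ items')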
# Applying Association Rules
rules = association_rules(fitems, metric = "confidence", min_threshold = 0.01)
rules.head(10)
##       antecedents     consequents  ...  leverage  conviction
## 0       (burgers)        (turkey)  ...  0.005214    1.068134
## 1        (turkey)       (burgers)  ...  0.005214    1.100536
## 2          (eggs)        (turkey)  ...  0.008228    1.051345
## 3        (turkey)          (eggs)  ...  0.008228    1.191072
## 4     (spaghetti)        (turkey)  ...  0.005645    1.035823
## 5        (turkey)     (spaghetti)  ...  0.005645    1.122731
## 6     (chocolate)        (turkey)  ...  0.001087    1.007130
## 7        (turkey)     (chocolate)  ...  0.001087    1.021242
## 8  (french fries)        (turkey)  ... -0.000021    0.999869
## 9        (turkey)  (french fries)  ... -0.000021    0.999596
## 
## [10 rows x 9 columns]
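# With min_threshold = 0.01 even near-independent pairs pass the filter
# (note the negative leverage in rows 8-9 above); ranking by lift is one
# way to surface the stronger rules (a sketch)
print(rules.sort_values('lift', ascending = False).head(10))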
# Plotting Results
plt.scatter(rules['support'], rules['confidence'], alpha = 0.5)
plt.xlabel('Support')
plt.ylabel('Confidence')
plt.title('Store Data : Support vs Confidence')
plt.show()

# Drawing Directed Graph from the Top 10 Rules
draw_graph(rules, 10)

5 Big Data

# Import Libraries
import string

5.1 Import Data

url = r'https://raw.githubusercontent.com/QuanNguyenIU/Dataset/main/Online%20Retail.xlsx'
df = pd.read_excel(url)
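# (pandas relies on the openpyxl engine to read .xlsx files; assumed installed)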
# Remove non-transactions (dumps/refunds)
df = df[df.UnitPrice > 0]
df = df[df.Quantity > 0]
# Remove transactions with NULL Description
df = df[df.Description.notnull()]
# Remove transactions with non-purchase stock codes
df = df[~df.StockCode.isin(['B', 'C2', 'D', 'm', 'M', 'S', 'DOT', 'POST'])]
df = df[~df.StockCode.isin(['AMAZONFEE', 'BANK CHARGES', 'CRUK', 'PADS'])]
df = df[~df.StockCode.apply(str).str.contains("gift")]
# Remove Punctuation
PUNCT = string.punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', PUNCT))
col = df['Description']
col = col.astype(str)
col = col.str.lower()
col = col.apply(lambda text: remove_punctuation(text))
df['Description'] = col
# Reset the row index, since transactions were deleted above
df.reset_index(drop = True, inplace = True)
df
##        InvoiceNo StockCode  ... CustomerID         Country
## 0         536365    85123A  ...    17850.0  United Kingdom
## 1         536365     71053  ...    17850.0  United Kingdom
## 2         536365    84406B  ...    17850.0  United Kingdom
## 3         536365    84029G  ...    17850.0  United Kingdom
## 4         536365    84029E  ...    17850.0  United Kingdom
## ...          ...       ...  ...        ...             ...
## 527753    581587     22613  ...    12680.0          France
## 527754    581587     22899  ...    12680.0          France
## 527755    581587     23254  ...    12680.0          France
## 527756    581587     23255  ...    12680.0          France
## 527757    581587     22138  ...    12680.0          France
## 
## [527758 rows x 8 columns]
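# Quick sanity check that the cleaning steps above held (a sketch);
# all three counts should print as 0
print((df.UnitPrice <= 0).sum(), (df.Quantity <= 0).sum(),
      df.Description.isnull().sum())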

5.2 Brief Summary

# Add derived attributes for further analysis
def extra_fields(dataframe):
    dataframe['TotalAmount'] = dataframe['Quantity'] * dataframe['UnitPrice']
    dataframe['InvoiceYear'] = dataframe['InvoiceDate'].dt.year
    dataframe['InvoiceMonth'] = dataframe['InvoiceDate'].dt.month

# Sort and group the DataFrame by the specified attributes
def sort_dataframe(dataframe, group, fsort):
    df_group = dataframe.groupby(group)
    return df_group[fsort].sum().sort_values(ascending = False)

def sort_unique_dataframe(dataframe, group, fsort):
    # dropna = False keeps the NaN bucket, matching the original
    # unique()-then-count behavior
    df_group = dataframe.groupby(group)
    return df_group[fsort].nunique(dropna = False).sort_values(ascending = False)

# Bar plot of the top entries of a sorted Series
def plot_bar(dataframe, head, df_title):
    dataframe.head(head).plot(kind = 'bar', title = df_title)
    plt.show()

def general_info(dataframe):
    print('Transaction Count: ',
          len(dataframe['InvoiceNo'].unique()))
    print('Anonymous Transaction Count: ',
          len(dataframe[dataframe['CustomerID'].isnull()]['InvoiceNo'
                                                          ].unique()))
    # Subtract 1 to discount the NaN placeholder for anonymous customers
    print('Customer Count: ', len(dataframe['CustomerID'].unique()) - 1)
    # Quantity * UnitPrice is revenue, not profit (costs are unknown)
    print('Total Revenue: ', round(sum(dataframe['TotalAmount']), 2))
    top_customers = sort_dataframe(dataframe, 'CustomerID', 'TotalAmount')
    plot_bar(top_customers, 10, 'Top Customers by Total Amount')
    sort_quantity = sort_dataframe(dataframe, 'Description', 'Quantity')
    plot_bar(sort_quantity, 10, 'Frequent Items by Quantity')
    sort_amount = sort_dataframe(dataframe, 'Description', 'TotalAmount')
    plot_bar(sort_amount, 10, 'Frequent Items by Total Amount')
    
def explore_month(dataframe):
    df_month = dataframe.sort_values('InvoiceDate').groupby(['InvoiceYear',
                                                             'InvoiceMonth'])
    month_invoice = df_month['InvoiceNo'].nunique()
    plot_bar(month_invoice, 12, 'Invoice Count by Month')
    month_amount = df_month['TotalAmount'].sum()
    plot_bar(month_amount, 12, 'Total Amount by Month')

def explore_country(dataframe):
    sort_amount = sort_dataframe(dataframe, 'Country', 'TotalAmount')
    plot_bar(sort_amount, 10, 'Countries by Total Amount')
    sort_invoice = sort_unique_dataframe(dataframe, 'Country', 'InvoiceNo')
    plot_bar(sort_invoice, 10, 'Countries by Invoice Count')
    sort_customer = sort_unique_dataframe(dataframe, 'Country', 'CustomerID')
    plot_bar(sort_customer, 10, 'Countries by Customer Count')

def brief_summary(dataframe):
    # extra_fields() adds the derived columns to the DataFrame in place
    extra_fields(dataframe)
    general_info(dataframe)
    explore_month(dataframe)
    explore_country(dataframe)

brief_summary(df)
## Transaction Count:  19773
## Anonymous Transaction Count:  1371
## Customer Count:  4334
## Total Revenue:  10271433.06

5.3 Apriori & Association Rules

# Convert DataFrame to Standard Form (one row of items per invoice;
# assumes rows of the same invoice are contiguous, as in the raw file)
def transform_data(df_original):
    invoice = ''
    transactions = []
    for index, value in enumerate(df_original['InvoiceNo']):
        if invoice != value:
            invoice = value
            transactions.append([df_original['Description'][index]])
            continue
        transactions[-1].append(df_original['Description'][index])
    df_transform = pd.DataFrame(transactions)
    # pd.np was removed from recent pandas; use np.nan directly so the
    # None padding is normalized to NaN for the later 'nan' filter
    df_transform.fillna(value = np.nan, inplace = True)
    return df_transform
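# Alternative (a sketch): pandas' groupby collects each invoice's
# descriptions without relying on row contiguity
def transform_data_groupby(df_original):
    transactions = df_original.groupby('InvoiceNo')['Description'].apply(list)
    return pd.DataFrame(transactions.tolist()).fillna(value = np.nan)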

# Find All Unique Items
def unique_items(dataframe):
    squeeze = dataframe.values.ravel()
    nan_items = pd.unique(squeeze)
    items = [x for x in nan_items if str(x) != 'nan']
    return items

# One Hot Encoding
def one_hot_encoding(dataframe, items):
    itemset = set(items)
    envals = []
    for idx, row in dataframe.iterrows():
        rowset = set(row)
        labels = {}
        uncomms = list(itemset - rowset)
        commons = list(itemset.intersection(rowset))
        for uc in uncomms: labels[uc] = 0
        for cm in commons: labels[cm] = 1
        envals.append(labels)
    return envals

# Apriori & Association Rules
def ap_as_plot(envals):
    ohet = pd.DataFrame(envals)
    fitems = apriori(ohet, min_support = 0.01, use_colnames = True)
    fitems.sort_values('support', ascending = False, inplace = True)
    fitems.reset_index(drop = True, inplace = True)
    rules = association_rules(fitems, metric = "confidence",
                              min_threshold = 0.02)
    rules.sort_values('support', ascending = False, inplace = True)
    # Plotting Results
    plt.scatter(rules['support'], rules['confidence'],
                alpha = 0.5, marker = "*")
    plt.xlabel('support')
    plt.ylabel('confidence')
    plt.title('Support vs Confidence - Apriori from \'mlxtend\'')
    plt.show()
    return rules

# Drawing Directed Graph from the Top 10 Rules
# (draw_graph() is reused exactly as defined in Section 3)

def ap_as(dataframe):
    df_transform = transform_data(dataframe)
    items = unique_items(df_transform)
    envals = one_hot_encoding(df_transform, items)
    rules = ap_as_plot(envals)
    draw_graph(rules, 10)
    return rules

rules = ap_as(df)
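As before, the mined rules are easiest to digest when ranked, for example by lift (a quick inspection sketch using the returned DataFrame):

print(rules.sort_values('lift', ascending = False)
           [['antecedents', 'consequents', 'support', 'confidence', 'lift']]
           .head(10))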