Data Mining - Apriori & Association Rules
Author: Quan Nguyen
1 Introduction
2 Methodology
3 Small Data
# Import Libraries
import warnings
warnings.filterwarnings('ignore')
import io
import requests
import pandas as pd
pd.options.mode.chained_assignment = None
from mlxtend.frequent_patterns import apriori, association_rules
import matplotlib.pyplot as plt
import numpy as np
import networkx as nx
# Import Data
url = "https://raw.githubusercontent.com/QuanNguyenIU/Dataset/main/dataset.csv"
s = requests.get(url).content
df = pd.read_csv(io.StringIO(s.decode('utf-8')), sep = ',')
df
## 0 1 2 3 4 5 6
## 0 Bread Wine Eggs Meat Cheese Pencil Diaper
## 1 Bread Cheese Meat Diaper Wine Milk Pencil
## 2 Cheese Meat Eggs Milk Wine NaN NaN
## 3 Cheese Meat Eggs Milk Wine NaN NaN
## 4 Meat Pencil Wine NaN NaN NaN NaN
## .. ... ... ... ... ... ... ...
## 310 Bread Eggs Cheese NaN NaN NaN NaN
## 311 Meat Milk Pencil NaN NaN NaN NaN
## 312 Bread Cheese Eggs Meat Pencil Diaper Wine
## 313 Meat Cheese NaN NaN NaN NaN NaN
## 314 Eggs Wine Bagel Bread Meat NaN NaN
##
## [315 rows x 7 columns]
# Flatten the table to 1-D, deduplicate, then drop the NaN fillers
flat_values = df.values.ravel()
unique_values = pd.unique(flat_values)
items = [x for x in unique_values if str(x) != 'nan']
print('All Unique Items:\n', items)
## All Unique Items:
## ['Bread', 'Wine', 'Eggs', 'Meat', 'Cheese', 'Pencil', 'Diaper', 'Milk', 'Bagel']
# One Hot Encoding
itemset = set(items)
envals = []
for idx, row in df.iterrows():
    rowset = set(row)
    labels = {}
    uncomms = list(itemset - rowset)
    commons = list(itemset.intersection(rowset))
    for uc in uncomms:
        labels[uc] = 0
    for cm in commons:
        labels[cm] = 1
    envals.append(labels)
# Apriori
ohet = pd.DataFrame(envals)
fitems = apriori(ohet, min_support = 0.2, use_colnames = True)
fitems.head(10)
## support itemsets
## 0 0.501587 (Milk)
## 1 0.425397 (Bagel)
## 2 0.504762 (Bread)
## 3 0.476190 (Meat)
## 4 0.406349 (Diaper)
## 5 0.361905 (Pencil)
## 6 0.438095 (Eggs)
## 7 0.438095 (Wine)
## 8 0.501587 (Cheese)
## 9 0.225397 (Milk, Bagel)
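As an aside, the manual encoding loop above can be replaced by mlxtend's own TransactionEncoder, which builds the same boolean basket matrix. A minimal sketch, assuming the transactions are first gathered as lists of items:
from mlxtend.preprocessing import TransactionEncoder

# Gather each row's items, dropping the NaN fillers
transactions = [[x for x in row if str(x) != 'nan'] for row in df.values]
te = TransactionEncoder()
ohet_alt = pd.DataFrame(te.fit(transactions).transform(transactions),
                        columns = te.columns_)
# ohet_alt holds True/False rather than 1/0; apriori accepts both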
# Association Rules
rules = association_rules(fitems, metric = "confidence", min_threshold = 0.6)
rules.head(10)
## antecedents consequents ... leverage conviction
## 0 (Cheese) (Milk) ... 0.053172 1.270148
## 1 (Milk) (Cheese) ... 0.053172 1.270148
## 2 (Bagel) (Bread) ... 0.064641 1.442650
## 3 (Eggs) (Meat) ... 0.058050 1.338624
## 4 (Cheese) (Meat) ... 0.084958 1.477891
## 5 (Meat) (Cheese) ... 0.084958 1.557540
## 6 (Eggs) (Cheese) ... 0.078670 1.563203
## 7 (Wine) (Cheese) ... 0.050098 1.297754
## 8 (Cheese, Milk) (Meat) ... 0.058050 1.571429
## 9 (Cheese, Meat) (Milk) ... 0.040756 1.337845
##
## [10 rows x 9 columns]
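The truncated columns above hide the standard metrics. For a rule X -> Y, support is the fraction of transactions containing both X and Y, confidence is support(X and Y) / support(X), and lift is confidence divided by support(Y), so lift > 1 signals a positive association. A small sketch to surface those columns and rank by lift:
# Show the key metrics explicitly, strongest associations first
rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']] \
    .sort_values('lift', ascending = False).head()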
# Plotting Results
plt.scatter(rules['support'], rules['confidence'], alpha = 0.5)
plt.xlabel('support')
plt.ylabel('confidence')
plt.title('Support vs Confidence')
plt.show()
# Drawing Directed Graph from 10 Top Rules
def draw_graph(rules, rules_to_show):
    G1 = nx.DiGraph()
    color_map = []
    # One random scalar per rule; networkx maps these to edge colors
    N = 50
    colors = np.random.rand(N)
    # Labels reserved for rule nodes (colored differently below)
    strs = ['R0', 'R1', 'R2', 'R3', 'R4', 'R5',
            'R6', 'R7', 'R8', 'R9', 'R10', 'R11']
    for i in range(rules_to_show):
        # One hub node per rule: antecedents -> Ri -> consequents
        G1.add_nodes_from(['R' + str(i)])
        for a in rules.iloc[i]['antecedents']:
            G1.add_nodes_from([a])
            G1.add_edge(a, 'R' + str(i), color = colors[i], weight = 1)
        for c in rules.iloc[i]['consequents']:
            G1.add_nodes_from([c])
            G1.add_edge('R' + str(i), c, color = colors[i], weight = 1)
    # Rule nodes in yellow, item nodes in cyan
    for node in G1:
        if node in strs:
            color_map.append('yellow')
        else:
            color_map.append('cyan')
    edgelist = G1.edges()
    edge_color = [G1[u][v]['color'] for u, v in edgelist]
    weights = [G1[u][v]['weight'] for u, v in edgelist]
    pos = nx.spring_layout(G1, k = 16, scale = 1)
    nx.draw(G1, pos, edgelist = edgelist, node_color = color_map,
            edge_color = edge_color, width = weights, font_size = 10,
            with_labels = True)
    plt.show()
draw_graph(rules, 10)
To read the network:
- Cheese -> R0 -> Milk.
- Milk -> R1 -> Cheese.
- Bagel -> R2 -> Bread.
- and so on, following the row order of the rules table above.
Notice that Cheese, Meat, and Milk each take part in multiple rules, which makes them worth a closer look; a quick count is sketched below.
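One way to confirm this is to count how often each item appears on either side of a rule; a small sketch over the rules frame mined above:
from itertools import chain

# Count item occurrences across all antecedent and consequent sets
item_counts = pd.Series(
    list(chain.from_iterable(rules['antecedents'])) +
    list(chain.from_iterable(rules['consequents']))).value_counts()
print(item_counts)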
4 Medium Data
# Import Data
url = "https://raw.githubusercontent.com/QuanNguyenIU/Dataset/main/store.csv"
s = requests.get(url).content
df = pd.read_csv(io.StringIO(s.decode('utf-8')), sep = ',')
df
## 0 1 ... 18 19
## 0 shrimp almonds ... spinach olive oil
## 1 burgers meatballs ... NaN NaN
## 2 chutney NaN ... NaN NaN
## 3 turkey avocado ... NaN NaN
## 4 mineral water milk ... NaN NaN
## ... ... ... ... ... ...
## 7496 butter light mayo ... NaN NaN
## 7497 burgers frozen vegetables ... NaN NaN
## 7498 chicken NaN ... NaN NaN
## 7499 escalope green tea ... NaN NaN
## 7500 eggs frozen smoothie ... NaN NaN
##
## [7501 rows x 20 columns]
# Note: taking unique values from column '0' only captures items that
# appear at least once as the first item of a transaction; a safer
# variant that scans every column is sketched after the item list below
items = df['0'].unique()
print('All Unique Items:\n', items)
## All Unique Items:
## ['shrimp' 'burgers' 'chutney' 'turkey' 'mineral water' 'low fat yogurt'
## 'whole wheat pasta' 'soup' 'frozen vegetables' 'french fries' 'eggs'
## 'cookies' 'spaghetti' 'meatballs' 'red wine' 'rice' 'parmesan cheese'
## 'ground beef' 'sparkling water' 'herb & pepper' 'pickles' 'energy bar'
## 'fresh tuna' 'escalope' 'avocado' 'tomato sauce' 'clothes accessories'
## 'energy drink' 'chocolate' 'grated cheese' 'yogurt cake' 'mint'
## 'asparagus' 'champagne' 'ham' 'muffins' 'french wine' 'chicken' 'pasta'
## 'tomatoes' 'pancakes' 'frozen smoothie' 'carrots' 'yams' 'shallot'
## 'butter' 'light mayo' 'pepper' 'candy bars' 'cooking oil' 'milk'
## 'green tea' 'bug spray' 'oil' 'olive oil' 'salmon' 'cake' 'almonds'
## 'salt' 'strong cheese' 'hot dogs' 'pet food' 'whole wheat rice'
## 'antioxydant juice' 'honey' 'sandwich' 'salad' 'magazines' 'protein bar'
## 'mayonnaise' 'cider' 'burger sauce' 'green grapes' 'vegetables mix'
## 'bramble' 'nonfat milk' 'tomato juice' 'green beans' 'strawberries'
## 'eggplant' 'mushroom cream sauce' 'gums' 'cereals' 'flax seed' 'spinach'
## 'soda' 'dessert wine' 'corn' 'fresh bread' 'brownies' 'fromage blanc'
## 'chocolate bread' 'mashed potato' 'gluten free bar' 'cottage cheese'
## 'whole weat flour' 'chili' 'barbecue sauce' 'light cream'
## 'mint green tea' 'black tea' 'bacon' 'shampoo' 'blueberries'
## 'cauliflower' 'extra dark chocolate' 'white wine' 'babies food'
## 'toothpaste' 'melons' 'ketchup' 'cream' 'hand protein bar' 'body spray'
## 'oatmeal']
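A more defensive extraction mirrors Section 3 and scans every column, so items that never occur first in a basket are not missed. A sketch (item order may differ from the list above):
flat_values = df.values.ravel()
items = [x for x in pd.unique(flat_values) if str(x) != 'nan']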
# One Hot Encoding
itemset = set(items)
envals = []
for idx, row in df.iterrows():
    rowset = set(row)
    labels = {}
    uncomms = list(itemset - rowset)
    commons = list(itemset.intersection(rowset))
    for uc in uncomms:
        labels[uc] = 0
    for cm in commons:
        labels[cm] = 1
    envals.append(labels)
# Apriori
ohet = pd.DataFrame(envals)
# For the larger dataset, min_support and min_threshold
# are lowered to 0.01 (0.01 of 7501 transactions still
# means an itemset must appear in about 75 baskets)
fitems = apriori(ohet, min_support = 0.01, use_colnames = True)
fitems.head(10)
## support itemsets
## 0 0.018531 (protein bar)
## 1 0.062525 (turkey)
## 2 0.011998 (melons)
## 3 0.028130 (red wine)
## 4 0.027063 (energy bar)
## 5 0.087188 (burgers)
## 6 0.052393 (grated cheese)
## 7 0.050527 (soup)
## 8 0.010799 (barbecue sauce)
## 9 0.080389 (cookies)
# Applying Association Rules
rules = association_rules(fitems, metric = "confidence", min_threshold = 0.01)
rules.head(10)
## antecedents consequents ... leverage conviction
## 0 (burgers) (turkey) ... 0.005214 1.068134
## 1 (turkey) (burgers) ... 0.005214 1.100536
## 2 (eggs) (turkey) ... 0.008228 1.051345
## 3 (turkey) (eggs) ... 0.008228 1.191072
## 4 (spaghetti) (turkey) ... 0.005645 1.035823
## 5 (turkey) (spaghetti) ... 0.005645 1.122731
## 6 (chocolate) (turkey) ... 0.001087 1.007130
## 7 (turkey) (chocolate) ... 0.001087 1.021242
## 8 (french fries) (turkey) ... -0.000021 0.999869
## 9 (turkey) (french fries) ... -0.000021 0.999596
##
## [10 rows x 9 columns]
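A confidence threshold of 0.01 keeps almost everything; rows 8 and 9 above even have negative leverage, which means lift below 1 (the items co-occur less often than chance would predict). A sketch for pruning to positively associated rules before plotting:
# Keep only rules whose items co-occur more often than chance
strong_rules = rules[rules['lift'] > 1].sort_values('lift',
                                                    ascending = False)
strong_rules.head()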
# Plotting Results
plt.scatter(rules['support'], rules['confidence'], alpha = 0.5)
plt.xlabel('Support')
plt.ylabel('Confidence')
plt.title('Store Data : Support vs Confidence')
plt.show()
# Drawing Directed Graph from 10 Top Rules
draw_graph(rules, 10)
5 Big Data
# Import Libraries
import nltk
# stopwords is imported for optional description cleanup (unused below)
from nltk.corpus import stopwords
import string
5.1 Import Data
url = r'https://raw.githubusercontent.com/QuanNguyenIU/Dataset/main/Online%20Retail.xlsx'
df = pd.read_excel(url)
# Remove non-transactions (dumps/refunds)
df = df[df.UnitPrice > 0]
df = df[df.Quantity > 0]
# Remove transactions with NULL Description
df = df[df.Description.notnull()]
# Remove transactions with non-purchase stock codes
df = df[~df.StockCode.isin(['B', 'C2', 'D', 'm', 'M', 'S', 'DOT', 'POST'])]
df = df[~df.StockCode.isin(['AMAZONFEE', 'BANK CHARGES', 'CRUK', 'PADS'])]
df = df[~df.StockCode.apply(str).str.contains("gift")]
# Remove Punctuations
PUNCT = string.punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', PUNCT))
col = df['Description']
col = col.astype(str)
col = col.str.lower()
col = col.apply(lambda text: remove_punctuation(text))
df['Description'] = col
# Re-enumerate indexes due to deleted transactions
df.reset_index(drop = True, inplace = True)
df
## InvoiceNo StockCode ... CustomerID Country
## 0 536365 85123A ... 17850.0 United Kingdom
## 1 536365 71053 ... 17850.0 United Kingdom
## 2 536365 84406B ... 17850.0 United Kingdom
## 3 536365 84029G ... 17850.0 United Kingdom
## 4 536365 84029E ... 17850.0 United Kingdom
## ... ... ... ... ... ...
## 527753 581587 22613 ... 12680.0 France
## 527754 581587 22899 ... 12680.0 France
## 527755 581587 23254 ... 12680.0 France
## 527756 581587 23255 ... 12680.0 France
## 527757 581587 22138 ... 12680.0 France
##
## [527758 rows x 8 columns]
5.2 Brief Summary
# Include extra attributes, for further analysis
def extra_fields(dataframe):
    dataframe['TotalAmount'] = dataframe['Quantity'] * dataframe['UnitPrice']
    dataframe['InvoiceYear'] = dataframe['InvoiceDate'].dt.year
    dataframe['InvoiceMonth'] = dataframe['InvoiceDate'].dt.month
# Sort and group the DataFrame, with specified attributes
def sort_dataframe(dataframe, group, fsort):
    df_group = dataframe.groupby(group)
    result = df_group[fsort].sum().sort_values(ascending = False)
    return result
def sort_unique_dataframe(dataframe, group, fsort):
    # Count distinct values of fsort per group (e.g. invoices per country)
    df_group = dataframe.groupby(group)
    result = df_group[fsort].nunique().sort_values(ascending = False)
    return result
# Bar plot of the top entries of a sorted Series
def plot_bar(dataframe, head, df_title):
    dataframe.head(head).plot(kind = 'bar', title = df_title)
    plt.show()
def general_info(dataframe):
    print('Transaction Count: ',
          len(dataframe['InvoiceNo'].unique()))
    print('Anonymous Transaction Count: ',
          len(dataframe[dataframe['CustomerID'].isnull()]
              ['InvoiceNo'].unique()))
    # Subtract 1 to exclude the NaN placeholder left by anonymous customers
    print('Customer Count: ', len(dataframe['CustomerID'].unique()) - 1)
    # Quantity * UnitPrice is revenue rather than profit
    print('Total Revenue: ', round(sum(dataframe['TotalAmount']), 2))
    top_customers = sort_dataframe(dataframe, 'CustomerID', 'TotalAmount')
    plot_bar(top_customers, 10, 'Top Customers by Total Amount')
    sort_quantity = sort_dataframe(dataframe, 'Description', 'Quantity')
    plot_bar(sort_quantity, 10, 'Frequent Items by Quantity')
    sort_amount = sort_dataframe(dataframe, 'Description', 'TotalAmount')
    plot_bar(sort_amount, 10, 'Frequent Items by Total Amount')
def explore_month(dataframe):
    df_month = dataframe.sort_values('InvoiceDate').groupby(
        ['InvoiceYear', 'InvoiceMonth'])
    month_invoice = df_month['InvoiceNo'].nunique()
    plot_bar(month_invoice, 12, 'Invoice Count by Month')
    month_amount = df_month['TotalAmount'].sum()
    plot_bar(month_amount, 12, 'Total Amount by Month')
def explore_country(dataframe):
    sort_amount = sort_dataframe(dataframe, 'Country', 'TotalAmount')
    plot_bar(sort_amount, 10, 'Countries by Total Amount')
    sort_invoice = sort_unique_dataframe(dataframe, 'Country', 'InvoiceNo')
    plot_bar(sort_invoice, 10, 'Countries by Invoice Count')
    sort_customer = sort_unique_dataframe(dataframe, 'Country', 'CustomerID')
    plot_bar(sort_customer, 10, 'Countries by Customer Count')
def brief_summary(dataframe):
    df_brief = dataframe
    extra_fields(df_brief)
    general_info(df_brief)
    explore_month(df_brief)
    explore_country(df_brief)
brief_summary(df)
## Transaction Count: 19773
## Anonymous Transaction Count: 1371
## Customer Count: 4334
## Total Revenue: 10271433.06
5.3 Apriori & Association Rules
# Convert DataFrame to Standard Form
def transform_data(df_original):
    invoice = ''
    transactions = []
    # Rows sharing a consecutive InvoiceNo form one transaction
    for index, value in enumerate(df_original['InvoiceNo']):
        if invoice != value:
            invoice = value
            transactions.append([df_original['Description'][index]])
            continue
        transactions[-1].append(df_original['Description'][index])
    df_transform = pd.DataFrame(transactions)
    df_transform.fillna(value = np.nan, inplace = True)
    return df_transform
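The walk above relies on rows with the same InvoiceNo being consecutive, as they happen to be in this file. An equivalent and shorter route is a plain groupby; a sketch (invoice order may differ):
transactions = df.groupby('InvoiceNo')['Description'].apply(list)
df_transform_alt = pd.DataFrame(transactions.tolist())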
# Find All Unique Items
def unique_items(dataframe):
    squeeze = dataframe.values.ravel()
    nan_items = pd.unique(squeeze)
    items = [x for x in nan_items if str(x) != 'nan']
    return items
# One Hot Encoding
def one_hot_encoding(dataframe, items):
    itemset = set(items)
    envals = []
    for idx, row in dataframe.iterrows():
        rowset = set(row)
        labels = {}
        uncomms = list(itemset - rowset)
        commons = list(itemset.intersection(rowset))
        for uc in uncomms: labels[uc] = 0
        for cm in commons: labels[cm] = 1
        envals.append(labels)
    return envals
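With nearly twenty thousand invoices (per the summary above) and thousands of distinct descriptions, the row-by-row loop is slow. A vectorized sketch of the same encoding, working directly from the cleaned df of Section 5.1 rather than the reshaped frame:
# One row per invoice, one column per description,
# True where the invoice contains the item
basket = (df.groupby(['InvoiceNo', 'Description'])['Quantity'].sum()
            .unstack(fill_value = 0) > 0)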
# Apriori & Association Rules
def ap_as_plot(envals):
    ohedt = pd.DataFrame(envals)
    fitems = apriori(ohedt, min_support = 0.01, use_colnames = True)
    fitems.sort_values('support', ascending = False, inplace = True)
    fitems.reset_index(drop = True, inplace = True)
    print(fitems.head(10))
    rules = association_rules(fitems, metric = "confidence",
                              min_threshold = 0.02)
    rules.sort_values('support', ascending = False, inplace = True)
    print(rules.head(10))
    # Plotting Results
    plt.scatter(rules['support'], rules['confidence'],
                alpha = 0.5, marker = "*")
    plt.xlabel('support')
    plt.ylabel('confidence')
    plt.title('Support vs Confidence - Apriori from \'mlxtend\'')
    plt.show()
    return rules
# Drawing Directed Graph from 10 Top Rules
# (reuse draw_graph() exactly as defined in Section 3)
def ap_as(dataframe):
    df_transform = transform_data(dataframe)
    items = unique_items(df_transform)
    envals = one_hot_encoding(df_transform, items)
    rules = ap_as_plot(envals)
    draw_graph(rules, 10)
    return rules
rules_big = ap_as(df)
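The returned rules remain available for further filtering after the run; a usage sketch (rules_big is the variable introduced above):
# Prune to positively associated rules and inspect the strongest
rules_big[rules_big['lift'] > 1].sort_values('lift',
                                             ascending = False).head()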