#load libraries
library(float)
library(readr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.1
library(arules)
## Warning: package 'arules' was built under R version 4.2.1
library(arulesViz)
## Warning: package 'arulesViz' was built under R version 4.2.1
library(janitor)
## Warning: package 'janitor' was built under R version 4.2.1
#read data
amazonorders <- read_csv("D:/Al/DataMining Final Project/amazonorders.csv")
#summarize data
head(amazonorders)
## # A tibble: 6 × 36
## Order_Date Order_ID Title Category `ASIN/ISBN` `UNSPSC Code` Website
## <date> <dbl> <chr> <chr> <chr> <dbl> <chr>
## 1 2021-01-01 1.13e16 "PawMedica Glu… ANIMAL_… B085W81GTG 50500000 Amazon…
## 2 2021-01-01 1.13e16 "ULTRAIDEAS Wo… SLIPPER B07RSPVR8H 53110000 Amazon…
## 3 2021-01-01 1.13e16 "ULTRAIDEAS Wo… SLIPPER B07RSPTGGW 53110000 Amazon…
## 4 2021-01-01 1.13e16 "Leather Desk … DESK_PAD B084FFRPLX 44000000 Amazon…
## 5 2021-01-01 1.13e16 "Car Cup Holde… CUP_HOL… B082VB76QX 43191600 Amazon…
## 6 2021-01-01 1.14e16 "Arctic Paw Wo… HAT B07HRGMV3C 53100000 Amazon…
## # … with 29 more variables: Release_Date <dttm>, Condition <chr>, Seller <chr>,
## # Seller_Credentials <chr>, List_Price_Per_Unit <dbl>,
## # Purchase_Price_Per_Unit <dbl>, Quantity <dbl>,
## # Payment_Instrument_Type <chr>, Purchase_Order_Number <lgl>,
## # PO_Line_Number <lgl>, Ordering_Customer_Email <chr>, Shipment_Date <date>,
## # Shipping_Address_Name <chr>, Shipping_Address_Street_1 <chr>,
## # Shipping_Address_Street_2 <chr>, Shipping_Address_City <chr>, …
library(janitor)
#SCRUB DATA: delete columns
remove_empty(amazonorders, which = c("cols"), quiet = TRUE)
## # A tibble: 1,016 × 32
## Order_Date Order_ID Title Category `ASIN/ISBN` `UNSPSC Code` Website
## <date> <dbl> <chr> <chr> <chr> <dbl> <chr>
## 1 2021-01-01 1.13e16 "PawMedica Gl… ANIMAL_… B085W81GTG 50500000 Amazon…
## 2 2021-01-01 1.13e16 "ULTRAIDEAS W… SLIPPER B07RSPVR8H 53110000 Amazon…
## 3 2021-01-01 1.13e16 "ULTRAIDEAS W… SLIPPER B07RSPTGGW 53110000 Amazon…
## 4 2021-01-01 1.13e16 "Leather Desk… DESK_PAD B084FFRPLX 44000000 Amazon…
## 5 2021-01-01 1.13e16 "Car Cup Hold… CUP_HOL… B082VB76QX 43191600 Amazon…
## 6 2021-01-01 1.14e16 "Arctic Paw W… HAT B07HRGMV3C 53100000 Amazon…
## 7 2021-01-01 1.14e16 "Arctic Paw W… HAT B071RDGY5Z 53100000 Amazon…
## 8 2021-01-01 1.14e16 "Lifewit Larg… STORAGE… B07PCNGJP8 44111500 Amazon…
## 9 2021-01-01 1.14e16 "1000 Synthet… PROTECT… B08MVCRCPJ 46181500 Amazon…
## 10 2021-01-08 1.14e16 "Henry’s Hous… COFFEE B071CZV4LN 50201706 Amazon…
## # … with 1,006 more rows, and 25 more variables: Release_Date <dttm>,
## # Condition <chr>, Seller <chr>, Seller_Credentials <chr>,
## # List_Price_Per_Unit <dbl>, Purchase_Price_Per_Unit <dbl>, Quantity <dbl>,
## # Payment_Instrument_Type <chr>, Ordering_Customer_Email <chr>,
## # Shipment_Date <date>, Shipping_Address_Name <chr>,
## # Shipping_Address_Street_1 <chr>, Shipping_Address_Street_2 <chr>,
## # Shipping_Address_City <chr>, Shipping_Address_State <chr>, …
#SCRUB DATA: remove irrelevant columns
edit_amazonorders <-select(amazonorders,-one_of(c("Release_Date")))
edit_amazonorders <-select(amazonorders,-one_of(c("Purchase_Order_Number")))
edit_amazonorders <-select(amazonorders,-one_of(c("PO_Line_Number")))
edit_amazonorders <-select(amazonorders,-one_of(c("Tax_Exemption_Type")))
edit_amazonorders <-select(amazonorders,-one_of(c("Group_Name")))
edit_amazonorders <-select(amazonorders,-one_of(c("Ordering_Customer_Email")))
edit_amazonorders <-select(amazonorders,-one_of(c("Buyer_Name")))
edit_amazonorders <-select(amazonorders,-one_of(c("Shipping_Address_Street_2")))
edit_amazonorders <-select(amazonorders,-one_of(c("Website")))
edit_amazonorders <-select(amazonorders,-one_of(c("Condition")))
edit_amazonorders <-select(amazonorders,-one_of(c("ASIN/ISBN")))
edit_amazonorders <-select(amazonorders,-one_of(c("UNSPSC Code")))
#summarize edited data
summary(edit_amazonorders)
## Order_Date Order_ID Title Category
## Min. :2021-01-01 Min. :1.110e+16 Length:1016 Length:1016
## 1st Qu.:2021-07-13 1st Qu.:1.122e+16 Class :character Class :character
## Median :2021-11-28 Median :1.135e+16 Mode :character Mode :character
## Mean :2021-11-13 Mean :1.132e+16
## 3rd Qu.:2022-03-18 3rd Qu.:1.143e+16
## Max. :2022-08-14 Max. :1.150e+16
##
## ASIN/ISBN Website Release_Date
## Length:1016 Length:1016 Min. :2016-12-01 00:00:01
## Class :character Class :character 1st Qu.:2019-05-01 18:00:01
## Mode :character Mode :character Median :2020-10-11 12:00:01
## Mean :2020-03-09 16:48:01
## 3rd Qu.:2021-03-23 06:00:01
## Max. :2021-09-30 00:00:01
## NA's :1006
## Condition Seller Seller_Credentials List_Price_Per_Unit
## Length:1016 Length:1016 Length:1016 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Mode :character Median : 12.99
## Mean : 25.03
## 3rd Qu.: 29.99
## Max. :999.10
##
## Purchase_Price_Per_Unit Quantity Payment_Instrument_Type
## Min. : 2.99 Min. :0.000 Length:1016
## 1st Qu.: 10.77 1st Qu.:1.000 Class :character
## Median : 18.00 Median :1.000 Mode :character
## Mean : 23.69 Mean :1.124
## 3rd Qu.: 25.00 3rd Qu.:1.000
## Max. :449.25 Max. :6.000
##
## Purchase_Order_Number PO_Line_Number Ordering_Customer_Email
## Mode:logical Mode:logical Length:1016
## NA's:1016 NA's:1016 Class :character
## Mode :character
##
##
##
##
## Shipment_Date Shipping_Address_Name Shipping_Address_Street_1
## Min. :2021-01-02 Length:1016 Length:1016
## 1st Qu.:2021-07-18 Class :character Class :character
## Median :2021-11-29 Mode :character Mode :character
## Mean :2021-11-12
## 3rd Qu.:2022-03-18
## Max. :2022-08-08
## NA's :10
## Shipping_Address_Street_2 Shipping_Address_City Shipping_Address_State
## Length:1016 Length:1016 Length:1016
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Shipping_Address_Zip Order_Status Carrier_Name_Tracking_Number
## Length:1016 Length:1016 Length:1016
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Item_Subtotal Item_Subtotal_Tax Item_Total Tax_Exemption_Applied
## Min. : 0.00 Min. : 0.000 Min. : 0.00 Mode :logical
## 1st Qu.: 11.92 1st Qu.: 0.755 1st Qu.: 12.84 FALSE:914
## Median : 19.58 Median : 1.240 Median : 21.10 TRUE :102
## Mean : 25.44 Mean : 1.722 Mean : 27.16
## 3rd Qu.: 26.97 3rd Qu.: 1.940 3rd Qu.: 28.87
## Max. :449.25 Max. :34.820 Max. :484.07
##
## Tax_Exemption_Type Exemption_OptOut Buyer_Name Currency
## Mode:logical Mode :logical Length:1016 Length:1016
## NA's:1016 FALSE:914 Class :character Class :character
## TRUE :101 Mode :character Mode :character
## NA's :1
##
##
##
## Group_Name
## Mode:logical
## NA's:1016
##
##
##
##
##
#SCRUB DATA: replace empty cell in column with 'NA'
library(dplyr)
class(amazonorders)
## [1] "spec_tbl_df" "tbl_df" "tbl" "data.frame"
edit_amazonorders <- edit_amazonorders %>% mutate_if(is.character, ~na_if(., ''))
print(edit_amazonorders)
## # A tibble: 1,016 × 35
## Order_Date Order_ID Title Category `ASIN/ISBN` Website Release_Date
## <date> <dbl> <chr> <chr> <chr> <chr> <dttm>
## 1 2021-01-01 1.13e16 "PawMed… ANIMAL_… B085W81GTG Amazon… NA
## 2 2021-01-01 1.13e16 "ULTRAI… SLIPPER B07RSPVR8H Amazon… NA
## 3 2021-01-01 1.13e16 "ULTRAI… SLIPPER B07RSPTGGW Amazon… NA
## 4 2021-01-01 1.13e16 "Leathe… DESK_PAD B084FFRPLX Amazon… NA
## 5 2021-01-01 1.13e16 "Car Cu… CUP_HOL… B082VB76QX Amazon… NA
## 6 2021-01-01 1.14e16 "Arctic… HAT B07HRGMV3C Amazon… NA
## 7 2021-01-01 1.14e16 "Arctic… HAT B071RDGY5Z Amazon… NA
## 8 2021-01-01 1.14e16 "Lifewi… STORAGE… B07PCNGJP8 Amazon… NA
## 9 2021-01-01 1.14e16 "1000 S… PROTECT… B08MVCRCPJ Amazon… NA
## 10 2021-01-08 1.14e16 "Henry’… COFFEE B071CZV4LN Amazon… NA
## # … with 1,006 more rows, and 28 more variables: Condition <chr>, Seller <chr>,
## # Seller_Credentials <chr>, List_Price_Per_Unit <dbl>,
## # Purchase_Price_Per_Unit <dbl>, Quantity <dbl>,
## # Payment_Instrument_Type <chr>, Purchase_Order_Number <lgl>,
## # PO_Line_Number <lgl>, Ordering_Customer_Email <chr>, Shipment_Date <date>,
## # Shipping_Address_Name <chr>, Shipping_Address_Street_1 <chr>,
## # Shipping_Address_Street_2 <chr>, Shipping_Address_City <chr>, …
#plot
str(edit_amazonorders)
## tibble [1,016 × 35] (S3: tbl_df/tbl/data.frame)
## $ Order_Date : Date[1:1016], format: "2021-01-01" "2021-01-01" ...
## $ Order_ID : num [1:1016] 1.13e+16 1.13e+16 1.13e+16 1.13e+16 1.13e+16 ...
## $ Title : chr [1:1016] "PawMedica Glucosamine for Dogs, Hip & Joint Care Chews for Dogs, Joint Support for Dog Joint Supplements with H"| __truncated__ "ULTRAIDEAS Women's Cozy Memory Foam Loafer Slippers with Warm Fleece Lining, Ladies Closed Back House Shoes wit"| __truncated__ "ULTRAIDEAS Women's Cozy Memory Foam Loafer Slippers with Warm Fleece Lining, Ladies Closed Back House Shoes wit"| __truncated__ "Leather Desk Pad Protector,Mouse Pad,Office Desk Mat, Non-Slip PU Leather Desk Blotter,Laptop Desk Pad,Waterpro"| __truncated__ ...
## $ Category : chr [1:1016] "ANIMAL_NUTRITIONAL_SUPPLEMENT" "SLIPPER" "SLIPPER" "DESK_PAD" ...
## $ ASIN/ISBN : chr [1:1016] "B085W81GTG" "B07RSPVR8H" "B07RSPTGGW" "B084FFRPLX" ...
## $ Website : chr [1:1016] "Amazon.com" "Amazon.com" "Amazon.com" "Amazon.com" ...
## $ Release_Date : POSIXct[1:1016], format: NA NA ...
## $ Condition : chr [1:1016] "new" "new" "new" "new" ...
## $ Seller : chr [1:1016] "Trustwell Brands" "Vital Store" "Vital Store" "Aothia Life" ...
## $ Seller_Credentials : chr [1:1016] NA NA NA "ISO 9001" ...
## $ List_Price_Per_Unit : num [1:1016] 50 30 30 17 0 ...
## $ Purchase_Price_Per_Unit : num [1:1016] 45 22 22 10.1 14 ...
## $ Quantity : num [1:1016] 1 1 1 1 1 1 1 3 1 1 ...
## $ Payment_Instrument_Type : chr [1:1016] "Visa - 7058" "Visa - 7058" "Visa - 7058" "Visa - 7058" ...
## $ Purchase_Order_Number : logi [1:1016] NA NA NA NA NA NA ...
## $ PO_Line_Number : logi [1:1016] NA NA NA NA NA NA ...
## $ Ordering_Customer_Email : chr [1:1016] "kaseyn617@yahoo.com" "kaseyn617@yahoo.com" "kaseyn617@yahoo.com" "kaseyn617@yahoo.com" ...
## $ Shipment_Date : Date[1:1016], format: "2021-01-07" "2021-01-07" ...
## $ Shipping_Address_Name : chr [1:1016] "Kasey Nguyen" "Kasey Nguyen" "Kasey Nguyen" "Kasey Nguyen" ...
## $ Shipping_Address_Street_1 : chr [1:1016] "11782 Carlisle Ct" "11782 Carlisle Ct" "11782 Carlisle Ct" "11782 Carlisle Ct" ...
## $ Shipping_Address_Street_2 : chr [1:1016] NA NA NA NA ...
## $ Shipping_Address_City : chr [1:1016] "Moreno Valley" "Moreno Valley" "Moreno Valley" "Moreno Valley" ...
## $ Shipping_Address_State : chr [1:1016] "Ca" "Ca" "Ca" "Ca" ...
## $ Shipping_Address_Zip : chr [1:1016] "92557" "92557" "92557" "92557" ...
## $ Order_Status : chr [1:1016] "Shipped" "Shipped" "Shipped" "Shipped" ...
## $ Carrier_Name_Tracking_Number: chr [1:1016] "AMZN_US(TBA291879438000)" "AMZN_US(TBA291879438000)" "AMZN_US(TBA291879438000)" "AMZN_US(TBA291879438000)" ...
## $ Item_Subtotal : num [1:1016] 45 22 22 10.1 14 ...
## $ Item_Subtotal_Tax : num [1:1016] 3.48 1.71 1.71 0.78 1.08 ...
## $ Item_Total : num [1:1016] 48.4 23.7 23.7 10.9 15.1 ...
## $ Tax_Exemption_Applied : logi [1:1016] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Tax_Exemption_Type : logi [1:1016] NA NA NA NA NA NA ...
## $ Exemption_OptOut : logi [1:1016] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Buyer_Name : chr [1:1016] "Kase" "Kase" "Kase" "Kase" ...
## $ Currency : chr [1:1016] "USD" "USD" "USD" "USD" ...
## $ Group_Name : logi [1:1016] NA NA NA NA NA NA ...
quant <- amazonorders$Quantity
hist(quant)
#factor columns
edit_amazonorders$Order_Date <-factor(edit_amazonorders$Order_Date)
edit_amazonorders$Order_ID <-factor(edit_amazonorders$Order_ID)
edit_amazonorders$Category <-factor(edit_amazonorders$Category)
edit_amazonorders$Quantity <- cut(edit_amazonorders$Quantity,breaks=4,labels=c("very few","few","many","very many"))
edit_amazonorders$List_Price_Per_Unit <-factor(edit_amazonorders$List_Price_Per_Unit)
edit_amazonorders$Purchase_Price_Per_Unit <-factor(edit_amazonorders$Purchase_Price_Per_Unit)
edit_amazonorders$Item_Subtotal <-factor(edit_amazonorders$Item_Subtotal)
edit_amazonorders$Item_Subtotal_Tax <-factor(edit_amazonorders$Item_Subtotal_Tax)
edit_amazonorders$Item_Total<-factor(edit_amazonorders$Item_Total)
edit_amazonorders$Payment_Instrument_Type<-factor(edit_amazonorders$Payment_Instrument_Type)
edit_amazonorders$Shipment_Date <-factor(edit_amazonorders$Shipment_Date)
edit_amazonorders$Shipping_Address_Name <-factor(edit_amazonorders$Shipping_Address_Name)
edit_amazonorders$Shipping_Address_Street_1 <-factor(edit_amazonorders$Shipping_Address_Street_1)
edit_amazonorders$Shipping_Address_City<-factor(edit_amazonorders$Shipping_Address_City)
edit_amazonorders$Shipping_Address_State <-factor(edit_amazonorders$Shipping_Address_State)
edit_amazonorders$Shipping_Address_Zip <-factor(edit_amazonorders$Shipping_Address_Zip)
edit_amazonorders$Order_Status <-factor(edit_amazonorders$Order_Status)
edit_amazonorders$Carrier_Name_Tracking_Number <-factor(edit_amazonorders$Carrier_Name_Tracking_Number)
edit_amazonorders$Tax_Exemption_Applied <-factor(edit_amazonorders$Tax_Exemption_Applied)
edit_amazonorders$Exemption_OptOut <-factor(edit_amazonorders$Exemption_OptOut)
edit_amazonorders$Currency <-factor(edit_amazonorders$Currency)
#convert to transactions
class(edit_amazonorders$Category)
## [1] "factor"
edit_amazonorders$Category <- as.factor(edit_amazonorders$Category)
Subcategories <- subset(edit_amazonorders$Category, edit_amazonorders$Category != 'NA')
Transactions <- subset(edit_amazonorders$Quantity, edit_amazonorders$Quantity != '0')
Transactions_by_Subcategory <- split(Subcategories, Transactions)
## Warning in split.default(Subcategories, Transactions): data length is not a
## multiple of split variable
class(Transactions_by_Subcategory)
## [1] "list"
# Arules
ars <- apriori(Transactions_by_Subcategory, parameter=list(support=0.7, confidence=0.7))
## Warning in asMethod(object): removing duplicated items in transactions
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.7 0.1 1 none FALSE TRUE 5 0.7 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 2
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[216 item(s), 4 transaction(s)] done [0.00s].
## sorting and recoding items ... [6 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [43 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
summary(ars)
## set of 43 rules
##
## rule length distribution (lhs + rhs):sizes
## 1 2 3 4
## 6 18 15 4
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 2.000 2.395 3.000 4.000
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.7500 Min. :0.7500 Min. :0.750 Min. :1.000
## 1st Qu.:0.7500 1st Qu.:1.0000 1st Qu.:0.750 1st Qu.:1.000
## Median :0.7500 Median :1.0000 Median :0.750 Median :1.333
## Mean :0.7558 Mean :0.9419 Mean :0.814 Mean :1.171
## 3rd Qu.:0.7500 3rd Qu.:1.0000 3rd Qu.:0.875 3rd Qu.:1.333
## Max. :1.0000 Max. :1.0000 Max. :1.000 Max. :1.333
## count
## Min. :3.000
## 1st Qu.:3.000
## Median :3.000
## Mean :3.023
## 3rd Qu.:3.000
## Max. :4.000
##
## mining info:
## data ntransactions support confidence
## Transactions_by_Subcategory 4 0.7 0.7
## call
## apriori(data = Transactions_by_Subcategory, parameter = list(support = 0.7, confidence = 0.7))
class(ars)
## [1] "rules"
## attr(,"package")
## [1] "arules"
inspect(head(ars,10, by="lift"))
## lhs rhs support
## [1] {SKIN_MOISTURIZER} => {PET_FOOD} 0.75
## [2] {PET_FOOD} => {SKIN_MOISTURIZER} 0.75
## [3] {SAUTE_FRY_PAN} => {HAT} 0.75
## [4] {HAT} => {SAUTE_FRY_PAN} 0.75
## [5] {SAUTE_FRY_PAN} => {PET_ACTIVITY_STRUCTURE} 0.75
## [6] {PET_ACTIVITY_STRUCTURE} => {SAUTE_FRY_PAN} 0.75
## [7] {HAT} => {PET_ACTIVITY_STRUCTURE} 0.75
## [8] {PET_ACTIVITY_STRUCTURE} => {HAT} 0.75
## [9] {PET_BED_MAT, SKIN_MOISTURIZER} => {PET_FOOD} 0.75
## [10] {PET_BED_MAT, PET_FOOD} => {SKIN_MOISTURIZER} 0.75
## confidence coverage lift count
## [1] 1 0.75 1.333333 3
## [2] 1 0.75 1.333333 3
## [3] 1 0.75 1.333333 3
## [4] 1 0.75 1.333333 3
## [5] 1 0.75 1.333333 3
## [6] 1 0.75 1.333333 3
## [7] 1 0.75 1.333333 3
## [8] 1 0.75 1.333333 3
## [9] 1 0.75 1.333333 3
## [10] 1 0.75 1.333333 3
library(arulesViz)
plot(ars, method = "graph",
measure = "confidence", shading = "lift")
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.