Load and inspect data
str(df)
## tibble [9,994 x 21] (S3: tbl_df/tbl/data.frame)
## $ Row ID : num [1:9994] 1 2 3 4 5 6 7 8 9 10 ...
## $ Order ID : chr [1:9994] "CA-2016-152156" "CA-2016-152156" "CA-2016-138688" "US-2015-108966" ...
## $ Order Date : POSIXct[1:9994], format: "2016-11-08" "2016-11-08" ...
## $ Ship Date : POSIXct[1:9994], format: "2016-11-11" "2016-11-11" ...
## $ Ship Mode : chr [1:9994] "Second Class" "Second Class" "Second Class" "Standard Class" ...
## $ Customer ID : chr [1:9994] "CG-12520" "CG-12520" "DV-13045" "SO-20335" ...
## $ Customer Name: chr [1:9994] "Claire Gute" "Claire Gute" "Darrin Van Huff" "Sean O'Donnell" ...
## $ Segment : chr [1:9994] "Consumer" "Consumer" "Corporate" "Consumer" ...
## $ Country : chr [1:9994] "United States" "United States" "United States" "United States" ...
## $ City : chr [1:9994] "Henderson" "Henderson" "Los Angeles" "Fort Lauderdale" ...
## $ State : chr [1:9994] "Kentucky" "Kentucky" "California" "Florida" ...
## $ Postal Code : num [1:9994] 42420 42420 90036 33311 33311 ...
## $ Region : chr [1:9994] "South" "South" "West" "South" ...
## $ Product ID : chr [1:9994] "FUR-BO-10001798" "FUR-CH-10000454" "OFF-LA-10000240" "FUR-TA-10000577" ...
## $ Category : chr [1:9994] "Furniture" "Furniture" "Office Supplies" "Furniture" ...
## $ Sub-Category : chr [1:9994] "Bookcases" "Chairs" "Labels" "Tables" ...
## $ Product Name : chr [1:9994] "Bush Somerset Collection Bookcase" "Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back" "Self-Adhesive Address Labels for Typewriters by Universal" "Bretford CR4500 Series Slim Rectangular Table" ...
## $ Sales : num [1:9994] 262 731.9 14.6 957.6 22.4 ...
## $ Quantity : num [1:9994] 2 3 2 5 2 7 4 6 3 5 ...
## $ Discount : num [1:9994] 0 0 0 0.45 0.2 0 0 0.2 0.2 0 ...
## $ Profit : num [1:9994] 41.91 219.58 6.87 -383.03 2.52 ...
head(df)
## # A tibble: 6 x 21
## `Row ID` `Order ID` `Order Date` `Ship Date` `Ship Mode`
## <dbl> <chr> <dttm> <dttm> <chr>
## 1 1 CA-2016-1~ 2016-11-08 00:00:00 2016-11-11 00:00:00 Second Cla~
## 2 2 CA-2016-1~ 2016-11-08 00:00:00 2016-11-11 00:00:00 Second Cla~
## 3 3 CA-2016-1~ 2016-06-12 00:00:00 2016-06-16 00:00:00 Second Cla~
## 4 4 US-2015-1~ 2015-10-11 00:00:00 2015-10-18 00:00:00 Standard C~
## 5 5 US-2015-1~ 2015-10-11 00:00:00 2015-10-18 00:00:00 Standard C~
## 6 6 CA-2014-1~ 2014-06-09 00:00:00 2014-06-14 00:00:00 Standard C~
## # ... with 16 more variables: `Customer ID` <chr>, `Customer Name` <chr>,
## # Segment <chr>, Country <chr>, City <chr>, State <chr>, `Postal Code` <dbl>,
## # Region <chr>, `Product ID` <chr>, Category <chr>, `Sub-Category` <chr>,
## # `Product Name` <chr>, Sales <dbl>, Quantity <dbl>, Discount <dbl>,
## # Profit <dbl>
Trim spaces in colnames to prevent confusion
names(df) <- gsub(" ", "_", names(df)) #trim spaces
names(df) <- gsub("-", "_", names(df)) #change - with _
names(df)
## [1] "Row_ID" "Order_ID" "Order_Date" "Ship_Date"
## [5] "Ship_Mode" "Customer_ID" "Customer_Name" "Segment"
## [9] "Country" "City" "State" "Postal_Code"
## [13] "Region" "Product_ID" "Category" "Sub_Category"
## [17] "Product_Name" "Sales" "Quantity" "Discount"
## [21] "Profit"
Testing for missing values
missing<-sapply(df, function(x) sum(is.na(x)))
missing
## Row_ID Order_ID Order_Date Ship_Date Ship_Mode
## 0 0 0 0 0
## Customer_ID Customer_Name Segment Country City
## 0 0 0 0 0
## State Postal_Code Region Product_ID Category
## 0 0 0 0 0
## Sub_Category Product_Name Sales Quantity Discount
## 0 0 0 0 0
## Profit
## 0
There is no null values.
Select specific columns only
profilinganalysis <- df %>%
select(Customer_ID,Sub_Category,Sales)
head(profilinganalysis)
## # A tibble: 6 x 3
## Customer_ID Sub_Category Sales
## <chr> <chr> <dbl>
## 1 CG-12520 Bookcases 262.
## 2 CG-12520 Chairs 732.
## 3 DV-13045 Labels 14.6
## 4 SO-20335 Tables 958.
## 5 SO-20335 Storage 22.4
## 6 BH-11710 Furnishings 48.9
Data transformation with dcast; transform subcategories from long format into wide format then sum by Sales Revenue
profilinganalysis$Sub_Category = as.factor(profilinganalysis$Sub_Category)
purchasedist<- dcast(profilinganalysis, Customer_ID ~ Sub_Category, value.var = "Sales", fun.aggregate = sum)
head(purchasedist)
## Customer_ID Accessories Appliances Art Binders Bookcases Chairs Copiers
## 1 AA-10315 41.72 415.92 0.000 685.108 0.000 0.000 0
## 2 AA-10375 649.95 0.00 5.248 74.004 0.000 0.000 0
## 3 AA-10480 479.97 0.00 8.640 0.000 0.000 0.000 0
## 4 AA-10645 0.00 0.00 5.280 113.940 1279.165 2347.888 0
## 5 AB-10015 0.00 0.00 17.940 0.000 341.960 48.712 0
## 6 AB-10060 195.74 160.32 0.000 4404.216 0.000 449.568 0
## Envelopes Fasteners Furnishings Labels Machines Paper Phones Storage
## 1 0.00 2.304 14.56 0 0 14.940 431.976 26.960
## 2 0.00 0.000 28.40 0 0 80.080 116.980 101.728
## 3 0.00 0.000 27.46 0 0 108.522 209.970 508.110
## 4 43.96 0.000 123.66 0 0 119.280 354.682 699.080
## 5 0.00 0.000 0.00 0 0 0.000 221.980 255.564
## 6 0.00 0.000 127.88 0 0 353.106 0.000 0.000
## Supplies Tables
## 1 3930.072 0.00
## 2 0.000 0.00
## 3 0.000 447.84
## 4 0.000 0.00
## 5 0.000 0.00
## 6 27.930 2036.86
Add some more variables
##Add TotalRevenue column
purchasedist$TotalSales <- as.numeric(apply(purchasedist[,2:18], 1, sum))
#count how many subcategories each customer has bought
purchasedist$TotalCat<- apply(purchasedist[2:18], 1, function(x) sum(x>0))
head(purchasedist)
## Customer_ID Accessories Appliances Art Binders Bookcases Chairs Copiers
## 1 AA-10315 41.72 415.92 0.000 685.108 0.000 0.000 0
## 2 AA-10375 649.95 0.00 5.248 74.004 0.000 0.000 0
## 3 AA-10480 479.97 0.00 8.640 0.000 0.000 0.000 0
## 4 AA-10645 0.00 0.00 5.280 113.940 1279.165 2347.888 0
## 5 AB-10015 0.00 0.00 17.940 0.000 341.960 48.712 0
## 6 AB-10060 195.74 160.32 0.000 4404.216 0.000 449.568 0
## Envelopes Fasteners Furnishings Labels Machines Paper Phones Storage
## 1 0.00 2.304 14.56 0 0 14.940 431.976 26.960
## 2 0.00 0.000 28.40 0 0 80.080 116.980 101.728
## 3 0.00 0.000 27.46 0 0 108.522 209.970 508.110
## 4 43.96 0.000 123.66 0 0 119.280 354.682 699.080
## 5 0.00 0.000 0.00 0 0 0.000 221.980 255.564
## 6 0.00 0.000 127.88 0 0 353.106 0.000 0.000
## Supplies Tables TotalSales TotalCat
## 1 3930.072 0.00 5563.560 9
## 2 0.000 0.00 1056.390 7
## 3 0.000 447.84 1790.512 7
## 4 0.000 0.00 5086.935 9
## 5 0.000 0.00 886.156 5
## 6 27.930 2036.86 7755.620 8
Add subcategory names of max and min values
##max subcategory
purchasedist$maxcol<-apply(purchasedist[2:18], 1, function(x) colnames(purchasedist[2:18])[which.max(x)])
##min subcategory
#this is more tricky since we want to eliminate 0 values and find first >0 value
min <- purchasedist[1:18] %>%
pivot_longer(cols =colnames(purchasedist[2:18]))%>%
filter(value > 0) %>%
group_by(Customer_ID) %>%
summarise(mincol = name[order(value)][1]
) %>%
select(Customer_ID,mincol)
## `summarise()` ungrouping output (override with `.groups` argument)
purchasedist<- left_join(purchasedist,min)
## Joining, by = "Customer_ID"
rm(min)
head(purchasedist)
## Customer_ID Accessories Appliances Art Binders Bookcases Chairs Copiers
## 1 AA-10315 41.72 415.92 0.000 685.108 0.000 0.000 0
## 2 AA-10375 649.95 0.00 5.248 74.004 0.000 0.000 0
## 3 AA-10480 479.97 0.00 8.640 0.000 0.000 0.000 0
## 4 AA-10645 0.00 0.00 5.280 113.940 1279.165 2347.888 0
## 5 AB-10015 0.00 0.00 17.940 0.000 341.960 48.712 0
## 6 AB-10060 195.74 160.32 0.000 4404.216 0.000 449.568 0
## Envelopes Fasteners Furnishings Labels Machines Paper Phones Storage
## 1 0.00 2.304 14.56 0 0 14.940 431.976 26.960
## 2 0.00 0.000 28.40 0 0 80.080 116.980 101.728
## 3 0.00 0.000 27.46 0 0 108.522 209.970 508.110
## 4 43.96 0.000 123.66 0 0 119.280 354.682 699.080
## 5 0.00 0.000 0.00 0 0 0.000 221.980 255.564
## 6 0.00 0.000 127.88 0 0 353.106 0.000 0.000
## Supplies Tables TotalSales TotalCat maxcol mincol
## 1 3930.072 0.00 5563.560 9 Supplies Fasteners
## 2 0.000 0.00 1056.390 7 Accessories Art
## 3 0.000 447.84 1790.512 7 Storage Art
## 4 0.000 0.00 5086.935 9 Chairs Art
## 5 0.000 0.00 886.156 5 Bookcases Art
## 6 27.930 2036.86 7755.620 8 Binders Supplies
add max-min ratio
for (i in 1:nrow(purchasedist)){
purchasedist$MaxRatio[i]<- (purchasedist[i,which(colnames(purchasedist[1:18]) == purchasedist$maxcol[i])])/(purchasedist$TotalSales[i])
purchasedist$MinRatio[i]<- (purchasedist[i,which(colnames(purchasedist[1:18]) == purchasedist$mincol[i])])/(purchasedist$TotalSales[i])
}
head(purchasedist)
## Customer_ID Accessories Appliances Art Binders Bookcases Chairs Copiers
## 1 AA-10315 41.72 415.92 0.000 685.108 0.000 0.000 0
## 2 AA-10375 649.95 0.00 5.248 74.004 0.000 0.000 0
## 3 AA-10480 479.97 0.00 8.640 0.000 0.000 0.000 0
## 4 AA-10645 0.00 0.00 5.280 113.940 1279.165 2347.888 0
## 5 AB-10015 0.00 0.00 17.940 0.000 341.960 48.712 0
## 6 AB-10060 195.74 160.32 0.000 4404.216 0.000 449.568 0
## Envelopes Fasteners Furnishings Labels Machines Paper Phones Storage
## 1 0.00 2.304 14.56 0 0 14.940 431.976 26.960
## 2 0.00 0.000 28.40 0 0 80.080 116.980 101.728
## 3 0.00 0.000 27.46 0 0 108.522 209.970 508.110
## 4 43.96 0.000 123.66 0 0 119.280 354.682 699.080
## 5 0.00 0.000 0.00 0 0 0.000 221.980 255.564
## 6 0.00 0.000 127.88 0 0 353.106 0.000 0.000
## Supplies Tables TotalSales TotalCat maxcol mincol MaxRatio
## 1 3930.072 0.00 5563.560 9 Supplies Fasteners 0.7063952
## 2 0.000 0.00 1056.390 7 Accessories Art 0.6152557
## 3 0.000 447.84 1790.512 7 Storage Art 0.2837792
## 4 0.000 0.00 5086.935 9 Chairs Art 0.4615526
## 5 0.000 0.00 886.156 5 Bookcases Art 0.3858914
## 6 27.930 2036.86 7755.620 8 Binders Supplies 0.5678741
## MinRatio
## 1 0.0004141233
## 2 0.0049678622
## 3 0.0048254354
## 4 0.0010379531
## 5 0.0202447425
## 6 0.0036012595
With this table we can target customers more precisely