library(datasetsICR)
# Load the `customers` data set into the workspace and take a first look at it.
# Lines starting with `#>` are the results printed by the call above them; they
# are kept as comments so the script still parses and runs.
data("customers")

# First ten rows.
head(customers, 10)
#>    Channel Region Fresh  Milk Grocery Frozen Detergents_Paper Delicassen
#> 1        2      3 12669  9656    7561    214             2674       1338
#> 2        2      3  7057  9810    9568   1762             3293       1776
#> 3        2      3  6353  8808    7684   2405             3516       7844
#> 4        1      3 13265  1196    4221   6404              507       1788
#> 5        2      3 22615  5410    7198   3915             1777       5185
#> 6        2      3  9413  8259    5126    666             1795       1451
#> 7        2      3 12126  3199    6975    480             3140        545
#> 8        2      3  7579  4956    9426   1669             3321       2566
#> 9        1      3  5963  3648    6192    425             1716        750
#> 10       2      3  6006 11093   18881   1159             7425       2098

# Last ten rows.
tail(customers, 10)
#>     Channel Region Fresh  Milk Grocery Frozen Detergents_Paper Delicassen
#> 431       1      3  3097  4230   16483    575              241       2080
#> 432       1      3  8533  5506    5160  13486             1377       1498
#> 433       1      3 21117  1162    4754    269             1328        395
#> 434       1      3  1982  3218    1493   1541              356       1449
#> 435       1      3 16731  3922    7994    688             2371        838
#> 436       1      3 29703 12051   16027  13135              182       2204
#> 437       1      3 39228  1431     764   4510               93       2346
#> 438       2      3 14531 15488   30243    437            14841       1867
#> 439       1      3 10290  1981    2232   1038              168       2125
#> 440       1      3  2787  1698    2510     65              477         52

# Column types: all eight variables are stored as integers at this point.
str(customers)
#> 'data.frame': 440 obs. of 8 variables:
#>  $ Channel         : int 2 2 2 1 2 2 2 2 1 2 ...
#>  $ Region          : int 3 3 3 3 3 3 3 3 3 3 ...
#>  $ Fresh           : int 12669 7057 6353 13265 22615 9413 12126 7579 5963 6006 ...
#>  $ Milk            : int 9656 9810 8808 1196 5410 8259 3199 4956 3648 11093 ...
#>  $ Grocery         : int 7561 9568 7684 4221 7198 5126 6975 9426 6192 18881 ...
#>  $ Frozen          : int 214 1762 2405 6404 3915 666 480 1669 425 1159 ...
#>  $ Detergents_Paper: int 2674 3293 3516 507 1777 1795 3140 3321 1716 7425 ...
#>  $ Delicassen      : int 1338 1776 7844 1788 5185 1451 545 2566 750 2098 ...
# Summary Stats
# Six-number summaries of the two spending columns used in the plots below.
# `#>` lines are the printed results, kept as comments so the script parses.
summary(customers$Frozen)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
#>    25.0   742.2  1526.0  3071.9  3554.2 60869.0

summary(customers$Delicassen)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
#>     3.0   408.2   965.5  1524.9  1820.2 47943.0
# Converting categorical variables
# Channel and Region hold category codes, so turn them into factors; this lets
# ggplot2 map Region to a discrete colour scale instead of a continuous one.
customers$Channel <- as.factor(customers$Channel)
customers$Region <- as.factor(customers$Region)

# Table
# Number of customers per Region level. `#>` lines are the printed result,
# kept as comments so the script parses.
table(customers$Region)
#>   1   2   3
#>  77  47 316
library(ggplot2)
# Simple Scatterplot
# Frozen spending on the x axis, Delicassen spending on the y axis, with the
# point colour taken from Region.
ggplot(customers, aes(Frozen, Delicassen, color = Region)) +
  geom_point()

# Scatterplot with trend lines
ggplot(
  data = customers,
  mapping = aes(x = Frozen, y = Delicassen, color = Region)
) +
  # Semi-transparent points so overlapping observations remain visible.
  geom_point(alpha = 0.5) +
  # Loess trend lines. The formula is spelled out so ggplot2 does not print
  # "`geom_smooth()` using formula = 'y ~ x'" on every render; it is the same
  # formula that message reports, so the fitted curves are unchanged.
  geom_smooth(method = "loess", formula = y ~ x)
# Final Scatterplot
# Same mapping as the earlier plots, with a log10 x axis and full labelling.
ggplot(
  data = customers,
  mapping = aes(x = Frozen, y = Delicassen, color = Region)
) +
  # Lighter points than before so the trend lines stand out.
  geom_point(alpha = 0.3) +
  # Explicit formula: identical to the one ggplot2 reports choosing by default
  # ("`geom_smooth()` using formula = 'y ~ x'"), but stated here so that message
  # is no longer printed.
  geom_smooth(method = "loess", formula = y ~ x) +
  # Log-scale the x axis only; the y axis stays linear.
  scale_x_log10() +
  labs(
    title = "Customer Spending for Frozen vs Delicassen products by Region",
    subtitle = "For Region 1, higher spending on frozen items is associated with higher spending in the deli.",
    caption = "Source: datasetsICR package",
    x = "Frozen Spending (shown in log base 10)",
    y = "Delicassen Spending"
  )