Lab

Author

Katlyn Collins

Loading the Dataset

# Attach datasetsICR, then load the `customers` data set into the workspace.
library("datasetsICR")
data("customers")

First 10 rows

# Preview the first ten customers.
head(customers, n = 10L)
   Channel Region Fresh  Milk Grocery Frozen Detergents_Paper Delicassen
1        2      3 12669  9656    7561    214             2674       1338
2        2      3  7057  9810    9568   1762             3293       1776
3        2      3  6353  8808    7684   2405             3516       7844
4        1      3 13265  1196    4221   6404              507       1788
5        2      3 22615  5410    7198   3915             1777       5185
6        2      3  9413  8259    5126    666             1795       1451
7        2      3 12126  3199    6975    480             3140        545
8        2      3  7579  4956    9426   1669             3321       2566
9        1      3  5963  3648    6192    425             1716        750
10       2      3  6006 11093   18881   1159             7425       2098

Last 10 rows

# Preview the last ten customers.
tail(customers, n = 10L)
    Channel Region Fresh  Milk Grocery Frozen Detergents_Paper Delicassen
431       1      3  3097  4230   16483    575              241       2080
432       1      3  8533  5506    5160  13486             1377       1498
433       1      3 21117  1162    4754    269             1328        395
434       1      3  1982  3218    1493   1541              356       1449
435       1      3 16731  3922    7994    688             2371        838
436       1      3 29703 12051   16027  13135              182       2204
437       1      3 39228  1431     764   4510               93       2346
438       2      3 14531 15488   30243    437            14841       1867
439       1      3 10290  1981    2232   1038              168       2125
440       1      3  2787  1698    2510     65              477         52

Structure of Dataset

# Compact overview: dimensions, column names, column types and leading values.
str(object = customers)
'data.frame':   440 obs. of  8 variables:
 $ Channel         : int  2 2 2 1 2 2 2 2 1 2 ...
 $ Region          : int  3 3 3 3 3 3 3 3 3 3 ...
 $ Fresh           : int  12669 7057 6353 13265 22615 9413 12126 7579 5963 6006 ...
 $ Milk            : int  9656 9810 8808 1196 5410 8259 3199 4956 3648 11093 ...
 $ Grocery         : int  7561 9568 7684 4221 7198 5126 6975 9426 6192 18881 ...
 $ Frozen          : int  214 1762 2405 6404 3915 666 480 1669 425 1159 ...
 $ Detergents_Paper: int  2674 3293 3516 507 1777 1795 3140 3321 1716 7425 ...
 $ Delicassen      : int  1338 1776 7844 1788 5185 1451 545 2566 750 2098 ...

Summary Statistics

# Five-number summary plus mean for Frozen, the x variable of the plots.
summary(customers[["Frozen"]])
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   25.0   742.2  1526.0  3071.9  3554.2 60869.0 
# Five-number summary plus mean for Delicassen, the y variable of the plots.
summary(customers[["Delicassen"]])
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    3.0   408.2   965.5  1524.9  1820.2 47943.0 

Table

# Count how many customers fall into each Region code.
table(customers[["Region"]])

  1   2   3 
 77  47 316 
library(ggplot2)

# Store a categorical copy of the integer Region code in a separate `region`
# column; mapping a factor to colour gives a discrete legend.
customers[["region"]] <- as.factor(customers[["Region"]])

# Delicassen against Frozen, coloured by region, with a log10 x axis.
ggplot(
  data = customers,
  mapping = aes(x = Frozen, y = Delicassen, color = region)
) +
  geom_point() +
  scale_x_log10()

# Convert Region itself to a factor so it can be used as a discrete colour.
customers[["Region"]] <- as.factor(customers[["Region"]])

# Scatterplot on the raw scales with an automatically chosen smoother and its
# confidence band; the colour mapping is inherited by the smoother layer.
ggplot(
  data = customers,
  mapping = aes(x = Frozen, y = Delicassen, color = Region)
) +
  geom_point() +
  geom_smooth(se = TRUE, method = "auto")
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Re-applying as.factor() to Region is harmless and keeps this chunk runnable
# on its own.
customers[["Region"]] <- as.factor(customers[["Region"]])

# Same scatterplot with both axes on a log10 scale.
ggplot(
  data = customers,
  mapping = aes(x = Frozen, y = Delicassen, color = Region)
) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10()

library(ggplot2)


# Final, labelled version of the Frozen vs Deli scatterplot.
#
# Region is converted to a factor so it maps to a discrete colour legend;
# repeating the conversion is harmless and keeps this chunk self-contained.
customers$Region <- as.factor(customers$Region)

ggplot(customers, aes(x = Frozen, y = Delicassen, color = Region)) +
  geom_point() +
  # Both axes use a log10 scale, which the axis labels state explicitly.
  scale_x_log10() +
  scale_y_log10() +
  labs(
    x = "Frozen (log scale)",
    y = "Deli (log scale)",
    title = "Frozen vs Deli",
    # Each point is one customer row; Region is shown only through colour.
    subtitle = "Each point is one customer, colored by Region",
    caption = "source: customers."
  )

Findings

Based on my plots, you can see that customers in Region 3 tend to spend more on Deli items when they spend more on Frozen items. Customers in Regions 1 and 2 don’t increase their spending on Deli items when they spend more on Frozen items; their Deli purchases remain roughly constant.