LAB 3.1 - MAKE A PLOT

Author

Pamela Carta Jurado

Dataset Customers

library(datasetsICR)
library(ggplot2)
data(customers)

# Preview: the first 10 rows of the dataset.
head(customers, n = 10L)
   Channel Region Fresh  Milk Grocery Frozen Detergents_Paper Delicassen
1        2      3 12669  9656    7561    214             2674       1338
2        2      3  7057  9810    9568   1762             3293       1776
3        2      3  6353  8808    7684   2405             3516       7844
4        1      3 13265  1196    4221   6404              507       1788
5        2      3 22615  5410    7198   3915             1777       5185
6        2      3  9413  8259    5126    666             1795       1451
7        2      3 12126  3199    6975    480             3140        545
8        2      3  7579  4956    9426   1669             3321       2566
9        1      3  5963  3648    6192    425             1716        750
10       2      3  6006 11093   18881   1159             7425       2098
# Preview: the last 10 rows of the dataset.
tail(customers, n = 10L)
    Channel Region Fresh  Milk Grocery Frozen Detergents_Paper Delicassen
431       1      3  3097  4230   16483    575              241       2080
432       1      3  8533  5506    5160  13486             1377       1498
433       1      3 21117  1162    4754    269             1328        395
434       1      3  1982  3218    1493   1541              356       1449
435       1      3 16731  3922    7994    688             2371        838
436       1      3 29703 12051   16027  13135              182       2204
437       1      3 39228  1431     764   4510               93       2346
438       2      3 14531 15488   30243    437            14841       1867
439       1      3 10290  1981    2232   1038              168       2125
440       1      3  2787  1698    2510     65              477         52
# Compact overview of the data frame: dimensions, column names and types.
utils::str(customers)
'data.frame':   440 obs. of  8 variables:
 $ Channel         : int  2 2 2 1 2 2 2 2 1 2 ...
 $ Region          : int  3 3 3 3 3 3 3 3 3 3 ...
 $ Fresh           : int  12669 7057 6353 13265 22615 9413 12126 7579 5963 6006 ...
 $ Milk            : int  9656 9810 8808 1196 5410 8259 3199 4956 3648 11093 ...
 $ Grocery         : int  7561 9568 7684 4221 7198 5126 6975 9426 6192 18881 ...
 $ Frozen          : int  214 1762 2405 6404 3915 666 480 1669 425 1159 ...
 $ Detergents_Paper: int  2674 3293 3516 507 1777 1795 3140 3321 1716 7425 ...
 $ Delicassen      : int  1338 1776 7844 1788 5185 1451 545 2566 750 2098 ...
# Five-number summary (plus mean) of the two variables used in the plots.
# Frozen is still numeric at this point; it is binned further below.
summary(customers[["Frozen"]])
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   25.0   742.2  1526.0  3071.9  3554.2 60869.0 
summary(customers[["Milk"]])
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
     55    1533    3627    5796    7190   73498 
# Discretise Frozen into four ordered categories.
# With a single number for `breaks`, cut() splits the observed range into
# that many equal-width intervals (not equal-count groups).
frozen_labels <- c("Low", "Medium-Low", "Medium-High", "High")
bins <- cut(
  customers[["Frozen"]],
  breaks = length(frozen_labels),
  labels = frozen_labels
)

# NOTE: this overwrites the original numeric Frozen column with the factor;
# factor() additionally drops any level that ended up with no observations.
customers[["Frozen"]] <- factor(bins)

# Table
# Frequency table: number of observations in each level of Frozen.
# The result is kept in `table_customers` and printed explicitly, because a
# bare assignment produces no output and the counts would otherwise never
# appear in the report.
table_customers <- table(customers$Frozen)
print(table_customers)

# Scatterplot
# One point per observation: Milk (y) against the Frozen level (x), with the
# points coloured by the same Frozen level.
ggplot(data = customers, mapping = aes(Frozen, Milk, colour = Frozen)) +
  geom_point() +
  labs(
    title = "Scatter Plot of Milk vs. Frozen Variables",
    subtitle = "Relationship between Milk and Frozen Variables with Categorical Levels",
    x = "Frozen Variable",
    y = "Milk Variable",
    caption = "Data Source: customers dataset"
  ) +
  theme(legend.position = "right")

# Bar Plot
# Mean Milk for every level of Frozen: one row per level, columns Frozen and
# Milk.
agg_data <- aggregate(Milk ~ Frozen, customers, mean)

# Plot
# geom_col() draws bar heights straight from the y values, i.e. the
# pre-computed means.
ggplot(data = agg_data, mapping = aes(Frozen, Milk, fill = Frozen)) +
  geom_col(position = "dodge") +
  labs(
    title = "Bar Plot of Mean Milk vs. Frozen Variables",
    subtitle = "Relationship between Mean Milk and Frozen Variables with Categorical Levels",
    x = "Frozen Variable",
    y = "Mean Milk",
    caption = "Data Source: customers dataset"
  ) +
  theme(legend.position = "right")

# Violin Plot
# Density shape of the Milk values within each Frozen level.
ggplot(data = customers, mapping = aes(Frozen, Milk, fill = Frozen)) +
  geom_violin() +
  labs(
    title = "Violin Plot of Milk vs. Frozen Variables",
    subtitle = "Distribution of Milk Values across Frozen Variable Levels",
    x = "Frozen Variable",
    y = "Milk Variable",
    caption = "Data Source: customers dataset"
  ) +
  theme(legend.position = "right")

# Line plot
# Mean Milk per Frozen level (from agg_data). group = 1 puts all levels in a
# single group so that one line connects the categorical x positions.
ggplot(data = agg_data, mapping = aes(Frozen, Milk, group = 1)) +
  geom_line(colour = "blue") +
  geom_point(colour = "red") +
  labs(
    title = "Line Plot of Mean Milk vs. Frozen Variables",
    subtitle = "Relationship between Mean Milk and Frozen Variables with Categorical Levels",
    x = "Frozen Variable",
    y = "Mean Milk",
    caption = "Data Source: customers dataset"
  ) +
  theme_minimal()

The scatter plot visually represents the relationship between the “Milk” and “Frozen” variables, with each dot colored according to the levels of the categorical variable “Frozen.” The x-axis represents the “Frozen” variable, which has been categorized into four levels (“Low,” “Medium-Low,” “Medium-High,” and “High”) based on its numerical values. The y-axis represents the “Milk” variable, showing the value for each observation. The legend identifies the color assigned to each level, providing a clear distinction between them. The chart shows whether there is any discernible pattern or trend in the “Milk” values across the categories of the “Frozen” variable. The subtitle emphasizes that the primary insight revolves around understanding the relationship between the “Milk” and “Frozen” variables across categorical levels.

END