LAB 8 - REFINE YOUR PLOTS

Author

Pamela Carta Jurado

Dataset Customers

Original Scatterplot (Lab 3)

library(datasetsICR)
library(ggplot2)
# Load the `customers` data set into the session
# (presumably shipped with the datasetsICR package attached above - confirm).
data(customers)

# Preview the first 10 rows of the dataset.
head(customers, n = 10)
   Channel Region Fresh  Milk Grocery Frozen Detergents_Paper Delicassen
1        2      3 12669  9656    7561    214             2674       1338
2        2      3  7057  9810    9568   1762             3293       1776
3        2      3  6353  8808    7684   2405             3516       7844
4        1      3 13265  1196    4221   6404              507       1788
5        2      3 22615  5410    7198   3915             1777       5185
6        2      3  9413  8259    5126    666             1795       1451
7        2      3 12126  3199    6975    480             3140        545
8        2      3  7579  4956    9426   1669             3321       2566
9        1      3  5963  3648    6192    425             1716        750
10       2      3  6006 11093   18881   1159             7425       2098
# Preview the last 10 rows of the dataset.
tail(customers, n = 10)
    Channel Region Fresh  Milk Grocery Frozen Detergents_Paper Delicassen
431       1      3  3097  4230   16483    575              241       2080
432       1      3  8533  5506    5160  13486             1377       1498
433       1      3 21117  1162    4754    269             1328        395
434       1      3  1982  3218    1493   1541              356       1449
435       1      3 16731  3922    7994    688             2371        838
436       1      3 29703 12051   16027  13135              182       2204
437       1      3 39228  1431     764   4510               93       2346
438       2      3 14531 15488   30243    437            14841       1867
439       1      3 10290  1981    2232   1038              168       2125
440       1      3  2787  1698    2510     65              477         52
# Compact overview of the data frame: dimensions, column names and types.
str(object = customers)
'data.frame':   440 obs. of  8 variables:
 $ Channel         : int  2 2 2 1 2 2 2 2 1 2 ...
 $ Region          : int  3 3 3 3 3 3 3 3 3 3 ...
 $ Fresh           : int  12669 7057 6353 13265 22615 9413 12126 7579 5963 6006 ...
 $ Milk            : int  9656 9810 8808 1196 5410 8259 3199 4956 3648 11093 ...
 $ Grocery         : int  7561 9568 7684 4221 7198 5126 6975 9426 6192 18881 ...
 $ Frozen          : int  214 1762 2405 6404 3915 666 480 1669 425 1159 ...
 $ Detergents_Paper: int  2674 3293 3516 507 1777 1795 3140 3321 1716 7425 ...
 $ Delicassen      : int  1338 1776 7844 1788 5185 1451 545 2566 750 2098 ...
# Five-number summary (plus mean) of the Frozen column.
summary(customers[["Frozen"]])
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   25.0   742.2  1526.0  3071.9  3554.2 60869.0 
# Five-number summary (plus mean) of the Milk column.
summary(customers[["Milk"]])
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
     55    1533    3627    5796    7190   73498 
# Discretise Frozen into four equal-width intervals with readable labels.
# NOTE: the numeric Frozen column is replaced by the binned factor, so every
# plot below shows Frozen as a categorical x axis.
frozen_labels <- c("Low", "Medium-Low", "Medium-High", "High")
bins <- cut(
  customers[["Frozen"]],
  breaks = 4,
  labels = frozen_labels
)
customers[["Frozen"]] <- factor(bins)

# Treat the integer Region code as a categorical variable.
customers[["Region"]] <- factor(customers[["Region"]])

# Baseline plot: Milk against (binned) Frozen, points coloured by Region.
ggplot(
  data = customers,
  mapping = aes(x = Frozen, y = Milk, colour = Region)
) +
  geom_point() +
  theme(legend.position = "right") +
  labs(
    title = "Scatter Plot of Milk vs. Frozen Variables by Region",
    subtitle = "Relationship between Milk and Frozen Variables with Regional Distribution",
    x = "Frozen Variable",
    y = "Milk Variable",
    caption = "Data Source: customers dataset"
  )

8.1: Data-Ink Ratio and Chartjunk

Refinement 1: Removing Chartjunk

Intent: Remove unnecessary clutter to focus attention on the data. Rationale: Eliminating non-essential elements enhances the clarity of the plot and makes it easier for viewers to interpret the data.

# Same plot without the subtitle, drawn on the lighter theme_minimal().
ggplot(
  data = customers,
  mapping = aes(x = Frozen, y = Milk, colour = Region)
) +
  geom_point() +
  theme_minimal() +
  theme(legend.position = "right") +
  labs(
    title = "Scatter Plot of Milk vs. Frozen Variables by Region",
    x = "Frozen Variable",
    y = "Milk Variable",
    caption = "Data Source: customers dataset"
  )

Refinement 2: Increasing Data-Ink Ratio

Intent: Maximize the data-ink ratio (the share of the plot's ink that is used to present data) to convey information more efficiently. Rationale: By reducing the amount of non-data ink and emphasizing the data points, viewers can focus more on the relationships within the data.

# Semi-transparent points on theme_classic() (axis lines only, no grid).
ggplot(
  data = customers,
  mapping = aes(x = Frozen, y = Milk, colour = Region)
) +
  geom_point(alpha = 0.6) +
  theme_classic() +
  theme(legend.position = "right") +
  labs(
    title = "Scatter Plot of Milk vs. Frozen Variables by Region",
    subtitle = "Relationship between Milk and Frozen Variables with Regional Distribution",
    x = "Frozen Variable",
    y = "Milk Variable",
    caption = "Data Source: customers dataset"
  )

These refinements aim to enhance the clarity and interpretability of the scatter plot. By removing unnecessary elements and increasing the emphasis on data points, viewers can better discern the relationships between milk and frozen variables across different regions.

8.2: Data Density and Overplotting

Refinement 1: Adjusting Point Size

Intent: Control overplotting by adjusting the size of data points. Rationale: Choosing a fixed point size suited to the density of the data, combined with partial transparency, mitigates overplotting and improves the visualization of dense regions.

# Semi-transparent points with an explicit size.
# Note: size = 1.5 equals ggplot2's default point size, so only alpha differs
# from a plain geom_point().
ggplot(
  data = customers,
  mapping = aes(x = Frozen, y = Milk, colour = Region)
) +
  geom_point(size = 1.5, alpha = 0.6) +
  theme_minimal() +
  theme(legend.position = "right") +
  labs(
    title = "Scatter Plot of Milk vs. Frozen Variables by Region",
    x = "Frozen Variable",
    y = "Milk Variable",
    caption = "Data Source: customers dataset"
  )

Refinement 2: Adding Jitter

Intent: Introduce random jitter to data points to reduce overplotting. Rationale: By adding a small amount of random variation to the data points, we can prevent them from completely overlapping, making it easier to identify patterns.

# Jittered, semi-transparent points: a small random offset (0.1 in each
# direction, in data units) spreads points that share an x position.
ggplot(
  data = customers,
  mapping = aes(x = Frozen, y = Milk, colour = Region)
) +
  geom_point(
    position = position_jitter(width = 0.1, height = 0.1),
    alpha = 0.6
  ) +
  theme_minimal() +
  theme(legend.position = "right") +
  labs(
    title = "Scatter Plot of Milk vs. Frozen Variables by Region",
    x = "Frozen Variable",
    y = "Milk Variable",
    caption = "Data Source: customers dataset"
  )

These refinements address issues related to data density and overplotting. By adjusting point size and adding jitter, the plots provide a clearer representation of the distribution of milk and frozen variables across different regions, helping viewers identify patterns and trends more effectively.

8.3 Refinement:

Intent: To enhance the visual appeal and clarity of the scatter plot by adjusting the point size and adding transparency to overlapping points. Rationale: By adjusting the point size and adding transparency, we can alleviate the issue of overplotting and provide a clearer visualization of individual data points.

# Points drawn at size 2 with 60% opacity on the default theme.
ggplot(
  data = customers,
  mapping = aes(x = Frozen, y = Milk, colour = Region)
) +
  geom_point(alpha = 0.6, size = 2) +
  theme(legend.position = "right") +
  labs(
    title = "Scatter Plot of Milk vs. Frozen Variables by Region",
    subtitle = "Relationship between Milk and Frozen Variables with Regional Distribution",
    x = "Frozen Variable",
    y = "Milk Variable",
    caption = "Data Source: customers dataset"
  )

This refined plot uses slightly larger points (size 2, compared with ggplot2's default of 1.5) with added transparency (alpha 0.6), making it easier to distinguish between overlapping data points. It improves the clarity of the visualization, allowing viewers to identify patterns more effectively.

8.4 Refinement:

Intent: To enhance the plot’s readability, add a grid to aid in assessing the distribution of data points. Rationale: Adding a grid helps viewers assess the distribution of data points more accurately, improving the plot’s readability and facilitating data interpretation.

# Scatterplot with dashed major grid lines and a linear trend per region.
#
# Frozen is a factor at this point (it was binned with cut() earlier), so
# ggplot2's default grouping splits the data by every Frozen x Region
# combination. Each such group has a single x position, no line can be
# fitted, and the smoother draws nothing. Grouping the smoother by Region
# alone lets lm() fit one line per region across the ordered Frozen bins.
ggplot(customers, aes(x = Frozen, y = Milk, color = Region)) +
  geom_point(size = 3) +
  geom_smooth(
    aes(group = Region),
    method = "lm",
    formula = y ~ x,  # stated explicitly; this is the default for "lm"
    se = TRUE
  ) +
  labs(
    x = "Frozen Variable",
    y = "Milk Variable",
    title = "Scatter Plot of Milk vs. Frozen Variables by Region",
    subtitle = "Relationship between Milk and Frozen Variables with Regional Distribution",
    caption = "Data Source: customers dataset"
  ) +
  theme(
    legend.position = "right",
    # Dashed gray major grid lines act as reference marks for reading values.
    panel.grid.major = element_line(color = "gray", linetype = "dashed")
  )

This refined plot incorporates grid lines, enhancing the readability of the visualization by providing reference points for assessing the distribution of data points. The grid lines aid viewers in understanding the relationship between Milk and Frozen variables within each region more effectively.