Setup

Step 1: Load and Export the Original Advertising Dataset

This section loads the original Advertising dataset from the instructor’s GitHub link and exports it as a CSV file. The exported file was used as the starting point for generating the randomized dataset.

# Download the original Advertising data straight from the course repository.
advertising_original <- read.csv(
  file = "https://raw.githubusercontent.com/utjimmyx/regression/master/advertising.csv"
)

# Keep a local CSV copy; row names are omitted from the exported file.
write.csv(
  x = advertising_original,
  file = "advertising_export.csv",
  row.names = FALSE
)

# Preview the first rows of the imported data.
head(x = advertising_original)
##   X X1    TV radio newspaper sales
## 1 1  1 230.1  37.8      69.2  22.1
## 2 2  2  44.5  39.3      45.1  10.4
## 3 3  3  17.2  45.9      69.3   9.3
## 4 4  4 151.5  41.3      58.5  18.5
## 5 5  5 180.8  10.8      58.4  12.9
## 6 6  6   8.7  48.9      75.0   7.2
str(advertising_original)
## 'data.frame':    200 obs. of  6 variables:
##  $ X        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ X1       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ TV       : num  230.1 44.5 17.2 151.5 180.8 ...
##  $ radio    : num  37.8 39.3 45.9 41.3 10.8 48.9 32.8 19.6 2.1 2.6 ...
##  $ newspaper: num  69.2 45.1 69.3 58.5 58.4 75 23.5 11.6 1 21.2 ...
##  $ sales    : num  22.1 10.4 9.3 18.5 12.9 7.2 11.8 13.2 4.8 10.6 ...
summary(advertising_original)
##        X                X1               TV             radio       
##  Min.   :  1.00   Min.   :  1.00   Min.   :  0.70   Min.   : 0.000  
##  1st Qu.: 50.75   1st Qu.: 50.75   1st Qu.: 74.38   1st Qu.: 9.975  
##  Median :100.50   Median :100.50   Median :149.75   Median :22.900  
##  Mean   :100.50   Mean   :100.50   Mean   :147.04   Mean   :23.264  
##  3rd Qu.:150.25   3rd Qu.:150.25   3rd Qu.:218.82   3rd Qu.:36.525  
##  Max.   :200.00   Max.   :200.00   Max.   :296.40   Max.   :49.600  
##    newspaper          sales      
##  Min.   :  0.30   Min.   : 1.60  
##  1st Qu.: 12.75   1st Qu.:10.38  
##  Median : 25.75   Median :12.90  
##  Mean   : 30.55   Mean   :14.02  
##  3rd Qu.: 45.10   3rd Qu.:17.40  
##  Max.   :114.00   Max.   :27.00
names(advertising_original)
## [1] "X"         "X1"        "TV"        "radio"     "newspaper" "sales"

Step 2: Load the Randomized Dataset

This section loads the randomized dataset generated from the original Advertising data. The randomized dataset is used for the remaining visualizations and comparisons.

# Read the randomized workbook from the working directory.
advertising_randomized <- read_excel(path = "advertising_randomized.xlsx")

# Lowercase every column name so the rest of the code can refer to the
# columns consistently (tv, radio, newspaper, sales).
advertising_randomized <- setNames(
  advertising_randomized,
  tolower(names(advertising_randomized))
)

# Preview the first rows of the randomized data.
head(x = advertising_randomized)
## # A tibble: 6 × 6
##       x    x1    tv radio newspaper sales
##   <dbl> <dbl> <dbl> <dbl>     <dbl> <dbl>
## 1    17    32 295.  17.6       15.5 19.5 
## 2   197   142 261.  27.6       84.8  9.93
## 3    44   132  10.8 52         66.4 22.3 
## 4    76    81 243.   8.43      21.8 12.8 
## 5    95   136 230.  14.9       49.7 12.7 
## 6    11   186 304.  25.4       60.1 20.3
str(advertising_randomized)
## tibble [200 × 6] (S3: tbl_df/tbl/data.frame)
##  $ x        : num [1:200] 17 197 44 76 95 11 145 95 2 49 ...
##  $ x1       : num [1:200] 32 142 132 81 136 186 176 90 88 204 ...
##  $ tv       : num [1:200] 294.6 260.8 10.8 243.2 230.4 ...
##  $ radio    : num [1:200] 17.64 27.56 52 8.43 14.86 ...
##  $ newspaper: num [1:200] 15.5 84.8 66.4 21.8 49.7 ...
##  $ sales    : num [1:200] 19.49 9.93 22.31 12.84 12.73 ...
summary(advertising_randomized)
##        x                x1               tv             radio      
##  Min.   :  1.00   Min.   :  1.00   Min.   :  0.92   Min.   : 0.71  
##  1st Qu.: 48.75   1st Qu.: 48.75   1st Qu.: 78.54   1st Qu.:11.17  
##  Median : 95.00   Median : 99.50   Median :142.47   Median :22.59  
##  Mean   : 97.08   Mean   :101.64   Mean   :157.50   Mean   :23.25  
##  3rd Qu.:138.25   3rd Qu.:142.25   3rd Qu.:225.94   3rd Qu.:33.12  
##  Max.   :259.00   Max.   :281.00   Max.   :453.17   Max.   :73.64  
##    newspaper         sales      
##  Min.   : 0.30   Min.   : 1.10  
##  1st Qu.:16.40   1st Qu.:10.83  
##  Median :28.34   Median :14.36  
##  Mean   :33.13   Mean   :14.75  
##  3rd Qu.:50.52   3rd Qu.:18.99  
##  Max.   :86.40   Max.   :30.11
names(advertising_randomized)
## [1] "x"         "x1"        "tv"        "radio"     "newspaper" "sales"

Scatterplot Function

The scatterplot code uses a custom function called galactic_scatter() to generate the three required charts in a consistent format. The function takes a dataset and an advertising channel as inputs, then creates a scatterplot comparing that channel to sales. It also adds a linear regression line, confidence band, regression equation, and R² value.

Most of the code focuses on formatting so the charts share the same visual style, including the colors, labels, background, point shape, and title structure. This lets me create the TV, radio, and newspaper charts by calling the same function three times instead of rewriting the full plotting code for each chart.

# Draw a styled scatterplot of one predictor column against a response column.
#
# The two columns are selected from `data`, rows with a missing value in
# either column are dropped, and `lm(y ~ x)` is fitted on what remains. The
# chart shows the points, the fitted line with its confidence band, and a
# label containing the regression equation and R².
#
# Arguments:
#   data   Data frame containing the columns named by `x_var` and `y_var`.
#   x_var  Predictor column name (single string). "tv", "radio" and
#          "newspaper" get dedicated titles and axis labels; any other name
#          is shown as-is.
#   y_var  Response column name (single string). Defaults to "sales".
#
# Returns: a ggplot object.
galactic_scatter <- function(data, x_var, y_var = "sales") {
  # The labels below are built with scalar logic, so reject vectors up front.
  stopifnot(
    is.character(x_var), length(x_var) == 1L,
    is.character(y_var), length(y_var) == 1L
  )

  plot_data <- data %>%
    select(
      x = all_of(x_var),
      y = all_of(y_var)
    ) %>%
    drop_na()

  model <- lm(y ~ x, data = plot_data)

  coefficients_rounded <- round(unname(coef(model)), 2)
  intercept <- coefficients_rounded[1]
  slope <- coefficients_rounded[2]
  r_squared <- round(summary(model)$r.squared, 3)

  # Display names for the known advertising channels.
  channel_titles <- c(tv = "TV", radio = "Radio", newspaper = "Newspaper")
  is_known_channel <- x_var %in% names(channel_titles)

  # Inside the equation and subtitle only "tv" is re-cased; other names are
  # shown exactly as passed in.
  display_x <- if (isTRUE(x_var == "tv")) "TV" else x_var

  # Response wording: keep the original "Sales" text for the default response
  # and fall back to the column name for any other response.
  response_is_sales <- isTRUE(y_var == "sales")
  y_title <- if (response_is_sales) "Sales" else y_var
  y_phrase <- if (response_is_sales) "sales performance" else y_var

  # Show a negative slope as "a - b(x)" instead of "a + -b(x)". The sign is
  # taken from the rounded value so a slope that rounds to zero prints "+ 0";
  # isTRUE() keeps an NA slope (degenerate fit) from raising an error here.
  slope_sign <- if (isTRUE(slope < 0)) " - " else " + "

  equation_label <- paste0(
    "Regression Equation:\n",
    y_var, " = ", intercept, slope_sign, abs(slope), "(", display_x, ")\n",
    "R² = ", r_squared
  )

  if (is_known_channel) {
    x_label <- paste(channel_titles[[x_var]], "Advertising Budget")
    chart_title <- paste("Relationship Between", x_label, "and", y_title)
    chart_subtitle <- paste0(
      "Linear regression trend showing the association between ",
      display_x, " ad spending and ", y_phrase
    )
  } else {
    x_label <- x_var
    chart_title <- paste("Relationship Between", x_var, "and", y_title)
    chart_subtitle <- paste(
      "Linear regression trend showing the association with", y_phrase
    )
  }

  ggplot(plot_data, aes(x = x, y = y)) +

    # Confidence band plus a darker line that acts as an outline for the
    # brighter line drawn on top of it by the next layer.
    geom_smooth(
      method = "lm",
      formula = y ~ x,
      se = TRUE,
      color = "#B7791F",
      fill = "#A78BFA",
      linewidth = 1.4,
      alpha = 0.30
    ) +

    geom_smooth(
      method = "lm",
      formula = y ~ x,
      se = FALSE,
      color = "#FFB703",
      linewidth = 2.1
    ) +

    geom_point(
      color = "#1E3A8A",
      shape = 8,
      size = 1.8,
      alpha = 0.65
    ) +

    # Anchor the equation box to the bottom-right corner of the panel; the
    # hjust/vjust offsets nudge it inside the plotting area.
    annotate(
      "label",
      x = Inf,
      y = -Inf,
      label = equation_label,
      hjust = 1.05,
      vjust = -0.35,
      size = 4,
      color = "#111827",
      fill = "#FFF7D6",
      label.size = 0.4
    ) +

    labs(
      title = chart_title,
      subtitle = chart_subtitle,
      x = x_label,
      y = y_title
    ) +

    theme_minimal(base_size = 14) +

    theme(
      plot.background = element_rect(fill = "#EAF2FF", color = NA),
      panel.background = element_rect(fill = "#F8FAFC", color = NA),
      panel.grid.major = element_line(color = "#CBD5E1", linewidth = 0.45),
      panel.grid.minor = element_line(color = "#E2E8F0", linewidth = 0.25),

      plot.title = element_text(
        color = "#1E293B",
        face = "bold",
        size = 17
      ),
      plot.subtitle = element_text(
        color = "#475569",
        size = 10
      ),
      axis.title = element_text(
        color = "#1E293B",
        face = "bold"
      ),
      axis.text = element_text(
        color = "#334155"
      )
    )
}

Create the Three Scatterplots

# One chart per advertising channel, all built from the randomized dataset.
plot_TV <- galactic_scatter(data = advertising_randomized, x_var = "tv")
plot_radio <- galactic_scatter(data = advertising_randomized, x_var = "radio")
plot_newspaper <- galactic_scatter(
  data = advertising_randomized,
  x_var = "newspaper"
)

TV Advertising Budget and Sales

plot_TV

Radio Advertising Budget and Sales

plot_radio

Newspaper Advertising Budget and Sales

plot_newspaper

Save Chart Images

# Write each chart to a PNG in the working directory (8 x 6 at 300 dpi).
ggsave(
  filename = "TV_vs_sales_randomized.png",
  plot = plot_TV,
  width = 8, height = 6, dpi = 300
)
ggsave(
  filename = "radio_vs_sales_randomized.png",
  plot = plot_radio,
  width = 8, height = 6, dpi = 300
)
ggsave(
  filename = "newspaper_vs_sales_randomized.png",
  plot = plot_newspaper,
  width = 8, height = 6, dpi = 300
)

Normalized Faceted Comparison Visualization

The faceted chart below normalizes each advertising channel from 0 to 1. This allows TV, radio, and newspaper advertising to be compared on the same relative spending scale. The formatting matches the theme of the earlier scatterplots and keeps the same SW-inspired flair and style.

# Reshape to one row per observation and channel, rescale each channel's
# budget to the 0-1 range, and give the channels display names.
advertising_long_normalized <- local({
  channel_labels <- c(tv = "TV", radio = "Radio", newspaper = "Newspaper")

  # Stack the three budget columns into channel / budget pairs.
  long_budgets <- pivot_longer(
    advertising_randomized,
    cols = c(tv, radio, newspaper),
    names_to = "channel",
    values_to = "budget"
  )

  # Min-max scale within each channel, then drop the grouping again.
  scaled_budgets <- ungroup(
    mutate(
      group_by(long_budgets, channel),
      budget_normalized = (budget - min(budget, na.rm = TRUE)) /
        (max(budget, na.rm = TRUE) - min(budget, na.rm = TRUE))
    )
  )

  # Look up the display name; values without an entry keep their original text.
  mutate(
    scaled_budgets,
    channel = coalesce(unname(channel_labels[channel]), channel)
  )
})

# Faceted comparison: one panel per channel, sales against the 0-1 scaled
# budget, using the same colors and theme as the single-channel scatterplots.
ggplot(
  data = advertising_long_normalized,
  mapping = aes(x = budget_normalized, y = sales)
) +
  # Confidence band plus a darker line underneath the brighter one below.
  geom_smooth(
    method = "lm", se = TRUE,
    linewidth = 1.2, alpha = 0.30,
    color = "#B7791F", fill = "#A78BFA"
  ) +
  # Brighter, thicker fitted line drawn over the previous layer.
  geom_smooth(method = "lm", se = FALSE, linewidth = 1.8, color = "#FFB703") +
  geom_point(shape = 8, size = 1.5, alpha = 0.60, color = "#1E3A8A") +
  # One panel per advertising channel.
  facet_wrap(facets = vars(channel)) +
  labs(
    x = "Normalized Advertising Budget",
    y = "Sales",
    title = "Normalized Advertising Budget and Sales by Channel",
    subtitle = "Each channel is scaled from 0 = lowest spend to 1 = highest spend using the randomized dataset"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    # Text elements.
    plot.title = element_text(size = 17, face = "bold", color = "#1E293B"),
    plot.subtitle = element_text(size = 10, color = "#475569"),
    strip.text = element_text(size = 13, face = "bold", color = "#1E293B"),
    axis.title = element_text(face = "bold", color = "#1E293B"),
    axis.text = element_text(color = "#334155"),
    # Backgrounds and grid lines.
    plot.background = element_rect(fill = "#EAF2FF", color = NA),
    panel.background = element_rect(fill = "#F8FAFC", color = NA),
    panel.grid.major = element_line(color = "#CBD5E1", linewidth = 0.45),
    panel.grid.minor = element_line(color = "#E2E8F0", linewidth = 0.25)
  )

Observations

Use this section to write your own brief observations from the plots.