Exploratory Data Analysis

Karol Orozco, Jullian Schrup, Maribel Mendez

10/19/2021


The Data

Big Idea

We plan to earn customer loyalty and increase revenue through machine-readable marketing data, which allows us to create a user behavior profile and more accurately predict customer behavior to use in forecasting future revenue.

Exploratory Data Analysis

Import Data

marketing_campaign_original <-read_excel("Mark_Campaign_Clean.xlsx")

Convert data to tibble

marketing_campaign <- as_tibble(marketing_campaign_original)
marketing_campaign
## # A tibble: 2,233 x 33
##    Year_Birth   Age Age_Grp Education  Marital_Status Income Kidhome Teenhome
##         <dbl> <dbl> <chr>   <chr>      <chr>           <dbl>   <dbl>    <dbl>
##  1       1957    57 Boomer  Graduation Single          58138       0        0
##  2       1954    60 Boomer  Graduation Single          46344       1        1
##  3       1965    49 GenX    Graduation Together        71613       0        0
##  4       1984    30 GenY    Graduation Together        26646       1        0
##  5       1981    33 GenY    PhD        Married         58293       1        0
##  6       1967    47 GenX    Master     Together        62513       0        1
##  7       1971    43 GenX    Graduation Divorced        55635       0        1
##  8       1985    29 GenY    PhD        Married         33454       1        0
##  9       1974    40 GenX    PhD        Together        30351       1        0
## 10       1950    64 Boomer  PhD        Together         5648       1        1
## # ... with 2,223 more rows, and 25 more variables: LastSiteInteraction <dttm>,
## #   Recency <dbl>, Avg_Purchase_Per_Order <dbl>, Orders_Placed <dbl>,
## #   Num_Purchased_Items <dbl>, MntWines <dbl>, MntFruits <dbl>,
## #   MntMeatProducts <dbl>, MntFishProducts <dbl>, MntSweetProducts <dbl>,
## #   MntGoldProds <dbl>, NumDealsPurchases <dbl>, NumWebPurchases <dbl>,
## #   NumCatalogPurchases <dbl>, NumStorePurchases <dbl>,
## #   NumWebVisitsMonth <dbl>, AcceptedCmp3 <dbl>, AcceptedCmp4 <dbl>, ...

Create a 3-way contingency table: Marital Status, Education, Age Group

table(marketing_campaign$Marital_Status, marketing_campaign$Education, marketing_campaign$Age_Grp)
## , ,  = Boomer
## 
##           
##            2n Cycle Basic Graduation Master PhD
##   Divorced        4     0         29      8  12
##   Married        12     1         82     35  46
##   Single          5     1         46     13  25
##   Together        8     2         70     36  38
##   Widow           2     0         14      9  16
## 
## , ,  = GenX
## 
##           
##            2n Cycle Basic Graduation Master PhD
##   Divorced       14     1         70     24  38
##   Married        37     8        212     74 110
##   Single          9     1        113     40  48
##   Together       23     6        135     54  59
##   Widow           3     1         20      3   8
## 
## , ,  = GenY
## 
##           
##            2n Cycle Basic Graduation Master PhD
##   Divorced        4     0         20      3   2
##   Married        27     6        119     29  35
##   Single         18    11         66     18  22
##   Together       23     6         75     14  15
##   Widow           0     0          1      0   0
## 
## , ,  = Genz
## 
##           
##            2n Cycle Basic Graduation Master PhD
##   Divorced        0     0          0      2   0
##   Married         4     4         19      0   1
##   Single          4     5         27      4   3
##   Together        3     0          6      2   4
##   Widow           0     0          0      0   0
## 
## , ,  = Minor
## 
##           
##            2n Cycle Basic Graduation Master PhD
##   Divorced        0     0          0      0   0
##   Married         1     1          0      0   0
##   Single          0     0          0      0   0
##   Together        0     0          0      0   0
##   Widow           0     0          0      0   0
## 
## , ,  = Silent
## 
##           
##            2n Cycle Basic Graduation Master PhD
##   Divorced        1     0          0      0   0
##   Married         0     0          1      0   0
##   Single          1     0          0      0   0
##   Together        0     0          0      0   1
##   Widow           0     0          0      0   0

Dropping levels

tab <- table(marketing_campaign$Marital_Status, marketing_campaign$Education, marketing_campaign$Age_Grp)

## Remove Absurd, Alone and YOLO, Minor, Silent

marketing_campaign <- marketing_campaign %>%
  filter(Marital_Status != "Absurd", Marital_Status !=  "Alone", Marital_Status !=  "YOLO", Age_Grp != "Minor", Age_Grp != "Silent")%>%
  droplevels()

tab
## , ,  = Boomer
## 
##           
##            2n Cycle Basic Graduation Master PhD
##   Divorced        4     0         29      8  12
##   Married        12     1         82     35  46
##   Single          5     1         46     13  25
##   Together        8     2         70     36  38
##   Widow           2     0         14      9  16
## 
## , ,  = GenX
## 
##           
##            2n Cycle Basic Graduation Master PhD
##   Divorced       14     1         70     24  38
##   Married        37     8        212     74 110
##   Single          9     1        113     40  48
##   Together       23     6        135     54  59
##   Widow           3     1         20      3   8
## 
## , ,  = GenY
## 
##           
##            2n Cycle Basic Graduation Master PhD
##   Divorced        4     0         20      3   2
##   Married        27     6        119     29  35
##   Single         18    11         66     18  22
##   Together       23     6         75     14  15
##   Widow           0     0          1      0   0
## 
## , ,  = Genz
## 
##           
##            2n Cycle Basic Graduation Master PhD
##   Divorced        0     0          0      2   0
##   Married         4     4         19      0   1
##   Single          4     5         27      4   3
##   Together        3     0          6      2   4
##   Widow           0     0          0      0   0
## 
## , ,  = Minor
## 
##           
##            2n Cycle Basic Graduation Master PhD
##   Divorced        0     0          0      0   0
##   Married         1     1          0      0   0
##   Single          0     0          0      0   0
##   Together        0     0          0      0   0
##   Widow           0     0          0      0   0
## 
## , ,  = Silent
## 
##           
##            2n Cycle Basic Graduation Master PhD
##   Divorced        1     0          0      0   0
##   Married         0     0          1      0   0
##   Single          1     0          0      0   0
##   Together        0     0          0      0   1
##   Widow           0     0          0      0   0

Side-by-side barcharts (Marital Status by education)

ggplot(marketing_campaign, aes(x= Education, fill = Marital_Status))+
  geom_bar(position= "dodge")

ggplot(marketing_campaign, aes(x = Marital_Status, fill = Education)) +
 geom_bar(position = "dodge") 

ggplot(marketing_campaign, aes(x= Education, fill = Age_Grp))+
  geom_bar(position= "dodge")

ggplot(marketing_campaign, aes(x = Marital_Status, fill = Age_Grp)) +
 geom_bar(position = "dodge") +
 theme(axis.text.x = element_text(angle = 90))

There are more people married in this data set and people with Graduation. In general, there is an association between the level of education and the marital status.

Conditional proportions

tab <- table(marketing_campaign$Marital_Status, marketing_campaign$Education, marketing_campaign$Age_Grp)
options(scipen = 999, digits = 3)
prop.table(tab) # Joint proportions
## , ,  = Boomer
## 
##           
##            2n Cycle    Basic Graduation   Master      PhD
##   Divorced 0.001796 0.000000   0.013022 0.003592 0.005388
##   Married  0.005388 0.000449   0.036821 0.015716 0.020656
##   Single   0.002245 0.000449   0.020656 0.005837 0.011226
##   Together 0.003592 0.000898   0.031432 0.016165 0.017063
##   Widow    0.000898 0.000000   0.006286 0.004041 0.007185
## 
## , ,  = GenX
## 
##           
##            2n Cycle    Basic Graduation   Master      PhD
##   Divorced 0.006286 0.000449   0.031432 0.010777 0.017063
##   Married  0.016614 0.003592   0.095195 0.033229 0.049394
##   Single   0.004041 0.000449   0.050741 0.017961 0.021554
##   Together 0.010328 0.002694   0.060620 0.024248 0.026493
##   Widow    0.001347 0.000449   0.008981 0.001347 0.003592
## 
## , ,  = GenY
## 
##           
##            2n Cycle    Basic Graduation   Master      PhD
##   Divorced 0.001796 0.000000   0.008981 0.001347 0.000898
##   Married  0.012124 0.002694   0.053435 0.013022 0.015716
##   Single   0.008083 0.004939   0.029636 0.008083 0.009879
##   Together 0.010328 0.002694   0.033678 0.006286 0.006736
##   Widow    0.000000 0.000000   0.000449 0.000000 0.000000
## 
## , ,  = Genz
## 
##           
##            2n Cycle    Basic Graduation   Master      PhD
##   Divorced 0.000000 0.000000   0.000000 0.000898 0.000000
##   Married  0.001796 0.001796   0.008532 0.000000 0.000449
##   Single   0.001796 0.002245   0.012124 0.001796 0.001347
##   Together 0.001347 0.000000   0.002694 0.000898 0.001796
##   Widow    0.000000 0.000000   0.000000 0.000000 0.000000
prop.table(tab, 2)  # Conditional on columns
## , ,  = Boomer
## 
##           
##            2n Cycle   Basic Graduation  Master     PhD
##   Divorced  0.02000 0.00000    0.02580 0.02174 0.02490
##   Married   0.06000 0.01887    0.07295 0.09511 0.09544
##   Single    0.02500 0.01887    0.04093 0.03533 0.05187
##   Together  0.04000 0.03774    0.06228 0.09783 0.07884
##   Widow     0.01000 0.00000    0.01246 0.02446 0.03320
## 
## , ,  = GenX
## 
##           
##            2n Cycle   Basic Graduation  Master     PhD
##   Divorced  0.07000 0.01887    0.06228 0.06522 0.07884
##   Married   0.18500 0.15094    0.18861 0.20109 0.22822
##   Single    0.04500 0.01887    0.10053 0.10870 0.09959
##   Together  0.11500 0.11321    0.12011 0.14674 0.12241
##   Widow     0.01500 0.01887    0.01779 0.00815 0.01660
## 
## , ,  = GenY
## 
##           
##            2n Cycle   Basic Graduation  Master     PhD
##   Divorced  0.02000 0.00000    0.01779 0.00815 0.00415
##   Married   0.13500 0.11321    0.10587 0.07880 0.07261
##   Single    0.09000 0.20755    0.05872 0.04891 0.04564
##   Together  0.11500 0.11321    0.06673 0.03804 0.03112
##   Widow     0.00000 0.00000    0.00089 0.00000 0.00000
## 
## , ,  = Genz
## 
##           
##            2n Cycle   Basic Graduation  Master     PhD
##   Divorced  0.00000 0.00000    0.00000 0.00543 0.00000
##   Married   0.02000 0.07547    0.01690 0.00000 0.00207
##   Single    0.02000 0.09434    0.02402 0.01087 0.00622
##   Together  0.01500 0.00000    0.00534 0.00543 0.00830
##   Widow     0.00000 0.00000    0.00000 0.00000 0.00000

Plot Counts

ggplot(marketing_campaign, aes(x= Education, fill = Marital_Status))+
  geom_bar()

Plot Proportion

ggplot(marketing_campaign, aes(x= Education, fill = Marital_Status))+
  geom_bar(position = "fill")

Income

ggplot(marketing_campaign, aes(x = Income))+
  geom_histogram(bins = 30)

marketing_campaign %>%
  filter(Income <= 180000)%>%
ggplot(aes(x = Income))+
  geom_histogram(bins = 30)+
  scale_x_log10()

Income by Age Group

p9 <- marketing_campaign %>%
    filter(Income <= 90000)%>%

  ggplot(aes(Age, Income))+
      geom_point(aes(color = Income, size =  Age, text= Age))+
  
 geom_smooth(method = "lm", se = FALSE)+
  
  scale_colour_gradientn(colours = terrain.colors(10))+

  scale_size("Income", range= c(0.1, 4))+

  theme_classic()

ggplotly(p9)

What products do people buy the most by generation?

p10 <- marketing_campaign %>%
  
select(c(Age_Grp, "MntWines", "MntFruits", "MntMeatProducts", "MntFishProducts", "MntSweetProducts", "MntGoldProds")) %>%
  mutate(Wines = as.integer(MntWines),
         Fruits= as.integer(MntFruits),
         Meat = as.integer(MntMeatProducts),
         Fish = as.integer(MntFishProducts),
         Sweet = as.integer(MntSweetProducts),
         Gold = as.integer(MntGoldProds))%>%
  
  select(c(Age_Grp, Wines, Fruits, Meat, Fish, Sweet, Gold))%>%
  
  pivot_longer(-(Age_Grp), names_to= "Products", values_to= "Value")%>%
  
  ggplot(aes(x=Age_Grp, y=Value, fill=Age_Grp))+
  
  geom_boxplot()+
  
  facet_wrap(.~Products, scales = "free_y")+
  theme_bw()

p10

What channels do people use the most to buy items?

marketing_campaign %>%
  select(c(Age_Grp,"NumWebPurchases", "NumCatalogPurchases", "NumStorePurchases"))%>%
  mutate(Online = as.integer(NumWebPurchases),
         Catalog = as.integer(NumCatalogPurchases),
         Store = as.integer(NumStorePurchases))%>%
  select(c(Age_Grp, Online, Catalog, Store))%>%
  pivot_longer(-(Age_Grp), names_to = "Channels", values_to= "Value") %>%

  
ggplot(aes(x=Age_Grp, y=Value, fill = Age_Grp))+
  geom_boxplot()+
  facet_wrap(.~Channels, scales = "free_y")+
  theme_bw()

What is the relationship between Income and Average Purchase per order and Education?

p12 <- marketing_campaign %>%
  filter(Avg_Purchase_Per_Order < 100)%>%
  filter(Income < 150000)%>%
    mutate(Avg_Purchase_Per_Order=round(Avg_Purchase_Per_Order)) %>%
    arrange((Kidhome)) %>%

  ggplot(aes(Avg_Purchase_Per_Order, Income, fill= Education))+
  
  geom_point(alpha=0.5, shape=21,  
               position = "jitter") +
  geom_density_2d_filled(alpha = 0) +
  geom_density_2d(size = 0.25, colour = "black")+

  
  facet_grid(.~Education)+
  scale_fill_viridis_d(option = "plasma", direction = 1)+  
  
  theme_minimal()+
  theme(
          axis.text.y = element_text(size = 8), 
          axis.text.x = element_text(size= 8),
          
          plot.caption.position = "panel",
          plot.caption = element_text(hjust = 0, size = 7),
          
          strip.text.x = element_text(face = "bold", size= 10),
          
      
          panel.background = element_blank(),
          panel.border = element_blank(),
          panel.spacing = unit(1, "lines"),
          legend.title = element_blank(),
          legend.position = "none",
          panel.grid.major = element_blank(),
          panel.grid.minor.x = element_blank()
          
    
)

ggplotly(p12)

Do people who have children at home buy more wines?

ggplot(marketing_campaign, aes(Kidhome, MntWines, color= Kidhome))+
   geom_jitter(shape = "circle", size = 1.65) +
  scale_color_viridis_c(option = "viridis", direction = 1) +
  theme_classic() +
  theme(legend.position = "bottom")

What about people that have teenagers at home?

ggplot(marketing_campaign, aes(Teenhome, MntWines, color= Teenhome))+
  geom_jitter()