Section 1

You are working to select postal routes (an area unit on a sub-zip code level) for one of our direct-to-consumer shared mail campaigns.

We have have the following information from these datasets:

Loading the Data

##  Libraries
library(dplyr)
library(ggplot2)

##  Loading data
setwd("C:/Users/Glenda Ascencio/Desktop/data-analyst-take-home-master/data-analyst-take-home-master/data")
data_dir <- "."

### Reading the Dataset 1A 
Dataset_1A <- read.csv(file="C:/Users/Glenda Ascencio/Desktop/data-analyst-take-home-master/data-analyst-take-home-master/data/Dataset 1A.csv", header=TRUE, sep=",")

#Dataset 1A: Postal Route Data for Atlanta
summary(Dataset_1A)
##     ZipCode           CRRT          GeoCode    State       CityName  
##  Min.   :30030   C013   : 12   30030C001:  1   GA:291   ATLANTA:260  
##  1st Qu.:30306   C001   : 11   30030C002:  1            DECATUR: 31  
##  Median :30318   C006   : 11   30030C006:  1                         
##  Mean   :30291   C015   : 11   30030C007:  1                         
##  3rd Qu.:30338   C002   :  9   30030C008:  1                         
##  Max.   :30350   C004   :  9   30030C009:  1                         
##                  (Other):228   (Other)  :285                         
##  PostalCountyName TotalHouseholds  Median.Home.Value  MedianIncome   
##  COBB  : 20       Min.   : 128.0   Min.   : 149419   Min.   : 56410  
##  DEKALB: 74       1st Qu.: 456.0   1st Qu.: 328595   1st Qu.: 68147  
##  FULTON:197       Median : 560.0   Median : 416923   Median : 83006  
##                   Mean   : 615.7   Mean   : 445222   Mean   : 93849  
##                   3rd Qu.: 730.5   3rd Qu.: 548659   3rd Qu.:113204  
##                   Max.   :1747.0   Max.   :1000001   Max.   :357500  
##                                                                      
##    Median.Age    PercentOwnerOccupied PercentHouseholdWithChildren
##  Min.   :29.10   Min.   : 2.42        Min.   : 1.13               
##  1st Qu.:39.30   1st Qu.:41.53        1st Qu.:13.45               
##  Median :43.40   Median :59.25        Median :20.30               
##  Mean   :44.60   Mean   :58.91        Mean   :20.82               
##  3rd Qu.:50.15   3rd Qu.:75.80        3rd Qu.:28.27               
##  Max.   :61.90   Max.   :98.41        Max.   :49.72               
## 
### Reading the Dataset 1B 
Dataset_1B <- read.csv(file="C:/Users/Glenda Ascencio/Desktop/data-analyst-take-home-master/data-analyst-take-home-master/data/Dataset 1B.csv", header=TRUE, sep=",")

#Dataset 1B: Client Zip Ranking
summary(Dataset_1B)
##                               Market        Zip             Rank      
##  Atlanta-Sandy Springs-Roswell GA:20   Min.   :30022   Min.   : 1.00  
##                                        1st Qu.:30128   1st Qu.: 5.75  
##                                        Median :30308   Median :10.50  
##                                        Mean   :30240   Mean   :10.50  
##                                        3rd Qu.:30318   3rd Qu.:15.25  
##                                        Max.   :30342   Max.   :20.00
### Reading the Dataset 1C 
Dataset_1C <- read.csv(file="C:/Users/Glenda Ascencio/Desktop/data-analyst-take-home-master/data-analyst-take-home-master/data/Dataset 1C.csv", header=TRUE, sep=",")

# Dataset 1C: Previously Mailed Geocodes
summary(Dataset_1C)
##     ZipCode           CRRT          GeoCode    TotalHouseholds 
##  Min.   :30305   C001   : 13   30305C002:  2   Min.   :  77.0  
##  1st Qu.:30307   C004   : 13   30305C003:  2   1st Qu.: 406.2  
##  Median :30318   C018   : 12   30305C007:  2   Median : 507.0  
##  Mean   :30320   C021   : 12   30305C009:  2   Mean   : 558.7  
##  3rd Qu.:30328   C023   : 12   30305C014:  2   3rd Qu.: 623.8  
##  Max.   :30345   C002   : 11   30305C015:  2   Max.   :1518.0  
##                  (Other):285   (Other)  :346                   
##   SLM.Campaign     Mailing.Name
##  Atlanta:358   Mailed 3/2:179  
##                Mailed 5/1:179  
##                                
##                                
##                                
##                                
## 
### Reading the Dataset 2A 
Dataset_2A <- read.csv(file="C:/Users/Glenda Ascencio/Desktop/data-analyst-take-home-master/data-analyst-take-home-master/data/Dataset 2A.csv", header=TRUE, sep=",")

summary(Dataset_2A)
##                    Campaign     Test.Cell        OrderID     
##  Client August Campaign:207   Min.   :1.000   Min.   :101.0  
##                               1st Qu.:1.000   1st Qu.:152.5  
##                               Median :2.000   Median :204.0  
##                               Mean   :1.531   Mean   :204.0  
##                               3rd Qu.:2.000   3rd Qu.:255.5  
##                               Max.   :2.000   Max.   :307.0  
##                                                              
##      OrderDate    Order.Items     OrderRevenue  
##  8/10/2017: 18   Min.   :1.000   Min.   : 37.0  
##  8/2/2017 : 17   1st Qu.:2.000   1st Qu.: 74.0  
##  8/13/2017: 16   Median :3.000   Median :111.0  
##  8/9/2017 : 15   Mean   :3.121   Mean   :115.5  
##  8/11/2017: 14   3rd Qu.:4.000   3rd Qu.:148.0  
##  8/14/2017: 14   Max.   :5.000   Max.   :185.0  
##  (Other)  :113

Questions 1: How many households are in the postal routes that have a median household income greater than $80,000 and a median household age of 40-55? = There are 119 zip codes with this filter. In total we have 59,869 Households that have a median household income greater than $80k and a median household age of 40-55.

median_household_q1 <- Dataset_1A %>%
  group_by(TotalHouseholds, MedianIncome, Median.Age) %>%
  filter(MedianIncome > 80000 && Median.Age >= 40 && Median.Age <= 55) %>%
  dplyr::summarize(number_of_rows = n())

 total_household <- c(sum(median_household_q1$TotalHouseholds),sum(median_household_q1$number_of_rows))
 
 View(total_household)

Questions 2: Let’s say you want to target 100K households in Atlanta for our SLM shared mail program. Which Geocodes do you choose and why? (lower income families)

atlanta_geocode <- Dataset_1A %>%
  group_by(ZipCode, TotalHouseholds, GeoCode, CityName, MedianIncome, Median.Age) %>%
  filter(MedianIncome > 56650 && CityName == "ATLANTA" && Median.Age <= 43.50 && PercentHouseholdWithChildren <= 28 ) %>%
  dplyr::summarize(total_geocode = n()) %>%
  arrange(desc(MedianIncome))

## Showing Geocodes
head(atlanta_geocode)
## # A tibble: 6 x 7
## # Groups:   ZipCode, TotalHouseholds, GeoCode, CityName, MedianIncome [6]
##   ZipCode TotalHouseholds GeoCode CityName MedianIncome Median.Age
##     <int>           <int> <fct>   <fct>           <int>      <dbl>
## 1   30319             848 30319C~ ATLANTA        119886       42.2
## 2   30318             471 30318C~ ATLANTA        112838       40.3
## 3   30318             622 30318C~ ATLANTA        112010       40.4
## 4   30319             574 30319C~ ATLANTA        109043       43.1
## 5   30318             143 30318C~ ATLANTA        107692       39.9
## 6   30305             402 30305C~ ATLANTA        107500       34.8
## # ... with 1 more variable: total_geocode <int>
# 100.5k Households in Atlanta and 138 Geocodes 
total_geocodes <- c(sum(atlanta_geocode$TotalHouseholds),sum(atlanta_geocode$total_geocode))

View(total_geocodes)

Questions 3: Let’s say that a client gives you a zip ranking for Atlanta (Dataset 1B). How would you use this client info to improve targeting?

# Joining the Dataset_1A with the Dataset_1B
Dataset_1A$ZipCode <- as.numeric(Dataset_1A$ZipCode)
Dataset_1B$ZipCode <- as.numeric(Dataset_1B$Zip)
# Removing the Zip column
Dataset_1B$Zip <- NULL
# Doing a Right outer join
Dataset_1A_1B <- merge(x = Dataset_1A, y = Dataset_1B, by = "ZipCode", all.y = TRUE)

# How to improve targeting
atlanta_geocode_ranking <- Dataset_1A_1B %>%
  group_by(ZipCode, TotalHouseholds, GeoCode, CityName, MedianIncome, Rank) %>%
  filter(MedianIncome > 56650 && CityName == "ATLANTA" && Median.Age <= 55 && PercentHouseholdWithChildren <= 28 )%>%
  dplyr::summarize(total_geo_ranking = n()) %>%
  arrange(desc(MedianIncome))

# ~100.8k and 152 GeoCodes
total_geocodes_ranking <- c(sum(atlanta_geocode_ranking$TotalHouseholds),sum(atlanta_geocode_ranking$total_geo_ranking))

View(total_geocodes_ranking)

Questions 4: Dataset 1C is a list of Geocodes that have been previously mailed. Using your answer from question 2, what % of this market has been mailed before? What % has been mailed 2 times?

# Joining the atlanta_geocode with the Dataset_1C
# Doing a Right outer join: There are 358 Geocodes that have been mailed before. %?
atlanta_geocode_1C_rj <- merge(x = atlanta_geocode, y = Dataset_1C, by = "GeoCode", all.y = TRUE)


# What % has been mailed 2 times?    137 observation. % = 137/100k =
mail_twice <- atlanta_geocode_1C_rj %>%
  group_by(GeoCode) %>%
  dplyr::summarize(mail_count = n()) %>%
  arrange(desc(mail_count))
# 137 
mail_twice_2 <- filter(mail_twice, mail_count == 2)

Questions 5: What is the breakdown of household counts by zip code?

# zip codes count = 112
breakdown_hh_zc <- atlanta_geocode_1C_rj %>%
  group_by(ZipCode.x, TotalHouseholds.x) %>%
  dplyr::summarize(total_count = n()) %>%
  arrange(desc(total_count))

View(breakdown_hh_zc)


# # Number of zip codes = 17
# zipcodes_total <- atlanta_geocode_1C_rj %>%
#   group_by(ZipCode.x) %>%
#   dplyr::summarize(zipcode_count = n()) %>%
#   arrange(desc(zipcode_count))
# 
# # Household count
# # 107336 household
# household_cnt <- c(sum(atlanta_geocode_1C_rj$TotalHouseholds.x, na.rm=TRUE))

Section 2

You’re building a report for a client, showing their results from their latest campaign.

Client Mail Plan

### Reading the Dataset Client Mail Dataset 
client_mail_data <- read.csv(file="C:/Users/Glenda Ascencio/Desktop/data-analyst-take-home-master/data-analyst-take-home-master/data/Client_Mail_Plan.csv", header=TRUE, sep=",")

#The client's "Mail Plan" is below.
View(client_mail_data)

# R. Joining tables with the Dataset 2A
all_client_mail_data <- merge(x = Dataset_2A, y = client_mail_data, by = "Test.Cell", all.y = TRUE)

summary(all_client_mail_data)
##    Test.Cell                       Campaign      OrderID     
##  Min.   :1.000   Client August Campaign:207   Min.   :101.0  
##  1st Qu.:1.000                                1st Qu.:152.5  
##  Median :2.000                                Median :204.0  
##  Mean   :1.531                                Mean   :204.0  
##  3rd Qu.:2.000                                3rd Qu.:255.5  
##  Max.   :2.000                                Max.   :307.0  
##                                                              
##      OrderDate    Order.Items     OrderRevenue        Format   
##  8/10/2017: 18   Min.   :1.000   Min.   : 37.0   Foldout :110  
##  8/2/2017 : 17   1st Qu.:2.000   1st Qu.: 74.0   PostCard: 97  
##  8/13/2017: 16   Median :3.000   Median :111.0                 
##  8/9/2017 : 15   Mean   :3.121   Mean   :115.5                 
##  8/11/2017: 14   3rd Qu.:4.000   3rd Qu.:148.0                 
##  8/14/2017: 14   Max.   :5.000   Max.   :185.0                 
##  (Other)  :113                                                 
##  Quantity.Mailed   Total.Cost 
##   10,000 :207     5,000 : 97  
##                   7,800 :110  
##                               
##                               
##                               
##                               
##