You are working to select postal routes (an area unit on a sub-zip code level) for one of our direct-to-consumer shared mail campaigns.
## Libraries
library(dplyr)
library(ggplot2)
## Loading data
setwd("C:/Users/Glenda Ascencio/Desktop/data-analyst-take-home-master/data-analyst-take-home-master/data")
data_dir <- "."
### Reading the Dataset 1A
Dataset_1A <- read.csv(file="C:/Users/Glenda Ascencio/Desktop/data-analyst-take-home-master/data-analyst-take-home-master/data/Dataset 1A.csv", header=TRUE, sep=",")
#Dataset 1A: Postal Route Data for Atlanta
summary(Dataset_1A)
## ZipCode CRRT GeoCode State CityName
## Min. :30030 C013 : 12 30030C001: 1 GA:291 ATLANTA:260
## 1st Qu.:30306 C001 : 11 30030C002: 1 DECATUR: 31
## Median :30318 C006 : 11 30030C006: 1
## Mean :30291 C015 : 11 30030C007: 1
## 3rd Qu.:30338 C002 : 9 30030C008: 1
## Max. :30350 C004 : 9 30030C009: 1
## (Other):228 (Other) :285
## PostalCountyName TotalHouseholds Median.Home.Value MedianIncome
## COBB : 20 Min. : 128.0 Min. : 149419 Min. : 56410
## DEKALB: 74 1st Qu.: 456.0 1st Qu.: 328595 1st Qu.: 68147
## FULTON:197 Median : 560.0 Median : 416923 Median : 83006
## Mean : 615.7 Mean : 445222 Mean : 93849
## 3rd Qu.: 730.5 3rd Qu.: 548659 3rd Qu.:113204
## Max. :1747.0 Max. :1000001 Max. :357500
##
## Median.Age PercentOwnerOccupied PercentHouseholdWithChildren
## Min. :29.10 Min. : 2.42 Min. : 1.13
## 1st Qu.:39.30 1st Qu.:41.53 1st Qu.:13.45
## Median :43.40 Median :59.25 Median :20.30
## Mean :44.60 Mean :58.91 Mean :20.82
## 3rd Qu.:50.15 3rd Qu.:75.80 3rd Qu.:28.27
## Max. :61.90 Max. :98.41 Max. :49.72
##
### Reading the Dataset 1B
Dataset_1B <- read.csv(file="C:/Users/Glenda Ascencio/Desktop/data-analyst-take-home-master/data-analyst-take-home-master/data/Dataset 1B.csv", header=TRUE, sep=",")
#Dataset 1B: Client Zip Ranking
summary(Dataset_1B)
## Market Zip Rank
## Atlanta-Sandy Springs-Roswell GA:20 Min. :30022 Min. : 1.00
## 1st Qu.:30128 1st Qu.: 5.75
## Median :30308 Median :10.50
## Mean :30240 Mean :10.50
## 3rd Qu.:30318 3rd Qu.:15.25
## Max. :30342 Max. :20.00
### Reading the Dataset 1C
Dataset_1C <- read.csv(file="C:/Users/Glenda Ascencio/Desktop/data-analyst-take-home-master/data-analyst-take-home-master/data/Dataset 1C.csv", header=TRUE, sep=",")
# Dataset 1C: Previously Mailed Geocodes
summary(Dataset_1C)
## ZipCode CRRT GeoCode TotalHouseholds
## Min. :30305 C001 : 13 30305C002: 2 Min. : 77.0
## 1st Qu.:30307 C004 : 13 30305C003: 2 1st Qu.: 406.2
## Median :30318 C018 : 12 30305C007: 2 Median : 507.0
## Mean :30320 C021 : 12 30305C009: 2 Mean : 558.7
## 3rd Qu.:30328 C023 : 12 30305C014: 2 3rd Qu.: 623.8
## Max. :30345 C002 : 11 30305C015: 2 Max. :1518.0
## (Other):285 (Other) :346
## SLM.Campaign Mailing.Name
## Atlanta:358 Mailed 3/2:179
## Mailed 5/1:179
##
##
##
##
##
### Reading the Dataset 2A
Dataset_2A <- read.csv(file="C:/Users/Glenda Ascencio/Desktop/data-analyst-take-home-master/data-analyst-take-home-master/data/Dataset 2A.csv", header=TRUE, sep=",")
summary(Dataset_2A)
## Campaign Test.Cell OrderID
## Client August Campaign:207 Min. :1.000 Min. :101.0
## 1st Qu.:1.000 1st Qu.:152.5
## Median :2.000 Median :204.0
## Mean :1.531 Mean :204.0
## 3rd Qu.:2.000 3rd Qu.:255.5
## Max. :2.000 Max. :307.0
##
## OrderDate Order.Items OrderRevenue
## 8/10/2017: 18 Min. :1.000 Min. : 37.0
## 8/2/2017 : 17 1st Qu.:2.000 1st Qu.: 74.0
## 8/13/2017: 16 Median :3.000 Median :111.0
## 8/9/2017 : 15 Mean :3.121 Mean :115.5
## 8/11/2017: 14 3rd Qu.:4.000 3rd Qu.:148.0
## 8/14/2017: 14 Max. :5.000 Max. :185.0
## (Other) :113
median_household_q1 <- Dataset_1A %>%
group_by(TotalHouseholds, MedianIncome, Median.Age) %>%
filter(MedianIncome > 80000 && Median.Age >= 40 && Median.Age <= 55) %>%
dplyr::summarize(number_of_rows = n())
total_household <- c(sum(median_household_q1$TotalHouseholds),sum(median_household_q1$number_of_rows))
View(total_household)
# Joining the Dataset_1A with the Dataset_1B
Dataset_1A$ZipCode <- as.numeric(Dataset_1A$ZipCode)
Dataset_1B$ZipCode <- as.numeric(Dataset_1B$Zip)
# Removing the Zip column
Dataset_1B$Zip <- NULL
# Doing a Right outer join
Dataset_1A_1B <- merge(x = Dataset_1A, y = Dataset_1B, by = "ZipCode", all.y = TRUE)
# How to improve targeting
atlanta_geocode_ranking <- Dataset_1A_1B %>%
group_by(ZipCode, TotalHouseholds, GeoCode, CityName, MedianIncome, Rank) %>%
filter(MedianIncome > 56650 && CityName == "ATLANTA" && Median.Age <= 55 && PercentHouseholdWithChildren <= 28 )%>%
dplyr::summarize(total_geo_ranking = n()) %>%
arrange(desc(MedianIncome))
# ~100.8k and 152 GeoCodes
total_geocodes_ranking <- c(sum(atlanta_geocode_ranking$TotalHouseholds),sum(atlanta_geocode_ranking$total_geo_ranking))
View(total_geocodes_ranking)
# Joining the atlanta_geocode with the Dataset_1C
# Doing a Right outer join: There are 358 Geocodes that have been mailed before. %?
atlanta_geocode_1C_rj <- merge(x = atlanta_geocode, y = Dataset_1C, by = "GeoCode", all.y = TRUE)
# What % has been mailed 2 times? 137 observation. % = 137/100k =
mail_twice <- atlanta_geocode_1C_rj %>%
group_by(GeoCode) %>%
dplyr::summarize(mail_count = n()) %>%
arrange(desc(mail_count))
# 137
mail_twice_2 <- filter(mail_twice, mail_count == 2)
# zip codes count = 112
breakdown_hh_zc <- atlanta_geocode_1C_rj %>%
group_by(ZipCode.x, TotalHouseholds.x) %>%
dplyr::summarize(total_count = n()) %>%
arrange(desc(total_count))
View(breakdown_hh_zc)
# # Number of zip codes = 17
# zipcodes_total <- atlanta_geocode_1C_rj %>%
# group_by(ZipCode.x) %>%
# dplyr::summarize(zipcode_count = n()) %>%
# arrange(desc(zipcode_count))
#
# # Household count
# # 107336 household
# household_cnt <- c(sum(atlanta_geocode_1C_rj$TotalHouseholds.x, na.rm=TRUE))
You’re building a report for a client, showing their results from their latest campaign.
### Reading the Dataset Client Mail Dataset
client_mail_data <- read.csv(file="C:/Users/Glenda Ascencio/Desktop/data-analyst-take-home-master/data-analyst-take-home-master/data/Client_Mail_Plan.csv", header=TRUE, sep=",")
#The client's "Mail Plan" is below.
View(client_mail_data)
# R. Joining tables with the Dataset 2A
all_client_mail_data <- merge(x = Dataset_2A, y = client_mail_data, by = "Test.Cell", all.y = TRUE)
summary(all_client_mail_data)
## Test.Cell Campaign OrderID
## Min. :1.000 Client August Campaign:207 Min. :101.0
## 1st Qu.:1.000 1st Qu.:152.5
## Median :2.000 Median :204.0
## Mean :1.531 Mean :204.0
## 3rd Qu.:2.000 3rd Qu.:255.5
## Max. :2.000 Max. :307.0
##
## OrderDate Order.Items OrderRevenue Format
## 8/10/2017: 18 Min. :1.000 Min. : 37.0 Foldout :110
## 8/2/2017 : 17 1st Qu.:2.000 1st Qu.: 74.0 PostCard: 97
## 8/13/2017: 16 Median :3.000 Median :111.0
## 8/9/2017 : 15 Mean :3.121 Mean :115.5
## 8/11/2017: 14 3rd Qu.:4.000 3rd Qu.:148.0
## 8/14/2017: 14 Max. :5.000 Max. :185.0
## (Other) :113
## Quantity.Mailed Total.Cost
## 10,000 :207 5,000 : 97
## 7,800 :110
##
##
##
##
##