Starbucks is one of the best known brands in the world with thousands of locations. There are still zipcodes in the United States without a way to get your hyper-caffienated-four-dollar fix. Can we use data to make an informed decision about where Starbucks could expand next?
The train of thought is that Starbucks locations tend to be found in affluent and populous areas. By using publicly available data, we can overlay income data from zip codes and compare them by set: those having and lacking a Starbucks.
Note: this presentation was given to the University of Georgia in February 2017; it was not put into a an R Markdown presentation so that more code could be demonstrated.
library(readxl)
library(readr)
library(sqldf)
library(dplyr)
library(tidyr) #for reshaping data
library(zipcode)
library(plotly)
library(tidyr)
library(reshape2)
library(dplyr)
library(ggplot2)
library(plotly)
#clean up all variables before running from the top
rm(list=ls(all=TRUE))
#load up the zipcode data to lean the data
data(zipcode)
#read in the startbucks data
starbuckslocations <- read_csv("~/UGA/Thinking in SQL/TIS_UGA/data/starbuckslocations.csv")
starbuckslocations$Zip = clean.zipcodes(starbuckslocations$Zip)
#note: always, always, always assume that your data is dirty. Some of the zip codes from the raw file had their
#leading zero pulled off for zips in the northeast (3904-> 03904)
#create a 5 character zip column
starbuckslocations$zip_five <-substr(starbuckslocations$Zip,1,5)
#summarize the number of stores by zipcode
starbuckslocations<-sqldf("select count(*) as storecount,State,zip_five from starbuckslocations
group by State,zip_five")
knitr::kable(head(starbuckslocations,10))
storecount | State | zip_five |
---|---|---|
1 | AK | 99501 |
3 | AK | 99502 |
1 | AK | 99503 |
3 | AK | 99504 |
1 | AK | 99505 |
1 | AK | 99506 |
3 | AK | 99507 |
3 | AK | 99508 |
4 | AK | 99515 |
1 | AK | 99517 |
#View(sqldf("select * from starbuckslocations"))
#how many starbucks are in each state?
#View(sqldf("select count(*) as storecount,state from starbuckslocations group by state order by count(*) desc"))
#sum up the total income by zip code from the IRS Data
#https://www.irs.gov/uac/soi-tax-stats-individual-income-tax-statistics-2014-zip-code-data-soi
#and https://www.irs.gov/uac/soi-tax-stats-individual-income-tax-statistics-2013-zip-code-data-soi
#agi stub could be very useful because the bigger the number the more people have
#higher incomes. #lesson : READ THE DATA DICTIONARY.
columnlist<-c("STATE","N1", "zipcode","agi_stub","A02650","YEAR")
if(!exists("Income2013"))
{
Income2013 <- read_csv("~/UGA/Thinking in SQL/TIS_UGA/data/13zpallagi.csv")
Income2013$YEAR<-2013
Income2013<-select(Income2013, one_of(columnlist))
Income2013$zipcode<-clean.zipcodes(Income2013$zipcode)
}
if(!exists("Income2014"))
{
Income2014<-read_csv("~/UGA/Thinking in SQL/TIS_UGA/data/14zpallagi.csv")
Income2014$YEAR<-2014
Income2014<-select(Income2014, one_of(columnlist))
Income2014$zipcode<-clean.zipcodes(Income2014$zipcode)
}
IncomeData<-rbind(Income2013,Income2014)
IncomeData$IncomeState<-IncomeData$STATE
IncomeData <- subset(IncomeData, select = -c(STATE) )
IncomeData$TotalIncome <- IncomeData$A02650
IncomeData <- subset(IncomeData, select = -c(A02650) )
#get rid of any income data with zip code 99999; it means they had less than 100 returns
#this could also be single resident zip codes
#• ZIP codes with less than 100 returns and those identified as a single
#building or nonresidential ZIP code were categorized as “other” (99999).
IncomeData<-sqldf("select * from IncomeData where zipcode not in ('99999','00000')")
rm(Income2014)
rm(Income2013)
#join the starbucks dataset to the IRS income dataset
# I need one row per zip code for the starbucks locations.
#first recognize a good problem: there are many things you can learn from this data
#and its easy to go 100 different directions and get NO WHERE
#the data is at different levels, you will need to clean, aggregate, and average
#before you can analyze
IncomeData$zipcode<-as.character(IncomeData$zipcode)
#join the zip code data and the starbucks locaiton data by the cleaned up zipcode column.
all_data<-full_join(IncomeData,starbuckslocations,by=c("zipcode"="zip_five"),copy=TRUE)
#data quality check: count the numbe of total observations
total_rows<-nrow(IncomeData)
#how many starbucks locations do not have income associated
sb_zip_no_income<-nrow(dplyr::filter(all_data,is.na(`N1`)))
sb_zip_no_income_pct = (sb_zip_no_income/total_rows)*100
#hom many income zip codes do not have starbucks
income_zip_no_sb<-nrow(dplyr::filter(all_data,is.na(`storecount`)))
income_zip_no_sb_pct = (income_zip_no_sb/total_rows)*100
#View(sqldf("select * from all_data where storecount is null"))
#analysis and visualization ideas
#put the number of starbucks on the map by zip code and see how they correlate with a heat map.
#do a linear model predicting the number of starbucks in a zip code by income
set.seed(8265)
na.zero <- function (x) {
x[is.na(x)] <- 0
return(x)
}
#analysis and visualization ideas
#put the number of starbucks on the map by zip code and see how they correlate with a heat map.
#Idea: do a linear model predicting the number of starbucks in a zip code by income
all_data$storecount<-na.zero(all_data$storecount)
#filter these records out. income data is a critical factor
#View(sqldf("select * from all_data where IncomeState is null"))
all_data <- filter(all_data, !is.na(IncomeState))
income_data<-tidyr::spread(all_data,agi_stub,TotalIncome)
income_data<-income_data[,1:12]
#income_data<-filter(income_data,!is.na(IncomeState))
return_count<-tidyr::spread(all_data,agi_stub,N1)
return_count<-return_count[,1:12]
#return_count<-filter(return_count,!is.na(IncomeState))
income_data$storecount<-na.zero(income_data$storecount)
income_data$`1`<-na.zero(income_data$`1`)
income_data$`2`<-na.zero(income_data$`2`)
income_data$`3`<-na.zero(income_data$`3`)
income_data$`4`<-na.zero(income_data$`4`)
income_data$`5`<-na.zero(income_data$`5`)
income_data$`6`<-na.zero(income_data$`6`)
return_count$storecount<-na.zero(return_count$storecount)
return_count$`1`<-na.zero(return_count$`1`)
return_count$`2`<-na.zero(return_count$`2`)
return_count$`3`<-na.zero(return_count$`3`)
return_count$`4`<-na.zero(return_count$`4`)
return_count$`5`<-na.zero(return_count$`5`)
return_count$`6`<-na.zero(return_count$`6`)
names(income_data)[7]<-"agi_one_income"
names(income_data)[8]<-"agi_two_income"
names(income_data)[9]<-"agi_three_income"
names(income_data)[10]<-"agi_four_income"
names(income_data)[11]<-"agi_five_income"
names(income_data)[12]<-"agi_six_income"
names(return_count)[7]<-"agi_one_nt"
names(return_count)[8]<-"agi_two_nt"
names(return_count)[9]<-"agi_three_nt"
names(return_count)[10]<-"agi_four_nt"
names(return_count)[11]<-"agi_five_nt"
names(return_count)[12]<-"agi_six_nt"
library(xtable)
#datatable looking at income data and the total AGI for each zipcode
df1<-(sqldf("select zipcode,year,incomestate,
sum(agi_one_income)+sum(agi_two_income)+sum(agi_three_income)
+sum(agi_four_income)+sum(agi_five_income)+sum(agi_six_income) as total_income,
sum(N1) as total_returns_filed, sum(storecount) as storecount,
sum(agi_one_income) as agi_one,
sum(agi_two_income) as agi_two,
sum(agi_three_income) as agi_three,
sum(agi_four_income) as agi_four,
sum(agi_five_income) as agi_five,
sum(agi_six_income) as agi_six
from income_data
group by zipcode,year,incomestate"))
knitr::kable(head(df1,10))
zipcode | YEAR | IncomeState | total_income | total_returns_filed | storecount | agi_one | agi_two | agi_three | agi_four | agi_five | agi_six |
---|---|---|---|---|---|---|---|---|---|---|---|
01001 | 2013 | MA | 477601 | 8780 | 0 | 35047 | 82453 | 94284 | 79361 | 146901 | 39555 |
01001 | 2014 | MA | 494972 | 8880 | 0 | 34289 | 83204 | 96087 | 79716 | 152951 | 48725 |
01002 | 2013 | MA | 762298 | 9570 | 6 | 41057 | 62546 | 73550 | 74571 | 229774 | 280800 |
01002 | 2014 | MA | 802824 | 9640 | 6 | 40861 | 64687 | 70073 | 71996 | 241834 | 313373 |
01005 | 2013 | MA | 129645 | 2230 | 0 | 8456 | 19902 | 23521 | 24843 | 41184 | 11739 |
01005 | 2014 | MA | 133441 | 2280 | 0 | 8534 | 18901 | 25005 | 23646 | 44315 | 13040 |
01007 | 2013 | MA | 489020 | 7300 | 0 | 26288 | 54890 | 66259 | 76068 | 191131 | 74384 |
01007 | 2014 | MA | 513575 | 7390 | 0 | 26386 | 52448 | 65329 | 79559 | 199827 | 90026 |
01008 | 2013 | MA | 38867 | 540 | 0 | 2140 | 5713 | 6995 | 8760 | 15259 | 0 |
01008 | 2014 | MA | 40175 | 640 | 0 | 2024 | 4749 | 6703 | 7898 | 18801 | 0 |
#number of tax returns filed for each income bucket
df2<-(sqldf("select zipcode,year,incomestate,
sum(agi_one_nt)+sum(agi_two_nt)+sum(agi_three_nt)
+sum(agi_four_nt)+sum(agi_five_nt)+sum(agi_six_nt) as total_nt,
sum(storecount) as storecount,
sum(agi_one_nt) as nt_one,
sum(agi_two_nt) as nt_two,
sum(agi_three_nt) as nt_three,
sum(agi_four_nt) as nt_four,
sum(agi_five_nt) as nt_five,
sum(agi_six_nt) as nt_six
from return_count
group by zipcode,year,incomestate"))
knitr::kable(head(df2,10))
zipcode | YEAR | IncomeState | total_nt | storecount | nt_one | nt_two | nt_three | nt_four | nt_five | nt_six |
---|---|---|---|---|---|---|---|---|---|---|
01001 | 2013 | MA | 8780 | 0 | 2910 | 2190 | 1520 | 910 | 1120 | 130 |
01001 | 2014 | MA | 8880 | 0 | 2890 | 2210 | 1550 | 920 | 1160 | 150 |
01002 | 2013 | MA | 9570 | 6 | 3550 | 1700 | 1160 | 840 | 1630 | 690 |
01002 | 2014 | MA | 9640 | 6 | 3510 | 1760 | 1100 | 810 | 1710 | 750 |
01005 | 2013 | MA | 2230 | 0 | 690 | 540 | 370 | 280 | 310 | 40 |
01005 | 2014 | MA | 2280 | 0 | 720 | 500 | 400 | 270 | 340 | 50 |
01007 | 2013 | MA | 7300 | 0 | 2260 | 1470 | 1050 | 860 | 1420 | 240 |
01007 | 2014 | MA | 7390 | 0 | 2300 | 1400 | 1040 | 900 | 1470 | 280 |
01008 | 2013 | MA | 640 | 0 | 180 | 150 | 110 | 100 | 100 | 0 |
01008 | 2014 | MA | 640 | 0 | 190 | 130 | 110 | 90 | 120 | 0 |
#join the two datasets on income and store cout
#tag the rows where there are zip cods and storecount is 0
df3<-inner_join(df1,df2,by=c("zipcode","YEAR","IncomeState"))
#tag zipcodes where there is no starbucks store
df3$has_starbucks<-df3$storecount.x > 0
#print a sample of the data
knitr::kable(head(df3,10))
zipcode | YEAR | IncomeState | total_income | total_returns_filed | storecount.x | agi_one | agi_two | agi_three | agi_four | agi_five | agi_six | total_nt | storecount.y | nt_one | nt_two | nt_three | nt_four | nt_five | nt_six | has_starbucks |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
01001 | 2013 | MA | 477601 | 8780 | 0 | 35047 | 82453 | 94284 | 79361 | 146901 | 39555 | 8780 | 0 | 2910 | 2190 | 1520 | 910 | 1120 | 130 | FALSE |
01001 | 2014 | MA | 494972 | 8880 | 0 | 34289 | 83204 | 96087 | 79716 | 152951 | 48725 | 8880 | 0 | 2890 | 2210 | 1550 | 920 | 1160 | 150 | FALSE |
01002 | 2013 | MA | 762298 | 9570 | 6 | 41057 | 62546 | 73550 | 74571 | 229774 | 280800 | 9570 | 6 | 3550 | 1700 | 1160 | 840 | 1630 | 690 | TRUE |
01002 | 2014 | MA | 802824 | 9640 | 6 | 40861 | 64687 | 70073 | 71996 | 241834 | 313373 | 9640 | 6 | 3510 | 1760 | 1100 | 810 | 1710 | 750 | TRUE |
01005 | 2013 | MA | 129645 | 2230 | 0 | 8456 | 19902 | 23521 | 24843 | 41184 | 11739 | 2230 | 0 | 690 | 540 | 370 | 280 | 310 | 40 | FALSE |
01005 | 2014 | MA | 133441 | 2280 | 0 | 8534 | 18901 | 25005 | 23646 | 44315 | 13040 | 2280 | 0 | 720 | 500 | 400 | 270 | 340 | 50 | FALSE |
01007 | 2013 | MA | 489020 | 7300 | 0 | 26288 | 54890 | 66259 | 76068 | 191131 | 74384 | 7300 | 0 | 2260 | 1470 | 1050 | 860 | 1420 | 240 | FALSE |
01007 | 2014 | MA | 513575 | 7390 | 0 | 26386 | 52448 | 65329 | 79559 | 199827 | 90026 | 7390 | 0 | 2300 | 1400 | 1040 | 900 | 1470 | 280 | FALSE |
01008 | 2013 | MA | 38867 | 540 | 0 | 2140 | 5713 | 6995 | 8760 | 15259 | 0 | 640 | 0 | 180 | 150 | 110 | 100 | 100 | 0 | FALSE |
01008 | 2014 | MA | 40175 | 640 | 0 | 2024 | 4749 | 6703 | 7898 | 18801 | 0 | 640 | 0 | 190 | 130 | 110 | 90 | 120 | 0 | FALSE |
#this is just for formatting and creating a legend
# 1 = $1 under $25,000
# 2 = $25,000 under $50,000
# 3 = $50,000 under $75,000
# 4 = $75,000 under $100,000
# 5 = $100,000 under $200,000
# 6 = $200,000 or more
label = c("agi_one","agi_two","agi_three","agi_four","agi_five","agi_six")
bucket = c("1 under 25,000","25,000 under 50,000","50,000 under 75,000","75,000 under 100,000","100,000 under 200,000","200,000 or more")
position = c(1,2,3,4,5,6)
dflegend<-data.frame(label,bucket,position,stringsAsFactors = FALSE)
Starbucks tends to target more affluent areas, or households filing in the upper buckets of our data distribution. This analysis will look at agi buckets four through six for 2014.
#change the shape of the data before we plot and analyze
dfmeltbase<-melt(df3,measure.vars=7:12)%>%filter(YEAR==2014 & variable %in% c("agi_four", "agi_five","agi_six"))
head(dfmeltbase,10)
## zipcode YEAR IncomeState total_income total_returns_filed storecount.x
## 1 01001 2014 MA 494972 8880 0
## 2 01002 2014 MA 802824 9640 6
## 3 01005 2014 MA 133441 2280 0
## 4 01007 2014 MA 513575 7390 0
## 5 01008 2014 MA 40175 640 0
## 6 01010 2014 MA 128916 1840 0
## 7 01011 2014 MA 31224 610 0
## 8 01012 2014 MA 22684 310 0
## 9 01013 2014 MA 415570 10420 0
## 10 01020 2014 MA 697760 15020 6
## total_nt storecount.y nt_one nt_two nt_three nt_four nt_five nt_six
## 1 8880 0 2890 2210 1550 920 1160 150
## 2 9640 6 3510 1760 1100 810 1710 750
## 3 2280 0 720 500 400 270 340 50
## 4 7390 0 2300 1400 1040 900 1470 280
## 5 640 0 190 130 110 90 120 0
## 6 1840 0 570 350 320 180 360 60
## 7 610 0 210 140 110 70 80 0
## 8 370 0 120 90 60 40 60 0
## 9 10420 0 4460 3020 1510 770 610 50
## 10 15020 6 5560 4160 2410 1460 1330 100
## has_starbucks variable value
## 1 FALSE agi_four 79716
## 2 TRUE agi_four 71996
## 3 FALSE agi_four 23646
## 4 FALSE agi_four 79559
## 5 FALSE agi_four 7898
## 6 FALSE agi_four 15611
## 7 FALSE agi_four 5696
## 8 FALSE agi_four 3011
## 9 FALSE agi_four 65987
## 10 TRUE agi_four 127031
Look at the difference between the two tables below and it should be appearant what the melt function has done.
BEFORE melt
knitr::kable(head(df3,10))
zipcode | YEAR | IncomeState | total_income | total_returns_filed | storecount.x | agi_one | agi_two | agi_three | agi_four | agi_five | agi_six | total_nt | storecount.y | nt_one | nt_two | nt_three | nt_four | nt_five | nt_six | has_starbucks |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
01001 | 2013 | MA | 477601 | 8780 | 0 | 35047 | 82453 | 94284 | 79361 | 146901 | 39555 | 8780 | 0 | 2910 | 2190 | 1520 | 910 | 1120 | 130 | FALSE |
01001 | 2014 | MA | 494972 | 8880 | 0 | 34289 | 83204 | 96087 | 79716 | 152951 | 48725 | 8880 | 0 | 2890 | 2210 | 1550 | 920 | 1160 | 150 | FALSE |
01002 | 2013 | MA | 762298 | 9570 | 6 | 41057 | 62546 | 73550 | 74571 | 229774 | 280800 | 9570 | 6 | 3550 | 1700 | 1160 | 840 | 1630 | 690 | TRUE |
01002 | 2014 | MA | 802824 | 9640 | 6 | 40861 | 64687 | 70073 | 71996 | 241834 | 313373 | 9640 | 6 | 3510 | 1760 | 1100 | 810 | 1710 | 750 | TRUE |
01005 | 2013 | MA | 129645 | 2230 | 0 | 8456 | 19902 | 23521 | 24843 | 41184 | 11739 | 2230 | 0 | 690 | 540 | 370 | 280 | 310 | 40 | FALSE |
01005 | 2014 | MA | 133441 | 2280 | 0 | 8534 | 18901 | 25005 | 23646 | 44315 | 13040 | 2280 | 0 | 720 | 500 | 400 | 270 | 340 | 50 | FALSE |
01007 | 2013 | MA | 489020 | 7300 | 0 | 26288 | 54890 | 66259 | 76068 | 191131 | 74384 | 7300 | 0 | 2260 | 1470 | 1050 | 860 | 1420 | 240 | FALSE |
01007 | 2014 | MA | 513575 | 7390 | 0 | 26386 | 52448 | 65329 | 79559 | 199827 | 90026 | 7390 | 0 | 2300 | 1400 | 1040 | 900 | 1470 | 280 | FALSE |
01008 | 2013 | MA | 38867 | 540 | 0 | 2140 | 5713 | 6995 | 8760 | 15259 | 0 | 640 | 0 | 180 | 150 | 110 | 100 | 100 | 0 | FALSE |
01008 | 2014 | MA | 40175 | 640 | 0 | 2024 | 4749 | 6703 | 7898 | 18801 | 0 | 640 | 0 | 190 | 130 | 110 | 90 | 120 | 0 | FALSE |
AFTER melt: the agi column names have now become rows. This allows for pivotting the data into the right shape.
knitr::kable(head(dfmeltbase,10))
zipcode | YEAR | IncomeState | total_income | total_returns_filed | storecount.x | total_nt | storecount.y | nt_one | nt_two | nt_three | nt_four | nt_five | nt_six | has_starbucks | variable | value |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
01001 | 2014 | MA | 494972 | 8880 | 0 | 8880 | 0 | 2890 | 2210 | 1550 | 920 | 1160 | 150 | FALSE | agi_four | 79716 |
01002 | 2014 | MA | 802824 | 9640 | 6 | 9640 | 6 | 3510 | 1760 | 1100 | 810 | 1710 | 750 | TRUE | agi_four | 71996 |
01005 | 2014 | MA | 133441 | 2280 | 0 | 2280 | 0 | 720 | 500 | 400 | 270 | 340 | 50 | FALSE | agi_four | 23646 |
01007 | 2014 | MA | 513575 | 7390 | 0 | 7390 | 0 | 2300 | 1400 | 1040 | 900 | 1470 | 280 | FALSE | agi_four | 79559 |
01008 | 2014 | MA | 40175 | 640 | 0 | 640 | 0 | 190 | 130 | 110 | 90 | 120 | 0 | FALSE | agi_four | 7898 |
01010 | 2014 | MA | 128916 | 1840 | 0 | 1840 | 0 | 570 | 350 | 320 | 180 | 360 | 60 | FALSE | agi_four | 15611 |
01011 | 2014 | MA | 31224 | 610 | 0 | 610 | 0 | 210 | 140 | 110 | 70 | 80 | 0 | FALSE | agi_four | 5696 |
01012 | 2014 | MA | 22684 | 310 | 0 | 370 | 0 | 120 | 90 | 60 | 40 | 60 | 0 | FALSE | agi_four | 3011 |
01013 | 2014 | MA | 415570 | 10420 | 0 | 10420 | 0 | 4460 | 3020 | 1510 | 770 | 610 | 50 | FALSE | agi_four | 65987 |
01020 | 2014 | MA | 697760 | 15020 | 6 | 15020 | 6 | 5560 | 4160 | 2410 | 1460 | 1330 | 100 | TRUE | agi_four | 127031 |
Box plot and compare the two types of zip codes (those having vs. those lacking a Starbucks).
#join the melted data and legend, sort the data by the position of the labels in the legend.
dfmelt<- inner_join(dfmeltbase, dflegend,by=c("variable" = "label"))%>%
arrange(position)%>%
select(one_of(c("variable", "value","has_starbucks","bucket","zipcode","IncomeState")))
## Warning in inner_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## factor and character vector, coercing into character vector
#boxplot the data to see how the data is distributed among various dimensions
#this line is going to subdivide the data (income, number of returns etc into different levels that
#can be evaluated indepenently. This will become apparent boxplot.)
dfmelt$bucketf = factor(dfmelt$bucket,levels=c("75,000 under 100,000","100,000 under 200,000","200,000 or more"))
For easier following of this code chunk, look at the table immeidately above and follow where the column names fall into the function parameters
knitr::kable(head(dfmelt,10))
variable | value | has_starbucks | bucket | zipcode | IncomeState | bucketf |
---|---|---|---|---|---|---|
agi_four | 79716 | FALSE | 75,000 under 100,000 | 01001 | MA | 75,000 under 100,000 |
agi_four | 71996 | TRUE | 75,000 under 100,000 | 01002 | MA | 75,000 under 100,000 |
agi_four | 23646 | FALSE | 75,000 under 100,000 | 01005 | MA | 75,000 under 100,000 |
agi_four | 79559 | FALSE | 75,000 under 100,000 | 01007 | MA | 75,000 under 100,000 |
agi_four | 7898 | FALSE | 75,000 under 100,000 | 01008 | MA | 75,000 under 100,000 |
agi_four | 15611 | FALSE | 75,000 under 100,000 | 01010 | MA | 75,000 under 100,000 |
agi_four | 5696 | FALSE | 75,000 under 100,000 | 01011 | MA | 75,000 under 100,000 |
agi_four | 3011 | FALSE | 75,000 under 100,000 | 01012 | MA | 75,000 under 100,000 |
agi_four | 65987 | FALSE | 75,000 under 100,000 | 01013 | MA | 75,000 under 100,000 |
agi_four | 127031 | TRUE | 75,000 under 100,000 | 01020 | MA | 75,000 under 100,000 |
ggplot(dfmelt, aes(x=has_starbucks, y=value,fill=bucket))+
geom_boxplot()+
geom_point(aes(text=paste("zipcode:",zipcode)))+
#facet_grid(.~bucket)
facet_grid(.~bucketf)
## Warning: Ignoring unknown aesthetics: text
Now let’s take a percentile of zip codes without a starbucks and those that are above the 75th percentile in income could qualify for expansion.
dfntile<- inner_join(dfmeltbase, dflegend,by=c("variable" = "label"))
## Warning in inner_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## factor and character vector, coercing into character vector
#df %>% group_by(a) %>% summarise_each(funs(sum)) the line below was adapted from this line
##this line is very powerful, what it is doing is grouping by each bucket and then for each one
##finding the given percentile. YOU CANNOT DO THIS NATIVELY IN SQL OR ON LIGHTER SQL PLATFORMS BECAUSE
##FINDING THE N-TILE INVOLVES SORTING THE DATA. R does this inherently; quantile could be replaced
##with any other analytic function.
dfresult<-dfntile %>% group_by(bucket) %>% summarise(ntile_val=(quantile(value,.75)))
#join the dfmelt "table" with the dfresult table and get all zip codes without starbucks for each bucket
#that have a gross reported income of at least the ntile value or more (NOTE: this can be made dynamic)
dfnewzips<-inner_join (dfmelt,dfresult,by=c("bucket"))%>%
filter(value >= ntile_val & has_starbucks==FALSE)%>%select(one_of(c("variable","bucket", "value","ntile_val","zipcode","IncomeState")))%>%
arrange(desc(value))
dfNewLocationList<-melt(df3,measure.vars=7:12)%>%
filter(YEAR==2014 & variable %in% c("agi_four", "agi_five","agi_six"))%>%
filter(paste0(variable,zipcode) %in% paste0(dfnewzips$variable,dfnewzips$zipcode))
#data quality check: google results show over 2000 zipcodes in New York; 692 of them could be new starbucks locations
results<-sqldf("select sum(distinct total_income) as total_income,sum(distinct total_returns_filed) as total_returns_filed,
count(distinct zipcode) as num_zip_codes,IncomeState from dfNewLocationList
group by incomestate order by sum(distinct total_income) desc")
knitr::kable(results)
total_income | total_returns_filed | num_zip_codes | IncomeState |
---|---|---|---|
258703869 | 3982110 | 347 | NY |
205334738 | 3056670 | 279 | FL |
195071936 | 2953070 | 303 | TX |
172719823 | 2056340 | 255 | NJ |
153988449 | 2190490 | 247 | PA |
129695594 | 1772080 | 170 | CA |
126660729 | 1452680 | 195 | MA |
95924930 | 1565740 | 167 | MI |
95247661 | 1623630 | 159 | OH |
93602662 | 1543500 | 147 | IL |
79139573 | 1354490 | 125 | NC |
71626969 | 992880 | 110 | MD |
70166183 | 926600 | 100 | MN |
64974518 | 1238700 | 101 | GA |
61314763 | 671430 | 100 | CT |
60381301 | 753290 | 88 | VA |
56242570 | 863760 | 105 | WI |
47235402 | 817800 | 77 | TN |
44115904 | 750860 | 84 | IN |
42809718 | 691930 | 68 | MO |
40368965 | 721350 | 66 | SC |
39332842 | 658240 | 72 | LA |
34258296 | 555830 | 68 | AL |
33025281 | 571890 | 53 | KY |
32370315 | 478380 | 45 | UT |
31994741 | 486130 | 60 | OK |
25181185 | 338400 | 44 | CO |
24493015 | 383620 | 47 | IA |
23579667 | 413700 | 42 | MS |
23342503 | 283230 | 50 | NH |
22237821 | 270010 | 43 | KS |
19502833 | 342000 | 33 | AR |
17936207 | 270770 | 36 | NE |
16450029 | 215650 | 32 | WA |
15632760 | 270410 | 26 | RI |
15487883 | 244990 | 26 | AZ |
14888461 | 238450 | 23 | DE |
13390420 | 234370 | 21 | NM |
11044169 | 178110 | 18 | NV |
10598177 | 167570 | 22 | WV |
9828218 | 169780 | 17 | ID |
8756609 | 98420 | 13 | ND |
7828026 | 136340 | 17 | OR |
7571944 | 115890 | 15 | MT |
6639075 | 96220 | 15 | VT |
5906731 | 84740 | 14 | ME |
4751236 | 58280 | 10 | WY |
4337789 | 67890 | 7 | HI |
4037840 | 82850 | 4 | DC |
3887930 | 44450 | 7 | AK |
3242484 | 48010 | 7 | SD |
Visualizing the data can allow us to see how different states and markets relate to each other with respect to income and population. The first example is a scatter plot of states; the reference lines mark the 75th percentile of income and population.
#plot the income by population with a reference line to show the 75th percentile of income
ggplot
## function (data = NULL, mapping = aes(), ..., environment = parent.frame())
## {
## UseMethod("ggplot")
## }
## <environment: namespace:ggplot2>
ggplot(results,aes(x=total_income,y=total_returns_filed,label=IncomeState)) +
geom_point() + geom_text(aes(label=IncomeState),hjust=0,vjust=0)+geom_vline(xintercept = quantile(results$total_income,.75)) + geom_hline(yintercept = quantile(results$total_returns_filed,.75))
Another way to visualize the data is through mapping. State boundary data with lat-long coordiantes are built into the state R package. Joining lat-long data to the state names in the state result dataset enables plotting in ggplot to render a map of the United States with state boundaries filled in where expansion opportunities are the greatest. Compare this map with the scatter plot shown above.
data(state)
statename<-as.data.frame(cbind(state.abb,tolower(state.name)))
newresults<-inner_join(results,statename,by=c("IncomeState" = "state.abb"))
## Warning in inner_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## character vector and factor, coercing into character vector
newresults<-merge(newresults,map_data("state"), by.x='V2', by.y='region')
p <- ggplot()
p <- p + geom_polygon(data=newresults, aes(x=long, y=lat, group = group, fill=newresults$total_income),
colour="lightgreen"
) + scale_fill_continuous(low = "thistle2", high = "darkgreen", guide="colorbar")
P1 <- p + theme_bw() + labs(fill = "AGI"
,title = "AGI by State for Starbucks Expansion", x="", y="")
P1 + scale_y_continuous(breaks=c()) + scale_x_continuous(breaks=c()) + theme(panel.border = element_blank())