Intro

Starbucks is one of the best known brands in the world with thousands of locations. There are still zipcodes in the United States without a way to get your hyper-caffienated-four-dollar fix. Can we use data to make an informed decision about where Starbucks could expand next?

The train of thought is that Starbucks locations tend to be found in affluent and populous areas. By using publicly available data, we can overlay income data from zip codes and compare them by set: those having and lacking a Starbucks.

Note: this presentation was given to the University of Georgia in February 2017; it was not put into a an R Markdown presentation so that more code could be demonstrated.

The Data

  1. Starbucks Locaitons in the U.S. (https://opendata.socrata.com/Business/All-Starbucks-Locations-in-the-US-Map/ddym-zvjk)
  2. IRS Income Data Statistics by Zip Code for two years.
  3. Zipcodes Package (for cleaning up and providing Lat-Lon info)

The Data Loading Code

library(readxl)
library(readr)
library(sqldf)
library(dplyr)
library(tidyr) #for reshaping data
library(zipcode)
library(plotly)
library(tidyr)
library(reshape2)
library(dplyr)
library(ggplot2)
library(plotly)

#clean up all variables before running from the top
rm(list=ls(all=TRUE))
#load up the zipcode data to lean the data
data(zipcode)

#read in the startbucks data
starbuckslocations <- read_csv("~/UGA/Thinking in SQL/TIS_UGA/data/starbuckslocations.csv")
starbuckslocations$Zip = clean.zipcodes(starbuckslocations$Zip)

#note: always, always, always assume that your data is dirty.  Some of the zip codes from the raw file had their
#leading zero pulled off for zips in the northeast (3904-> 03904)

#create a 5 character zip column 
starbuckslocations$zip_five <-substr(starbuckslocations$Zip,1,5)

#summarize the number of stores by zipcode
starbuckslocations<-sqldf("select count(*) as storecount,State,zip_five from starbuckslocations 
                          group by State,zip_five")

knitr::kable(head(starbuckslocations,10))
storecount State zip_five
1 AK 99501
3 AK 99502
1 AK 99503
3 AK 99504
1 AK 99505
1 AK 99506
3 AK 99507
3 AK 99508
4 AK 99515
1 AK 99517
#View(sqldf("select * from starbuckslocations"))
#how many starbucks are in each state?
#View(sqldf("select count(*) as storecount,state from starbuckslocations group by state order by count(*) desc"))

#sum up the total income by zip code from the IRS Data
#https://www.irs.gov/uac/soi-tax-stats-individual-income-tax-statistics-2014-zip-code-data-soi
#and https://www.irs.gov/uac/soi-tax-stats-individual-income-tax-statistics-2013-zip-code-data-soi

#agi stub could be very useful because the bigger the number the more people have
#higher incomes.  #lesson : READ THE DATA DICTIONARY.
columnlist<-c("STATE","N1", "zipcode","agi_stub","A02650","YEAR")
if(!exists("Income2013"))
{
Income2013 <- read_csv("~/UGA/Thinking in SQL/TIS_UGA/data/13zpallagi.csv")
Income2013$YEAR<-2013
Income2013<-select(Income2013, one_of(columnlist))
Income2013$zipcode<-clean.zipcodes(Income2013$zipcode)
}

if(!exists("Income2014"))
{
Income2014<-read_csv("~/UGA/Thinking in SQL/TIS_UGA/data/14zpallagi.csv")
Income2014$YEAR<-2014
Income2014<-select(Income2014, one_of(columnlist))
Income2014$zipcode<-clean.zipcodes(Income2014$zipcode)
}

IncomeData<-rbind(Income2013,Income2014)
IncomeData$IncomeState<-IncomeData$STATE
IncomeData <- subset(IncomeData, select = -c(STATE) )
IncomeData$TotalIncome <- IncomeData$A02650
IncomeData <- subset(IncomeData, select = -c(A02650) )
#get rid of any income data with zip code 99999; it means they had less than 100 returns
#this could also be single resident zip codes
#•  ZIP codes with less than 100 returns and those identified as a single 
#building or nonresidential ZIP code were categorized as “other” (99999).
IncomeData<-sqldf("select * from IncomeData where zipcode not in ('99999','00000')")

rm(Income2014)
rm(Income2013)
#join the starbucks dataset to the IRS income dataset
# I need one row per zip code for the starbucks locations.

#first recognize a good problem: there are many things you can learn from this data
#and its easy to go 100 different directions and get NO WHERE
#the data is at different levels, you will need to clean, aggregate, and average
#before you can analyze

IncomeData$zipcode<-as.character(IncomeData$zipcode)


#join the zip code data and the starbucks locaiton data by the cleaned up zipcode column.
all_data<-full_join(IncomeData,starbuckslocations,by=c("zipcode"="zip_five"),copy=TRUE)

#data quality check: count the numbe of total observations
total_rows<-nrow(IncomeData)
#how many starbucks locations do not have income associated
sb_zip_no_income<-nrow(dplyr::filter(all_data,is.na(`N1`)))
sb_zip_no_income_pct = (sb_zip_no_income/total_rows)*100
#hom many income zip codes do not have starbucks
income_zip_no_sb<-nrow(dplyr::filter(all_data,is.na(`storecount`)))
income_zip_no_sb_pct = (income_zip_no_sb/total_rows)*100

#View(sqldf("select * from all_data where storecount is null"))


#analysis and visualization ideas
#put the number of starbucks on the map by zip code and see how they correlate with a heat map.
#do a linear model predicting the number of starbucks in a zip code by income
set.seed(8265)

na.zero <- function (x) {
  x[is.na(x)] <- 0
  return(x)
}
#analysis and visualization ideas
#put the number of starbucks on the map by zip code and see how they correlate with a heat map.
#Idea: do a linear model predicting the number of starbucks in a zip code by income

all_data$storecount<-na.zero(all_data$storecount)

#filter these records out. income data is a critical factor
#View(sqldf("select * from all_data where IncomeState is null"))
all_data <- filter(all_data, !is.na(IncomeState))


income_data<-tidyr::spread(all_data,agi_stub,TotalIncome)
income_data<-income_data[,1:12]
#income_data<-filter(income_data,!is.na(IncomeState))
return_count<-tidyr::spread(all_data,agi_stub,N1)
return_count<-return_count[,1:12]
#return_count<-filter(return_count,!is.na(IncomeState))
income_data$storecount<-na.zero(income_data$storecount)
income_data$`1`<-na.zero(income_data$`1`)
income_data$`2`<-na.zero(income_data$`2`)
income_data$`3`<-na.zero(income_data$`3`)
income_data$`4`<-na.zero(income_data$`4`)
income_data$`5`<-na.zero(income_data$`5`)
income_data$`6`<-na.zero(income_data$`6`)
return_count$storecount<-na.zero(return_count$storecount)
return_count$`1`<-na.zero(return_count$`1`)
return_count$`2`<-na.zero(return_count$`2`)
return_count$`3`<-na.zero(return_count$`3`)
return_count$`4`<-na.zero(return_count$`4`)
return_count$`5`<-na.zero(return_count$`5`)
return_count$`6`<-na.zero(return_count$`6`)


names(income_data)[7]<-"agi_one_income"
names(income_data)[8]<-"agi_two_income"
names(income_data)[9]<-"agi_three_income"
names(income_data)[10]<-"agi_four_income"
names(income_data)[11]<-"agi_five_income"
names(income_data)[12]<-"agi_six_income"

names(return_count)[7]<-"agi_one_nt"
names(return_count)[8]<-"agi_two_nt"
names(return_count)[9]<-"agi_three_nt"
names(return_count)[10]<-"agi_four_nt"
names(return_count)[11]<-"agi_five_nt"
names(return_count)[12]<-"agi_six_nt"

Data Slices

library(xtable)

#datatable looking at income data and the total AGI for each zipcode

df1<-(sqldf("select zipcode,year,incomestate, 
                 sum(agi_one_income)+sum(agi_two_income)+sum(agi_three_income)
                  +sum(agi_four_income)+sum(agi_five_income)+sum(agi_six_income) as total_income, 
                 sum(N1) as total_returns_filed,  sum(storecount) as storecount,
                 sum(agi_one_income) as agi_one,
                 sum(agi_two_income) as agi_two,
                 sum(agi_three_income) as agi_three,
                 sum(agi_four_income) as agi_four,
                 sum(agi_five_income) as agi_five,
                 sum(agi_six_income) as agi_six
                 from income_data 
                 group by zipcode,year,incomestate"))

knitr::kable(head(df1,10))
zipcode YEAR IncomeState total_income total_returns_filed storecount agi_one agi_two agi_three agi_four agi_five agi_six
01001 2013 MA 477601 8780 0 35047 82453 94284 79361 146901 39555
01001 2014 MA 494972 8880 0 34289 83204 96087 79716 152951 48725
01002 2013 MA 762298 9570 6 41057 62546 73550 74571 229774 280800
01002 2014 MA 802824 9640 6 40861 64687 70073 71996 241834 313373
01005 2013 MA 129645 2230 0 8456 19902 23521 24843 41184 11739
01005 2014 MA 133441 2280 0 8534 18901 25005 23646 44315 13040
01007 2013 MA 489020 7300 0 26288 54890 66259 76068 191131 74384
01007 2014 MA 513575 7390 0 26386 52448 65329 79559 199827 90026
01008 2013 MA 38867 540 0 2140 5713 6995 8760 15259 0
01008 2014 MA 40175 640 0 2024 4749 6703 7898 18801 0
#number of tax returns filed for each income bucket

df2<-(sqldf("select zipcode,year,incomestate, 
                 sum(agi_one_nt)+sum(agi_two_nt)+sum(agi_three_nt)
                  +sum(agi_four_nt)+sum(agi_five_nt)+sum(agi_six_nt) as total_nt, 
                   sum(storecount) as storecount,
                 sum(agi_one_nt) as nt_one,
                 sum(agi_two_nt) as nt_two,
                 sum(agi_three_nt) as nt_three,
                 sum(agi_four_nt) as nt_four,
                 sum(agi_five_nt) as nt_five,
                 sum(agi_six_nt) as nt_six
                 from return_count 
                 group by zipcode,year,incomestate"))
knitr::kable(head(df2,10))
zipcode YEAR IncomeState total_nt storecount nt_one nt_two nt_three nt_four nt_five nt_six
01001 2013 MA 8780 0 2910 2190 1520 910 1120 130
01001 2014 MA 8880 0 2890 2210 1550 920 1160 150
01002 2013 MA 9570 6 3550 1700 1160 840 1630 690
01002 2014 MA 9640 6 3510 1760 1100 810 1710 750
01005 2013 MA 2230 0 690 540 370 280 310 40
01005 2014 MA 2280 0 720 500 400 270 340 50
01007 2013 MA 7300 0 2260 1470 1050 860 1420 240
01007 2014 MA 7390 0 2300 1400 1040 900 1470 280
01008 2013 MA 640 0 180 150 110 100 100 0
01008 2014 MA 640 0 190 130 110 90 120 0
#join the two datasets on income and store cout
#tag the rows where there are zip cods and storecount is 0
df3<-inner_join(df1,df2,by=c("zipcode","YEAR","IncomeState"))
#tag zipcodes where there is no starbucks store
df3$has_starbucks<-df3$storecount.x > 0
#print a sample of the data
knitr::kable(head(df3,10))
zipcode YEAR IncomeState total_income total_returns_filed storecount.x agi_one agi_two agi_three agi_four agi_five agi_six total_nt storecount.y nt_one nt_two nt_three nt_four nt_five nt_six has_starbucks
01001 2013 MA 477601 8780 0 35047 82453 94284 79361 146901 39555 8780 0 2910 2190 1520 910 1120 130 FALSE
01001 2014 MA 494972 8880 0 34289 83204 96087 79716 152951 48725 8880 0 2890 2210 1550 920 1160 150 FALSE
01002 2013 MA 762298 9570 6 41057 62546 73550 74571 229774 280800 9570 6 3550 1700 1160 840 1630 690 TRUE
01002 2014 MA 802824 9640 6 40861 64687 70073 71996 241834 313373 9640 6 3510 1760 1100 810 1710 750 TRUE
01005 2013 MA 129645 2230 0 8456 19902 23521 24843 41184 11739 2230 0 690 540 370 280 310 40 FALSE
01005 2014 MA 133441 2280 0 8534 18901 25005 23646 44315 13040 2280 0 720 500 400 270 340 50 FALSE
01007 2013 MA 489020 7300 0 26288 54890 66259 76068 191131 74384 7300 0 2260 1470 1050 860 1420 240 FALSE
01007 2014 MA 513575 7390 0 26386 52448 65329 79559 199827 90026 7390 0 2300 1400 1040 900 1470 280 FALSE
01008 2013 MA 38867 540 0 2140 5713 6995 8760 15259 0 640 0 180 150 110 100 100 0 FALSE
01008 2014 MA 40175 640 0 2024 4749 6703 7898 18801 0 640 0 190 130 110 90 120 0 FALSE
#this is just for formatting and creating a legend
# 1 = $1 under $25,000
# 2 = $25,000 under $50,000
# 3 = $50,000 under $75,000
# 4 = $75,000 under $100,000
# 5 = $100,000 under $200,000
# 6 = $200,000 or more
label = c("agi_one","agi_two","agi_three","agi_four","agi_five","agi_six")
bucket = c("1 under 25,000","25,000 under 50,000","50,000 under 75,000","75,000 under 100,000","100,000 under 200,000","200,000 or more")
position = c(1,2,3,4,5,6)
dflegend<-data.frame(label,bucket,position,stringsAsFactors = FALSE)

The Analysis Code

Starbucks tends to target more affluent areas, or households filing in the upper buckets of our data distribution. This analysis will look at agi buckets four through six for 2014.

#change the shape of the data before we plot and analyze
dfmeltbase<-melt(df3,measure.vars=7:12)%>%filter(YEAR==2014 & variable %in% c("agi_four", "agi_five","agi_six"))

head(dfmeltbase,10)
##    zipcode YEAR IncomeState total_income total_returns_filed storecount.x
## 1    01001 2014          MA       494972                8880            0
## 2    01002 2014          MA       802824                9640            6
## 3    01005 2014          MA       133441                2280            0
## 4    01007 2014          MA       513575                7390            0
## 5    01008 2014          MA        40175                 640            0
## 6    01010 2014          MA       128916                1840            0
## 7    01011 2014          MA        31224                 610            0
## 8    01012 2014          MA        22684                 310            0
## 9    01013 2014          MA       415570               10420            0
## 10   01020 2014          MA       697760               15020            6
##    total_nt storecount.y nt_one nt_two nt_three nt_four nt_five nt_six
## 1      8880            0   2890   2210     1550     920    1160    150
## 2      9640            6   3510   1760     1100     810    1710    750
## 3      2280            0    720    500      400     270     340     50
## 4      7390            0   2300   1400     1040     900    1470    280
## 5       640            0    190    130      110      90     120      0
## 6      1840            0    570    350      320     180     360     60
## 7       610            0    210    140      110      70      80      0
## 8       370            0    120     90       60      40      60      0
## 9     10420            0   4460   3020     1510     770     610     50
## 10    15020            6   5560   4160     2410    1460    1330    100
##    has_starbucks variable  value
## 1          FALSE agi_four  79716
## 2           TRUE agi_four  71996
## 3          FALSE agi_four  23646
## 4          FALSE agi_four  79559
## 5          FALSE agi_four   7898
## 6          FALSE agi_four  15611
## 7          FALSE agi_four   5696
## 8          FALSE agi_four   3011
## 9          FALSE agi_four  65987
## 10          TRUE agi_four 127031

Look at the difference between the two tables below and it should be appearant what the melt function has done.

BEFORE melt

knitr::kable(head(df3,10))
zipcode YEAR IncomeState total_income total_returns_filed storecount.x agi_one agi_two agi_three agi_four agi_five agi_six total_nt storecount.y nt_one nt_two nt_three nt_four nt_five nt_six has_starbucks
01001 2013 MA 477601 8780 0 35047 82453 94284 79361 146901 39555 8780 0 2910 2190 1520 910 1120 130 FALSE
01001 2014 MA 494972 8880 0 34289 83204 96087 79716 152951 48725 8880 0 2890 2210 1550 920 1160 150 FALSE
01002 2013 MA 762298 9570 6 41057 62546 73550 74571 229774 280800 9570 6 3550 1700 1160 840 1630 690 TRUE
01002 2014 MA 802824 9640 6 40861 64687 70073 71996 241834 313373 9640 6 3510 1760 1100 810 1710 750 TRUE
01005 2013 MA 129645 2230 0 8456 19902 23521 24843 41184 11739 2230 0 690 540 370 280 310 40 FALSE
01005 2014 MA 133441 2280 0 8534 18901 25005 23646 44315 13040 2280 0 720 500 400 270 340 50 FALSE
01007 2013 MA 489020 7300 0 26288 54890 66259 76068 191131 74384 7300 0 2260 1470 1050 860 1420 240 FALSE
01007 2014 MA 513575 7390 0 26386 52448 65329 79559 199827 90026 7390 0 2300 1400 1040 900 1470 280 FALSE
01008 2013 MA 38867 540 0 2140 5713 6995 8760 15259 0 640 0 180 150 110 100 100 0 FALSE
01008 2014 MA 40175 640 0 2024 4749 6703 7898 18801 0 640 0 190 130 110 90 120 0 FALSE

AFTER melt: the agi column names have now become rows. This allows for pivotting the data into the right shape.

knitr::kable(head(dfmeltbase,10))
zipcode YEAR IncomeState total_income total_returns_filed storecount.x total_nt storecount.y nt_one nt_two nt_three nt_four nt_five nt_six has_starbucks variable value
01001 2014 MA 494972 8880 0 8880 0 2890 2210 1550 920 1160 150 FALSE agi_four 79716
01002 2014 MA 802824 9640 6 9640 6 3510 1760 1100 810 1710 750 TRUE agi_four 71996
01005 2014 MA 133441 2280 0 2280 0 720 500 400 270 340 50 FALSE agi_four 23646
01007 2014 MA 513575 7390 0 7390 0 2300 1400 1040 900 1470 280 FALSE agi_four 79559
01008 2014 MA 40175 640 0 640 0 190 130 110 90 120 0 FALSE agi_four 7898
01010 2014 MA 128916 1840 0 1840 0 570 350 320 180 360 60 FALSE agi_four 15611
01011 2014 MA 31224 610 0 610 0 210 140 110 70 80 0 FALSE agi_four 5696
01012 2014 MA 22684 310 0 370 0 120 90 60 40 60 0 FALSE agi_four 3011
01013 2014 MA 415570 10420 0 10420 0 4460 3020 1510 770 610 50 FALSE agi_four 65987
01020 2014 MA 697760 15020 6 15020 6 5560 4160 2410 1460 1330 100 TRUE agi_four 127031

Box plot and compare the two types of zip codes (those having vs. those lacking a Starbucks).

#join the melted data and legend, sort the data by the position of the labels in the legend.
dfmelt<-  inner_join(dfmeltbase, dflegend,by=c("variable" = "label"))%>%
arrange(position)%>%
select(one_of(c("variable", "value","has_starbucks","bucket","zipcode","IncomeState")))
## Warning in inner_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## factor and character vector, coercing into character vector
#boxplot the data to see how the data is distributed among various dimensions
#this line is going to subdivide the data (income, number of returns etc into different levels that 
#can be evaluated indepenently.  This will become apparent boxplot.)
dfmelt$bucketf = factor(dfmelt$bucket,levels=c("75,000 under 100,000","100,000 under 200,000","200,000 or more"))

For easier following of this code chunk, look at the table immeidately above and follow where the column names fall into the function parameters

knitr::kable(head(dfmelt,10))
variable value has_starbucks bucket zipcode IncomeState bucketf
agi_four 79716 FALSE 75,000 under 100,000 01001 MA 75,000 under 100,000
agi_four 71996 TRUE 75,000 under 100,000 01002 MA 75,000 under 100,000
agi_four 23646 FALSE 75,000 under 100,000 01005 MA 75,000 under 100,000
agi_four 79559 FALSE 75,000 under 100,000 01007 MA 75,000 under 100,000
agi_four 7898 FALSE 75,000 under 100,000 01008 MA 75,000 under 100,000
agi_four 15611 FALSE 75,000 under 100,000 01010 MA 75,000 under 100,000
agi_four 5696 FALSE 75,000 under 100,000 01011 MA 75,000 under 100,000
agi_four 3011 FALSE 75,000 under 100,000 01012 MA 75,000 under 100,000
agi_four 65987 FALSE 75,000 under 100,000 01013 MA 75,000 under 100,000
agi_four 127031 TRUE 75,000 under 100,000 01020 MA 75,000 under 100,000
ggplot(dfmelt,  aes(x=has_starbucks, y=value,fill=bucket))+
  geom_boxplot()+
  geom_point(aes(text=paste("zipcode:",zipcode)))+
  #facet_grid(.~bucket)
  facet_grid(.~bucketf)
## Warning: Ignoring unknown aesthetics: text

Now let’s take a percentile of zip codes without a starbucks and those that are above the 75th percentile in income could qualify for expansion.

dfntile<-  inner_join(dfmeltbase, dflegend,by=c("variable" = "label"))
## Warning in inner_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## factor and character vector, coercing into character vector
#df %>% group_by(a) %>% summarise_each(funs(sum)) the line below was adapted from this line 
##this line is very powerful, what it is doing is grouping by each bucket and then for each one
##finding the given percentile. YOU CANNOT DO THIS NATIVELY IN SQL OR ON LIGHTER SQL PLATFORMS BECAUSE
##FINDING THE N-TILE INVOLVES SORTING THE DATA. R does this inherently; quantile could be replaced
##with any other analytic function.
dfresult<-dfntile %>% group_by(bucket) %>% summarise(ntile_val=(quantile(value,.75)))


#join the dfmelt "table" with the dfresult table and get all zip codes without starbucks for each bucket
#that have a gross reported income of at least the ntile value or more (NOTE: this can be made dynamic)

dfnewzips<-inner_join (dfmelt,dfresult,by=c("bucket"))%>%
  filter(value >= ntile_val & has_starbucks==FALSE)%>%select(one_of(c("variable","bucket", "value","ntile_val","zipcode","IncomeState")))%>%
  arrange(desc(value))

dfNewLocationList<-melt(df3,measure.vars=7:12)%>%
  filter(YEAR==2014 & variable %in% c("agi_four", "agi_five","agi_six"))%>%
  filter(paste0(variable,zipcode) %in% paste0(dfnewzips$variable,dfnewzips$zipcode))

#data quality check: google results show over 2000 zipcodes in New York; 692 of them could be new starbucks locations
results<-sqldf("select sum(distinct total_income) as total_income,sum(distinct total_returns_filed) as total_returns_filed,
count(distinct zipcode) as num_zip_codes,IncomeState from dfNewLocationList
      group by incomestate order by sum(distinct total_income) desc")

knitr::kable(results)
total_income total_returns_filed num_zip_codes IncomeState
258703869 3982110 347 NY
205334738 3056670 279 FL
195071936 2953070 303 TX
172719823 2056340 255 NJ
153988449 2190490 247 PA
129695594 1772080 170 CA
126660729 1452680 195 MA
95924930 1565740 167 MI
95247661 1623630 159 OH
93602662 1543500 147 IL
79139573 1354490 125 NC
71626969 992880 110 MD
70166183 926600 100 MN
64974518 1238700 101 GA
61314763 671430 100 CT
60381301 753290 88 VA
56242570 863760 105 WI
47235402 817800 77 TN
44115904 750860 84 IN
42809718 691930 68 MO
40368965 721350 66 SC
39332842 658240 72 LA
34258296 555830 68 AL
33025281 571890 53 KY
32370315 478380 45 UT
31994741 486130 60 OK
25181185 338400 44 CO
24493015 383620 47 IA
23579667 413700 42 MS
23342503 283230 50 NH
22237821 270010 43 KS
19502833 342000 33 AR
17936207 270770 36 NE
16450029 215650 32 WA
15632760 270410 26 RI
15487883 244990 26 AZ
14888461 238450 23 DE
13390420 234370 21 NM
11044169 178110 18 NV
10598177 167570 22 WV
9828218 169780 17 ID
8756609 98420 13 ND
7828026 136340 17 OR
7571944 115890 15 MT
6639075 96220 15 VT
5906731 84740 14 ME
4751236 58280 10 WY
4337789 67890 7 HI
4037840 82850 4 DC
3887930 44450 7 AK
3242484 48010 7 SD

Data Visuzliation

Visualizing the data can allow us to see how different states and markets relate to each other with respect to income and population. The first example is a scatter plot of states; the reference lines mark the 75th percentile of income and population.

#plot the income by population with a reference line to show the 75th percentile of income
ggplot
## function (data = NULL, mapping = aes(), ..., environment = parent.frame()) 
## {
##     UseMethod("ggplot")
## }
## <environment: namespace:ggplot2>
ggplot(results,aes(x=total_income,y=total_returns_filed,label=IncomeState)) + 
  geom_point() + geom_text(aes(label=IncomeState),hjust=0,vjust=0)+geom_vline(xintercept = quantile(results$total_income,.75)) + geom_hline(yintercept = quantile(results$total_returns_filed,.75))

Another way to visualize the data is through mapping. State boundary data with lat-long coordiantes are built into the state R package. Joining lat-long data to the state names in the state result dataset enables plotting in ggplot to render a map of the United States with state boundaries filled in where expansion opportunities are the greatest. Compare this map with the scatter plot shown above.

data(state)
statename<-as.data.frame(cbind(state.abb,tolower(state.name)))
newresults<-inner_join(results,statename,by=c("IncomeState" = "state.abb"))
## Warning in inner_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## character vector and factor, coercing into character vector
newresults<-merge(newresults,map_data("state"), by.x='V2', by.y='region')



p <- ggplot()
p <- p + geom_polygon(data=newresults, aes(x=long, y=lat, group = group, fill=newresults$total_income),
                      colour="lightgreen"
) + scale_fill_continuous(low = "thistle2", high = "darkgreen", guide="colorbar")

P1 <- p + theme_bw()  + labs(fill = "AGI" 
                             ,title = "AGI by State for Starbucks Expansion", x="", y="")
P1 + scale_y_continuous(breaks=c()) + scale_x_continuous(breaks=c()) + theme(panel.border =  element_blank())