Intro

Starbucks is one of the best known brands in the world with thousands of locations. There are still zipcodes in the United States without a way to get your hyper-caffienated-four-dollar fix. Can we use data to make an informed decision about where Starbucks could expand next?

The train of thought is that Starbucks locations tend to be found in affluent and populous areas. By using publicly available data, we can overlay income data from zip codes and compare them by set: those having and lacking a Starbucks.

Note: this presentation was given to the University of Georgia in February 2017; it was not put into a an R Markdown presentation so that more code could be demonstrated.

The Data

Starbucks Locaitons in the U.S. (https://opendata.socrata.com/Business/All-Starbucks-Locations-in-the-US-Map/ddym-zvjk)
IRS Income Data Statistics by Zip Code for two years.
Zipcodes Package (for cleaning up and providing Lat-Lon info)

The Data Loading Code

library(readxl)
library(readr)
library(sqldf)
library(dplyr)
library(tidyr) #for reshaping data
library(zipcode)
library(plotly)
library(tidyr)
library(reshape2)
library(dplyr)
library(ggplot2)
library(plotly)

#clean up all variables before running from the top
rm(list=ls(all=TRUE))
#load up the zipcode data to lean the data
data(zipcode)

#read in the startbucks data
starbuckslocations <- read_csv("~/UGA/Thinking in SQL/TIS_UGA/data/starbuckslocations.csv")
starbuckslocations$Zip = clean.zipcodes(starbuckslocations$Zip)

#note: always, always, always assume that your data is dirty.  Some of the zip codes from the raw file had their
#leading zero pulled off for zips in the northeast (3904-> 03904)

#create a 5 character zip column 
starbuckslocations$zip_five <-substr(starbuckslocations$Zip,1,5)

#summarize the number of stores by zipcode
starbuckslocations<-sqldf("select count(*) as storecount,State,zip_five from starbuckslocations 
                          group by State,zip_five")

knitr::kable(head(starbuckslocations,10))

storecount	State	zip_five
1	AK	99501
3	AK	99502
1	AK	99503
3	AK	99504
1	AK	99505
1	AK	99506
3	AK	99507
3	AK	99508
4	AK	99515
1	AK	99517

#View(sqldf("select * from starbuckslocations"))
#how many starbucks are in each state?
#View(sqldf("select count(*) as storecount,state from starbuckslocations group by state order by count(*) desc"))

#sum up the total income by zip code from the IRS Data
#https://www.irs.gov/uac/soi-tax-stats-individual-income-tax-statistics-2014-zip-code-data-soi
#and https://www.irs.gov/uac/soi-tax-stats-individual-income-tax-statistics-2013-zip-code-data-soi

#agi stub could be very useful because the bigger the number the more people have
#higher incomes.  #lesson : READ THE DATA DICTIONARY.
columnlist<-c("STATE","N1", "zipcode","agi_stub","A02650","YEAR")
if(!exists("Income2013"))
{
Income2013 <- read_csv("~/UGA/Thinking in SQL/TIS_UGA/data/13zpallagi.csv")
Income2013$YEAR<-2013
Income2013<-select(Income2013, one_of(columnlist))
Income2013$zipcode<-clean.zipcodes(Income2013$zipcode)
}

if(!exists("Income2014"))
{
Income2014<-read_csv("~/UGA/Thinking in SQL/TIS_UGA/data/14zpallagi.csv")
Income2014$YEAR<-2014
Income2014<-select(Income2014, one_of(columnlist))
Income2014$zipcode<-clean.zipcodes(Income2014$zipcode)
}

IncomeData<-rbind(Income2013,Income2014)
IncomeData$IncomeState<-IncomeData$STATE
IncomeData <- subset(IncomeData, select = -c(STATE) )
IncomeData$TotalIncome <- IncomeData$A02650
IncomeData <- subset(IncomeData, select = -c(A02650) )
#get rid of any income data with zip code 99999; it means they had less than 100 returns
#this could also be single resident zip codes
#•  ZIP codes with less than 100 returns and those identified as a single 
#building or nonresidential ZIP code were categorized as “other” (99999).
IncomeData<-sqldf("select * from IncomeData where zipcode not in ('99999','00000')")

rm(Income2014)
rm(Income2013)
#join the starbucks dataset to the IRS income dataset
# I need one row per zip code for the starbucks locations.

#first recognize a good problem: there are many things you can learn from this data
#and its easy to go 100 different directions and get NO WHERE
#the data is at different levels, you will need to clean, aggregate, and average
#before you can analyze

IncomeData$zipcode<-as.character(IncomeData$zipcode)


#join the zip code data and the starbucks locaiton data by the cleaned up zipcode column.
all_data<-full_join(IncomeData,starbuckslocations,by=c("zipcode"="zip_five"),copy=TRUE)

#data quality check: count the numbe of total observations
total_rows<-nrow(IncomeData)
#how many starbucks locations do not have income associated
sb_zip_no_income<-nrow(dplyr::filter(all_data,is.na(`N1`)))
sb_zip_no_income_pct = (sb_zip_no_income/total_rows)*100
#hom many income zip codes do not have starbucks
income_zip_no_sb<-nrow(dplyr::filter(all_data,is.na(`storecount`)))
income_zip_no_sb_pct = (income_zip_no_sb/total_rows)*100

#View(sqldf("select * from all_data where storecount is null"))


#analysis and visualization ideas
#put the number of starbucks on the map by zip code and see how they correlate with a heat map.
#do a linear model predicting the number of starbucks in a zip code by income
set.seed(8265)

na.zero <- function (x) {
  x[is.na(x)] <- 0
  return(x)
}
#analysis and visualization ideas
#put the number of starbucks on the map by zip code and see how they correlate with a heat map.
#Idea: do a linear model predicting the number of starbucks in a zip code by income

all_data$storecount<-na.zero(all_data$storecount)

#filter these records out. income data is a critical factor
#View(sqldf("select * from all_data where IncomeState is null"))
all_data <- filter(all_data, !is.na(IncomeState))


income_data<-tidyr::spread(all_data,agi_stub,TotalIncome)
income_data<-income_data[,1:12]
#income_data<-filter(income_data,!is.na(IncomeState))
return_count<-tidyr::spread(all_data,agi_stub,N1)
return_count<-return_count[,1:12]
#return_count<-filter(return_count,!is.na(IncomeState))
income_data$storecount<-na.zero(income_data$storecount)
income_data$`1`<-na.zero(income_data$`1`)
income_data$`2`<-na.zero(income_data$`2`)
income_data$`3`<-na.zero(income_data$`3`)
income_data$`4`<-na.zero(income_data$`4`)
income_data$`5`<-na.zero(income_data$`5`)
income_data$`6`<-na.zero(income_data$`6`)
return_count$storecount<-na.zero(return_count$storecount)
return_count$`1`<-na.zero(return_count$`1`)
return_count$`2`<-na.zero(return_count$`2`)
return_count$`3`<-na.zero(return_count$`3`)
return_count$`4`<-na.zero(return_count$`4`)
return_count$`5`<-na.zero(return_count$`5`)
return_count$`6`<-na.zero(return_count$`6`)


names(income_data)[7]<-"agi_one_income"
names(income_data)[8]<-"agi_two_income"
names(income_data)[9]<-"agi_three_income"
names(income_data)[10]<-"agi_four_income"
names(income_data)[11]<-"agi_five_income"
names(income_data)[12]<-"agi_six_income"

names(return_count)[7]<-"agi_one_nt"
names(return_count)[8]<-"agi_two_nt"
names(return_count)[9]<-"agi_three_nt"
names(return_count)[10]<-"agi_four_nt"
names(return_count)[11]<-"agi_five_nt"
names(return_count)[12]<-"agi_six_nt"

Data Slices

library(xtable)

#datatable looking at income data and the total AGI for each zipcode

df1<-(sqldf("select zipcode,year,incomestate, 
                 sum(agi_one_income)+sum(agi_two_income)+sum(agi_three_income)
                  +sum(agi_four_income)+sum(agi_five_income)+sum(agi_six_income) as total_income, 
                 sum(N1) as total_returns_filed,  sum(storecount) as storecount,
                 sum(agi_one_income) as agi_one,
                 sum(agi_two_income) as agi_two,
                 sum(agi_three_income) as agi_three,
                 sum(agi_four_income) as agi_four,
                 sum(agi_five_income) as agi_five,
                 sum(agi_six_income) as agi_six
                 from income_data 
                 group by zipcode,year,incomestate"))

knitr::kable(head(df1,10))

zipcode	YEAR	IncomeState	total_income	total_returns_filed	storecount	agi_one	agi_two	agi_three	agi_four	agi_five	agi_six
01001	2013	MA	477601	8780	0	35047	82453	94284	79361	146901	39555
01001	2014	MA	494972	8880	0	34289	83204	96087	79716	152951	48725
01002	2013	MA	762298	9570	6	41057	62546	73550	74571	229774	280800
01002	2014	MA	802824	9640	6	40861	64687	70073	71996	241834	313373
01005	2013	MA	129645	2230	0	8456	19902	23521	24843	41184	11739
01005	2014	MA	133441	2280	0	8534	18901	25005	23646	44315	13040
01007	2013	MA	489020	7300	0	26288	54890	66259	76068	191131	74384
01007	2014	MA	513575	7390	0	26386	52448	65329	79559	199827	90026
01008	2013	MA	38867	540	0	2140	5713	6995	8760	15259	0
01008	2014	MA	40175	640	0	2024	4749	6703	7898	18801	0

#number of tax returns filed for each income bucket

df2<-(sqldf("select zipcode,year,incomestate, 
                 sum(agi_one_nt)+sum(agi_two_nt)+sum(agi_three_nt)
                  +sum(agi_four_nt)+sum(agi_five_nt)+sum(agi_six_nt) as total_nt, 
                   sum(storecount) as storecount,
                 sum(agi_one_nt) as nt_one,
                 sum(agi_two_nt) as nt_two,
                 sum(agi_three_nt) as nt_three,
                 sum(agi_four_nt) as nt_four,
                 sum(agi_five_nt) as nt_five,
                 sum(agi_six_nt) as nt_six
                 from return_count 
                 group by zipcode,year,incomestate"))
knitr::kable(head(df2,10))

zipcode	YEAR	IncomeState	total_nt	storecount	nt_one	nt_two	nt_three	nt_four	nt_five	nt_six
01001	2013	MA	8780	0	2910	2190	1520	910	1120	130
01001	2014	MA	8880	0	2890	2210	1550	920	1160	150
01002	2013	MA	9570	6	3550	1700	1160	840	1630	690
01002	2014	MA	9640	6	3510	1760	1100	810	1710	750
01005	2013	MA	2230	0	690	540	370	280	310	40
01005	2014	MA	2280	0	720	500	400	270	340	50
01007	2013	MA	7300	0	2260	1470	1050	860	1420	240
01007	2014	MA	7390	0	2300	1400	1040	900	1470	280
01008	2013	MA	640	0	180	150	110	100	100	0
01008	2014	MA	640	0	190	130	110	90	120	0

#join the two datasets on income and store cout
#tag the rows where there are zip cods and storecount is 0
df3<-inner_join(df1,df2,by=c("zipcode","YEAR","IncomeState"))
#tag zipcodes where there is no starbucks store
df3$has_starbucks<-df3$storecount.x > 0
#print a sample of the data
knitr::kable(head(df3,10))

zipcode	YEAR	IncomeState	total_income	total_returns_filed	storecount.x	agi_one	agi_two	agi_three	agi_four	agi_five	agi_six	total_nt	storecount.y	nt_one	nt_two	nt_three	nt_four	nt_five	nt_six	has_starbucks
01001	2013	MA	477601	8780	0	35047	82453	94284	79361	146901	39555	8780	0	2910	2190	1520	910	1120	130	FALSE
01001	2014	MA	494972	8880	0	34289	83204	96087	79716	152951	48725	8880	0	2890	2210	1550	920	1160	150	FALSE
01002	2013	MA	762298	9570	6	41057	62546	73550	74571	229774	280800	9570	6	3550	1700	1160	840	1630	690	TRUE
01002	2014	MA	802824	9640	6	40861	64687	70073	71996	241834	313373	9640	6	3510	1760	1100	810	1710	750	TRUE
01005	2013	MA	129645	2230	0	8456	19902	23521	24843	41184	11739	2230	0	690	540	370	280	310	40	FALSE
01005	2014	MA	133441	2280	0	8534	18901	25005	23646	44315	13040	2280	0	720	500	400	270	340	50	FALSE
01007	2013	MA	489020	7300	0	26288	54890	66259	76068	191131	74384	7300	0	2260	1470	1050	860	1420	240	FALSE
01007	2014	MA	513575	7390	0	26386	52448	65329	79559	199827	90026	7390	0	2300	1400	1040	900	1470	280	FALSE
01008	2013	MA	38867	540	0	2140	5713	6995	8760	15259	0	640	0	180	150	110	100	100	0	FALSE
01008	2014	MA	40175	640	0	2024	4749	6703	7898	18801	0	640	0	190	130	110	90	120	0	FALSE

#this is just for formatting and creating a legend
# 1 = $1 under $25,000
# 2 = $25,000 under $50,000
# 3 = $50,000 under $75,000
# 4 = $75,000 under $100,000
# 5 = $100,000 under $200,000
# 6 = $200,000 or more
label = c("agi_one","agi_two","agi_three","agi_four","agi_five","agi_six")
bucket = c("1 under 25,000","25,000 under 50,000","50,000 under 75,000","75,000 under 100,000","100,000 under 200,000","200,000 or more")
position = c(1,2,3,4,5,6)
dflegend<-data.frame(label,bucket,position,stringsAsFactors = FALSE)

The Analysis Code

Starbucks tends to target more affluent areas, or households filing in the upper buckets of our data distribution. This analysis will look at agi buckets four through six for 2014.

#change the shape of the data before we plot and analyze
dfmeltbase<-melt(df3,measure.vars=7:12)%>%filter(YEAR==2014 & variable %in% c("agi_four", "agi_five","agi_six"))

head(dfmeltbase,10)

##    zipcode YEAR IncomeState total_income total_returns_filed storecount.x
## 1    01001 2014          MA       494972                8880            0
## 2    01002 2014          MA       802824                9640            6
## 3    01005 2014          MA       133441                2280            0
## 4    01007 2014          MA       513575                7390            0
## 5    01008 2014          MA        40175                 640            0
## 6    01010 2014          MA       128916                1840            0
## 7    01011 2014          MA        31224                 610            0
## 8    01012 2014          MA        22684                 310            0
## 9    01013 2014          MA       415570               10420            0
## 10   01020 2014          MA       697760               15020            6
##    total_nt storecount.y nt_one nt_two nt_three nt_four nt_five nt_six
## 1      8880            0   2890   2210     1550     920    1160    150
## 2      9640            6   3510   1760     1100     810    1710    750
## 3      2280            0    720    500      400     270     340     50
## 4      7390            0   2300   1400     1040     900    1470    280
## 5       640            0    190    130      110      90     120      0
## 6      1840            0    570    350      320     180     360     60
## 7       610            0    210    140      110      70      80      0
## 8       370            0    120     90       60      40      60      0
## 9     10420            0   4460   3020     1510     770     610     50
## 10    15020            6   5560   4160     2410    1460    1330    100
##    has_starbucks variable  value
## 1          FALSE agi_four  79716
## 2           TRUE agi_four  71996
## 3          FALSE agi_four  23646
## 4          FALSE agi_four  79559
## 5          FALSE agi_four   7898
## 6          FALSE agi_four  15611
## 7          FALSE agi_four   5696
## 8          FALSE agi_four   3011
## 9          FALSE agi_four  65987
## 10          TRUE agi_four 127031

Look at the difference between the two tables below and it should be appearant what the melt function has done.

BEFORE melt

knitr::kable(head(df3,10))

zipcode	YEAR	IncomeState	total_income	total_returns_filed	storecount.x	agi_one	agi_two	agi_three	agi_four	agi_five	agi_six	total_nt	storecount.y	nt_one	nt_two	nt_three	nt_four	nt_five	nt_six	has_starbucks
01001	2013	MA	477601	8780	0	35047	82453	94284	79361	146901	39555	8780	0	2910	2190	1520	910	1120	130	FALSE
01001	2014	MA	494972	8880	0	34289	83204	96087	79716	152951	48725	8880	0	2890	2210	1550	920	1160	150	FALSE
01002	2013	MA	762298	9570	6	41057	62546	73550	74571	229774	280800	9570	6	3550	1700	1160	840	1630	690	TRUE
01002	2014	MA	802824	9640	6	40861	64687	70073	71996	241834	313373	9640	6	3510	1760	1100	810	1710	750	TRUE
01005	2013	MA	129645	2230	0	8456	19902	23521	24843	41184	11739	2230	0	690	540	370	280	310	40	FALSE
01005	2014	MA	133441	2280	0	8534	18901	25005	23646	44315	13040	2280	0	720	500	400	270	340	50	FALSE
01007	2013	MA	489020	7300	0	26288	54890	66259	76068	191131	74384	7300	0	2260	1470	1050	860	1420	240	FALSE
01007	2014	MA	513575	7390	0	26386	52448	65329	79559	199827	90026	7390	0	2300	1400	1040	900	1470	280	FALSE
01008	2013	MA	38867	540	0	2140	5713	6995	8760	15259	0	640	0	180	150	110	100	100	0	FALSE
01008	2014	MA	40175	640	0	2024	4749	6703	7898	18801	0	640	0	190	130	110	90	120	0	FALSE

AFTER melt: the agi column names have now become rows. This allows for pivotting the data into the right shape.

knitr::kable(head(dfmeltbase,10))

zipcode	YEAR	IncomeState	total_income	total_returns_filed	storecount.x	total_nt	storecount.y	nt_one	nt_two	nt_three	nt_four	nt_five	nt_six	has_starbucks	variable	value
01001	2014	MA	494972	8880	0	8880	0	2890	2210	1550	920	1160	150	FALSE	agi_four	79716
01002	2014	MA	802824	9640	6	9640	6	3510	1760	1100	810	1710	750	TRUE	agi_four	71996
01005	2014	MA	133441	2280	0	2280	0	720	500	400	270	340	50	FALSE	agi_four	23646
01007	2014	MA	513575	7390	0	7390	0	2300	1400	1040	900	1470	280	FALSE	agi_four	79559
01008	2014	MA	40175	640	0	640	0	190	130	110	90	120	0	FALSE	agi_four	7898
01010	2014	MA	128916	1840	0	1840	0	570	350	320	180	360	60	FALSE	agi_four	15611
01011	2014	MA	31224	610	0	610	0	210	140	110	70	80	0	FALSE	agi_four	5696
01012	2014	MA	22684	310	0	370	0	120	90	60	40	60	0	FALSE	agi_four	3011
01013	2014	MA	415570	10420	0	10420	0	4460	3020	1510	770	610	50	FALSE	agi_four	65987
01020	2014	MA	697760	15020	6	15020	6	5560	4160	2410	1460	1330	100	TRUE	agi_four	127031

Box plot and compare the two types of zip codes (those having vs. those lacking a Starbucks).

#join the melted data and legend, sort the data by the position of the labels in the legend.
dfmelt<-  inner_join(dfmeltbase, dflegend,by=c("variable" = "label"))%>%
arrange(position)%>%
select(one_of(c("variable", "value","has_starbucks","bucket","zipcode","IncomeState")))

## Warning in inner_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## factor and character vector, coercing into character vector

#boxplot the data to see how the data is distributed among various dimensions
#this line is going to subdivide the data (income, number of returns etc into different levels that 
#can be evaluated indepenently.  This will become apparent boxplot.)
dfmelt$bucketf = factor(dfmelt$bucket,levels=c("75,000 under 100,000","100,000 under 200,000","200,000 or more"))

For easier following of this code chunk, look at the table immeidately above and follow where the column names fall into the function parameters

knitr::kable(head(dfmelt,10))

variable	value	has_starbucks	bucket	zipcode	IncomeState	bucketf
agi_four	79716	FALSE	75,000 under 100,000	01001	MA	75,000 under 100,000
agi_four	71996	TRUE	75,000 under 100,000	01002	MA	75,000 under 100,000
agi_four	23646	FALSE	75,000 under 100,000	01005	MA	75,000 under 100,000
agi_four	79559	FALSE	75,000 under 100,000	01007	MA	75,000 under 100,000
agi_four	7898	FALSE	75,000 under 100,000	01008	MA	75,000 under 100,000
agi_four	15611	FALSE	75,000 under 100,000	01010	MA	75,000 under 100,000
agi_four	5696	FALSE	75,000 under 100,000	01011	MA	75,000 under 100,000
agi_four	3011	FALSE	75,000 under 100,000	01012	MA	75,000 under 100,000
agi_four	65987	FALSE	75,000 under 100,000	01013	MA	75,000 under 100,000
agi_four	127031	TRUE	75,000 under 100,000	01020	MA	75,000 under 100,000

ggplot(dfmelt,  aes(x=has_starbucks, y=value,fill=bucket))+
  geom_boxplot()+
  geom_point(aes(text=paste("zipcode:",zipcode)))+
  #facet_grid(.~bucket)
  facet_grid(.~bucketf)

## Warning: Ignoring unknown aesthetics: text

Now let’s take a percentile of zip codes without a starbucks and those that are above the 75th percentile in income could qualify for expansion.

dfntile<-  inner_join(dfmeltbase, dflegend,by=c("variable" = "label"))

## Warning in inner_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## factor and character vector, coercing into character vector

#df %>% group_by(a) %>% summarise_each(funs(sum)) the line below was adapted from this line 
##this line is very powerful, what it is doing is grouping by each bucket and then for each one
##finding the given percentile. YOU CANNOT DO THIS NATIVELY IN SQL OR ON LIGHTER SQL PLATFORMS BECAUSE
##FINDING THE N-TILE INVOLVES SORTING THE DATA. R does this inherently; quantile could be replaced
##with any other analytic function.
dfresult<-dfntile %>% group_by(bucket) %>% summarise(ntile_val=(quantile(value,.75)))


#join the dfmelt "table" with the dfresult table and get all zip codes without starbucks for each bucket
#that have a gross reported income of at least the ntile value or more (NOTE: this can be made dynamic)

dfnewzips<-inner_join (dfmelt,dfresult,by=c("bucket"))%>%
  filter(value >= ntile_val & has_starbucks==FALSE)%>%select(one_of(c("variable","bucket", "value","ntile_val","zipcode","IncomeState")))%>%
  arrange(desc(value))

dfNewLocationList<-melt(df3,measure.vars=7:12)%>%
  filter(YEAR==2014 & variable %in% c("agi_four", "agi_five","agi_six"))%>%
  filter(paste0(variable,zipcode) %in% paste0(dfnewzips$variable,dfnewzips$zipcode))

#data quality check: google results show over 2000 zipcodes in New York; 692 of them could be new starbucks locations
results<-sqldf("select sum(distinct total_income) as total_income,sum(distinct total_returns_filed) as total_returns_filed,
count(distinct zipcode) as num_zip_codes,IncomeState from dfNewLocationList
      group by incomestate order by sum(distinct total_income) desc")

knitr::kable(results)

total_income	total_returns_filed	num_zip_codes	IncomeState
258703869	3982110	347	NY
205334738	3056670	279	FL
195071936	2953070	303	TX
172719823	2056340	255	NJ
153988449	2190490	247	PA
129695594	1772080	170	CA
126660729	1452680	195	MA
95924930	1565740	167	MI
95247661	1623630	159	OH
93602662	1543500	147	IL
79139573	1354490	125	NC
71626969	992880	110	MD
70166183	926600	100	MN
64974518	1238700	101	GA
61314763	671430	100	CT
60381301	753290	88	VA
56242570	863760	105	WI
47235402	817800	77	TN
44115904	750860	84	IN
42809718	691930	68	MO
40368965	721350	66	SC
39332842	658240	72	LA
34258296	555830	68	AL
33025281	571890	53	KY
32370315	478380	45	UT
31994741	486130	60	OK
25181185	338400	44	CO
24493015	383620	47	IA
23579667	413700	42	MS
23342503	283230	50	NH
22237821	270010	43	KS
19502833	342000	33	AR
17936207	270770	36	NE
16450029	215650	32	WA
15632760	270410	26	RI
15487883	244990	26	AZ
14888461	238450	23	DE
13390420	234370	21	NM
11044169	178110	18	NV
10598177	167570	22	WV
9828218	169780	17	ID
8756609	98420	13	ND
7828026	136340	17	OR
7571944	115890	15	MT
6639075	96220	15	VT
5906731	84740	14	ME
4751236	58280	10	WY
4337789	67890	7	HI
4037840	82850	4	DC
3887930	44450	7	AK
3242484	48010	7	SD

Data Visuzliation

Visualizing the data can allow us to see how different states and markets relate to each other with respect to income and population. The first example is a scatter plot of states; the reference lines mark the 75th percentile of income and population.

#plot the income by population with a reference line to show the 75th percentile of income
ggplot

## function (data = NULL, mapping = aes(), ..., environment = parent.frame()) 
## {
##     UseMethod("ggplot")
## }
## <environment: namespace:ggplot2>

ggplot(results,aes(x=total_income,y=total_returns_filed,label=IncomeState)) + 
  geom_point() + geom_text(aes(label=IncomeState),hjust=0,vjust=0)+geom_vline(xintercept = quantile(results$total_income,.75)) + geom_hline(yintercept = quantile(results$total_returns_filed,.75))

Another way to visualize the data is through mapping. State boundary data with lat-long coordiantes are built into the state R package. Joining lat-long data to the state names in the state result dataset enables plotting in ggplot to render a map of the United States with state boundaries filled in where expansion opportunities are the greatest. Compare this map with the scatter plot shown above.

data(state)
statename<-as.data.frame(cbind(state.abb,tolower(state.name)))
newresults<-inner_join(results,statename,by=c("IncomeState" = "state.abb"))

## Warning in inner_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## character vector and factor, coercing into character vector