library(tidytransit)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.1
## ✔ readr   2.1.2     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(tmap)
library(ggplot2)
library(here)
## here() starts at C:/Users/CP8883/CP8883
library(units)
## udunits database from C:/Users/Mary Jane Leach/AppData/Local/R/win-library/4.2/units/share/udunits/udunits2.xml
library(sf)
## Linking to GEOS 3.9.1, GDAL 3.4.3, PROJ 7.2.1; sf_use_s2() is TRUE
library(leaflet)
library(tidycensus)
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(tidygraph)
## 
## Attaching package: 'tidygraph'
## 
## The following object is masked from 'package:stats':
## 
##     filter
library(leafsync)
library(here)
library(ggdark)
library(ggpubr)
# Load the coffee shop data set. The path is built with here::here() so it is
# resolved relative to the project root rather than a machine-specific
# absolute path; the file is expected at <project root>/Mini4/coffee.csv.
coffee <- read.csv(here::here("Mini4", "coffee.csv"))
# Preview the first rows to confirm the columns were read as expected.
head(coffee)
##   X       GEOID         county hhincome    pct_pov review_count avg_rating
## 1 1 13063040202 Clayton County    33276 0.20134228     57.00000          2
## 2 2 13063040308 Clayton County    28422 0.21071800     13.00000          3
## 3 3 13063040407 Clayton County    49271 0.10825507     29.33333          2
## 4 4 13063040408 Clayton County    44551 0.18095661     20.00000          4
## 5 5 13063040410 Clayton County    49719 0.11468019     41.00000          1
## 6 6 13063040411 Clayton County    57924 0.09068942     18.00000          2
##   race.tot avg_price  pct_white hhincome_log review_count_log pct_pov_log
## 1     2850         1 0.07508772     10.41289         4.060443   -1.554276
## 2     4262         1 0.26067574     10.25527         1.975622   -1.510869
## 3     4046         1 0.20514088     10.80529         3.320837   -2.134911
## 4     8489         1 0.16868889     10.70461         3.044522   -1.655709
## 5     7166         1 0.19369244     10.81434         3.737670   -2.082003
## 6    13311         1 0.16512659     10.96706         2.944439   -2.295715
##   yelp_n
## 1      1
## 2      2
## 3      3
## 4      1
## 5      1
## 6      1

Generating plots from Yelp and Census tract data in order to evaluate coffee shops as points of interest (POIs).

More specifically, this data analysis will evaluate coffee shop businesses located within the following counties in Metro Atlanta: Fulton, DeKalb, Clayton, Cobb, and Gwinnett County.

# Base-graphics boxplot of household income grouped by average coffee shop
# rating. Columns are looked up through `data =` instead of repeating
# `coffee$`. The y-axis label says "Median" to agree with how hhincome is
# described in the rest of the analysis.
# Note: `col` is recycled if there are more rating groups than colours.
boxplot(hhincome ~ avg_rating,
  data = coffee,
  main = "Boxplot: Coffee Shop Ratings by Income",
  xlab = "Average Coffee Shop Rating",
  ylab = "Median Household Income",
  col = c("green", "Grey", "orange", "pink"))

As shown in the above boxplot, average ratings of 2 and 4 have extreme values that skew the data positively.

##This is the case to a lesser extent with 5 star ratings.

##These outliers are more pronounced as median household income goes above $100,000.

# ggplot(coffee) + 
#   geom_boxplot(mapping = aes(x = avg_rating, y = hhincome, color = "salmon")) +
#   facet_wrap(~county) #<<
# Boxplots of household income by average rating, one panel per county.
# avg_rating is numeric, so it is converted to a factor here: with a
# continuous x aesthetic geom_boxplot() draws a single box per panel and
# warns "Continuous x aesthetic -- did you forget aes(group=...)?".
bxplot <- ggplot(data = coffee) +
  geom_boxplot(
    aes(x = factor(avg_rating), y = hhincome),
    color = "salmon", fill = "white"
  ) +
  facet_wrap(~county) +
  # Explicit labels, otherwise the x axis would read "factor(avg_rating)".
  labs(x = "Average Coffee Shop Rating", y = "Household Income")

# Convert the static ggplot into an interactive plotly widget.
plotly::ggplotly(bxplot)
## Warning: Continuous x aesthetic -- did you forget aes(group=...)?

Fulton, DeKalb, and Cobb County have outliers with respect to incomes over $100,000.

These values are more extreme for 4 star average ratings and these outliers are skewing the data in that direction.

# Scatterplot of log review count (x) against household income (y), drawn as
# one panel per county. Points are coloured by pct_white using a binned
# viridis scale.
ggplot(coffee, aes(review_count_log, hhincome)) +
  geom_point(aes(colour = pct_white)) +
  scale_colour_viridis_b(
    alpha = 1, begin = 0, end = 1, direction = 1, option = "D",
    values = NULL, space = "Lab", na.value = "grey50",
    guide = "coloursteps", aesthetics = "colour"
  ) +
  facet_wrap(vars(county)) +
  labs(
    title = "Scatterplot: Review Count Vs. Household Income",
    x = "Review Count (log)",
    y = "Median Annual Household Income",
    colour = "Proportion of residents who self-identify as white"
  )

In Cobb, DeKalb, Fulton, and to a lesser degree Gwinnett County, the proportion of residents who self-identify as white increases as median household income reaches $100,000 or more.

# Columns to compare against log review count, mapped to the human-readable
# labels shown on each facet strip (names = column names, values = labels).
mycols <- c(
  "pct_pov_log" = "% poverty (log)",
  "hhincome" = "Household Income",
  "pct_white" = "% white",
  "race.tot" = "Total Population"
)

library(ggpubr)
# Reshape the selected columns to long form (one row per observation and
# variable), then draw one panel per variable: points and per-county linear
# fits, plus a Pearson correlation annotation computed per panel.
coffee %>%
  pivot_longer(
    # all_of() makes the selection by an external character vector explicit.
    cols = all_of(names(mycols)),
    names_to = "variable",
    values_to = "value"
  ) %>%
  ggplot(data = ., mapping = aes(x = review_count_log, y = value)) +
  geom_point(aes(color = county)) +
  # FALSE rather than F: F is an ordinary variable that can be reassigned.
  geom_smooth(aes(color = county), method = "lm", se = FALSE) +
  ggpubr::stat_cor(method = "pearson") +
  # Each variable has its own scale, so let the y axis vary by panel.
  facet_wrap(~variable, scales = "free_y", labeller = as_labeller(mycols)) +
  theme_light()
## `geom_smooth()` using formula 'y ~ x'

Multivariate Regression Conclusions: Figures 1-4 indicate that there is a positive association between household income and positive reviews (4-6 ratings) in Dekalb and Fulton.

There is a negative association between the share of residents under poverty and ratings in DeKalb and Fulton: poverty correlates negatively with 4-6 rating scores.

Further, there is a strong association between percent white residents and positive ratings in Dekalb and Fulton, and a slight positive correlation for Clayton, Gwinnett, and Cobb.

This association might situate coffee shops (POIs) as positive indicators of businesses that cater to the tastes of self-identified white residents.