# Load packages:

library(tidycensus)
library(tidyr)
library(sf)
## Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.3.1; sf_use_s2() is TRUE
library(tmap)
## Breaking News: tmap 3.x is retiring. Please test v4, e.g. with
## remotes::install_github('r-tmap/tmap')
library(jsonlite)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.1     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()  masks stats::filter()
## ✖ purrr::flatten() masks jsonlite::flatten()
## ✖ dplyr::lag()     masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(httr)
library(jsonlite)
library(reshape2)
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(here)
## here() starts at C:/Users/wpgeorgia/Documents/GT MSUA/CP 8883/Intro to UA R Projects
library(knitr)
library(skimr)
library(units)
## udunits database from C:/Users/wpgeorgia/AppData/Local/R/win-library/4.4/units/share/udunits/udunits2.xml
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(ggalt)
## Registered S3 methods overwritten by 'ggalt':
##   method                  from   
##   grid.draw.absoluteGrob  ggplot2
##   grobHeight.absoluteGrob ggplot2
##   grobWidth.absoluteGrob  ggplot2
##   grobX.absoluteGrob      ggplot2
##   grobY.absoluteGrob      ggplot2
library(ggplot2)
library(ggtext)
library(ggpubr)
library(readr)
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:httr':
## 
##     config
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
# Import data file:

atl_coffee <- read_csv("coffee.csv")
## New names:
## Rows: 363 Columns: 14
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): county dbl (13): ...1, GEOID, hhincome, pct_pov, review_count, avg_rating,
## pop, avg...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`



Plot 1

ggplot(data = atl_coffee, aes(x=avg_rating, y=hhincome, group=avg_rating)) +
  geom_boxplot(size=0.4, outlier.size=1)


Plot 1 displays a box-and-whisker graph of median household income by census tract compared to average ratings of coffee shops in those same census tracts. A box-and-whisker plot shows the middle 25 to 75 percentiles of the data (within each box) as well as data points beyond that mid-range (which in may be considered outliers). For Atlanta area coffees shops, this plot appears to suggest that as the median income of a tract increases, the quality of the shops (as measured by very non-scientific Yelp ratings) generally increases. Although the top 5-star rated shops do not appear to be concentrated in the higher income tracts.


Plot 2

ggplot(data = atl_coffee, aes(x=avg_rating, y=hhincome, group=avg_rating)) +
  geom_boxplot(size=0.4, outlier.size=1) +
  facet_wrap(~county)


Plot 2 is similar to Plot 1 with box-and-whisker plots of median household income compared to average ratings of coffee shops. Plot 2 adds another dimension to the visual by splitting the 5 counties in the study area into separate graphs. The additional layer of detail provides new insights. Clayton County’s coffee shops are all located in relatively lower income tracts compared to the other counties, and Clayton has 0 5-star shops. With bigger boxes in its plot, Fulton appears to have shops of all ratings spread across a range of different income areas. Dekalb County appears to have the most data points outside its boxes (and higher than its boxes) meaning general opinions of shops in its highest-income areas don’t align with areas closer to the median income.


Plot 3

ggplot(data = atl_coffee) +
  geom_point(mapping = aes(x=review_count_log, y=hhincome, color = pct_white), size=2.5, alpha=0.5) + 
  labs(x = "Review Count (log)",
       y = "Median Annual Household Income",
       title = "Scatterplot: Review Count vs. Household Income") +
  facet_wrap(~county) +
  scale_color_gradient(low = "blue", high = "red",
    name = "Proportion of residents
who self-identified as white") +
  theme_bw() +
  theme(legend.key.size = unit(0.5, 'cm'),legend.title = element_text(size=8))


Plot 3 displays scatterplots by county of median income by census tract to review counts of coffee shops (scaled using logarithimic calculations). The plot also adds another component - residents who identify as white (as shown by the color of each plot point). To start, across all of the plots, it’s immediately clear that income and race are correlated. Also, Fulton county - which has the most dots in its plot - has the most of reviews of any county (and therefore, likely the most shops). Clayton County with the fewest points has the fewest ratings (and likely the fewest shops). Meanwhile Dekalb has more purple and pink dots indicating more racial diversity along with a relatively high number of plot points.


Plot 4

# Make data set long combining four variables into one column:

atl_coffee_long <- pivot_longer(atl_coffee, cols=c("hhincome","pct_white","pop","pct_pov_log"))
ggplot(data=atl_coffee_long, mapping = aes(review_count_log, value)) +
      geom_point(mapping=aes(color=county)) + 
      geom_smooth(mapping = aes(color=county), method = lm, se=FALSE) +
      stat_cor(label.y.npc="top", size=3) +
      facet_wrap(~name, scales='free_y', 
                 labeller = as_labeller(c(hhincome = "Median Annual Household Income ($)", 
                                          pct_white = "Percent White Resident", 
                                          pct_pov_log = "Percent Residents Under Poverty", 
                                          pop = "Total Population"))) +
      theme_bw() +  
      labs(title = "Scatterplot between logged review count & neighborhood characteristics",
           subtitle = "Using Yelp data in five counties around Atlanta, GA",
           y = "Values",
           x = "Review Count Logged",
           color = "County") +
      theme(plot.title = element_text(size=10), 
            plot.subtitle = element_text(size=8),
            axis.title.x = element_text(size=8),
            axis.title.y = element_text(size=8),
            legend.title = element_text(size=8),
            legend.text = element_text(size=7)) 
## `geom_smooth()` using formula = 'y ~ x'


Plot 4 compares 4 different variables against coffee shop review counts. The plots’ points are colored by county, and regression lines for each county’s points are included in each plot. First, 2 of the 4 variables have p-values from their regressions below 0.05 indicating a high likelihood that the variables are good indicators. Those variables are Percent of Residents Under the Poverty Line and Percent of White Residents, and not surprisingly, the number of reviews (and also likely the number of shops) appears to be highly correlated with these variables. Also not surprisingly, total population (which doesn’t seem to be a descriptive enough metric) is the variable with the lowest correlation to review counts.