library(tidyverse)
library(sf)
library(tmap)
library(leaflet)
library(here)
library(tidycensus)
library(sf)
library(tmap)
library(jsonlite)
library(tidyverse)
library(httr)
library(jsonlite)
library(knitr)
library(tigris)
library(places)
library(grid)

Read downloaded data

# Read the coffee data set from the working directory into a data frame
# and print its first six rows as a quick check of the imported columns.
df <- read.csv(file = "coffee.csv")
head(df, n = 6L)
##   X       GEOID         county hhincome    pct_pov review_count avg_rating
## 1 1 13063040202 Clayton County    33276 0.20134228     57.00000          2
## 2 2 13063040308 Clayton County    28422 0.21071800     13.00000          3
## 3 3 13063040407 Clayton County    49271 0.10825507     29.33333          2
## 4 4 13063040408 Clayton County    44551 0.18095661     20.00000          4
## 5 5 13063040410 Clayton County    49719 0.11468019     41.00000          1
## 6 6 13063040411 Clayton County    57924 0.09068942     18.00000          2
##   race.tot avg_price  pct_white hhincome_log review_count_log pct_pov_log
## 1     2850         1 0.07508772     10.41289         4.060443   -1.554276
## 2     4262         1 0.26067574     10.25527         1.975622   -1.510869
## 3     4046         1 0.20514088     10.80529         3.320837   -2.134911
## 4     8489         1 0.16868889     10.70461         3.044522   -1.655709
## 5     7166         1 0.19369244     10.81434         3.737670   -2.082003
## 6    13311         1 0.16512659     10.96706         2.944439   -2.295715
##   yelp_n
## 1      1
## 2      2
## 3      3
## 4      1
## 5      1
## 6      1

02 Recreate the plot

Plot 1. avg_rating vs. hhincome

# Boxplot of household income (hhincome) at each average-rating level.
# avg_rating is wrapped in factor() so each rating value gets its own box.
bxplot <- ggplot(df) +
  geom_boxplot(
    mapping = aes(x = factor(avg_rating), y = hhincome),
    fill = "white",
    colour = "black"
  )
bxplot

Plot 2. avg_rating, hhincome, county

# Income-by-rating boxplot, drawn as one panel per county.
df %>%
  ggplot() +
  geom_boxplot(
    mapping = aes(x = factor(avg_rating), y = hhincome),
    fill = "white",
    colour = "black"
  ) +
  facet_wrap(facets = vars(county)) +
  xlab("Average Yelp Rating") +
  ylab("Median Annual Household Incomes($)")

Plot2-Finding

The situation varies considerably across the five counties. In Clayton County there are no tracts where the average coffee shop rating is 5, while in Cobb County there are no tracts with an average rating of 1. DeKalb County has more outliers. In Gwinnett County, tracts where the average coffee shop rating is 3 exhibit the highest median annual household income, while in Clayton County tracts where the average rating is 3 exhibit the lowest median annual household income. Fulton County shows considerable variability in household income at each rating level.

Plot 3. Variables used - review_count_log, hhincome, county, pct_white

# Scatterplot of logged review count against median household income,
# coloured by the share of white residents, with one panel per county.
#
# The x axis uses the precomputed review_count_log column, which is the
# variable this plot is specified to use. Re-deriving log(review_count)
# does not reproduce that column (compare the two in head(df)) and would
# yield -Inf for any tract whose review_count is 0.
ggplot(data = df, mapping = aes(x = review_count_log, y = hhincome)) +
  geom_point(
    mapping = aes(color = pct_white),
    alpha = 0.7,
    size = 2
  ) +
  facet_wrap(~county) +
  labs(
    title = "Scatterplot: Review Count vs. Household Income",
    x = "Review Count(log)",
    y = "Median Annual Household Incomes",
    color = "Proportion of residents \n who self-identified as white"
  ) +
  # Continuous blue-to-red ramp for the pct_white colour scale.
  scale_color_gradient(low = "blue", high = "red") +
  theme_bw() +
  theme(
    # Base text size; inherited by titles, axis labels and legend text.
    text = element_text(size = 7),
    axis.text = element_text(size = 5),
    legend.title = element_text(size = 7),
    legend.key.size = unit(0.4, "cm"),
    # Margins are given in the order top, right, bottom, left.
    plot.margin = unit(c(0.3, 0, 0.3, 0), "cm")
  )

Plot3-Finding

We visualized the relationship between the log of review counts and median annual household income, color-coded by the proportion of residents who self-identified as white, across the different counties. In DeKalb and Fulton Counties, tracts with a higher percentage of white residents seem to have higher median household income and more reviews. In Cobb County and Gwinnett County, this trend is less pronounced. In Clayton County this trend is not apparent: the median household income and the percentage of white residents are both low, even though the coffee shops in these tracts are given different levels of rating, from 0 to 4.

Plot 4. Variables used - pct_pov_log, hhincome, pct_white, race.tot, review_count_log, county

# Reshape df to long form: the four neighborhood columns are stacked so
# that their names land in "variable" and their values in "value". All
# other columns are carried along unchanged on every row.
df_long <- tidyr::pivot_longer(
  data = df,
  cols = c(hhincome, pct_pov_log, pct_white, race.tot),
  names_to = "variable",
  values_to = "value"
)
# Labeller for the facet strips: maps each pivoted variable name to the
# human-readable panel title shown in the plot.
ylabeler <- as_labeller(
  x = c(
    "hhincome" = "Median Annual Household Income ($)",
    "pct_pov_log" = "Percent Residents under Poverty",
    "pct_white" = "Percentage White Residents",
    "race.tot" = "Total Population"
  )
)
# Draw logged review count against each of the four neighborhood variables
# (one free-y panel per variable), coloured by county, with a per-county
# linear fit and a per-panel Pearson correlation label.
#
# The x axis uses the precomputed review_count_log column, which is the
# variable this plot is specified to use. Re-deriving log(review_count)
# does not reproduce that column (compare the two in head(df)) and would
# yield -Inf for any tract whose review_count is 0.
#
# x and y are declared once on ggplot() so all three layers share them.
ggplot(data = df_long, mapping = aes(x = review_count_log, y = value)) +
  geom_point(mapping = aes(color = county), size = 0.5) +
  facet_wrap(~variable, scales = "free_y", labeller = ylabeler) +
  # No colour/group aesthetic here, so one correlation is computed per
  # panel across all counties rather than one per county.
  ggpubr::stat_cor(
    label.x.npc = "left",
    label.y.npc = "top",
    color = "black",
    size = 2,
    label.sep = ", ",
    method = "pearson"
  ) +
  # One straight-line fit per county, without confidence ribbons.
  geom_smooth(
    mapping = aes(color = county, group = county),
    method = "lm",
    se = FALSE,
    linewidth = 0.7
  ) +
  labs(
    # The panels cover four neighborhood variables, not just income.
    title = "Scatterplot: Logged Review Count vs. Neighborhood Characteristics",
    subtitle = "Using Yelp data in Five Counties Around Atlanta, GA",
    x = "Review Count(log)",
    y = "Values",
    color = "County"
  ) +
  theme_bw() +
  theme(
    # Base text size; inherited by titles, axis labels and legend text.
    text = element_text(size = 7),
    axis.text = element_text(size = 5),
    legend.title = element_text(size = 7),
    legend.key.size = unit(0.4, "cm"),
    plot.margin = unit(c(0, 0, 0, 0), "cm"),
    panel.spacing = unit(0.3, "lines")
  )
## `geom_smooth()` using formula = 'y ~ x'

Plot4-Finding

By faceting the plot on the four variables (“hhincome”, “pct_pov_log”, “pct_white”, “race.tot”), we can compare how each variable correlates with the review count across the different counties. The geom_smooth(method=“lm”) layers help visualize how a linear model fits the data. For % white there are larger variations among the 5 counties, while for the other variables the plots show similar trends among the 5 counties. With the inclusion of Pearson correlation coefficients and p-values on the plots, we can see that there is a statistically significant linear relationship between the review count and the percentage of white residents, and between the review count and the percentage of residents under poverty, because of the small p-values (<0.005). The positive R value for % white indicates the positive direction of the relationship between the review count and % white, for each county. For the percentage of residents under poverty, however, the negative R value indicates a negative relationship with the review count.