library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(tmap)
##
## Attaching package: 'tmap'
##
## The following object is masked from 'package:datasets':
##
## rivers
library(leaflet)
library(ggplot2)
library(ggpubr)
# Load the coffee-shop dataset (coffee.csv) and preview its first rows.
# The path is built from the current user's home directory instead of a
# hard-coded absolute path, so the script is not tied to a single account;
# the file is still expected in that user's Downloads folder.
mini4data <- read.csv(file.path(path.expand("~"), "Downloads", "coffee.csv"))
head(mini4data)
## X GEOID county hhincome pct_pov review_count avg_rating
## 1 1 13063040202 Clayton County 33276 0.20134228 57.00000 2
## 2 2 13063040308 Clayton County 28422 0.21071800 13.00000 3
## 3 3 13063040407 Clayton County 49271 0.10825507 29.33333 2
## 4 4 13063040408 Clayton County 44551 0.18095661 20.00000 4
## 5 5 13063040410 Clayton County 49719 0.11468019 41.00000 1
## 6 6 13063040411 Clayton County 57924 0.09068942 18.00000 2
## pop avg_price pct_white hhincome_log review_count_log pct_pov_log yelp_n
## 1 2850 1 0.07508772 10.41289 4.060443 -1.554276 1
## 2 4262 1 0.26067574 10.25527 1.975622 -1.510869 2
## 3 4046 1 0.20514088 10.80529 3.320837 -2.134911 3
## 4 8489 1 0.16868889 10.70461 3.044522 -1.655709 1
## 5 7166 1 0.19369244 10.81434 3.737670 -2.082003 1
## 6 13311 1 0.16512659 10.96706 2.944439 -2.295715 1
# Boxplot of household income by average Yelp rating. avg_rating is binned
# with cut() so that geom_boxplot() draws one box per rating bin.
plot1 <- ggplot(data = mini4data) +
  geom_boxplot(
    mapping = aes(
      x = avg_rating,
      y = hhincome,
      group = cut(avg_rating, breaks = 5)
    ),
    fill = "white",
    color = "black"
  )
# Convert the static ggplot into an interactive plotly widget.
plotly::ggplotly(plot1)
### The medians trace an inverted-U (roughly bell-shaped) pattern across ratings. Neighborhoods whose coffee shop POIs have an average rating of 3 have the highest median household income, whereas those with 1- or 5-star averages have lower incomes. Additionally, the group with a rating of 4 has higher outliers, which might reveal further findings with more analysis.
# Same income-by-rating boxplot as a small multiple: one panel per county,
# with descriptive axis labels.
plot2 <- ggplot(data = mini4data) +
  geom_boxplot(
    mapping = aes(
      x = avg_rating,
      y = hhincome,
      group = cut(avg_rating, breaks = 5)
    ),
    fill = "white",
    color = "black"
  ) +
  facet_wrap(~county) +
  labs(
    x = "Average Yelp Rating",
    y = "Median Annual Household Income ($)"
  )
# Convert the faceted plot into an interactive plotly widget.
plotly::ggplotly(plot2)
### Clayton County appears to have much lower median incomes than the other four counties. Additionally, it has no POIs with a 5-star average rating. This may indicate a relationship between income and the attractiveness of POIs.
# Scatterplot of logged review count against household income, one panel per
# county, with points coloured by the share of residents who identify as white.
plot3 <- ggplot(data = mini4data) +
  geom_point(
    mapping = aes(x = review_count_log, y = hhincome, color = pct_white)
  ) +
  facet_wrap(~county) +
  labs(
    x = "Review Count (log)",
    y = "Median Annual Household Income",
    color = "Percentage of residents\nwho self-identified as white",
    title = "Scatterplot: Review Count vs. Household Income"
  ) +
  # Continuous gradient for pct_white; the legend is drawn as a colour bar.
  scale_color_gradient(low = "darkblue", high = "red")
# NOTE: an earlier version passed guides(color = guide_legend(...)) as an
# argument to theme(). guides() has to be added to the plot with `+`; inside
# theme() it is not a theme element and never reaches the legend, so that call
# was removed. The colour-bar legend is unchanged.
plot3
### Fulton and DeKalb counties have much higher review counts, with a
### larger spread than the other counties. Clayton County has the fewest
### reviews, as well as the lowest median income and the highest proportion
### of residents who do not identify as white. This suggests that race and
### income may both be related to the number of POIs in a given
### neighborhood.
# Reshape to long format: the four neighborhood characteristics are stacked
# into a `category` / `value` pair so they can be faceted in a single plot.
mini4data_long <- mini4data %>%
  pivot_longer(
    cols = c("hhincome", "pct_pov_log", "pct_white", "pop"),
    names_to = "category",
    values_to = "value"
  ) %>%
  mutate(
    value = case_when(
      # Income and population: as.integer() drops any fractional part.
      category %in% c("hhincome", "pop") ~ as.integer(value),
      # Logged poverty rate and white share stay as doubles.
      category %in% c("pct_pov_log", "pct_white") ~ as.numeric(value),
      # Any other category keeps its value untouched.
      .default = value
    )
  )
###Pivoting data frame for easier plotting
# Scatterplots of logged review count against each neighborhood
# characteristic, one free-y panel per characteristic. x and y are shared by
# every layer; colour is mapped only in the point and smoother layers so that
# stat_cor() reports a single Pearson correlation per panel rather than one
# per county.
plot4 <- ggplot(
  data = mini4data_long,
  mapping = aes(x = review_count_log, y = value)
) +
  geom_point(mapping = aes(color = county)) +
  # Per-county linear fits without confidence ribbons.
  geom_smooth(mapping = aes(color = county), method = lm, se = FALSE) +
  ggpubr::stat_cor(method = "pearson") +
  facet_wrap(~category, scales = "free_y") +
  labs(
    x = "Review Count Logged",
    y = "Values",
    color = "County",
    title = "Scatterplot between logged review count & neighborhood characteristics\nUsing Yelp data in Five Counties Around Atlanta, GA"
  )
plot4
## `geom_smooth()` using formula = 'y ~ x'
### The relationships between logged review count and income, poverty,
### and race all appear to be significant. This supports the claim that
### there is a relationship between neighborhood characteristics and POIs.
### How you interpret this data depends on how you define “advantaged”.
### I would argue that the relationship between income and POIs indicates
### that more “advantaged” neighborhoods tend to have a larger number of,
### and higher-quality, POIs than less advantaged neighborhoods.
### Additionally, race seems to be a good indicator of POI count.