This document contains recreations of plots that visualize combined and tidied Yelp data and census data about coffee shops, as closely as possible to how they were observed in the source HTML webpage.
You can view the original plots and access the tidied data set here.
The following is my re-creation of the plots.
Not going to show results for the libraries being loaded here for the graphs.
# install.packages('patchwork')
# install.packages('ggcorrplot')
# install.packages('ggpubr')
## load libraries
library(ggplot2)
library(patchwork)
library(ggcorrplot)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse) # for pivot_longer
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggpubr) # for stat_cor
Original graph note: Variables used - avg_rating, hhincome
The following code chunk uses the ggplot2 library.
# Plot 1: distribution of household income for each average Yelp rating.
# avg_rating is wrapped in factor() so every distinct rating gets its own box
# on a discrete x axis.
coffee_df %>%
  ggplot(mapping = aes(x = factor(avg_rating), y = hhincome)) +
  geom_boxplot() +
  labs(
    title = "",
    x = "avg_rating",
    y = "hhincome"
  )
Original graph note: Variables used - avg_rating, hhincome, county
At first, I tried declaring all the county names and pointing to each item in the array to iterate the plot for each. I found that this wasn’t necessary.
# # List of counties was not needed
# counties <- c("Clayton County", "Cobb County", "Dekalb County", "Fulton County", "Gwinnett County")
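###### couldn't get this version to work: inside subset(), `county == county` compares the county column with itself (always TRUE), so every plot received the full data set instead of a single county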
# # Function to create a box plot for a specific county
# create_county_box_plot <- function(data, county_name) {
# ggplot(data, aes(x = factor(avg_rating), y = hhincome)) +
# geom_boxplot() +
# facet_wrap(. ~ county, scales = "free_x", ncol = 5) #+
# # labs(title = paste("Box Plots for", county_name))
# }
#
# # Create and arrange box plots for each county
# plots <- lapply(counties, function(county) {
# county_data <- subset(coffee_df, county == county)
# create_county_box_plot(county_data, county)
# })
#
# # Arrange plots in two rows
# arranged_plots <- wrap_plots(plots, ncol = 3)
#
# # Set common labels
# arranged_plots +
# labs(x = "Average Yelp Rating", y = "Median Annual Household Income ($)")
The data set only includes the 5 counties shown in the patchwork, and
they are automatically sorted in alphabetical order. Since I hadn’t
changed the labs declarations but this time I’ve set
scales per county, I didn’t need to do anything else.
# Plot 2: the income-by-rating box plot from Plot 1, split into one panel per
# county. The panel headings are the county values found in the data set.
coffee_df %>%
  ggplot(mapping = aes(x = factor(avg_rating), y = hhincome)) +
  geom_boxplot() +
  # scales = "fixed" keeps identical x and y axes across all county panels;
  # ncol = 3 lays the panels out three per row.
  facet_wrap(. ~ county, scales = "fixed", ncol = 3) +
  labs(
    title = "",
    x = "Average Yelp Rating",
    y = "Median Annual Household Income ($)"
  )
Original graph note: Variables used - review_count_log, hhincome, county, pct_white
Although the ggcorrplot library is loaded above, the following code chunk
uses only ggplot2: it draws a scatterplot faceted by county and colors the points by pct_white.
# Plot 3: scatterplot of logged review count against household income, one
# panel per county, with each point shaded by pct_white.
coffee_df %>%
  ggplot(mapping = aes(x = review_count_log, y = hhincome, color = pct_white)) +
  geom_point(alpha = 0.75) + # alpha controls point opacity
  # No `scales` argument is supplied, so facet_wrap() falls back to its default
  # and every panel shares the same x and y axes.
  facet_wrap(. ~ county) +
  # facet_grid(. ~ county, cols = 3) +
  scale_color_gradient(
    name = "Proportion of residents \n who self-identified as white",
    low = "#1802FE",
    high = "#FE0108"
  ) +
  labs(
    title = "Scatterplot: Review Count vs. Household Income",
    x = "Review Count (log)",
    y = "Median Annual Household Income"
  ) +
  theme_bw() +
  # Shrink all text and the colour-bar key so the legend fits beside the panels.
  theme(
    text = element_text(size = 8),
    plot.title = element_text(size = 8),
    plot.subtitle = element_text(size = 8),
    axis.title = element_text(size = 8),
    axis.text = element_text(size = 8),
    legend.title = element_text(size = 6),
    legend.text = element_text(size = 6),
    legend.key.width = unit(0.015, "npc"),
    legend.key.height = unit(0.03, "npc")
  )
#####
#coffee_df %>% pivot_wider() ## make wider later
Original graph note: Variables used - pct_pov_log, hhincome, pct_white, race.tot, review_count_log, county
## Hint: I used pivot_longer() to create Plot 4.
## See Scatter Plots section here: https://ujhwang.github.io/UrbanAnalytics2023/Lab/module_3/1_GTFS_v2.html
### also see Merge frequency by stop information with ACS data section

# Plot 4: logged review count against four neighborhood characteristics.
# The four columns are stacked into long format so a single facet_wrap() call
# can draw one panel per characteristic.

# Maps each pivoted column name to the heading shown in its facet strip.
# The names of this vector also define which columns get pivoted below, so the
# two lists cannot drift apart.
plot4_labels <- c(
  "hhincome" = "Median Annual Household Income ($)",
  "pct_pov_log" = "Percent Residents Under Poverty",
  "pct_white" = "Percent White Resident",
  "race.tot" = "Total Population"
)

# Heading labeller function for plot4_labels
set_labeller <- as_labeller(plot4_labels)

coffee_df %>%
  pivot_longer(
    cols = all_of(names(plot4_labels)),
    names_to = "variable",
    values_to = "value"
  ) %>%
  ggplot(aes(x = review_count_log, y = value, color = county)) +
  # free_y: each characteristic is on a different scale, so every panel gets
  # its own y axis while the x axis stays shared.
  facet_wrap(~variable, scales = "free_y", labeller = set_labeller) +
  geom_point(alpha = 1, stroke = 0, size = 1.25) +
  # `linewidth` replaces `size` for line thickness (ggplot2 >= 3.4.0); using
  # `size` here triggered a deprecation warning.
  geom_smooth(method = "lm", se = FALSE, fullrange = TRUE, linewidth = 0.65) +
  # inherit.aes is an argument of stat_cor(), not an aesthetic: wrapping it in
  # aes() only produced an "unknown aesthetics" warning. Passed directly, it
  # keeps the x / y / color mappings from ggplot() so the correlation labels
  # are still computed from the plotted data.
  stat_cor(
    inherit.aes = TRUE,
    method = "pearson",
    label.x.npc = "left",
    label.y.npc = "top",
    color = "black",
    size = 2,
    digits = 2,
    show.legend = FALSE
  ) +
  # stat_cor(p.accuracy = 0.001, r.accuracy = 0.001) +
  # stat_cor(method = "pearson", label.x = -5, label.y = 30) +
  # cor.test()
  labs(
    x = "Review Count Logged",
    y = "Values",
    title = "Scatterplot between logged review count & neighborhood characteristics",
    subtitle = "Using Yelp data in Five Counties Around Atlanta, GA",
    color = "County"
  ) +
  theme_bw() +
  theme(
    legend.text = element_text(size = 6),
    axis.text = element_text(size = 8),
    plot.title = element_text(size = 10),
    plot.subtitle = element_text(size = 8),
    axis.title = element_text(size = 8),
    legend.title = element_text(size = 8),
    text = element_text(size = 8) # gray box headings for plots
  )
## `geom_smooth()` using formula = 'y ~ x'
# ############ Original plan to create 4 different plots and join them using facet_wrap had many complications and was unsuccessful
# # Median annual hhincome Q4
# hhincome_plot <- ggplot(coffee_df, aes(x=review_count_log, y=hhincome, color=county)) +
# geom_point() +
# #stat_cor(method = "pearson", label.x = -5, label.y = 30) +
# geom_smooth(method=lm, se=FALSE, fullrange=TRUE) +
# theme_bw() +
# theme(axis.title.x=element_blank(), # remove x title
# #axis.title.y=element_blank(),
# axis.text.x=element_blank(), # remove labels from x
# axis.ticks.x=element_blank()) # remove tick marks
# # facet_wrap(plot4labels.ai, labeller=set_labeller)
#
# # Print plot
# hhincome_plot # + theme_bw()