Visualizations

The following is my re-creation of the plots.

Setup

Not going to show results for the libraries being loaded here for the graphs.

# install.packages('patchwork')
# install.packages('ggcorrplot')
# install.packages('ggpubr')

## load libraries
library(ggplot2)
library(patchwork)
library(ggcorrplot)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyverse) # for pivot_longer

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ✔ readr     2.1.4

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggpubr) # for stat_cor

Plot 1.

Original graph note: Variables used - avg_rating, hhincome

The following code chunk uses the ggplot2 library.

# Box plot of hhincome (y) for each avg_rating level (x).
# avg_rating is converted to a factor so every rating value gets its own box.
coffee_df %>%
  ggplot(aes(x = factor(avg_rating), y = hhincome)) +
  labs(x = "avg_rating", y = "hhincome", title = "") +
  geom_boxplot()

Plot 2.

Original graph note: Variables used - avg_rating, hhincome, county

At first, I tried declaring all the county names in a vector and iterating over it to build a separate plot for each county. I found that this wasn’t necessary.

# # List of counties was not needed
# counties <- c("Clayton County", "Cobb County", "Dekalb County", "Fulton County", "Gwinnett County")


###### couldn't get this version to work

# # Function to create a box plot for a specific county
# create_county_box_plot <- function(data, county_name) {
#   ggplot(data, aes(x = factor(avg_rating), y = hhincome)) +
#     geom_boxplot() +
#     facet_wrap(. ~ county, scales = "free_x", ncol = 5) #+
#    # labs(title = paste("Box Plots for", county_name))
# }
# 
# # Create and arrange box plots for each county
# plots <- lapply(counties, function(county) {
#   county_data <- subset(coffee_df, county == county)
#   create_county_box_plot(county_data, county)
# })
# 
# # Arrange plots in two rows
# arranged_plots <- wrap_plots(plots, ncol = 3)
# 
# # Set common labels
# arranged_plots +
#   labs(x = "Average Yelp Rating", y = "Median Annual Household Income ($)")

The data set only includes the 5 counties shown in the patchwork, and they are automatically sorted in alphabetical order. Since I hadn’t changed the labs declarations and this time I’ve faceted the plot by county with fixed scales (the same axes in every panel), I didn’t need to do anything else.

# Box plots of hhincome by avg_rating, drawn in one panel per county.
# scales = "fixed" gives every panel the same x and y axes; each panel's
# strip heading is the county name taken from the data set.
coffee_df %>%
  ggplot(aes(x = factor(avg_rating), y = hhincome)) +
  facet_wrap(~county, scales = "fixed", ncol = 3) +
  labs(
    x = "Average Yelp Rating",
    y = "Median Annual Household Income ($)",
    title = ""
  ) +
  geom_boxplot()

Plot 3.

Original graph note: Variables used - review_count_log, hhincome, county, pct_white

Using the ggplot2 library to draw a scatterplot for each county, with the points colored by pct_white.

# Scatter plot of hhincome against review_count_log, one panel per county,
# with points colored on a blue-to-red gradient by pct_white.
coffee_df %>%
  ggplot(aes(x = review_count_log, y = hhincome, color = pct_white)) +
  geom_point(alpha = 0.75) + # alpha is opacity
  # facet_wrap() scales default to "fixed", so all panels share the same axes
  facet_wrap(~county) +
  # facet_grid(. ~ county, cols = 3) +
  scale_color_gradient(
    name = "Proportion of residents \n who self-identified as white",
    low = "#1802FE",
    high = "#FE0108"
  ) +
  labs(
    title = "Scatterplot: Review Count vs. Household Income",
    x = "Review Count (log)",
    y = "Median Annual Household Income"
  ) +
  # theme_bw() must come before theme(): a complete theme resets earlier tweaks
  theme_bw() +
  theme(
    # base text size, then the per-element overrides
    text = element_text(size = 8),
    plot.title = element_text(size = 8),
    plot.subtitle = element_text(size = 8),
    axis.title = element_text(size = 8),
    axis.text = element_text(size = 8),
    legend.title = element_text(size = 6),
    legend.text = element_text(size = 6),
    # keep the color bar compact relative to the plot area
    legend.key.width = unit(0.015, "npc"),
    legend.key.height = unit(0.03, "npc")
  )

#####
#coffee_df %>% pivot_wider() ## make wider later

Plot 4.

Original graph note: Variables used - pct_pov_log, hhincome, pct_white, race.tot, review_count_log, county

## Hint: I used pivot_longer() to create Plot 4.
## See Scatter Plots section here: https://ujhwang.github.io/UrbanAnalytics2023/Lab/module_3/1_GTFS_v2.html
  ### also see Merge frequency by stop information with ACS data section

# Facet strip headings for Plot 4: names are the data set columns that get
# pivoted into `variable`, values are the headings shown above each panel.
plot4_labels <- c(
  hhincome = "Median Annual Household Income ($)",
  pct_pov_log = "Percent Residents Under Poverty",
  pct_white = "Percent White Resident",
  race.tot = "Total Population"
)

# Labeller that looks each facet value up in plot4_labels
set_labeller <- plot4_labels %>% as_labeller()


# Plot 4: scatter plots of review_count_log against four neighborhood
# characteristics, one panel per characteristic, colored by county.
#
# The four columns are stacked into long format (`variable` / `value`) so a
# single ggplot call can facet on `variable`; each panel gets its own y scale
# because the characteristics are measured in different units.
coffee_df %>%
  pivot_longer(
    cols = c("hhincome", "pct_pov_log", "pct_white", "race.tot"),
    names_to = "variable",
    values_to = "value"
  ) %>%
  ggplot(aes(x = review_count_log, y = value, color = county)) +
  facet_wrap(~variable, scales = "free_y", labeller = set_labeller) +
  geom_point(alpha = 1, stroke = 0, size = 1.25) +
  # `linewidth` replaces the `size` aesthetic for lines (deprecated in
  # ggplot2 3.4.0); the explicit formula is the default geom_smooth() would
  # pick for method = "lm" and avoids the informational message.
  geom_smooth(
    method = "lm",
    formula = y ~ x,
    se = FALSE,
    fullrange = TRUE,
    linewidth = 0.65
  ) +
  # Pearson correlation labels. `inherit.aes` is a layer argument, not an
  # aesthetic, so it is passed directly instead of inside aes().
  stat_cor(
    inherit.aes = TRUE,
    method = "pearson",
    label.x.npc = "left",
    label.y.npc = "top",
    color = "black",
    size = 2,
    digits = 2,
    show.legend = FALSE
  ) +
  # Earlier stat_cor() experiments, kept for reference:
  # stat_cor(p.accuracy = 0.001, r.accuracy = 0.001) +
  # stat_cor(method = "pearson", label.x = -5, label.y = 30) +
  # cor.test()
  labs(
    x = "Review Count Logged",
    y = "Values",
    title = "Scatterplot between logged review count & neighborhood characteristics",
    subtitle = "Using Yelp data in Five Counties Around Atlanta, GA",
    color = "County"
  ) +
  # theme_bw() must come before theme(): a complete theme resets earlier tweaks
  theme_bw() +
  theme(
    legend.text = element_text(size = 6),
    axis.text = element_text(size = 8),
    plot.title = element_text(size = 10),
    plot.subtitle = element_text(size = 8),
    axis.title = element_text(size = 8),
    legend.title = element_text(size = 8),
    text = element_text(size = 8) # gray box headings for plots
  )

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning in stat_cor(mapping = aes(inherit.aes = TRUE), method = "pearson", :
## Ignoring unknown aesthetics: inherit.aes

## `geom_smooth()` using formula = 'y ~ x'

# ############ Original plan to create 4 different plots and join them using facet_wrap had many complications and was unsuccessful

# # Median annual hhincome Q4
# hhincome_plot <- ggplot(coffee_df, aes(x=review_count_log, y=hhincome, color=county)) +
#   geom_point() +
#   #stat_cor(method = "pearson", label.x = -5, label.y = 30) +
#   geom_smooth(method=lm, se=FALSE, fullrange=TRUE) +
#   theme_bw() +
#   theme(axis.title.x=element_blank(), # remove x title
#         #axis.title.y=element_blank(),
#         axis.text.x=element_blank(), # remove labels from x
#         axis.ticks.x=element_blank()) # remove tick marks
# #  facet_wrap(plot4labels.ai, labeller=set_labeller)
# 
# # Print plot
# hhincome_plot # + theme_bw()

Mini-Assignment-4

Jalisa Smith

2023-10-10

About the tidied data set