Data Visualization

Description

Some ways of visualizing and plotting datasets in R

library(sf)
library(tmap)
library(tidyverse)
library(dplyr)
library(ggpubr)
library(ggplot2)
library(tidycensus)
library(tigris)
library(leaflet)
library(here)

# Read the coffee dataset from "coffee.csv" (resolved against the current
# working directory) into a data frame, then print per-column summary
# statistics as a quick sanity check of what was loaded.
data <- read.csv(file = "coffee.csv")
summary(object = data)

Plot 1

# Boxplot of household income (hhincome) for each average-rating value.
#
# avg_rating is wrapped in factor() so that ggplot2 draws one box per rating
# level. A numeric x with no grouping is collapsed into a single box and
# ggplot2 warns "Continuous x aesthetic -- did you forget aes(group=...)?".
# factor() is a no-op for plotting purposes if the column is already discrete.
# NOTE(review): assumes avg_rating takes a small set of discrete values
# (e.g. 1-5) -- confirm against coffee.csv.
bxplot <- ggplot(data = data) +
  geom_boxplot(
    aes(x = factor(avg_rating), y = hhincome),
    color = "black",
    fill = "white"
  ) +
  # Keep the axis title the plot had before the factor() wrapper was added.
  labs(x = "avg_rating")

# Convert the static ggplot into an interactive plotly widget.
plotly::ggplotly(bxplot)

This box plot shows the relationship between median household income and the average ratings of businesses. Areas with higher median household income tend to have businesses with better ratings.

Plot 2

# Boxplots of household income (hhincome) by average Yelp rating, drawn as
# one panel per county.
#
# Changes relative to the earlier version of this chunk:
#   * avg_rating is wrapped in factor() so each rating level gets its own box;
#     a numeric x with no grouping yields a single box per panel plus a
#     "Continuous x aesthetic" warning from ggplot2.
#   * The title now describes this plot (it previously read
#     "Scatterplot: Review Count vs. Household Income").
#   * scale_fill_brewer() and the `color` legend label were dropped: no fill
#     or colour aesthetic is mapped here, so neither had any visible effect.
# NOTE(review): assumes avg_rating takes a small set of discrete values
# (e.g. 1-5) -- confirm against coffee.csv.
bxplot2 <- ggplot(data = data) +
  geom_boxplot(aes(x = factor(avg_rating), y = hhincome), color = "black") +
  facet_wrap(~county) +
  labs(
    x = "Average Yelp Rating",
    y = "Median Annual Household Income ($)",
    title = "Boxplot: Average Yelp Rating vs. Household Income"
  )

# Convert the static ggplot into an interactive plotly widget.
plotly::ggplotly(bxplot2)

This box plot describes the relationship between average ratings and household incomes for five different counties in Georgia. Cobb, DeKalb, and Fulton have more areas with high median annual household incomes and also have more businesses with better ratings. Clayton County has areas with low median annual household incomes and businesses with lower ratings, between 1 and 3.

Plot 3

# Scatter plot of logged review count against household income, one panel per
# county, with each point coloured by pct_white on a blue-to-orange gradient.
# The y-axis label spelling was corrected ("Houshold" -> "Household").
ggplot(data, aes(x = review_count_log, y = hhincome, colour = pct_white)) +
  facet_wrap(~county) +
  # Semi-transparent points so overlapping observations remain visible.
  geom_point(size = 2, alpha = 0.7) +
  theme_minimal() +
  scale_color_gradient(low = "#0091ff", high = "#f0650e") +
  labs(
    x = "Review Count (log)",
    y = "Median Annual Household Income",
    color = "Proportion of residents\nwho self-identified as white",
    title = "Scatterplot: Review Count vs. Household Income"
  )

The scatter plot shows the relationship between median annual household income and (logged) review counts for the different counties, and also shows the racial composition of the area where each coffee shop is located. It shows that areas with a higher proportion of white residents have higher median annual household income and also have more businesses with higher review counts.

Plot 4

# Reshape for faceting: keep the four neighbourhood variables plus the logged
# review count and county, then stack the four variables into long format
# ("variable" holds the column name, "value" its value) so that each one can
# be drawn in its own panel.
data1 <- select(
  data,
  pct_pov_log, hhincome, pct_white, race.tot, review_count_log, county
)
data2 <- data1 %>%
  pivot_longer(
    cols = pct_pov_log:race.tot,
    names_to = "variable",
    values_to = "value"
  )

# Human-readable panel titles, keyed by the column names stored in `variable`.
plot_names <- c(
  "hhincome" = "Median Annual Household Income ($)",
  "pct_pov_log" = "Percent Residents Under Poverty",
  "pct_white" = "Percent White Resident",
  "race.tot" = "Total Population"
)

# One scatter panel per variable against logged review count. Points and
# linear fit lines are coloured by county; stat_cor() (ggpubr) annotates each
# panel with the Pearson correlation. Scales are free because the four
# variables are on very different ranges.
ggplot(data2, aes(x = review_count_log, y = value)) +
  geom_point(aes(color = county), size = 0.7) +
  geom_smooth(aes(color = county), method = "lm", se = FALSE) +
  stat_cor(method = "pearson") +
  facet_wrap(
    ~variable,
    scales = "free",
    labeller = as_labeller(plot_names)
  ) +
  labs(
    x = "Review Count Logged",
    y = "Values",
    color = "County",
    title = "Scatterplot between logged review count & neighborhood characteristics",
    subtitle = "Using Yelp data in Five Counties Around Atlanta, GA"
  )

## `geom_smooth()` using formula = 'y ~ x'

These scatter plots show the correlation between logged review counts and four neighborhood characteristics (median household income, logged percentage of residents under poverty, percentage of white residents, and total population) for the different counties.