The data used in this workshop are adapted from the following:
City of Cambridge Public Safety. “Police Department Crash Data - Updated.” Cambridge Open Data, September 13, 2022. https://data.cambridgema.gov/Public-Safety/Police-Department-Crash-Data-Updated/gb5w-yva3.
National Centers for Environmental Information, National Oceanic and Atmospheric Administration. “Past Weather.” Past Weather | CAMBRIDGE MA | US1MAMD0011, October 3, 2022. https://www.ncei.noaa.gov/access/past-weather/42.55502054693938,-71.76079845408827,42.040922629398835,-70.48652687279844.
library(tidyverse)
library(lubridate)
# Scatter plot of iris petal length (x) against petal width (y).
ggplot(
  data = iris,
  mapping = aes(x = Petal.Length, y = Petal.Width)
) +
  geom_point()
Start with just the data and coordinates
# Data and aesthetic mappings only; no geom layer is added here.
ggplot(
  data = iris,
  mapping = aes(x = Petal.Length, y = Petal.Width)
)
Add in points to show data, along with color and a linear model
# Points only.
ggplot(
  data = iris,
  mapping = aes(x = Petal.Length, y = Petal.Width)
) +
  geom_point()

# Color mapped to Species in the global aes().
ggplot(
  data = iris,
  mapping = aes(x = Petal.Length, y = Petal.Width, color = Species)
) +
  geom_point()

# Same global mapping plus a linear-model smoother; the color mapping is
# declared once in ggplot() and is shared by both layers.
ggplot(
  data = iris,
  mapping = aes(x = Petal.Length, y = Petal.Width, color = Species)
) +
  geom_point() +
  geom_smooth(method = "lm")

# Color mapped only inside geom_point(); the smoother layer gets no color
# mapping of its own.
ggplot(
  data = iris,
  mapping = aes(x = Petal.Length, y = Petal.Width)
) +
  geom_point(mapping = aes(color = Species)) +
  geom_smooth(method = "lm")
Read in our data
# Load the processed crash and weather tables.
crashes <- read_csv("./data/processed/crashes.csv")
weather <- read_csv("./data/processed/weather.csv")

# Keep every crash record and attach the matching weather columns.
# NOTE(review): no `by` is given, so the join keys are whatever column names
# the two tables share (presumably `date` -- confirm against the CSV headers).
crashes_weather <- left_join(crashes, weather)
Let’s check whether the weather condition reported with each crash matches up sensibly with the precipitation recorded for that day
# Distribution of precipitation for each reported weather condition.
ggplot(
  data = crashes_weather,
  mapping = aes(x = weather_condition, y = precip)
) +
  geom_boxplot()
Good enough for practice! We’d want to be more careful if this were an actual analysis.
# Histogram of precipitation across crash records; no `bins` is given, so
# the default binning is used.
ggplot(data = crashes_weather, mapping = aes(x = precip)) +
  geom_histogram()

# Same histogram with 40 bins.
ggplot(data = crashes_weather, mapping = aes(x = precip)) +
  geom_histogram(bins = 40)

# Same histogram with 10 bins.
ggplot(data = crashes_weather, mapping = aes(x = precip)) +
  geom_histogram(bins = 10)

# Same histogram with 20 bins.
ggplot(data = crashes_weather, mapping = aes(x = precip)) +
  geom_histogram(bins = 20)
Color is different from fill
# 20-bin histogram with constant styling: `color` sets the bar outline and
# `fill` sets the bar interior.
ggplot(data = crashes_weather, mapping = aes(x = precip)) +
  geom_histogram(
    bins = 20,
    color = "black",
    fill = "steelblue"
  )
How does this compare to the overall histogram for our weather data set?
# Same styled 20-bin histogram, drawn from the weather table itself rather
# than from the joined crash records.
ggplot(data = weather, mapping = aes(x = precip)) +
  geom_histogram(
    bins = 20,
    color = "black",
    fill = "steelblue"
  )
That actually matches pretty closely - there are just more days with less precipitation. Let’s try looking at number of records based on reported weather condition.
# Number of crash records for each reported weather condition.
ggplot(data = crashes_weather, mapping = aes(x = weather_condition)) +
  geom_bar()
# Number of crash records for each reported weather condition, with the
# x-axis labels rotated so long condition names do not overlap each other.
ggplot(crashes_weather, aes(x = weather_condition)) +
  geom_bar() +
  theme(
    # hjust = 1 right-aligns each rotated label so that its end sits at the
    # tick mark; with the default centered justification the rotated text
    # is misaligned with its tick and crowds the axis.
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
Let’s try something a little advanced. We’ll make a data set to see whether there was precipitation on a given day, and then compare crashes per day for days with precipitation and without precipitation.
# Count crash records per day. Grouping by both `date` and `precip` carries
# the precipitation value through the summary alongside the date.
crashes_per_day <- crashes_weather %>%
  group_by(date, precip) %>%
  # Drop the grouping explicitly: by default summarize() would peel off only
  # the last grouping variable, leaving the result grouped by `date` and
  # emitting a message about it.
  summarize(crashes = n(), .groups = "drop") %>%
  mutate(
    # TRUE when any precipitation was recorded for the row.
    was_precip = precip > 0,
    year = year(date)
  )

# Preview the first rows as a formatted table.
knitr::kable(head(crashes_per_day))
# Crashes per day, split by whether there was precipitation: box plot.
ggplot(
  data = crashes_per_day,
  mapping = aes(x = was_precip, y = crashes)
) +
  geom_boxplot()

# Same comparison as a violin plot.
ggplot(
  data = crashes_per_day,
  mapping = aes(x = was_precip, y = crashes)
) +
  geom_violin()

# Violin plot with the individual days overlaid as semi-transparent,
# jittered points.
ggplot(
  data = crashes_per_day,
  mapping = aes(x = was_precip, y = crashes)
) +
  geom_violin() +
  geom_jitter(alpha = 0.2)
# Crashes per day against the amount of precipitation.
ggplot(
  data = crashes_per_day,
  mapping = aes(x = precip, y = crashes)
) +
  geom_point()

# Points first, then a smoother added on top.
ggplot(
  data = crashes_per_day,
  mapping = aes(x = precip, y = crashes)
) +
  geom_point() +
  geom_smooth()

# Same two layers added in the opposite order: smoother first, then points.
ggplot(
  data = crashes_per_day,
  mapping = aes(x = precip, y = crashes)
) +
  geom_smooth() +
  geom_point()
# Violin plot of crashes per day by precipitation status, with jittered
# semi-transparent points for the individual days.
ggplot(
  data = crashes_per_day,
  mapping = aes(x = was_precip, y = crashes)
) +
  geom_violin() +
  geom_jitter(alpha = 0.2)
This applies the color to everything: aesthetics set in the global ggplot() call carry over to every geom_* layer
# Color is mapped to was_precip in the global aes(), so it is declared once
# for the whole plot rather than per layer.
ggplot(
  data = crashes_per_day,
  mapping = aes(x = was_precip, y = crashes, color = was_precip)
) +
  geom_violin() +
  geom_jitter(alpha = 0.2)
If we want to color the violins based on was_precip:
# NOTE(review): `was_precip` is passed to `color` outside of aes(), so it is
# looked up as an ordinary R object rather than as a column of
# crashes_per_day. No such object is defined in the visible code, so this
# call presumably fails with an "object 'was_precip' not found" error. It
# appears to be kept as a deliberate contrast with the aes() version that
# follows -- confirm before changing.
ggplot(crashes_per_day, aes(x = was_precip, y = crashes)) +
geom_violin(color = was_precip) +
geom_jitter(alpha = 0.2)
# Color is mapped to was_precip inside geom_violin() only, so the mapping
# is declared for the violin layer and not for the jitter layer.
ggplot(
  data = crashes_per_day,
  mapping = aes(x = was_precip, y = crashes)
) +
  geom_violin(mapping = aes(color = was_precip)) +
  geom_jitter(alpha = 0.2)
This sets a fixed color that is not defined by the data (compare the fixed alpha in the geom_jitter)
# Constant outline color for the violins, set outside aes() so it does not
# depend on the data.
ggplot(
  data = crashes_per_day,
  mapping = aes(x = was_precip, y = crashes)
) +
  geom_violin(color = "orange") +
  geom_jitter(alpha = 0.2)
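This creates a color mapping based on one value, the string “orange.” Because it is inside aes(), ggplot treats “orange” as a data category and assigns it a color from its default palette, so it doesn’t involve the color orange at all.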
# Here the string "orange" is placed inside aes(), so it is mapped as a
# value through the color scale instead of being used as a literal color.
ggplot(
  data = crashes_per_day,
  mapping = aes(x = was_precip, y = crashes)
) +
  geom_violin(mapping = aes(color = "orange")) +
  geom_jitter(alpha = 0.2)
Colors can also be continuous rather than categorical
# Color mapped to the crash count itself in the global aes().
ggplot(
  data = crashes_per_day,
  mapping = aes(x = was_precip, y = crashes, color = crashes)
) +
  geom_violin() +
  geom_jitter(alpha = 0.2)
# Color mapped to the amount of precipitation, with axis labels, title,
# subtitle, caption, and legend title set through labs().
ggplot(
  data = crashes_per_day,
  mapping = aes(x = was_precip, y = crashes, color = precip)
) +
  geom_violin() +
  geom_jitter(alpha = 0.5) +
  labs(
    title = "Vehicle Crashes and Precipitation",
    subtitle = "In Cambridge, MA",
    caption = "Data from NOAA and City of Cambridge",
    x = "Was there Precipitation?",
    y = "Number of Crashes",
    color = "Amount of Precipitation"
  )
# Labelled plot with a custom continuous color scale running from yellow
# (low) to blue (high).
ggplot(
  data = crashes_per_day,
  mapping = aes(x = was_precip, y = crashes, color = precip)
) +
  geom_violin() +
  geom_jitter(alpha = 0.5) +
  labs(
    title = "Vehicle Crashes and Precipitation",
    subtitle = "In Cambridge, MA",
    caption = "Data from NOAA and City of Cambridge",
    x = "Was there Precipitation?",
    y = "Number of Crashes",
    color = "Amount of Precipitation"
  ) +
  scale_color_continuous(low = "yellow", high = "blue")
# Labelled plot with the yellow-to-blue color scale, drawn with theme_dark().
ggplot(
  data = crashes_per_day,
  mapping = aes(x = was_precip, y = crashes, color = precip)
) +
  geom_violin() +
  geom_jitter(alpha = 0.5) +
  labs(
    title = "Vehicle Crashes and Precipitation",
    subtitle = "In Cambridge, MA",
    caption = "Data from NOAA and City of Cambridge",
    x = "Was there Precipitation?",
    y = "Number of Crashes",
    color = "Amount of Precipitation"
  ) +
  scale_color_continuous(low = "yellow", high = "blue") +
  theme_dark()
# Labelled plot with the yellow-to-blue color scale, drawn with theme_bw().
ggplot(
  data = crashes_per_day,
  mapping = aes(x = was_precip, y = crashes, color = precip)
) +
  geom_violin() +
  geom_jitter(alpha = 0.5) +
  labs(
    title = "Vehicle Crashes and Precipitation",
    subtitle = "In Cambridge, MA",
    caption = "Data from NOAA and City of Cambridge",
    x = "Was there Precipitation?",
    y = "Number of Crashes",
    color = "Amount of Precipitation"
  ) +
  scale_color_continuous(low = "yellow", high = "blue") +
  theme_bw()
# Labelled plot with a blue-to-red color scale and theme_bw(), split into
# one panel per year.
ggplot(
  data = crashes_per_day,
  mapping = aes(x = was_precip, y = crashes, color = precip)
) +
  geom_violin() +
  geom_jitter(alpha = 0.5) +
  labs(
    title = "Vehicle Crashes and Precipitation",
    subtitle = "In Cambridge, MA",
    caption = "Data from NOAA and City of Cambridge",
    x = "Was there Precipitation?",
    y = "Number of Crashes",
    color = "Amount of Precipitation"
  ) +
  scale_color_continuous(low = "blue", high = "red") +
  theme_bw() +
  facet_wrap(~year)