Import data
# Read the fuel price data set from the shared data folder.
Gas_Prices <- read_csv(file = "../00_data/Gas_Prices.csv")
## Rows: 22360 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): fuel, grade, formulation
## dbl (1): price
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the tibble to preview its columns and first rows.
Gas_Prices
## # A tibble: 22,360 × 5
## date fuel grade formulation price
## <date> <chr> <chr> <chr> <dbl>
## 1 1990-08-20 gasoline regular all 1.19
## 2 1990-08-20 gasoline regular conventional 1.19
## 3 1990-08-27 gasoline regular all 1.25
## 4 1990-08-27 gasoline regular conventional 1.25
## 5 1990-09-03 gasoline regular all 1.24
## 6 1990-09-03 gasoline regular conventional 1.24
## 7 1990-09-10 gasoline regular all 1.25
## 8 1990-09-10 gasoline regular conventional 1.25
## 9 1990-09-17 gasoline regular all 1.27
## 10 1990-09-17 gasoline regular conventional 1.27
## # ℹ 22,350 more rows
Introduction
Questions
Variation
Visualizing distributions
# Bar chart of gasoline prices by grade. Every observation contributes its
# own bar segment, so with the default stacking each bar's height is the sum
# of all prices recorded for that grade.
Gas_Prices %>%
  filter(fuel == "gasoline") %>%
  ggplot(mapping = aes(x = grade, y = price)) +
  geom_col()

# Tally how many observations share each distinct price.
count(Gas_Prices, price)
## # A tibble: 3,807 × 2
## price n
## <dbl> <int>
## 1 0.885 1
## 2 0.891 1
## 3 0.899 1
## 4 0.9 1
## 5 0.907 2
## 6 0.908 1
## 7 0.913 3
## 8 0.914 1
## 9 0.919 1
## 10 0.92 1
## # ℹ 3,797 more rows
# Histogram of gasoline prices, coloured by grade. A histogram bins a
# continuous variable, so price is mapped to x and the default binning stat
# is used; stat = "identity" would bypass binning and ignore `binwidth`.
Gas_Prices %>%
  filter(fuel == "gasoline") %>%
  ggplot(aes(x = price, fill = grade)) +
  geom_histogram(binwidth = 0.27)

# Distribution of all recorded prices, using bins one price unit wide.
# The argument must be spelled `binwidth`; a misspelled name is ignored and
# ggplot2 silently falls back to 30 bins.
ggplot(data = Gas_Prices) +
  geom_histogram(mapping = aes(x = price), binwidth = 1)

# Overlaid frequency polygons of price, one line per grade. The data frame is
# supplied once, through the pipe; also naming it via `data =` would push the
# piped copy into ggplot()'s unused `...`.
Gas_Prices %>%
  ggplot(mapping = aes(x = price, colour = grade)) +
  geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

Typical values
# Coarse histogram of price (bins 0.8 wide) to show where values cluster.
Gas_Prices %>%
  ggplot(aes(x = price)) +
  geom_histogram(binwidth = 0.8)

Unusual values
# Zoom the y axis to counts of 0-50 so sparsely populated price bins become
# visible; coord_cartesian() zooms without discarding any data.
ggplot(data = Gas_Prices, mapping = aes(x = price)) +
  geom_histogram() +
  coord_cartesian(ylim = c(0, 50))
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

Missing Values
# Replace out-of-range prices with NA instead of dropping the rows, then plot
# the cleaned prices against grade.
# NOTE(review): the 5 and 200 cut-offs look carried over from another data
# set -- confirm they are the intended bounds for fuel prices.
Gas_Prices %>%
  # Overwrite `price` so the NA replacement is what actually gets plotted.
  mutate(price = ifelse(price < 5 | price > 200, NA, price)) %>%
  ggplot(aes(x = grade, y = price)) +
  # na.rm = TRUE drops the NA rows without emitting a warning.
  geom_point(na.rm = TRUE)

Covariation
A categorical and continuous variable
# Compare the price distribution across grades with one box per grade.
ggplot(data = Gas_Prices, mapping = aes(x = grade, y = price)) +
  geom_boxplot()

Two categorical variables
# Heat map of how many observations exist for each fuel/grade combination.
count(Gas_Prices, fuel, grade) %>%
  ggplot(mapping = aes(x = fuel, y = grade, fill = n)) +
  geom_tile()

Two continuous variables
library(hexbin)
# Average the price for every date/grade/fuel combination (collapsing the
# formulation column), then spread the fuel types into their own columns so
# each row holds the prices of the different fuels side by side.
Gas_Prices_fuel <- Gas_Prices %>%
  group_by(date, grade, fuel) %>%
  summarise(
    price = mean(price, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  pivot_wider(
    names_from = fuel,
    values_from = price
  )
# Hexagonal-bin density of gasoline price against diesel price.
ggplot(data = Gas_Prices_fuel, mapping = aes(x = diesel, y = gasoline)) +
  geom_hex()
## Warning: Removed 6111 rows containing non-finite outside the scale range
## (`stat_binhex()`).

# Box plots of gasoline price within 0.1-wide bins of diesel price. The bins
# are cut on the x variable (diesel) so each box summarises the gasoline
# prices for a narrow diesel range, and the orientation is stated explicitly
# because both axes are continuous.
# NOTE(review): the `gasoline < 200` bound looks carried over from another
# data set; in practice it mainly drops rows where gasoline is NA -- confirm.
Gas_Prices_fuel %>%
  filter(gasoline < 200) %>%
  ggplot(aes(x = diesel, y = gasoline)) +
  geom_boxplot(aes(group = cut_width(diesel, 0.1)), orientation = "x")
## Warning: Removed 5055 rows containing missing values or values outside the scale range
## (`stat_boxplot()`).

Patterns and models
library(modelr)
# Keep only rows where both fuels have a strictly positive price so that the
# log transforms below are defined (rows with a missing price fail the
# comparison and are dropped as well).
Gas_Price_Clean <- filter(Gas_Prices_fuel, gasoline > 0, diesel > 0)

# Log-log linear model of diesel price on gasoline price.
mod <- lm(log(diesel) ~ log(gasoline), data = Gas_Price_Clean)

# Attach the model residuals and exponentiate them to undo the log scale.
Gas_price_resid <- Gas_Price_Clean %>%
  modelr::add_residuals(mod) %>%
  mutate(resid = exp(resid))
# Back-transformed residuals against gasoline price.
ggplot(data = Gas_price_resid, mapping = aes(x = gasoline, y = resid)) +
  geom_point()

# Distribution of the back-transformed residuals for each grade.
ggplot(data = Gas_price_resid, mapping = aes(x = grade, y = resid)) +
  geom_boxplot()
