Import data

# Load the gas price data set from the shared data folder.
Gas_Prices <- read_csv(file = "../00_data/Gas_Prices.csv")
## Rows: 22360 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): fuel, grade, formulation
## dbl  (1): price
## date (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Gas_Prices
## # A tibble: 22,360 × 5
##    date       fuel     grade   formulation  price
##    <date>     <chr>    <chr>   <chr>        <dbl>
##  1 1990-08-20 gasoline regular all           1.19
##  2 1990-08-20 gasoline regular conventional  1.19
##  3 1990-08-27 gasoline regular all           1.25
##  4 1990-08-27 gasoline regular conventional  1.25
##  5 1990-09-03 gasoline regular all           1.24
##  6 1990-09-03 gasoline regular conventional  1.24
##  7 1990-09-10 gasoline regular all           1.25
##  8 1990-09-10 gasoline regular conventional  1.25
##  9 1990-09-17 gasoline regular all           1.27
## 10 1990-09-17 gasoline regular conventional  1.27
## # ℹ 22,350 more rows

Introduction

Questions

Variation

Visualizing distributions

# Bar chart of gasoline rows by grade. Bars use the raw `price` values
# (identity stat), so each bar is the stacked total of every price recorded
# for that grade.
Gas_Prices %>%
  filter(fuel == "gasoline") %>%
  ggplot(mapping = aes(x = grade, y = price)) +
  geom_col()

# Tabulate how many rows share each distinct price.
count(Gas_Prices, price)
## # A tibble: 3,807 × 2
##    price     n
##    <dbl> <int>
##  1 0.885     1
##  2 0.891     1
##  3 0.899     1
##  4 0.9       1
##  5 0.907     2
##  6 0.908     1
##  7 0.913     3
##  8 0.914     1
##  9 0.919     1
## 10 0.92      1
## # ℹ 3,797 more rows
# Histogram of gasoline prices. `price` is the continuous variable being
# binned, so it is mapped to x; with the default binning stat `binwidth` is
# honoured (with `stat = "identity"` it was ignored and a warning was raised).
Gas_Prices %>%
  filter(fuel == "gasoline") %>%
  ggplot(aes(x = price)) +
  geom_histogram(binwidth = 0.27)

# Histogram of all prices with bins one unit wide. The argument must be
# spelled `binwidth`; a misspelled name is ignored and ggplot2 falls back to
# its default of 30 bins.
ggplot(data = Gas_Prices) +
  geom_histogram(mapping = aes(x = price), binwidth = 1)

# Overlaid frequency polygons of price, one coloured line per grade.
# The data frame is passed once via `data =`; piping it in as well only
# pushed a second, unused copy into ggplot()'s `...`.
ggplot(data = Gas_Prices, mapping = aes(x = price, colour = grade)) +
  geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

Typical values

# Coarse histogram of price (bins 0.8 wide) to show where values cluster.
Gas_Prices %>%
  ggplot(aes(x = price)) +
  geom_histogram(binwidth = 0.8)

Unusual values

# Histogram of price with the y axis zoomed to counts 0-50 so that sparsely
# populated bins become visible. coord_cartesian() only zooms the view; no
# data are dropped.
ggplot(data = Gas_Prices, mapping = aes(x = price)) +
  geom_histogram() +
  coord_cartesian(ylim = c(0, 50))
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

Missing values

# Replace out-of-range prices with NA (rather than dropping the rows), then
# plot price by grade. The cleaned values overwrite `price` so the plot
# actually uses them; previously they were stored in an unused column `y`
# while the raw prices were plotted.
# NOTE(review): the 5 / 200 cut-offs look carried over from another data set;
# the smallest prices in this data are below 1, so confirm the thresholds.
Gas_Prices %>%
  mutate(price = if_else(price < 5 | price > 200, NA_real_, price)) %>%
  ggplot(aes(x = grade, y = price)) +
  geom_point()

Covariation

A categorical and continuous variable

# Box plot of the price distribution within each grade.
ggplot(data = Gas_Prices, mapping = aes(x = grade, y = price)) +
  geom_boxplot()

Two categorical variables

# Heat map of how many rows exist for each fuel/grade combination;
# the tile fill encodes the row count `n` produced by count().
Gas_Prices %>%
  count(fuel, grade) %>%
  ggplot() +
  geom_tile(mapping = aes(x = fuel, y = grade, fill = n))

Two continuous variables

library(hexbin) 

# Average the price for every date/grade/fuel combination, then spread the
# fuel types into their own columns so each date/grade row carries one price
# column per fuel.
Gas_Prices_fuel <- Gas_Prices %>%
  group_by(date, grade, fuel) %>%
  summarize(
    price = mean(price, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  pivot_wider(
    id_cols = c(date, grade),
    names_from = fuel,
    values_from = price
  )

# Hexagonal-bin density plot of gasoline price against diesel price.
ggplot(data = Gas_Prices_fuel, mapping = aes(x = diesel, y = gasoline)) +
  geom_hex()
## Warning: Removed 6111 rows containing non-finite outside the scale range
## (`stat_binhex()`).

# Box plots of diesel price within 0.1-wide bins of gasoline price.
# The bins are taken on the y variable, so the boxes have to run horizontally:
# `orientation = "y"` states that explicitly instead of letting ggplot2 fall
# back to orientation "x" when both axes are continuous. Rows without a
# diesel price are filtered out up front rather than being dropped by the
# stat with a warning.
Gas_Prices_fuel %>%
  filter(gasoline < 200, !is.na(diesel)) %>%
  ggplot(aes(x = diesel, y = gasoline)) +
  geom_boxplot(aes(group = cut_width(gasoline, 0.1)), orientation = "y")

Patterns and models

library(modelr)
# Keep only rows where both prices are strictly positive (this also removes
# rows where either price is NA), since the model takes logs of both.
Gas_Price_Clean <- filter(Gas_Prices_fuel, gasoline > 0, diesel > 0)

# Log-log linear model of diesel price on gasoline price.
mod <- lm(log(diesel) ~ log(gasoline), data = Gas_Price_Clean)

# Attach the model residuals, then exponentiate them: the model is fitted on
# log(diesel), so exp(resid) is the ratio of the observed diesel price to the
# price the model predicts from gasoline.
Gas_price_resid <- modelr::add_residuals(Gas_Price_Clean, mod) %>%
  mutate(resid = exp(resid))

# Scatter plot of the back-transformed residuals against gasoline price.
ggplot(data = Gas_price_resid, mapping = aes(x = gasoline, y = resid)) +
  geom_point()

# Box plot of the back-transformed residuals for each grade.
ggplot(data = Gas_price_resid, mapping = aes(x = grade, y = resid)) +
  geom_boxplot()