Electric_Vehicle_Population

Author

Zijin Wang

Introduction

This dataset shows the Battery Electric Vehicles (BEVs) and Plug-in Hybrid Electric Vehicles (PHEVs) that are currently registered through Washington State Department of Licensing(DOL)

The data was sourced from the Washington State DOL and includes various attributes of the vehicles such as make, model, and electric range.

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.3     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(dplyr)
library(ggplot2)
library(tidyr)
library(ggalluvial)
library(plotly)

Attaching package: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout

Data Loading and Cleaning

electric_vehicle_data <- read_csv("/Users/zwang30/Downloads/Electric_Vehicle_Population_Data.csv")
Rows: 159467 Columns: 17
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (11): VIN (1-10), County, City, State, Make, Model, Electric Vehicle Typ...
dbl  (6): Postal Code, Model Year, Electric Range, Base MSRP, Legislative Di...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Filter and summarize the data
filtered_data <- electric_vehicle_data %>%
  dplyr::filter(Make %in% c('TESLA', 'NISSAN')) %>%
  group_by(`Model Year`, `Electric Vehicle Type`) %>%
  summarise(Average_Count = n(), .groups = 'drop') # Using n() to count the rows

# Convert to a wide format for the heatmap
heatmap_data <- tidyr::spread(filtered_data, `Electric Vehicle Type`, Average_Count)

# Convert back to long format for ggplot
long_heatmap_data <- tidyr::pivot_longer(heatmap_data, 
                                         cols = -`Model Year`, 
                                         names_to = "Electric Vehicle Type", 
                                         values_to = "Average_Count")

# Replace NA values with 0 in Average_Count
long_heatmap_data[is.na(long_heatmap_data$Average_Count), "Average_Count"] <- 0

# Create the heatmap with a specified viridis scale to ensure at least three unique colors
heatmap_plot <- ggplot(long_heatmap_data, aes(x = `Electric Vehicle Type`, y = `Model Year`, fill = Average_Count)) +
  geom_tile() + 
  scale_fill_viridis_c(option = "D", begin = 0.3, end = 0.8) +
  labs(title = 'Average Count of Electric Vehicles by Type and Model Year', 
       x = 'Electric Vehicle Type', y = 'Model Year', 
       fill = "Average Count",
       caption = 'Source: the Washington State DOL') +
  theme_minimal()

# Convert ggplot to an interactive plotly plot
interactive_heatmap_plot <- ggplotly(heatmap_plot)

Let’s check the structure of the data first

str(electric_vehicle_data)
spc_tbl_ [159,467 × 17] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ VIN (1-10)                                       : chr [1:159467] "2C4RC1N71H" "2C4RC1N7XL" "KNDC3DLCXN" "5YJ3E1EA0J" ...
 $ County                                           : chr [1:159467] "Kitsap" "Stevens" "Yakima" "Kitsap" ...
 $ City                                             : chr [1:159467] "Bremerton" "Colville" "Yakima" "Bainbridge Island" ...
 $ State                                            : chr [1:159467] "WA" "WA" "WA" "WA" ...
 $ Postal Code                                      : num [1:159467] 98311 99114 98908 98110 98501 ...
 $ Model Year                                       : num [1:159467] 2017 2020 2022 2018 2018 ...
 $ Make                                             : chr [1:159467] "CHRYSLER" "CHRYSLER" "KIA" "TESLA" ...
 $ Model                                            : chr [1:159467] "PACIFICA" "PACIFICA" "EV6" "MODEL 3" ...
 $ Electric Vehicle Type                            : chr [1:159467] "Plug-in Hybrid Electric Vehicle (PHEV)" "Plug-in Hybrid Electric Vehicle (PHEV)" "Battery Electric Vehicle (BEV)" "Battery Electric Vehicle (BEV)" ...
 $ Clean Alternative Fuel Vehicle (CAFV) Eligibility: chr [1:159467] "Clean Alternative Fuel Vehicle Eligible" "Clean Alternative Fuel Vehicle Eligible" "Eligibility unknown as battery range has not been researched" "Clean Alternative Fuel Vehicle Eligible" ...
 $ Electric Range                                   : num [1:159467] 33 32 0 215 151 239 12 0 6 0 ...
 $ Base MSRP                                        : num [1:159467] 0 0 0 0 0 0 36900 0 0 0 ...
 $ Legislative District                             : num [1:159467] 23 7 14 23 35 26 14 15 26 23 ...
 $ DOL Vehicle ID                                   : num [1:159467] 3.49e+08 1.55e+08 2.20e+08 4.77e+08 2.01e+08 ...
 $ Vehicle Location                                 : chr [1:159467] "POINT (-122.6466274 47.6341188)" "POINT (-117.90431 48.547075)" "POINT (-120.6027202 46.5965625)" "POINT (-122.5235781 47.6293323)" ...
 $ Electric Utility                                 : chr [1:159467] "PUGET SOUND ENERGY INC" "AVISTA CORP" "PACIFICORP" "PUGET SOUND ENERGY INC" ...
 $ 2020 Census Tract                                : chr [1:159467] "53035091800" "53065950500" "53077000904" "53035091001" ...
 - attr(*, "spec")=
  .. cols(
  ..   `VIN (1-10)` = col_character(),
  ..   County = col_character(),
  ..   City = col_character(),
  ..   State = col_character(),
  ..   `Postal Code` = col_double(),
  ..   `Model Year` = col_double(),
  ..   Make = col_character(),
  ..   Model = col_character(),
  ..   `Electric Vehicle Type` = col_character(),
  ..   `Clean Alternative Fuel Vehicle (CAFV) Eligibility` = col_character(),
  ..   `Electric Range` = col_double(),
  ..   `Base MSRP` = col_double(),
  ..   `Legislative District` = col_double(),
  ..   `DOL Vehicle ID` = col_double(),
  ..   `Vehicle Location` = col_character(),
  ..   `Electric Utility` = col_character(),
  ..   `2020 Census Tract` = col_character()
  .. )
 - attr(*, "problems")=<externalptr> 

Now I’ll convert necessary columns to the appropriate Now we’ll convert necessary columns to the appropriate data types

electric_vehicle_data <- electric_vehicle_data %>%
  mutate(
    `Model Year` = as.numeric(`Model Year`),
    `Electric Range` = as.numeric(`Electric Range`)
  )

Filters for relevant years and makes with sufficient data

recent_evs <- electric_vehicle_data %>%
  filter(`Model Year` >= 2015) %>%
  group_by(Make) %>%
  filter(n() > 200) %>%
  ungroup() 

Statistical Analysis

Here I will perform my statistical analysis. For example, a simple linear regression to predict “Electric Range” based on “Model year” and “Base MSRP”.

model <- lm(`Electric Range` ~ `Model Year` + `Base MSRP`, data = recent_evs)
summary(model)

Call:
lm(formula = `Electric Range` ~ `Model Year` + `Base MSRP`, data = recent_evs)

Residuals:
    Min      1Q  Median      3Q     Max 
-192.04  -33.97  -10.53   21.47 1191.70 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)   4.743e+04  1.751e+02   270.8   <2e-16 ***
`Model Year` -2.344e+01  8.667e-02  -270.4   <2e-16 ***
`Base MSRP`  -1.630e-03  3.672e-05   -44.4   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 78.99 on 148379 degrees of freedom
Multiple R-squared:  0.3306,    Adjusted R-squared:  0.3306 
F-statistic: 3.665e+04 on 2 and 148379 DF,  p-value: < 2.2e-16

Data Visualization

I will now create a faceted scatter plot to visualize the electric range by make and model year.

Here is another type of visualization, a boxplot, which can show the spread of the electric range for different makes.

ggplot(recent_evs, aes(x = Make, y = `Electric Range`, fill = Make)) +
  geom_boxplot() +
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(title = "Distribution of Electric Range Among Different Vehicle Makes",
       x = "Make",
       y = "Electric Range (miles)",
       fill = "Make") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggsave("EV_Range_Plot.png", width = 20, height = 15, units = "cm")