Adding required packages

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.0.2     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Load the input file to a Dataframe

ecars <- read_csv("EVDatabase.csv")
## Rows: 180 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): Name, Subtitle, Acceleration, TopSpeed, Range, Efficiency, FastCha...
## dbl  (1): NumberofSeats
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

check quality of the data

head(ecars)
## # A tibble: 6 × 11
##   Name    Subtitle  Acceleration TopSpeed Range Efficiency FastChargeSpeed Drive
##   <chr>   <chr>     <chr>        <chr>    <chr> <chr>      <chr>           <chr>
## 1 Opel A… Battery … 7.3 sec      150 km/h 335 … 173 Wh/km  210 km/h        Fron…
## 2 Renaul… Battery … 22.4 sec     130 km/h 160 … 194 Wh/km  -               Fron…
## 3 Nissan… Battery … 7.9 sec      144 km/h 220 … 164 Wh/km  230 km/h        Fron…
## 4 Audi e… Battery … 5.7 sec      200 km/h 375 … 231 Wh/km  600 km/h        All …
## 5 Porsch… Battery … 2.8 sec      260 km/h 390 … 215 Wh/km  860 km/h        All …
## 6 Nissan… Battery … 14.0 sec     123 km/h 165 … 218 Wh/km  170 km/h        Fron…
## # … with 3 more variables: NumberofSeats <dbl>, PriceinGermany <chr>,
## #   PriceinUK <chr>

This file requies a clean up. Most of the numeric fields are given as Char

First step of the cleanup, Manufacturer name is in the Name field. Lets create a new field for the same

ecars <- ecars %>% 
  separate(Name, into = "Manufacturer", sep = ' ', extra = 'drop', remove = FALSE)

A new column Manufacturer is added. Lets check the structure of the dataframe

glimpse(ecars)
## Rows: 180
## Columns: 12
## $ Name            <chr> "Opel Ampera-e", "Renault Kangoo Maxi ZE 33", "Nissan …
## $ Manufacturer    <chr> "Opel", "Renault", "Nissan", "Audi", "Porsche", "Nissa…
## $ Subtitle        <chr> "Battery Electric Vehicle |       58 kWh", "Battery El…
## $ Acceleration    <chr> "7.3 sec", "22.4 sec", "7.9 sec", "5.7 sec", "2.8 sec"…
## $ TopSpeed        <chr> "150 km/h", "130 km/h", "144 km/h", "200 km/h", "260 k…
## $ Range           <chr> "335 km", "160 km", "220 km", "375 km", "390 km", "165…
## $ Efficiency      <chr> "173 Wh/km", "194 Wh/km", "164 Wh/km", "231 Wh/km", "2…
## $ FastChargeSpeed <chr> "210 km/h", "-", "230 km/h", "600 km/h", "860 km/h", "…
## $ Drive           <chr> "Front Wheel Drive", "Front Wheel Drive", "Front Wheel…
## $ NumberofSeats   <dbl> 5, 5, 5, 5, 4, 7, 5, 5, 5, 4, 5, 5, 4, 5, 4, 5, 5, 5, …
## $ PriceinGermany  <chr> "€42,990", "N/A", "€29,990", "N/A", "€186,336", "€43,4…
## $ PriceinUK       <chr> "N/A", "£31,680", "£25,995", "£79,900", "£138,830", "£…

Second step in cleanup, convert all the Fields attributes which is supposed to be a numeric

ecars <- ecars %>% 
    separate(Acceleration, c("Acceleration"), sep = ' ', extra = 'drop') %>% 
    mutate(Acceleration = as.double(Acceleration)) %>%
    separate(TopSpeed, c("TopSpeed"), sep = ' ', extra = 'drop') %>%
    mutate(TopSpeed = as.double(TopSpeed)) %>%
    separate(Range, c("Range"), sep = ' ', extra = 'drop') %>%
    mutate(Range = as.double(Range)) %>%
    separate(Efficiency, c("Efficiency"), sep = ' ', extra = 'drop') %>%
    mutate(Efficiency = as.double(Efficiency)) %>%
    separate(FastChargeSpeed, c("FastChargeSpeed"), sep = ' ', extra = 'drop') %>%
    mutate(FastChargeSpeed = as.double(FastChargeSpeed))
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion

Lets check the dataframe structure

glimpse(ecars)
## Rows: 180
## Columns: 12
## $ Name            <chr> "Opel Ampera-e", "Renault Kangoo Maxi ZE 33", "Nissan …
## $ Manufacturer    <chr> "Opel", "Renault", "Nissan", "Audi", "Porsche", "Nissa…
## $ Subtitle        <chr> "Battery Electric Vehicle |       58 kWh", "Battery El…
## $ Acceleration    <dbl> 7.3, 22.4, 7.9, 5.7, 2.8, 14.0, 8.9, 6.8, 7.3, 7.3, 8.…
## $ TopSpeed        <dbl> 150, 130, 144, 200, 260, 123, 160, 180, 157, 150, 160,…
## $ Range           <dbl> 335, 160, 220, 375, 390, 165, 275, 385, 325, 235, 355,…
## $ Efficiency      <dbl> 173, 194, 164, 231, 215, 218, 164, 192, 172, 161, 187,…
## $ FastChargeSpeed <dbl> 210, NA, 230, 600, 860, 170, 260, 520, 390, 270, 420, …
## $ Drive           <chr> "Front Wheel Drive", "Front Wheel Drive", "Front Wheel…
## $ NumberofSeats   <dbl> 5, 5, 5, 5, 4, 7, 5, 5, 5, 4, 5, 5, 4, 5, 4, 5, 5, 5, …
## $ PriceinGermany  <chr> "€42,990", "N/A", "€29,990", "N/A", "€186,336", "€43,4…
## $ PriceinUK       <chr> "N/A", "£31,680", "£25,995", "£79,900", "£138,830", "£…

Price column PriceinGermany is having currency symbol, convert that to numeric

ecars <- ecars %>% 
    mutate(PriceinGermany = ifelse(PriceinGermany == "N/A", "€0", PriceinGermany))

ecars$PriceinGermany <- gsub("€", "", as.character(ecars$PriceinGermany))
ecars$PriceinGermany <- gsub(",", "", as.character(ecars$PriceinGermany))

ecars <- ecars %>%
    mutate(PriceinGermany = as.numeric(PriceinGermany))

Similarly, change the formating to PriceinUK column as well

ecars <- ecars %>% 
    mutate(PriceinUK = ifelse(PriceinUK == "N/A", "£0", PriceinUK))

ecars$PriceinUK <- gsub("£", "", as.character(ecars$PriceinUK))
ecars$PriceinUK <- gsub(",", "", as.character(ecars$PriceinUK))

ecars <- ecars %>%
    mutate(PriceinUK = as.numeric(PriceinUK))

Lets check the structure

glimpse(ecars)
## Rows: 180
## Columns: 12
## $ Name            <chr> "Opel Ampera-e", "Renault Kangoo Maxi ZE 33", "Nissan …
## $ Manufacturer    <chr> "Opel", "Renault", "Nissan", "Audi", "Porsche", "Nissa…
## $ Subtitle        <chr> "Battery Electric Vehicle |       58 kWh", "Battery El…
## $ Acceleration    <dbl> 7.3, 22.4, 7.9, 5.7, 2.8, 14.0, 8.9, 6.8, 7.3, 7.3, 8.…
## $ TopSpeed        <dbl> 150, 130, 144, 200, 260, 123, 160, 180, 157, 150, 160,…
## $ Range           <dbl> 335, 160, 220, 375, 390, 165, 275, 385, 325, 235, 355,…
## $ Efficiency      <dbl> 173, 194, 164, 231, 215, 218, 164, 192, 172, 161, 187,…
## $ FastChargeSpeed <dbl> 210, NA, 230, 600, 860, 170, 260, 520, 390, 270, 420, …
## $ Drive           <chr> "Front Wheel Drive", "Front Wheel Drive", "Front Wheel…
## $ NumberofSeats   <dbl> 5, 5, 5, 5, 4, 7, 5, 5, 5, 4, 5, 5, 4, 5, 4, 5, 5, 5, …
## $ PriceinGermany  <dbl> 42990, 0, 29990, 0, 186336, 43433, 31960, 66300, 38350…
## $ PriceinUK       <dbl> 0, 31680, 25995, 79900, 138830, 30255, 27120, 58850, 3…

There are few columns which has NA, convert that to ZERO to make the calculations easier

ecars[is.na(ecars)] <- 0

The field Subtitle is suffixed with with “Battery Electric Vehicle |”, lets remove that

ecars$Subtitle <- gsub("Battery Electric Vehicle |       ", "", as.character(ecars$Subtitle))

Still this column needs some more cleanup

# To remove the hardcoded value kWh
ecars$Subtitle <- gsub("kWh", "", as.character(ecars$Subtitle))
# To remove the first hardcoded value in the column.
ecars$Subtitle <- gsub("^.", "", as.character(ecars$Subtitle))

# Convert the column Subtitle to numeric and rename it to KWH
ecars <- ecars %>%
    mutate(Subtitle = as.numeric(Subtitle)) %>%
    rename(KWH = Subtitle)

All the cleanup activity is complete for the data set, lets look at the structure

glimpse(ecars)
## Rows: 180
## Columns: 12
## $ Name            <chr> "Opel Ampera-e", "Renault Kangoo Maxi ZE 33", "Nissan …
## $ Manufacturer    <chr> "Opel", "Renault", "Nissan", "Audi", "Porsche", "Nissa…
## $ KWH             <dbl> 58.0, 31.0, 36.0, 86.5, 83.7, 36.0, 45.0, 74.0, 56.0, …
## $ Acceleration    <dbl> 7.3, 22.4, 7.9, 5.7, 2.8, 14.0, 8.9, 6.8, 7.3, 7.3, 8.…
## $ TopSpeed        <dbl> 150, 130, 144, 200, 260, 123, 160, 180, 157, 150, 160,…
## $ Range           <dbl> 335, 160, 220, 375, 390, 165, 275, 385, 325, 235, 355,…
## $ Efficiency      <dbl> 173, 194, 164, 231, 215, 218, 164, 192, 172, 161, 187,…
## $ FastChargeSpeed <dbl> 210, 0, 230, 600, 860, 170, 260, 520, 390, 270, 420, 3…
## $ Drive           <chr> "Front Wheel Drive", "Front Wheel Drive", "Front Wheel…
## $ NumberofSeats   <dbl> 5, 5, 5, 5, 4, 7, 5, 5, 5, 4, 5, 5, 4, 5, 4, 5, 5, 5, …
## $ PriceinGermany  <dbl> 42990, 0, 29990, 0, 186336, 43433, 31960, 66300, 38350…
## $ PriceinUK       <dbl> 0, 31680, 25995, 79900, 138830, 30255, 27120, 58850, 3…

Now we will start plotting the data and understand it better.

First we will create correlation table

ecars_only_numeric_fields <- ecars %>%
    select(-c(Name, Manufacturer, Drive))

ecars_only_numeric_fields.cor = cor(ecars_only_numeric_fields)
ecars_only_numeric_fields.cor
##                        KWH Acceleration    TopSpeed       Range  Efficiency
## KWH              1.0000000   -0.6229854  0.69141548  0.89009245  0.35642612
## Acceleration    -0.6229854    1.0000000 -0.82104106 -0.71207706  0.14098402
## TopSpeed         0.6914155   -0.8210411  1.00000000  0.76031899 -0.04738347
## Range            0.8900925   -0.7120771  0.76031899  1.00000000 -0.07227251
## Efficiency       0.3564261    0.1409840 -0.04738347 -0.07227251  1.00000000
## FastChargeSpeed  0.6597431   -0.7416703  0.77318976  0.76757435 -0.05991384
## NumberofSeats    0.1344473    0.3183015 -0.24256727 -0.06885113  0.60246852
## PriceinGermany   0.5980835   -0.5015186  0.67886530  0.56903699  0.23922128
## PriceinUK        0.6212530   -0.6742728  0.79266892  0.62075705  0.08452729
##                 FastChargeSpeed NumberofSeats PriceinGermany   PriceinUK
## KWH                  0.65974314    0.13444725     0.59808354  0.62125296
## Acceleration        -0.74167034    0.31830154    -0.50151859 -0.67427279
## TopSpeed             0.77318976   -0.24256727     0.67886530  0.79266892
## Range                0.76757435   -0.06885113     0.56903699  0.62075705
## Efficiency          -0.05991384    0.60246852     0.23922128  0.08452729
## FastChargeSpeed      1.00000000   -0.11964428     0.59560913  0.65085116
## NumberofSeats       -0.11964428    1.00000000    -0.01245501 -0.21198513
## PriceinGermany       0.59560913   -0.01245501     1.00000000  0.69875323
## PriceinUK            0.65085116   -0.21198513     0.69875323  1.00000000

Top Speed has a high correlation to price in Germany and price in the UK which makes sense. A car that has a higher top speed will have a higher price.

We can understand the data better with the Heatmap

palette = colorRampPalette(c("green", "white", "red")) (20)
heatmap(x = ecars_only_numeric_fields.cor, col = palette, symm = TRUE)

#### Let us check each type of drive

ecars %>%
    ggplot(aes(x = Drive)) +
    geom_bar() +
    theme_classic()

The plot can be verified, by checking the data

ecars %>%
    group_by(Drive) %>%
    summarize(Total_count = n()) %>%
    arrange(-Total_count)
## # A tibble: 3 × 2
##   Drive             Total_count
##   <chr>                   <int>
## 1 Front Wheel Drive          71
## 2 All Wheel Drive            64
## 3 Rear Wheel Drive           45

Now lets see the count of vehicles with Number of Seats

ecars %>%
    ggplot(aes(x = NumberofSeats)) +
    geom_bar() +
    labs(x = "Number of Seats") +
    theme_classic()

Check the Car Distribution in comparison with Number of Seats and Drive

barplot(table(ecars$Drive, ecars$NumberofSeats), 
        main="Car Distribution considering Number of Seats and Drive type",
        xlab="NumberofSeats", col=c("darkblue","red"),
        legend = rownames(table(ecars$Drive, ecars$NumberofSeats)), beside = TRUE)

Lets find who is the top manufacturer of Electric vehicle

ecars %>%
    ggplot(aes(x = Manufacturer)) +
    geom_bar() +
    labs(x = "Manufacturer") +
    coord_flip() +
    theme_classic()

We could see that Audi and Tesla has more number of electric vehicles.

Now let us look at the relationship between Acceleration and KWH

ecars %>%
    ggplot(aes(x = KWH, y = Acceleration, color = Drive)) +
    geom_point() +
    theme_classic()

Lets compare with KWH and Acceleration with Number of Seats

ecars %>%
    ggplot(aes(x = KWH, y = Acceleration, size = NumberofSeats)) +
    geom_point() +
    theme_classic()

Lets compare the Range and Top Speed

ecars %>%
    ggplot(aes(x = TopSpeed, y = Range, color = Drive)) +
    geom_point() +
    theme_classic()