R Markdown

This is an R Markdown document. It analyses how a car's model, type, and fuel consumption relate to its CO2 emissions. The data set was obtained from Kaggle: https://www.kaggle.com/datasets/rinichristy/2022-fuel-consumption-ratings?select=MY2022+Fuel+Consumption+Ratings.csv.

# Load the necessary libraries
library(readr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.1     ✔ dplyr   1.1.0
## ✔ tibble  3.1.8     ✔ stringr 1.4.0
## ✔ tidyr   1.2.0     ✔ forcats 0.5.1
## ✔ purrr   0.3.4
## Warning: package 'ggplot2' was built under R version 4.2.2
## Warning: package 'dplyr' was built under R version 4.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(dplyr)
library(ggplot2)
library(tidyr)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.2.2
## corrplot 0.92 loaded
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.2.2
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.2
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
# Read the fuel-consumption ratings CSV into a tibble named `cars`.
# NOTE(review): the path is absolute and specific to one machine; point it at
# your own copy of the file before re-running.
cars <- read_csv(file = "C:/Users/Adnan/Downloads/cars.csv")
## Rows: 946 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): make, model, class, transmission, fuel type
## dbl (10): model year, engine size, cylinders, Fuel Consumption (City (L/100 ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the structure of `cars`: column names and the type of each column
str(object = cars)
## spc_tbl_ [946 × 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ model year                       : num [1:946] 2022 2022 2022 2022 2022 ...
##  $ make                             : chr [1:946] "Acura" "Acura" "Acura" "Acura" ...
##  $ model                            : chr [1:946] "ILX" "MDX SH-AWD" "RDX SH-AWD" "RDX SH-AWD A-SPEC" ...
##  $ class                            : chr [1:946] "Compact" "SUV: Small" "SUV: Small" "SUV: Small" ...
##  $ engine size                      : num [1:946] 2.4 3.5 2 2 2 2 3 3 2 2 ...
##  $ cylinders                        : num [1:946] 4 6 4 4 4 4 6 6 4 4 ...
##  $ transmission                     : chr [1:946] "AM8" "AS10" "AS10" "AS10" ...
##  $ fuel type                        : chr [1:946] "Z" "Z" "Z" "Z" ...
##  $ Fuel Consumption (City (L/100 km): num [1:946] 9.9 12.6 11 11.3 11.2 11.3 12.3 12.3 10 10.5 ...
##  $ Fuel Consumption(Hwy (L/100 km)) : num [1:946] 7 9.4 8.6 9.1 8 8.1 9.4 9.8 7.2 7.7 ...
##  $ Fuel Consumption(Comb (L/100 km)): num [1:946] 8.6 11.2 9.9 10.3 9.8 9.8 11 11.2 8.7 9.2 ...
##  $ Fuel Consumption(Comb (mpg))     : num [1:946] 33 25 29 27 29 29 26 25 32 31 ...
##  $ CO2 emissions(g/km)              : num [1:946] 200 263 232 242 230 231 256 261 205 217 ...
##  $ CO2 rating                       : num [1:946] 6 4 5 5 5 5 5 4 6 5 ...
##  $ smog rating                      : num [1:946] 3 5 6 6 7 7 5 5 3 3 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `model year` = col_double(),
##   ..   make = col_character(),
##   ..   model = col_character(),
##   ..   class = col_character(),
##   ..   `engine size` = col_double(),
##   ..   cylinders = col_double(),
##   ..   transmission = col_character(),
##   ..   `fuel type` = col_character(),
##   ..   `Fuel Consumption (City (L/100 km)` = col_double(),
##   ..   `Fuel Consumption(Hwy (L/100 km))` = col_double(),
##   ..   `Fuel Consumption(Comb (L/100 km))` = col_double(),
##   ..   `Fuel Consumption(Comb (mpg))` = col_double(),
##   ..   `CO2 emissions(g/km)` = col_double(),
##   ..   `CO2 rating` = col_double(),
##   ..   `smog rating` = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Coerce columns to the types the analysis expects: text for the identifying
# columns, numeric for the measurements. across() replaces the superseded
# mutate_at()/vars() helpers, and all_of() selects by exact column name, which
# is required here because several names contain spaces and parentheses.

  cars <- cars %>%
    mutate(
      across(all_of(c("model", "make", "transmission")), as.character),
      across(
        all_of(c(
          "model year", "CO2 rating", "cylinders",
          "Fuel Consumption (City (L/100 km)",
          "Fuel Consumption(Hwy (L/100 km))"
        )),
        as.numeric
      )
    )


# Count the missing values in each column
  colSums(is.na(cars))
##                        model year                              make 
##                                 0                                 0 
##                             model                             class 
##                                 0                                 0 
##                       engine size                         cylinders 
##                                 0                                 0 
##                      transmission                         fuel type 
##                                 0                                 0 
## Fuel Consumption (City (L/100 km)  Fuel Consumption(Hwy (L/100 km)) 
##                                 0                                 0 
## Fuel Consumption(Comb (L/100 km))      Fuel Consumption(Comb (mpg)) 
##                                 0                                 0 
##               CO2 emissions(g/km)                        CO2 rating 
##                                 0                                 0 
##                       smog rating 
##                                 0
  # Average fuel consumption of each car (L/100 km): the plain mean of the
  # city and highway figures, one value per row of `cars`.
  fuel_columns <- c("Fuel Consumption (City (L/100 km)", "Fuel Consumption(Hwy (L/100 km))")
  avg_fuel_consumption <- rowMeans(cars[, fuel_columns])

  # Also keep the result as a column so it stays aligned with its row if `cars`
  # is later filtered, sorted or grouped. The standalone vector above is kept
  # for any code that refers to it directly.
  cars$avg_fuel_consumption <- avg_fuel_consumption

  
  
  # Correlation between the number of cylinders and CO2 emissions, computed on
  # the rows where both values are present
  correlation_matrix <- cars %>%
    select(cylinders, `CO2 emissions(g/km)`) %>%
    cor(use = "complete.obs")

  # Print the correlation matrix
  print(correlation_matrix)
##                     cylinders CO2 emissions(g/km)
## cylinders           1.0000000           0.8332406
## CO2 emissions(g/km) 0.8332406           1.0000000
  # Draw the correlation matrix as a coloured upper triangle with the
  # coefficients printed in white on top of the tiles.
  # The colour range is set with `col.lim`: the older spelling `cl.lim` is not
  # an argument of the loaded corrplot (0.92), so it was forwarded to the base
  # graphics text()/title() calls, each of which warned that "cl.lim" is not a
  # graphical parameter.
  corrplot(correlation_matrix, type = "upper", method = "color",
           tl.col = "black", tl.srt = 0, tl.offset = 0.3,
           addCoef.col = "white", addgrid.col = "gray",
           col.lim = c(-1, 1), number.cex = 0.7)
  # The correlation coefficient between the number of cylinders and CO2 emissions is 0.8332406, which indicates a strong positive correlation between the two variables. In other words, as the number of cylinders increases, CO2 emissions tend to increase as well.

  # The correlation coefficient between CO2 emissions and itself is 1, which is expected because any variable is perfectly correlated with itself.
  
  # Put a main title on the correlation plot that is currently open
  title(main = "Correlation Between Number of Cylinders and CO2 Emissions")

  # Scatter plot of CO2 emissions against average fuel consumption.
  # Point shape encodes the fuel type and point colour encodes the make.
  ggplot(
    data = cars,
    mapping = aes(
      x = avg_fuel_consumption,
      y = `CO2 emissions(g/km)`,
      shape = `fuel type`,
      colour = make
    )
  ) +
    geom_point() +
    scale_colour_discrete() +
    labs(title = "Fuel Consumption vs. CO2 Emissions", colour = "Make")

  # Box plot of average fuel consumption for each engine size, split by fuel
  # type. `engine size` is numeric, so it is wrapped in factor() to get one
  # group of boxes per distinct size instead of a continuous axis.
  # The title and axis labels name the variable that is actually plotted
  # (engine size); the data set has no "engine type" column.
  ggplot(cars, aes(x = factor(`engine size`), y = avg_fuel_consumption, fill = `fuel type`)) +
    geom_boxplot() +
    labs(
      title = "Fuel Consumption by Engine Size",
      x = "Engine size",
      y = "Average fuel consumption (L/100 km)",
      fill = "Fuel type"
    ) +
    theme(axis.text.x = element_text(angle = 85, vjust = 0.5))

  # Bar plot of the average fuel consumption of each make, split by fuel type.
  # The per-car average (mean of city and highway, L/100 km) is computed inside
  # the pipeline so that it is grouped together with its row, then reduced to
  # one mean per make / fuel type. Previously group_by() had no effect on the
  # plot, the x axis showed `class`, and geom_col() stacked every row, so bar
  # heights were sums per class rather than averages per make.
  cars %>%
    mutate(
      avg_fuel_consumption =
        (`Fuel Consumption (City (L/100 km)` + `Fuel Consumption(Hwy (L/100 km))`) / 2
    ) %>%
    group_by(make, `fuel type`) %>%
    summarise(avg_fuel_consumption = mean(avg_fuel_consumption), .groups = "drop") %>%
    ggplot(aes(x = make, y = avg_fuel_consumption, fill = `fuel type`)) +
    # Dodge instead of stack: stacking would add the per-fuel-type means together
    geom_col(position = "dodge") +
    labs(
      title = "Average Fuel Consumption by Car Make",
      x = "Make",
      y = "Average fuel consumption (L/100 km)",
      fill = "Fuel type"
    ) +
    theme(axis.text.x = element_text(angle = 85, vjust = 0.5))

  # Reshape to a wide table: one row per make, one column per vehicle class,
  # each cell holding the mean CO2 emissions (g/km) of the matching cars.
  # The aggregation function is given explicitly because a make/class cell can
  # contain several models; without it dcast() falls back to length() and the
  # table holds row counts instead of emissions. Make/class combinations with
  # no cars are filled with NA.
  df <- dcast(
    cars, make ~ class,
    value.var = "CO2 emissions(g/km)",
    fun.aggregate = mean,
    fill = NA_real_
  )
  # Heatmap of mean CO2 emissions for each make / fuel type combination.
  # Several models share the same make and fuel type, so emissions are averaged
  # per cell first; plotting the raw rows makes geom_tile() draw the tiles of a
  # cell on top of each other, leaving only the last one visible.
  # The title names the variables on the axes (make and fuel type).
  cars %>%
    group_by(make, `fuel type`) %>%
    summarise(`CO2 emissions(g/km)` = mean(`CO2 emissions(g/km)`), .groups = "drop") %>%
    ggplot(aes(x = `fuel type`, y = make, fill = `CO2 emissions(g/km)`)) +
    geom_tile() +
    scale_fill_gradient2(low = "green", mid = "blue", high = "red", midpoint = 200) +
    labs(title = "Mean CO2 Emissions by Make and Fuel Type") +
    theme(axis.text.x = element_text(angle = 85, vjust = 0.5))

  # Heatmap of mean CO2 emissions for each make / transmission combination.
  # Several models share the same make and transmission, so emissions are
  # averaged per cell first; plotting the raw rows makes geom_tile() draw the
  # tiles of a cell on top of each other, leaving only the last one visible.
  # The title names the variables on the axes (make and transmission).
  cars %>%
    group_by(make, transmission) %>%
    summarise(`CO2 emissions(g/km)` = mean(`CO2 emissions(g/km)`), .groups = "drop") %>%
    ggplot(aes(x = transmission, y = make, fill = `CO2 emissions(g/km)`)) +
    geom_tile() +
    scale_fill_gradient2(low = "green", mid = "blue", high = "red", midpoint = 200) +
    labs(title = "Mean CO2 Emissions by Make and Transmission") +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5))