This is an R Markdown document. It is an analysis of cars based on their model, type, and fuel consumption, and of how these relate to CO2 emissions. The data set was obtained from Kaggle: https://www.kaggle.com/datasets/rinichristy/2022-fuel-consumption-ratings?select=MY2022+Fuel+Consumption+Ratings.csv
# Load the necessary libraries
library(readr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.1 ✔ dplyr 1.1.0
## ✔ tibble 3.1.8 ✔ stringr 1.4.0
## ✔ tidyr 1.2.0 ✔ forcats 0.5.1
## ✔ purrr 0.3.4
## Warning: package 'ggplot2' was built under R version 4.2.2
## Warning: package 'dplyr' was built under R version 4.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(dplyr)
library(ggplot2)
library(tidyr)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.2.2
## corrplot 0.92 loaded
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.2.2
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.2
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
# Load the dataset.
# The CSV location can be overridden with the CARS_CSV environment variable so
# the analysis runs on other machines; when it is unset, the original local
# path is used.
cars_path <- Sys.getenv(
  "CARS_CSV",
  unset = "C:/Users/Adnan/Downloads/cars.csv"
)
cars <- read_csv(cars_path)
## Rows: 946 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): make, model, class, transmission, fuel type
## dbl (10): model year, engine size, cylinders, Fuel Consumption (City (L/100 ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Show the column names and data types of the loaded data
utils::str(cars)
## spc_tbl_ [946 × 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ model year : num [1:946] 2022 2022 2022 2022 2022 ...
## $ make : chr [1:946] "Acura" "Acura" "Acura" "Acura" ...
## $ model : chr [1:946] "ILX" "MDX SH-AWD" "RDX SH-AWD" "RDX SH-AWD A-SPEC" ...
## $ class : chr [1:946] "Compact" "SUV: Small" "SUV: Small" "SUV: Small" ...
## $ engine size : num [1:946] 2.4 3.5 2 2 2 2 3 3 2 2 ...
## $ cylinders : num [1:946] 4 6 4 4 4 4 6 6 4 4 ...
## $ transmission : chr [1:946] "AM8" "AS10" "AS10" "AS10" ...
## $ fuel type : chr [1:946] "Z" "Z" "Z" "Z" ...
## $ Fuel Consumption (City (L/100 km): num [1:946] 9.9 12.6 11 11.3 11.2 11.3 12.3 12.3 10 10.5 ...
## $ Fuel Consumption(Hwy (L/100 km)) : num [1:946] 7 9.4 8.6 9.1 8 8.1 9.4 9.8 7.2 7.7 ...
## $ Fuel Consumption(Comb (L/100 km)): num [1:946] 8.6 11.2 9.9 10.3 9.8 9.8 11 11.2 8.7 9.2 ...
## $ Fuel Consumption(Comb (mpg)) : num [1:946] 33 25 29 27 29 29 26 25 32 31 ...
## $ CO2 emissions(g/km) : num [1:946] 200 263 232 242 230 231 256 261 205 217 ...
## $ CO2 rating : num [1:946] 6 4 5 5 5 5 5 4 6 5 ...
## $ smog rating : num [1:946] 3 5 6 6 7 7 5 5 3 3 ...
## - attr(*, "spec")=
## .. cols(
## .. `model year` = col_double(),
## .. make = col_character(),
## .. model = col_character(),
## .. class = col_character(),
## .. `engine size` = col_double(),
## .. cylinders = col_double(),
## .. transmission = col_character(),
## .. `fuel type` = col_character(),
## .. `Fuel Consumption (City (L/100 km)` = col_double(),
## .. `Fuel Consumption(Hwy (L/100 km))` = col_double(),
## .. `Fuel Consumption(Comb (L/100 km))` = col_double(),
## .. `Fuel Consumption(Comb (mpg))` = col_double(),
## .. `CO2 emissions(g/km)` = col_double(),
## .. `CO2 rating` = col_double(),
## .. `smog rating` = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Convert the columns to their respective data types.
# read_csv() has already parsed these columns as character / double (see the
# str() output), so this step only makes the intended types explicit.
# across() + all_of() replaces the superseded mutate_at() / vars() interface.
cars <- cars %>%
  mutate(
    across(
      all_of(c("model", "make", "transmission")),
      as.character
    ),
    across(
      all_of(c(
        "model year",
        "CO2 rating",
        "cylinders",
        "Fuel Consumption (City (L/100 km)",
        "Fuel Consumption(Hwy (L/100 km))"
      )),
      as.numeric
    )
  )
# Count the missing values in each column
cars %>%
  is.na() %>%
  colSums()
## model year make
## 0 0
## model class
## 0 0
## engine size cylinders
## 0 0
## transmission fuel type
## 0 0
## Fuel Consumption (City (L/100 km) Fuel Consumption(Hwy (L/100 km))
## 0 0
## Fuel Consumption(Comb (L/100 km)) Fuel Consumption(Comb (mpg))
## 0 0
## CO2 emissions(g/km) CO2 rating
## 0 0
## smog rating
## 0
# Calculate the average fuel consumption of each car as the unweighted mean of
# its city and highway consumption (L/100 km).
# The result is stored as a column of `cars` so it stays aligned with its row
# if the data frame is later filtered, sorted or grouped.
cars$avg_fuel_consumption <- rowMeans(
  cars[, c(
    "Fuel Consumption (City (L/100 km)",
    "Fuel Consumption(Hwy (L/100 km))"
  )]
)
# Keep the standalone vector as well, for code that refers to it directly.
avg_fuel_consumption <- cars$avg_fuel_consumption
# Calculate the correlation matrix between the number of cylinders and the
# CO2 emissions, using only complete observations
correlation_matrix <- cars %>%
  select(cylinders, `CO2 emissions(g/km)`) %>%
  cor(use = "complete.obs")

# Print the correlation matrix
print(correlation_matrix)
## cylinders CO2 emissions(g/km)
## cylinders 1.0000000 0.8332406
## CO2 emissions(g/km) 0.8332406 1.0000000
# Create the correlation plot (upper triangle, coloured cells with the
# coefficient printed in white).
# The colour-scale limits argument is `col.lim`; the former name `cl.lim` is
# not recognised by this corrplot version and is passed on to text()/title(),
# which warn that it "is not a graphical parameter".
corrplot(
  correlation_matrix,
  type = "upper",
  method = "color",
  tl.col = "black",
  tl.srt = 0,
  tl.offset = 0.3,
  addCoef.col = "white",
  addgrid.col = "gray",
  col.lim = c(-1, 1),
  number.cex = 0.7
)
## Warning in text.default(pos.xlabel[, 1], pos.xlabel[, 2], newcolnames, srt =
## tl.srt, : "cl.lim" is not a graphical parameter
## Warning in text.default(pos.ylabel[, 1], pos.ylabel[, 2], newrownames, col =
## tl.col, : "cl.lim" is not a graphical parameter
## Warning in title(title, ...): "cl.lim" is not a graphical parameter
# Interpretation: the correlation coefficient between the number of cylinders
# and CO2 emissions is 0.8332406, which indicates a positive correlation: as
# the number of cylinders increases, so do the CO2 emissions.
# The diagonal entries are 1, as expected, since any variable is perfectly
# correlated with itself.

# Add a title to the plot
title(main = "Correlation Between Number of Cylinders and CO2 Emissions")
# Scatter plot of average fuel consumption against CO2 emissions.
# Point colour encodes the make and point shape encodes the fuel type.
ggplot(
  data = cars,
  mapping = aes(
    x = avg_fuel_consumption,
    y = `CO2 emissions(g/km)`,
    colour = make,
    shape = `fuel type`
  )
) +
  geom_point() +
  scale_color_discrete() +
  labs(title = "Fuel Consumption vs. CO2 Emissions", color = "Make")
# Create box plot of average fuel consumption by engine size, split by fuel
# type. The x axis is the engine size treated as a discrete factor, so the
# title and axis labels refer to engine size (not engine type).
ggplot(
  cars,
  aes(
    x = factor(`engine size`),
    y = avg_fuel_consumption,
    fill = `fuel type`
  )
) +
  geom_boxplot() +
  labs(
    title = "Fuel Consumption by Engine Size",
    x = "Engine size",
    y = "Average fuel consumption (L/100 km)"
  ) +
  theme(axis.text.x = element_text(angle = 85, vjust = 0.5))
# Create bar plot of average fuel consumption by car make.
# The data are summarised first: geom_col() draws the values it is given, so
# feeding it one row per car would stack (sum) the cars instead of averaging
# them, and group_by() on its own has no effect on a ggplot.
cars %>%
  mutate(
    # Per-car average of city and highway consumption, computed here so this
    # chunk does not depend on objects created elsewhere.
    avg_fuel_consumption = (
      `Fuel Consumption (City (L/100 km)` +
        `Fuel Consumption(Hwy (L/100 km))`
    ) / 2
  ) %>%
  group_by(make, `fuel type`) %>%
  summarise(
    avg_fuel_consumption = mean(avg_fuel_consumption, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = make, y = avg_fuel_consumption, fill = `fuel type`)) +
  # Dodge so each fuel type's bar shows its own mean rather than a stacked sum.
  geom_col(position = "dodge") +
  labs(
    title = "Average Fuel Consumption by Car Make",
    x = "Make",
    y = "Average fuel consumption (L/100 km)"
  ) +
  theme(axis.text.x = element_text(angle = 85, vjust = 0.5))
# Reshape data to a make x class table of mean CO2 emissions.
# An explicit aggregation function is required because several cars share the
# same make/class pair; without one, dcast() falls back to length() and the
# table would contain counts of cars instead of emissions.
df <- dcast(
  cars,
  make ~ class,
  value.var = "CO2 emissions(g/km)",
  fun.aggregate = mean
)
## Aggregation function missing: defaulting to length
# Create heatmap of CO2 emissions by make and fuel type.
# Each make / fuel-type cell is reduced to its mean first; plotting one tile
# per car would draw the tiles on top of each other, leaving only one
# arbitrary car's value visible in each cell.
cars %>%
  group_by(make, `fuel type`) %>%
  summarise(
    mean_co2 = mean(`CO2 emissions(g/km)`, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = `fuel type`, y = make, fill = mean_co2)) +
  geom_tile() +
  scale_fill_gradient2(
    low = "green",
    mid = "blue",
    high = "red",
    midpoint = 200
  ) +
  labs(
    title = "Mean CO2 Emissions by Make and Fuel Type",
    fill = "Mean CO2 emissions (g/km)"
  ) +
  theme(axis.text.x = element_text(angle = 85, vjust = 0.5))
# Create heatmap of CO2 emissions by make and transmission type.
# Each make / transmission cell is reduced to its mean first; plotting one
# tile per car would draw the tiles on top of each other, leaving only one
# arbitrary car's value visible in each cell.
cars %>%
  group_by(make, transmission) %>%
  summarise(
    mean_co2 = mean(`CO2 emissions(g/km)`, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = transmission, y = make, fill = mean_co2)) +
  geom_tile() +
  scale_fill_gradient2(
    low = "green",
    mid = "blue",
    high = "red",
    midpoint = 200
  ) +
  labs(
    title = "Mean CO2 Emissions by Make and Transmission",
    fill = "Mean CO2 emissions (g/km)"
  ) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))