##1. Understanding the Data ##2. Data Exploration ##3. Simple-linear Regression Model ##4. Multi-linear Regression Model
## Load the fuel-consumption data set into `df`.
## NOTE(review): this is an absolute, machine-specific path; the script only
## runs where this exact file exists -- confirm the intended data location.
df <- read.csv(file = "/Users/apple/Desktop/FuelConsumption.csv")
library(ggplot2)
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble 3.1.7 ✔ purrr 0.3.4
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::arrange() masks plyr::arrange()
## ✖ purrr::compact() masks plyr::compact()
## ✖ dplyr::count() masks plyr::count()
## ✖ dplyr::failwith() masks plyr::failwith()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::id() masks plyr::id()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::mutate() masks plyr::mutate()
## ✖ dplyr::rename() masks plyr::rename()
## ✖ dplyr::summarise() masks plyr::summarise()
## ✖ dplyr::summarize() masks plyr::summarize()
library(lubridate)
##
## Attaching package: 'lubridate'
##
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(caTools)
library(ggthemes)
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(data.table)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:reshape2':
##
## dcast, melt
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following object is masked from 'package:purrr':
##
## transpose
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(tidyr)
library(corrgram)
##
## Attaching package: 'corrgram'
##
## The following object is masked from 'package:plyr':
##
## baseball
library(corrplot)
## corrplot 0.92 loaded
library(formattable)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'lattice'
##
## The following object is masked from 'package:corrgram':
##
## panel.fill
##
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
## Compact overview of the data frame: column names, types and first values.
str(object = df)
## 'data.frame': 1067 obs. of 13 variables:
## $ MODELYEAR : int 2014 2014 2014 2014 2014 2014 2014 2014 2014 2014 ...
## $ MAKE : chr "ACURA" "ACURA" "ACURA" "ACURA" ...
## $ MODEL : chr "ILX" "ILX" "ILX HYBRID" "MDX 4WD" ...
## $ VEHICLECLASS : chr "COMPACT" "COMPACT" "COMPACT" "SUV - SMALL" ...
## $ ENGINESIZE : num 2 2.4 1.5 3.5 3.5 3.5 3.5 3.7 3.7 2.4 ...
## $ CYLINDERS : int 4 4 4 6 6 6 6 6 6 4 ...
## $ TRANSMISSION : chr "AS5" "M6" "AV7" "AS6" ...
## $ FUELTYPE : chr "Z" "Z" "Z" "Z" ...
## $ FUELCONSUMPTION_CITY : num 9.9 11.2 6 12.7 12.1 11.9 11.8 12.8 13.4 10.6 ...
## $ FUELCONSUMPTION_HWY : num 6.7 7.7 5.8 9.1 8.7 7.7 8.1 9 9.5 7.5 ...
## $ FUELCONSUMPTION_COMB : num 8.5 9.6 5.9 11.1 10.6 10 10.1 11.1 11.6 9.2 ...
## $ FUELCONSUMPTION_COMB_MPG: int 33 29 48 25 27 28 28 25 24 31 ...
## $ CO2EMISSIONS : int 196 221 136 255 244 230 232 255 267 212 ...
## Per-column summary (quartiles/mean for numeric columns, length/class for
## character columns).
summary(object = df)
## MODELYEAR MAKE MODEL VEHICLECLASS
## Min. :2014 Length:1067 Length:1067 Length:1067
## 1st Qu.:2014 Class :character Class :character Class :character
## Median :2014 Mode :character Mode :character Mode :character
## Mean :2014
## 3rd Qu.:2014
## Max. :2014
## ENGINESIZE CYLINDERS TRANSMISSION FUELTYPE
## Min. :1.000 Min. : 3.000 Length:1067 Length:1067
## 1st Qu.:2.000 1st Qu.: 4.000 Class :character Class :character
## Median :3.400 Median : 6.000 Mode :character Mode :character
## Mean :3.346 Mean : 5.795
## 3rd Qu.:4.300 3rd Qu.: 8.000
## Max. :8.400 Max. :12.000
## FUELCONSUMPTION_CITY FUELCONSUMPTION_HWY FUELCONSUMPTION_COMB
## Min. : 4.60 Min. : 4.900 Min. : 4.70
## 1st Qu.:10.25 1st Qu.: 7.500 1st Qu.: 9.00
## Median :12.60 Median : 8.800 Median :10.90
## Mean :13.30 Mean : 9.475 Mean :11.58
## 3rd Qu.:15.55 3rd Qu.:10.850 3rd Qu.:13.35
## Max. :30.20 Max. :20.500 Max. :25.80
## FUELCONSUMPTION_COMB_MPG CO2EMISSIONS
## Min. :11.00 Min. :108.0
## 1st Qu.:21.00 1st Qu.:207.0
## Median :26.00 Median :251.0
## Mean :26.44 Mean :256.2
## 3rd Qu.:31.00 3rd Qu.:294.0
## Max. :60.00 Max. :488.0
## Show the first ten rows of the data.
head(x = df, n = 10)
Now we have a general idea about the data. There are 13 variables including one dependent variable (CO2 EMISSIONS) and 12 independent variables.
## Set the 13 column names explicitly (same names and order as reported by
## str(df)), so later code can rely on them.
names(df) <- c(
  "MODELYEAR", "MAKE", "MODEL", "VEHICLECLASS",
  "ENGINESIZE", "CYLINDERS", "TRANSMISSION", "FUELTYPE",
  "FUELCONSUMPTION_CITY", "FUELCONSUMPTION_HWY", "FUELCONSUMPTION_COMB",
  "FUELCONSUMPTION_COMB_MPG", "CO2EMISSIONS"
)
## Pairwise correlations between all numeric columns.
## vapply() is used instead of sapply() so the column mask is guaranteed to be
## a logical vector (sapply() can silently return a list or other type).
Num.cols <- vapply(df, is.numeric, logical(1))
## MODELYEAR is constant (2014 in every row), so its standard deviation is
## zero: cor() warns and reports NA for that row and column.
Cor.data <- cor(df[, Num.cols])
## Warning in cor(df[, Num.cols]): the standard deviation is zero
Cor.data
## MODELYEAR ENGINESIZE CYLINDERS FUELCONSUMPTION_CITY
## MODELYEAR 1 NA NA NA
## ENGINESIZE NA 1.0000000 0.9340105 0.8322250
## CYLINDERS NA 0.9340105 1.0000000 0.7964727
## FUELCONSUMPTION_CITY NA 0.8322250 0.7964727 1.0000000
## FUELCONSUMPTION_HWY NA 0.7787458 0.7245936 0.9657182
## FUELCONSUMPTION_COMB NA 0.8194821 0.7767878 0.9955424
## FUELCONSUMPTION_COMB_MPG NA -0.8085545 -0.7704297 -0.9356126
## CO2EMISSIONS NA 0.8741544 0.8496846 0.8980385
## FUELCONSUMPTION_HWY FUELCONSUMPTION_COMB
## MODELYEAR NA NA
## ENGINESIZE 0.7787458 0.8194821
## CYLINDERS 0.7245936 0.7767878
## FUELCONSUMPTION_CITY 0.9657182 0.9955424
## FUELCONSUMPTION_HWY 1.0000000 0.9858038
## FUELCONSUMPTION_COMB 0.9858038 1.0000000
## FUELCONSUMPTION_COMB_MPG -0.8938086 -0.9279651
## CO2EMISSIONS 0.8617479 0.8921286
## FUELCONSUMPTION_COMB_MPG CO2EMISSIONS
## MODELYEAR NA NA
## ENGINESIZE -0.8085545 0.8741544
## CYLINDERS -0.7704297 0.8496846
## FUELCONSUMPTION_CITY -0.9356126 0.8980385
## FUELCONSUMPTION_HWY -0.8938086 0.8617479
## FUELCONSUMPTION_COMB -0.9279651 0.8921286
## FUELCONSUMPTION_COMB_MPG 1.0000000 -0.9063942
## CO2EMISSIONS -0.9063942 1.0000000
Based on the correlation we choose ENGINESIZE, CYLINDERS, FUELCONSUMPTION_CITY, FUELCONSUMPTION_HWY and FUELCONSUMPTION_COMB as features to explore more and to build the final models.
## CO2 emissions against engine size, with a smoothed trend line on top of
## the raw points.
ggplot(data = df, mapping = aes(x = ENGINESIZE, y = CO2EMISSIONS)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Here we can see that ENGINESIZE has a linear relationship with the CO2Emissions.
## CO2 emissions against number of cylinders (points only, no trend line).
ggplot(data = df, mapping = aes(x = CYLINDERS, y = CO2EMISSIONS)) +
  geom_point()
The cars with more cylinders tend to produce more CO2.
## CO2 emissions against combined fuel consumption, with a smoothed trend.
ggplot(
  data = df,
  mapping = aes(x = FUELCONSUMPTION_COMB, y = CO2EMISSIONS)
) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Here we can see this feature is very useful for the final model.
## CO2 emissions against highway fuel consumption, with a smoothed trend.
ggplot(
  data = df,
  mapping = aes(x = FUELCONSUMPTION_HWY, y = CO2EMISSIONS)
) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
This graph is similar to the graph of Fuelconsumption_COMB and CO2 Emissions.
## CO2 emissions against city fuel consumption, with a smoothed trend.
ggplot(
  data = df,
  mapping = aes(x = FUELCONSUMPTION_CITY, y = CO2EMISSIONS)
) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
FUELCONSUMPTION_CITY is also an important feature for building the final model.
## Random 75% / 25% split of the rows into training and test sets.
## The seed is fixed so the same rows are drawn on every run.
set.seed(123)
## `train_data` is the NUMBER of training rows (75% of the data, rounded down).
train_data <- floor(nrow(df) * 0.75)
## `training` holds the row indices drawn (without replacement) for training.
training <- sample.int(nrow(df), size = train_data)
train <- df[training, ]
## Every row not drawn for training goes to the test set.
test <- df[-training, ]
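According to the calculated correlations, FUELCONSUMPTION_CITY has the strongest correlation with CO2EMISSIONS (0.898) among the selected features. (FUELCONSUMPTION_COMB_MPG is stronger in magnitude at -0.906, but it was not among the selected features.) So, we will use FUELCONSUMPTION_CITY to fit a simple linear regression model.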
## Simple linear regression: CO2EMISSIONS explained by FUELCONSUMPTION_CITY
## alone, fitted on the training rows only.
model1 <- lm(formula = CO2EMISSIONS ~ FUELCONSUMPTION_CITY, data = train)
## Print coefficients, their significance, and the R-squared of the fit.
summary(model1)
##
## Call:
## lm(formula = CO2EMISSIONS ~ FUELCONSUMPTION_CITY, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -82.474 -10.157 1.991 14.578 85.074
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 71.5196 3.3786 21.17 <2e-16 ***
## FUELCONSUMPTION_CITY 13.8664 0.2429 57.09 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28.18 on 798 degrees of freedom
## Multiple R-squared: 0.8033, Adjusted R-squared: 0.8031
## F-statistic: 3259 on 1 and 798 DF, p-value: < 2.2e-16
## Standard diagnostic plots for the fitted linear model.
plot(x = model1)
In linear regression, the null hypothesis is that the coefficient associated with a variable is equal to zero. The alternative hypothesis is that the coefficient is not equal to zero.
The p-value for FUELCONSUMPTION_CITY has 3 stars, which means the coefficient is of very high statistical significance (p < 0.001). The p-value is extremely small. Generally, a p-value below 0.05 is considered statistically significant.
R-squared tells us the proportion of variation in the dependent variable that is explained by the model. Here R-squared is 0.8033, which means that about 80% of the variation in CO2EMISSIONS is explained by FUELCONSUMPTION_CITY.
## Training-set fit: observed values (red points) and the Model1 regression
## line (blue).
## Columns are supplied through `data = train` rather than `train$...` inside
## aes(), which ggplot2 discourages and which produced axis titles such as
## "train$FUELCONSUMPTION_CITY"; axis labels are set explicitly instead.
ggplot(train, aes(x = FUELCONSUMPTION_CITY)) +
  geom_point(aes(y = CO2EMISSIONS), colour = "red") +
  geom_line(aes(y = predict(model1, newdata = train)), colour = "blue") +
  labs(x = "FUELCONSUMPTION_CITY", y = "CO2EMISSIONS")
## Predict CO2 emissions for the held-out test rows with the simple model.
Model1_Pred <- predict(model1, newdata = test)
## Visualizing the test set results
## Red points are the observed test values; the blue line shows the Model1
## predictions, reusing Model1_Pred instead of calling predict() a second time.
ggplot(test, aes(x = FUELCONSUMPTION_CITY)) +
  geom_point(aes(y = CO2EMISSIONS), colour = "red") +
  geom_line(aes(y = Model1_Pred), colour = "blue") +
  labs(x = "FUELCONSUMPTION_CITY", y = "CO2EMISSIONS")
## Min-max accuracy of Model1 on the test set: for every test row take
## min(actual, predicted) / max(actual, predicted) and average the ratios
## (1 would mean every prediction matches exactly).
## "actual" must be the observed response CO2EMISSIONS, because that is what
## Model1_Pred estimates; the predictor FUELCONSUMPTION_CITY was used here
## before, which compared quantities that measure different things.
compare1 <- cbind(actual = test$CO2EMISSIONS, Model1_Pred)
mean(apply(compare1, 1, min) / apply(compare1, 1, max))
## NOTE: the previously recorded result, [1] 0.05104433, was computed against
## FUELCONSUMPTION_CITY and is no longer valid; re-run to get the new value.
## Check for residual mean and distribution
## Residuals are plotted against the predictor named on the x-axis label.
## Before, the x values were CO2EMISSIONS while the label said
## "Fuelconsumption_City", and the y-axis (the residuals) was labelled
## "CO2Emissions".
plot(train$FUELCONSUMPTION_CITY, resid(model1),
     xlab = "Fuelconsumption_City", ylab = "Residuals",
     main = "Residual plot")
## Mean of the training residuals.
mean(model1$residuals)
## [1] -3.783719e-16
The plot shows that the model was a fair fit.
## Multiple linear regression: CO2EMISSIONS explained by engine size, cylinder
## count and the three fuel-consumption measures, fitted on the training rows.
## NOTE(review): the three fuel-consumption predictors are almost perfectly
## correlated with each other in the correlation matrix (0.966 to 0.996), and
## none of them is individually significant in the recorded summary, which
## looks like multicollinearity -- consider keeping only one of them.
Model2 <- lm(formula = CO2EMISSIONS ~ ENGINESIZE + CYLINDERS + FUELCONSUMPTION_CITY + FUELCONSUMPTION_COMB + FUELCONSUMPTION_HWY, data = train)
## Print coefficients, their significance, and the R-squared of the fit.
summary(Model2)
##
## Call:
## lm(formula = CO2EMISSIONS ~ ENGINESIZE + CYLINDERS + FUELCONSUMPTION_CITY +
## FUELCONSUMPTION_COMB + FUELCONSUMPTION_HWY, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -74.541 -9.018 0.248 12.503 72.323
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 65.934 3.882 16.985 < 2e-16 ***
## ENGINESIZE 10.835 1.800 6.018 2.70e-09 ***
## CYLINDERS 7.158 1.368 5.231 2.15e-07 ***
## FUELCONSUMPTION_CITY -11.962 16.351 -0.732 0.465
## FUELCONSUMPTION_COMB 33.870 29.694 1.141 0.254
## FUELCONSUMPTION_HWY -12.790 13.424 -0.953 0.341
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23.6 on 794 degrees of freedom
## Multiple R-squared: 0.8627, Adjusted R-squared: 0.8619
## F-statistic: 997.9 on 5 and 794 DF, p-value: < 2.2e-16
## Standard diagnostic plots for the multiple regression model.
plot(x = Model2)
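Here we can see the R-squared is 0.8627, which is an improvement over Model1 (0.8033). Because R-squared never decreases when predictors are added, the adjusted R-squared is the fairer comparison, and it also improves (from 0.8031 to 0.8619). We can say this model is a better one.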
## Predict CO2 emissions for the held-out test rows with the multiple
## regression model, then plot the predicted values against their index.
Model2_Pred <- predict(object = Model2, newdata = test)
plot(x = Model2_Pred)
As a result, I will accept Model2 to predict the CO2 emission.