Table of contents

1. Understanding the Data
2. Data Exploration
3. Simple-linear Regression Model
4. Multi-linear Regression Model

Importing data and packages

# Load the fuel-consumption CSV into a data frame.
# NOTE(review): this absolute path is specific to the author's machine.
df <- read.csv(file = "/Users/apple/Desktop/FuelConsumption.csv")
library(ggplot2)
library(plyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble  3.1.7     ✔ purrr   0.3.4
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::arrange()   masks plyr::arrange()
## ✖ purrr::compact()   masks plyr::compact()
## ✖ dplyr::count()     masks plyr::count()
## ✖ dplyr::failwith()  masks plyr::failwith()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::id()        masks plyr::id()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ dplyr::mutate()    masks plyr::mutate()
## ✖ dplyr::rename()    masks plyr::rename()
## ✖ dplyr::summarise() masks plyr::summarise()
## ✖ dplyr::summarize() masks plyr::summarize()
library(lubridate)
## 
## Attaching package: 'lubridate'
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(caTools)
library(ggthemes)
library(reshape2)
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(data.table)
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:reshape2':
## 
##     dcast, melt
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(tidyr)
library(corrgram)       
## 
## Attaching package: 'corrgram'
## 
## The following object is masked from 'package:plyr':
## 
##     baseball
library(corrplot)
## corrplot 0.92 loaded
library(formattable)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## 
## The following object is masked from 'package:corrgram':
## 
##     panel.fill
## 
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

Understanding the data

# Show each column's type together with its first few values.
str(df)
## 'data.frame':    1067 obs. of  13 variables:
##  $ MODELYEAR               : int  2014 2014 2014 2014 2014 2014 2014 2014 2014 2014 ...
##  $ MAKE                    : chr  "ACURA" "ACURA" "ACURA" "ACURA" ...
##  $ MODEL                   : chr  "ILX" "ILX" "ILX HYBRID" "MDX 4WD" ...
##  $ VEHICLECLASS            : chr  "COMPACT" "COMPACT" "COMPACT" "SUV - SMALL" ...
##  $ ENGINESIZE              : num  2 2.4 1.5 3.5 3.5 3.5 3.5 3.7 3.7 2.4 ...
##  $ CYLINDERS               : int  4 4 4 6 6 6 6 6 6 4 ...
##  $ TRANSMISSION            : chr  "AS5" "M6" "AV7" "AS6" ...
##  $ FUELTYPE                : chr  "Z" "Z" "Z" "Z" ...
##  $ FUELCONSUMPTION_CITY    : num  9.9 11.2 6 12.7 12.1 11.9 11.8 12.8 13.4 10.6 ...
##  $ FUELCONSUMPTION_HWY     : num  6.7 7.7 5.8 9.1 8.7 7.7 8.1 9 9.5 7.5 ...
##  $ FUELCONSUMPTION_COMB    : num  8.5 9.6 5.9 11.1 10.6 10 10.1 11.1 11.6 9.2 ...
##  $ FUELCONSUMPTION_COMB_MPG: int  33 29 48 25 27 28 28 25 24 31 ...
##  $ CO2EMISSIONS            : int  196 221 136 255 244 230 232 255 267 212 ...
# Per-column summary: quartiles and mean for numeric columns,
# length/class/mode for character columns.
summary(df)
##    MODELYEAR        MAKE              MODEL           VEHICLECLASS      
##  Min.   :2014   Length:1067        Length:1067        Length:1067       
##  1st Qu.:2014   Class :character   Class :character   Class :character  
##  Median :2014   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :2014                                                           
##  3rd Qu.:2014                                                           
##  Max.   :2014                                                           
##    ENGINESIZE      CYLINDERS      TRANSMISSION         FUELTYPE        
##  Min.   :1.000   Min.   : 3.000   Length:1067        Length:1067       
##  1st Qu.:2.000   1st Qu.: 4.000   Class :character   Class :character  
##  Median :3.400   Median : 6.000   Mode  :character   Mode  :character  
##  Mean   :3.346   Mean   : 5.795                                        
##  3rd Qu.:4.300   3rd Qu.: 8.000                                        
##  Max.   :8.400   Max.   :12.000                                        
##  FUELCONSUMPTION_CITY FUELCONSUMPTION_HWY FUELCONSUMPTION_COMB
##  Min.   : 4.60        Min.   : 4.900      Min.   : 4.70       
##  1st Qu.:10.25        1st Qu.: 7.500      1st Qu.: 9.00       
##  Median :12.60        Median : 8.800      Median :10.90       
##  Mean   :13.30        Mean   : 9.475      Mean   :11.58       
##  3rd Qu.:15.55        3rd Qu.:10.850      3rd Qu.:13.35       
##  Max.   :30.20        Max.   :20.500      Max.   :25.80       
##  FUELCONSUMPTION_COMB_MPG  CO2EMISSIONS  
##  Min.   :11.00            Min.   :108.0  
##  1st Qu.:21.00            1st Qu.:207.0  
##  Median :26.00            Median :251.0  
##  Mean   :26.44            Mean   :256.2  
##  3rd Qu.:31.00            3rd Qu.:294.0  
##  Max.   :60.00            Max.   :488.0
# Preview the first ten rows of the data.
head(df, n = 10)

Now we have a general idea about the data: 1,067 observations of 13 variables, comprising one dependent variable (CO2EMISSIONS) and 12 candidate independent variables.

Setting the column names explicitly (these are the same names the file already uses)

# Assign the thirteen column names explicitly, in file order.
names(df) <- c(
  "MODELYEAR", "MAKE", "MODEL", "VEHICLECLASS",
  "ENGINESIZE", "CYLINDERS", "TRANSMISSION", "FUELTYPE",
  "FUELCONSUMPTION_CITY", "FUELCONSUMPTION_HWY", "FUELCONSUMPTION_COMB",
  "FUELCONSUMPTION_COMB_MPG", "CO2EMISSIONS"
)

Data Exploration

The relationships between variables

Finding the correlation between numerical columns

# Flag the numeric columns, then compute their pairwise correlations.
# MODELYEAR is 2014 for every row, so its correlations come out as NA and
# cor() warns that the standard deviation is zero.
Num.cols <- vapply(df, is.numeric, FUN.VALUE = logical(1))
Cor.data <- cor(df[, Num.cols])
## Warning in cor(df[, Num.cols]): the standard deviation is zero
Cor.data
##                          MODELYEAR ENGINESIZE  CYLINDERS FUELCONSUMPTION_CITY
## MODELYEAR                        1         NA         NA                   NA
## ENGINESIZE                      NA  1.0000000  0.9340105            0.8322250
## CYLINDERS                       NA  0.9340105  1.0000000            0.7964727
## FUELCONSUMPTION_CITY            NA  0.8322250  0.7964727            1.0000000
## FUELCONSUMPTION_HWY             NA  0.7787458  0.7245936            0.9657182
## FUELCONSUMPTION_COMB            NA  0.8194821  0.7767878            0.9955424
## FUELCONSUMPTION_COMB_MPG        NA -0.8085545 -0.7704297           -0.9356126
## CO2EMISSIONS                    NA  0.8741544  0.8496846            0.8980385
##                          FUELCONSUMPTION_HWY FUELCONSUMPTION_COMB
## MODELYEAR                                 NA                   NA
## ENGINESIZE                         0.7787458            0.8194821
## CYLINDERS                          0.7245936            0.7767878
## FUELCONSUMPTION_CITY               0.9657182            0.9955424
## FUELCONSUMPTION_HWY                1.0000000            0.9858038
## FUELCONSUMPTION_COMB               0.9858038            1.0000000
## FUELCONSUMPTION_COMB_MPG          -0.8938086           -0.9279651
## CO2EMISSIONS                       0.8617479            0.8921286
##                          FUELCONSUMPTION_COMB_MPG CO2EMISSIONS
## MODELYEAR                                      NA           NA
## ENGINESIZE                             -0.8085545    0.8741544
## CYLINDERS                              -0.7704297    0.8496846
## FUELCONSUMPTION_CITY                   -0.9356126    0.8980385
## FUELCONSUMPTION_HWY                    -0.8938086    0.8617479
## FUELCONSUMPTION_COMB                   -0.9279651    0.8921286
## FUELCONSUMPTION_COMB_MPG                1.0000000   -0.9063942
## CO2EMISSIONS                           -0.9063942    1.0000000

Based on the correlation we choose ENGINESIZE, CYLINDERS, FUELCONSUMPTION_CITY, FUELCONSUMPTION_HWY and FUELCONSUMPTION_COMB as features to explore more and to build the final models.

1. ENGINESIZE

# Scatter plot of CO2 emissions against engine size, with a smoothed trend.
ggplot(data = df, mapping = aes(x = ENGINESIZE, y = CO2EMISSIONS)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Here we can see that ENGINESIZE has a linear relationship with the CO2Emissions.

2. CYLINDERS

# Scatter plot of CO2 emissions against the number of cylinders.
ggplot(data = df, mapping = aes(x = CYLINDERS, y = CO2EMISSIONS)) +
  geom_point()

The cars with more cylinders tend to produce more CO2.

3. FUELCONSUMPTION_COMB

# Scatter plot of CO2 emissions against combined fuel consumption,
# with a smoothed trend.
ggplot(
  data = df,
  mapping = aes(x = FUELCONSUMPTION_COMB, y = CO2EMISSIONS)
) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Here we can see this feature is very useful for the final model.

4. FUELCONSUMPTION_HWY

# Scatter plot of CO2 emissions against highway fuel consumption,
# with a smoothed trend.
ggplot(
  data = df,
  mapping = aes(x = FUELCONSUMPTION_HWY, y = CO2EMISSIONS)
) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

This graph is similar to the graph of Fuelconsumption_COMB and CO2 Emissions.

5. FUELCONSUMPTION_CITY

# Scatter plot of CO2 emissions against city fuel consumption,
# with a smoothed trend.
ggplot(
  data = df,
  mapping = aes(x = FUELCONSUMPTION_CITY, y = CO2EMISSIONS)
) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

FUELCONSUMPTION_CITY is also an important feature for building the final model.

Simple-linear Regression Model

Split the data

# Number of rows that go into the training set (75% of the data, rounded down).
train_data <- floor(nrow(df) * 0.75)

# Fix the seed so the random split is reproducible, then draw the
# training row indices without replacement.
set.seed(123)
training <- sample.int(nrow(df), size = train_data)

# Sampled rows form the training set; all remaining rows form the test set.
train <- df[training, ]
test <- df[-training, ]

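According to the calculated correlations, FUELCONSUMPTION_CITY has the strongest correlation with CO2EMISSIONS (about 0.898) among the features selected above. (FUELCONSUMPTION_COMB_MPG has a slightly larger absolute correlation, -0.906, but it was not among the selected features.) So, we will use FUELCONSUMPTION_CITY to fit a simple linear regression model.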

# Fit a simple linear regression of CO2EMISSIONS on FUELCONSUMPTION_CITY,
# using the training rows only.
model1 <- lm(formula = CO2EMISSIONS ~ FUELCONSUMPTION_CITY, data = train)

# Print the coefficient estimates, their significance and the R-squared.
summary(model1)
## 
## Call:
## lm(formula = CO2EMISSIONS ~ FUELCONSUMPTION_CITY, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -82.474 -10.157   1.991  14.578  85.074 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           71.5196     3.3786   21.17   <2e-16 ***
## FUELCONSUMPTION_CITY  13.8664     0.2429   57.09   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28.18 on 798 degrees of freedom
## Multiple R-squared:  0.8033, Adjusted R-squared:  0.8031 
## F-statistic:  3259 on 1 and 798 DF,  p-value: < 2.2e-16
# Draw the diagnostic plots for the fitted model.
plot(model1)

In linear regression, the null hypothesis is that the coefficient associated with a variable is equal to zero. The alternative hypothesis is that the coefficient is not equal to zero.

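The p-value for FUELCONSUMPTION_CITY is marked with three stars, which means the predictor is highly statistically significant (p < 0.001). The p-value here is extremely small (< 2e-16). Generally, a p-value below 0.05 is considered statistically significant.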

R-squared tells us the proportion of variation in the dependent variable that is explained by the model. Here R-squared is 0.8033, meaning that FUELCONSUMPTION_CITY explains about 80% of the variation in CO2EMISSIONS in the training data.

Visualizing the training set results

# Training set: observed emissions (red points) against the fitted
# regression line (blue). Columns are taken from `data = train` instead of
# `train$...` inside aes(), so the axes are labelled with the plain column
# names rather than "train$FUELCONSUMPTION_CITY".
ggplot(data = train, mapping = aes(x = FUELCONSUMPTION_CITY)) +
  geom_point(aes(y = CO2EMISSIONS), colour = "red") +
  geom_line(aes(y = predict(model1, newdata = train)), colour = "blue")

Predicting the test results

# Predict CO2 emissions for the held-out test rows.
Model1_Pred <- predict(model1, newdata = test)

## Visualizing the test set results
# Observed emissions (red points) against the model's predictions (blue
# line). The predictions stored in Model1_Pred are reused instead of calling
# predict() a second time, and columns are taken from `data = test` so the
# axes are labelled with the plain column names.
ggplot(data = test, mapping = aes(x = FUELCONSUMPTION_CITY)) +
  geom_point(aes(y = CO2EMISSIONS), colour = "red") +
  geom_line(aes(y = Model1_Pred), colour = "blue")

Finding accuracy

# Min/max accuracy: for each test row divide the smaller of (actual,
# predicted) by the larger, then average. Values near 1 mean the predictions
# are close to the observed values.
# The "actual" column must be the response, CO2EMISSIONS. The earlier version
# used FUELCONSUMPTION_CITY (the predictor), which is what produced the
# meaningless value 0.05104433 previously printed here. Re-run this chunk to
# obtain the corrected figure.
compare1 <- cbind(actual = test$CO2EMISSIONS, Model1_Pred)
mean(
  pmin(compare1[, "actual"], compare1[, "Model1_Pred"]) /
    pmax(compare1[, "actual"], compare1[, "Model1_Pred"])
)
## Check for residual mean and distribution
# Residuals are plotted against the predictor, and the axis labels describe
# what is actually drawn. The earlier version put CO2EMISSIONS on the x-axis
# while labelling it "Fuelconsumption_City", and labelled the residual axis
# "CO2Emissions".
plot(train$FUELCONSUMPTION_CITY, resid(model1),
     ylab = "Residuals", xlab = "Fuelconsumption_City",
     main = "Residual plot")
# Dashed reference line at zero: residuals should scatter evenly around it.
abline(h = 0, lty = 2)

mean(model1$residuals)
## [1] -3.783719e-16

The plot shows that the model was a fair fit.

Multi-linear Regression Model

# Fit a multiple linear regression of CO2EMISSIONS on engine size, cylinder
# count and the three fuel-consumption measures, using the training rows.
# NOTE(review): the three FUELCONSUMPTION_* predictors are correlated above
# 0.96 with one another in Cor.data, so their individual coefficients may be
# unstable — consider keeping only one of them.
Model2 <- lm(formula = CO2EMISSIONS ~ ENGINESIZE + CYLINDERS + FUELCONSUMPTION_CITY + FUELCONSUMPTION_COMB + FUELCONSUMPTION_HWY, data = train)

# Print the coefficient estimates, their significance and the R-squared.
summary(Model2)
## 
## Call:
## lm(formula = CO2EMISSIONS ~ ENGINESIZE + CYLINDERS + FUELCONSUMPTION_CITY + 
##     FUELCONSUMPTION_COMB + FUELCONSUMPTION_HWY, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -74.541  -9.018   0.248  12.503  72.323 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            65.934      3.882  16.985  < 2e-16 ***
## ENGINESIZE             10.835      1.800   6.018 2.70e-09 ***
## CYLINDERS               7.158      1.368   5.231 2.15e-07 ***
## FUELCONSUMPTION_CITY  -11.962     16.351  -0.732    0.465    
## FUELCONSUMPTION_COMB   33.870     29.694   1.141    0.254    
## FUELCONSUMPTION_HWY   -12.790     13.424  -0.953    0.341    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.6 on 794 degrees of freedom
## Multiple R-squared:  0.8627, Adjusted R-squared:  0.8619 
## F-statistic: 997.9 on 5 and 794 DF,  p-value: < 2.2e-16
# Draw the diagnostic plots for the fitted model.
plot(Model2)

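Here we can see the R-squared is 0.8627, which is an improvement over model1 (0.8033). Because R-squared never decreases when predictors are added, the adjusted R-squared is the fairer comparison, and it also improves (0.8619 vs 0.8031), so this model fits the training data better. Note, however, that the three fuel-consumption coefficients are individually not significant (p > 0.25) and have large standard errors, which suggests these predictors are highly collinear.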

Predicting the test results

# Predict CO2 emissions for the held-out test rows.
Model2_Pred <- predict(Model2, newdata = test)

# Plot the predictions against the observed test values; points on the dashed
# 45-degree line are predicted exactly. (Plotting Model2_Pred on its own only
# shows the predictions against row index, which says nothing about accuracy.)
plot(test$CO2EMISSIONS, Model2_Pred,
     xlab = "Actual CO2EMISSIONS", ylab = "Predicted CO2EMISSIONS",
     main = "Model2: predicted vs actual (test set)")
abline(a = 0, b = 1, lty = 2)

# Root-mean-square error on the test set, in the units of CO2EMISSIONS.
sqrt(mean((test$CO2EMISSIONS - Model2_Pred)^2))

As a result, I will accept Model2 to predict the CO2 emission.