##1. Understanding the Data ##2. Data Exploration ##3. Simple-linear Regression Model ##4. Multi-linear Regression Model

Importing data and packages

# Load the fuel-consumption dataset into `df`.
# NOTE(review): this is an absolute, machine-specific path, so the script only
# runs where this exact file exists -- consider a project-relative path.
df <- read.csv("/Users/apple/Desktop/FuelConsumption.csv")
library(ggplot2)
library(plyr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyverse)

## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──

## ✔ tibble  3.1.7     ✔ purrr   0.3.4
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::arrange()   masks plyr::arrange()
## ✖ purrr::compact()   masks plyr::compact()
## ✖ dplyr::count()     masks plyr::count()
## ✖ dplyr::failwith()  masks plyr::failwith()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::id()        masks plyr::id()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ dplyr::mutate()    masks plyr::mutate()
## ✖ dplyr::rename()    masks plyr::rename()
## ✖ dplyr::summarise() masks plyr::summarise()
## ✖ dplyr::summarize() masks plyr::summarize()

library(lubridate)

## 
## Attaching package: 'lubridate'
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(caTools)
library(ggthemes)
library(reshape2)

## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths

library(data.table)

## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:reshape2':
## 
##     dcast, melt
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

library(tidyr)
library(corrgram)

## 
## Attaching package: 'corrgram'
## 
## The following object is masked from 'package:plyr':
## 
##     baseball

library(corrplot)

## corrplot 0.92 loaded

library(formattable)
library(caret)

## Loading required package: lattice
## 
## Attaching package: 'lattice'
## 
## The following object is masked from 'package:corrgram':
## 
##     panel.fill
## 
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

Understanding the data

# Print the structure of df: dimensions, column names, types and first values.
str(df)

## 'data.frame':    1067 obs. of  13 variables:
##  $ MODELYEAR               : int  2014 2014 2014 2014 2014 2014 2014 2014 2014 2014 ...
##  $ MAKE                    : chr  "ACURA" "ACURA" "ACURA" "ACURA" ...
##  $ MODEL                   : chr  "ILX" "ILX" "ILX HYBRID" "MDX 4WD" ...
##  $ VEHICLECLASS            : chr  "COMPACT" "COMPACT" "COMPACT" "SUV - SMALL" ...
##  $ ENGINESIZE              : num  2 2.4 1.5 3.5 3.5 3.5 3.5 3.7 3.7 2.4 ...
##  $ CYLINDERS               : int  4 4 4 6 6 6 6 6 6 4 ...
##  $ TRANSMISSION            : chr  "AS5" "M6" "AV7" "AS6" ...
##  $ FUELTYPE                : chr  "Z" "Z" "Z" "Z" ...
##  $ FUELCONSUMPTION_CITY    : num  9.9 11.2 6 12.7 12.1 11.9 11.8 12.8 13.4 10.6 ...
##  $ FUELCONSUMPTION_HWY     : num  6.7 7.7 5.8 9.1 8.7 7.7 8.1 9 9.5 7.5 ...
##  $ FUELCONSUMPTION_COMB    : num  8.5 9.6 5.9 11.1 10.6 10 10.1 11.1 11.6 9.2 ...
##  $ FUELCONSUMPTION_COMB_MPG: int  33 29 48 25 27 28 28 25 24 31 ...
##  $ CO2EMISSIONS            : int  196 221 136 255 244 230 232 255 267 212 ...

# Per-column summary: quartiles and mean for numeric columns,
# length/class/mode for character columns.
summary(df)

##    MODELYEAR        MAKE              MODEL           VEHICLECLASS      
##  Min.   :2014   Length:1067        Length:1067        Length:1067       
##  1st Qu.:2014   Class :character   Class :character   Class :character  
##  Median :2014   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :2014                                                           
##  3rd Qu.:2014                                                           
##  Max.   :2014                                                           
##    ENGINESIZE      CYLINDERS      TRANSMISSION         FUELTYPE        
##  Min.   :1.000   Min.   : 3.000   Length:1067        Length:1067       
##  1st Qu.:2.000   1st Qu.: 4.000   Class :character   Class :character  
##  Median :3.400   Median : 6.000   Mode  :character   Mode  :character  
##  Mean   :3.346   Mean   : 5.795                                        
##  3rd Qu.:4.300   3rd Qu.: 8.000                                        
##  Max.   :8.400   Max.   :12.000                                        
##  FUELCONSUMPTION_CITY FUELCONSUMPTION_HWY FUELCONSUMPTION_COMB
##  Min.   : 4.60        Min.   : 4.900      Min.   : 4.70       
##  1st Qu.:10.25        1st Qu.: 7.500      1st Qu.: 9.00       
##  Median :12.60        Median : 8.800      Median :10.90       
##  Mean   :13.30        Mean   : 9.475      Mean   :11.58       
##  3rd Qu.:15.55        3rd Qu.:10.850      3rd Qu.:13.35       
##  Max.   :30.20        Max.   :20.500      Max.   :25.80       
##  FUELCONSUMPTION_COMB_MPG  CO2EMISSIONS  
##  Min.   :11.00            Min.   :108.0  
##  1st Qu.:21.00            1st Qu.:207.0  
##  Median :26.00            Median :251.0  
##  Mean   :26.44            Mean   :256.2  
##  3rd Qu.:31.00            3rd Qu.:294.0  
##  Max.   :60.00            Max.   :488.0

# Show the first ten rows of the dataset.
head(df, n = 10)

Now we have a general idea about the data. There are 13 variables including one dependent variable (CO2 EMISSIONS) and 12 independent variables.

Changing the name of columns

# Set the column names explicitly, one per column in file order, so that
# later code can rely on these exact names.
colnames(df) <- c(
  "MODELYEAR",
  "MAKE",
  "MODEL",
  "VEHICLECLASS",
  "ENGINESIZE",
  "CYLINDERS",
  "TRANSMISSION",
  "FUELTYPE",
  "FUELCONSUMPTION_CITY",
  "FUELCONSUMPTION_HWY",
  "FUELCONSUMPTION_COMB",
  "FUELCONSUMPTION_COMB_MPG",
  "CO2EMISSIONS"
)

Data Exploration

The relationships between variables

Finding the correlation between numerical columns

# Flag the numeric columns. vapply() always returns a logical vector, whereas
# sapply() changes its return type depending on the input.
Num.cols <- vapply(df, is.numeric, logical(1))
# Pairwise Pearson correlations between the numeric columns.
# MODELYEAR is constant in this data, so cor() warns about a zero standard
# deviation and reports NA for that row and column.
Cor.data <- cor(df[, Num.cols])

## Warning in cor(df[, Num.cols]): the standard deviation is zero

# Print the correlation matrix.
Cor.data

##                          MODELYEAR ENGINESIZE  CYLINDERS FUELCONSUMPTION_CITY
## MODELYEAR                        1         NA         NA                   NA
## ENGINESIZE                      NA  1.0000000  0.9340105            0.8322250
## CYLINDERS                       NA  0.9340105  1.0000000            0.7964727
## FUELCONSUMPTION_CITY            NA  0.8322250  0.7964727            1.0000000
## FUELCONSUMPTION_HWY             NA  0.7787458  0.7245936            0.9657182
## FUELCONSUMPTION_COMB            NA  0.8194821  0.7767878            0.9955424
## FUELCONSUMPTION_COMB_MPG        NA -0.8085545 -0.7704297           -0.9356126
## CO2EMISSIONS                    NA  0.8741544  0.8496846            0.8980385
##                          FUELCONSUMPTION_HWY FUELCONSUMPTION_COMB
## MODELYEAR                                 NA                   NA
## ENGINESIZE                         0.7787458            0.8194821
## CYLINDERS                          0.7245936            0.7767878
## FUELCONSUMPTION_CITY               0.9657182            0.9955424
## FUELCONSUMPTION_HWY                1.0000000            0.9858038
## FUELCONSUMPTION_COMB               0.9858038            1.0000000
## FUELCONSUMPTION_COMB_MPG          -0.8938086           -0.9279651
## CO2EMISSIONS                       0.8617479            0.8921286
##                          FUELCONSUMPTION_COMB_MPG CO2EMISSIONS
## MODELYEAR                                      NA           NA
## ENGINESIZE                             -0.8085545    0.8741544
## CYLINDERS                              -0.7704297    0.8496846
## FUELCONSUMPTION_CITY                   -0.9356126    0.8980385
## FUELCONSUMPTION_HWY                    -0.8938086    0.8617479
## FUELCONSUMPTION_COMB                   -0.9279651    0.8921286
## FUELCONSUMPTION_COMB_MPG                1.0000000   -0.9063942
## CO2EMISSIONS                           -0.9063942    1.0000000

Based on the correlation we choose ENGINESIZE, CYLINDERS, FUELCONSUMPTION_CITY, FUELCONSUMPTION_HWY and FUELCONSUMPTION_COMB as features to explore more and to build the final models.

1. ENGINESIZE

# CO2 emissions against engine size, with ggplot2's default smoother overlaid.
ggplot(data = df, mapping = aes(x = ENGINESIZE, y = CO2EMISSIONS)) +
  geom_point() +
  geom_smooth()

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Here we can see that ENGINESIZE has a linear relationship with the CO2Emissions.

2. CYLINDERS

# CO2 emissions against the number of cylinders (points only, no smoother).
ggplot(data = df, mapping = aes(x = CYLINDERS, y = CO2EMISSIONS)) +
  geom_point()

The cars with more cylinders tend to produce more CO2.

3. FUELCONSUMPTION_COMB

# CO2 emissions against combined fuel consumption, with the default smoother.
ggplot(
  data = df,
  mapping = aes(x = FUELCONSUMPTION_COMB, y = CO2EMISSIONS)
) +
  geom_point() +
  geom_smooth()

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Here we can see this feature is very useful for the final model.

4. FUELCONSUMPTION_HWY

# CO2 emissions against highway fuel consumption, with the default smoother.
ggplot(
  data = df,
  mapping = aes(x = FUELCONSUMPTION_HWY, y = CO2EMISSIONS)
) +
  geom_point() +
  geom_smooth()

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

This graph is similar to the graph of Fuelconsumption_COMB and CO2 Emissions.

5. FUELCONSUMPTION_CITY

# CO2 emissions against city fuel consumption, with the default smoother.
ggplot(
  data = df,
  mapping = aes(x = FUELCONSUMPTION_CITY, y = CO2EMISSIONS)
) +
  geom_point() +
  geom_smooth()

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

FUELCONSUMPTION_CITY is also an important feature for building the final model.

Simple-linear Regression Model

Split the data

# 75/25 train/test split.
# train_data holds the NUMBER of training rows, not the rows themselves.
train_data <- floor(nrow(df) * 0.75)

# Fix the RNG seed so the same rows are drawn on every run.
set.seed(123)
# Row indices selected for training, drawn without replacement.
training <- sample(x = seq_len(nrow(df)), size = train_data)

# Selected rows form the training set; all remaining rows form the test set.
train <- df[training, ]
test <- df[-training, ]

According to the calculated correlations, FUELCONSUMPTION_CITY has the strongest correlation with CO2EMISSIONS (0.898) among the selected features. So, we will use this feature to fit a simple linear regression model.

# Simple linear regression of CO2 emissions on city fuel consumption,
# fitted on the training rows only.
model1 <- lm(
  formula = CO2EMISSIONS ~ FUELCONSUMPTION_CITY,
  data = train
)

# Coefficient table, residual summary and R-squared of the fit.
summary(model1)

## 
## Call:
## lm(formula = CO2EMISSIONS ~ FUELCONSUMPTION_CITY, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -82.474 -10.157   1.991  14.578  85.074 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           71.5196     3.3786   21.17   <2e-16 ***
## FUELCONSUMPTION_CITY  13.8664     0.2429   57.09   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28.18 on 798 degrees of freedom
## Multiple R-squared:  0.8033, Adjusted R-squared:  0.8031 
## F-statistic:  3259 on 1 and 798 DF,  p-value: < 2.2e-16

# Draw the standard diagnostic plots for the fitted lm object.
plot(model1)

In linear regression, the null hypothesis is that the coefficient associated with each variable is equal to zero. The alternative hypothesis is that the coefficient is not equal to zero.

The p-value is marked with three stars, which means the predictor is highly statistically significant: the p-value is extremely small (< 2e-16). Generally, a p-value below 0.05 is considered significant.

R-squared is the proportion of the variation in the dependent variable that is explained by the model. Here R-squared is 0.8033, so FUELCONSUMPTION_CITY explains about 80% of the variation in CO2EMISSIONS in the training data.

Visualizing the training set results

# Training-set scatter (red) with the fitted line from model1 (blue).
# Columns are mapped by name from `train` instead of via `train$...` inside
# aes(), so the axes are labelled with the plain column names.
ggplot(train, aes(x = FUELCONSUMPTION_CITY, y = CO2EMISSIONS)) +
  geom_point(colour = "red") +
  geom_line(aes(y = predict(model1, newdata = train)), colour = "blue")

Predicting the test results

# Predicted CO2 emissions for the held-out test rows.
Model1_Pred <- predict(model1, newdata = test)

## Visualizing the test set results
# Test-set scatter (red) with the model1 predictions (blue). The stored
# predictions are reused rather than recomputed, and columns are mapped by
# name from `test` so the axes carry the plain column names.
ggplot(test, aes(x = FUELCONSUMPTION_CITY, y = CO2EMISSIONS)) +
  geom_point(colour = "red") +
  geom_line(aes(y = Model1_Pred), colour = "blue")

Finding accuracy

# Min-max accuracy: per-row min(actual, predicted) / max(actual, predicted),
# averaged over the test set (1 = perfect agreement).
# The actual values must be the response (CO2EMISSIONS), the quantity that
# model1 predicts -- not the predictor column.
compare1 <- cbind(actual = test$CO2EMISSIONS, Model1_Pred)
mean(apply(compare1, 1, min) / apply(compare1, 1, max))

## (re-run to regenerate this output; the 0.05104433 shown previously came from
## comparing FUELCONSUMPTION_CITY, not CO2EMISSIONS, against the predictions)

## Check for residual mean and distribution
# Residuals of model1 plotted against its predictor. The x data now matches
# the x-axis label, and the y axis is labelled as residuals, which is what
# resid(model1) returns.
plot(train$FUELCONSUMPTION_CITY, resid(model1),
     xlab = "Fuelconsumption_City", ylab = "Residuals",
     main = "Residual plot")

# Mean of the training residuals.
mean(residuals(model1))

## [1] -3.783719e-16

The plot shows that the model was a fair fit.

Multi-linear Regression Model

# Multiple linear regression of CO2 emissions on engine size, cylinder count
# and the three fuel-consumption measures, fitted on the training rows.
# NOTE(review): the three fuel-consumption columns are almost perfectly
# correlated with each other (see Cor.data) -- check for multicollinearity.
Model2 <- lm(
  formula = CO2EMISSIONS ~ ENGINESIZE + CYLINDERS + FUELCONSUMPTION_CITY +
    FUELCONSUMPTION_COMB + FUELCONSUMPTION_HWY,
  data = train
)

# Coefficient table, residual summary and R-squared of the fit.
summary(Model2)

## 
## Call:
## lm(formula = CO2EMISSIONS ~ ENGINESIZE + CYLINDERS + FUELCONSUMPTION_CITY + 
##     FUELCONSUMPTION_COMB + FUELCONSUMPTION_HWY, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -74.541  -9.018   0.248  12.503  72.323 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            65.934      3.882  16.985  < 2e-16 ***
## ENGINESIZE             10.835      1.800   6.018 2.70e-09 ***
## CYLINDERS               7.158      1.368   5.231 2.15e-07 ***
## FUELCONSUMPTION_CITY  -11.962     16.351  -0.732    0.465    
## FUELCONSUMPTION_COMB   33.870     29.694   1.141    0.254    
## FUELCONSUMPTION_HWY   -12.790     13.424  -0.953    0.341    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.6 on 794 degrees of freedom
## Multiple R-squared:  0.8627, Adjusted R-squared:  0.8619 
## F-statistic: 997.9 on 5 and 794 DF,  p-value: < 2.2e-16

# Draw the standard diagnostic plots for the fitted lm object.
plot(Model2)

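Here we can see the R-squared is 0.8627 (adjusted 0.8619), which is an improvement over model1 (0.8033). By this measure Model2 is the better model, although the three fuel-consumption coefficients are not individually significant because these predictors are highly correlated with each other.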

Predicting the test results

# Predicted CO2 emissions for the held-out test rows from the multiple
# regression model.
Model2_Pred <- predict(object = Model2, newdata = test)

# Index plot of the predictions: one point per test row, in row order.
plot(Model2_Pred)

As a result, I will accept Model2 to predict the CO2 emission.

Final Project_MBA6636

Sikun_Ma

2022-08-06

Table of contents

Importing data and packages

Understanding the data

Changing the name of columns

Data Exploration

The relationships between variables

Finding the correlation between numerical columns

1. ENGINESIZE

2. CYLINDERS

3. FUELCONSUMPTION_COMB

4. FUELCONSUMPTION_HWY

5. FUELCONSUMPTION_CITY

Simple-linear Regression Model

Split the data

Visualizing the training set results

Predicting the test results

Finding accuracy

Multi-linear Regression Model

Predicting the test results