Gasoline Project

Install Packages

# Install only the packages that are not already available, so that re-running
# the script does not download and reinstall everything each time.
requiredPackages <- c("caret", "RSADBE", "rlang")
missingPackages <- requiredPackages[
  !vapply(requiredPackages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missingPackages) > 0) {
  install.packages(missingPackages)
}

Load Packages

library(RSADBE)
library(caret)
library(rlang)

Importing Gasoline Dataset From RSADBE Package

# Load the Gasoline data set (from the RSADBE package) into the workspace,
# then keep a working copy under a more descriptive name.
data("Gasoline")
gasolineData <- Gasoline

Exploring Dataset

# Number of rows and columns in the data set.
dim(gasolineData)

## [1] 25 12

# Peek at the first row to see the raw column names and value types.
head(gasolineData, n = 1L)

##           y  x1  x2  x3     x4     x5 x6 x7    x8   x9  x10 x11
## Apollo 18.9 350 165 260 0.8889 0.7191  4  3 200.3 69.9 3910   A

# Per-column summary statistics; this also reveals missing values (NA's)
# and the level counts of factor columns.
summary(gasolineData)

##        y               x1              x2              x3       
##  Min.   :11.20   Min.   : 85.3   Min.   : 70.0   Min.   : 81.0  
##  1st Qu.:16.50   1st Qu.:225.0   1st Qu.:105.0   1st Qu.:164.0  
##  Median :19.70   Median :318.0   Median :140.0   Median :231.5  
##  Mean   :20.62   Mean   :287.6   Mean   :137.7   Mean   :215.0  
##  3rd Qu.:22.12   3rd Qu.:351.0   3rd Qu.:165.0   3rd Qu.:256.2  
##  Max.   :36.50   Max.   :500.0   Max.   :223.0   Max.   :366.0  
##                                                  NA's   :1      
##        x4               x5               x6            x7      
##  Min.   :0.8837   Min.   :0.7101   Min.   :1.0   Min.   :3.00  
##  1st Qu.:0.8889   1st Qu.:0.7319   1st Qu.:2.0   1st Qu.:3.00  
##  Median :0.8913   Median :0.7500   Median :2.0   Median :3.00  
##  Mean   :0.8920   Mean   :0.7523   Mean   :2.6   Mean   :3.36  
##  3rd Qu.:0.8947   3rd Qu.:0.7630   3rd Qu.:4.0   3rd Qu.:3.00  
##  Max.   :0.9000   Max.   :0.8113   Max.   :4.0   Max.   :5.00  
##                                                                
##        x8              x9             x10       x11   
##  Min.   :155.7   Min.   :61.80   Min.   :1905   A:18  
##  1st Qu.:171.5   1st Qu.:65.40   1st Qu.:3020   M: 7  
##  Median :195.4   Median :72.20   Median :3850         
##  Mean   :191.8   Mean   :71.24   Mean   :3612         
##  3rd Qu.:200.3   3rd Qu.:76.30   3rd Qu.:4215         
##  Max.   :231.0   Max.   :79.80   Max.   :5430         
##

# List the current (raw) column names.
names(gasolineData)

##  [1] "y"   "x1"  "x2"  "x3"  "x4"  "x5"  "x6"  "x7"  "x8"  "x9"  "x10"
## [12] "x11"

As you can see, there are 25 rows and 12 columns in our dataset, but the columns don’t have descriptive names. Let’s replace the column names with descriptive ones.

Rename Dataframe Column Names

# Replace the terse y / x1..x11 column names with descriptive ones.
# The new names are assigned positionally, in the existing column order.
names(gasolineData) <- c(
  "Miles Per Gallon",              # y
  "Displacement",                  # x1
  "Horsepower",                    # x2
  "Torque",                        # x3
  "Compression Ratio",             # x4
  "Rear Axle Ratio",               # x5
  "Carburetor",                    # x6
  "Number Of Transmission Speeds", # x7
  "Overall Length",                # x8
  "Width Inches",                  # x9
  "Weight Pounds",                 # x10
  "TransmissionType"               # x11
)

# Confirm that the column names were replaced.
names(gasolineData)

##  [1] "Miles Per Gallon"              "Displacement"                 
##  [3] "Horsepower"                    "Torque"                       
##  [5] "Compression Ratio"             "Rear Axle Ratio"              
##  [7] "Carburetor"                    "Number Of Transmission Speeds"
##  [9] "Overall Length"                "Width Inches"                 
## [11] "Weight Pounds"                 "TransmissionType"

Now our dataset looks more informative.

Split Dataset

# Fix the RNG seed so the random train/test partition (and therefore every
# result derived from it) is reproducible from run to run.
set.seed(123)

# Sample 70% of the rows for training, partitioning on TransmissionType.
# list = FALSE makes createDataPartition return the row indices as a matrix
# rather than a list.
splitData <- createDataPartition(
  gasolineData$TransmissionType,
  p = 0.70,
  list = FALSE
)

Training Dataset

# Rows selected by the partition form the training set.
trainingData <- gasolineData[splitData, ]

Testing Dataset

# Negative indexing drops the partition rows; what remains is the test set.
testingData <- gasolineData[-splitData, ]

Building Logistic Regression Model

# Fit a logistic regression of TransmissionType on all remaining columns.
# The response is referenced by its bare column name so that it is resolved
# through `data`, instead of hard-wiring the formula to the trainingData object.
# Rows containing missing values are dropped by glm() during fitting.
# NOTE(review): with 11 predictors and fewer than 20 training rows the classes
# separate perfectly (hence the "fitted probabilities numerically 0 or 1"
# warning), so the coefficient estimates are not reliable.
logisticsModel <- glm(
  TransmissionType ~ .,
  family = "binomial",
  data = trainingData
)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

# Inspect the fitted model: deviance residuals, coefficient estimates with
# their standard errors, and null/residual deviance.
summary(logisticsModel)

## 
## Call:
## glm(formula = trainingData$TransmissionType ~ ., family = "binomial", 
##     data = trainingData)
## 
## Deviance Residuals: 
##         Nova       Monarch  Jenson Conv.       Skyhawk      Scirocco  
##   -2.565e-06    -2.431e-06    -5.158e-06    -3.055e-06     4.616e-06  
## Corolla SR-5        Camaro      Capri II       Granada      Eldorado  
##    2.110e-08    -5.799e-06     7.136e-06    -7.939e-06    -2.753e-06  
##     Starfire       Cordoba   Corolla E-S       Mark IV     Celica GT  
##   -5.431e-06    -5.084e-06     6.221e-06    -2.990e-06     2.110e-08  
##       Cougar      Corvette  
##   -2.110e-08    -5.256e-07  
## 
## Coefficients:
##                                   Estimate Std. Error z value Pr(>|z|)
## (Intercept)                      9.268e+02  7.001e+07       0        1
## `Miles Per Gallon`               3.659e-01  5.313e+04       0        1
## Displacement                    -3.405e-01  2.270e+04       0        1
## Horsepower                      -3.212e-01  2.936e+04       0        1
## Torque                           5.329e-01  4.652e+04       0        1
## `Compression Ratio`             -9.742e+02  7.961e+07       0        1
## `Rear Axle Ratio`               -3.144e+02  1.423e+07       0        1
## Carburetor                       1.917e+00  1.582e+05       0        1
## `Number Of Transmission Speeds`  4.164e+01  8.513e+05       0        1
## `Overall Length`                -2.192e+00  5.352e+04       0        1
## `Width Inches`                   5.391e+00  8.802e+04       0        1
## `Weight Pounds`                  2.103e-02  1.581e+03       0        1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2.0597e+01  on 16  degrees of freedom
## Residual deviance: 3.2815e-10  on  5  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 24
## 
## Number of Fisher Scoring iterations: 24

Predict On Testing Dataset

# Predictions for the held-out rows. type = "response" returns values on the
# probability scale rather than the link (log-odds) scale.
pred <- predict(
  logisticsModel,
  newdata = testingData,
  type = "response"
)

Compute Accuracy

Now we will see how accurate our model is.

# Convert predicted probabilities to class labels with a 0.5 cut-off.
# NOTE: y_pred_num holds the character labels "M"/"A" despite its name.
y_pred_num <- ifelse(pred > 0.5, "M", "A")
y_pred <- factor(y_pred_num, levels = c("A", "M"))

# Accuracy = share of test rows whose predicted label equals the actual one.
# A test row with a missing predictor gets an NA prediction; na.rm = TRUE
# stops a single such row from turning the whole accuracy into NA.
y_act <- testingData$TransmissionType
mean(y_pred == y_act, na.rm = TRUE)

## [1] 0.7142857

Conclusion

So, as you can see, our model is 71.4% accurate on the testing dataset (5 of the 7 held-out cars are classified correctly).