Sample code to examine data for Project 2

This is a partial data analysis of the vehicles.csv data. This came from EPA vehicle data over the period 1984 to 2017, including vehicle design information (cylinders, displacement, fuel type, etc.) and performance (mpg, CO2 emitted).

Sudip has processed the data into another csv file - mergedData3.csv

Load the libraries

#install.packages("party")
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
# Get source data from a file.
# NOTE(review): setwd() is kept so that any later relative paths keep
# resolving exactly as before.
setwd("~/GWU Intro to Data Science/Project 2/")
# Convert strings to factors explicitly: the ctree() and randomForest() fits
# further down use text columns (make, drive, fuelType, ...) as predictors and
# need them as factors. Since R 4.0.0 read.csv() no longer does this by
# default; on older R this argument matches the default, so nothing changes.
autodata <- read.csv("mergedData3.csv", stringsAsFactors = TRUE)

Use a subset of the data columns, and look at the first few rows of the data.

# Names of the columns carried forward into the analysis
keep_cols <- c(
  "co2TailpipeGpm", "comb08", "discontinuedNextYear",
  "cylinders", "displ", "drive", "fuelType", "make", "model",
  "fuelType1", "VClass", "year", "DiffFromMean"
)

# Keep only those columns, then preview the first rows
clean_autodata <- autodata[keep_cols]

head(clean_autodata)
##   co2TailpipeGpm comb08 discontinuedNextYear cylinders displ
## 1       423.1905     21                 TRUE         4   2.0
## 2       807.9091     11                FALSE        12   4.9
## 3       329.1481     27                FALSE         4   2.2
## 4       807.9091     11                FALSE         8   5.2
## 5       467.7368     19                FALSE         4   2.2
## 6       403.9545     22                 TRUE         4   1.8
##                        drive fuelType       make               model
## 1           Rear-Wheel Drive  Regular Alfa Romeo  Spider Veloce 2000
## 2           Rear-Wheel Drive  Regular    Ferrari          Testarossa
## 3          Front-Wheel Drive  Regular      Dodge             Charger
## 4           Rear-Wheel Drive  Regular      Dodge B150/B250 Wagon 2WD
## 5 4-Wheel or All-Wheel Drive  Premium     Subaru    Legacy AWD Turbo
## 6          Front-Wheel Drive  Regular     Subaru              Loyale
##          fuelType1          VClass year DiffFromMean
## 1 Regular Gasoline     Two Seaters 1985     2.767296
## 2 Regular Gasoline     Two Seaters 1985   -51.320755
## 3 Regular Gasoline Subcompact Cars 1985    12.510653
## 4 Regular Gasoline            Vans 1985   -28.665786
## 5 Premium Gasoline    Compact Cars 1993   -12.627638
## 6 Regular Gasoline    Compact Cars 1993     7.930565

Now, start getting the data ready for analysis.

Start subsetting a series of years to do some data analysis over time periods

# Look at a handful of individual years, one data frame per year
# (original note: only 1984, 1987, 1988 don't have NA's)

# Rows of clean_autodata whose `year` equals `yr`; like any subset() call,
# rows where the comparison is NA are dropped.
rows_for_year <- function(yr) {
  subset(clean_autodata, year == yr)
}

year_1984 <- rows_for_year(1984)
year_1986 <- rows_for_year(1986)
year_1988 <- rows_for_year(1988)
year_1996 <- rows_for_year(1996)
year_2006 <- rows_for_year(2006)
year_2016 <- rows_for_year(2016)

# Track how the mean of comb08 moves across the sampled years
mean(year_1984[["comb08"]])
## [1] 19.88187
mean(year_1986[["comb08"]])
## [1] 19.55041
mean(year_1996[["comb08"]])
## [1] 19.58473
mean(year_2006[["comb08"]])
## [1] 18.95924
mean(year_2016[["comb08"]])
## [1] 25.09984
# Multi-year slices: everything before 1990, and everything before 2017
The80s <- subset(clean_autodata, subset = year < 1990)
Prior2017 <- subset(clean_autodata, subset = year < 2017)

Now do some density plots to show combined mpg over time. Note that once EVs start showing up, the plots start showing much higher mpg data points.

# Kernel density estimate of comb08 for each sampled year, drawn one plot per
# year. Each density() call is written out in full because the default plot
# title is taken from that call.
mpg_density_1984 <- density(year_1984$comb08)
plot(mpg_density_1984)

mpg_density_1986 <- density(year_1986$comb08)
plot(mpg_density_1986)

mpg_density_1996 <- density(year_1996$comb08)
plot(mpg_density_1996)

mpg_density_2006 <- density(year_2006$comb08)
plot(mpg_density_2006)

mpg_density_2016 <- density(year_2016$comb08)
plot(mpg_density_2016)

Now run a conditional inference tree.

# Build a data frame without rows whose DiffFromMean is blank; rows where it
# is NA are dropped as well (subset() discards NA conditions either way).
Good_data <- subset(
  clean_autodata,
  subset = !is.na(DiffFromMean) & DiffFromMean != ""
)

# Fit a conditional inference tree on the pre-1990 rows, modelling
# discontinuedNextYear from co2TailpipeGpm, comb08 and make
output_tree <- ctree(
  formula = discontinuedNextYear ~ co2TailpipeGpm + comb08 + make,
  data = The80s
)

# Draw the fitted tree using the compact ("simple") node style
plot(output_tree, type = "simple")

Since the lists of make names in the splits are long, the plot is difficult to read.
Look at the printed output instead.

# Print the fitted conditional inference tree as text: each split with its
# criterion and statistic, and the weights of each terminal node
output_tree
## 
##   Conditional inference tree with 16 terminal nodes
## 
## Response:  discontinuedNextYear 
## Inputs:  co2TailpipeGpm, comb08, make 
## Number of observations:  8405 
## 
## 1) make == {Acura, Alfa Romeo, ASC Incorporated, Aurora Cars Ltd, Autokraft Limited, Avanti Motor Corporation, Bertone, Bill Dovell Motor Car Company, Bitter Gmbh and Co. Kg, BMW, Buick, Cadillac, CCC Engineering, Chevrolet, Chrysler, Dacia, Daihatsu, Dodge, E. P. Dutton, Inc., Eagle, Environmental Rsch and Devp Corp, Evans Automobiles, Excalibur Autos, Ferrari, Ford, General Motors, Geo, GMC, Grumman Allied Industries, Grumman Olson, Honda, Hyundai, Import Foreign Auto Sales Inc, Isuzu, Jaguar, JBA Motorcars, Inc., Laforza Automobile Inc, Lambda Control Systems, Lamborghini, Land Rover, Lincoln, London Coach Co Inc, Lotus, Maserati, Mazda, Mcevoy Motors, Mercury, Merkur, Mitsubishi, Nissan, Oldsmobile, Pininfarina, Plymouth, Pontiac, Red Shift Ltd., Rolls-Royce, Ruf Automobile Gmbh, S and S Coach Company  E.p. Dutton, Saab, Saleen, Sterling, Subaru, Superior Coaches Div E.p. Dutton, Toyota, Vixen Motor Company, Volga Associated Automobile, Volkswagen}; criterion = 1, statistic = 1220.247
##   2) make == {Alfa Romeo, ASC Incorporated, Aurora Cars Ltd, Autokraft Limited, Avanti Motor Corporation, Bill Dovell Motor Car Company, Bitter Gmbh and Co. Kg, BMW, Cadillac, CCC Engineering, Chevrolet, Chrysler, Dacia, Daihatsu, E. P. Dutton, Inc., Eagle, Environmental Rsch and Devp Corp, Evans Automobiles, Excalibur Autos, Ferrari, General Motors, Geo, GMC, Grumman Allied Industries, Grumman Olson, Hyundai, Import Foreign Auto Sales Inc, Jaguar, JBA Motorcars, Inc., Lambda Control Systems, Lamborghini, London Coach Co Inc, Lotus, Maserati, Mazda, Mcevoy Motors, Merkur, Nissan, Oldsmobile, Pininfarina, Pontiac, Red Shift Ltd., Rolls-Royce, Ruf Automobile Gmbh, S and S Coach Company  E.p. Dutton, Sterling, Subaru, Superior Coaches Div E.p. Dutton, Vixen Motor Company, Volga Associated Automobile, Volkswagen}; criterion = 1, statistic = 560.315
##     3) make == {ASC Incorporated, Aurora Cars Ltd, Avanti Motor Corporation, Bitter Gmbh and Co. Kg, Dacia, E. P. Dutton, Inc., Eagle, Environmental Rsch and Devp Corp, Evans Automobiles, Excalibur Autos, General Motors, Grumman Allied Industries, Import Foreign Auto Sales Inc, JBA Motorcars, Inc., Lambda Control Systems, London Coach Co Inc, Lotus, Mazda, Mcevoy Motors, S and S Coach Company  E.p. Dutton, Superior Coaches Div E.p. Dutton, Vixen Motor Company, Volga Associated Automobile}; criterion = 1, statistic = 144.054
##       4) co2TailpipeGpm <= 522.7647; criterion = 0.985, statistic = 39.779
##         5)*  weights = 162 
##       4) co2TailpipeGpm > 522.7647
##         6)*  weights = 23 
##     3) make == {Alfa Romeo, Autokraft Limited, Bill Dovell Motor Car Company, BMW, Cadillac, CCC Engineering, Chevrolet, Chrysler, Daihatsu, Ferrari, Geo, GMC, Grumman Olson, Hyundai, Jaguar, Lamborghini, Maserati, Merkur, Nissan, Oldsmobile, Pininfarina, Pontiac, Red Shift Ltd., Rolls-Royce, Ruf Automobile Gmbh, Sterling, Subaru, Volkswagen}
##       7) co2TailpipeGpm <= 592.4667; criterion = 1, statistic = 46.932
##         8) make == {Alfa Romeo, Autokraft Limited, Bill Dovell Motor Car Company, Cadillac, CCC Engineering, Chrysler, Geo, Grumman Olson, Maserati, Merkur, Oldsmobile, Pininfarina, Red Shift Ltd., Sterling, Volkswagen}; criterion = 0.999, statistic = 52.188
##           9)*  weights = 754 
##         8) make == {BMW, Chevrolet, Daihatsu, GMC, Hyundai, Jaguar, Nissan, Pontiac, Subaru}
##           10) co2TailpipeGpm <= 509; criterion = 0.999, statistic = 13.753
##             11) make == {BMW, Daihatsu, Hyundai, Nissan, Pontiac, Subaru}; criterion = 0.999, statistic = 28.143
##               12)*  weights = 869 
##             11) make == {Chevrolet, GMC, Jaguar}
##               13) comb08 <= 25; criterion = 0.984, statistic = 7.782
##                 14)*  weights = 643 
##               13) comb08 > 25
##                 15)*  weights = 83 
##           10) co2TailpipeGpm > 509
##             16)*  weights = 773 
##       7) co2TailpipeGpm > 592.4667
##         17) comb08 <= 9; criterion = 0.978, statistic = 11.575
##           18)*  weights = 46 
##         17) comb08 > 9
##           19)*  weights = 622 
##   2) make == {Acura, Bertone, Buick, Dodge, Ford, Honda, Isuzu, Laforza Automobile Inc, Land Rover, Lincoln, Mercury, Mitsubishi, Plymouth, Saab, Saleen, Toyota}
##     20) make == {Acura, Dodge, Ford, Honda, Isuzu, Laforza Automobile Inc, Land Rover, Lincoln, Saab, Saleen}; criterion = 1, statistic = 63.085
##       21) co2TailpipeGpm <= 522.7647; criterion = 0.99, statistic = 21.446
##         22) make == {Acura, Ford, Honda, Isuzu, Lincoln, Saab}; criterion = 1, statistic = 38.115
##           23) make == {Ford, Isuzu}; criterion = 0.952, statistic = 13.915
##             24)*  weights = 579 
##           23) make == {Acura, Honda, Lincoln, Saab}
##             25)*  weights = 235 
##         22) make == {Dodge}
##           26)*  weights = 487 
##       21) co2TailpipeGpm > 522.7647
##         27)*  weights = 871 
##     20) make == {Bertone, Buick, Mercury, Mitsubishi, Plymouth, Toyota}
##       28)*  weights = 1388 
## 1) make == {AM General, American Motors Corporation, Aston Martin, Audi, CX Automotive, Jeep, Kenyon Corporation Of America, Mercedes-Benz, Panther Car Company Limited, Peugeot, Porsche, Renault, Suzuki, Texas Coach Company, TVR Engineering Ltd, Volvo, Yugo}
##   29) make == {AM General, American Motors Corporation, CX Automotive, Kenyon Corporation Of America, Panther Car Company Limited, Texas Coach Company, TVR Engineering Ltd, Yugo}; criterion = 1, statistic = 63.686
##     30)*  weights = 63 
##   29) make == {Aston Martin, Audi, Jeep, Mercedes-Benz, Peugeot, Porsche, Renault, Suzuki, Volvo}
##     31)*  weights = 807
# Strip chart of DiffFromMean split by discontinuedNextYear, to see how the
# values spread across the two groups (discontinued or not).
# Points are jittered and drawn vertically, one colour per group.
stripchart(
  DiffFromMean ~ discontinuedNextYear,
  data = Good_data,
  method = "jitter",
  vertical = TRUE,
  pch = 16,
  col = c("orange", "red"),
  main = "Stripchart comparing Make/Model Discontinuity",
  xlab = "Auto Discontinued?",
  ylab = "MPG delta from Class Average"
)

Now run random forest and look at output.

# Fit a random forest classifier for discontinuedNextYear
#install.packages("randomForest")
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.

# Fix the RNG seed so the forest is reproducible from run to run
set.seed(415)

# 100 trees on the pre-2017 rows; rows with missing values are dropped and
# variable importance is recorded for the plot below
fit_discNextYear <- randomForest(
  as.factor(discontinuedNextYear) ~
    co2TailpipeGpm + comb08 + cylinders + displ + drive +
    fuelType + fuelType1 + VClass + year,
  data = Prior2017,
  na.action = na.omit,
  importance = TRUE,
  ntree = 100
)

# Plot the variable importance measures of the fitted forest
varImpPlot(fit_discNextYear)