This is a partial data analysis of the vehicles.csv data. This came from EPA vehicle data over the period 1984 to 2017, including vehicle design information (cylinders, displacement, fuel type, etc.) and performance (mpg, CO2 emitted).
Sudip has processed the data into another CSV file, mergedData3.csv.
Load the libraries
#install.packages("party")
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
# Get source data from a file.
# The full path is handed to read.csv() instead of calling setwd(), so loading
# the data no longer changes the session's working directory.
autodata <- read.csv(
  file.path("~/GWU Intro to Data Science/Project 2", "mergedData3.csv")
)
Use a subset of the data columns, and look at the first few rows.
# Keep only the columns used in the rest of the analysis, then show the
# first six rows of the reduced data frame.
clean_autodata <- autodata[
  ,
  c(
    "co2TailpipeGpm", "comb08", "discontinuedNextYear",
    "cylinders", "displ", "drive", "fuelType", "make", "model",
    "fuelType1", "VClass", "year", "DiffFromMean"
  ),
  drop = FALSE
]
head(clean_autodata, n = 6L)
## co2TailpipeGpm comb08 discontinuedNextYear cylinders displ
## 1 423.1905 21 TRUE 4 2.0
## 2 807.9091 11 FALSE 12 4.9
## 3 329.1481 27 FALSE 4 2.2
## 4 807.9091 11 FALSE 8 5.2
## 5 467.7368 19 FALSE 4 2.2
## 6 403.9545 22 TRUE 4 1.8
## drive fuelType make model
## 1 Rear-Wheel Drive Regular Alfa Romeo Spider Veloce 2000
## 2 Rear-Wheel Drive Regular Ferrari Testarossa
## 3 Front-Wheel Drive Regular Dodge Charger
## 4 Rear-Wheel Drive Regular Dodge B150/B250 Wagon 2WD
## 5 4-Wheel or All-Wheel Drive Premium Subaru Legacy AWD Turbo
## 6 Front-Wheel Drive Regular Subaru Loyale
## fuelType1 VClass year DiffFromMean
## 1 Regular Gasoline Two Seaters 1985 2.767296
## 2 Regular Gasoline Two Seaters 1985 -51.320755
## 3 Regular Gasoline Subcompact Cars 1985 12.510653
## 4 Regular Gasoline Vans 1985 -28.665786
## 5 Premium Gasoline Compact Cars 1993 -12.627638
## 6 Regular Gasoline Compact Cars 1993 7.930565
Now, start getting the data ready for analysis.
# Pull out single model years so they can be inspected one at a time.
# which() discards rows whose year is NA, matching what subset() does.
# NOTE(review): the original comment said only 1984, 1987 and 1988 are free
# of NA's, yet 1986 (not 1987) is extracted below -- confirm which was meant.
year_1984 <- clean_autodata[which(clean_autodata$year == 1984), ]
year_1986 <- clean_autodata[which(clean_autodata$year == 1986), ]
year_1988 <- clean_autodata[which(clean_autodata$year == 1988), ]
year_1996 <- clean_autodata[which(clean_autodata$year == 1996), ]
year_2006 <- clean_autodata[which(clean_autodata$year == 2006), ]
year_2016 <- clean_autodata[which(clean_autodata$year == 2016), ]
# Track how the average combined mpg (comb08) moves across the sampled model
# years, using every row available for each year.
mean(year_1984[["comb08"]])
## [1] 19.88187
mean(year_1986[["comb08"]])
## [1] 19.55041
mean(year_1996[["comb08"]])
## [1] 19.58473
mean(year_2006[["comb08"]])
## [1] 18.95924
mean(year_2016[["comb08"]])
## [1] 25.09984
# Build two multi-year subsets: model years before 1990 and model years
# before 2017. Rows with an NA year are left out, as subset() would do.
The80s <- clean_autodata[which(clean_autodata$year < 1990), ]
Prior2017 <- clean_autodata[which(clean_autodata$year < 2017), ]
Now do some density plots to show combined mpg over time. Note that once EVs start showing up, the plots begin to show much higher mpg data points.
# Draw one kernel density plot of combined mpg (comb08) per sampled model
# year, in chronological order.
# NOTE(review): plot.density appears to build its default title from the
# density() call expression, so these calls are left verbatim to keep the
# rendered figures unchanged -- confirm before restyling.
plot(density(year_1984$comb08))
plot(density(year_1986$comb08))
plot(density(year_1996$comb08))
plot(density(year_2006$comb08))
plot(density(year_2016$comb08))
Now run a conditional inference tree.
# Store the rows whose DiffFromMean entry is not blank in a separate data
# frame; rows where the comparison yields NA are dropped as well.
# NOTE(review): DiffFromMean prints as a numeric column, so this comparison
# against "" presumably only removes NA rows -- confirm the intent.
Good_data <- clean_autodata[
  which(clean_autodata$DiffFromMean != ""), ,
  drop = FALSE
]

# Fit a conditional inference tree for discontinuedNextYear on the pre-1990
# subset (The80s), using CO2 emissions, combined mpg and make as inputs.
output_tree <- ctree(
  discontinuedNextYear ~ co2TailpipeGpm + comb08 + make,
  data = The80s
)

# Draw the fitted tree in its "simple" layout.
plot(output_tree, type = "simple")
Since the lists of make names are long, the plotted tree is difficult to read.
Look at the data output.
# Show the text summary of the fitted conditional inference tree
# (split rules, test statistics and terminal-node weights).
print(output_tree)
##
## Conditional inference tree with 16 terminal nodes
##
## Response: discontinuedNextYear
## Inputs: co2TailpipeGpm, comb08, make
## Number of observations: 8405
##
## 1) make == {Acura, Alfa Romeo, ASC Incorporated, Aurora Cars Ltd, Autokraft Limited, Avanti Motor Corporation, Bertone, Bill Dovell Motor Car Company, Bitter Gmbh and Co. Kg, BMW, Buick, Cadillac, CCC Engineering, Chevrolet, Chrysler, Dacia, Daihatsu, Dodge, E. P. Dutton, Inc., Eagle, Environmental Rsch and Devp Corp, Evans Automobiles, Excalibur Autos, Ferrari, Ford, General Motors, Geo, GMC, Grumman Allied Industries, Grumman Olson, Honda, Hyundai, Import Foreign Auto Sales Inc, Isuzu, Jaguar, JBA Motorcars, Inc., Laforza Automobile Inc, Lambda Control Systems, Lamborghini, Land Rover, Lincoln, London Coach Co Inc, Lotus, Maserati, Mazda, Mcevoy Motors, Mercury, Merkur, Mitsubishi, Nissan, Oldsmobile, Pininfarina, Plymouth, Pontiac, Red Shift Ltd., Rolls-Royce, Ruf Automobile Gmbh, S and S Coach Company E.p. Dutton, Saab, Saleen, Sterling, Subaru, Superior Coaches Div E.p. Dutton, Toyota, Vixen Motor Company, Volga Associated Automobile, Volkswagen}; criterion = 1, statistic = 1220.247
## 2) make == {Alfa Romeo, ASC Incorporated, Aurora Cars Ltd, Autokraft Limited, Avanti Motor Corporation, Bill Dovell Motor Car Company, Bitter Gmbh and Co. Kg, BMW, Cadillac, CCC Engineering, Chevrolet, Chrysler, Dacia, Daihatsu, E. P. Dutton, Inc., Eagle, Environmental Rsch and Devp Corp, Evans Automobiles, Excalibur Autos, Ferrari, General Motors, Geo, GMC, Grumman Allied Industries, Grumman Olson, Hyundai, Import Foreign Auto Sales Inc, Jaguar, JBA Motorcars, Inc., Lambda Control Systems, Lamborghini, London Coach Co Inc, Lotus, Maserati, Mazda, Mcevoy Motors, Merkur, Nissan, Oldsmobile, Pininfarina, Pontiac, Red Shift Ltd., Rolls-Royce, Ruf Automobile Gmbh, S and S Coach Company E.p. Dutton, Sterling, Subaru, Superior Coaches Div E.p. Dutton, Vixen Motor Company, Volga Associated Automobile, Volkswagen}; criterion = 1, statistic = 560.315
## 3) make == {ASC Incorporated, Aurora Cars Ltd, Avanti Motor Corporation, Bitter Gmbh and Co. Kg, Dacia, E. P. Dutton, Inc., Eagle, Environmental Rsch and Devp Corp, Evans Automobiles, Excalibur Autos, General Motors, Grumman Allied Industries, Import Foreign Auto Sales Inc, JBA Motorcars, Inc., Lambda Control Systems, London Coach Co Inc, Lotus, Mazda, Mcevoy Motors, S and S Coach Company E.p. Dutton, Superior Coaches Div E.p. Dutton, Vixen Motor Company, Volga Associated Automobile}; criterion = 1, statistic = 144.054
## 4) co2TailpipeGpm <= 522.7647; criterion = 0.985, statistic = 39.779
## 5)* weights = 162
## 4) co2TailpipeGpm > 522.7647
## 6)* weights = 23
## 3) make == {Alfa Romeo, Autokraft Limited, Bill Dovell Motor Car Company, BMW, Cadillac, CCC Engineering, Chevrolet, Chrysler, Daihatsu, Ferrari, Geo, GMC, Grumman Olson, Hyundai, Jaguar, Lamborghini, Maserati, Merkur, Nissan, Oldsmobile, Pininfarina, Pontiac, Red Shift Ltd., Rolls-Royce, Ruf Automobile Gmbh, Sterling, Subaru, Volkswagen}
## 7) co2TailpipeGpm <= 592.4667; criterion = 1, statistic = 46.932
## 8) make == {Alfa Romeo, Autokraft Limited, Bill Dovell Motor Car Company, Cadillac, CCC Engineering, Chrysler, Geo, Grumman Olson, Maserati, Merkur, Oldsmobile, Pininfarina, Red Shift Ltd., Sterling, Volkswagen}; criterion = 0.999, statistic = 52.188
## 9)* weights = 754
## 8) make == {BMW, Chevrolet, Daihatsu, GMC, Hyundai, Jaguar, Nissan, Pontiac, Subaru}
## 10) co2TailpipeGpm <= 509; criterion = 0.999, statistic = 13.753
## 11) make == {BMW, Daihatsu, Hyundai, Nissan, Pontiac, Subaru}; criterion = 0.999, statistic = 28.143
## 12)* weights = 869
## 11) make == {Chevrolet, GMC, Jaguar}
## 13) comb08 <= 25; criterion = 0.984, statistic = 7.782
## 14)* weights = 643
## 13) comb08 > 25
## 15)* weights = 83
## 10) co2TailpipeGpm > 509
## 16)* weights = 773
## 7) co2TailpipeGpm > 592.4667
## 17) comb08 <= 9; criterion = 0.978, statistic = 11.575
## 18)* weights = 46
## 17) comb08 > 9
## 19)* weights = 622
## 2) make == {Acura, Bertone, Buick, Dodge, Ford, Honda, Isuzu, Laforza Automobile Inc, Land Rover, Lincoln, Mercury, Mitsubishi, Plymouth, Saab, Saleen, Toyota}
## 20) make == {Acura, Dodge, Ford, Honda, Isuzu, Laforza Automobile Inc, Land Rover, Lincoln, Saab, Saleen}; criterion = 1, statistic = 63.085
## 21) co2TailpipeGpm <= 522.7647; criterion = 0.99, statistic = 21.446
## 22) make == {Acura, Ford, Honda, Isuzu, Lincoln, Saab}; criterion = 1, statistic = 38.115
## 23) make == {Ford, Isuzu}; criterion = 0.952, statistic = 13.915
## 24)* weights = 579
## 23) make == {Acura, Honda, Lincoln, Saab}
## 25)* weights = 235
## 22) make == {Dodge}
## 26)* weights = 487
## 21) co2TailpipeGpm > 522.7647
## 27)* weights = 871
## 20) make == {Bertone, Buick, Mercury, Mitsubishi, Plymouth, Toyota}
## 28)* weights = 1388
## 1) make == {AM General, American Motors Corporation, Aston Martin, Audi, CX Automotive, Jeep, Kenyon Corporation Of America, Mercedes-Benz, Panther Car Company Limited, Peugeot, Porsche, Renault, Suzuki, Texas Coach Company, TVR Engineering Ltd, Volvo, Yugo}
## 29) make == {AM General, American Motors Corporation, CX Automotive, Kenyon Corporation Of America, Panther Car Company Limited, Texas Coach Company, TVR Engineering Ltd, Yugo}; criterion = 1, statistic = 63.686
## 30)* weights = 63
## 29) make == {Aston Martin, Audi, Jeep, Mercedes-Benz, Peugeot, Porsche, Renault, Suzuki, Volvo}
## 31)* weights = 807
# Strip chart showing how DiffFromMean is spread within each of the two
# outcomes -- whether or not a car was discontinued the next year.
# Points are jittered and drawn vertically, one colour per outcome.
stripchart(
  DiffFromMean ~ discontinuedNextYear,
  data = Good_data,
  method = "jitter",
  vertical = TRUE,
  pch = 16,
  col = c("orange", "red"),
  main = "Stripchart comparing Make/Model Discontinuity",
  xlab = "Auto Discontinued?",
  ylab = "MPG delta from Class Average"
)
# do a random forests function
#install.packages("randomForest")
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
# Fix the RNG seed so the forest is reproducible from run to run.
set.seed(415)

# Fit a 100-tree random forest that classifies discontinuedNextYear (turned
# into a factor) from the vehicle design and performance columns, using the
# pre-2017 rows. Rows containing NA's are dropped via na.omit, and importance
# measures are computed so they can be plotted afterwards.
fit_discNextYear <- randomForest(
  as.factor(discontinuedNextYear) ~
    co2TailpipeGpm + comb08 + cylinders + displ + drive + fuelType +
    fuelType1 + VClass + year,
  data = Prior2017,
  ntree = 100,
  importance = TRUE,
  na.action = na.omit
)

# Plot the variable-importance measures of the fitted forest.
varImpPlot(fit_discNextYear)