# Importing Automobile Fuel Efficiency Data into R
# Attach the packages whose functions this analysis calls without a namespace
# prefix: plyr (ddply, dlply), ggplot2 (ggplot and its layers) and reshape2
# (melt). Attaching a package that is already attached does nothing.
# NOTE(review): melt is assumed to come from reshape2 rather than the older
# reshape package -- confirm against the environment this was written for.
library(plyr)
library(ggplot2)
library(reshape2)

# Read the vehicle data, keeping text columns as character vectors rather
# than factors. FALSE is spelled out because `F` is an ordinary variable that
# can be reassigned, whereas FALSE is a reserved word.
vehicles <- read.csv("../data/vehicles.csv", stringsAsFactors = FALSE)
# Preview the first rows to confirm the import worked.
head(vehicles)
## barrels08 barrelsA08 charge120 charge240 city08 city08U cityA08 cityA08U
## 1 15.68944 0 0 0 19 0 0 0
## 2 29.95056 0 0 0 9 0 0 0
## 3 12.19557 0 0 0 23 0 0 0
## 4 29.95056 0 0 0 10 0 0 0
## 5 17.33749 0 0 0 17 0 0 0
## 6 14.96429 0 0 0 21 0 0 0
## cityCD cityE cityUF co2 co2A co2TailpipeAGpm co2TailpipeGpm comb08
## 1 0 0 0 -1 -1 0 423.1905 21
## 2 0 0 0 -1 -1 0 807.9091 11
## 3 0 0 0 -1 -1 0 329.1481 27
## 4 0 0 0 -1 -1 0 807.9091 11
## 5 0 0 0 -1 -1 0 467.7368 19
## 6 0 0 0 -1 -1 0 403.9545 22
## comb08U combA08 combA08U combE combinedCD combinedUF cylinders displ
## 1 0 0 0 0 0 0 4 2.0
## 2 0 0 0 0 0 0 12 4.9
## 3 0 0 0 0 0 0 4 2.2
## 4 0 0 0 0 0 0 8 5.2
## 5 0 0 0 0 0 0 4 2.2
## 6 0 0 0 0 0 0 4 1.8
## drive engId eng_dscr feScore fuelCost08
## 1 Rear-Wheel Drive 9011 (FFS) -1 2350
## 2 Rear-Wheel Drive 22020 (GUZZLER) -1 4450
## 3 Front-Wheel Drive 2100 (FFS) -1 1800
## 4 Rear-Wheel Drive 2850 -1 4450
## 5 4-Wheel or All-Wheel Drive 66031 (FFS,TRBO) -1 2850
## 6 Front-Wheel Drive 66020 (FFS) -1 2250
## fuelCostA08 fuelType fuelType1 ghgScore ghgScoreA highway08
## 1 0 Regular Regular Gasoline -1 -1 25
## 2 0 Regular Regular Gasoline -1 -1 14
## 3 0 Regular Regular Gasoline -1 -1 33
## 4 0 Regular Regular Gasoline -1 -1 12
## 5 0 Premium Premium Gasoline -1 -1 23
## 6 0 Regular Regular Gasoline -1 -1 24
## highway08U highwayA08 highwayA08U highwayCD highwayE highwayUF hlv hpv
## 1 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 19 77
## 4 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0
## id lv2 lv4 make model mpgData phevBlended pv2 pv4
## 1 1 0 0 Alfa Romeo Spider Veloce 2000 Y false 0 0
## 2 10 0 0 Ferrari Testarossa N false 0 0
## 3 100 0 0 Dodge Charger Y false 0 0
## 4 1000 0 0 Dodge B150/B250 Wagon 2WD N false 0 0
## 5 10000 0 14 Subaru Legacy AWD Turbo N false 0 90
## 6 10001 0 15 Subaru Loyale N false 0 88
## range rangeCity rangeCityA rangeHwy rangeHwyA trany UCity
## 1 0 0 0 0 0 Manual 5-spd 23.3333
## 2 0 0 0 0 0 Manual 5-spd 11.0000
## 3 0 0 0 0 0 Manual 5-spd 29.0000
## 4 0 0 0 0 0 Automatic 3-spd 12.2222
## 5 0 0 0 0 0 Manual 5-spd 21.0000
## 6 0 0 0 0 0 Automatic 3-spd 27.0000
## UCityA UHighway UHighwayA VClass year youSaveSpend guzzler
## 1 0 35.0000 0 Two Seaters 1985 -1000
## 2 0 19.0000 0 Two Seaters 1985 -11500 T
## 3 0 47.0000 0 Subcompact Cars 1985 1750
## 4 0 16.6667 0 Vans 1985 -11500
## 5 0 32.0000 0 Compact Cars 1993 -3500
## 6 0 33.0000 0 Compact Cars 1993 -500
## trans_dscr tCharger sCharger atvType fuelType2 rangeA evMotor mfrCode
## 1 NA
## 2 NA
## 3 SIL NA
## 4 NA
## 5 TRUE
## 6 NA
# Build the variable dictionary: every line of varlabels.txt is split on
# " - " and the resulting pieces are row-bound into a character matrix.
labels <- do.call(
  what = rbind,
  args = strsplit(
    readLines("../data/varlabels.txt"),
    split = " - "
  )
)
# Show the first few entries.
head(labels)
## [,1]
## [1,] "atvtype"
## [2,] "barrels08"
## [3,] "barrelsA08"
## [4,] "charge120"
## [5,] "charge240"
## [6,] "city08"
## [,2]
## [1,] "type of alternative fuel or advanced technology vehicle"
## [2,] "annual petroleum consumption in barrels for fuelType1 (1)"
## [3,] "annual petroleum consumption in barrels for fuelType2 (1)"
## [4,] "time to charge an electric vehicle in hours at 120 V"
## [5,] "time to charge an electric vehicle in hours at 240 V"
## [6,] "city MPG for fuelType1 (2)"
# Exploring and Describing the Fuel Efficiency Data
# Earliest and latest model year present in the data.
first_year <- min(vehicles[["year"]])
last_year <- max(vehicles[["year"]])
# Number of distinct model years.
length(unique(vehicles[["year"]]))
## [1] 31
# Tally the vehicles by primary fuel type.
table(vehicles[["fuelType1"]])
##
## Diesel Electricity Midgrade Gasoline Natural Gas
## 1025 56 41 57
## Premium Gasoline Regular Gasoline
## 8521 24587
# Treat blank transmission descriptions as missing.
vehicles$trany[!nzchar(vehicles$trany)] <- NA
# Derive a two-way label from the first four characters of the description:
# "Auto" when they spell "Auto", "Manual" otherwise; missing values stay NA.
# This runs before trany is turned into a factor below.
vehicles$trany2 <- ifelse(
  test = substring(vehicles$trany, 1, 4) == "Auto",
  yes = "Auto",
  no = "Manual"
)
vehicles$trany <- factor(vehicles$trany)
# Count the vehicles in each transmission group.
table(vehicles[["trany2"]])
##
## Auto Manual
## 22451 11825
# Analyzing Automobile Fuel Efficiency Over Time
# Average combined, highway and city MPG per model year across all vehicles.
mpgByYr <- ddply(
  vehicles,
  ~year,
  summarise,
  avgMPG = mean(comb08),
  avgHghy = mean(highway08),
  avgCity = mean(city08)
)
# Yearly average combined MPG with a smoothed trend line.
ggplot(mpgByYr, aes(x = year, y = avgMPG)) +
  geom_point() +
  geom_smooth() +
  labs(x = "Year", y = "Average MPG", title = "All cars")
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.

# Look at the primary fuel types again before narrowing to gasoline cars.
table(vehicles[["fuelType1"]])
##
## Diesel Electricity Midgrade Gasoline Natural Gas
## 1025 56 41 57
## Premium Gasoline Regular Gasoline
## 8521 24587
# Keep only rows whose primary fuel is a grade of gasoline, whose fuelType2
# is blank, and whose atvType is not "Hybrid".
gasCars <- subset(
  vehicles,
  fuelType1 %in% c("Regular Gasoline", "Premium Gasoline", "Midgrade Gasoline") &
    fuelType2 == "" &
    atvType != "Hybrid"
)
# Average combined MPG per model year for those cars.
mpgByYr_Gas <- ddply(
  gasCars,
  ~year,
  summarise,
  avgMPG = mean(comb08)
)
ggplot(mpgByYr_Gas, aes(x = year, y = avgMPG)) +
  geom_point() +
  geom_smooth() +
  labs(x = "Year", y = "Average MPG", title = "Gasoline cars")
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.

# Check how engine displacement is stored before using it numerically.
typeof(gasCars[["displ"]])
## [1] "character"
# Displacement was read in as text; convert it to numbers for plotting.
gasCars[["displ"]] <- as.numeric(gasCars[["displ"]])
# Combined MPG against engine displacement, with a smoothed trend.
ggplot(gasCars, aes(x = displ, y = comb08)) +
  geom_point() +
  geom_smooth()
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
## Warning: Removed 2 rows containing missing values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).

# The scatter plot above (displacement against combined MPG) offers convincing
# evidence of a negative, or even inverse, relationship between engine
# displacement and fuel efficiency; thus, cars with smaller engines tend to be
# more fuel-efficient.
#
# Next, see how the average engine displacement has changed from year to year.
# na.rm = TRUE is required here: two gasoline cars have no displacement value,
# and a plain mean() would turn their whole model year into NA, silently
# dropping that year from the plot.
avgCarSize <- ddply(
  gasCars,
  ~year,
  summarise,
  avgDispl = mean(displ, na.rm = TRUE)
)
ggplot(avgCarSize, aes(year, avgDispl)) +
  geom_point() +
  geom_smooth() +
  xlab("Year") +
  ylab("Average engine displacement (l)")
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.

# Yearly averages of combined MPG and engine displacement for gasoline cars.
# na.rm = TRUE stops the cars with a missing displacement from turning their
# model year's average into NA.
# NOTE: the recorded output that follows (NA for 1985, and the later
# "Removed 1 rows containing missing values" warnings) was produced before
# this change and no longer reflects what the code returns.
byYear <- ddply(
  gasCars,
  ~year,
  summarise,
  avgMPG = mean(comb08),
  avgDispl = mean(displ, na.rm = TRUE)
)
head(byYear)
## year avgMPG avgDispl
## 1 1984 19.12162 3.068449
## 2 1985 19.39469 NA
## 3 1986 19.32046 3.126514
## 4 1987 19.16457 3.096474
## 5 1988 19.36761 3.113558
## 6 1989 19.14196 3.133393
# Reshape byYear to long form: one row per (year, measure) pair, so both
# measures can be drawn as facets of a single plot. The argument is spelled
# id.vars in full instead of relying on partial matching of `id`, and the
# assignment uses `<-` like the rest of the script.
byYear2 <- melt(byYear, id.vars = "year")
# Replace the column names used as factor levels with readable facet titles;
# the order follows the column order in byYear (avgMPG, then avgDispl).
levels(byYear2$variable) <- c("Average MPG", "Avg engine displacement")
head(byYear2)
## year variable value
## 1 1984 Average MPG 19.12162
## 2 1985 Average MPG 19.39469
## 3 1986 Average MPG 19.32046
## 4 1987 Average MPG 19.16457
## 5 1988 Average MPG 19.36761
## 6 1989 Average MPG 19.14196
# One panel per measure, stacked vertically, each with its own y scale.
ggplot(byYear2, aes(x = year, y = value)) +
  geom_point() +
  geom_smooth() +
  facet_wrap(~variable, ncol = 1, scales = "free_y") +
  labs(x = "Year", y = "")
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## Warning: Removed 1 rows containing missing values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

# Are automatic or manual transmissions more efficient for four-cylinder
# engines, and how have the efficiencies changed over time?
gasCars4 <- subset(gasCars, cylinders == "4")
# Distribution of combined MPG per year, one panel per transmission type.
ggplot(gasCars4, aes(x = factor(year), y = comb08)) +
  geom_boxplot() +
  facet_wrap(~trany2, ncol = 1) +
  theme(axis.text.x = element_text(angle = 45)) +
  xlab("Year") +
  ylab("MPG")

# How the share of manual versus automatic four-cylinder cars changes from
# year to year; the dashed line marks an even split.
ggplot(gasCars4, aes(x = factor(year), fill = factor(trany2))) +
  geom_bar(position = "fill") +
  xlab("Year") +
  ylab("Proportion of cars") +
  labs(fill = "Transmission") +
  theme(axis.text.x = element_text(angle = 45)) +
  geom_hline(yintercept = 0.5, linetype = 2)

# Investigating the Makes and Models of Automobiles
# Number of distinct makes offering a four-cylinder gasoline car each year.
carsMake <- ddply(
  gasCars4,
  ~year,
  summarise,
  numberOfMakes = length(unique(make))
)
ggplot(carsMake, aes(x = year, y = numberOfMakes)) +
  geom_point() +
  xlab("Year") +
  ylab("Number of available makes") +
  ggtitle("Four cylinder cars")

# Find the manufacturers that made four-cylinder cars in every year of the
# period (12 of them, as the output shows): collect the makes seen in each
# year, then intersect those per-year sets.
uniqMakes <- dlply(
  gasCars4,
  ~year,
  function(cars_in_year) unique(cars_in_year[["make"]])
)
commonMakes <- Reduce(f = intersect, x = uniqMakes)
commonMakes
## [1] "Ford" "Honda" "Toyota" "Volkswagen" "Chevrolet"
## [6] "Chrysler" "Nissan" "Dodge" "Mazda" "Mitsubishi"
## [11] "Subaru" "Jeep"
# How has each of these manufacturers done on fuel efficiency over time?
# Restrict to the makes present in every year, average combined MPG per
# year and make, and draw one line panel per make.
carsCommonMakes4 <- subset(gasCars4, make %in% commonMakes)
avgMPG_commonMakes <- ddply(
  carsCommonMakes4,
  ~year + make,
  summarise,
  avgMPG = mean(comb08)
)
ggplot(avgMPG_commonMakes, aes(x = year, y = avgMPG)) +
  geom_line() +
  facet_wrap(~make, nrow = 3)
