#setwd("d:/pdata/pdsc/Chapter02")

# Attach the packages whose functions this script calls further down:
# plyr supplies ddply()/dlply()/summarise, ggplot2 supplies the plotting
# functions, and reshape2 supplies melt(). Without these the script stops at
# the first ddply() call in a fresh R session.
library(plyr)
library(ggplot2)
library(reshape2)

# Download the vehicles data set, keeping text columns as character vectors
# instead of factors, then preview the first rows.
vehicles <- read.csv(
  "http://20727931.d.yyupload.com/down/20727931/opendata/vehicles.csv",
  stringsAsFactors = FALSE
)
head(vehicles)
## barrels08 barrelsA08 charge120 charge240 city08 city08U cityA08 cityA08U
## 1 15.68944 0 0 0 19 0 0 0
## 2 29.95056 0 0 0 9 0 0 0
## 3 12.19557 0 0 0 23 0 0 0
## 4 29.95056 0 0 0 10 0 0 0
## 5 17.33749 0 0 0 17 0 0 0
## 6 14.96429 0 0 0 21 0 0 0
## cityCD cityE cityUF co2 co2A co2TailpipeAGpm co2TailpipeGpm comb08 comb08U
## 1 0 0 0 -1 -1 0 423.1905 21 0
## 2 0 0 0 -1 -1 0 807.9091 11 0
## 3 0 0 0 -1 -1 0 329.1481 27 0
## 4 0 0 0 -1 -1 0 807.9091 11 0
## 5 0 0 0 -1 -1 0 467.7368 19 0
## 6 0 0 0 -1 -1 0 403.9545 22 0
## combA08 combA08U combE combinedCD combinedUF cylinders displ
## 1 0 0 0 0 0 4 2.0
## 2 0 0 0 0 0 12 4.9
## 3 0 0 0 0 0 4 2.2
## 4 0 0 0 0 0 8 5.2
## 5 0 0 0 0 0 4 2.2
## 6 0 0 0 0 0 4 1.8
## drive engId eng_dscr feScore fuelCost08 fuelCostA08
## 1 Rear-Wheel Drive 9011 (FFS) -1 2350 0
## 2 Rear-Wheel Drive 22020 (GUZZLER) -1 4450 0
## 3 Front-Wheel Drive 2100 (FFS) -1 1800 0
## 4 Rear-Wheel Drive 2850 -1 4450 0
## 5 4-Wheel or All-Wheel Drive 66031 (FFS,TRBO) -1 2850 0
## 6 Front-Wheel Drive 66020 (FFS) -1 2250 0
## fuelType fuelType1 ghgScore ghgScoreA highway08 highway08U highwayA08
## 1 Regular Regular Gasoline -1 -1 25 0 0
## 2 Regular Regular Gasoline -1 -1 14 0 0
## 3 Regular Regular Gasoline -1 -1 33 0 0
## 4 Regular Regular Gasoline -1 -1 12 0 0
## 5 Premium Premium Gasoline -1 -1 23 0 0
## 6 Regular Regular Gasoline -1 -1 24 0 0
## highwayA08U highwayCD highwayE highwayUF hlv hpv id lv2 lv4 make
## 1 0 0 0 0 0 0 1 0 0 Alfa Romeo
## 2 0 0 0 0 0 0 10 0 0 Ferrari
## 3 0 0 0 0 19 77 100 0 0 Dodge
## 4 0 0 0 0 0 0 1000 0 0 Dodge
## 5 0 0 0 0 0 0 10000 0 14 Subaru
## 6 0 0 0 0 0 0 10001 0 15 Subaru
## model mpgData phevBlended pv2 pv4 range rangeCity rangeCityA
## 1 Spider Veloce 2000 Y false 0 0 0 0 0
## 2 Testarossa N false 0 0 0 0 0
## 3 Charger Y false 0 0 0 0 0
## 4 B150/B250 Wagon 2WD N false 0 0 0 0 0
## 5 Legacy AWD Turbo N false 0 90 0 0 0
## 6 Loyale N false 0 88 0 0 0
## rangeHwy rangeHwyA trany UCity UCityA UHighway UHighwayA
## 1 0 0 Manual 5-spd 23.3333 0 35.0000 0
## 2 0 0 Manual 5-spd 11.0000 0 19.0000 0
## 3 0 0 Manual 5-spd 29.0000 0 47.0000 0
## 4 0 0 Automatic 3-spd 12.2222 0 16.6667 0
## 5 0 0 Manual 5-spd 21.0000 0 32.0000 0
## 6 0 0 Automatic 3-spd 27.0000 0 33.0000 0
## VClass year youSaveSpend guzzler trans_dscr tCharger sCharger
## 1 Two Seaters 1985 -1000 NA
## 2 Two Seaters 1985 -11500 T NA
## 3 Subcompact Cars 1985 1750 SIL NA
## 4 Vans 1985 -11500 NA
## 5 Compact Cars 1993 -3500 TRUE
## 6 Compact Cars 1993 -500 NA
## atvType fuelType2 rangeA evMotor mfrCode
## 1
## 2
## 3
## 4
## 5
## 6
# Build a two-column character matrix of variable names and descriptions from
# the label file. Each line is split at its FIRST hyphen only: some
# descriptions contain further hyphens, and splitting on every hyphen produced
# pieces of unequal length that rbind() recycled into a bogus third column
# (with a warning). Both halves are trimmed of surrounding whitespace; a line
# without any hyphen gets an empty description. Blank lines are skipped, as
# they contributed no row before either.
labels <- local({
  label_lines <- readLines("http://20727931.d.yyupload.com/down/20727931/opendata/varlabels.txt")
  label_lines <- label_lines[nzchar(label_lines)]
  # as.integer() drops the match attributes regexpr() attaches to its result.
  sep_at <- as.integer(regexpr("-", label_lines, fixed = TRUE))
  has_sep <- sep_at > 0
  cbind(
    trimws(ifelse(has_sep, substr(label_lines, 1, sep_at - 1), label_lines)),
    trimws(ifelse(has_sep, substring(label_lines, sep_at + 1), ""))
  )
})
head(labels)
## [,1] [,2]
## [1,] "atvtype" "type of alternative fuel or advanced technology vehicle"
## [2,] "barrels08" "annual petroleum consumption in barrels for fuelType1 (1)"
## [3,] "barrelsA08" "annual petroleum consumption in barrels for fuelType2 (1)"
## [4,] "charge120" "time to charge an electric vehicle in hours at 120 V"
## [5,] "charge240" "time to charge an electric vehicle in hours at 240 V"
## [6,] "city08" "city MPG for fuelType1 (2)"
# Row count of the data frame.
dim(vehicles)[1]
## [1] 34287
# Column count of the data frame.
dim(vehicles)[2]
## [1] 74
# Names of all columns.
colnames(vehicles)
## [1] "barrels08" "barrelsA08" "charge120" "charge240"
## [5] "city08" "city08U" "cityA08" "cityA08U"
## [9] "cityCD" "cityE" "cityUF" "co2"
## [13] "co2A" "co2TailpipeAGpm" "co2TailpipeGpm" "comb08"
## [17] "comb08U" "combA08" "combA08U" "combE"
## [21] "combinedCD" "combinedUF" "cylinders" "displ"
## [25] "drive" "engId" "eng_dscr" "feScore"
## [29] "fuelCost08" "fuelCostA08" "fuelType" "fuelType1"
## [33] "ghgScore" "ghgScoreA" "highway08" "highway08U"
## [37] "highwayA08" "highwayA08U" "highwayCD" "highwayE"
## [41] "highwayUF" "hlv" "hpv" "id"
## [45] "lv2" "lv4" "make" "model"
## [49] "mpgData" "phevBlended" "pv2" "pv4"
## [53] "range" "rangeCity" "rangeCityA" "rangeHwy"
## [57] "rangeHwyA" "trany" "UCity" "UCityA"
## [61] "UHighway" "UHighwayA" "VClass" "year"
## [65] "youSaveSpend" "guzzler" "trans_dscr" "tCharger"
## [69] "sCharger" "atvType" "fuelType2" "rangeA"
## [73] "evMotor" "mfrCode"
# Number of distinct values in the year column.
length(unique(vehicles[["year"]]))
## [1] 31
# Smallest and largest year present in the data.
first_year <- min(vehicles[["year"]])
last_year <- max(vehicles[["year"]])
# The same distinct-year count, selecting the column with [row, column] indexing.
length(unique(vehicles[, "year"]))
## [1] 31
# Number of records for each fuelType1 value.
table(vehicles[["fuelType1"]])
##
## Diesel Electricity Midgrade Gasoline Natural Gas
## 1025 56 41 57
## Premium Gasoline Regular Gasoline
## 8521 24587
# Number of records for each raw transmission description (trany).
table(vehicles[["trany"]])
##
## Auto (AV-S6)
## 11 1
## Auto (AV-S8) Auto (AV)
## 1 2
## Auto(A1) Auto(A8)
## 1 1
## Auto(AM-S6) Auto(AM-S7)
## 38 84
## Auto(AM5) Auto(AM6)
## 10 68
## Auto(AM7) Auto(AV-S6)
## 58 80
## Auto(AV-S7) Auto(AV-S8)
## 19 11
## Auto(L3) Auto(L4)
## 2 2
## Automatic (A1) Automatic (A6)
## 41 4
## Automatic (AM-S6) Automatic (AM-S7)
## 1 2
## Automatic (AM5) Automatic (AM6)
## 2 1
## Automatic (AV-S6) Automatic (AV)
## 9 4
## Automatic (S4) Automatic (S5)
## 231 813
## Automatic (S6) Automatic (S7)
## 1877 182
## Automatic (S8) Automatic (variable gear ratios)
## 366 534
## Automatic 3-spd Automatic 4-spd
## 3151 11029
## Automatic 5-spd Automatic 6-spd
## 2149 1093
## Automatic 6spd Automatic 7-spd
## 1 493
## Automatic 8-spd Automatic 8spd
## 78 4
## Automatic 9-spd Manual 3-spd
## 8 77
## Manual 4-spd Manual 4-spd Doubled
## 1483 17
## Manual 5-spd Manual 5 spd
## 8213 1
## Manual 6-spd Manual 7-spd
## 2009 24
## Manual(M7)
## 1
# Mark empty transmission descriptions as missing.
vehicles$trany[which(vehicles$trany == "")] <- NA
# Reduce the description to a two-level factor: "Auto" when the text starts
# with "Auto", "Manual" otherwise. Missing descriptions stay NA.
vehicles$trany2 <- factor(
  ifelse(startsWith(vehicles$trany, "Auto"), "Auto", "Manual")
)
table(vehicles[["trany2"]])
##
## Auto Manual
## 22451 11825
# Cross-tabulate sCharger (rows) against year (columns), labelling both
# dimensions with the column names.
table(sCharger = vehicles$sCharger, year = vehicles$year)
## year
## sCharger 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997
## 1964 1701 1210 1247 1130 1149 1074 1130 1116 1088 979 962 767 757
## S 0 0 0 0 0 4 4 2 5 5 3 5 6 5
## year
## sCharger 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011
## 800 840 826 891 949 1015 1089 1136 1067 1098 1152 1166 1091 1077
## S 12 12 14 20 26 29 33 30 37 28 35 19 18 20
## year
## sCharger 2012 2013 2014
## 1125 1141 1051
## S 28 42 57
# The same cross-tabulation transposed: year in rows, sCharger in columns,
# without dimension labels.
table(vehicles[["year"]], vehicles[["sCharger"]])
##
## S
## 1984 1964 0
## 1985 1701 0
## 1986 1210 0
## 1987 1247 0
## 1988 1130 0
## 1989 1149 4
## 1990 1074 4
## 1991 1130 2
## 1992 1116 5
## 1993 1088 5
## 1994 979 3
## 1995 962 5
## 1996 767 6
## 1997 757 5
## 1998 800 12
## 1999 840 12
## 2000 826 14
## 2001 891 20
## 2002 949 26
## 2003 1015 29
## 2004 1089 33
## 2005 1136 30
## 2006 1067 37
## 2007 1098 28
## 2008 1152 35
## 2009 1166 19
## 2010 1091 18
## 2011 1077 20
## 2012 1125 28
## 2013 1141 42
## 2014 1051 57
# Inspect the storage class and the distinct values of sCharger.
class(vehicles[["sCharger"]])
## [1] "character"
unique(vehicles[["sCharger"]])
## [1] "" "S"
# Same inspection for tCharger.
class(vehicles[["tCharger"]])
## [1] "logical"
unique(vehicles[["tCharger"]])
## [1] NA TRUE
# For every year, average the comb08, highway08 and city08 columns over all
# vehicles, then show the first six rows of the summary.
mpgByYr <- ddply(
  vehicles, "year", summarise,
  avgMPG = mean(comb08),
  avgHghy = mean(highway08),
  avgCity = mean(city08)
)
head(mpgByYr, 6)
## year avgMPG avgHghy avgCity
## 1 1984 19.88187 23.07536 17.98269
## 2 1985 19.80835 23.04233 17.87831
## 3 1986 19.55041 22.69917 17.66529
## 4 1987 19.22855 22.44507 17.31034
## 5 1988 19.32832 22.70265 17.33363
## 6 1989 19.12576 22.46574 17.14397
# Scatter plot of the yearly average MPG with a smoothed trend line.
ggplot(mpgByYr, aes(x = year, y = avgMPG)) +
  geom_point() +
  geom_smooth() +
  labs(x = "Year", y = "Average MPG", title = "all cars")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Number of records for each fuelType1 value, shown before filtering on it.
table(vehicles[["fuelType1"]])
##
## Diesel Electricity Midgrade Gasoline Natural Gas
## 1025 56 41 57
## Premium Gasoline Regular Gasoline
## 8521 24587
# Keep only vehicles whose fuelType1 is one of the three gasoline grades, whose
# fuelType2 is empty, and whose atvType is not "Hybrid". which() discards rows
# where the condition evaluates to NA, matching what subset() does.
gasCars <- vehicles[
  with(
    vehicles,
    which(
      fuelType1 %in% c("Regular Gasoline", "Premium Gasoline", "Midgrade Gasoline") &
        fuelType2 == "" &
        atvType != "Hybrid"
    )
  ),
  ,
  drop = FALSE
]
# Mean of comb08 over those cars for each year.
mpgByYr_Gas <- ddply(gasCars, "year", summarise, avgMPG = mean(comb08))
# Yearly average MPG of the gasoline-only subset with a smoothed trend line.
ggplot(mpgByYr_Gas, aes(x = year, y = avgMPG)) +
  geom_point() +
  geom_smooth() +
  labs(x = "Year", y = "Average MPG", title = "Gasoline cars")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# displ was read in as text; check its storage type ...
typeof(gasCars[["displ"]])
## [1] "character"
# ... and convert it to numeric so it can be plotted and averaged.
gasCars[["displ"]] <- as.numeric(gasCars[["displ"]])
# Scatter plot of comb08 against displ for the gasoline cars, with a smoothed
# trend line.
ggplot(data = gasCars, mapping = aes(x = displ, y = comb08)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).

# Mean displ of the gasoline cars for each year. displ contains missing values
# after the numeric conversion, so na.rm = TRUE is needed; without it a single
# NA turns that whole year's average into NA and the year drops off the plot.
avgCarSize <- ddply(
  gasCars, ~year, summarise,
  avgDispl = mean(displ, na.rm = TRUE)
)
# NOTE(review): the y-axis label previously ended in "(1)", which looks like a
# typo for the unit; displ is presumably in litres -- confirm.
ggplot(avgCarSize, aes(year, avgDispl)) + geom_point() +
  geom_smooth() + xlab("year") + ylab("Average engine displacement (L)")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Per-year mean of comb08 and of displ for the gasoline cars.
# na.rm = TRUE stops a missing displ value from turning a whole year's
# average displacement into NA.
# NOTE: the output recorded below predates this change; with it the 1985
# avgDispl is no longer NA, and the later "Removed 1 rows" plot warnings that
# stem from that NA no longer appear.
byYear <- ddply(
  gasCars, ~year, summarise,
  avgMPG = mean(comb08),
  avgDispl = mean(displ, na.rm = TRUE)
)
head(byYear)
## year avgMPG avgDispl
## 1 1984 19.12162 3.068449
## 2 1985 19.39469 NA
## 3 1986 19.32046 3.126514
## 4 1987 19.16457 3.096474
## 5 1988 19.36761 3.113558
## 6 1989 19.14196 3.133393
# Reshape byYear to long format: one row per year and measure. The id.vars
# argument is spelled out in full rather than relying on partial matching of
# `id`, and `<-` is used for the assignment.
byYear2 <- melt(byYear, id.vars = "year")
# Give the two measures readable facet titles. This relies on the factor
# levels following byYear's column order (avgMPG first, then avgDispl).
levels(byYear2$variable) <- c("Average MPG", "Avg engine displacement")
# One panel per measure, stacked vertically, each with its own y scale.
ggplot(byYear2, aes(year, value)) + geom_point() +
  geom_smooth() + facet_wrap(~variable, ncol = 1, scales = "free_y") +
  xlab("Year") + ylab("")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

# Restrict the gasoline cars to those with cylinders equal to "4"; which()
# drops rows where the comparison is NA, as subset() does.
gasCars4 <- gasCars[which(gasCars$cylinders == "4"), , drop = FALSE]
# Box plot of comb08 per year, one panel for each trany2 value.
ggplot(gasCars4, aes(x = factor(year), y = comb08)) +
  geom_boxplot() +
  facet_wrap(~trany2, ncol = 1) +
  labs(x = "Year", y = "MPG") +
  theme(axis.text.x = element_text(angle = 45))

# Stacked bars scaled to 1 showing, for each year, the share of four-cylinder
# cars per trany2 value; the dashed line marks the 0.5 level.
ggplot(gasCars4, aes(x = factor(year), fill = factor(trany2))) +
  geom_bar(position = "fill") +
  geom_hline(yintercept = 0.5, linetype = 2) +
  xlab("Year") + ylab("Proportion of cars") + labs(fill = "Transmission") +
  theme(axis.text.x = element_text(angle = 45))

# Count the distinct makes among the four-cylinder cars in each year and plot
# that count over time.
carsMake <- ddply(
  gasCars4, "year", summarise,
  numberOfMakes = length(unique(make))
)
ggplot(carsMake, aes(x = year, y = numberOfMakes)) +
  geom_point() +
  xlab("Year") + ylab("Number of available makes") +
  ggtitle("Four cylinder cars")

# List of the distinct makes seen in each year.
uniqMakes <- dlply(gasCars4, ~year, function(cars_in_year) {
  unique(cars_in_year$make)
})
# Makes that occur in every one of those yearly lists.
commonMakes <- Reduce(intersect, uniqMakes)
# Four-cylinder cars belonging to those makes only.
carsCommonMakes4 <- gasCars4[
  which(gasCars4$make %in% commonMakes), ,
  drop = FALSE
]
# Mean comb08 per year and make, drawn as one line chart per make.
avgMPG_commonMakes <- ddply(
  carsCommonMakes4, c("year", "make"), summarise,
  avgMPG = mean(comb08)
)
ggplot(avgMPG_commonMakes, aes(x = year, y = avgMPG)) +
  geom_line() +
  facet_wrap(~make, nrow = 3)
