# Clear the console by emitting a form-feed character.
cat("\014")

# Attach every package the analysis needs, installing any that is missing.
# requireNamespace() only tests availability; library() then attaches the
# package and stops with an error if it still cannot be loaded.
for (pkg in c("fueleconomy", "psych", "ggplot2")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}

# Work with a plain data frame copy of the package's `vehicles` data set and
# show its first rows.
vehiclesDF <- as.data.frame(fueleconomy::vehicles)
head(vehiclesDF)
You should phrase your research question in a way that matches up with the scope of inference your dataset allows for.
The following research questions will be analyzed
What are the cases, and how many are there?
Each case represents a vehicle type and its specification. There are 33442 observations in the given data set.
Describe the method of data collection.
The data was retrieved from [Hadley Wickham’s RStudio blog](https://blog.rstudio.com/2014/07/23/new-data-packages/), which is one of the approved data sources. The data is available in the R package “fueleconomy”.
What type of study is this (observational/experiment)?
This is an observational study.
If you collected the data, state self-collected. If not, provide a citation/link.
The data was collected by Hadley Wickham and is available online at https://blog.rstudio.com/2014/07/23/new-data-packages/. For this project, the data was extracted using the fueleconomy R package.
What is the response variable, and what type is it (numerical/categorical)?
The response variable is fuel efficiency (city and highway miles per gallon) and is numerical.
What is the explanatory variable, and what type is it (numerical/categorical)?
The explanatory variables are the vehicle characteristics examined below: the number of cylinders (numerical), and the transmission type and manufacturer (categorical).
Provide summary statistics relevant to your research question. For example, if you’re comparing means across groups provide means, SDs, sample sizes of each group. This step requires the use of R, hence a code chunk is provided below. Insert more code chunks as needed.
# Descriptive statistics for the highway mileage column.
describe(vehiclesDF[["hwy"]])
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 33442 23.55 6.21 23 23.18 5.93 9 109 100 2.15 19.61
## se
## X1 0.03
# Descriptive statistics for the city mileage column.
describe(vehiclesDF[["cty"]])
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 33442 17.49 5.58 17 17 2.97 6 138 132 6.18 96
## se
## X1 0.03
# Number of vehicles in each class; a missing class would get its own count.
table(vehiclesDF[["class"]], useNA = "ifany")
##
## Compact Cars Large Cars
## 4739 1533
## Midsize-Large Station Wagons Midsize Cars
## 627 3621
## Midsize Station Wagons Minicompact Cars
## 415 1080
## Minivan - 2WD Minivan - 4WD
## 308 44
## Small Pickup Trucks Small Pickup Trucks 2WD
## 538 392
## Small Pickup Trucks 4WD Small Sport Utility Vehicle 2WD
## 181 169
## Small Sport Utility Vehicle 4WD Small Station Wagons
## 213 1295
## Special Purpose Vehicle Special Purpose Vehicle 2WD
## 1 553
## Special Purpose Vehicle 4WD Special Purpose Vehicles
## 289 1453
## Special Purpose Vehicles/2wd Special Purpose Vehicles/4wd
## 2 2
## Sport Utility Vehicle - 2WD Sport Utility Vehicle - 4WD
## 1626 2091
## Standard Pickup Trucks Standard Pickup Trucks 2WD
## 2354 1106
## Standard Pickup Trucks 4WD Standard Pickup Trucks/2wd
## 910 4
## Standard Sport Utility Vehicle 2WD Standard Sport Utility Vehicle 4WD
## 76 171
## Subcompact Cars Two Seaters
## 4185 1602
## Vans Vans Passenger
## 1141 2
## Vans, Cargo Type Vans, Passenger Type
## 434 285
# Highway mileage statistics broken down by vehicle class, returned as a
# single matrix-style table (mat = TRUE) rather than one table per class.
describeBy(
  vehiclesDF[["hwy"]],
  group = vehiclesDF[["class"]],
  mat = TRUE
)
## item group1 vars n mean sd
## X11 1 Compact Cars 1 4739 27.80629 5.3742223
## X12 2 Large Cars 1 1533 23.79713 5.8587545
## X13 3 Midsize-Large Station Wagons 1 627 24.06539 2.8007071
## X14 4 Midsize Cars 1 3621 25.87793 5.7437943
## X15 5 Midsize Station Wagons 1 415 25.23373 3.2445896
## X16 6 Minicompact Cars 1 1080 25.53611 6.6785446
## X17 7 Minivan - 2WD 1 308 23.26948 2.1160417
## X18 8 Minivan - 4WD 1 44 21.40909 0.6220066
## X19 9 Small Pickup Trucks 1 538 23.25836 2.9207686
## X110 10 Small Pickup Trucks 2WD 1 392 23.33163 3.3784664
## X111 11 Small Pickup Trucks 4WD 1 181 20.49724 2.0128043
## X112 12 Small Sport Utility Vehicle 2WD 1 169 28.63905 7.0613731
## X113 13 Small Sport Utility Vehicle 4WD 1 213 25.70892 3.2577578
## X114 14 Small Station Wagons 1 1295 28.03320 5.0950450
## X115 15 Special Purpose Vehicle 1 1 24.00000 NA
## X116 16 Special Purpose Vehicle 2WD 1 553 20.35443 4.7747319
## X117 17 Special Purpose Vehicle 4WD 1 289 19.01038 3.6633360
## X118 18 Special Purpose Vehicles 1 1453 19.19270 3.6629749
## X119 19 Special Purpose Vehicles/2wd 1 2 21.50000 2.1213203
## X120 20 Special Purpose Vehicles/4wd 1 2 19.00000 0.0000000
## X121 21 Sport Utility Vehicle - 2WD 1 1626 22.43050 4.6189341
## X122 22 Sport Utility Vehicle - 4WD 1 2091 20.34099 3.2685157
## X123 23 Standard Pickup Trucks 1 2354 17.67077 2.9237257
## X124 24 Standard Pickup Trucks 2WD 1 1106 19.49819 3.5966923
## X125 25 Standard Pickup Trucks 4WD 1 910 17.80879 2.3033743
## X126 26 Standard Pickup Trucks/2wd 1 4 15.50000 1.7320508
## X127 27 Standard Sport Utility Vehicle 2WD 1 76 21.92105 2.9383132
## X128 28 Standard Sport Utility Vehicle 4WD 1 171 21.29240 3.3862873
## X129 29 Subcompact Cars 1 4185 27.04946 6.1803821
## X130 30 Two Seaters 1 1602 24.11548 7.7441778
## X131 31 Vans 1 1141 17.52585 3.0771494
## X132 32 Vans Passenger 1 2 17.00000 4.2426407
## X133 33 Vans, Cargo Type 1 434 16.98157 1.9425065
## X134 34 Vans, Passenger Type 1 285 16.44211 1.8176509
## median trimmed mad min max range skew kurtosis
## X11 28.0 27.54759 4.4478 9 99 90 1.864748658 19.70937325
## X12 24.0 23.63325 2.9652 9 97 88 6.487652731 76.60116407
## X13 24.0 23.94632 2.9652 16 42 26 1.042486462 4.45450840
## X14 25.0 25.56127 2.9652 9 102 93 3.125495438 34.58746436
## X15 25.0 25.09910 1.4826 16 45 29 1.252870742 6.45447188
## X16 24.5 24.94213 3.7065 13 108 95 5.472344616 60.55085653
## X17 23.0 23.21774 1.4826 17 33 16 0.400123259 3.39041458
## X18 21.0 21.30556 0.0000 21 23 2 1.187151768 0.26336372
## X19 23.0 23.01620 1.4826 16 35 19 0.958653423 1.91140748
## X110 23.0 22.95860 2.9652 18 58 40 3.274720396 27.43845398
## X111 20.0 20.43448 1.4826 15 28 13 0.536534001 1.00757965
## X112 28.0 27.68613 2.9652 20 74 54 4.694520135 25.56120449
## X113 25.0 25.68421 2.9652 18 34 16 0.149096235 -0.49055044
## X114 28.0 27.74156 4.4478 18 105 87 5.619948715 79.16111739
## X115 24.0 24.00000 0.0000 24 24 0 NA NA
## X116 21.0 20.39052 2.9652 10 62 52 2.083910344 19.35313512
## X117 19.0 19.16309 4.4478 10 27 17 -0.273043771 -0.61598240
## X118 20.0 19.21152 4.4478 10 32 22 -0.003126172 -0.38650407
## X119 21.5 21.50000 2.2239 20 23 3 0.000000000 -2.75000000
## X120 19.0 19.00000 0.0000 19 19 0 NaN NaN
## X121 22.0 22.06298 4.4478 14 74 60 3.634903445 32.40787061
## X122 20.0 20.21040 2.9652 13 30 17 0.335301371 -0.53209471
## X123 17.0 17.62951 2.9652 10 29 19 0.184168338 -0.15364200
## X124 19.0 19.39278 2.9652 11 54 43 3.280254686 29.54041933
## X125 18.0 17.82555 1.4826 10 27 17 -0.078159454 0.89907800
## X126 15.5 15.50000 2.2239 14 17 3 0.000000000 -2.43750000
## X127 22.0 21.82258 2.9652 16 30 14 0.253837131 -0.15761576
## X128 21.0 21.06569 2.9652 14 31 17 0.413515982 -0.03288392
## X129 26.0 26.70976 4.4478 9 109 100 2.490866017 25.20129102
## X130 23.0 23.39704 4.4478 10 93 83 2.967720115 19.66164007
## X131 17.0 17.49507 2.9652 10 26 16 0.195127720 -0.24248962
## X132 17.0 17.00000 4.4478 14 20 6 0.000000000 -2.75000000
## X133 17.0 16.95690 1.4826 10 25 15 0.076814340 1.77799441
## X134 16.0 16.45852 1.4826 10 22 12 -0.152558046 2.46865747
## se
## X11 0.07806786
## X12 0.14963536
## X13 0.11184947
## X14 0.09545191
## X15 0.15927064
## X16 0.20322164
## X17 0.12057266
## X18 0.09377102
## X19 0.12592327
## X110 0.17063832
## X111 0.14961057
## X112 0.54318254
## X113 0.22321789
## X114 0.14158366
## X115 NA
## X116 0.20304225
## X117 0.21549035
## X118 0.09609507
## X119 1.50000000
## X120 0.00000000
## X121 0.11454641
## X122 0.07147819
## X123 0.06026059
## X124 0.10814980
## X125 0.07635611
## X126 0.86602540
## X127 0.33704763
## X128 0.25895586
## X129 0.09553612
## X130 0.19348356
## X131 0.09109735
## X132 3.00000000
## X133 0.09324330
## X134 0.10766835
# Histograms of highway and city mileage. Columns are referenced by bare name
# inside aes() so ggplot2 looks them up in `vehiclesDF` (and labels the axis
# "hwy"/"cty" instead of "vehiclesDF$hwy"). bins = 30 spells out the
# geom_histogram() default so the binning choice is explicit.
ggplot(vehiclesDF, aes(x = hwy)) + geom_histogram(bins = 30)
ggplot(vehiclesDF, aes(x = cty)) + geom_histogram(bins = 30)
Does the number of cylinders in a vehicle have a direct impact on the mileage that the car gives?
I will use more ggplot2 charting techniques to visualize how one variable affects another. I plot how the number of cylinders affects MPG using a scatter plot overlaid with a linear best-fit line.
# Scatter plot of highway MPG against cylinder count for the whole data set,
# overlaid with a linear best-fit line.
ggplot(vehiclesDF, aes(cyl, hwy)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    x = "Cylinders",
    y = "MPG",
    title = "Highway MPG vs. Cylinders: Entire Sample"
  )
## Warning: Removed 58 rows containing non-finite values (stat_smooth).
## Warning: Removed 58 rows containing missing values (geom_point).
As we can see, the number of cylinders has an inverse relationship with highway mileage, i.e. as the number of cylinders in a vehicle goes up, the mileage comes down.
In the same way, I plot the graph for city mileage.
# Scatter plot of city MPG against cylinder count for the whole data set,
# overlaid with a linear best-fit line.
ggplot(vehiclesDF, aes(cyl, cty)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    x = "Cylinders",
    y = "MPG",
    title = "City MPG vs. Cylinders: Entire Sample"
  )
## Warning: Removed 58 rows containing non-finite values (stat_smooth).
## Warning: Removed 58 rows containing missing values (geom_point).
City mileage shows the same negative relationship with the number of cylinders: miles per gallon drops as the number of cylinders goes up.
Within a fixed set of transmission types, does the mileage still vary with the number of cylinders?
# Number of vehicles for each cylinder count.
table(vehiclesDF[["cyl"]])
##
## 2 3 4 5 6 8 10 12 16
## 45 182 12381 718 11885 7550 138 478 7
# Keep only the vehicles whose transmission is one of four selected types,
# then tabulate the cylinder counts that remain.
vehiclesDFNew <- vehiclesDF[
  is.element(
    vehiclesDF$trans,
    c("Automatic (S6)", "Manual 5-spd", "Automatic 5-spd", "Automatic 4-spd")
  ),
]
# Disabled: further restrict the subset to 4-, 6- and 8-cylinder vehicles.
# vehiclesDFNew = vehiclesDFNew[vehiclesDFNew$cyl %in% c('6', '8', '4'),]
table(vehiclesDFNew[["cyl"]])
##
## 2 3 4 5 6 8 10 12 16
## 35 104 8463 514 8263 5016 34 299 2
# Disabled: restrict the subset to 6-cylinder vehicles only.
# vehiclesDFNew = vehiclesDFNew[vehiclesDFNew$cyl %in% c(6),]
# Show the first six rows of the transmission-restricted subset.
head(vehiclesDFNew, n = 6L)
## id make model year class trans
## 7 3347 ASC Incorporated GNX 1987 Midsize Cars Automatic 4-spd
## 8 13309 Acura 2.2CL/3.0CL 1997 Subcompact Cars Automatic 4-spd
## 9 13310 Acura 2.2CL/3.0CL 1997 Subcompact Cars Manual 5-spd
## 10 13311 Acura 2.2CL/3.0CL 1997 Subcompact Cars Automatic 4-spd
## 11 14038 Acura 2.3CL/3.0CL 1998 Subcompact Cars Automatic 4-spd
## 12 14039 Acura 2.3CL/3.0CL 1998 Subcompact Cars Manual 5-spd
## drive cyl displ fuel hwy cty
## 7 Rear-Wheel Drive 6 3.8 Premium 21 14
## 8 Front-Wheel Drive 4 2.2 Regular 26 20
## 9 Front-Wheel Drive 4 2.2 Regular 28 22
## 10 Front-Wheel Drive 6 3.0 Regular 26 18
## 11 Front-Wheel Drive 4 2.3 Regular 27 19
## 12 Front-Wheel Drive 4 2.3 Regular 29 21
# Row count of the transmission-restricted subset.
dim(vehiclesDFNew)[1L]
## [1] 22733
# Simple linear regression on the transmission-restricted subset.
# NOTE(review): the formula has cyl as the response and cty as the predictor,
# while the accompanying plot puts cyl on x and cty on y - confirm the
# intended direction before interpreting the coefficients.
fit <- lm(formula = cyl ~ cty, data = vehiclesDFNew)
summary(fit)
##
## Call:
## lm(formula = cyl ~ cty, data = vehiclesDFNew)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.3749 -0.6711 -0.0787 0.5872 7.5516
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.818062 0.032114 336.9 <2e-16 ***
## cty -0.296210 0.001819 -162.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.157 on 22728 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.5385, Adjusted R-squared: 0.5385
## F-statistic: 2.652e+04 on 1 and 22728 DF, p-value: < 2.2e-16
# Within the restricted set of transmission types, how does city mileage vary
# with the number of cylinders? Scatter plot with a linear best-fit line.
# The x axis is the cylinder count, so it is labelled accordingly.
ggplot(data = vehiclesDFNew, aes(x = cyl, y = cty)) +
  geom_point() +
  geom_smooth(method = "lm") +
  xlab("Cylinders") +
  ylab("MPG") +
  ggtitle("City MPG vs. Cylinders: Restricted Transmission Types")
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Warning: Removed 3 rows containing missing values (geom_point).
# Simple linear regression on the transmission-restricted subset.
# NOTE(review): the formula has cyl as the response and hwy as the predictor,
# while the accompanying plot puts cyl on x and hwy on y - confirm the
# intended direction before interpreting the coefficients.
fit <- lm(formula = cyl ~ hwy, data = vehiclesDFNew)
summary(fit)
##
## Call:
## lm(formula = cyl ~ hwy, data = vehiclesDFNew)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.4269 -0.8416 -0.0122 0.8171 8.3291
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.573598 0.036612 288.8 <2e-16 ***
## hwy -0.207334 0.001529 -135.6 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.266 on 22728 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.4474, Adjusted R-squared: 0.4473
## F-statistic: 1.84e+04 on 1 and 22728 DF, p-value: < 2.2e-16
# Within the restricted set of transmission types, how does highway mileage
# vary with the number of cylinders? Scatter plot with a linear best-fit line.
# The x axis is the cylinder count and the y axis is hwy, so the axis label
# and the title name cylinders and highway MPG respectively.
ggplot(data = vehiclesDFNew, aes(x = cyl, y = hwy)) +
  geom_point() +
  geom_smooth(method = "lm") +
  xlab("Cylinders") +
  ylab("MPG") +
  ggtitle("Highway MPG vs. Cylinders: Restricted Transmission Types")
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Warning: Removed 3 rows containing missing values (geom_point).
For the same number of cylinders, the transmission type has an impact on the mileage. It appears that mileage is better with a manual transmission and with fewer cylinders.
Does the mileage vary between manufacturers for the same number of cylinders, displacement and drive type?
# Restrict the data to 6-cylinder, 4.3-displacement, rear-wheel-drive
# vehicles in a single pass, then count the rows per manufacturer.
# The cyl and displ targets are kept as strings, as in the earlier version of
# this filter; %in% presumably coerces the numeric columns for the comparison.
vehiclesDFMfgr <- vehiclesDF[
  vehiclesDF$cyl %in% c("6") &
    vehiclesDF$displ %in% c("4.3") &
    vehiclesDF$drive %in% c("Rear-Wheel Drive"),
]
table(vehiclesDFMfgr[["make"]])
##
## Buick Cadillac Chevrolet GMC Isuzu Oldsmobile
## 4 1 385 361 8 1
## Pontiac
## 12
# Show the first six rows of the manufacturer comparison subset.
head(vehiclesDFMfgr, n = 6L)
## id make model year class
## 3011 1332 Buick Regal 1985 Midsize Cars
## 3014 426 Buick Regal 1985 Midsize Cars
## 3015 427 Buick Regal 1985 Midsize Cars
## 3016 428 Buick Regal 1985 Midsize Cars
## 3567 28106 Cadillac Fleetwood/DeVille (FWD) 1984 Large Cars
## 3707 1568 Chevrolet Astro 2WD (cargo) 1985 Vans
## trans drive cyl displ fuel hwy cty
## 3011 Automatic 4-spd Rear-Wheel Drive 6 4.3 Regular 24 16
## 3014 Automatic 3-spd Rear-Wheel Drive 6 4.3 Regular 21 16
## 3015 Automatic 4-spd Rear-Wheel Drive 6 4.3 Regular 24 16
## 3016 Automatic 3-spd Rear-Wheel Drive 6 4.3 Diesel 29 20
## 3567 Automatic 4-spd Rear-Wheel Drive 6 4.3 Diesel 31 21
## 3707 Automatic 4-spd Rear-Wheel Drive 6 4.3 Regular 20 15
# Before the regression analysis, give each manufacturer a key. The keys are
# stored as strings in a new makeKey column; any manufacturer not listed here
# keeps its name unchanged.
#   1 - Buick
#   2 - Cadillac
#   3 - Chevrolet
#   4 - GMC
#   5 - Isuzu
#   6 - Oldsmobile
#   7 - Pontiac
vehiclesDFMfgr$makeKey <- local({
  codes <- c(
    Buick = "1",
    Cadillac = "2",
    Chevrolet = "3",
    GMC = "4",
    Isuzu = "5",
    Oldsmobile = "6",
    Pontiac = "7"
  )
  key <- vehiclesDFMfgr$make
  recognised <- key %in% names(codes)
  # Look each recognised name up in the table; unname() keeps the column plain.
  key[recognised] <- unname(codes[key[recognised]])
  key
})
# Linear regression of the manufacturer key on city mileage.
# NOTE(review): makeKey holds arbitrary manufacturer codes stored as strings
# ('1'..'7'); using a nominal code as a numeric response is questionable -
# confirm whether a model of cty by make was intended instead.
fit <- lm(formula = makeKey ~ cty, data = vehiclesDFMfgr)
summary(fit)
##
## Call:
## lm(formula = makeKey ~ cty, data = vehiclesDFMfgr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7035 -0.5470 -0.4844 0.4530 3.4530
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.07759 0.41555 7.406 3.42e-13 ***
## cty 0.03129 0.02790 1.121 0.262
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7052 on 770 degrees of freedom
## Multiple R-squared: 0.001631, Adjusted R-squared: 0.0003341
## F-statistic: 1.258 on 1 and 770 DF, p-value: 0.2624
# City MPG for each manufacturer key within the restricted subset.
# NOTE(review): makeKey is a string code, so the x axis is discrete and shows
# the keys rather than manufacturer names; confirm the lm smoother adds
# anything on a discrete axis.
ggplot(vehiclesDFMfgr, aes(makeKey, cty)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    x = "Manufacturer",
    y = "MPG",
    title = "City MPG vs. Make: Restricted Cylinders/Displacement/Drive Types"
  )
It appears that even though the cylinders, displacement and drive type are the same for a given manufacturer, there is still variance in mileage because of other factors such as vehicle class, transmission and model year.