# Load the fuel economy data (FuelEconomy.gov data packaged in the
# hadley/fueleconomy GitHub repository).
#
# requireNamespace() only checks whether a package is installed; it does not
# attach it. Each install step therefore runs only when the package is
# actually missing, so the script does not need network access on every run.
# devtools is used solely through the devtools:: prefix, so it does not need
# to be attached.
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}
if (!requireNamespace("fueleconomy", quietly = TRUE)) {
  devtools::install_github("hadley/fueleconomy")
}
library(fueleconomy)
# typeof(fueleconomy::vehicles)
# Report the size and the column names of the raw vehicles table.
cat(
  "List row number is", nrow(fueleconomy::vehicles),
  ", column number is", ncol(fueleconomy::vehicles), "\n"
)
## List row number is 33442 , column number is 12
cat("Columns are:", colnames(fueleconomy::vehicles), "\n")
## Columns are: id make model year class trans drive cyl displ fuel hwy cty
cat("Convert to data frame... \n")
## Convert to data frame...
# Build a plain data frame from the vehicles table. with() resolves the bare
# column names against fueleconomy::vehicles, so the table is named once.
# - id, year, cyl, hwy and cty are coerced to numeric.
# - displ is stored as text; the subset filter later in this script matches
#   it against the string "3.5".
# - stringsAsFactors = TRUE is set explicitly: the summary() output recorded
#   further down shows per-level counts, which requires factor columns, and
#   data.frame() no longer converts strings to factors by default since
#   R 4.0.0.
df_full <- with(
  fueleconomy::vehicles,
  data.frame(
    vehicle_id = as.numeric(id),
    make = make,
    model = model,
    year = as.numeric(year),
    class = class,
    trans = trans,
    drive = drive,
    cyl = as.numeric(cyl),
    displ = as.character(displ),
    fuel = fuel,
    hwy = as.numeric(hwy),
    cty = as.numeric(cty),
    stringsAsFactors = TRUE
  )
)
cat(
  "Data frame row number is", nrow(df_full),
  ", column number is", ncol(df_full), "\n"
)
## Data frame row number is 33442 , column number is 12
# Narrow the data to a single comparable group of vehicles: front-wheel-drive
# large cars that run on regular fuel and have a 6-cylinder, 3.5 displacement
# engine (displ is matched as the string "3.5").
# summary(df_full)
is_target <- with(
  df_full,
  drive == "Front-Wheel Drive" & fuel == "Regular" & class == "Large Cars" &
    cyl == 6 & displ == "3.5"
)
# which() keeps only rows where the condition is TRUE, so rows where the
# condition evaluates to NA are dropped rather than returned as NA rows.
df <- df_full[which(is_target), , drop = FALSE]
# head(df)
cat(
  "New Data frame row number is", nrow(df),
  ", column number is", ncol(df), "\n"
)
## New Data frame row number is 81 , column number is 12
summary(df)
## vehicle_id make model year
## Min. :10113 Chrysler :21 Intrepid :15 Min. :1993
## 1st Qu.:13627 Dodge :15 Taurus FWD:10 1st Qu.:1997
## Median :19839 Ford :10 Avalon : 8 Median :2004
## Mean :20309 Chevrolet: 8 Vision : 7 Mean :2003
## 3rd Qu.:26004 Toyota : 8 Concorde : 6 3rd Qu.:2009
## Max. :33682 Eagle : 7 300 M : 5 Max. :2014
## (Other) :12 (Other) :30
## class trans
## Large Cars :81 Automatic 4-spd:45
## Compact Cars : 0 Automatic (S6) :11
## Midsize-Large Station Wagons: 0 Automatic 5-spd: 8
## Midsize Cars : 0 Automatic 6-spd: 8
## Midsize Station Wagons : 0 Automatic (S4) : 6
## Minicompact Cars : 0 Automatic (S5) : 3
## (Other) : 0 (Other) : 0
## drive cyl displ
## 2-Wheel Drive : 0 Min. :6 3.5 :81
## 4-Wheel Drive : 0 1st Qu.:6 0 : 0
## 4-Wheel or All-Wheel Drive: 0 Median :6 1 : 0
## All-Wheel Drive : 0 Mean :6 1.1 : 0
## Front-Wheel Drive :81 3rd Qu.:6 1.2 : 0
## Part-time 4-Wheel Drive : 0 Max. :6 1.3 : 0
## Rear-Wheel Drive : 0 (Other): 0
## fuel hwy cty
## Regular :81 Min. :23.00 Min. :15.00
## CNG : 0 1st Qu.:24.00 1st Qu.:16.00
## Diesel : 0 Median :25.00 Median :16.00
## Electricity : 0 Mean :25.83 Mean :17.11
## Gasoline or E85 : 0 3rd Qu.:28.00 3rd Qu.:18.00
## Gasoline or natural gas: 0 Max. :30.00 Max. :20.00
## (Other) : 0
What are the cases, and how many are there?
# Report how many cases there are and what one case represents. The drive
# type named in the message matches the subset filter applied to df_full
# ("Front-Wheel Drive"; "2-Wheel Drive" is a different level of drive).
cat(
  "There are", nrow(df),
  "cases, each case is particular vehicle, Front-Wheel Drive, Large Cars, Regular fuel. \n"
)
## There are 81 cases, each case is particular vehicle, Front-Wheel Drive, Large Cars, Regular fuel.
Describe the method of data collection. The data are fuel economy records from the EPA, covering 1985-2015.
What type of study is this (observational/experiment)?
This is an observational study.
The data are EPA fuel economy records for 1985-2015, conveniently packaged for consumption by R users. Source: https://github.com/hadley/fueleconomy — load the data with this command: devtools::install_github("hadley/fueleconomy")
What is the response variable, and what type is it (numerical/categorical)?
The response variable is the model year, which is numerical.
What is the explanatory variable, and what type is it (numerical/categorical)?
The explanatory variable is highway fuel economy (in mpg), which is numerical.
Provide summary statistics relevant to your research question. For example, if you’re comparing means across groups provide means, SDs, sample sizes of each group. This step requires the use of R, hence a code chunk is provided below. Insert more code chunks as needed.
# Summary statistics for the two variables of interest, followed by a plot
# of highway fuel economy against model year.
library(psych)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
describe(df$year)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 81 2003.31 6.16 2004 2003.38 7.41 1993 2014 21 -0.12 -1.32 0.68
describe(df$hwy)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 81 25.83 2.15 25 25.68 1.48 23 30 7 0.42 -1.53 0.24
# Inside aes() the columns are referenced by bare name so ggplot2 looks them
# up in the `data` argument; writing df$year there bypasses that lookup and
# labels the axes "df$year" / "df$hwy".
ggplot(df, aes(x = year, y = hwy)) +
  geom_line()