DATA 606 Data Project Proposal

Data Preparation

# Make sure devtools is available; it is used to install the fuel economy
# data package from GitHub.
if (!require("devtools", character.only = TRUE)) {
  install.packages("devtools")
}

## Loading required package: devtools

## Loading required package: usethis

# Install the fueleconomy data package from its GitHub repository.
devtools::install_github("hadley/fueleconomy")

## Skipping install of 'fueleconomy' from a github remote, the SHA1 (d590bcf6) has not changed since last install.
##   Use `force = TRUE` to force installation

# Attach the fueleconomy package.
library(fueleconomy)

# Report the size of the raw vehicles table.
# typeof(fueleconomy::vehicles)
cat(
  "List row number is", nrow(fueleconomy::vehicles),
  ", column number is", ncol(fueleconomy::vehicles), "\n"
)

## List row number is 33442 , column number is 12

# List the column names of the raw vehicles table.
cat("Columns are:", names(fueleconomy::vehicles), "\n")

## Columns are: id make model year class trans drive cyl displ fuel hwy cty

# Source columns: id, make, model, year, class, trans, drive, cyl, displ,
# fuel, hwy, cty.
cat("Convert to data frame... \n")

## Convert to data frame...

# Copy the vehicles table into a plain data frame. The id column is renamed
# to vehicle_id; id, year, cyl, hwy and cty are coerced to numeric, and displ
# is stored as character rather than numeric.
df_full <- with(
  fueleconomy::vehicles,
  data.frame(
    vehicle_id = as.numeric(id),
    make = make,
    model = model,
    year = as.numeric(year),
    class = class,
    trans = trans,
    drive = drive,
    cyl = as.numeric(cyl),
    displ = as.character(displ),
    fuel = fuel,
    hwy = as.numeric(hwy),
    cty = as.numeric(cty)
  )
)
cat(
  "Data frame row number is", nrow(df_full),
  ", column number is", ncol(df_full), "\n"
)

## Data frame row number is 33442 , column number is 12

# We are interested only in front-wheel-drive large cars that run on regular
# fuel and have a 6-cylinder engine with a displ value of "3.5".
# which() drops rows where the condition evaluates to NA, as subset() does.
df <- df_full[
  which(with(
    df_full,
    drive == "Front-Wheel Drive" &
      fuel == "Regular" &
      class == "Large Cars" &
      cyl == 6 &
      displ == "3.5"
  )),
  ,
  drop = FALSE
]
cat(
  "New Data frame row number is", nrow(df),
  ", column number is", ncol(df), "\n"
)

## New Data frame row number is 81 , column number is 12

# Per-column summary of the filtered data.
summary(df)

##    vehicle_id           make           model         year     
##  Min.   :10113   Chrysler :21   Intrepid  :15   Min.   :1993  
##  1st Qu.:13627   Dodge    :15   Taurus FWD:10   1st Qu.:1997  
##  Median :19839   Ford     :10   Avalon    : 8   Median :2004  
##  Mean   :20309   Chevrolet: 8   Vision    : 7   Mean   :2003  
##  3rd Qu.:26004   Toyota   : 8   Concorde  : 6   3rd Qu.:2009  
##  Max.   :33682   Eagle    : 7   300 M     : 5   Max.   :2014  
##                  (Other)  :12   (Other)   :30                 
##                           class                trans   
##  Large Cars                  :81   Automatic 4-spd:45  
##  Compact Cars                : 0   Automatic (S6) :11  
##  Midsize-Large Station Wagons: 0   Automatic 5-spd: 8  
##  Midsize Cars                : 0   Automatic 6-spd: 8  
##  Midsize Station Wagons      : 0   Automatic (S4) : 6  
##  Minicompact Cars            : 0   Automatic (S5) : 3  
##  (Other)                     : 0   (Other)        : 0  
##                         drive         cyl        displ   
##  2-Wheel Drive             : 0   Min.   :6   3.5    :81  
##  4-Wheel Drive             : 0   1st Qu.:6   0      : 0  
##  4-Wheel or All-Wheel Drive: 0   Median :6   1      : 0  
##  All-Wheel Drive           : 0   Mean   :6   1.1    : 0  
##  Front-Wheel Drive         :81   3rd Qu.:6   1.2    : 0  
##  Part-time 4-Wheel Drive   : 0   Max.   :6   1.3    : 0  
##  Rear-Wheel Drive          : 0               (Other): 0  
##                       fuel         hwy             cty       
##  Regular                :81   Min.   :23.00   Min.   :15.00  
##  CNG                    : 0   1st Qu.:24.00   1st Qu.:16.00  
##  Diesel                 : 0   Median :25.00   Median :16.00  
##  Electricity            : 0   Mean   :25.83   Mean   :17.11  
##  Gasoline or E85        : 0   3rd Qu.:28.00   3rd Qu.:18.00  
##  Gasoline or natural gas: 0   Max.   :30.00   Max.   :20.00  
##  (Other)                : 0

Research question

Cases

What are the cases, and how many are there?

# Report the number of cases. The description mirrors the filter used to
# build df (Front-Wheel Drive, Large Cars, Regular fuel, cyl == 6,
# displ == "3.5"); the previous text incorrectly said "2-Wheel Drive".
cat(
  "There are", nrow(df),
  "cases; each case is a particular vehicle: Front-Wheel Drive, Large Cars, Regular fuel, 6 cylinders, 3.5 displacement. \n"
)

## There are 81 cases; each case is a particular vehicle: Front-Wheel Drive, Large Cars, Regular fuel, 6 cylinders, 3.5 displacement.

Data collection

Describe the method of data collection. The data are fuel economy records from the EPA, covering 1985-2015.

Type of study

What type of study is this (observational/experiment)?

This is an observational study.

Data Source

Fuel economy data from the EPA, 1985-2015, conveniently packaged for consumption by R users. URL: https://github.com/hadley/fueleconomy — install the data package with this command: devtools::install_github("hadley/fueleconomy")

Response

What is the response variable, and what type is it (numerical/categorical)?

The response variable is highway fuel economy, in mpg, and is numerical.

Explanatory

What is the explanatory variable, and what type is it (numerical/categorical)?

The explanatory variable is model year and is numerical.

Relevant summary statistics

Provide summary statistics relevant to your research question. For example, if you’re comparing means across groups provide means, SDs, sample sizes of each group. This step requires the use of R, hence a code chunk is provided below. Insert more code chunks as needed.

library(psych)
library(ggplot2)

## 
## Attaching package: 'ggplot2'

## The following objects are masked from 'package:psych':
## 
##     %+%, alpha

# Descriptive statistics for model year.
describe(df[["year"]])

##    vars  n    mean   sd median trimmed  mad  min  max range  skew kurtosis   se
## X1    1 81 2003.31 6.16   2004 2003.38 7.41 1993 2014    21 -0.12    -1.32 0.68

# Descriptive statistics for highway fuel economy.
describe(df[["hwy"]])

##    vars  n  mean   sd median trimmed  mad min max range skew kurtosis   se
## X1    1 81 25.83 2.15     25   25.68 1.48  23  30     7 0.42    -1.53 0.24

library(ggplot2)
# Plot highway fuel economy against model year. Columns are referenced by
# bare name inside aes() so they are looked up in the `data` argument;
# writing df$year / df$hwy bypasses that lookup and labels the axes
# "df$year" / "df$hwy".
ggplot(df, aes(x = year, y = hwy)) +
  geom_line()