Load the Car Price Data Set Into R
#load tidyverse library
#use read_csv to load data
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.4.3
## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1 v purrr 0.2.4
## v tibble 1.4.1 v dplyr 0.7.4
## v tidyr 0.7.2 v stringr 1.2.0
## v readr 1.1.1 v forcats 0.2.0
## Warning: package 'tibble' was built under R version 3.4.3
## Warning: package 'tidyr' was built under R version 3.4.3
## Warning: package 'readr' was built under R version 3.4.2
## Warning: package 'purrr' was built under R version 3.4.3
## Warning: package 'dplyr' was built under R version 3.4.2
## Warning: package 'forcats' was built under R version 3.4.3
## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Read the car price CSV into a tibble. The file's first column has no header,
# so readr fills in the name X1 for it.
carprice <- readr::read_csv(
  file = "C:\\Users\\bkl2001\\Documents\\Personal\\CUNY\\Classes\\Winter Bridge\\carprice.csv"
)
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_integer(),
## Type = col_character(),
## Min.Price = col_double(),
## Price = col_double(),
## Max.Price = col_double(),
## Range.Price = col_double(),
## RoughRange = col_double(),
## gpm100 = col_double(),
## MPG.city = col_integer(),
## MPG.highway = col_integer()
## )
Take a look at the data
# Print a compact, one-line-per-column preview of the raw data
carprice %>%
  glimpse()
## Observations: 48
## Variables: 10
## $ X1 <int> 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19...
## $ Type <chr> "Midsize", "Large", "Large", "Midsize", "Large", "...
## $ Min.Price <dbl> 14.2, 19.9, 22.6, 26.3, 33.0, 37.5, 8.5, 11.4, 13....
## $ Price <dbl> 15.7, 20.8, 23.7, 26.3, 34.7, 40.1, 13.4, 11.4, 15...
## $ Max.Price <dbl> 17.3, 21.7, 24.9, 26.3, 36.3, 42.7, 18.3, 11.4, 16...
## $ Range.Price <dbl> 3.1, 1.8, 2.3, 0.0, 3.3, 5.2, 9.8, 0.0, 3.4, 5.0, ...
## $ RoughRange <dbl> 3.09, 1.79, 2.31, -0.01, 3.30, 5.18, 9.80, -0.01, ...
## $ gpm100 <dbl> 3.8, 4.2, 4.9, 4.3, 4.9, 4.9, 3.3, 3.4, 4.2, 4.0, ...
## $ MPG.city <int> 22, 19, 16, 19, 16, 16, 25, 25, 19, 21, 18, 15, 17...
## $ MPG.highway <int> 31, 28, 25, 27, 25, 25, 36, 34, 28, 29, 23, 20, 26...
# List the column (variable) names of the raw data
colnames(carprice)
## [1] "X1" "Type" "Min.Price" "Price" "Max.Price"
## [6] "Range.Price" "RoughRange" "gpm100" "MPG.city" "MPG.highway"
Based on the output of glimpse(), the X1 column appears to be only a row index (it is the unnamed column that read_csv filled in) and does not provide anything substantial. The next step is to create a subset of this tibble without X1 and rename the remaining variables.
# Build carNew: every column of carprice except X1, under shorter camelCase
# names. with() evaluates the column names inside carprice, so the
# carprice$ prefix is not repeated on every line.
carNew <- with(
  carprice,
  tibble(
    type      = Type,
    minPrice  = Min.Price,
    price     = Price,
    maxPrice  = Max.Price,
    range     = Range.Price,
    rangePlus = RoughRange,
    galReq100 = gpm100,
    mpgCity   = MPG.city,
    mpgHwy    = MPG.highway
  )
)
# Preview the renamed subset to confirm the new column names and types
carNew %>%
  glimpse()
## Observations: 48
## Variables: 9
## $ type <chr> "Midsize", "Large", "Large", "Midsize", "Large", "Mi...
## $ minPrice <dbl> 14.2, 19.9, 22.6, 26.3, 33.0, 37.5, 8.5, 11.4, 13.4,...
## $ price <dbl> 15.7, 20.8, 23.7, 26.3, 34.7, 40.1, 13.4, 11.4, 15.1...
## $ maxPrice <dbl> 17.3, 21.7, 24.9, 26.3, 36.3, 42.7, 18.3, 11.4, 16.8...
## $ range <dbl> 3.1, 1.8, 2.3, 0.0, 3.3, 5.2, 9.8, 0.0, 3.4, 5.0, 3....
## $ rangePlus <dbl> 3.09, 1.79, 2.31, -0.01, 3.30, 5.18, 9.80, -0.01, 3....
## $ galReq100 <dbl> 3.8, 4.2, 4.9, 4.3, 4.9, 4.9, 3.3, 3.4, 4.2, 4.0, 4....
## $ mpgCity <int> 22, 19, 16, 19, 16, 16, 25, 25, 19, 21, 18, 15, 17, ...
## $ mpgHwy <int> 31, 28, 25, 27, 25, 25, 36, 34, 28, 29, 23, 20, 26, ...
We now have an updated tibble with 9 variables and 48 observations. Next we will run the summary function to help decide which direction the analysis should take.
# Five-number summary plus mean for every numeric column of carNew
carNew %>%
  summary()
## type minPrice price maxPrice
## Length:48 Min. : 6.90 Min. : 7.40 Min. : 7.90
## Class :character 1st Qu.:11.40 1st Qu.:13.47 1st Qu.:14.97
## Mode :character Median :14.50 Median :16.30 Median :18.40
## Mean :16.54 Mean :18.57 Mean :20.63
## 3rd Qu.:19.43 3rd Qu.:20.73 3rd Qu.:24.50
## Max. :37.50 Max. :40.10 Max. :42.70
## range rangePlus galReq100 mpgCity
## Min. : 0.000 Min. :-0.020 Min. :2.800 Min. :15.00
## 1st Qu.: 1.700 1st Qu.: 1.705 1st Qu.:3.800 1st Qu.:18.00
## Median : 3.300 Median : 3.305 Median :4.200 Median :20.00
## Mean : 4.092 Mean : 4.089 Mean :4.167 Mean :20.96
## 3rd Qu.: 5.850 3rd Qu.: 5.853 3rd Qu.:4.550 3rd Qu.:23.00
## Max. :14.600 Max. :14.600 Max. :5.700 Max. :31.00
## mpgHwy
## Min. :20.00
## 1st Qu.:26.00
## Median :28.00
## Mean :28.15
## 3rd Qu.:30.00
## Max. :41.00
Based on the summary output, the median (20.00) and mean (20.96) of mpgCity are very close. With many drivers concerned about increasing gas prices, we will explore how city mileage differs for each type of car.
# Boxplots of city mileage (mpgCity), one box per vehicle type
ggplot(carNew, aes(x = type, y = mpgCity)) +
  geom_boxplot()
Also, with 8 out of 9 variables being numeric, the next step will be to see how many unique values we have in the type variable.
# List the distinct vehicle types; unique() drops duplicate elements
unique(carNew[["type"]])
## [1] "Midsize" "Large" "Compact" "Sporty" "Van" "Small"
The next step is to count how many cars are in each group.
# Frequency count of cars per vehicle type
table(carNew[["type"]])
##
## Compact Large Midsize Small Sporty Van
## 7 11 10 7 8 5
Before delving into specific groups, take a look at the distribution of the mpgCity variable.
# Histogram of city mileage across all cars in carNew
hist(
  carNew$mpgCity,
  breaks = 10,
  col = "beige",
  main = "Histogram of Miles Per Gallon (City)",
  xlab = "miles per gallon"
)
Subset data based on types
# Build one tibble per vehicle type, keeping only the fuel-economy and price
# columns. The filter/select logic lives in a single helper so the six subsets
# cannot drift apart. The dplyr verbs are namespace-qualified so that a later
# library() call masking filter() or select() cannot change the result.
#
# car_type: a single character value matching an entry of carNew$type.
# Returns the rows of carNew of that type with columns
# type, galReq100, mpgCity, mpgHwy and price.
subset_by_type <- function(car_type) {
  carNew %>%
    dplyr::filter(type == car_type) %>%
    dplyr::select(type, galReq100, mpgCity, mpgHwy, price)
}

carCompact <- subset_by_type("Compact")
carLarge   <- subset_by_type("Large")
carMidsize <- subset_by_type("Midsize")
carSmall   <- subset_by_type("Small")
carSporty  <- subset_by_type("Sporty")
carVan     <- subset_by_type("Van")
Run summary statistics on all subsets
# Summary statistics for compact cars
carCompact %>%
  summary()
## type galReq100 mpgCity mpgHwy
## Length:7 Min. :3.300 Min. :22.00 Min. :27.00
## Class :character 1st Qu.:3.500 1st Qu.:22.50 1st Qu.:27.50
## Mode :character Median :3.700 Median :23.00 Median :31.00
## Mean :3.729 Mean :23.43 Mean :30.57
## 3rd Qu.:4.000 3rd Qu.:24.50 3rd Qu.:32.50
## Max. :4.100 Max. :25.00 Max. :36.00
## price
## Min. :11.10
## 1st Qu.:11.35
## Median :13.30
## Mean :12.83
## 3rd Qu.:13.45
## Max. :15.80
# Summary statistics for large cars
carLarge %>%
  summary()
## type galReq100 mpgCity mpgHwy
## Length:11 Min. :4.200 Min. :16.00 Min. :25.00
## Class :character 1st Qu.:4.200 1st Qu.:17.50 1st Qu.:26.00
## Mode :character Median :4.300 Median :19.00 Median :26.00
## Mean :4.436 Mean :18.36 Mean :26.73
## 3rd Qu.:4.600 3rd Qu.:19.50 3rd Qu.:28.00
## Max. :4.900 Max. :20.00 Max. :28.00
## price
## Min. :18.40
## 1st Qu.:20.00
## Median :20.90
## Mean :24.30
## 3rd Qu.:26.95
## Max. :36.10
# Summary statistics for midsize cars
carMidsize %>%
  summary()
## type galReq100 mpgCity mpgHwy
## Length:10 Min. :3.700 Min. :16.0 Min. :25.00
## Class :character 1st Qu.:3.925 1st Qu.:19.0 1st Qu.:26.25
## Mode :character Median :4.250 Median :20.0 Median :27.00
## Mean :4.220 Mean :19.8 Mean :27.90
## 3rd Qu.:4.375 3rd Qu.:21.0 3rd Qu.:29.75
## Max. :4.900 Max. :23.0 Max. :31.00
## price
## Min. :14.90
## 1st Qu.:15.75
## Median :17.40
## Mean :21.78
## 3rd Qu.:24.77
## Max. :40.10
# Summary statistics for small cars
carSmall %>%
  summary()
## type galReq100 mpgCity mpgHwy
## Length:7 Min. :2.800 Min. :23.00 Min. :29.00
## Class :character 1st Qu.:3.050 1st Qu.:25.50 1st Qu.:31.50
## Mode :character Median :3.200 Median :29.00 Median :33.00
## Mean :3.271 Mean :27.71 Mean :33.86
## 3rd Qu.:3.500 3rd Qu.:30.00 3rd Qu.:35.50
## Max. :3.800 Max. :31.00 Max. :41.00
## price
## Min. : 7.40
## 1st Qu.: 9.10
## Median :10.10
## Mean :10.04
## 3rd Qu.:11.20
## Max. :12.20
# Summary statistics for sporty cars
carSporty %>%
  summary()
## type galReq100 mpgCity mpgHwy
## Length:8 Min. :3.700 Min. :17.00 Min. :24.00
## Class :character 1st Qu.:3.875 1st Qu.:18.75 1st Qu.:25.75
## Mode :character Median :4.150 Median :20.50 Median :28.00
## Mean :4.188 Mean :20.62 Mean :27.50
## 3rd Qu.:4.350 3rd Qu.:23.00 3rd Qu.:29.25
## Max. :4.800 Max. :24.00 Max. :30.00
## price
## Min. :14.00
## 1st Qu.:14.32
## Median :15.50
## Mean :19.38
## 3rd Qu.:19.73
## Max. :38.00
# Summary statistics for vans
carVan %>%
  summary()
## type galReq100 mpgCity mpgHwy
## Length:5 Min. :4.9 Min. :15.0 Min. :20.0
## Class :character 1st Qu.:4.9 1st Qu.:15.0 1st Qu.:20.0
## Mode :character Median :5.3 Median :17.0 Median :21.0
## Mean :5.3 Mean :16.6 Mean :21.4
## 3rd Qu.:5.7 3rd Qu.:18.0 3rd Qu.:23.0
## Max. :5.7 Max. :18.0 Max. :23.0
## price
## Min. :16.30
## 1st Qu.:16.60
## Median :19.00
## Mean :18.26
## 3rd Qu.:19.50
## Max. :19.90
Based on the summary statistics, we can see that larger vehicle types burn more gas to travel 100 miles. For example, a van takes 5.3 gallons on average to travel 100 miles, and its maximum fuel economy is 18 miles per gallon in the city and 23 on the highway.
We will compare the city mileage of compact, midsize, and small cars with boxplots.
# Side-by-side boxplots of city mileage for compact, midsize and small cars.
# boxplot() has no `stats` argument (stats is a component of the value it
# returns), so the stray `stats = TRUE` is dropped. A title and a y-axis
# label are added so the plot is readable on its own, in line with the
# labelled histogram of mpgCity.
boxplot(
  carCompact$mpgCity, carMidsize$mpgCity, carSmall$mpgCity,
  names = c("Compact", "Midsize", "Small"),
  horizontal = FALSE,
  col = "yellow",
  main = "Miles Per Gallon (City) by Car Type",
  ylab = "miles per gallon"
)