Suppose that your boss gave you the task of reporting an exploratory data analysis, including visualizations and summary statistics, of wheel_base
(the wheelbase of each car, i.e. the distance between its front and rear axles) in the cars
data set.
library(ggplot2)
library(dplyr)
# Load data
# stringsAsFactors = TRUE is spelled out because the recorded str() and
# summary() output shows `name` as a factor; since R 4.0 read.csv() would
# otherwise read it as character and that output would no longer match.
cars <- read.csv(
  "/resources/rstudio/business statistics/data/cars.csv",
  stringsAsFactors = TRUE
)
# Compact overview of every column's type and first few values
str(cars)
## 'data.frame': 428 obs. of 19 variables:
## $ name : Factor w/ 425 levels "Acura 3.5 RL 4dr",..: 66 67 68 69 70 114 115 133 129 130 ...
## $ sports_car : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ suv : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ wagon : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ minivan : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ pickup : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ all_wheel : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ rear_wheel : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ msrp : int 11690 12585 14610 14810 16385 13670 15040 13270 13730 15460 ...
## $ dealer_cost: int 10965 11802 13697 13884 15357 12849 14086 12482 12906 14496 ...
## $ eng_size : num 1.6 1.6 2.2 2.2 2.2 2 2 2 2 2 ...
## $ ncyl : int 4 4 4 4 4 4 4 4 4 4 ...
## $ horsepwr : int 103 103 140 140 140 132 132 130 110 130 ...
## $ city_mpg : int 28 28 26 26 26 29 29 26 27 26 ...
## $ hwy_mpg : int 34 34 37 37 37 36 36 33 36 33 ...
## $ weight : int 2370 2348 2617 2676 2617 2581 2626 2612 2606 2606 ...
## $ wheel_base : int 98 98 104 104 104 105 105 103 103 103 ...
## $ length : int 167 153 183 183 183 174 174 168 168 168 ...
## $ width : int 66 66 69 68 69 67 67 67 67 67 ...
# Per-column overview: quartiles and mean for numeric columns, counts for
# logical and factor columns, and the number of missing values in each.
summary(object = cars)
## name sports_car suv
## Infiniti G35 4dr : 2 Mode :logical Mode :logical
## Mercedes-Benz C240 4dr : 2 FALSE:379 FALSE:368
## Mercedes-Benz C320 4dr : 2 TRUE :49 TRUE :60
## Acura 3.5 RL 4dr : 1 NA's :0 NA's :0
## Acura 3.5 RL w/Navigation 4dr: 1
## Acura MDX : 1
## (Other) :419
## wagon minivan pickup all_wheel
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:398 FALSE:408 FALSE:404 FALSE:336
## TRUE :30 TRUE :20 TRUE :24 TRUE :92
## NA's :0 NA's :0 NA's :0 NA's :0
##
##
##
## rear_wheel msrp dealer_cost eng_size
## Mode :logical Min. : 10280 Min. : 9875 Min. :1.300
## FALSE:318 1st Qu.: 20334 1st Qu.: 18866 1st Qu.:2.375
## TRUE :110 Median : 27635 Median : 25294 Median :3.000
## NA's :0 Mean : 32775 Mean : 30015 Mean :3.197
## 3rd Qu.: 39205 3rd Qu.: 35710 3rd Qu.:3.900
## Max. :192465 Max. :173560 Max. :8.300
##
## ncyl horsepwr city_mpg hwy_mpg
## Min. :-1.000 Min. : 73.0 Min. :10.00 Min. :12.00
## 1st Qu.: 4.000 1st Qu.:165.0 1st Qu.:17.00 1st Qu.:24.00
## Median : 6.000 Median :210.0 Median :19.00 Median :26.00
## Mean : 5.776 Mean :215.9 Mean :20.09 Mean :26.91
## 3rd Qu.: 6.000 3rd Qu.:255.0 3rd Qu.:21.00 3rd Qu.:29.00
## Max. :12.000 Max. :500.0 Max. :60.00 Max. :66.00
## NA's :14 NA's :14
## weight wheel_base length width
## Min. :1850 Min. : 89.0 Min. :143.0 Min. :64.00
## 1st Qu.:3102 1st Qu.:103.0 1st Qu.:177.0 1st Qu.:69.00
## Median :3474 Median :107.0 Median :186.0 Median :71.00
## Mean :3577 Mean :108.2 Mean :185.1 Mean :71.29
## 3rd Qu.:3974 3rd Qu.:112.0 3rd Qu.:193.0 3rd Qu.:73.00
## Max. :7190 Max. :144.0 Max. :227.0 Max. :81.00
## NA's :2 NA's :2 NA's :26 NA's :28
# Histogram of wheel_base across all cars (a single panel, no faceting).
# An explicit binwidth replaces the arbitrary 30-bin default and the
# "Pick better value with `binwidth`" message it triggers; na.rm = TRUE
# drops the missing wheel_base values without a warning.
ggplot(cars, aes(x = wheel_base)) +
  geom_histogram(binwidth = 2, na.rm = TRUE)
# Single box plot of wheel_base for all cars (not split by any group).
# x is mapped to an empty string so the one box sits on a discrete axis
# instead of a numeric axis with meaningless tick values; the x-axis title
# is dropped for the same reason. na.rm = TRUE drops missing wheel_base
# values without a warning.
ggplot(cars, aes(x = "", y = wheel_base)) +
  geom_boxplot(na.rm = TRUE) +
  labs(x = NULL)
# Density plot of wheel_base for all cars (one curve, nothing is overlaid).
# alpha only applies to the fill, and a density has no fill unless one is
# given, so a fill colour is set to make the transparency take effect.
# na.rm = TRUE drops missing wheel_base values without a warning.
ggplot(cars, aes(x = wheel_base)) +
  geom_density(fill = "grey50", alpha = 0.3, na.rm = TRUE)
# Centre and spread when the data has extreme values: median and
# interquartile range of wheel_base, ignoring missing values.
# The summary expressions are left unnamed on purpose, so the result columns
# keep their auto-generated names.
summarize(
  cars,
  median(wheel_base, na.rm = TRUE),
  IQR(wheel_base, na.rm = TRUE)
)
## median(wheel_base, na.rm = TRUE) IQR(wheel_base, na.rm = TRUE)
## 1 107 9
# Centre and spread when the data has no extreme values: mean and standard
# deviation of wheel_base, ignoring missing values.
# The summary expressions are left unnamed on purpose, so the result columns
# keep their auto-generated names.
summarize(
  cars,
  mean(wheel_base, na.rm = TRUE),
  sd(wheel_base, na.rm = TRUE)
)
## mean(wheel_base, na.rm = TRUE) sd(wheel_base, na.rm = TRUE)
## 1 108.1737 8.326449
Interpretation
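Next, compare wheel_base between sports cars and all other cars, using the sports_car variable.
Create box plots of wheel_base with sports_car
on the x-axis, and compute the same summary statistics grouped by sports_car.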
# Side-by-side box plots of wheel_base for sports cars (TRUE) versus all
# other cars (FALSE). na.rm = TRUE drops missing wheel_base values without
# a warning.
ggplot(cars, aes(x = sports_car, y = wheel_base)) +
  geom_boxplot(na.rm = TRUE)
# Centre and spread per group when the data has extreme values: median and
# interquartile range of wheel_base, computed separately for sports cars and
# non-sports cars, ignoring missing values.
# NOTE(review): the recorded tibble print truncates the IQR column because
# the auto-generated column names are long; consider naming the summaries.
summarize(
  group_by(cars, sports_car),
  median(wheel_base, na.rm = TRUE),
  IQR(wheel_base, na.rm = TRUE)
)
## # A tibble: 2 x 3
## sports_car `median(wheel_base, na.rm = TRUE)`
## <lgl> <int>
## 1 FALSE 107
## 2 TRUE 101
## # ... with 1 more variables: `IQR(wheel_base, na.rm = TRUE)` <dbl>
# Centre and spread per group when the data has no extreme values: mean and
# standard deviation of wheel_base, computed separately for sports cars and
# non-sports cars, ignoring missing values.
# NOTE(review): the recorded tibble print truncates the sd column because
# the auto-generated column names are long; consider naming the summaries.
summarize(
  group_by(cars, sports_car),
  mean(wheel_base, na.rm = TRUE),
  sd(wheel_base, na.rm = TRUE)
)
## # A tibble: 2 x 3
## sports_car `mean(wheel_base, na.rm = TRUE)`
## <lgl> <dbl>
## 1 FALSE 109.23873
## 2 TRUE 99.97959
## # ... with 1 more variables: `sd(wheel_base, na.rm = TRUE)` <dbl>
Interpretation