Suppose your boss has asked you to report an exploratory data analysis, including visualizations and summary statistics, of wheel_base (the wheelbase, i.e. the distance between a car's front and rear axles) in the cars data set.

Load packages

library(ggplot2)
library(dplyr)

Import Data

# Read the cars data set from its CSV file into a data frame
cars <- read.csv(file = "/resources/data/cars.csv")

# Show each column's type together with its first few values
str(object = cars)
## 'data.frame':    428 obs. of  19 variables:
##  $ name       : Factor w/ 425 levels "Acura 3.5 RL 4dr",..: 66 67 68 69 70 114 115 133 129 130 ...
##  $ sports_car : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ suv        : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ wagon      : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ minivan    : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ pickup     : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ all_wheel  : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ rear_wheel : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ msrp       : int  11690 12585 14610 14810 16385 13670 15040 13270 13730 15460 ...
##  $ dealer_cost: int  10965 11802 13697 13884 15357 12849 14086 12482 12906 14496 ...
##  $ eng_size   : num  1.6 1.6 2.2 2.2 2.2 2 2 2 2 2 ...
##  $ ncyl       : int  4 4 4 4 4 4 4 4 4 4 ...
##  $ horsepwr   : int  103 103 140 140 140 132 132 130 110 130 ...
##  $ city_mpg   : int  28 28 26 26 26 29 29 26 27 26 ...
##  $ hwy_mpg    : int  34 34 37 37 37 36 36 33 36 33 ...
##  $ weight     : int  2370 2348 2617 2676 2617 2581 2626 2612 2606 2606 ...
##  $ wheel_base : int  98 98 104 104 104 105 105 103 103 103 ...
##  $ length     : int  167 153 183 183 183 174 174 168 168 168 ...
##  $ width      : int  66 66 69 68 69 67 67 67 67 67 ...

# Per-column overview: counts for factor/logical columns; quartiles, mean
# and number of missing values (NA's) for numeric columns
summary(object = cars)
##                             name     sports_car         suv         
##  Infiniti G35 4dr             :  2   Mode :logical   Mode :logical  
##  Mercedes-Benz C240 4dr       :  2   FALSE:379       FALSE:368      
##  Mercedes-Benz C320 4dr       :  2   TRUE :49        TRUE :60       
##  Acura 3.5 RL 4dr             :  1   NA's :0         NA's :0        
##  Acura 3.5 RL w/Navigation 4dr:  1                                  
##  Acura MDX                    :  1                                  
##  (Other)                      :419                                  
##    wagon          minivan          pickup        all_wheel      
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:398       FALSE:408       FALSE:404       FALSE:336      
##  TRUE :30        TRUE :20        TRUE :24        TRUE :92       
##  NA's :0         NA's :0         NA's :0         NA's :0        
##                                                                 
##                                                                 
##                                                                 
##  rear_wheel           msrp         dealer_cost        eng_size    
##  Mode :logical   Min.   : 10280   Min.   :  9875   Min.   :1.300  
##  FALSE:318       1st Qu.: 20334   1st Qu.: 18866   1st Qu.:2.375  
##  TRUE :110       Median : 27635   Median : 25294   Median :3.000  
##  NA's :0         Mean   : 32775   Mean   : 30015   Mean   :3.197  
##                  3rd Qu.: 39205   3rd Qu.: 35710   3rd Qu.:3.900  
##                  Max.   :192465   Max.   :173560   Max.   :8.300  
##                                                                   
##       ncyl           horsepwr        city_mpg        hwy_mpg     
##  Min.   :-1.000   Min.   : 73.0   Min.   :10.00   Min.   :12.00  
##  1st Qu.: 4.000   1st Qu.:165.0   1st Qu.:17.00   1st Qu.:24.00  
##  Median : 6.000   Median :210.0   Median :19.00   Median :26.00  
##  Mean   : 5.776   Mean   :215.9   Mean   :20.09   Mean   :26.91  
##  3rd Qu.: 6.000   3rd Qu.:255.0   3rd Qu.:21.00   3rd Qu.:29.00  
##  Max.   :12.000   Max.   :500.0   Max.   :60.00   Max.   :66.00  
##                                   NA's   :14      NA's   :14     
##      weight       wheel_base        length          width      
##  Min.   :1850   Min.   : 89.0   Min.   :143.0   Min.   :64.00  
##  1st Qu.:3102   1st Qu.:103.0   1st Qu.:177.0   1st Qu.:69.00  
##  Median :3474   Median :107.0   Median :186.0   Median :71.00  
##  Mean   :3577   Mean   :108.2   Mean   :185.1   Mean   :71.29  
##  3rd Qu.:3974   3rd Qu.:112.0   3rd Qu.:193.0   3rd Qu.:73.00  
##  Max.   :7190   Max.   :144.0   Max.   :227.0   Max.   :81.00  
##  NA's   :2      NA's   :2       NA's   :26      NA's   :28

Visualize

# Histogram of wheel_base for all cars (single panel, no faceting).
# binwidth is set explicitly instead of relying on the default of 30 bins,
# and na.rm = TRUE silently drops the rows where wheel_base is missing.
ggplot(cars, aes(x = wheel_base)) +
  geom_histogram(binwidth = 2, na.rm = TRUE)


# Single box plot of wheel_base across all cars; x = 1 is only a
# placeholder position because no grouping variable is used here
ggplot(data = cars, mapping = aes(x = 1, y = wheel_base)) +
  geom_boxplot()


# Density curve of wheel_base for all cars (one curve, no groups overlaid)
ggplot(data = cars, mapping = aes(x = wheel_base)) +
  geom_density(alpha = 0.3)

Summary statistics

# Center and spread of wheel_base as median and IQR; use these measures
# if the data has extreme values. na.rm = TRUE skips missing values.
cars %>%
  summarize(median(wheel_base, na.rm = TRUE),
            IQR(wheel_base, na.rm = TRUE))
##   median(wheel_base, na.rm = TRUE) IQR(wheel_base, na.rm = TRUE)
## 1                              107                             9

# Center and spread of wheel_base as mean and standard deviation; use these
# measures if the data doesn't have extreme values. na.rm = TRUE skips
# missing values.
cars %>%
  summarize(mean(wheel_base, na.rm = TRUE),
            sd(wheel_base, na.rm = TRUE))
##   mean(wheel_base, na.rm = TRUE) sd(wheel_base, na.rm = TRUE)
## 1                       108.1737                     8.326449

Interpretation

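Add a second variable: compare wheel_base between sports cars and non-sports cars, using sports_car as the grouping variable.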

# Side-by-side box plots of wheel_base, one box per value of sports_car
ggplot(data = cars, mapping = aes(x = sports_car, y = wheel_base)) +
  geom_boxplot()


# Median and IQR of wheel_base computed separately for each value of
# sports_car; use these measures if the data has extreme values.
# na.rm = TRUE skips missing values.
cars %>%
  group_by(sports_car) %>%
  summarize(median(wheel_base, na.rm = TRUE),
            IQR(wheel_base, na.rm = TRUE))
## # A tibble: 2 x 3
##   sports_car `median(wheel_base, na.rm = TRUE)`
##        <lgl>                              <int>
## 1      FALSE                                107
## 2       TRUE                                101
## # ... with 1 more variables: `IQR(wheel_base, na.rm = TRUE)` <dbl>

# Mean and standard deviation of wheel_base computed separately for each
# value of sports_car; use these measures if the data doesn't have extreme
# values. na.rm = TRUE skips missing values.
cars %>%
  group_by(sports_car) %>%
  summarize(mean(wheel_base, na.rm = TRUE),
            sd(wheel_base, na.rm = TRUE))
## # A tibble: 2 x 3
##   sports_car `mean(wheel_base, na.rm = TRUE)`
##        <lgl>                            <dbl>
## 1      FALSE                        109.23873
## 2       TRUE                         99.97959
## # ... with 1 more variables: `sd(wheel_base, na.rm = TRUE)` <dbl>

Interpretation

Sports cars have a shorter wheelbase than non-sports cars, regardless of which measure of center is used (median 101 vs. 107; mean about 100.0 vs. 109.2). The wheelbase values of non-sports cars are also more spread out: on average they lie farther away from their center.