Suppose that your boss asked you to report an exploratory data analysis, including visualizations and summary statistics, of wheel_base (the wheelbase of each car, i.e., the distance between its front and rear axles) in the cars data set.

Load packages

library(ggplot2)
library(dplyr)

Import Data

# Read the cars data set from CSV into a data frame
cars <- read.csv(
  file = "/resources/rstudio/business statistics/data/cars.csv"
)

# Inspect column names, types, and the first few values of each column
str(object = cars)
## 'data.frame':    428 obs. of  19 variables:
##  $ name       : Factor w/ 425 levels "Acura 3.5 RL 4dr",..: 66 67 68 69 70 114 115 133 129 130 ...
##  $ sports_car : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ suv        : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ wagon      : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ minivan    : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ pickup     : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ all_wheel  : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ rear_wheel : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ msrp       : int  11690 12585 14610 14810 16385 13670 15040 13270 13730 15460 ...
##  $ dealer_cost: int  10965 11802 13697 13884 15357 12849 14086 12482 12906 14496 ...
##  $ eng_size   : num  1.6 1.6 2.2 2.2 2.2 2 2 2 2 2 ...
##  $ ncyl       : int  4 4 4 4 4 4 4 4 4 4 ...
##  $ horsepwr   : int  103 103 140 140 140 132 132 130 110 130 ...
##  $ city_mpg   : int  28 28 26 26 26 29 29 26 27 26 ...
##  $ hwy_mpg    : int  34 34 37 37 37 36 36 33 36 33 ...
##  $ weight     : int  2370 2348 2617 2676 2617 2581 2626 2612 2606 2606 ...
##  $ wheel_base : int  98 98 104 104 104 105 105 103 103 103 ...
##  $ length     : int  167 153 183 183 183 174 174 168 168 168 ...
##  $ width      : int  66 66 69 68 69 67 67 67 67 67 ...

# Per-column summaries of the whole data frame. The output below is worth
# scanning for data-quality issues before any analysis: several columns report
# NA's (wheel_base has 2), and ncyl has a minimum of -1.
summary(cars)
##                             name     sports_car         suv         
##  Infiniti G35 4dr             :  2   Mode :logical   Mode :logical  
##  Mercedes-Benz C240 4dr       :  2   FALSE:379       FALSE:368      
##  Mercedes-Benz C320 4dr       :  2   TRUE :49        TRUE :60       
##  Acura 3.5 RL 4dr             :  1   NA's :0         NA's :0        
##  Acura 3.5 RL w/Navigation 4dr:  1                                  
##  Acura MDX                    :  1                                  
##  (Other)                      :419                                  
##    wagon          minivan          pickup        all_wheel      
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:398       FALSE:408       FALSE:404       FALSE:336      
##  TRUE :30        TRUE :20        TRUE :24        TRUE :92       
##  NA's :0         NA's :0         NA's :0         NA's :0        
##                                                                 
##                                                                 
##                                                                 
##  rear_wheel           msrp         dealer_cost        eng_size    
##  Mode :logical   Min.   : 10280   Min.   :  9875   Min.   :1.300  
##  FALSE:318       1st Qu.: 20334   1st Qu.: 18866   1st Qu.:2.375  
##  TRUE :110       Median : 27635   Median : 25294   Median :3.000  
##  NA's :0         Mean   : 32775   Mean   : 30015   Mean   :3.197  
##                  3rd Qu.: 39205   3rd Qu.: 35710   3rd Qu.:3.900  
##                  Max.   :192465   Max.   :173560   Max.   :8.300  
##                                                                   
##       ncyl           horsepwr        city_mpg        hwy_mpg     
##  Min.   :-1.000   Min.   : 73.0   Min.   :10.00   Min.   :12.00  
##  1st Qu.: 4.000   1st Qu.:165.0   1st Qu.:17.00   1st Qu.:24.00  
##  Median : 6.000   Median :210.0   Median :19.00   Median :26.00  
##  Mean   : 5.776   Mean   :215.9   Mean   :20.09   Mean   :26.91  
##  3rd Qu.: 6.000   3rd Qu.:255.0   3rd Qu.:21.00   3rd Qu.:29.00  
##  Max.   :12.000   Max.   :500.0   Max.   :60.00   Max.   :66.00  
##                                   NA's   :14      NA's   :14     
##      weight       wheel_base        length          width      
##  Min.   :1850   Min.   : 89.0   Min.   :143.0   Min.   :64.00  
##  1st Qu.:3102   1st Qu.:103.0   1st Qu.:177.0   1st Qu.:69.00  
##  Median :3474   Median :107.0   Median :186.0   Median :71.00  
##  Mean   :3577   Mean   :108.2   Mean   :185.1   Mean   :71.29  
##  3rd Qu.:3974   3rd Qu.:112.0   3rd Qu.:193.0   3rd Qu.:73.00  
##  Max.   :7190   Max.   :144.0   Max.   :227.0   Max.   :81.00  
##  NA's   :2      NA's   :2       NA's   :26      NA's   :28

Visualize

# Histogram of wheel_base (a single panel; no faceting is applied).
# An explicit binwidth replaces ggplot2's default of 30 bins, which otherwise
# prints a "Pick better value with `binwidth`" message. A width of 2 is a
# starting point for integer values spanning 89-144; adjust as needed.
# na.rm = TRUE drops the missing wheel_base values without a warning.
ggplot(cars, aes(x = wheel_base)) +
  geom_histogram(binwidth = 2, na.rm = TRUE)


# Box plot of wheel_base for all cars together.
# x = 1 is only a placeholder position for the single box; it carries no
# meaning. na.rm = TRUE drops the missing wheel_base values without a warning.
ggplot(cars, aes(x = 1, y = wheel_base)) +
  geom_boxplot(na.rm = TRUE)


# Density plot of wheel_base (a single curve; nothing is overlaid here).
# alpha only affects the fill, and geom_density() has no fill by default, so a
# fill colour is set to make the transparency take effect.
# na.rm = TRUE drops the missing wheel_base values without a warning.
ggplot(cars, aes(x = wheel_base)) +
  geom_density(fill = "grey50", alpha = 0.3, na.rm = TRUE)

Summary statistics

# Robust centre and spread (median, IQR): use these when the distribution
# has extreme values. Missing wheel_base values are excluded.
summarise(
  cars,
  median(wheel_base, na.rm = TRUE),
  IQR(wheel_base, na.rm = TRUE)
)
##   median(wheel_base, na.rm = TRUE) IQR(wheel_base, na.rm = TRUE)
## 1                              107                             9

# Mean and standard deviation: use these when the distribution has no
# extreme values. Missing wheel_base values are excluded.
summarise(
  cars,
  mean(wheel_base, na.rm = TRUE),
  sd(wheel_base, na.rm = TRUE)
)
##   mean(wheel_base, na.rm = TRUE) sd(wheel_base, na.rm = TRUE)
## 1                       108.1737                     8.326449

Interpretation

Add a second variable, sports_car

# Side-by-side box plots of wheel_base, one box per value of sports_car
# (FALSE / TRUE). na.rm = TRUE drops the missing wheel_base values without
# a warning.
ggplot(cars, aes(x = sports_car, y = wheel_base)) +
  geom_boxplot(na.rm = TRUE)


# Robust centre and spread (median, IQR) of wheel_base within each
# sports_car group: use these when the distribution has extreme values.
summarise(
  group_by(cars, sports_car),
  median(wheel_base, na.rm = TRUE),
  IQR(wheel_base, na.rm = TRUE)
)
## # A tibble: 2 x 3
##   sports_car `median(wheel_base, na.rm = TRUE)`
##        <lgl>                              <int>
## 1      FALSE                                107
## 2       TRUE                                101
## # ... with 1 more variables: `IQR(wheel_base, na.rm = TRUE)` <dbl>

# Mean and standard deviation of wheel_base within each sports_car group:
# use these when the distribution has no extreme values.
summarise(
  group_by(cars, sports_car),
  mean(wheel_base, na.rm = TRUE),
  sd(wheel_base, na.rm = TRUE)
)
## # A tibble: 2 x 3
##   sports_car `mean(wheel_base, na.rm = TRUE)`
##        <lgl>                            <dbl>
## 1      FALSE                        109.23873
## 2       TRUE                         99.97959
## # ... with 1 more variables: `sd(wheel_base, na.rm = TRUE)` <dbl>

Interpretation