Data Wrangling Intro

Introduction

Today we will start diving into data wrangling using the dplyr package.

Functions used

as_tibble()
select()
arrange()
desc()
filter()
mutate()
case_when()
group_by()
summarise()
mean()
median()

Libraries

We only need to load one library today: the dplyr package.

library(dplyr)

Getting started

We will be working with the dataset mtcars.

# convert to tibble for easier printing
mtcars_new <- as_tibble(mtcars)

mpg	cyl	disp	hp	drat	wt	qsec	vs	am	gear	carb
21.0	6	160.0	110	3.90	2.620	16.46	0	1	4	4
21.0	6	160.0	110	3.90	2.875	17.02	0	1	4	4
22.8	4	108.0	93	3.85	2.320	18.61	1	1	4	1
21.4	6	258.0	110	3.08	3.215	19.44	1	0	3	1
18.7	8	360.0	175	3.15	3.440	17.02	0	0	3	2
18.1	6	225.0	105	2.76	3.460	20.22	1	0	3	1
14.3	8	360.0	245	3.21	3.570	15.84	0	0	3	4
24.4	4	146.7	62	3.69	3.190	20.00	1	0	4	2
22.8	4	140.8	95	3.92	3.150	22.90	1	0	4	2
19.2	6	167.6	123	3.92	3.440	18.30	1	0	4	4
17.8	6	167.6	123	3.92	3.440	18.90	1	0	4	4
16.4	8	275.8	180	3.07	4.070	17.40	0	0	3	3
17.3	8	275.8	180	3.07	3.730	17.60	0	0	3	3
15.2	8	275.8	180	3.07	3.780	18.00	0	0	3	3
10.4	8	472.0	205	2.93	5.250	17.98	0	0	3	4
10.4	8	460.0	215	3.00	5.424	17.82	0	0	3	4
14.7	8	440.0	230	3.23	5.345	17.42	0	0	3	4
32.4	4	78.7	66	4.08	2.200	19.47	1	1	4	1
30.4	4	75.7	52	4.93	1.615	18.52	1	1	4	2
33.9	4	71.1	65	4.22	1.835	19.90	1	1	4	1
21.5	4	120.1	97	3.70	2.465	20.01	1	0	3	1
15.5	8	318.0	150	2.76	3.520	16.87	0	0	3	2
15.2	8	304.0	150	3.15	3.435	17.30	0	0	3	2
13.3	8	350.0	245	3.73	3.840	15.41	0	0	3	4
19.2	8	400.0	175	3.08	3.845	17.05	0	0	3	2
27.3	4	79.0	66	4.08	1.935	18.90	1	1	4	1
26.0	4	120.3	91	4.43	2.140	16.70	0	1	5	2
30.4	4	95.1	113	3.77	1.513	16.90	1	1	5	2
15.8	8	351.0	264	4.22	3.170	14.50	0	1	5	4
19.7	6	145.0	175	3.62	2.770	15.50	0	1	5	6
15.0	8	301.0	335	3.54	3.570	14.60	0	1	5	8
21.4	4	121.0	109	4.11	2.780	18.60	1	1	4	2

We are interested in 3 variables today: horsepower, weight, and miles-per-gallon (coded as hp, wt, and mpg, respectively, in the dataset). We can use the select() function to isolate these variables.

# interested in horsepower, wt, and mpg
# Variables are: mpg, cyl, hp, vs
# select these variables, all others are removed automatically
mtcars_new_1 <- select(mtcars_new, c(mpg, wt, hp))
knitr::kable(mtcars_new) # All other variables are gone!

mpg	cyl	disp	hp	drat	wt	qsec	vs	am	gear	carb
21.0	6	160.0	110	3.90	2.620	16.46	0	1	4	4
21.0	6	160.0	110	3.90	2.875	17.02	0	1	4	4
22.8	4	108.0	93	3.85	2.320	18.61	1	1	4	1
21.4	6	258.0	110	3.08	3.215	19.44	1	0	3	1
18.7	8	360.0	175	3.15	3.440	17.02	0	0	3	2
18.1	6	225.0	105	2.76	3.460	20.22	1	0	3	1
14.3	8	360.0	245	3.21	3.570	15.84	0	0	3	4
24.4	4	146.7	62	3.69	3.190	20.00	1	0	4	2
22.8	4	140.8	95	3.92	3.150	22.90	1	0	4	2
19.2	6	167.6	123	3.92	3.440	18.30	1	0	4	4
17.8	6	167.6	123	3.92	3.440	18.90	1	0	4	4
16.4	8	275.8	180	3.07	4.070	17.40	0	0	3	3
17.3	8	275.8	180	3.07	3.730	17.60	0	0	3	3
15.2	8	275.8	180	3.07	3.780	18.00	0	0	3	3
10.4	8	472.0	205	2.93	5.250	17.98	0	0	3	4
10.4	8	460.0	215	3.00	5.424	17.82	0	0	3	4
14.7	8	440.0	230	3.23	5.345	17.42	0	0	3	4
32.4	4	78.7	66	4.08	2.200	19.47	1	1	4	1
30.4	4	75.7	52	4.93	1.615	18.52	1	1	4	2
33.9	4	71.1	65	4.22	1.835	19.90	1	1	4	1
21.5	4	120.1	97	3.70	2.465	20.01	1	0	3	1
15.5	8	318.0	150	2.76	3.520	16.87	0	0	3	2
15.2	8	304.0	150	3.15	3.435	17.30	0	0	3	2
13.3	8	350.0	245	3.73	3.840	15.41	0	0	3	4
19.2	8	400.0	175	3.08	3.845	17.05	0	0	3	2
27.3	4	79.0	66	4.08	1.935	18.90	1	1	4	1
26.0	4	120.3	91	4.43	2.140	16.70	0	1	5	2
30.4	4	95.1	113	3.77	1.513	16.90	1	1	5	2
15.8	8	351.0	264	4.22	3.170	14.50	0	1	5	4
19.7	6	145.0	175	3.62	2.770	15.50	0	1	5	6
15.0	8	301.0	335	3.54	3.570	14.60	0	1	5	8
21.4	4	121.0	109	4.11	2.780	18.60	1	1	4	2

Next, we’re going to use the arrange() function to see what the highest mpg values are. We also use the desc() function to arrange the mpg values from highest to lowest.

# next, let's see what the highest mpg values are 
# to do so, we can arrange by descending mpg values using arrange() & desc()
mtcars_new_2 <- arrange(mtcars_new_1, desc(mpg))
knitr::kable(mtcars_new_2)

mpg	wt	hp
33.9	1.835	65
32.4	2.200	66
30.4	1.615	52
30.4	1.513	113
27.3	1.935	66
26.0	2.140	91
24.4	3.190	62
22.8	2.320	93
22.8	3.150	95
21.5	2.465	97
21.4	3.215	110
21.4	2.780	109
21.0	2.620	110
21.0	2.875	110
19.7	2.770	175
19.2	3.440	123
19.2	3.845	175
18.7	3.440	175
18.1	3.460	105
17.8	3.440	123
17.3	3.730	180
16.4	4.070	180
15.8	3.170	264
15.5	3.520	150
15.2	3.780	180
15.2	3.435	150
15.0	3.570	335
14.7	5.345	230
14.3	3.570	245
13.3	3.840	245
10.4	5.250	205
10.4	5.424	215

Let’s say we’re only interested in cars with relatively high mpg. All other rows can be ignored. We can use the filter() function to remove these other rows.

# 20 mpg is our cutoff. All other cars can be removed from the dataset
mtcars_new_3 <- filter(mtcars_new_2, mpg>=20)
knitr::kable(mtcars_new_3)

mpg	wt	hp
33.9	1.835	65
32.4	2.200	66
30.4	1.615	52
30.4	1.513	113
27.3	1.935	66
26.0	2.140	91
24.4	3.190	62
22.8	2.320	93
22.8	3.150	95
21.5	2.465	97
21.4	3.215	110
21.4	2.780	109
21.0	2.620	110
21.0	2.875	110

Now, only 14 cars remain.

What about horsepower? Let’s try arranging by descending values of mpg AND hp.

mtcars_new_4 <- arrange(mtcars_new_3, desc(hp),desc(mpg))
knitr::kable(mtcars_new_4)

mpg	wt	hp
30.4	1.513	113
21.4	3.215	110
21.0	2.620	110
21.0	2.875	110
21.4	2.780	109
21.5	2.465	97
22.8	3.150	95
22.8	2.320	93
26.0	2.140	91
32.4	2.200	66
27.3	1.935	66
33.9	1.835	65
24.4	3.190	62
30.4	1.615	52

mtcars_new_5 <- mutate(mtcars_new, wt_lbs=1000*wt)
mtcars_new$wt_lbs <- mtcars_new$wt*1000
mtcars_new_5

# A tibble: 32 × 12
     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1  21       6  160    110  3.9   2.62  16.5     0     1     4     4
 2  21       6  160    110  3.9   2.88  17.0     0     1     4     4
 3  22.8     4  108     93  3.85  2.32  18.6     1     1     4     1
 4  21.4     6  258    110  3.08  3.22  19.4     1     0     3     1
 5  18.7     8  360    175  3.15  3.44  17.0     0     0     3     2
 6  18.1     6  225    105  2.76  3.46  20.2     1     0     3     1
 7  14.3     8  360    245  3.21  3.57  15.8     0     0     3     4
 8  24.4     4  147.    62  3.69  3.19  20       1     0     4     2
 9  22.8     4  141.    95  3.92  3.15  22.9     1     0     4     2
10  19.2     6  168.   123  3.92  3.44  18.3     1     0     4     4
# … with 22 more rows, and 1 more variable: wt_lbs <dbl>

The Pipe (`%>%`)

The pipe allows us to “chain” operations together. Below we use the pipe to pass the mtcars data frame to the print() function.

mtcars_new %>%
  print()

# A tibble: 32 × 12
     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1  21       6  160    110  3.9   2.62  16.5     0     1     4     4
 2  21       6  160    110  3.9   2.88  17.0     0     1     4     4
 3  22.8     4  108     93  3.85  2.32  18.6     1     1     4     1
 4  21.4     6  258    110  3.08  3.22  19.4     1     0     3     1
 5  18.7     8  360    175  3.15  3.44  17.0     0     0     3     2
 6  18.1     6  225    105  2.76  3.46  20.2     1     0     3     1
 7  14.3     8  360    245  3.21  3.57  15.8     0     0     3     4
 8  24.4     4  147.    62  3.69  3.19  20       1     0     4     2
 9  22.8     4  141.    95  3.92  3.15  22.9     1     0     4     2
10  19.2     6  168.   123  3.92  3.44  18.3     1     0     4     4
# … with 22 more rows, and 1 more variable: wt_lbs <dbl>

mtcars_new %>%
  filter(mpg>=20) %>%
  mutate(wt_lbs=wt*1000) %>%
  arrange(desc(mpg))

# A tibble: 14 × 12
     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1  33.9     4  71.1    65  4.22  1.84  19.9     1     1     4     1
 2  32.4     4  78.7    66  4.08  2.2   19.5     1     1     4     1
 3  30.4     4  75.7    52  4.93  1.62  18.5     1     1     4     2
 4  30.4     4  95.1   113  3.77  1.51  16.9     1     1     5     2
 5  27.3     4  79      66  4.08  1.94  18.9     1     1     4     1
 6  26       4 120.     91  4.43  2.14  16.7     0     1     5     2
 7  24.4     4 147.     62  3.69  3.19  20       1     0     4     2
 8  22.8     4 108      93  3.85  2.32  18.6     1     1     4     1
 9  22.8     4 141.     95  3.92  3.15  22.9     1     0     4     2
10  21.5     4 120.     97  3.7   2.46  20.0     1     0     3     1
11  21.4     6 258     110  3.08  3.22  19.4     1     0     3     1
12  21.4     4 121     109  4.11  2.78  18.6     1     1     4     2
13  21       6 160     110  3.9   2.62  16.5     0     1     4     4
14  21       6 160     110  3.9   2.88  17.0     0     1     4     4
# … with 1 more variable: wt_lbs <dbl>

`group_by()` and `summarise()`

We can use summarise() to collapse our data down to a descriptive statistic (or multiple).

mtcars_summary <- mtcars_new %>%
  summarise(mean_mpg=mean(mpg),
            median_mpg=median(mpg),
            mean_wt_lbs=mean(wt*1000))

However, let’s say we want to summarise the data for each level of a categorical variable. For example, we might be interested in the median mpg for cars with automatic and manual transmissions (am=0 indicates automatic, am=1 indicates manual). We can use group_by() to compute the summary statistics separately for each level of the variable am.

mtcars_new %>%
  group_by(am) %>%
  summarise(median_mpg=median(mpg))

# A tibble: 2 × 2
     am median_mpg
  <dbl>      <dbl>
1     0       17.3
2     1       22.8

This shows us the manual cars (at least in this dataset) actually have slightly better gas mileage than automatic cars.

What if we are interested in how the type of engine (vs=0 indicates V-shape, vs=1 indicates straight) and the type of transmission affects gas mileage? We can use group_by() to group by multiple categorical variables.

mtcars_new %>%
  group_by(vs,am) %>%
  summarise(median_mpg=median(mpg))

# A tibble: 4 × 3
# Groups:   vs [2]
     vs    am median_mpg
  <dbl> <dbl>      <dbl>
1     0     0       15.2
2     0     1       20.4
3     1     0       21.4
4     1     1       30.4

We see that cars with a straight engine and a manual transmission by far have the best gas mileage.

Grouped mutates

mtcars_new %>%
  group_by(am) %>%
  mutate(standard_mpg=(mpg-mean(mpg))/sd(mpg))

# A tibble: 32 × 13
# Groups:   am [2]
     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1  21       6  160    110  3.9   2.62  16.5     0     1     4     4
 2  21       6  160    110  3.9   2.88  17.0     0     1     4     4
 3  22.8     4  108     93  3.85  2.32  18.6     1     1     4     1
 4  21.4     6  258    110  3.08  3.22  19.4     1     0     3     1
 5  18.7     8  360    175  3.15  3.44  17.0     0     0     3     2
 6  18.1     6  225    105  2.76  3.46  20.2     1     0     3     1
 7  14.3     8  360    245  3.21  3.57  15.8     0     0     3     4
 8  24.4     4  147.    62  3.69  3.19  20       1     0     4     2
 9  22.8     4  141.    95  3.92  3.15  22.9     1     0     4     2
10  19.2     6  168.   123  3.92  3.44  18.3     1     0     4     4
# … with 22 more rows, and 2 more variables: wt_lbs <dbl>,
#   standard_mpg <dbl>