Data Transformation using dplyr package in R

#Data Transformation using dplyr package in R

#Aim: To obtain a clean and tidy data wherein each variable has one column and each observation is its own row

#Kindly note: dataset "mtcars" has been used as example
#Command language has been annotated before example for better understanding

#load library
#library(name of the library)
library(tidyverse)

## -- Attaching packages ----------------------------------------- tidyverse 1.2.1 --

## v ggplot2 3.1.1       v purrr   0.3.2  
## v tibble  2.1.1       v dplyr   0.8.0.1
## v tidyr   0.8.3       v stringr 1.4.0  
## v readr   1.3.1       v forcats 0.4.0

## -- Conflicts -------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

####KEY FUNCTIONS IN dplyr####
#Summarise cases
#Group cases
#manipulate cases
#manipulate variables

####SUMMARISE CASES####
#To use summarise function, default code is 
#summarise(data, label = function(variable))
#various functions are mean, mad, SD, median, IQR, mad, min, max, first, last, nth position, count and n_distinct 




#Measures of central tendency
#summarise(data, label = function(variable))
summarise(mtcars,
          mean = mean(disp),
            median = median(disp))

##       mean median
## 1 230.7219  196.3

#Measures of dispersion
#summarise(data, label = function(variable))
summarise(mtcars, 
          SD = sd(disp),
            IQR = IQR(disp),
            MAD = mad(disp),
            min = min(disp),
            max = max(disp))

##         SD     IQR      MAD  min max
## 1 123.9387 205.175 140.4764 71.1 472

#Positional measures.. 
#Note.. arrange data in ascending or descending order 
#summarise(data, label = function(variable))
summarise(mtcars, first = first(disp),
            last = last(disp))

##   first last
## 1   160  121

#to find quantiles
quantile(mtcars$disp)

##      0%     25%     50%     75%    100% 
##  71.100 120.825 196.300 326.000 472.000

#counts
#summarise(data, label = function(variable))
summarise(mtcars, count = n(),
            unique = n_distinct(disp))

##   count unique
## 1    32     27

#to count number of observations in each group..count(data, variable)
#count(data, variable)
count(mtcars, cyl)

## # A tibble: 3 x 2
##     cyl     n
##   <dbl> <int>
## 1     4    11
## 2     6     7
## 3     8    14

####GROUP CASES####
#to group results
#summarise(data, label = function(variable))
mtcars %>%
  group_by(cyl) %>% 
  summarise(mean = mean(disp))

## # A tibble: 3 x 2
##     cyl  mean
##   <dbl> <dbl>
## 1     4  105.
## 2     6  183.
## 3     8  353.

#Alternatively, create a copy of dataframe as grouped table
#new data = data %>% group_by(variable)
gp_mtcars = mtcars %>% 
  group_by(cyl)

#Now using summary function..  group wise results will be obtained
#data %>% summarise(label = function(variable))
gp_mtcars %>% 
  summarise(mean=mean(disp))

## # A tibble: 3 x 2
##     cyl  mean
##   <dbl> <dbl>
## 1     4  105.
## 2     6  183.
## 3     8  353.

####MANIPULATE CASES/ ROWS####
#In manipulate function, we can extract, arrange and add cases/ ROWS
#THE GENERIC CODE IS fun(data, argument)
#various functions under manipulate cases are
# filter, distinct, sample_frac, sample_n, arrange, add_rows,
#bind_rows, intersect, union, setdiff, has_rownames, remove_rownames and rownames_to_column


#To extract a subset of rows meeting a logic
#filter(data, variable >or<or==or!=or>=or<= logic)
cyl_mtcars = filter(mtcars, cyl==6)
cyl_mtcars

##    mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 1 21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## 2 21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## 3 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## 4 18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## 5 19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## 6 17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## 7 19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6

#To extract a subset of rows without duplicate values..use function "distinct" 
#as distinct(data, unique variable).. add ".keep_all = TRUE" to keep all variables
example_dupli = distinct(mtcars, cyl, .keep_all = TRUE)
example_dupli

##    mpg cyl disp  hp drat   wt  qsec vs am gear carb
## 1 21.0   6  160 110 3.90 2.62 16.46  0  1    4    4
## 2 22.8   4  108  93 3.85 2.32 18.61  1  1    4    1
## 3 18.7   8  360 175 3.15 3.44 17.02  0  0    3    2

#To randomly extract a sample of rows (by fraction of dataset )
#sample_frac(data, proportion to be sampled).. Add "replace = TRUE" for sample with replacement
sample_frac(mtcars, 0.1, replace = TRUE)

##    mpg cyl disp  hp drat   wt  qsec vs am gear carb
## 1 22.8   4  108  93 3.85 2.32 18.61  1  1    4    1
## 2 14.3   8  360 245 3.21 3.57 15.84  0  0    3    4
## 3 13.3   8  350 245 3.73 3.84 15.41  0  0    3    4

#We can add sampling weights also..They are automatically standardised to sum to 1
#sample_frac(data, proportion to be sampled, weight = NULLor__)
sample_frac(mtcars, 0.1, weight = 1 / mpg)

##    mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 1 16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## 2 19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
## 3 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4

#To randomly extract a sample of rows (by mentioning number of rows required)
#sample_n(data, number of rows, replace = TRUE, weight = NULL)
sample_n(mtcars, 2, replace = TRUE, weight = NULL)

##    mpg cyl  disp  hp drat   wt qsec vs am gear carb
## 1 15.2   8 275.8 180 3.07 3.78 18.0  0  0    3    3
## 2 22.8   4 140.8  95 3.92 3.15 22.9  1  0    4    2

#to arrange a variable in ascending order
#arrange (data, variable)
arrange(mtcars, cyl)

##     mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 1  22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## 2  24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## 3  22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## 4  32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## 5  30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## 6  33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## 7  21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## 8  27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## 9  26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## 10 30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
## 11 21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
## 12 21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## 13 21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## 14 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## 15 18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## 16 19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## 17 17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## 18 19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
## 19 18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## 20 14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## 21 16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## 22 17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
## 23 15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
## 24 10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
## 25 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
## 26 14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
## 27 15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
## 28 15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
## 29 13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
## 30 19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
## 31 15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## 32 15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8

#to arrange a variable in descending order
#arrange(data, desc(variable))
arrange(mtcars, desc(cyl))

##     mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 1  18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## 2  14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## 3  16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## 4  17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
## 5  15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
## 6  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
## 7  10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
## 8  14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
## 9  15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
## 10 15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
## 11 13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
## 12 19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
## 13 15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## 14 15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
## 15 21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## 16 21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## 17 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## 18 18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## 19 19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## 20 17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## 21 19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
## 22 22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## 23 24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## 24 22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## 25 32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## 26 30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## 27 33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## 28 21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## 29 27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## 30 26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## 31 30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
## 32 21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2

#to add cases/ rows
#add_row(data, observation details)
add_row(mtcars, cyl = 2, wt = 1, gear = 2)

##     mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 1  21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## 2  21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## 3  22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## 4  21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## 5  18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## 6  18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## 7  14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## 8  24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## 9  22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## 10 19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## 11 17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## 12 16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## 13 17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
## 14 15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
## 15 10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
## 16 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
## 17 14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
## 18 32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## 19 30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## 20 33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## 21 21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## 22 15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
## 23 15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
## 24 13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
## 25 19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
## 26 27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## 27 26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## 28 30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
## 29 15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## 30 19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
## 31 15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
## 32 21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
## 33   NA   2    NA  NA   NA 1.000    NA NA NA    2   NA

#to combine data frames row wise
#To understand lets first break data mtcars into two dataframes
#mtcars data has 32 rows and 11 colums (determined using command "dim(mtcars)")
first1 = mtcars[1:14, ]
dim(first1)

## [1] 14 11

second1 = mtcars[15:32, ]
dim(second1)

## [1] 18 11

#now we have two datasets (first and second), but we need to have a single dataset for analysis
#to make a single data from two dataframes:-
#bind_cols(data1, data2)
binder1 = bind_rows(first1, second1)
dim(binder1)

## [1] 32 11

#NOTE: WHEN COMBINING DATASETS, ENSURE THAT THE ROWS/ COLUMNS ARE ALIGNED IN BOTH DATASETS FIRST FOR ACCURATE OUTPUT


#To find rows that appear in both data frames
#intersect(first data, second data)
first2 <- mtcars[1:20, ]
second2 <- mtcars[10:32, ]
intersect = intersect(first2, second2)
head(intersect, 3)

##    mpg cyl  disp  hp drat   wt qsec vs am gear carb
## 1 19.2   6 167.6 123 3.92 3.44 18.3  1  0    4    4
## 2 17.8   6 167.6 123 3.92 3.44 18.9  1  0    4    4
## 3 16.4   8 275.8 180 3.07 4.07 17.4  0  0    3    3

#to find rows that appear in first or second dataset AND removing duplicates
#union(first data, second data)
union = union(first2, second2)
head(union, 2)

##   mpg cyl disp  hp drat    wt  qsec vs am gear carb
## 1  21   6  160 110  3.9 2.620 16.46  0  1    4    4
## 2  21   6  160 110  3.9 2.875 17.02  0  1    4    4

#To find rows that appear in first dataset and not in second
#setdiff(first data, second data)
setdiff1 = setdiff(first2, second2)
head(setdiff1, 2)

##   mpg cyl disp  hp drat    wt  qsec vs am gear carb
## 1  21   6  160 110  3.9 2.620 16.46  0  1    4    4
## 2  21   6  160 110  3.9 2.875 17.02  0  1    4    4

#To work row wise, row names are required...
#First check whether rownames are present or not
#has_rownames(data)
has_rownames(mtcars)

## [1] TRUE

#see head
#head(data, 2)
head(mtcars, 2)

##               mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4      21   6  160 110  3.9 2.620 16.46  0  1    4    4
## Mazda RX4 Wag  21   6  160 110  3.9 2.875 17.02  0  1    4    4

#to remove rownames
#remove_rownames(data)
removed = remove_rownames(mtcars)
head(removed, 2)

##   mpg cyl disp  hp drat    wt  qsec vs am gear carb
## 1  21   6  160 110  3.9 2.620 16.46  0  1    4    4
## 2  21   6  160 110  3.9 2.875 17.02  0  1    4    4

#to add rownames
#rownames_to_column(data, var = "rowname")
row = rownames_to_column(mtcars, var = "rowname")
head(row, 2)

##         rowname mpg cyl disp  hp drat    wt  qsec vs am gear carb
## 1     Mazda RX4  21   6  160 110  3.9 2.620 16.46  0  1    4    4
## 2 Mazda RX4 Wag  21   6  160 110  3.9 2.875 17.02  0  1    4    4

#to add rowid
#rownames_to_column(mtcars, var = "rowid")
id = rowid_to_column(mtcars, var = "rowid")
head(id, 2)

##   rowid mpg cyl disp  hp drat    wt  qsec vs am gear carb
## 1     1  21   6  160 110  3.9 2.620 16.46  0  1    4    4
## 2     2  21   6  160 110  3.9 2.875 17.02  0  1    4    4

####MANIPULATE VARIABLES/ COLUMNS####
#THE GENERIC CODE IS fun(data, argument)
#the various functions are select(labels, starts_with, ends_with, contains, matches, arrange, drop)
#mutate, transmute, add_column, rename, bind_cols
#To extract column variables.. return a set of columns as a new vector or table
#pull(data, column label)
pull(mtcars, cyl)

##  [1] 6 6 4 6 8 6 8 4 4 6 6 8 8 8 8 8 8 4 4 4 4 8 8 8 8 4 4 4 8 6 8 4

#To select columns wherein researcher is interested

#select(data, column labels)
selection = select(mtcars, cyl, disp)
head(selection, 2)

##               cyl disp
## Mazda RX4       6  160
## Mazda RX4 Wag   6  160

#use of "starts with" function
#select(data, starts_with("column labels"))
start = select(mtcars, starts_with("cy"), starts_with("di"))
head(start, 3)

##               cyl disp
## Mazda RX4       6  160
## Mazda RX4 Wag   6  160
## Datsun 710      4  108

#use of "ends with" function
#select(data, ends_with("column labels"))
end = select(mtcars, ends_with("yl"), starts_with("di"))
head(end, 4)

##                cyl disp
## Mazda RX4        6  160
## Mazda RX4 Wag    6  160
## Datsun 710       4  108
## Hornet 4 Drive   6  258

#use of "contains/ matches" function
#select(data, contains("column labels"))
contain = select(mtcars, contains("cy"))
head(contain, 2)

##               cyl
## Mazda RX4       6
## Mazda RX4 Wag   6

#select(data, matches("column labels"))
contain1 = select(mtcars, matches("cy"))
head(contain1, 3)

##               cyl
## Mazda RX4       6
## Mazda RX4 Wag   6
## Datsun 710      4

#to move a variable in front of dataset
#select(data, variable to be in front, everything())
new = select(mtcars, disp, everything())
head(new, 2)

##               disp mpg cyl  hp drat    wt  qsec vs am gear carb
## Mazda RX4      160  21   6 110  3.9 2.620 16.46  0  1    4    4
## Mazda RX4 Wag  160  21   6 110  3.9 2.875 17.02  0  1    4    4

#to move a variable at end of dataset... use negative sign
#select(data, -variable to be at  end, everything())
new1 = select(mtcars, -disp, everything())
head(new1, 2)

##               mpg cyl  hp drat    wt  qsec vs am gear carb disp
## Mazda RX4      21   6 110  3.9 2.620 16.46  0  1    4    4  160
## Mazda RX4 Wag  21   6 110  3.9 2.875 17.02  0  1    4    4  160

#to drop a variable from dataset.. use negative sign
drop = select(mtcars, -disp)
head(drop, 2)

##               mpg cyl  hp drat    wt  qsec vs am gear carb
## Mazda RX4      21   6 110  3.9 2.620 16.46  0  1    4    4
## Mazda RX4 Wag  21   6 110  3.9 2.875 17.02  0  1    4    4

#To make new column AND retain the origional
#mutate(data, new column = function)
new3 = mutate(mtcars, gpm = 1/mpg)
head(new3, 2)

##   mpg cyl disp  hp drat    wt  qsec vs am gear carb        gpm
## 1  21   6  160 110  3.9 2.620 16.46  0  1    4    4 0.04761905
## 2  21   6  160 110  3.9 2.875 17.02  0  1    4    4 0.04761905

#to make a new column without retaining origional
#transmutate(data, new column = function)
new4 = transmute(mtcars, gpm = 1/mpg)
head(new4, 2)

##          gpm
## 1 0.04761905
## 2 0.04761905

#to add new column
#add_column(data, new column label = function)
new_column = add_column(mtcars, newID = 1:32)
head(new_column, 2)

##               mpg cyl disp  hp drat    wt  qsec vs am gear carb newID
## Mazda RX4      21   6  160 110  3.9 2.620 16.46  0  1    4    4     1
## Mazda RX4 Wag  21   6  160 110  3.9 2.875 17.02  0  1    4    4     2

#to rename a column
#rename(data, new label = old label)
renamed = rename(mtcars, MPG = mpg)
head(renamed, 2)

##               MPG cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4      21   6  160 110  3.9 2.620 16.46  0  1    4    4
## Mazda RX4 Wag  21   6  160 110  3.9 2.875 17.02  0  1    4    4

#to combine dataframes column wise
#To understand lets first break data mtcars into two dataframes
#mtcars data has 32 rows and 11 colums (determined using command "dim(mtcars)")
first = mtcars[,1:5]
dim(first)

## [1] 32  5

second = mtcars[, 6:11]
dim(second)

## [1] 32  6

#now we have two datasets (first and second), but we need to have a single dataset for analysis
#to make a single data from two dataframes:-
#bind_cols(data1, data2)
binder = bind_cols(first, second)
dim(binder)

## [1] 32 11

Data Transformation using dplyr package in R

Dr Gurpreet Singh, Dr Biju Soman

27/08/2019