#Data Transformation using dplyr package in R
#Aim: To obtain a clean and tidy data wherein each variable has one column and each observation is its own row
#Kindly note: dataset "mtcars" has been used as example
#Command language has been annotated before example for better understanding
#load library
#library(name of the library)
library(tidyverse)
## -- Attaching packages ----------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.1 v purrr 0.3.2
## v tibble 2.1.1 v dplyr 0.8.0.1
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts -------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
####KEY FUNCTIONS IN dplyr####
#Summarise cases
#Group cases
#manipulate cases
#manipulate variables
####SUMMARISE CASES####
#To use summarise function, default code is
#summarise(data, label = function(variable))
#various functions are mean, mad, SD, median, IQR, mad, min, max, first, last, nth position, count and n_distinct
#Measures of central tendency
#summarise(data, label = function(variable))
summarise(mtcars,
mean = mean(disp),
median = median(disp))
## mean median
## 1 230.7219 196.3
#Measures of dispersion
#summarise(data, label = function(variable))
summarise(mtcars,
SD = sd(disp),
IQR = IQR(disp),
MAD = mad(disp),
min = min(disp),
max = max(disp))
## SD IQR MAD min max
## 1 123.9387 205.175 140.4764 71.1 472
#Positional measures..
#Note.. arrange data in ascending or descending order
#summarise(data, label = function(variable))
summarise(mtcars, first = first(disp),
last = last(disp))
## first last
## 1 160 121
#to find quantiles
quantile(mtcars$disp)
## 0% 25% 50% 75% 100%
## 71.100 120.825 196.300 326.000 472.000
#counts
#summarise(data, label = function(variable))
summarise(mtcars, count = n(),
unique = n_distinct(disp))
## count unique
## 1 32 27
#to count number of observations in each group..count(data, variable)
#count(data, variable)
count(mtcars, cyl)
## # A tibble: 3 x 2
## cyl n
## <dbl> <int>
## 1 4 11
## 2 6 7
## 3 8 14
####GROUP CASES####
#to group results
#summarise(data, label = function(variable))
mtcars %>%
group_by(cyl) %>%
summarise(mean = mean(disp))
## # A tibble: 3 x 2
## cyl mean
## <dbl> <dbl>
## 1 4 105.
## 2 6 183.
## 3 8 353.
#Alternatively, create a copy of dataframe as grouped table
#new data = data %>% group_by(variable)
gp_mtcars = mtcars %>%
group_by(cyl)
#Now using summary function.. group wise results will be obtained
#data %>% summarise(label = function(variable))
gp_mtcars %>%
summarise(mean=mean(disp))
## # A tibble: 3 x 2
## cyl mean
## <dbl> <dbl>
## 1 4 105.
## 2 6 183.
## 3 8 353.
####MANIPULATE CASES/ ROWS####
#In manipulate function, we can extract, arrange and add cases/ ROWS
#THE GENERIC CODE IS fun(data, argument)
#various functions under manipulate cases are
# filter, distinct, sample_frac, sample_n, arrange, add_rows,
#bind_rows, intersect, union, setdiff, has_rownames, remove_rownames and rownames_to_column
#To extract a subset of rows meeting a logic
#filter(data, variable >or<or==or!=or>=or<= logic)
cyl_mtcars = filter(mtcars, cyl==6)
cyl_mtcars
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 2 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## 3 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## 4 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## 5 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## 6 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## 7 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
#To extract a subset of rows without duplicate values..use function "distinct"
#as distinct(data, unique variable).. add ".keep_all = TRUE" to keep all variables
example_dupli = distinct(mtcars, cyl, .keep_all = TRUE)
example_dupli
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 21.0 6 160 110 3.90 2.62 16.46 0 1 4 4
## 2 22.8 4 108 93 3.85 2.32 18.61 1 1 4 1
## 3 18.7 8 360 175 3.15 3.44 17.02 0 0 3 2
#To randomly extract a sample of rows (by fraction of dataset )
#sample_frac(data, proportion to be sampled).. Add "replace = TRUE" for sample with replacement
sample_frac(mtcars, 0.1, replace = TRUE)
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 22.8 4 108 93 3.85 2.32 18.61 1 1 4 1
## 2 14.3 8 360 245 3.21 3.57 15.84 0 0 3 4
## 3 13.3 8 350 245 3.73 3.84 15.41 0 0 3 4
#We can add sampling weights also..They are automatically standardised to sum to 1
#sample_frac(data, proportion to be sampled, weight = NULLor__)
sample_frac(mtcars, 0.1, weight = 1 / mpg)
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## 2 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## 3 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
#To randomly extract a sample of rows (by mentioning number of rows required)
#sample_n(data, number of rows, replace = TRUE, weight = NULL)
sample_n(mtcars, 2, replace = TRUE, weight = NULL)
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 15.2 8 275.8 180 3.07 3.78 18.0 0 0 3 3
## 2 22.8 4 140.8 95 3.92 3.15 22.9 1 0 4 2
#to arrange a variable in ascending order
#arrange (data, variable)
arrange(mtcars, cyl)
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## 2 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## 3 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## 4 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## 5 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## 6 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## 7 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## 8 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## 9 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## 10 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## 11 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
## 12 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 13 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## 14 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## 15 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## 16 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## 17 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## 18 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## 19 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## 20 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## 21 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## 22 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## 23 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## 24 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## 25 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## 26 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## 27 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## 28 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## 29 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## 30 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## 31 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## 32 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
#to arrange a variable in descending order
#arrange(data, desc(variable))
arrange(mtcars, desc(cyl))
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## 2 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## 3 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## 4 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## 5 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## 6 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## 7 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## 8 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## 9 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## 10 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## 11 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## 12 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## 13 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## 14 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## 15 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 16 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## 17 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## 18 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## 19 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## 20 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## 21 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## 22 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## 23 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## 24 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## 25 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## 26 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## 27 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## 28 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## 29 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## 30 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## 31 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## 32 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
#to add cases/ rows
#add_row(data, observation details)
add_row(mtcars, cyl = 2, wt = 1, gear = 2)
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 2 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## 3 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## 4 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## 5 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## 6 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## 7 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## 8 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## 9 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## 10 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## 11 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## 12 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## 13 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## 14 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## 15 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## 16 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## 17 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## 18 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## 19 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## 20 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## 21 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## 22 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## 23 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## 24 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## 25 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## 26 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## 27 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## 28 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## 29 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## 30 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## 31 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## 32 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
## 33 NA 2 NA NA NA 1.000 NA NA NA 2 NA
#to combine data frames row wise
#To understand lets first break data mtcars into two dataframes
#mtcars data has 32 rows and 11 colums (determined using command "dim(mtcars)")
first1 = mtcars[1:14, ]
dim(first1)
## [1] 14 11
second1 = mtcars[15:32, ]
dim(second1)
## [1] 18 11
#now we have two datasets (first and second), but we need to have a single dataset for analysis
#to make a single data from two dataframes:-
#bind_cols(data1, data2)
binder1 = bind_rows(first1, second1)
dim(binder1)
## [1] 32 11
#NOTE: WHEN COMBINING DATASETS, ENSURE THAT THE ROWS/ COLUMNS ARE ALIGNED IN BOTH DATASETS FIRST FOR ACCURATE OUTPUT
#To find rows that appear in both data frames
#intersect(first data, second data)
first2 <- mtcars[1:20, ]
second2 <- mtcars[10:32, ]
intersect = intersect(first2, second2)
head(intersect, 3)
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 19.2 6 167.6 123 3.92 3.44 18.3 1 0 4 4
## 2 17.8 6 167.6 123 3.92 3.44 18.9 1 0 4 4
## 3 16.4 8 275.8 180 3.07 4.07 17.4 0 0 3 3
#to find rows that appear in first or second dataset AND removing duplicates
#union(first data, second data)
union = union(first2, second2)
head(union, 2)
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 21 6 160 110 3.9 2.620 16.46 0 1 4 4
## 2 21 6 160 110 3.9 2.875 17.02 0 1 4 4
#To find rows that appear in first dataset and not in second
#setdiff(first data, second data)
setdiff1 = setdiff(first2, second2)
head(setdiff1, 2)
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 21 6 160 110 3.9 2.620 16.46 0 1 4 4
## 2 21 6 160 110 3.9 2.875 17.02 0 1 4 4
#To work row wise, row names are required...
#First check whether rownames are present or not
#has_rownames(data)
has_rownames(mtcars)
## [1] TRUE
#see head
#head(data, 2)
head(mtcars, 2)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21 6 160 110 3.9 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21 6 160 110 3.9 2.875 17.02 0 1 4 4
#to remove rownames
#remove_rownames(data)
removed = remove_rownames(mtcars)
head(removed, 2)
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 21 6 160 110 3.9 2.620 16.46 0 1 4 4
## 2 21 6 160 110 3.9 2.875 17.02 0 1 4 4
#to add rownames
#rownames_to_column(data, var = "rowname")
row = rownames_to_column(mtcars, var = "rowname")
head(row, 2)
## rowname mpg cyl disp hp drat wt qsec vs am gear carb
## 1 Mazda RX4 21 6 160 110 3.9 2.620 16.46 0 1 4 4
## 2 Mazda RX4 Wag 21 6 160 110 3.9 2.875 17.02 0 1 4 4
#to add rowid
#rownames_to_column(mtcars, var = "rowid")
id = rowid_to_column(mtcars, var = "rowid")
head(id, 2)
## rowid mpg cyl disp hp drat wt qsec vs am gear carb
## 1 1 21 6 160 110 3.9 2.620 16.46 0 1 4 4
## 2 2 21 6 160 110 3.9 2.875 17.02 0 1 4 4
####MANIPULATE VARIABLES/ COLUMNS####
#THE GENERIC CODE IS fun(data, argument)
#the various functions are select(labels, starts_with, ends_with, contains, matches, arrange, drop)
#mutate, transmute, add_column, rename, bind_cols
#To extract column variables.. return a set of columns as a new vector or table
#pull(data, column label)
pull(mtcars, cyl)
## [1] 6 6 4 6 8 6 8 4 4 6 6 8 8 8 8 8 8 4 4 4 4 8 8 8 8 4 4 4 8 6 8 4
#To select columns wherein researcher is interested
#select(data, column labels)
selection = select(mtcars, cyl, disp)
head(selection, 2)
## cyl disp
## Mazda RX4 6 160
## Mazda RX4 Wag 6 160
#use of "starts with" function
#select(data, starts_with("column labels"))
start = select(mtcars, starts_with("cy"), starts_with("di"))
head(start, 3)
## cyl disp
## Mazda RX4 6 160
## Mazda RX4 Wag 6 160
## Datsun 710 4 108
#use of "ends with" function
#select(data, ends_with("column labels"))
end = select(mtcars, ends_with("yl"), starts_with("di"))
head(end, 4)
## cyl disp
## Mazda RX4 6 160
## Mazda RX4 Wag 6 160
## Datsun 710 4 108
## Hornet 4 Drive 6 258
#use of "contains/ matches" function
#select(data, contains("column labels"))
contain = select(mtcars, contains("cy"))
head(contain, 2)
## cyl
## Mazda RX4 6
## Mazda RX4 Wag 6
#select(data, matches("column labels"))
contain1 = select(mtcars, matches("cy"))
head(contain1, 3)
## cyl
## Mazda RX4 6
## Mazda RX4 Wag 6
## Datsun 710 4
#to move a variable in front of dataset
#select(data, variable to be in front, everything())
new = select(mtcars, disp, everything())
head(new, 2)
## disp mpg cyl hp drat wt qsec vs am gear carb
## Mazda RX4 160 21 6 110 3.9 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 160 21 6 110 3.9 2.875 17.02 0 1 4 4
#to move a variable at end of dataset... use negative sign
#select(data, -variable to be at end, everything())
new1 = select(mtcars, -disp, everything())
head(new1, 2)
## mpg cyl hp drat wt qsec vs am gear carb disp
## Mazda RX4 21 6 110 3.9 2.620 16.46 0 1 4 4 160
## Mazda RX4 Wag 21 6 110 3.9 2.875 17.02 0 1 4 4 160
#to drop a variable from dataset.. use negative sign
drop = select(mtcars, -disp)
head(drop, 2)
## mpg cyl hp drat wt qsec vs am gear carb
## Mazda RX4 21 6 110 3.9 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21 6 110 3.9 2.875 17.02 0 1 4 4
#To make new column AND retain the origional
#mutate(data, new column = function)
new3 = mutate(mtcars, gpm = 1/mpg)
head(new3, 2)
## mpg cyl disp hp drat wt qsec vs am gear carb gpm
## 1 21 6 160 110 3.9 2.620 16.46 0 1 4 4 0.04761905
## 2 21 6 160 110 3.9 2.875 17.02 0 1 4 4 0.04761905
#to make a new column without retaining origional
#transmutate(data, new column = function)
new4 = transmute(mtcars, gpm = 1/mpg)
head(new4, 2)
## gpm
## 1 0.04761905
## 2 0.04761905
#to add new column
#add_column(data, new column label = function)
new_column = add_column(mtcars, newID = 1:32)
head(new_column, 2)
## mpg cyl disp hp drat wt qsec vs am gear carb newID
## Mazda RX4 21 6 160 110 3.9 2.620 16.46 0 1 4 4 1
## Mazda RX4 Wag 21 6 160 110 3.9 2.875 17.02 0 1 4 4 2
#to rename a column
#rename(data, new label = old label)
renamed = rename(mtcars, MPG = mpg)
head(renamed, 2)
## MPG cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21 6 160 110 3.9 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21 6 160 110 3.9 2.875 17.02 0 1 4 4
#to combine dataframes column wise
#To understand lets first break data mtcars into two dataframes
#mtcars data has 32 rows and 11 colums (determined using command "dim(mtcars)")
first = mtcars[,1:5]
dim(first)
## [1] 32 5
second = mtcars[, 6:11]
dim(second)
## [1] 32 6
#now we have two datasets (first and second), but we need to have a single dataset for analysis
#to make a single data from two dataframes:-
#bind_cols(data1, data2)
binder = bind_cols(first, second)
dim(binder)
## [1] 32 11