# import library
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.2.5
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Warning: package 'ggplot2' was built under R version 3.2.5
## Warning: package 'tibble' was built under R version 3.2.5
## Warning: package 'tidyr' was built under R version 3.2.5
## Warning: package 'readr' was built under R version 3.2.5
## Warning: package 'purrr' was built under R version 3.2.5
## Warning: package 'dplyr' was built under R version 3.2.5
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
This demonstration will work with the iris data set (which ships with R's built-in datasets package) as well as other datasets along the way
Using gather to transform wide data to long data
# Work on a copy of the built-in iris data
dataset <- datasets::iris
# Preview the first six rows of the dataset
head(dataset, n = 6L)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# `gather()` tidies wide data into long data: the four measurement columns are
# stacked into a key column (Mesuare_Type) and a value column (Values).
# An ID column is added first so that every original row stays identifiable;
# it is what allows the long data to be spread back to wide form afterwards.
# row_number() is called without an argument so the ID is the plain row
# position and does not depend on the ordering (or missingness) of Species.
dataset_gather <- dataset %>%
  dplyr::mutate(ID = row_number()) %>%
  tidyr::gather(key = Mesuare_Type, value = Values, Sepal.Length:Petal.Width)
head(dataset_gather)
## Species ID Mesuare_Type Values
## 1 setosa 1 Sepal.Length 5.1
## 2 setosa 2 Sepal.Length 4.9
## 3 setosa 3 Sepal.Length 4.7
## 4 setosa 4 Sepal.Length 4.6
## 5 setosa 5 Sepal.Length 5.0
## 6 setosa 6 Sepal.Length 5.4
# Reverse the gather step: every distinct Mesuare_Type value becomes its own
# column again, filled from Values. ID keeps the original observations apart
# while spreading and is dropped once the wide layout is restored.
dataset_spread <- dataset_gather %>%
  dplyr::group_by(ID) %>%
  tidyr::spread(key = Mesuare_Type, value = Values) %>%
  dplyr::ungroup() %>%
  dplyr::select(-ID)
# Preview the first 6 rows of dataset_spread
head(dataset_spread)
## # A tibble: 6 x 5
## Species Petal.Length Petal.Width Sepal.Length Sepal.Width
## <fctr> <dbl> <dbl> <dbl> <dbl>
## 1 setosa 1.4 0.2 5.1 3.5
## 2 setosa 1.4 0.2 4.9 3.0
## 3 setosa 1.3 0.2 4.7 3.2
## 4 setosa 1.5 0.2 4.6 3.1
## 5 setosa 1.4 0.2 5.0 3.6
## 6 setosa 1.7 0.4 5.4 3.9
# `unite()` collapses two variables into a single variable: Species and
# Petal.Length are joined with "_" into the new Species_Length column, which
# replaces the two source columns.
dataset_unite <- tidyr::unite(
  dataset,
  col = Species_Length,
  Species, Petal.Length,
  sep = "_"
)
# Inspect the result
head(dataset_unite)
## Sepal.Length Sepal.Width Species_Length Petal.Width
## 1 5.1 3.5 setosa_1.4 0.2
## 2 4.9 3.0 setosa_1.4 0.2
## 3 4.7 3.2 setosa_1.3 0.2
## 4 4.6 3.1 setosa_1.5 0.2
## 5 5.0 3.6 setosa_1.4 0.2
## 6 5.4 3.9 setosa_1.7 0.4
# In some cases one may want to split a combined column back into two columns.
# `separate()` produces character columns by default, so convert = TRUE is
# passed to turn Petal.Length back into a number instead of leaving it as text
# (Species stays a character column).
dataset_seperate <- dataset_unite %>%
  tidyr::separate(
    Species_Length,
    into = c("Species", "Petal.Length"),
    sep = "_",
    convert = TRUE
  ) %>%
  dplyr::select(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width, Species)
# The select() call only restores the original column order; there is nothing
# special about it.
# Check the data
head(dataset_seperate)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa