Pivot Data, Wide to Long Format

In this exercise we are going to practice data reshaping from Wide format to Long format

Import Library

library(tidyr) 
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2) 
library(readr)

# Simplest case where column names are character data

PivotWideData <- read.csv("PivotWideData.csv") 
head(PivotWideData)

##   Project_Name Jan_Q1 Jan_Q2 Jan_Q3 Jan_Q4 Feb_Q1 Feb_Q2 Feb_Q3 Feb_Q4
## 1    Project_1   1030   4530   2433   2914   4802   3435   2978   4048
## 2    Project_2   3653   4961   4370   4050   2294   1089   3642   1296
## 3    Project_3   1818   1166   3372   3713   4008   3787   1398   1562
## 4    Project_4   1560   4260   1613   4268   4026   1123   1209   3116
## 5    Project_5   3950   1694   2316   2030   3587   1828   3692   3652
## 6    Project_6   2234   4210   1014   3135   3282   2071   4550   4102

PivotWideData |>    # import data from directory   
  pivot_longer(     
    -Project_Name,  # pivot everything apart from project name column     
    names_to = c("month_name", "qtr_name"), # new column name will be      
    names_sep = "_"   )

## # A tibble: 80 × 4
##    Project_Name month_name qtr_name value
##    <chr>        <chr>      <chr>    <int>
##  1 Project_1    Jan        Q1        1030
##  2 Project_1    Jan        Q2        4530
##  3 Project_1    Jan        Q3        2433
##  4 Project_1    Jan        Q4        2914
##  5 Project_1    Feb        Q1        4802
##  6 Project_1    Feb        Q2        3435
##  7 Project_1    Feb        Q3        2978
##  8 Project_1    Feb        Q4        4048
##  9 Project_2    Jan        Q1        3653
## 10 Project_2    Jan        Q2        4961
## # ℹ 70 more rows

# Slightly more complex case where columns have common prefix and missing missings are structural so should be dropped.

head(billboard)

## # A tibble: 6 × 79
##   artist      track date.entered   wk1   wk2   wk3   wk4   wk5   wk6   wk7   wk8
##   <chr>       <chr> <date>       <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2 Pac       Baby… 2000-02-26      87    82    72    77    87    94    99    NA
## 2 2Ge+her     The … 2000-09-02      91    87    92    NA    NA    NA    NA    NA
## 3 3 Doors Do… Kryp… 2000-04-08      81    70    68    67    66    57    54    53
## 4 3 Doors Do… Loser 2000-10-21      76    76    72    69    67    65    55    59
## 5 504 Boyz    Wobb… 2000-04-15      57    34    25    17    17    31    36    49
## 6 98^0        Give… 2000-08-19      51    39    34    26    26    19     2     2
## # ℹ 68 more variables: wk9 <dbl>, wk10 <dbl>, wk11 <dbl>, wk12 <dbl>,
## #   wk13 <dbl>, wk14 <dbl>, wk15 <dbl>, wk16 <dbl>, wk17 <dbl>, wk18 <dbl>,
## #   wk19 <dbl>, wk20 <dbl>, wk21 <dbl>, wk22 <dbl>, wk23 <dbl>, wk24 <dbl>,
## #   wk25 <dbl>, wk26 <dbl>, wk27 <dbl>, wk28 <dbl>, wk29 <dbl>, wk30 <dbl>,
## #   wk31 <dbl>, wk32 <dbl>, wk33 <dbl>, wk34 <dbl>, wk35 <dbl>, wk36 <dbl>,
## #   wk37 <dbl>, wk38 <dbl>, wk39 <dbl>, wk40 <dbl>, wk41 <dbl>, wk42 <dbl>,
## #   wk43 <dbl>, wk44 <dbl>, wk45 <dbl>, wk46 <dbl>, wk47 <dbl>, wk48 <dbl>, …

billboard |>    
  pivot_longer(     
    cols = starts_with("wk"), # Columns to pivot into longer format     
    names_to = "week",     
    values_to = "rank",     
    values_drop_na = TRUE)

## # A tibble: 5,307 × 5
##    artist  track                   date.entered week   rank
##    <chr>   <chr>                   <date>       <chr> <dbl>
##  1 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk1      87
##  2 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk2      82
##  3 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk3      72
##  4 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk4      77
##  5 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk5      87
##  6 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk6      94
##  7 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk7      99
##  8 2Ge+her The Hardest Part Of ... 2000-09-02   wk1      91
##  9 2Ge+her The Hardest Part Of ... 2000-09-02   wk2      87
## 10 2Ge+her The Hardest Part Of ... 2000-09-02   wk3      92
## # ℹ 5,297 more rows

# Multiple variables stored in column names

head(who)

## # A tibble: 6 × 60
##   country   iso2  iso3   year new_sp_m014 new_sp_m1524 new_sp_m2534 new_sp_m3544
##   <chr>     <chr> <chr> <dbl>       <dbl>        <dbl>        <dbl>        <dbl>
## 1 Afghanis… AF    AFG    1980          NA           NA           NA           NA
## 2 Afghanis… AF    AFG    1981          NA           NA           NA           NA
## 3 Afghanis… AF    AFG    1982          NA           NA           NA           NA
## 4 Afghanis… AF    AFG    1983          NA           NA           NA           NA
## 5 Afghanis… AF    AFG    1984          NA           NA           NA           NA
## 6 Afghanis… AF    AFG    1985          NA           NA           NA           NA
## # ℹ 52 more variables: new_sp_m4554 <dbl>, new_sp_m5564 <dbl>,
## #   new_sp_m65 <dbl>, new_sp_f014 <dbl>, new_sp_f1524 <dbl>,
## #   new_sp_f2534 <dbl>, new_sp_f3544 <dbl>, new_sp_f4554 <dbl>,
## #   new_sp_f5564 <dbl>, new_sp_f65 <dbl>, new_sn_m014 <dbl>,
## #   new_sn_m1524 <dbl>, new_sn_m2534 <dbl>, new_sn_m3544 <dbl>,
## #   new_sn_m4554 <dbl>, new_sn_m5564 <dbl>, new_sn_m65 <dbl>,
## #   new_sn_f014 <dbl>, new_sn_f1524 <dbl>, new_sn_f2534 <dbl>, …

who |>    
  pivot_longer(     
    cols = new_sp_m014:newrel_f65,     
    names_to = c("diagnosis", "gender","age"),     
    names_pattern = "new_?(.*)_(.)(.*)",     
    values_to = "count"   )

## # A tibble: 405,440 × 8
##    country     iso2  iso3   year diagnosis gender age   count
##    <chr>       <chr> <chr> <dbl> <chr>     <chr>  <chr> <dbl>
##  1 Afghanistan AF    AFG    1980 sp        m      014      NA
##  2 Afghanistan AF    AFG    1980 sp        m      1524     NA
##  3 Afghanistan AF    AFG    1980 sp        m      2534     NA
##  4 Afghanistan AF    AFG    1980 sp        m      3544     NA
##  5 Afghanistan AF    AFG    1980 sp        m      4554     NA
##  6 Afghanistan AF    AFG    1980 sp        m      5564     NA
##  7 Afghanistan AF    AFG    1980 sp        m      65       NA
##  8 Afghanistan AF    AFG    1980 sp        f      014      NA
##  9 Afghanistan AF    AFG    1980 sp        f      1524     NA
## 10 Afghanistan AF    AFG    1980 sp        f      2534     NA
## # ℹ 405,430 more rows

# Multiple observations per row. Since all columns are used in the pivoting process, we’ll use `cols_vary` to keep values from the original columns close together in the output.

head(anscombe)

##   x1 x2 x3 x4   y1   y2    y3   y4
## 1 10 10 10  8 8.04 9.14  7.46 6.58
## 2  8  8  8  8 6.95 8.14  6.77 5.76
## 3 13 13 13  8 7.58 8.74 12.74 7.71
## 4  9  9  9  8 8.81 8.77  7.11 8.84
## 5 11 11 11  8 8.33 9.26  7.81 8.47
## 6 14 14 14  8 9.96 8.10  8.84 7.04

View(anscombe) 
anscombe |>    
  pivot_longer(     
    everything()   
)

## # A tibble: 88 × 2
##    name  value
##    <chr> <dbl>
##  1 x1    10   
##  2 x2    10   
##  3 x3    10   
##  4 x4     8   
##  5 y1     8.04
##  6 y2     9.14
##  7 y3     7.46
##  8 y4     6.58
##  9 x1     8   
## 10 x2     8   
## # ℹ 78 more rows

{r} anscombe |>} pivot_longer( everything(), names_to = c(".value", "set"), names_pattern = "(.)(.)" )

Pivot Data, Wide to Long Format

Angad

2024-01-31

Import Library