In this exercise we are going to practice data reshaping from Wide format to Long format
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(readr)
# Simplest case where column names are character data
PivotWideData <- read.csv("PivotWideData.csv")
head(PivotWideData)
## Project_Name Jan_Q1 Jan_Q2 Jan_Q3 Jan_Q4 Feb_Q1 Feb_Q2 Feb_Q3 Feb_Q4
## 1 Project_1 1030 4530 2433 2914 4802 3435 2978 4048
## 2 Project_2 3653 4961 4370 4050 2294 1089 3642 1296
## 3 Project_3 1818 1166 3372 3713 4008 3787 1398 1562
## 4 Project_4 1560 4260 1613 4268 4026 1123 1209 3116
## 5 Project_5 3950 1694 2316 2030 3587 1828 3692 3652
## 6 Project_6 2234 4210 1014 3135 3282 2071 4550 4102
PivotWideData |> # import data from directory
pivot_longer(
-Project_Name, # pivot everything apart from project name column
names_to = c("month_name", "qtr_name"), # new column name will be
names_sep = "_" )
## # A tibble: 80 × 4
## Project_Name month_name qtr_name value
## <chr> <chr> <chr> <int>
## 1 Project_1 Jan Q1 1030
## 2 Project_1 Jan Q2 4530
## 3 Project_1 Jan Q3 2433
## 4 Project_1 Jan Q4 2914
## 5 Project_1 Feb Q1 4802
## 6 Project_1 Feb Q2 3435
## 7 Project_1 Feb Q3 2978
## 8 Project_1 Feb Q4 4048
## 9 Project_2 Jan Q1 3653
## 10 Project_2 Jan Q2 4961
## # ℹ 70 more rows
# Slightly more complex case where columns have common prefix and missing missings are structural so should be dropped.
head(billboard)
## # A tibble: 6 × 79
## artist track date.entered wk1 wk2 wk3 wk4 wk5 wk6 wk7 wk8
## <chr> <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2 Pac Baby… 2000-02-26 87 82 72 77 87 94 99 NA
## 2 2Ge+her The … 2000-09-02 91 87 92 NA NA NA NA NA
## 3 3 Doors Do… Kryp… 2000-04-08 81 70 68 67 66 57 54 53
## 4 3 Doors Do… Loser 2000-10-21 76 76 72 69 67 65 55 59
## 5 504 Boyz Wobb… 2000-04-15 57 34 25 17 17 31 36 49
## 6 98^0 Give… 2000-08-19 51 39 34 26 26 19 2 2
## # ℹ 68 more variables: wk9 <dbl>, wk10 <dbl>, wk11 <dbl>, wk12 <dbl>,
## # wk13 <dbl>, wk14 <dbl>, wk15 <dbl>, wk16 <dbl>, wk17 <dbl>, wk18 <dbl>,
## # wk19 <dbl>, wk20 <dbl>, wk21 <dbl>, wk22 <dbl>, wk23 <dbl>, wk24 <dbl>,
## # wk25 <dbl>, wk26 <dbl>, wk27 <dbl>, wk28 <dbl>, wk29 <dbl>, wk30 <dbl>,
## # wk31 <dbl>, wk32 <dbl>, wk33 <dbl>, wk34 <dbl>, wk35 <dbl>, wk36 <dbl>,
## # wk37 <dbl>, wk38 <dbl>, wk39 <dbl>, wk40 <dbl>, wk41 <dbl>, wk42 <dbl>,
## # wk43 <dbl>, wk44 <dbl>, wk45 <dbl>, wk46 <dbl>, wk47 <dbl>, wk48 <dbl>, …
billboard |>
pivot_longer(
cols = starts_with("wk"), # Columns to pivot into longer format
names_to = "week",
values_to = "rank",
values_drop_na = TRUE)
## # A tibble: 5,307 × 5
## artist track date.entered week rank
## <chr> <chr> <date> <chr> <dbl>
## 1 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk1 87
## 2 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk2 82
## 3 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk3 72
## 4 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk4 77
## 5 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk5 87
## 6 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk6 94
## 7 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk7 99
## 8 2Ge+her The Hardest Part Of ... 2000-09-02 wk1 91
## 9 2Ge+her The Hardest Part Of ... 2000-09-02 wk2 87
## 10 2Ge+her The Hardest Part Of ... 2000-09-02 wk3 92
## # ℹ 5,297 more rows
# Multiple variables stored in column names
head(who)
## # A tibble: 6 × 60
## country iso2 iso3 year new_sp_m014 new_sp_m1524 new_sp_m2534 new_sp_m3544
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanis… AF AFG 1980 NA NA NA NA
## 2 Afghanis… AF AFG 1981 NA NA NA NA
## 3 Afghanis… AF AFG 1982 NA NA NA NA
## 4 Afghanis… AF AFG 1983 NA NA NA NA
## 5 Afghanis… AF AFG 1984 NA NA NA NA
## 6 Afghanis… AF AFG 1985 NA NA NA NA
## # ℹ 52 more variables: new_sp_m4554 <dbl>, new_sp_m5564 <dbl>,
## # new_sp_m65 <dbl>, new_sp_f014 <dbl>, new_sp_f1524 <dbl>,
## # new_sp_f2534 <dbl>, new_sp_f3544 <dbl>, new_sp_f4554 <dbl>,
## # new_sp_f5564 <dbl>, new_sp_f65 <dbl>, new_sn_m014 <dbl>,
## # new_sn_m1524 <dbl>, new_sn_m2534 <dbl>, new_sn_m3544 <dbl>,
## # new_sn_m4554 <dbl>, new_sn_m5564 <dbl>, new_sn_m65 <dbl>,
## # new_sn_f014 <dbl>, new_sn_f1524 <dbl>, new_sn_f2534 <dbl>, …
who |>
pivot_longer(
cols = new_sp_m014:newrel_f65,
names_to = c("diagnosis", "gender","age"),
names_pattern = "new_?(.*)_(.)(.*)",
values_to = "count" )
## # A tibble: 405,440 × 8
## country iso2 iso3 year diagnosis gender age count
## <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <dbl>
## 1 Afghanistan AF AFG 1980 sp m 014 NA
## 2 Afghanistan AF AFG 1980 sp m 1524 NA
## 3 Afghanistan AF AFG 1980 sp m 2534 NA
## 4 Afghanistan AF AFG 1980 sp m 3544 NA
## 5 Afghanistan AF AFG 1980 sp m 4554 NA
## 6 Afghanistan AF AFG 1980 sp m 5564 NA
## 7 Afghanistan AF AFG 1980 sp m 65 NA
## 8 Afghanistan AF AFG 1980 sp f 014 NA
## 9 Afghanistan AF AFG 1980 sp f 1524 NA
## 10 Afghanistan AF AFG 1980 sp f 2534 NA
## # ℹ 405,430 more rows
# Multiple observations per row. Since all columns are used in the pivoting process, we’ll use `cols_vary` to keep values from the original columns close together in the output.
head(anscombe)
## x1 x2 x3 x4 y1 y2 y3 y4
## 1 10 10 10 8 8.04 9.14 7.46 6.58
## 2 8 8 8 8 6.95 8.14 6.77 5.76
## 3 13 13 13 8 7.58 8.74 12.74 7.71
## 4 9 9 9 8 8.81 8.77 7.11 8.84
## 5 11 11 11 8 8.33 9.26 7.81 8.47
## 6 14 14 14 8 9.96 8.10 8.84 7.04
View(anscombe)
anscombe |>
pivot_longer(
everything()
)
## # A tibble: 88 × 2
## name value
## <chr> <dbl>
## 1 x1 10
## 2 x2 10
## 3 x3 10
## 4 x4 8
## 5 y1 8.04
## 6 y2 9.14
## 7 y3 7.46
## 8 y4 6.58
## 9 x1 8
## 10 x2 8
## # ℹ 78 more rows
{r} anscombe |>} pivot_longer( everything(), names_to = c(".value", "set"), names_pattern = "(.)(.)" )