Data

Potthoff and Roy (1964) reported data on a study in 16 boys and 11 girls, who at ages 8, 10, 12, and 14 had the distance (mm) from the center of the pituitary gland to the pteryomaxillary fissure measured. Changes in pituitary-pteryomaxillary distances during growth is important in orthodontic therapy.

Exercises - using the female data only. - using the male data only. - using both data from males and females

Girl data

We consider data from girls only here.

# require(pacman)
pacman::p_load(mice)
data(potthoffroy)
subset(potthoffroy, sex=='F')
   id sex   d8  d10  d12  d14
1   1   F 21.0 20.0 21.5 23.0
2   2   F 21.0 21.5 24.0 25.5
3   3   F 20.5 24.0 24.5 26.0
4   4   F 23.5 24.5 25.0 26.5
5   5   F 21.5 23.0 22.5 23.5
6   6   F 20.0 21.0 21.0 22.5
7   7   F 21.5 22.5 23.0 25.0
8   8   F 23.0 23.0 23.5 24.0
9   9   F 20.0 21.0 22.0 21.5
10 10   F 16.5 19.0 19.0 19.5
11 11   F 24.5 25.0 28.0 28.0

Files in a folder

We assume that there are 4 separate files in a local folder called tmp_data. (We will create them for you.)

list.files("./tmp_data/", pattern="f_")
[1] "f_1.csv" "f_2.csv" "f_3.csv" "f_4.csv"

The content of the first one looks like this.

read.csv("./tmp_data/f_1.csv")
   id   d8
1   1 21.0
2   2 21.0
3   3 20.5
4   4 23.5
5   5 21.5
6   6 20.0
7   7 21.5
8   8 23.0
9   9 20.0
10 10 16.5
11 11 24.5

Now collect the file names.

fls <- list.files(path = "./tmp_data", pattern = "f_")
fls
[1] "f_1.csv" "f_2.csv" "f_3.csv" "f_4.csv"

Remember to give files the full path to their location.

fL <- paste0("./tmp_data/", fls)

fL
[1] "./tmp_data/f_1.csv" "./tmp_data/f_2.csv" "./tmp_data/f_3.csv"
[4] "./tmp_data/f_4.csv"

Input multiple files

Input these files as a list of data frames

ff <- lapply(fL, read.csv)

Merge

We can merge two files by id.

merge(ff[1], ff[2])
   id   d8  d10
1   1 21.0 20.0
2   2 21.0 21.5
3   3 20.5 24.0
4   4 23.5 24.5
5   5 21.5 23.0
6   6 20.0 21.0
7   7 21.5 22.5
8   8 23.0 23.0
9   9 20.0 21.0
10 10 16.5 19.0
11 11 24.5 25.0

Reduce

The function Reduce allows us to ‘loop’ through the list of files with our own version of merge called mrg2.

# Roll our own merging function
mrg2 <- function(f1, f2){                                
  merge(f1, f2, by="id")
}
Reduce(mrg2, ff)
   id   d8  d10  d12  d14
1   1 21.0 20.0 21.5 23.0
2   2 21.0 21.5 24.0 25.5
3   3 20.5 24.0 24.5 26.0
4   4 23.5 24.5 25.0 26.5
5   5 21.5 23.0 22.5 23.5
6   6 20.0 21.0 21.0 22.5
7   7 21.5 22.5 23.0 25.0
8   8 23.0 23.0 23.5 24.0
9   9 20.0 21.0 22.0 21.5
10 10 16.5 19.0 19.0 19.5
11 11 24.5 25.0 28.0 28.0

Tidy approach

Instead of ‘merge’, ‘inner_join’ is used; instead of ‘Reduce’, ‘reduce’.

library(tidyverse)
ff |> reduce(inner_join, by='id')
   id   d8  d10  d12  d14
1   1 21.0 20.0 21.5 23.0
2   2 21.0 21.5 24.0 25.5
3   3 20.5 24.0 24.5 26.0
4   4 23.5 24.5 25.0 26.5
5   5 21.5 23.0 22.5 23.5
6   6 20.0 21.0 21.0 22.5
7   7 21.5 22.5 23.0 25.0
8   8 23.0 23.0 23.5 24.0
9   9 20.0 21.0 22.0 21.5
10 10 16.5 19.0 19.0 19.5
11 11 24.5 25.0 28.0 28.0

Vertical direction

We can ‘bind’ the input in the vertical direction to construct an output in the long format in contrast to the wide format of the original data.

# extract number of observations from components of the list
n <- sapply(ff, function(x) dim(x)[1])
# extract numbers from year variable
p <- sapply(ff, function(x) names(x)[2]) |> parse_number()
# convert list of data frames to matrices
ff <- lapply(ff, as.matrix)
# augment data with a new column variable 'year'
dtaL <- cbind(Reduce(rbind, ff), year=rep(p, n)) |> as.data.frame()
# renme the second column
names(dtaL)[2] <- "pp_distance"
head(dtaL, 13)
   id pp_distance year
1   1        21.0    8
2   2        21.0    8
3   3        20.5    8
4   4        23.5    8
5   5        21.5    8
6   6        20.0    8
7   7        21.5    8
8   8        23.0    8
9   9        20.0    8
10 10        16.5    8
11 11        24.5    8
12  1        20.0   10
13  2        21.5   10

Boy data

subset(potthoffroy, sex=='M')
   id sex   d8  d10  d12  d14
12 12   M 26.0 25.0 29.0 31.0
13 13   M 21.5 22.5 23.0 26.5
14 14   M 23.0 22.5 24.0 27.5
15 15   M 25.5 27.5 26.5 27.0
16 16   M 20.0 23.5 22.5 26.0
17 17   M 24.5 25.5 27.0 28.5
18 18   M 22.0 22.0 24.5 26.5
19 19   M 24.0 21.5 24.5 25.5
20 20   M 23.0 20.5 31.0 26.0
21 21   M 27.5 28.0 31.0 31.5
22 22   M 23.0 23.0 23.5 25.0
23 23   M 21.5 23.5 24.0 28.0
24 24   M 17.0 24.5 26.0 29.5
25 25   M 22.5 25.5 25.5 26.0
26 26   M 23.0 24.5 26.0 30.0
27 27   M 22.0 21.5 23.5 25.0
dir.create(file.path(getwd(), "./tmp2_data"), showWarnings=FALSE)

lapply(3:6, function(i) {
              write.csv(subset(potthoffroy, sex=='M')[, c(1, i)],
                        file=paste0("./tmp2_data/m_", i-2, ".csv"),
                        row.names=FALSE)
                      }
)
[[1]]
NULL

[[2]]
NULL

[[3]]
NULL

[[4]]
NULL

Files in a folder

list.files("./tmp2_data/", pattern="m_")
[1] "m_1.csv" "m_2.csv" "m_3.csv" "m_4.csv"
read.csv("./tmp2_data/m_1.csv")
   id   d8
1  12 26.0
2  13 21.5
3  14 23.0
4  15 25.5
5  16 20.0
6  17 24.5
7  18 22.0
8  19 24.0
9  20 23.0
10 21 27.5
11 22 23.0
12 23 21.5
13 24 17.0
14 25 22.5
15 26 23.0
16 27 22.0

Collect the file names.

fls <- list.files(path = "./tmp2_data", pattern = "m_")
fls
[1] "m_1.csv" "m_2.csv" "m_3.csv" "m_4.csv"

Give files the full path to their location.

fL <- paste0("./tmp2_data/", fls)
fL
[1] "./tmp2_data/m_1.csv" "./tmp2_data/m_2.csv" "./tmp2_data/m_3.csv"
[4] "./tmp2_data/m_4.csv"

Input multiple files

ff <- lapply(fL, read.csv)

Merge

merge(ff[1], ff[2])
   id   d8  d10
1  12 26.0 25.0
2  13 21.5 22.5
3  14 23.0 22.5
4  15 25.5 27.5
5  16 20.0 23.5
6  17 24.5 25.5
7  18 22.0 22.0
8  19 24.0 21.5
9  20 23.0 20.5
10 21 27.5 28.0
11 22 23.0 23.0
12 23 21.5 23.5
13 24 17.0 24.5
14 25 22.5 25.5
15 26 23.0 24.5
16 27 22.0 21.5

Reduce approach

library(dplyr)
# Roll our own merging function
mrg2 <- function(f1, f2){                                
  merge(f1, f2, by="id")
}

# as.data.frame is easier to read in Rstudio
dtaW<-Reduce(mrg2, ff)%>% as.data.frame()
# rename the second column
names(dtaW)[1] <- "ID"
#
str(dtaW)
'data.frame':   16 obs. of  5 variables:
 $ ID : int  12 13 14 15 16 17 18 19 20 21 ...
 $ d8 : num  26 21.5 23 25.5 20 24.5 22 24 23 27.5 ...
 $ d10: num  25 22.5 22.5 27.5 23.5 25.5 22 21.5 20.5 28 ...
 $ d12: num  29 23 24 26.5 22.5 27 24.5 24.5 31 31 ...
 $ d14: num  31 26.5 27.5 27 26 28.5 26.5 25.5 26 31.5 ...
head(dtaW, 16)
   ID   d8  d10  d12  d14
1  12 26.0 25.0 29.0 31.0
2  13 21.5 22.5 23.0 26.5
3  14 23.0 22.5 24.0 27.5
4  15 25.5 27.5 26.5 27.0
5  16 20.0 23.5 22.5 26.0
6  17 24.5 25.5 27.0 28.5
7  18 22.0 22.0 24.5 26.5
8  19 24.0 21.5 24.5 25.5
9  20 23.0 20.5 31.0 26.0
10 21 27.5 28.0 31.0 31.5
11 22 23.0 23.0 23.5 25.0
12 23 21.5 23.5 24.0 28.0
13 24 17.0 24.5 26.0 29.5
14 25 22.5 25.5 25.5 26.0
15 26 23.0 24.5 26.0 30.0
16 27 22.0 21.5 23.5 25.0

Tidy approach

library(tidyverse)
# as.data.frame is easier to read in Rstudio
dtaW2<-ff %>% reduce(inner_join, by='id')%>% as.data.frame()
# rename the second column
names(dtaW2)[1] <- "ID"
#
str(dtaW2)
'data.frame':   16 obs. of  5 variables:
 $ ID : int  12 13 14 15 16 17 18 19 20 21 ...
 $ d8 : num  26 21.5 23 25.5 20 24.5 22 24 23 27.5 ...
 $ d10: num  25 22.5 22.5 27.5 23.5 25.5 22 21.5 20.5 28 ...
 $ d12: num  29 23 24 26.5 22.5 27 24.5 24.5 31 31 ...
 $ d14: num  31 26.5 27.5 27 26 28.5 26.5 25.5 26 31.5 ...

Vertical direction

# convert list of data frames to matrices
ff <- lapply(ff, as.matrix)
# augment data with a new column variable 'year'
dtaL <- cbind(Reduce(rbind, ff), 
              year=rep(c(8,10,12,14), c(16,16,16,16))) %>% as.data.frame()
# rename the second column
names(dtaL)[2] <- "pp_distance"
#
str(dtaL)
'data.frame':   64 obs. of  3 variables:
 $ id         : num  12 13 14 15 16 17 18 19 20 21 ...
 $ pp_distance: num  26 21.5 23 25.5 20 24.5 22 24 23 27.5 ...
 $ year       : num  8 8 8 8 8 8 8 8 8 8 ...
head(dtaL, 20)
   id pp_distance year
1  12        26.0    8
2  13        21.5    8
3  14        23.0    8
4  15        25.5    8
5  16        20.0    8
6  17        24.5    8
7  18        22.0    8
8  19        24.0    8
9  20        23.0    8
10 21        27.5    8
11 22        23.0    8
12 23        21.5    8
13 24        17.0    8
14 25        22.5    8
15 26        23.0    8
16 27        22.0    8
17 12        25.0   10
18 13        22.5   10
19 14        22.5   10
20 15        27.5   10

Girl and Boy

Files in a folder

Collect the file names.

flsM <- list.files(path = "./tmp2_data", pattern = "m_")
flsF <- list.files(path = "./tmp_data", pattern = "f_")

Give files the full path to their location.

fLM <- paste0("./tmp2_data/", flsM)
fLF <- paste0("./tmp_data/", flsF)

Input multiple files

ffM <- lapply(fLM, read.csv)
ffF<- lapply(fLF, read.csv)

Reduce approach

library(dplyr)
# Roll our own merging function
mrg2 <- function(f1, f2){                                
  merge(f1, f2, by="id")
}

# 分別merge 男生與女生的list file
dtaW.M<-Reduce(mrg2, ffM)%>% as.data.frame()
dtaW.F<-Reduce(mrg2, ffF)%>% as.data.frame()

# rbind 男生與女生的資料
dtaW.all <- rbind(dtaW.F,dtaW.M)
#
str(dtaW.M)
'data.frame':   16 obs. of  5 variables:
 $ id : int  12 13 14 15 16 17 18 19 20 21 ...
 $ d8 : num  26 21.5 23 25.5 20 24.5 22 24 23 27.5 ...
 $ d10: num  25 22.5 22.5 27.5 23.5 25.5 22 21.5 20.5 28 ...
 $ d12: num  29 23 24 26.5 22.5 27 24.5 24.5 31 31 ...
 $ d14: num  31 26.5 27.5 27 26 28.5 26.5 25.5 26 31.5 ...
str(dtaW.F)
'data.frame':   11 obs. of  5 variables:
 $ id : int  1 2 3 4 5 6 7 8 9 10 ...
 $ d8 : num  21 21 20.5 23.5 21.5 20 21.5 23 20 16.5 ...
 $ d10: num  20 21.5 24 24.5 23 21 22.5 23 21 19 ...
 $ d12: num  21.5 24 24.5 25 22.5 21 23 23.5 22 19 ...
 $ d14: num  23 25.5 26 26.5 23.5 22.5 25 24 21.5 19.5 ...
str(dtaW.all)
'data.frame':   27 obs. of  5 variables:
 $ id : int  1 2 3 4 5 6 7 8 9 10 ...
 $ d8 : num  21 21 20.5 23.5 21.5 20 21.5 23 20 16.5 ...
 $ d10: num  20 21.5 24 24.5 23 21 22.5 23 21 19 ...
 $ d12: num  21.5 24 24.5 25 22.5 21 23 23.5 22 19 ...
 $ d14: num  23 25.5 26 26.5 23.5 22.5 25 24 21.5 19.5 ...
head(dtaW.all, 13)
   id   d8  d10  d12  d14
1   1 21.0 20.0 21.5 23.0
2   2 21.0 21.5 24.0 25.5
3   3 20.5 24.0 24.5 26.0
4   4 23.5 24.5 25.0 26.5
5   5 21.5 23.0 22.5 23.5
6   6 20.0 21.0 21.0 22.5
7   7 21.5 22.5 23.0 25.0
8   8 23.0 23.0 23.5 24.0
9   9 20.0 21.0 22.0 21.5
10 10 16.5 19.0 19.0 19.5
11 11 24.5 25.0 28.0 28.0
12 12 26.0 25.0 29.0 31.0
13 13 21.5 22.5 23.0 26.5

Tidy approach

library(tidyverse)
# 分別merge 男生與女生的list file
dtaW2.M<-ffM %>% reduce(inner_join, by='id')%>% as.data.frame()
dtaW2.F<-ffF %>% reduce(inner_join, by='id')%>% as.data.frame()
# rbind 男生與女生的資料
dtaW2.all <- rbind(dtaW2.F,dtaW2.M)
#
str(dtaW2.all)
'data.frame':   27 obs. of  5 variables:
 $ id : int  1 2 3 4 5 6 7 8 9 10 ...
 $ d8 : num  21 21 20.5 23.5 21.5 20 21.5 23 20 16.5 ...
 $ d10: num  20 21.5 24 24.5 23 21 22.5 23 21 19 ...
 $ d12: num  21.5 24 24.5 25 22.5 21 23 23.5 22 19 ...
 $ d14: num  23 25.5 26 26.5 23.5 22.5 25 24 21.5 19.5 ...

Vertical direction

# convert list of data frames to matrices
ffM <- lapply(ffM, as.matrix)
ffF <- lapply(ffF, as.matrix)
# 分別針對男生女生先做rbind(ID與pp_distance) & cbind(year variable)
# augment data with a new column variable 'year'
dtaL.F <- cbind(Reduce(rbind, ffF), 
              year=rep(c(8,10,12,14), c(11,11,11,11))) %>% as.data.frame()
dtaL.M <- cbind(Reduce(rbind, ffM), 
              year=rep(c(8,10,12,14), c(16,16,16,16))) %>% as.data.frame()

# rename the second column
names(dtaL.F)[2] <- "pp_distance"
names(dtaL.M)[2] <- "pp_distance"

# rbind 整體男生與女生的資料
dtaL.all <- rbind(dtaL.F,dtaL.M)

#
str(dtaL.all)
'data.frame':   108 obs. of  3 variables:
 $ id         : num  1 2 3 4 5 6 7 8 9 10 ...
 $ pp_distance: num  21 21 20.5 23.5 21.5 20 21.5 23 20 16.5 ...
 $ year       : num  8 8 8 8 8 8 8 8 8 8 ...