Potthoff and Roy (1964) reported data on a study in 16 boys and 11 girls, who at ages 8, 10, 12, and 14 had the distance (mm) from the center of the pituitary gland to the pteryomaxillary fissure measured. Changes in pituitary-pteryomaxillary distances during growth is important in orthodontic therapy.
Exercises - using the female data only. - using the male data only. - using both data from males and females
We consider data from girls only here.
# require(pacman)
::p_load(mice)
pacmandata(potthoffroy)
subset(potthoffroy, sex=='F')
id sex d8 d10 d12 d14
1 1 F 21.0 20.0 21.5 23.0
2 2 F 21.0 21.5 24.0 25.5
3 3 F 20.5 24.0 24.5 26.0
4 4 F 23.5 24.5 25.0 26.5
5 5 F 21.5 23.0 22.5 23.5
6 6 F 20.0 21.0 21.0 22.5
7 7 F 21.5 22.5 23.0 25.0
8 8 F 23.0 23.0 23.5 24.0
9 9 F 20.0 21.0 22.0 21.5
10 10 F 16.5 19.0 19.0 19.5
11 11 F 24.5 25.0 28.0 28.0
We assume that there are 4 separate files in a local folder called tmp_data. (We will create them for you.)
list.files("./tmp_data/", pattern="f_")
[1] "f_1.csv" "f_2.csv" "f_3.csv" "f_4.csv"
The content of the first one looks like this.
read.csv("./tmp_data/f_1.csv")
id d8
1 1 21.0
2 2 21.0
3 3 20.5
4 4 23.5
5 5 21.5
6 6 20.0
7 7 21.5
8 8 23.0
9 9 20.0
10 10 16.5
11 11 24.5
Now collect the file names.
<- list.files(path = "./tmp_data", pattern = "f_")
fls fls
[1] "f_1.csv" "f_2.csv" "f_3.csv" "f_4.csv"
Remember to give files the full path to their location.
<- paste0("./tmp_data/", fls)
fL
fL
[1] "./tmp_data/f_1.csv" "./tmp_data/f_2.csv" "./tmp_data/f_3.csv"
[4] "./tmp_data/f_4.csv"
Input these files as a list of data frames
<- lapply(fL, read.csv) ff
We can merge two files by id.
merge(ff[1], ff[2])
id d8 d10
1 1 21.0 20.0
2 2 21.0 21.5
3 3 20.5 24.0
4 4 23.5 24.5
5 5 21.5 23.0
6 6 20.0 21.0
7 7 21.5 22.5
8 8 23.0 23.0
9 9 20.0 21.0
10 10 16.5 19.0
11 11 24.5 25.0
The function Reduce allows us to ‘loop’ through the list of files with our own version of merge called mrg2.
# Roll our own merging function
<- function(f1, f2){
mrg2 merge(f1, f2, by="id")
}Reduce(mrg2, ff)
id d8 d10 d12 d14
1 1 21.0 20.0 21.5 23.0
2 2 21.0 21.5 24.0 25.5
3 3 20.5 24.0 24.5 26.0
4 4 23.5 24.5 25.0 26.5
5 5 21.5 23.0 22.5 23.5
6 6 20.0 21.0 21.0 22.5
7 7 21.5 22.5 23.0 25.0
8 8 23.0 23.0 23.5 24.0
9 9 20.0 21.0 22.0 21.5
10 10 16.5 19.0 19.0 19.5
11 11 24.5 25.0 28.0 28.0
Instead of ‘merge’, ‘inner_join’ is used; instead of ‘Reduce’, ‘reduce’.
library(tidyverse)
|> reduce(inner_join, by='id') ff
id d8 d10 d12 d14
1 1 21.0 20.0 21.5 23.0
2 2 21.0 21.5 24.0 25.5
3 3 20.5 24.0 24.5 26.0
4 4 23.5 24.5 25.0 26.5
5 5 21.5 23.0 22.5 23.5
6 6 20.0 21.0 21.0 22.5
7 7 21.5 22.5 23.0 25.0
8 8 23.0 23.0 23.5 24.0
9 9 20.0 21.0 22.0 21.5
10 10 16.5 19.0 19.0 19.5
11 11 24.5 25.0 28.0 28.0
We can ‘bind’ the input in the vertical direction to construct an output in the long format in contrast to the wide format of the original data.
# extract number of observations from components of the list
<- sapply(ff, function(x) dim(x)[1]) n
# extract numbers from year variable
<- sapply(ff, function(x) names(x)[2]) |> parse_number() p
# convert list of data frames to matrices
<- lapply(ff, as.matrix) ff
# augment data with a new column variable 'year'
<- cbind(Reduce(rbind, ff), year=rep(p, n)) |> as.data.frame()
dtaL # renme the second column
names(dtaL)[2] <- "pp_distance"
head(dtaL, 13)
id pp_distance year
1 1 21.0 8
2 2 21.0 8
3 3 20.5 8
4 4 23.5 8
5 5 21.5 8
6 6 20.0 8
7 7 21.5 8
8 8 23.0 8
9 9 20.0 8
10 10 16.5 8
11 11 24.5 8
12 1 20.0 10
13 2 21.5 10
subset(potthoffroy, sex=='M')
id sex d8 d10 d12 d14
12 12 M 26.0 25.0 29.0 31.0
13 13 M 21.5 22.5 23.0 26.5
14 14 M 23.0 22.5 24.0 27.5
15 15 M 25.5 27.5 26.5 27.0
16 16 M 20.0 23.5 22.5 26.0
17 17 M 24.5 25.5 27.0 28.5
18 18 M 22.0 22.0 24.5 26.5
19 19 M 24.0 21.5 24.5 25.5
20 20 M 23.0 20.5 31.0 26.0
21 21 M 27.5 28.0 31.0 31.5
22 22 M 23.0 23.0 23.5 25.0
23 23 M 21.5 23.5 24.0 28.0
24 24 M 17.0 24.5 26.0 29.5
25 25 M 22.5 25.5 25.5 26.0
26 26 M 23.0 24.5 26.0 30.0
27 27 M 22.0 21.5 23.5 25.0
dir.create(file.path(getwd(), "./tmp2_data"), showWarnings=FALSE)
lapply(3:6, function(i) {
write.csv(subset(potthoffroy, sex=='M')[, c(1, i)],
file=paste0("./tmp2_data/m_", i-2, ".csv"),
row.names=FALSE)
} )
[[1]]
NULL
[[2]]
NULL
[[3]]
NULL
[[4]]
NULL
list.files("./tmp2_data/", pattern="m_")
[1] "m_1.csv" "m_2.csv" "m_3.csv" "m_4.csv"
read.csv("./tmp2_data/m_1.csv")
id d8
1 12 26.0
2 13 21.5
3 14 23.0
4 15 25.5
5 16 20.0
6 17 24.5
7 18 22.0
8 19 24.0
9 20 23.0
10 21 27.5
11 22 23.0
12 23 21.5
13 24 17.0
14 25 22.5
15 26 23.0
16 27 22.0
Collect the file names.
<- list.files(path = "./tmp2_data", pattern = "m_")
fls fls
[1] "m_1.csv" "m_2.csv" "m_3.csv" "m_4.csv"
Give files the full path to their location.
<- paste0("./tmp2_data/", fls)
fL fL
[1] "./tmp2_data/m_1.csv" "./tmp2_data/m_2.csv" "./tmp2_data/m_3.csv"
[4] "./tmp2_data/m_4.csv"
<- lapply(fL, read.csv) ff
merge(ff[1], ff[2])
id d8 d10
1 12 26.0 25.0
2 13 21.5 22.5
3 14 23.0 22.5
4 15 25.5 27.5
5 16 20.0 23.5
6 17 24.5 25.5
7 18 22.0 22.0
8 19 24.0 21.5
9 20 23.0 20.5
10 21 27.5 28.0
11 22 23.0 23.0
12 23 21.5 23.5
13 24 17.0 24.5
14 25 22.5 25.5
15 26 23.0 24.5
16 27 22.0 21.5
library(dplyr)
# Roll our own merging function
<- function(f1, f2){
mrg2 merge(f1, f2, by="id")
}
# as.data.frame is easier to read in Rstudio
<-Reduce(mrg2, ff)%>% as.data.frame()
dtaW# rename the second column
names(dtaW)[1] <- "ID"
#
str(dtaW)
'data.frame': 16 obs. of 5 variables:
$ ID : int 12 13 14 15 16 17 18 19 20 21 ...
$ d8 : num 26 21.5 23 25.5 20 24.5 22 24 23 27.5 ...
$ d10: num 25 22.5 22.5 27.5 23.5 25.5 22 21.5 20.5 28 ...
$ d12: num 29 23 24 26.5 22.5 27 24.5 24.5 31 31 ...
$ d14: num 31 26.5 27.5 27 26 28.5 26.5 25.5 26 31.5 ...
head(dtaW, 16)
ID d8 d10 d12 d14
1 12 26.0 25.0 29.0 31.0
2 13 21.5 22.5 23.0 26.5
3 14 23.0 22.5 24.0 27.5
4 15 25.5 27.5 26.5 27.0
5 16 20.0 23.5 22.5 26.0
6 17 24.5 25.5 27.0 28.5
7 18 22.0 22.0 24.5 26.5
8 19 24.0 21.5 24.5 25.5
9 20 23.0 20.5 31.0 26.0
10 21 27.5 28.0 31.0 31.5
11 22 23.0 23.0 23.5 25.0
12 23 21.5 23.5 24.0 28.0
13 24 17.0 24.5 26.0 29.5
14 25 22.5 25.5 25.5 26.0
15 26 23.0 24.5 26.0 30.0
16 27 22.0 21.5 23.5 25.0
library(tidyverse)
# as.data.frame is easier to read in Rstudio
<-ff %>% reduce(inner_join, by='id')%>% as.data.frame()
dtaW2# rename the second column
names(dtaW2)[1] <- "ID"
#
str(dtaW2)
'data.frame': 16 obs. of 5 variables:
$ ID : int 12 13 14 15 16 17 18 19 20 21 ...
$ d8 : num 26 21.5 23 25.5 20 24.5 22 24 23 27.5 ...
$ d10: num 25 22.5 22.5 27.5 23.5 25.5 22 21.5 20.5 28 ...
$ d12: num 29 23 24 26.5 22.5 27 24.5 24.5 31 31 ...
$ d14: num 31 26.5 27.5 27 26 28.5 26.5 25.5 26 31.5 ...
# convert list of data frames to matrices
<- lapply(ff, as.matrix)
ff # augment data with a new column variable 'year'
<- cbind(Reduce(rbind, ff),
dtaL year=rep(c(8,10,12,14), c(16,16,16,16))) %>% as.data.frame()
# rename the second column
names(dtaL)[2] <- "pp_distance"
#
str(dtaL)
'data.frame': 64 obs. of 3 variables:
$ id : num 12 13 14 15 16 17 18 19 20 21 ...
$ pp_distance: num 26 21.5 23 25.5 20 24.5 22 24 23 27.5 ...
$ year : num 8 8 8 8 8 8 8 8 8 8 ...
head(dtaL, 20)
id pp_distance year
1 12 26.0 8
2 13 21.5 8
3 14 23.0 8
4 15 25.5 8
5 16 20.0 8
6 17 24.5 8
7 18 22.0 8
8 19 24.0 8
9 20 23.0 8
10 21 27.5 8
11 22 23.0 8
12 23 21.5 8
13 24 17.0 8
14 25 22.5 8
15 26 23.0 8
16 27 22.0 8
17 12 25.0 10
18 13 22.5 10
19 14 22.5 10
20 15 27.5 10
Collect the file names.
<- list.files(path = "./tmp2_data", pattern = "m_")
flsM <- list.files(path = "./tmp_data", pattern = "f_") flsF
Give files the full path to their location.
<- paste0("./tmp2_data/", flsM)
fLM <- paste0("./tmp_data/", flsF) fLF
<- lapply(fLM, read.csv)
ffM <- lapply(fLF, read.csv) ffF
library(dplyr)
# Roll our own merging function
<- function(f1, f2){
mrg2 merge(f1, f2, by="id")
}
# 分別merge 男生與女生的list file
<-Reduce(mrg2, ffM)%>% as.data.frame()
dtaW.M<-Reduce(mrg2, ffF)%>% as.data.frame()
dtaW.F
# rbind 男生與女生的資料
<- rbind(dtaW.F,dtaW.M)
dtaW.all #
str(dtaW.M)
'data.frame': 16 obs. of 5 variables:
$ id : int 12 13 14 15 16 17 18 19 20 21 ...
$ d8 : num 26 21.5 23 25.5 20 24.5 22 24 23 27.5 ...
$ d10: num 25 22.5 22.5 27.5 23.5 25.5 22 21.5 20.5 28 ...
$ d12: num 29 23 24 26.5 22.5 27 24.5 24.5 31 31 ...
$ d14: num 31 26.5 27.5 27 26 28.5 26.5 25.5 26 31.5 ...
str(dtaW.F)
'data.frame': 11 obs. of 5 variables:
$ id : int 1 2 3 4 5 6 7 8 9 10 ...
$ d8 : num 21 21 20.5 23.5 21.5 20 21.5 23 20 16.5 ...
$ d10: num 20 21.5 24 24.5 23 21 22.5 23 21 19 ...
$ d12: num 21.5 24 24.5 25 22.5 21 23 23.5 22 19 ...
$ d14: num 23 25.5 26 26.5 23.5 22.5 25 24 21.5 19.5 ...
str(dtaW.all)
'data.frame': 27 obs. of 5 variables:
$ id : int 1 2 3 4 5 6 7 8 9 10 ...
$ d8 : num 21 21 20.5 23.5 21.5 20 21.5 23 20 16.5 ...
$ d10: num 20 21.5 24 24.5 23 21 22.5 23 21 19 ...
$ d12: num 21.5 24 24.5 25 22.5 21 23 23.5 22 19 ...
$ d14: num 23 25.5 26 26.5 23.5 22.5 25 24 21.5 19.5 ...
head(dtaW.all, 13)
id d8 d10 d12 d14
1 1 21.0 20.0 21.5 23.0
2 2 21.0 21.5 24.0 25.5
3 3 20.5 24.0 24.5 26.0
4 4 23.5 24.5 25.0 26.5
5 5 21.5 23.0 22.5 23.5
6 6 20.0 21.0 21.0 22.5
7 7 21.5 22.5 23.0 25.0
8 8 23.0 23.0 23.5 24.0
9 9 20.0 21.0 22.0 21.5
10 10 16.5 19.0 19.0 19.5
11 11 24.5 25.0 28.0 28.0
12 12 26.0 25.0 29.0 31.0
13 13 21.5 22.5 23.0 26.5
library(tidyverse)
# 分別merge 男生與女生的list file
<-ffM %>% reduce(inner_join, by='id')%>% as.data.frame()
dtaW2.M<-ffF %>% reduce(inner_join, by='id')%>% as.data.frame()
dtaW2.F# rbind 男生與女生的資料
<- rbind(dtaW2.F,dtaW2.M)
dtaW2.all #
str(dtaW2.all)
'data.frame': 27 obs. of 5 variables:
$ id : int 1 2 3 4 5 6 7 8 9 10 ...
$ d8 : num 21 21 20.5 23.5 21.5 20 21.5 23 20 16.5 ...
$ d10: num 20 21.5 24 24.5 23 21 22.5 23 21 19 ...
$ d12: num 21.5 24 24.5 25 22.5 21 23 23.5 22 19 ...
$ d14: num 23 25.5 26 26.5 23.5 22.5 25 24 21.5 19.5 ...
# convert list of data frames to matrices
<- lapply(ffM, as.matrix)
ffM <- lapply(ffF, as.matrix)
ffF # 分別針對男生女生先做rbind(ID與pp_distance) & cbind(year variable)
# augment data with a new column variable 'year'
<- cbind(Reduce(rbind, ffF),
dtaL.F year=rep(c(8,10,12,14), c(11,11,11,11))) %>% as.data.frame()
<- cbind(Reduce(rbind, ffM),
dtaL.M year=rep(c(8,10,12,14), c(16,16,16,16))) %>% as.data.frame()
# rename the second column
names(dtaL.F)[2] <- "pp_distance"
names(dtaL.M)[2] <- "pp_distance"
# rbind 整體男生與女生的資料
<- rbind(dtaL.F,dtaL.M)
dtaL.all
#
str(dtaL.all)
'data.frame': 108 obs. of 3 variables:
$ id : num 1 2 3 4 5 6 7 8 9 10 ...
$ pp_distance: num 21 21 20.5 23.5 21.5 20 21.5 23 20 16.5 ...
$ year : num 8 8 8 8 8 8 8 8 8 8 ...