Potthoff and Roy (1964) reported data on a study in 16 boys and 11 girls, who at ages 8, 10, 12, and 14 had the distance (mm) from the center of the pituitary gland to the pteryomaxillary fissure measured. Changes in pituitary-pteryomaxillary distances during growth is important in orthodontic therapy.
Exercises - using the female data only. - using the male data only. - using both data from males and females
We consider data from girls only here.
# require(pacman)
pacman::p_load(mice)
data(potthoffroy)
subset(potthoffroy, sex=='F') id sex d8 d10 d12 d14
1 1 F 21.0 20.0 21.5 23.0
2 2 F 21.0 21.5 24.0 25.5
3 3 F 20.5 24.0 24.5 26.0
4 4 F 23.5 24.5 25.0 26.5
5 5 F 21.5 23.0 22.5 23.5
6 6 F 20.0 21.0 21.0 22.5
7 7 F 21.5 22.5 23.0 25.0
8 8 F 23.0 23.0 23.5 24.0
9 9 F 20.0 21.0 22.0 21.5
10 10 F 16.5 19.0 19.0 19.5
11 11 F 24.5 25.0 28.0 28.0
We assume that there are 4 separate files in a local folder called tmp_data. (We will create them for you.)
list.files("./tmp_data/", pattern="f_")[1] "f_1.csv" "f_2.csv" "f_3.csv" "f_4.csv"
The content of the first one looks like this.
read.csv("./tmp_data/f_1.csv") id d8
1 1 21.0
2 2 21.0
3 3 20.5
4 4 23.5
5 5 21.5
6 6 20.0
7 7 21.5
8 8 23.0
9 9 20.0
10 10 16.5
11 11 24.5
Now collect the file names.
fls <- list.files(path = "./tmp_data", pattern = "f_")
fls[1] "f_1.csv" "f_2.csv" "f_3.csv" "f_4.csv"
Remember to give files the full path to their location.
fL <- paste0("./tmp_data/", fls)
fL[1] "./tmp_data/f_1.csv" "./tmp_data/f_2.csv" "./tmp_data/f_3.csv"
[4] "./tmp_data/f_4.csv"
Input these files as a list of data frames
ff <- lapply(fL, read.csv)We can merge two files by id.
merge(ff[1], ff[2]) id d8 d10
1 1 21.0 20.0
2 2 21.0 21.5
3 3 20.5 24.0
4 4 23.5 24.5
5 5 21.5 23.0
6 6 20.0 21.0
7 7 21.5 22.5
8 8 23.0 23.0
9 9 20.0 21.0
10 10 16.5 19.0
11 11 24.5 25.0
The function Reduce allows us to ‘loop’ through the list of files with our own version of merge called mrg2.
# Roll our own merging function
mrg2 <- function(f1, f2){
merge(f1, f2, by="id")
}
Reduce(mrg2, ff) id d8 d10 d12 d14
1 1 21.0 20.0 21.5 23.0
2 2 21.0 21.5 24.0 25.5
3 3 20.5 24.0 24.5 26.0
4 4 23.5 24.5 25.0 26.5
5 5 21.5 23.0 22.5 23.5
6 6 20.0 21.0 21.0 22.5
7 7 21.5 22.5 23.0 25.0
8 8 23.0 23.0 23.5 24.0
9 9 20.0 21.0 22.0 21.5
10 10 16.5 19.0 19.0 19.5
11 11 24.5 25.0 28.0 28.0
Instead of ‘merge’, ‘inner_join’ is used; instead of ‘Reduce’, ‘reduce’.
library(tidyverse)
ff |> reduce(inner_join, by='id') id d8 d10 d12 d14
1 1 21.0 20.0 21.5 23.0
2 2 21.0 21.5 24.0 25.5
3 3 20.5 24.0 24.5 26.0
4 4 23.5 24.5 25.0 26.5
5 5 21.5 23.0 22.5 23.5
6 6 20.0 21.0 21.0 22.5
7 7 21.5 22.5 23.0 25.0
8 8 23.0 23.0 23.5 24.0
9 9 20.0 21.0 22.0 21.5
10 10 16.5 19.0 19.0 19.5
11 11 24.5 25.0 28.0 28.0
We can ‘bind’ the input in the vertical direction to construct an output in the long format in contrast to the wide format of the original data.
# extract number of observations from components of the list
n <- sapply(ff, function(x) dim(x)[1])# extract numbers from year variable
p <- sapply(ff, function(x) names(x)[2]) |> parse_number()# convert list of data frames to matrices
ff <- lapply(ff, as.matrix)# augment data with a new column variable 'year'
dtaL <- cbind(Reduce(rbind, ff), year=rep(p, n)) |> as.data.frame()
# renme the second column
names(dtaL)[2] <- "pp_distance"head(dtaL, 13) id pp_distance year
1 1 21.0 8
2 2 21.0 8
3 3 20.5 8
4 4 23.5 8
5 5 21.5 8
6 6 20.0 8
7 7 21.5 8
8 8 23.0 8
9 9 20.0 8
10 10 16.5 8
11 11 24.5 8
12 1 20.0 10
13 2 21.5 10
subset(potthoffroy, sex=='M') id sex d8 d10 d12 d14
12 12 M 26.0 25.0 29.0 31.0
13 13 M 21.5 22.5 23.0 26.5
14 14 M 23.0 22.5 24.0 27.5
15 15 M 25.5 27.5 26.5 27.0
16 16 M 20.0 23.5 22.5 26.0
17 17 M 24.5 25.5 27.0 28.5
18 18 M 22.0 22.0 24.5 26.5
19 19 M 24.0 21.5 24.5 25.5
20 20 M 23.0 20.5 31.0 26.0
21 21 M 27.5 28.0 31.0 31.5
22 22 M 23.0 23.0 23.5 25.0
23 23 M 21.5 23.5 24.0 28.0
24 24 M 17.0 24.5 26.0 29.5
25 25 M 22.5 25.5 25.5 26.0
26 26 M 23.0 24.5 26.0 30.0
27 27 M 22.0 21.5 23.5 25.0
dir.create(file.path(getwd(), "./tmp2_data"), showWarnings=FALSE)
lapply(3:6, function(i) {
write.csv(subset(potthoffroy, sex=='M')[, c(1, i)],
file=paste0("./tmp2_data/m_", i-2, ".csv"),
row.names=FALSE)
}
)[[1]]
NULL
[[2]]
NULL
[[3]]
NULL
[[4]]
NULL
list.files("./tmp2_data/", pattern="m_")[1] "m_1.csv" "m_2.csv" "m_3.csv" "m_4.csv"
read.csv("./tmp2_data/m_1.csv") id d8
1 12 26.0
2 13 21.5
3 14 23.0
4 15 25.5
5 16 20.0
6 17 24.5
7 18 22.0
8 19 24.0
9 20 23.0
10 21 27.5
11 22 23.0
12 23 21.5
13 24 17.0
14 25 22.5
15 26 23.0
16 27 22.0
Collect the file names.
fls <- list.files(path = "./tmp2_data", pattern = "m_")
fls[1] "m_1.csv" "m_2.csv" "m_3.csv" "m_4.csv"
Give files the full path to their location.
fL <- paste0("./tmp2_data/", fls)
fL[1] "./tmp2_data/m_1.csv" "./tmp2_data/m_2.csv" "./tmp2_data/m_3.csv"
[4] "./tmp2_data/m_4.csv"
ff <- lapply(fL, read.csv)merge(ff[1], ff[2]) id d8 d10
1 12 26.0 25.0
2 13 21.5 22.5
3 14 23.0 22.5
4 15 25.5 27.5
5 16 20.0 23.5
6 17 24.5 25.5
7 18 22.0 22.0
8 19 24.0 21.5
9 20 23.0 20.5
10 21 27.5 28.0
11 22 23.0 23.0
12 23 21.5 23.5
13 24 17.0 24.5
14 25 22.5 25.5
15 26 23.0 24.5
16 27 22.0 21.5
library(dplyr)
# Roll our own merging function
mrg2 <- function(f1, f2){
merge(f1, f2, by="id")
}
# as.data.frame is easier to read in Rstudio
dtaW<-Reduce(mrg2, ff)%>% as.data.frame()
# rename the second column
names(dtaW)[1] <- "ID"
#
str(dtaW)'data.frame': 16 obs. of 5 variables:
$ ID : int 12 13 14 15 16 17 18 19 20 21 ...
$ d8 : num 26 21.5 23 25.5 20 24.5 22 24 23 27.5 ...
$ d10: num 25 22.5 22.5 27.5 23.5 25.5 22 21.5 20.5 28 ...
$ d12: num 29 23 24 26.5 22.5 27 24.5 24.5 31 31 ...
$ d14: num 31 26.5 27.5 27 26 28.5 26.5 25.5 26 31.5 ...
head(dtaW, 16) ID d8 d10 d12 d14
1 12 26.0 25.0 29.0 31.0
2 13 21.5 22.5 23.0 26.5
3 14 23.0 22.5 24.0 27.5
4 15 25.5 27.5 26.5 27.0
5 16 20.0 23.5 22.5 26.0
6 17 24.5 25.5 27.0 28.5
7 18 22.0 22.0 24.5 26.5
8 19 24.0 21.5 24.5 25.5
9 20 23.0 20.5 31.0 26.0
10 21 27.5 28.0 31.0 31.5
11 22 23.0 23.0 23.5 25.0
12 23 21.5 23.5 24.0 28.0
13 24 17.0 24.5 26.0 29.5
14 25 22.5 25.5 25.5 26.0
15 26 23.0 24.5 26.0 30.0
16 27 22.0 21.5 23.5 25.0
library(tidyverse)
# as.data.frame is easier to read in Rstudio
dtaW2<-ff %>% reduce(inner_join, by='id')%>% as.data.frame()
# rename the second column
names(dtaW2)[1] <- "ID"
#
str(dtaW2)'data.frame': 16 obs. of 5 variables:
$ ID : int 12 13 14 15 16 17 18 19 20 21 ...
$ d8 : num 26 21.5 23 25.5 20 24.5 22 24 23 27.5 ...
$ d10: num 25 22.5 22.5 27.5 23.5 25.5 22 21.5 20.5 28 ...
$ d12: num 29 23 24 26.5 22.5 27 24.5 24.5 31 31 ...
$ d14: num 31 26.5 27.5 27 26 28.5 26.5 25.5 26 31.5 ...
# convert list of data frames to matrices
ff <- lapply(ff, as.matrix)
# augment data with a new column variable 'year'
dtaL <- cbind(Reduce(rbind, ff),
year=rep(c(8,10,12,14), c(16,16,16,16))) %>% as.data.frame()
# rename the second column
names(dtaL)[2] <- "pp_distance"
#
str(dtaL)'data.frame': 64 obs. of 3 variables:
$ id : num 12 13 14 15 16 17 18 19 20 21 ...
$ pp_distance: num 26 21.5 23 25.5 20 24.5 22 24 23 27.5 ...
$ year : num 8 8 8 8 8 8 8 8 8 8 ...
head(dtaL, 20) id pp_distance year
1 12 26.0 8
2 13 21.5 8
3 14 23.0 8
4 15 25.5 8
5 16 20.0 8
6 17 24.5 8
7 18 22.0 8
8 19 24.0 8
9 20 23.0 8
10 21 27.5 8
11 22 23.0 8
12 23 21.5 8
13 24 17.0 8
14 25 22.5 8
15 26 23.0 8
16 27 22.0 8
17 12 25.0 10
18 13 22.5 10
19 14 22.5 10
20 15 27.5 10
Collect the file names.
flsM <- list.files(path = "./tmp2_data", pattern = "m_")
flsF <- list.files(path = "./tmp_data", pattern = "f_")Give files the full path to their location.
fLM <- paste0("./tmp2_data/", flsM)
fLF <- paste0("./tmp_data/", flsF)ffM <- lapply(fLM, read.csv)
ffF<- lapply(fLF, read.csv)library(dplyr)
# Roll our own merging function
mrg2 <- function(f1, f2){
merge(f1, f2, by="id")
}
# 分別merge 男生與女生的list file
dtaW.M<-Reduce(mrg2, ffM)%>% as.data.frame()
dtaW.F<-Reduce(mrg2, ffF)%>% as.data.frame()
# rbind 男生與女生的資料
dtaW.all <- rbind(dtaW.F,dtaW.M)
#
str(dtaW.M)'data.frame': 16 obs. of 5 variables:
$ id : int 12 13 14 15 16 17 18 19 20 21 ...
$ d8 : num 26 21.5 23 25.5 20 24.5 22 24 23 27.5 ...
$ d10: num 25 22.5 22.5 27.5 23.5 25.5 22 21.5 20.5 28 ...
$ d12: num 29 23 24 26.5 22.5 27 24.5 24.5 31 31 ...
$ d14: num 31 26.5 27.5 27 26 28.5 26.5 25.5 26 31.5 ...
str(dtaW.F)'data.frame': 11 obs. of 5 variables:
$ id : int 1 2 3 4 5 6 7 8 9 10 ...
$ d8 : num 21 21 20.5 23.5 21.5 20 21.5 23 20 16.5 ...
$ d10: num 20 21.5 24 24.5 23 21 22.5 23 21 19 ...
$ d12: num 21.5 24 24.5 25 22.5 21 23 23.5 22 19 ...
$ d14: num 23 25.5 26 26.5 23.5 22.5 25 24 21.5 19.5 ...
str(dtaW.all)'data.frame': 27 obs. of 5 variables:
$ id : int 1 2 3 4 5 6 7 8 9 10 ...
$ d8 : num 21 21 20.5 23.5 21.5 20 21.5 23 20 16.5 ...
$ d10: num 20 21.5 24 24.5 23 21 22.5 23 21 19 ...
$ d12: num 21.5 24 24.5 25 22.5 21 23 23.5 22 19 ...
$ d14: num 23 25.5 26 26.5 23.5 22.5 25 24 21.5 19.5 ...
head(dtaW.all, 13) id d8 d10 d12 d14
1 1 21.0 20.0 21.5 23.0
2 2 21.0 21.5 24.0 25.5
3 3 20.5 24.0 24.5 26.0
4 4 23.5 24.5 25.0 26.5
5 5 21.5 23.0 22.5 23.5
6 6 20.0 21.0 21.0 22.5
7 7 21.5 22.5 23.0 25.0
8 8 23.0 23.0 23.5 24.0
9 9 20.0 21.0 22.0 21.5
10 10 16.5 19.0 19.0 19.5
11 11 24.5 25.0 28.0 28.0
12 12 26.0 25.0 29.0 31.0
13 13 21.5 22.5 23.0 26.5
library(tidyverse)
# 分別merge 男生與女生的list file
dtaW2.M<-ffM %>% reduce(inner_join, by='id')%>% as.data.frame()
dtaW2.F<-ffF %>% reduce(inner_join, by='id')%>% as.data.frame()
# rbind 男生與女生的資料
dtaW2.all <- rbind(dtaW2.F,dtaW2.M)
#
str(dtaW2.all)'data.frame': 27 obs. of 5 variables:
$ id : int 1 2 3 4 5 6 7 8 9 10 ...
$ d8 : num 21 21 20.5 23.5 21.5 20 21.5 23 20 16.5 ...
$ d10: num 20 21.5 24 24.5 23 21 22.5 23 21 19 ...
$ d12: num 21.5 24 24.5 25 22.5 21 23 23.5 22 19 ...
$ d14: num 23 25.5 26 26.5 23.5 22.5 25 24 21.5 19.5 ...
# convert list of data frames to matrices
ffM <- lapply(ffM, as.matrix)
ffF <- lapply(ffF, as.matrix)
# 分別針對男生女生先做rbind(ID與pp_distance) & cbind(year variable)
# augment data with a new column variable 'year'
dtaL.F <- cbind(Reduce(rbind, ffF),
year=rep(c(8,10,12,14), c(11,11,11,11))) %>% as.data.frame()
dtaL.M <- cbind(Reduce(rbind, ffM),
year=rep(c(8,10,12,14), c(16,16,16,16))) %>% as.data.frame()
# rename the second column
names(dtaL.F)[2] <- "pp_distance"
names(dtaL.M)[2] <- "pp_distance"
# rbind 整體男生與女生的資料
dtaL.all <- rbind(dtaL.F,dtaL.M)
#
str(dtaL.all)'data.frame': 108 obs. of 3 variables:
$ id : num 1 2 3 4 5 6 7 8 9 10 ...
$ pp_distance: num 21 21 20.5 23.5 21.5 20 21.5 23 20 16.5 ...
$ year : num 8 8 8 8 8 8 8 8 8 8 ...