Input and merge files: exercise for male data

Data

Potthoff and Roy (1964) reported data from a study of 16 boys and 11 girls whose distance (mm) from the center of the pituitary gland to the pterygomaxillary fissure was measured at ages 8, 10, 12, and 14. Changes in pituitary-pterygomaxillary distances during growth are important in orthodontic therapy. We consider data from boys only here.

# pacman must be installed (require(pacman) is not needed because the call
# below is namespace-qualified); p_load() attaches mice, which ships the data.
pacman::p_load(mice)
# Bring the potthoffroy data frame into the workspace.
data("potthoffroy")
# Display the records for the boys only.
subset(potthoffroy, sex == "M")

# Create the output folder relative to the working directory, like every other
# path in this script; stay quiet if the folder already exists.
dir.create("./try_data", showWarnings = FALSE)

# Subset the boys once instead of once per output file.
boys <- subset(potthoffroy, sex == "M")

# Write one CSV per measurement column (columns 3 to 6): each file keeps the
# id (column 1) next to a single distance column and is numbered m_1 ... m_4.
# invisible() suppresses the list of NULLs that lapply() would otherwise print,
# since the calls are made purely for their side effect.
invisible(lapply(3:6, function(i) {
  write.csv(boys[, c(1, i)],
            file = paste0("./try_data/m_", i - 2, ".csv"),
            row.names = FALSE)
}))

Files in a folder

# List the per-age files written above. The pattern is a regular expression,
# so it is anchored to match only names of the form m_<number>.csv rather than
# any file whose name merely contains "m_".
list.files("./try_data/", pattern = "^m_[0-9]+\\.csv$")

## [1] "m_1.csv" "m_2.csv" "m_3.csv" "m_4.csv"

# Peek at the first file: the id column plus one distance column.
read.csv(file = "./try_data/m_1.csv")

##    id   d8
## 1  12 26.0
## 2  13 21.5
## 3  14 23.0
## 4  15 25.5
## 5  16 20.0
## 6  17 24.5
## 7  18 22.0
## 8  19 24.0
## 9  20 23.0
## 10 21 27.5
## 11 22 23.0
## 12 23 21.5
## 13 24 17.0
## 14 25 22.5
## 15 26 23.0
## 16 27 22.0

Collect the file names.

# Collect the names of the per-age files. The pattern is a regular expression,
# so it is anchored to accept only m_<number>.csv; an unanchored "m_" would
# also pick up any unrelated file whose name happens to contain "m_".
fls <- list.files(path = "./try_data", pattern = "^m_[0-9]+\\.csv$")
fls

## [1] "m_1.csv" "m_2.csv" "m_3.csv" "m_4.csv"

Prefix each file name with the path of the folder it lives in.

# Build the path of every file. file.path() yields the same strings as pasting
# the folder prefix, but returns an empty vector when fls is empty, whereas
# paste0() would produce the bogus single path "./try_data/".
fL <- file.path("./try_data", fls)
fL

## [1] "./try_data/m_1.csv" "./try_data/m_2.csv" "./try_data/m_3.csv"
## [4] "./try_data/m_4.csv"

Input multiple files

# Read every file into a list of data frames, one element per path in fL.
ff <- lapply(fL, function(path) read.csv(file = path))

Merge

# Merge the first two tables on the subject id. [[ ]] extracts the data frames
# themselves; ff[1] / ff[2] are length-one lists that only work because
# merge() silently coerces them.
merge(ff[[1]], ff[[2]], by = "id")

##    id   d8  d10
## 1  12 26.0 25.0
## 2  13 21.5 22.5
## 3  14 23.0 22.5
## 4  15 25.5 27.5
## 5  16 20.0 23.5
## 6  17 24.5 25.5
## 7  18 22.0 22.0
## 8  19 24.0 21.5
## 9  20 23.0 20.5
## 10 21 27.5 28.0
## 11 22 23.0 23.0
## 12 23 21.5 23.5
## 13 24 17.0 24.5
## 14 25 22.5 25.5
## 15 26 23.0 24.5
## 16 27 22.0 21.5

Reduce approach

library(dplyr)
# Two-table merge keyed on the shared "id" column, shaped so it can be folded
# over a list with Reduce().
#   f1, f2: data frames that both contain an "id" column.
#   Returns the result of merge(), i.e. rows whose id occurs in both inputs.
mrg2 <- function(f1, f2) merge(x = f1, y = f2, by = "id")

# Fold the pairwise merge over the list of per-age tables to build one wide
# table. merge() already returns a data frame, so no conversion is needed.
dtaW <- Reduce(mrg2, ff)
# rename the first column (the subject identifier)
names(dtaW)[1] <- "ID"
# inspect the structure of the merged table
str(dtaW)

## 'data.frame':    16 obs. of  5 variables:
##  $ ID : int  12 13 14 15 16 17 18 19 20 21 ...
##  $ d8 : num  26 21.5 23 25.5 20 24.5 22 24 23 27.5 ...
##  $ d10: num  25 22.5 22.5 27.5 23.5 25.5 22 21.5 20.5 28 ...
##  $ d12: num  29 23 24 26.5 22.5 27 24.5 24.5 31 31 ...
##  $ d14: num  31 26.5 27.5 27 26 28.5 26.5 25.5 26 31.5 ...

# Show the first 16 rows of the wide table.
head(dtaW, n = 16)

##    ID   d8  d10  d12  d14
## 1  12 26.0 25.0 29.0 31.0
## 2  13 21.5 22.5 23.0 26.5
## 3  14 23.0 22.5 24.0 27.5
## 4  15 25.5 27.5 26.5 27.0
## 5  16 20.0 23.5 22.5 26.0
## 6  17 24.5 25.5 27.0 28.5
## 7  18 22.0 22.0 24.5 26.5
## 8  19 24.0 21.5 24.5 25.5
## 9  20 23.0 20.5 31.0 26.0
## 10 21 27.5 28.0 31.0 31.5
## 11 22 23.0 23.0 23.5 25.0
## 12 23 21.5 23.5 24.0 28.0
## 13 24 17.0 24.5 26.0 29.5
## 14 25 22.5 25.5 25.5 26.0
## 15 26 23.0 24.5 26.0 30.0
## 16 27 22.0 21.5 23.5 25.0

Tidy approach

library(tidyverse)
# Join every per-age table on the shared key, then rename the key column
# (the first column) by name rather than by position. as.data.frame() keeps
# the result a plain data frame, which is easier to read in RStudio.
dtaW2 <- ff %>%
  reduce(inner_join, by = "id") %>%
  rename(ID = id) %>%
  as.data.frame()
# inspect the structure of the merged table
str(dtaW2)

## 'data.frame':    16 obs. of  5 variables:
##  $ ID : int  12 13 14 15 16 17 18 19 20 21 ...
##  $ d8 : num  26 21.5 23 25.5 20 24.5 22 24 23 27.5 ...
##  $ d10: num  25 22.5 22.5 27.5 23.5 25.5 22 21.5 20.5 28 ...
##  $ d12: num  29 23 24 26.5 22.5 27 24.5 24.5 31 31 ...
##  $ d14: num  31 26.5 27.5 27 26 28.5 26.5 25.5 26 31.5 ...

Vertical direction

# convert list of data frames to matrices so rbind() can stack them even
# though the measurement column has a different name in each file
ff <- lapply(ff, as.matrix)

# augment data with a new column variable 'year' (it holds the age at
# measurement: 8, 10, 12, 14). Each age is repeated once per row of its own
# file instead of assuming 16 rows per file.
dtaL <- cbind(Reduce(rbind, ff),
              year = rep(c(8, 10, 12, 14),
                         times = vapply(ff, nrow, integer(1)))) %>%
  as.data.frame()
# rename the second column
names(dtaL)[2] <- "pp_distance"
# inspect the structure of the stacked (long) table
str(dtaL)

## 'data.frame':    64 obs. of  3 variables:
##  $ id         : num  12 13 14 15 16 17 18 19 20 21 ...
##  $ pp_distance: num  26 21.5 23 25.5 20 24.5 22 24 23 27.5 ...
##  $ year       : num  8 8 8 8 8 8 8 8 8 8 ...

# Show the first 20 rows of the long table.
head(dtaL, n = 20)

##    id pp_distance year
## 1  12        26.0    8
## 2  13        21.5    8
## 3  14        23.0    8
## 4  15        25.5    8
## 5  16        20.0    8
## 6  17        24.5    8
## 7  18        22.0    8
## 8  19        24.0    8
## 9  20        23.0    8
## 10 21        27.5    8
## 11 22        23.0    8
## 12 23        21.5    8
## 13 24        17.0    8
## 14 25        22.5    8
## 15 26        23.0    8
## 16 27        22.0    8
## 17 12        25.0   10
## 18 13        22.5   10
## 19 14        22.5   10
## 20 15        27.5   10