R Code Section

Preprocessing

Osaka Data Set Aggregation

Reads in and aggregates raw sensor data by subject.

# List every raw sensor file in the data directory; subject ID and walk
# sequence are encoded in the file name.
fls <- list.files(path = paste0(root, center.data))
if (length(fls) == 0) {
    stop("no sensor files found in ", paste0(root, center.data), call. = FALSE)
}

# Read each file and collect the pieces in a list. Binding once at the end
# avoids re-copying the growing data frame on every iteration.
df.ac <- do.call(rbind, lapply(fls, function(fl) {
    # keep only columns 4:6 (accelerometer data); the first two lines are skipped
    df.temp <- read.csv(file = paste0(root, center.data, fl),
                        header = FALSE, skip = 2)[, 4:6]

    # fields 2 and 4 of the "_"-separated file name carry ID and sequence
    id.seq <- unlist(strsplit(fl, "_"))[c(2, 4)]

    # ID number: first run of digits in field 2
    df.temp$ID <- as.character(regmatches(id.seq[1], regexec("[0-9]+", id.seq[1])))
    # sequence: first single digit in field 4
    df.temp$seq <- as.character(regmatches(id.seq[2], regexec("[0-9]", id.seq[2])))

    # NOTE(review): timestamp(n = ...) is not defined in this section;
    # presumably a project helper that returns one time stamp per row - confirm.
    df.temp$time <- timestamp(n = dim(df.temp)[1])

    df.temp
}))

# Read the age / gender table and merge it onto the sensor data by subject ID.
# No additional integer `id` column is created before the merge: it would give
# the merged frame one column more than the eight names assigned below.
df.age.gender <- read.csv(file = paste0(root, age.gender))
df.gait <- merge(df.ac, df.age.gender, by = "ID")
#df.gait = df.gait[complete.cases(df.gait), ]

# Clean up the data frame: the column names are assigned by position, so fail
# loudly if the merge did not produce exactly the expected number of columns.
names <- c("id", "x", "y", "z", "seq", "time", "gender", "age")
if (ncol(df.gait) != length(names)) {
    stop("merged data has ", ncol(df.gait), " columns, expected ",
         length(names), call. = FALSE)
}
colnames(df.gait) <- names

# NOTE(review): `id` is not converted to integer before sorting, so order()
# sorts it in whatever type merge() returned. The row order determines the
# idx numbering assigned later, so change the sort key only deliberately.
df.gait <- df.gait[order(df.gait$seq, df.gait$id, df.gait$time), ]

# Data frame report
str(df.gait)

# Store the aggregated set and read it back to derive the summary table.
write.csv(df.gait, file = paste0(root, "/initial.gait.sensor.center.csv"), row.names = FALSE)
my_height_age <- read.csv(file = paste0(root, "/initial.gait.sensor.center.csv"), row.names = NULL)

# Meta data of the aggregated set; age range and gender counts are fixed text.
data.set.dimensions <- data.frame(
    subjects = length(unique(my_height_age$id)),
    age = "3-78 years",
    gender = "m = 359, f = 385",
    walks = length(unique(my_height_age$seq)),
    rows = nrow(my_height_age),
    col = ncol(my_height_age)
)

Enhancing Osaka data set with height and unix time stamp

To apply the gaitpy module, the height of each subject and a Unix time stamp are required. The missing height is sampled from the 2018 height distribution, which is collected yearly by the Japan Sports Agency. https://nbakki.hatenablog.com/entry/Average_Height_of_Japanese_2018

Preprocessing height / age data 2018

# Load the average-height table and expand it to one row per age and gender.
h.jp <- read.xlsx(paste0(root, "/average_height_jp.xlsx"), sheet = 1)

jp <- data.frame()
for (ref in h.jp$age) {
    # an age entry is either a single value or a "from-to" range
    bounds <- as.integer(unlist(strsplit(ref, "-")))

    for (g in c(0, 1)) {
        if (length(bounds) > 1) {
            ages <- seq(bounds[1], bounds[2], 1)
        } else {
            ages <- as.numeric(bounds)
        }

        # gender code 0 reads column 3 of the table, code 1 reads column 2
        heights <- rep(h.jp[h.jp$age == ref, 3 - g], length(ages))
        genders <- rep(g, length(ages))

        # new rows are put in front of the rows collected so far
        jp <- rbind(cbind(ages, heights, genders), jp)
    }
}

names(jp) <- c("age", "height", "gender")
jp <- jp[order(jp$age), ]

# Scatter plot of average height against age, coloured by gender.
p <- ggplot(data = jp, aes(x = age, y = height, color = factor(gender))) +
    geom_point() +
    ggtitle("") +
    scale_color_discrete(labels = c("female", "male")) +
    labs(color = "Average height per age in Japan, 2018") +
    theme(
        legend.position = c(0.6, 0.2),
        legend.background = element_rect(fill = alpha('white', 0.1))
    )
#p

Associating age with height

## Associating height, age and gender
## Associating height, age and gender
# Every (age, gender) group of the sensor data receives the height listed for
# that age and gender in `jp`.
my_height_age <- read.csv(file = paste0(root, "/initial.gait.sensor.center.csv"))

# The pieces are collected in a list and bound once at the end; growing the
# data frame with rbind() inside the loop would copy all rows on every pass.
height.parts <- list()
for (ref in unique(my_height_age$age)) {
    for (g in c(0, 1)) {
        temp <- subset(my_height_age, age == ref & gender == g)
        if (nrow(temp) == 0) {
            next
        }

        # Exactly one height must be listed for this age and gender; otherwise
        # the assignment below would fail with a cryptic message or silently
        # recycle several values across the rows.
        h <- unique(jp[which(jp$age == ref & jp$gender == g), 2])
        if (length(h) != 1) {
            stop("expected exactly one height for age ", ref, " and gender ", g,
                 ", found ", length(h), call. = FALSE)
        }

        temp$height <- h
        height.parts[[length(height.parts) + 1]] <- temp
    }
}
df <- if (length(height.parts) > 0) do.call(rbind, height.parts) else data.frame()
df$height <- as.integer(df$height)

## Introducing index per id, and sequence
# idx numbers the walks consecutively: ids in order of first appearance in df,
# sequences in order of first appearance within each id.
idx <- 0
walk.parts <- list()
for (i in unique(df$id)) {
    sq <- unique(df[df$id == i, ]$seq)
    # report subjects with fewer than two walk sequences
    if (length(sq) < 2) print(paste("sq < 2, id = ", i))
    for (s in sq) {
        temp <- subset(df, id == i & seq == s)
        idx <- idx + 1
        temp$idx <- idx
        # progress output every 100 walks
        if ((idx %% 100) == 0) print(idx)
        walk.parts[[idx]] <- temp
    }
}
mydf <- if (length(walk.parts) > 0) do.call(rbind, walk.parts) else data.frame()


# Store enhanced set
write.csv(mydf, file = paste0(root, "/gait.sensor.center.csv"), row.names = FALSE)

Data set ready for gaitpy module application

# Load the enhanced set without its 2nd and 4th column and keep one row per
# subject for the three distribution plots.
my_height_age <- read.csv(file = paste0(root, "/gait.sensor.center.csv"))[, -c(2, 4)]
one.row.per.subject <- my_height_age[!duplicated(my_height_age$id), ]

# Height against age, coloured by gender.
p.age.height.distribution <- ggplot(
    data = one.row.per.subject,
    aes(x = age, y = height, color = factor(gender))
) +
    geom_point() +
    ggtitle("") +
    scale_color_discrete(labels = c("female", "male")) +
    labs(color = "Average height per age") +
    guides(fill = guide_legend(title = "")) +
    theme(
        legend.position = c(0.6, 0.4),
        legend.background = element_rect(fill = alpha('white', 0.1))
    )

# Layers shared by both histograms: overlaid semi-transparent bars of width 1.
histogram.layers <- list(
    geom_histogram(alpha = 0.5, position = 'identity', binwidth = 1),
    ggtitle(""),
    scale_fill_discrete(labels = "")
)

# Histogram of subject heights, filled by gender, legend hidden.
p.height.distribution <- ggplot(
    data = one.row.per.subject,
    aes(x = height, fill = as.factor(gender))
) +
    histogram.layers +
    labs(color = "Height Distribution") +
    guides(fill = guide_legend(title = "")) +
    theme(legend.position = "none")

# Histogram of subject ages, filled by gender, legend hidden.
p.age.distribution <- ggplot(
    data = one.row.per.subject,
    aes(x = age, fill = as.factor(gender))
) +
    histogram.layers +
    labs(color = "Age Distribution") +
    guides(fill = guide_legend(title = "")) +
    theme(legend.position = "none")

Postprocessing of gaitpy module output

Undetected bouts

Determines for how many walks no bout could be detected.

# Keep only the digit characters of a string and read them as one number.
digits.as.number <- function(x) {
    as.numeric(gsub("[^[:digit:]]", "", x))
}

# All walks: one row per raw sensor file, with id and sequence taken from
# fields 2 and 4 of the "_"-separated file name.
fls_all <- list.files(path = paste0(root, center.data))
all_ids <- Reduce(rbind, lapply(fls_all, function(sensor.file) {
    name.fields <- unlist(strsplit(sensor.file, "_"))[c(2, 4)]
    subject <- digits.as.number(name.fields[1])
    walk <- digits.as.number(name.fields[2])
    data.frame(id = subject, seq = walk, ids = paste0(subject, walk, collapse = ""))
}), data.frame())

# Walks with a result file: id and sequence are the first two "_" fields of
# each file name in the Results directory.
fls_bouts <- list.files(path = paste0(root, "/Results"))
bout_ids <- Reduce(rbind, lapply(fls_bouts, function(result.file) {
    name.fields <- unlist(strsplit(result.file, "_"))
    subject <- as.numeric(name.fields[1])
    walk <- as.numeric(name.fields[2])
    data.frame(id = subject, seq = walk, ids = paste0(subject, walk, collapse = ""))
}), data.frame())

# Walks that have a sensor file but no result file.
id_no_bout <- all_ids[!(all_ids$ids %in% bout_ids$ids), ]

# One row per walk index.
mydf <- my_height_age[!duplicated(my_height_age$idx), ]

# Hard-coded list of walks without detected bout: sequence 1 of six subjects
# plus sequence 0 of subject 469147.
ids.seq.1 <- c(9859, 223743, 250428, 259559, 370121, 469147)
undetected <- (mydf$seq == 1 & mydf$id %in% ids.seq.1) |
    (mydf$id == 469147 & mydf$seq == 0)
mydf.bout.undetected <- mydf[which(undetected), ]

Undetected cycles

Based on the detected bouts, I determine here how many cycles remained undetected within the bouts.

# Select the result files of at most 1 byte; they are treated as walks for
# which no gait cycle was detected.
fls.cycles <- list.files(path = paste0(root, "/Results"))

# One vectorised size lookup for all files. file.path() returns an empty
# vector when there are no files, so an empty directory yields an empty result.
result.sizes <- file.info(file.path(paste0(root, "/Results"), fls.cycles))$size
File_no_cy_det <- fls.cycles[which(result.sizes <= 1)]

# The first three "_"-separated fields of the file name are read as
# ID, sequence and walk index.
name.parts <- strsplit(File_no_cy_det, "_")
ID_no_cy_det <- vapply(name.parts, function(p) p[1], character(1))
Seq_no_cy_det <- vapply(name.parts, function(p) p[2], character(1))
Idx_no_cy_det <- vapply(name.parts, function(p) p[3], character(1))

df_no_cy_det <- data.frame(file = File_no_cy_det,
                           ID = as.integer(ID_no_cy_det),
                           Seq = as.integer(Seq_no_cy_det),
                           Idx = as.integer(Idx_no_cy_det))

# Preparing data frame to plot: one row per walk index, restricted to the
# walks without detected cycle. The mask is built from the rows themselves so
# that it always has one entry per row of mydf.
mydf <- my_height_age[!duplicated(my_height_age$idx), ]
mydf.cycled.detection.failed <- mydf[mydf$idx %in% df_no_cy_det$Idx, ]

Cycles undetected, normalized

# participants per age
mydf  = read.csv( file = paste0(root, "/gait.sensor.center.csv"))[, c("id","idx", "gender", "age")]
myall = mydf[!duplicated(mydf$idx), c("idx", "gender", "age") ]
ag.sum = myall %>% group_by(gender, age) %>% summarise(
    n.all = length(age),
    .groups = 'drop'
)

# not detected per age
mydf  = read.csv( file = paste0(root, "/gait.sensor.center.csv"))[, c("idx", "gender", "age")]
mydf = mydf[!duplicated(mydf$idx), ]
my.ag.not = mydf[mydf$idx %in% df_no_cy_det$Idx, c("gender", "age")  ]

ag.not.sum = my.ag.not %>% group_by(gender, age) %>% summarise(
    n.not = length(age), 
    .groups = 'drop'
)

# merging age groups no cycles detected with all ages groups
df = merge(ag.not.sum, ag.sum, by = c("age", 'gender'), all = TRUE)
df[is.na(df$n.not),]$n.not = 0
# normalization, no detected relative to all subject in a given age group 
df$percent = df$n.not/df$n.all

p.cycles.undetedected = ggplot(data = df, aes(x = as.factor(age), fill = as.factor(gender))) +
    geom_bar(aes(y = percent ), stat="identity", width = 0.9,
             position = position_dodge2(preserve = "total")) +
    ggtitle("") +
    scale_fill_hue(name="", labels = c("female", "male")) + 
    scale_x_discrete("age", expand=c(0,0)) +
    theme(legend.position = c(0.05, 0.85), 
          legend.background = element_rect(fill=alpha('white', 0.1)),
          axis.text.x = element_text(angle=90, vjust = 0.5, size = 9))

Cycles detected

## Result files larger than 1 byte are treated as walks with detected cycles.
# One vectorised size lookup for all files in fls.cycles. file.path() returns
# an empty vector when there are no files.
result.sizes <- file.info(file.path(paste0(root, "/Results"), fls.cycles))$size
File_cy_det <- fls.cycles[which(result.sizes > 1)]

# The first three "_"-separated fields of the file name are read as
# ID, sequence and walk index.
name.parts <- strsplit(File_cy_det, "_")
ID_cy_det <- vapply(name.parts, function(p) p[1], character(1))
Seq_cy_det <- vapply(name.parts, function(p) p[2], character(1))
Idx_cy_det <- vapply(name.parts, function(p) p[3], character(1))

df_cy_det <- data.frame(file = File_cy_det,
                        ID = as.integer(ID_cy_det),
                        Seq = as.integer(Seq_cy_det),
                        Idx = as.integer(Idx_cy_det))

# One row per walk index, restricted to the walks with detected cycles. The
# mask is built from the rows themselves so that it always has one entry per
# row of mydf.
mydf <- my_height_age[!duplicated(my_height_age$idx), ]
mydf.cycles.detected <- mydf[mydf$idx %in% df_cy_det$Idx, ]

Aggregating Gait Features (Output of gaitpy module)

In sum, 1146 gait cycles were detected, and their gait parameters are aggregated into one file.

## Stacks the result files written by the gaitpy module into one data frame
## for the analysis.

# every file name in the results directory
fls <- list.files(path = paste0(root, results))

# Read one result file per list entry; files of at most one byte yield NULL.
per.file <- lapply(fls, function(result.file) {
    result.path <- paste0(root, results, "/", result.file)
    if (file.info(result.path)$size <= 1) {
        return(NULL)
    }
    features <- read.csv(result.path)

    # id, sequence and walk index are the first three "_" fields of the name
    name.fields <- unlist(strsplit(result.file, "_"))
    features$id <- name.fields[1]
    features$seq <- name.fields[2]
    features$idx <- name.fields[3]
    features
})
gait.features <- Reduce(rbind, Filter(Negate(is.null), per.file), data.frame())

# Attach age, gender and height of each walk via the walk index.
gait.features$idx <- as.integer(gait.features$idx)
first.row.per.walk <- !duplicated(my_height_age$idx)
age.height.gender <- my_height_age[first.row.per.walk, c("idx", "age", "gender", "height")]

df.features <- merge(gait.features, age.height.gender, by = "idx")

# Reorder the columns by position (31:35 first, then 1, then 2:30) and drop
# the four columns that are not carried into the output file.
column.order <- c(31:35, 1, 2:30)
osaka.features <- df.features[, column.order]
osaka.features <- subset(osaka.features, select = -c(bout_start_time, bout_number, IC, FC))

write.csv(osaka.features, paste0(root, "/gait.characteristics.csv"), row.names = FALSE)

Variability by Age and Gender

28 Gait Characteristics

Reproducibility

Each subject was encouraged to perform two flat-ground level walks. In the following, the reproducibility of the module output is assessed by comparing the two level walks (seq 0 and seq 1).

# Load the aggregated gait characteristics (one row per detected bout).
gt.char = read.csv(paste0(root, "/gait.characteristics.csv"))

# Mean of every remaining column per (id, seq, idx, age, gender, height)
# group, i.e. one row of averaged gait characteristics per walk.
age.gender =  gt.char %>% 
    group_by(id, seq, idx, age, gender, height) %>% 
    summarise_all(list(mean))

# Keep only the subjects whose id occurs more than once in age.gender.
# NOTE(review): presumably these are the subjects with results for both walks;
# the code only checks for a repeated id - confirm.
vec.2walks = age.gender[duplicated(age.gender$id), ]$id
df.2walks = age.gender[ age.gender$id %in% vec.2walks, ]

# Split the data into the walk with seq == 0 and the walk with seq == 1.
lev.walk.0 = subset(df.2walks, seq == 0)
lev.walk.1 = subset(df.2walks, seq == 1)

# Per characteristic: av is the mean of the two walks, and sdt equals
# |walk0 - walk1| / sqrt(2), the sample standard deviation of the two values.
# Nothing is divided by the average here.
# -c(1:7) drops the first seven columns, i.e. the six grouping columns plus
# the first summarised column.
# NOTE(review): confirm that dropping that seventh column is intended.
# The element-wise arithmetic assumes both subsets list the same subjects in
# the same row order - TODO confirm.
av = (lev.walk.0[,-c(1:7)] + lev.walk.1[,-c(1:7)])/2
sdt = sqrt((lev.walk.0[,-c(1:7)] - av)^2 + (lev.walk.1[,-c(1:7)] - av)^2)

# Attach the first six columns of each subject's repeated row (the row flagged
# by duplicated()) to the per-characteristic deviations.
df.2walks.sd = cbind(df.2walks[duplicated(df.2walks$id),c(1:6)], sdt)

# Long format: one row per subject and gait characteristic.
df.2walks.sd.long = melt(df.2walks.sd, 
                         id.vars = c("id", "age", "gender", "seq", "idx", "height"),
                         variable.name="characteristic")


# Rows for gait cycles, steps and cadence only.
stp.cad.gait = subset(df.2walks.sd.long, characteristic == "gait_cycles" | 
                          characteristic == "steps" |
                          characteristic == "cadence"
)

# Rows for all other characteristics.
no.stp.cad.gait = subset(df.2walks.sd.long, characteristic != "gait_cycles" & 
                             characteristic != "steps" &
                             characteristic != "cadence"
)

Gaitpy Module Assessment

Osaka Data Set

Table 1: Meta data Osaka data set
subjects age gender walks rows col
744 3-78 years m = 359, f = 385 2 805945 8

The subject with “id = 469147” is present in the data set but not in the age and gender table. According to its description, the entire data set should contain 744 subjects, so I excluded subject 469147. There are 744 subjects; each subject performs 2 walks (seq = 0 or 1), which amounts to 1488 unique walk indices (idx) and 805945 records.

Osaka Data Set & Augmentation
Table 2: Osaka data set
id y seq gender age
103114 -0.830 0 0 33
156237 -0.684 0 0 33
200938 -1.122 0 0 33
227547 -0.960 0 0 33
250220 -1.028 0 0 33
Table 3: Osaka data set extended by timestamp and height
id y seq gender age time height
103114 -0.830 0 0 33 2000-01-01 00:00:00.000 158
156237 -0.684 0 0 33 2000-01-01 00:00:00.000 158
200938 -1.122 0 0 33 2000-01-01 00:00:00.000 158
227547 -0.960 0 0 33 2000-01-01 00:00:00.000 158
250220 -1.028 0 0 33 2000-01-01 00:00:00.000 158

time: The gaitpy module requires a Unix time stamp with a resolution of 10 ms, e.g. 2000-01-01 00:00:00.000, 2000-01-01 00:00:00.010, 2000-01-01 00:00:00.020, … The time stamp is used to synchronize and normalize the data sampling rate within the module.
height: The module requires also the height of each subject. The Osaka data set doesn’t provide the height. Therefore I sampled from the average age-height distribution, issued by Japan’s sport agency in 2018. The graphics below show the age-height distribution generated.

Age Height Distribution

Figure 1: Age Height Distribution

Gaitpy Module Assessment Strategy

Determining:

  1. Bout detection rate

  2. Cycle detection rate

  3. Bias towards age, or gender in regard to bouts and cycle detection

  4. Gait characteristics

    . Reproducing results from (1)
    . Something else

  1. Almarwani M, Perera S, VanSwearingen JM, Sparto PJ, Brach JS. The test-retest reliability and minimal detectable change of spatial and temporal gait variability during usual over-ground walking for younger and older adults. Gait Posture. 2016;44:94-99. doi:10.1016/j.gaitpost.2015.11.014

Gait Bout & Cycle Detection

The gaitpy module contains 2 pre-trained algorithms to identify bouts and cycles. From 744 subjects with 2 walks each, the module identified the following numbers of bouts and cycles. The tables below show how many bouts and cycles were detected. The graphic below shows the age and gender distribution of the undetected cycles. A limitation of the database is that all walk sequences of a subject are relatively short; they were captured within a session of about 1 min. Level and slope walk paths were about 9 m and 3 m long, respectively.

Table 4: Number of bouts with or without cycles
characteristic walks detected undetected percent
bouts 1490 1483 7 99.53
bouts with cycles 1483 1146 337 77.28
Distribution of undetected gait cycles

Figure 2: Distribution of undetected gait cycles

Conclusion
The normalized age distribution shows undetected gait cycles in percent for each age. The average of

Variability of stride duration asymmetry over age

Gait variability, or fluctuation in gait characteristics from one step to the next, is evident even when task demands and environmental conditions are constant. The generalized motor program theory (GMPT) states that motor programs are embedded in the central nervous system for the purpose of rapid and skilled movements. A healthy person performing a well-learned task is able to apply the motor program efficiently and accurately to fulfill the task. Any variability is considered an error in the selection of stored motor programs for gait. Gait characteristics such as step length or stride duration vary across multiple cycles in two distinct yet complementary ways: variability of gait, expressed by linear, predictable fluctuations described by the statistical parameters sd, var and range. Variability depends on age, pathology, and task and environmental demands. The amount of variability is very low for adults and high for children in the learning phase.

Variability of stride duration asymmetry | age groups 9-16, 17-65 and 66-78. The variability of the gait characteristic can be explained by age. Very young subjects are still learning to walk (here in a new setup, being part of an experiment). The stride duration asymmetry is

Reproducing Paper

Title: The test–retest reliability and minimal detectable change of spatial and temporal gait variability during usual over-ground walking for younger and older adults, Maha Almarwani, Subashan Perera, Jessie M. VanSwearingen, Patrick J. Sparto, Jennifer S. Brach, 2015

Assessing test–retest reliability and determine minimal detectable change (MDC) of spatial and temporal gait variability in younger and older adults. 40 younger (mean age = 26.6 +- 6.0 years) and 46 older adults (mean age = 78.1 +- 6.2 years) were included in the study.

Addendum

Gaitpy input

(#tab:gaitpy input)Gaitpy input data set
id y seq time gender age height idx
103114 -0.830 0 2000-01-01 00:00:00.000 0 33 158 1
103114 -0.730 1 2000-01-01 00:00:00.000 0 33 158 2
156237 -0.684 0 2000-01-01 00:00:00.000 0 33 158 3
156237 -0.846 1 2000-01-01 00:00:00.000 0 33 158 4
200938 -1.122 0 2000-01-01 00:00:00.000 0 33 158 5
200938 -0.702 1 2000-01-01 00:00:00.000 0 33 158 6
227547 -0.960 0 2000-01-01 00:00:00.000 0 33 158 7
227547 -0.644 1 2000-01-01 00:00:00.000 0 33 158 8
250220 -1.028 0 2000-01-01 00:00:00.000 0 33 158 9
250220 -1.220 1 2000-01-01 00:00:00.000 0 33 158 10

Gaitpy output

(#tab:gaitpy output)Gaitpy module output variables
Variable Class Values
id character 10002, 101212, 101628
seq character 0, 1, NA
age integer 2, 3, 4
gender integer 0, 1, NA
height integer 87, 94, 95
idx integer 1, 3, 4
bout_length_sec double 2.86, 2.96, 3
gait_cycles integer 1, 2, 3
steps integer 6, 7, 8
stride_duration double 0.7, 0.72, 0.74
stride_duration_asymmetry double 0, 0.02, 0.04
step_duration double 0.3, 0.34, 0.36
step_duration_asymmetry double 0, 0.02, 0.04
cadence double 41.67, 42.86, 56.6
initial_double_support double 0.04, 0.06, 0.08
initial_double_support_asymmetry double 0, 0.02, 0.04
terminal_double_support double -0.96, -0.44, -0.26
terminal_double_support_asymmetry double 0, 0.02, 0.04
double_support double -0.86, -0.24, -0.16
double_support_asymmetry double 0, 0.02, 0.04
single_limb_support double -0.42, -0.12, -0.1
single_limb_support_asymmetry double 0, 0.02, 0.04
stance double 0.46, 0.48, 0.5
stance_asymmetry double 0, 0.02, 0.04
swing double -0.12, -0.1, -0.08
swing_asymmetry double 0, 0.02, 0.04
step_length double 0.11, 0.21, 0.25
step_length_asymmetry double 0, 0.01, 0.02
stride_length double 0.42, 0.46, 0.47
stride_length_asymmetry double 0, 0.01, 0.02
gait_speed double 0.26, 0.28, 0.29

Subjects with undetected bouts

Bout Histogram (remarkable gaps?!)

The histogram shows no bout lengths between 6 and 7.5 sec or between 3 and 4.5 sec. Both gaps span 1.5 seconds. The distribution looks odd. Is there an explanation, or is further investigation needed?

Subjects with detected cycles

Subjects with undetected cycles

Distributions detected cycles vs. undetected cycles

Walk 1 vs. Walk 2 standard deviation per gait character

Gait Cycles, Steps, Cadence Other gait characteristics

25 gait characteristics

Remarks:
Filter outlier I filtered age > 12, stride duration asymmetry < 0.26 (need to figure which subject that is … )