R Code Section

Preprocessing

Osaka Data Set Aggregation

Reads in and aggregates raw sensor data by subject.

# Get the names of all raw sensor files (one per subject and walk) found in
# the directory paste0(root, center.data).
fls <- list.files(path = paste0(root, center.data))
if (length(fls) == 0) {
    stop("no sensor files found in ", paste0(root, center.data))
}

# Read the accelerometer columns (4-6) of every file and tag each record with
# the subject ID and the walk sequence (0 or 1) encoded in the file name.
# The per-file frames are collected in a list and bound once, instead of
# growing df.ac with rbind() on every iteration.
sensor.parts <- lapply(fls, function(fl) {
    # the first two lines of each file are skipped; the files carry no header
    df.temp <- read.csv(file = paste0(root, center.data, fl),
                        header = FALSE, skip = 2)[, 4:6]

    # the 2nd and 4th "_"-separated tokens of the file name hold ID and sequence
    id.seq <- unlist(strsplit(fl, "_"))[c(2, 4)]

    # first run of digits in the ID token, first single digit in the sequence token
    id <- regmatches(id.seq[1], regexec("[0-9]+", id.seq[1]))
    sq <- regmatches(id.seq[2], regexec("[0-9]", id.seq[2]))

    df.temp$ID <- as.character(id)
    df.temp$seq <- as.character(sq)
    # NOTE(review): timestamp(n = ) is not defined in this section; presumably a
    # project helper that returns n time stamps -- confirm it is loaded.
    df.temp$time <- timestamp(n = nrow(df.temp))
    df.temp
})
df.ac <- do.call(rbind, sensor.parts)

# Convert the merge key in place. Adding a separate `id` column here would
# leave seven columns in df.ac and shift the eight positional names assigned
# after the merge (the `id` copy would end up labelled "gender").
df.ac$ID <- as.integer(df.ac$ID)

# merge over id age and gender
# read in gender, age file
df.age.gender <- read.csv(file = paste0(root, age.gender))

# merge file (merge() has no `rownames` argument, so none is passed)
df.gait <- merge(df.ac, df.age.gender, by = "ID")
#df.gait = df.gait[complete.cases(df.gait), ]

# clean up data frame: the names are positional, so fail loudly if the merged
# frame does not have exactly the expected number of columns
names <- c("id", "x", "y", "z", "seq", "time", "gender", "age")
stopifnot(ncol(df.gait) == length(names))
colnames(df.gait) <- names
df.gait <- df.gait[order(df.gait$seq, df.gait$id, df.gait$time), ]

# Data frame Report
str(df.gait)

write.csv(df.gait, file = paste0(root, "/initial.gait.sensor.center.csv"), row.names = FALSE)

# Re-read the aggregated data set from disk.
my_height_age <- read.csv(
    file = paste0(root, "/initial.gait.sensor.center.csv"),
    row.names = NULL
)

# One-row summary of the data set: counts are computed from the file just
# read, the age range and gender split are fixed descriptive strings.
data.set.dimensions <- data.frame(
    subjects = length(unique(my_height_age$id)),
    age = "3-78 years",
    gender = "m = 359, f = 385",
    walks = length(unique(my_height_age$seq)),
    rows = nrow(my_height_age),
    col = ncol(my_height_age)
)

Enhancing Osaka data set with height and unix time stamp

In order to apply the gaitpy module, the height of each subject and a Unix timestamp are required. The missing height is sampled from the height distribution (2018), collected yearly by the Japan Sports Agency. https://nbakki.hatenablog.com/entry/Average_Height_of_Japanese_2018

Preprocessing height / age data 2018

# Average height table: first sheet of the workbook. The `age` column holds
# either a single age or a range written as "from-to".
h.jp <- read.xlsx(paste0(root, "/average_height_jp.xlsx"), sheet = 1)

# Expand every age entry into one row per year of age and gender code g.
# The height for gender code g is taken from sheet column 3 - g.
chunks <- list()
for (age.label in h.jp$age) {
    bounds <- as.integer(unlist(strsplit(age.label, "-")))
    if (length(bounds) > 1) {
        ages <- seq(bounds[1], bounds[2], 1)
    } else {
        ages <- as.numeric(bounds)
    }

    for (g in c(0, 1)) {
        n.ages <- length(ages)
        chunks[[length(chunks) + 1]] <- cbind(
            ages,
            rep(h.jp[h.jp$age == age.label, 3 - g], n.ages),
            rep(g, n.ages)
        )
    }
}

# The most recently built chunk goes on top, hence rev() before binding.
jp <- as.data.frame(do.call(rbind, rev(chunks)))

names(jp) <- c("age", "height", "gender")
jp <- jp[order(jp$age), ]

# Scatter plot of average height against age, coloured by gender
# (kept in `p`, not printed here).
p <- ggplot(data = jp, aes(x = age, y = height, color = factor(gender))) +
    geom_point() +
    ggtitle("") +
    scale_color_discrete(labels = c("female", "male")) +
    labs(color = "Average height per age in Japan, 2018") +
    theme(
        legend.position = c(0.6, 0.2),
        legend.background = element_rect(fill = alpha("white", 0.1))
    )
#p

Associating age with height

## Associating height, age and gender
my_height_age <- read.csv(file = paste0(root, "/initial.gait.sensor.center.csv"))

# For every (age, gender) group present in the sensor data, look up the
# reference height in `jp` (column 2) and attach it to all records of the
# group. Groups are collected in a list and bound once at the end rather than
# growing `df` with rbind() inside the loop.
height.parts <- list()
for (ref in unique(my_height_age$age)) {
    for (g in c(0, 1)) {
        temp <- subset(my_height_age, age == ref & gender == g)
        if (nrow(temp) == 0) next

        # The lookup must yield exactly one value: several different values
        # would be recycled silently across the rows, none would fail with an
        # unhelpful "replacement has 0 rows" error.
        ref.height <- unique(jp[jp$age == ref & jp$gender == g, 2])
        if (length(ref.height) != 1) {
            stop("expected exactly one reference height for age ", ref,
                 " and gender ", g, ", found ", length(ref.height))
        }
        temp$height <- ref.height
        height.parts[[length(height.parts) + 1]] <- temp
    }
}
df <- if (length(height.parts) > 0) do.call(rbind, height.parts) else data.frame()
df$height <- as.integer(df$height)

## Introducing index per id, and sequence
# Each (id, seq) combination, i.e. each walk, gets a running index `idx`,
# numbered in order of first appearance of the id and then of the sequence.
idx <- 0
walk.parts <- list()
for (i in unique(df$id)) {
    sq <- unique(df[df$id == i, ]$seq)
    if (length(sq) < 2) print(paste("sq < 2, id = ", i))
    for (s in sq) {
        temp <- subset(df, id == i & seq == s)
        idx <- idx + 1
        temp$idx <- idx
        # progress report every 100 walks
        if ((idx %% 100) == 0) print(idx)
        walk.parts[[idx]] <- temp
    }
}
mydf <- if (length(walk.parts) > 0) do.call(rbind, walk.parts) else data.frame()

# Store enhanced set
write.csv(mydf, file = paste0(root, "/gait.sensor.center.csv"), row.names = FALSE)

Data set ready for gaitpy module application

# Enhanced data set without its 2nd and 4th column.
my_height_age <- read.csv(file = paste0(root, "/gait.sensor.center.csv"))[, -c(2, 4)]

# First record of every subject; all three plots below are drawn from it.
subject.rows <- my_height_age[!duplicated(my_height_age$id), ]

# Height against age, one point per subject, coloured by gender.
p.age.height.distribution <- ggplot(
    data = subject.rows,
    aes(x = age, y = height, color = factor(gender))
) +
    geom_point() +
    ggtitle("") +
    scale_color_discrete(labels = c("female", "male")) +
    labs(color = "Average height per age") +
    guides(fill = guide_legend(title = "")) +
    theme(
        legend.position = c(0.6, 0.4),
        legend.background = element_rect(fill = alpha("white", 0.1))
    )

# Histogram of subject heights, overlaid per gender, legend hidden.
p.height.distribution <- ggplot(
    data = subject.rows,
    aes(x = height, fill = as.factor(gender))
) +
    geom_histogram(alpha = 0.5, position = "identity", binwidth = 1) +
    ggtitle("") +
    scale_fill_discrete(labels = "") +
    labs(color = "Height Distribution") +
    guides(fill = guide_legend(title = "")) +
    theme(legend.position = "none")

# Histogram of subject ages, overlaid per gender, legend hidden.
p.age.distribution <- ggplot(
    data = subject.rows,
    aes(x = age, fill = as.factor(gender))
) +
    geom_histogram(alpha = 0.5, position = "identity", binwidth = 1) +
    ggtitle("") +
    scale_fill_discrete(labels = "") +
    labs(color = "Age Distribution") +
    guides(fill = guide_legend(title = "")) +
    theme(legend.position = "none")

Postprocessing of gaitpy module output

Undetected bouts

Determination of the number of walks for which no bouts could be detected

# Keeps only the digit characters of a file-name token and reads them as one
# number.
digits.as.number <- function(token) {
    chars <- unlist(strsplit(token, split = ""))
    as.numeric(paste(grep("[[:digit:]]+", chars, value = TRUE), collapse = ""))
}

# All walks: id and sequence come from the 2nd and 4th "_"-separated token of
# every raw sensor file name; `ids` is the two pasted together as a key.
fls_all <- list.files(path = paste0(root, center.data))
all_ids <- do.call(rbind, c(
    list(data.frame()),
    lapply(fls_all, function(fn) {
        tokens <- unlist(strsplit(fn, "_"))[c(2, 4)]
        walk.id <- digits.as.number(tokens[1])
        walk.seq <- digits.as.number(tokens[2])
        data.frame(id = walk.id, seq = walk.seq,
                   ids = paste0(walk.id, walk.seq, collapse = ""))
    })
))

# Walks with a result file: id and sequence are the 1st and 2nd token of the
# file names found in the Results directory.
fls_bouts <- list.files(path = paste0(root, "/Results"))
bout_ids <- do.call(rbind, c(
    list(data.frame()),
    lapply(fls_bouts, function(fn) {
        tokens <- unlist(strsplit(fn, "_"))
        walk.id <- as.numeric(tokens[1])
        walk.seq <- as.numeric(tokens[2])
        data.frame(id = walk.id, seq = walk.seq,
                   ids = paste0(walk.id, walk.seq, collapse = ""))
    })
))

# Walks that have a raw file but no result file.
id_no_bout <- all_ids[!all_ids$ids %in% bout_ids$ids, ]

# One record per walk.
mydf <- my_height_age[!duplicated(my_height_age$idx), ]

# Fixed list of (id, seq) pairs to pull out of mydf.
undetected.walks <- data.frame(
    id = c(9859, 223743, 250428, 259559, 370121, 469147, 469147),
    seq = c(1, 1, 1, 1, 1, 1, 0)
)
mydf.bout.undetected <- mydf[
    paste(mydf$id, mydf$seq) %in% paste(undetected.walks$id, undetected.walks$seq),
]

Undetected cycles

Based on the detected bouts, here I determine how many cycles were undetected within the bouts.

# Select the result files of at most 1 byte; going by the variable names these
# are the walks for which no gait cycles were detected.
fls.cycles <- list.files(path = paste0(root, "/Results"))

# Sizes of all result files in one call; file.path() (unlike paste0()) yields
# an empty vector when the directory listing is empty.
result.sizes <- file.info(file.path(paste0(root, "/Results"), fls.cycles))$size
File_no_cy_det <- fls.cycles[which(result.sizes <= 1)]

# The file names are "_"-separated: 1st token id, 2nd sequence, 3rd idx.
name.parts <- strsplit(File_no_cy_det, "_")
ID_no_cy_det <- vapply(name.parts, function(p) p[1], character(1))
Seq_no_cy_det <- vapply(name.parts, function(p) p[2], character(1))
Idx_no_cy_det <- vapply(name.parts, function(p) p[3], character(1))

df_no_cy_det <- data.frame(file = File_no_cy_det,
                           ID = as.integer(ID_no_cy_det),
                           Seq = as.integer(Seq_no_cy_det),
                           Idx = as.integer(Idx_no_cy_det))

# Preparing data frame to plot: one record per walk, restricted to the walks
# listed above. The mask is built from the row-wise idx column so that it is
# always aligned with the rows of mydf.
mydf <- my_height_age[!duplicated(my_height_age$idx), ]
mydf.cycled.detection.failed <- mydf[mydf$idx %in% df_no_cy_det$Idx, ]

Cycles undetected, normalized

# participants per age
# The enhanced data set is read once and reduced to one record per walk (idx);
# the same frame serves for the totals and for the undetected subset.
mydf <- read.csv(file = paste0(root, "/gait.sensor.center.csv"))[, c("idx", "gender", "age")]
mydf <- mydf[!duplicated(mydf$idx), ]
myall <- mydf

# number of walks per gender and age
ag.sum <- myall %>%
    group_by(gender, age) %>%
    summarise(n.all = n(), .groups = "drop")

# not detected per age
my.ag.not <- mydf[mydf$idx %in% df_no_cy_det$Idx, c("gender", "age")]

ag.not.sum <- my.ag.not %>%
    group_by(gender, age) %>%
    summarise(n.not = n(), .groups = "drop")

# merging age groups no cycles detected with all ages groups
df <- merge(ag.not.sum, ag.sum, by = c("age", "gender"), all = TRUE)
# Groups without any undetected walk come out of the merge as NA. Assigning
# through the column keeps this a no-op when there is nothing to replace
# (row-subsetting first fails with "replacement has 1 row, data has 0").
df$n.not[is.na(df$n.not)] <- 0
# normalization, no detected relative to all subject in a given age group
# (a fraction n.not / n.all, not multiplied by 100)
df$percent <- df$n.not / df$n.all

# Dodged bars of the undetected fraction per age, filled by gender.
p.cycles.undetedected <- ggplot(data = df, aes(x = as.factor(age), fill = as.factor(gender))) +
    geom_bar(aes(y = percent), stat = "identity", width = 0.9,
             position = position_dodge2(preserve = "total")) +
    ggtitle("") +
    scale_fill_hue(name = "", labels = c("female", "male")) +
    scale_x_discrete("age", expand = c(0, 0)) +
    theme(legend.position = c(0.05, 0.85),
          legend.background = element_rect(fill = alpha("white", 0.1)),
          axis.text.x = element_text(angle = 90, vjust = 0.5, size = 9))

Cycles detected

## Files > 1Byte, Bouts detected
# Select the result files larger than 1 byte from the listing in fls.cycles.
# file.path() (unlike paste0()) yields an empty vector for an empty listing.
result.sizes <- file.info(file.path(paste0(root, "/Results"), fls.cycles))$size
File_cy_det <- fls.cycles[which(result.sizes > 1)]

# The file names are "_"-separated: 1st token id, 2nd sequence, 3rd idx.
name.parts <- strsplit(File_cy_det, "_")
ID_cy_det <- vapply(name.parts, function(p) p[1], character(1))
Seq_cy_det <- vapply(name.parts, function(p) p[2], character(1))
Idx_cy_det <- vapply(name.parts, function(p) p[3], character(1))

df_cy_det <- data.frame(file = File_cy_det,
                        ID = as.integer(ID_cy_det),
                        Seq = as.integer(Seq_cy_det),
                        Idx = as.integer(Idx_cy_det))

# One record per walk, restricted to the walks listed above. The mask is built
# from the row-wise idx column so that it is always aligned with the rows.
mydf <- my_height_age[!duplicated(my_height_age$idx), ]
mydf.cycles.detected <- mydf[mydf$idx %in% df_cy_det$Idx, ]

Aggregating Gait Features (Output of gaitpy module)

In sum, 1146 gait cycles were detected, and their gait parameters are aggregated into one file.

## Takes the results generated by the gaitpy module and aggregates them into
## one data frame for analysis.

# get all file names in the results directory
fls <- list.files(path = paste0(root, results))

# Read every result file larger than 1 byte and tag its rows with the id,
# sequence and idx taken from the "_"-separated file name. The frames are
# collected in a list and bound once instead of growing gait.features with
# rbind() inside the loop.
feature.parts <- list()
for (f in fls) {
    pth <- paste0(root, results, "/", f)
    # files of at most 1 byte carry no data and are skipped
    if (file.info(pth)$size <= 1) {
        next
    }
    bout.features <- read.csv(pth)

    # get meta data
    meta.data <- unlist(strsplit(f, "_"))
    bout.features$id <- meta.data[1]
    bout.features$seq <- meta.data[2]
    bout.features$idx <- meta.data[3]
    feature.parts[[length(feature.parts) + 1]] <- bout.features
}
gait.features <- if (length(feature.parts) > 0) do.call(rbind, feature.parts) else data.frame()

# including age and height for each subject
gait.features$idx <- as.integer(gait.features$idx)
age.height.gender <- my_height_age[!duplicated(my_height_age$idx), c("idx", "age", "gender", "height")]

df.features <- merge(gait.features, age.height.gender, by = "idx")

# Put the meta columns first, followed by the gaitpy columns in their original
# order. Selecting by name gives the same layout as the positional
# c(31:35, 1, 2:30) for 29 gaitpy columns and does not break when that count
# changes.
meta.cols <- c("id", "seq", "age", "gender", "height", "idx")
osaka.features <- df.features[, c(meta.cols, setdiff(names(df.features), meta.cols))]
osaka.features <- subset(osaka.features, select = -c(bout_start_time, bout_number, IC, FC))

write.csv(osaka.features, paste0(root, "/gait.characteristics.csv"), row.names = FALSE)

Variability by Age and Gender

28 Gait Characteristics

Reproducibility

Each subject was encouraged to perform 2 flat, ground-level walks. In the following, the reproducibility of the module is assessed by comparing the two level walks (seq 0 and 1).

# Compares the two level walks (seq 0 and seq 1) of each subject: per-walk
# means of all characteristics, their spread between the two walks, and a
# long-format table split into two groups of characteristics.
gt.char = read.csv(paste0(root, "/gait.characteristics.csv"))

# Mean of every remaining column within each
# (id, seq, idx, age, gender, height) group, i.e. one row per walk
age.gender =  gt.char %>% 
    group_by(id, seq, idx, age, gender, height) %>% 
    summarise_all(list(mean))

# ids that occur more than once in age.gender, and all rows of those ids
vec.2walks = age.gender[duplicated(age.gender$id), ]$id
df.2walks = age.gender[ age.gender$id %in% vec.2walks, ]

# split data in level walk 1 and level walk 2
lev.walk.0 = subset(df.2walks, seq == 0)
lev.walk.1 = subset(df.2walks, seq == 1)

# av: element-wise mean of the two walks; sdt: square root of the summed
# squared deviations from av (for two values this equals their sample
# standard deviation).
# -c(1:7) drops the first seven columns -- presumably the six grouping columns
# plus the first summarised column; TODO confirm that dropping it is intended.
# NOTE(review): the arithmetic pairs rows by position, so it assumes
# lev.walk.0 and lev.walk.1 hold the same ids in the same order -- confirm.
av = (lev.walk.0[,-c(1:7)] + lev.walk.1[,-c(1:7)])/2
sdt = sqrt((lev.walk.0[,-c(1:7)] - av)^2 + (lev.walk.1[,-c(1:7)] - av)^2)

# sdt joined with the first six columns of the repeated-id rows of df.2walks
df.2walks.sd = cbind(df.2walks[duplicated(df.2walks$id),c(1:6)], sdt)

# long format: the six id.vars are kept, every other column becomes a
# (characteristic, value) row
df.2walks.sd.long = melt(df.2walks.sd, 
                         id.vars = c("id", "age", "gender", "seq", "idx", "height"),
                         variable.name="characteristic")


# rows for gait_cycles, steps and cadence only
stp.cad.gait = subset(df.2walks.sd.long, characteristic == "gait_cycles" | 
                          characteristic == "steps" |
                          characteristic == "cadence"
)

# rows for all other characteristics
no.stp.cad.gait = subset(df.2walks.sd.long, characteristic != "gait_cycles" & 
                             characteristic != "steps" &
                             characteristic != "cadence"
)

Gaitpy Module Assessment

Osaka Data Set

Table 1: Meta data Osaka data set
subjects	age	gender	walks	rows	col
744	3-78 years	m = 359, f = 385	2	805945	8

The subject with “id = 469147” is present in the data set but not in the age/gender table. According to its description, the entire data set should have 744 subjects. I excluded subject 469147. There are 744 subjects, and each subject does 2 walks (seq = 0 or 1), which amounts to 1488 unique idx values (one per walk) and 805945 records.

Osaka Data Set & Augmentation

Table 2: Osaka data set
id	y	age
103114	-0.830	33
156237	-0.684	33
200938	-1.122	33
227547	-0.960	33
250220	-1.028	33

Table 3: Osaka data set extended by timestamp and height
id	y	age	time	height
103114	-0.830	33	2000-01-01 00:00:00.000	158
156237	-0.684	33	2000-01-01 00:00:00.000	158
200938	-1.122	33	2000-01-01 00:00:00.000	158
227547	-0.960	33	2000-01-01 00:00:00.000	158
250220	-1.028	33	2000-01-01 00:00:00.000	158

time: The gaitpy module requires a Unix timestamp with a resolution of 10 ms, e.g. 2000-01-01 00:00:00.000, 2000-01-01 00:00:00.010, 2000-01-01 00:00:00.020, … The timestamp is used to synchronize and normalize the data sampling rate within the module.
height: The module requires also the height of each subject. The Osaka data set doesn’t provide the height. Therefore I sampled from the average age-height distribution, issued by Japan’s sport agency in 2018. The graphics below show the age-height distribution generated.

Figure 1: Age Height Distribution

Gaitpy Module Assessment Strategy

Determining:

Bout detection rate
Cycle detection rate
Bias towards age, or gender in regard to bouts and cycle detection
Gait characteristics

. Reproducing results from (1)
. Something else

Almarwani M, Perera S, VanSwearingen JM, Sparto PJ, Brach JS. The test-retest reliability and minimal detectable change of spatial and temporal gait variability during usual over-ground walking for younger and older adults. Gait Posture. 2016;44:94-99. doi:10.1016/j.gaitpost.2015.11.014

Gait Bout & Cycle Detection

The gaitpy module entails 2 pre-trained algorithms to identify bouts and cycles. From the 744 subjects with 2 walks each, the module identified the following numbers of bouts and cycles. The tables below show how many bouts and cycles were detected. The graphic below shows the age and gender distribution of the undetected cycles. The limitation of the database is that all walk sequences of a subject are relatively short; they were captured within a session of about 1 min. Level and slope walk paths were about 9 m and 3 m long, respectively.

Table 4: Number of bouts with or without cycles
characteristic	walks	detected	undetected	percent
bouts	1490	1483	7	99.53
bouts with cycles	1483	1146	337	77.28

Figure 2: Distribution of undetected gait cycles

Conclusion
The normalized age distribution shows undetected gait cycles in percent for each age. The average of

Variability of stride duration asymmetry over age

Gait variability, or fluctuations in gait characteristics from one step to the next, is evident even when task demands and environmental conditions are constant. The generalized motor program theory (GMPT) states that motor programs are embedded in the central nervous system for the purpose of rapid and skilled movements. A healthy person with well-learned tasks is able to apply the motor program efficiently and accurately to fulfill a task. Any variability is considered an error in the selection of stored motor programs for gait. Gait characteristics such as step length or stride duration vary across multiple cycles in two distinct yet complementary ways: variability of gait, expressed by linear predictable fluctuations described by the statistical parameters sd, var, range. Variability depends on age, pathology, tasks and environmental demands. The amount of variability is very low for adults, and high for children in the learning phase.

Variability of stride duration asymmetry | age groups 9-16, 17-65 and 66-78 The variability of the gait characteristic can be explained by age. Very young subject are still learning to walk (here in a new setup, being a part of an experiment). The stride duration asymmetry is

Reproducing Paper

Title: The test–retest reliability and minimal detectable change of spatial and temporal gait variability during usual over-ground walking for younger and older adults, Maha Almarwani, Subashan Perera, Jessie M. VanSwearingen, Patrick J. Sparto, Jennifer S. Brach, 2015

Assessing test–retest reliability and determine minimal detectable change (MDC) of spatial and temporal gait variability in younger and older adults. 40 younger (mean age = 26.6 +- 6.0 years) and 46 older adults (mean age = 78.1 +- 6.2 years) were included in the study.

Addendum

Gaitpy input

(#tab:gaitpy input)Gaitpy input data set
id	y	seq	time	age	height	idx
103114	-0.830	0	2000-01-01 00:00:00.000	33	158	1
103114	-0.730	1	2000-01-01 00:00:00.000	33	158	2
156237	-0.684	0	2000-01-01 00:00:00.000	33	158	3
156237	-0.846	1	2000-01-01 00:00:00.000	33	158	4
200938	-1.122	0	2000-01-01 00:00:00.000	33	158	5
200938	-0.702	1	2000-01-01 00:00:00.000	33	158	6
227547	-0.960	0	2000-01-01 00:00:00.000	33	158	7
227547	-0.644	1	2000-01-01 00:00:00.000	33	158	8
250220	-1.028	0	2000-01-01 00:00:00.000	33	158	9
250220	-1.220	1	2000-01-01 00:00:00.000	33	158	10

Gaitpy output

(#tab:gaitpy output)Gaitpy module output variables
Variable	Class	Values
id	character	10002, 101212, 101628
seq	character	0, 1, NA
age	integer	2, 3, 4
gender	integer	0, 1, NA
height	integer	87, 94, 95
idx	integer	1, 3, 4
bout_length_sec	double	2.86, 2.96, 3
gait_cycles	integer	1, 2, 3
steps	integer	6, 7, 8
stride_duration	double	0.7, 0.72, 0.74
stride_duration_asymmetry	double	0, 0.02, 0.04
step_duration	double	0.3, 0.34, 0.36
step_duration_asymmetry	double	0, 0.02, 0.04
cadence	double	41.67, 42.86, 56.6
initial_double_support	double	0.04, 0.06, 0.08
initial_double_support_asymmetry	double	0, 0.02, 0.04
terminal_double_support	double	-0.96, -0.44, -0.26
terminal_double_support_asymmetry	double	0, 0.02, 0.04
double_support	double	-0.86, -0.24, -0.16
double_support_asymmetry	double	0, 0.02, 0.04
single_limb_support	double	-0.42, -0.12, -0.1
single_limb_support_asymmetry	double	0, 0.02, 0.04
stance	double	0.46, 0.48, 0.5
stance_asymmetry	double	0, 0.02, 0.04
swing	double	-0.12, -0.1, -0.08
swing_asymmetry	double	0, 0.02, 0.04
step_length	double	0.11, 0.21, 0.25
step_length_asymmetry	double	0, 0.01, 0.02
stride_length	double	0.42, 0.46, 0.47
stride_length_asymmetry	double	0, 0.01, 0.02
gait_speed	double	0.26, 0.28, 0.29

Subjects with undetected bouts

Bout Histogram (remarkable gaps?!)

The histogram shows no bout lengths between 6-7.5 sec and between 3-4.5 sec. Both gaps span 1.5 seconds. The distribution looks weird. Is there an explanation, or is further investigation needed?

Subjects with detected cycles

Subjects with undetected cycles

Distributions detected cycles vs. undetected cycles

Walk 1 vs. Walk 2 standard deviation per gait character

Gait Cycles, Steps, Cadence; Other gait characteristics

25 gait characteristics

Remarks:
Filter outlier I filtered age > 12, stride duration asymmetry < 0.26 (need to figure which subject that is … )

DRAFT Assessment Gaitpy Module