Reads in and aggregates raw sensor data by subject.
# List the raw sensor files found in the data directory.
fls = list.files(path = paste0(root, center.data))
# Read the accelerometer columns of every file, tag each reading with the
# subject id and walk sequence parsed from the file name, and stack everything
# into df.ac. The pieces are collected in a list and bound once, which avoids
# re-copying the growing data frame for every file, and an empty directory
# simply yields an empty data frame.
sensor.parts = lapply(fls, function(fl) {
  # Skip the first two lines of the file and keep columns 4 to 6 only.
  df.temp = read.csv(file = paste0(root, center.data, fl),
                     header = FALSE, skip = 2)[, c(4:6)]
  # The file name is split on "_"; fields 2 and 4 carry id and sequence.
  id.seq = unlist(strsplit(fl, "_"))[c(2, 4)]
  # Subject id: first run of digits in field 2.
  id = regmatches(id.seq[1], regexec('[0-9]+', id.seq[1]))
  # Walk sequence: first single digit in field 4.
  sq = regmatches(id.seq[2], regexec('[0-9]', id.seq[2]))
  # regmatches() on a regexec() result returns a list; as.character()
  # flattens the single match into a string.
  df.temp$ID = as.character(id)
  df.temp$seq = as.character(sq)
  # NOTE(review): timestamp() is called with an `n` argument, which
  # utils::timestamp() does not accept - presumably a project helper defined
  # elsewhere that returns one time stamp per row; verify.
  df.temp$time = timestamp(n = dim(df.temp)[1])
  df.temp
})
if (length(sensor.parts) > 0) {
  df.ac = do.call(rbind, sensor.parts)
} else {
  df.ac = data.frame()
}
# Integer copy of the subject id.
df.ac$id = as.integer(df.ac$ID)
# Attach age and gender to every sensor reading.
# Read the age / gender table; it is joined on its "ID" column below.
df.age.gender = read.csv(file = paste0(root, age.gender))
# df.ac may carry the subject id twice (the character merge key `ID` plus an
# integer copy `id`). Only the merge key is kept so that the merged frame has
# the eight columns that are renamed by position below.
sensor.cols = setdiff(names(df.ac), "id")
# Inner join on the subject id: readings of subjects missing from the
# age / gender table are dropped. (merge() has no `rownames` argument, so the
# stray `rownames = F` was removed.)
df.gait = merge(df.ac[, sensor.cols, drop = FALSE], df.age.gender, by = "ID")
#df.gait = df.gait[complete.cases(df.gait), ]
# Clean up the data frame: columns are renamed by position.
# NOTE(review): this assumes the age / gender table contributes exactly two
# non-key columns, gender first and age second - confirm against the file.
names = c("id", "x", "y", "z", "seq", 'time', 'gender', 'age')
colnames(df.gait) = names
# Sort by walk sequence, then subject id, then time.
df.gait = df.gait[order(df.gait$seq, df.gait$id, df.gait$time), ]
# Data frame report
str(df.gait)
# Store the merged data set.
write.csv(df.gait, file = paste0(root, "/initial.gait.sensor.center.csv"), row.names = FALSE)
# Reload the stored data set and summarise its dimensions in a one-row table.
my_height_age = read.csv(paste0(root, "/initial.gait.sensor.center.csv"),
                         row.names = NULL) #[, c("id", "age", "gender")]
# Counts derived from the data; the age range and gender split are fixed text.
n.subjects = length(unique(my_height_age$id))
n.walks = length(unique(my_height_age$seq))
data.set.dimensions = data.frame(
  subjects = n.subjects,
  age = "3-78 years",
  gender = "m = 359, f = 385",
  walks = n.walks,
  rows = nrow(my_height_age),
  col = ncol(my_height_age)
)
To apply the gaitpy module, the height of each subject and a Unix timestamp are required. The missing height is sampled from the height distribution (2018) that is collected yearly by the Japan Sports Agency: https://nbakki.hatenablog.com/entry/Average_Height_of_Japanese_2018
Preprocessing height / age data 2018
# Expand the average-height table into one row per age and gender.
h.jp = read.xlsx(paste0(root, "/average_height_jp.xlsx"), sheet = 1)
jp = data.frame()
for (ref in h.jp$age) {
  # An age label is either a single age or a "from-to" range; a range is
  # expanded into every age it covers.
  age.bounds = as.integer(unlist(strsplit(ref, "-")))
  if (length(age.bounds) > 1) {
    ages = seq(age.bounds[1], age.bounds[2], 1)
  } else {
    ages = as.numeric(age.bounds)
  }
  for (g in c(0, 1)) {
    # Gender 0 reads column 3 of the table, gender 1 reads column 2.
    heights = rep(h.jp[h.jp$age == ref, 3 - g], length(ages))
    block = cbind(ages, heights, rep(g, length(ages)))
    # New rows are put in front of the rows collected so far.
    jp = rbind(block, jp)
  }
}
names(jp) = c("age", "height", "gender")
jp = jp[order(jp$age), ]

# Output plot: average height per age, coloured by gender.
p = ggplot(data = jp, aes(x = age, y = height, color = factor(gender))) +
  geom_point() +
  ggtitle("") +
  scale_color_discrete(labels = c("female", "male")) +
  labs(color = "Average height per age in Japan, 2018") +
  theme(
    legend.position = c(0.6, 0.2),
    legend.background = element_rect(fill = alpha('white', 0.1))
  )
#p
Associating age with height
## Associating height, age and gender
# Reload the merged sensor data.
my_height_age = read.csv( file = paste0(root, "/initial.gait.sensor.center.csv"))#[, c("id", "age", "gender")]
# Attach the reference height to every age / gender group. Ages are visited in
# order of first appearance and gender 0 before gender 1, which fixes the row
# order of df. The groups are collected in a list and bound once instead of
# growing df inside the loop.
height.parts = list()
for (ref in unique(my_height_age$age)) {
  for (g in c(0, 1)) {
    temp = subset(my_height_age, age == ref & gender == g)
    # Skip age / gender combinations without any reading.
    if (nrow(temp) > 0) {
      # Column 2 of jp holds the height.
      # NOTE(review): assumes jp has exactly one row per age and gender;
      # a missing or duplicated reference row is not handled here.
      temp$height = jp[jp$age == ref & jp$gender == g, 2]
      height.parts[[length(height.parts) + 1]] = temp
    }
  }
}
if (length(height.parts) > 0) {
  df = do.call(rbind, height.parts)
} else {
  df = data.frame()
}
df$height = as.integer(df$height)
## Introducing index per id, and sequence
# Every walk (combination of subject id and sequence) gets a running index
# idx. Walks are numbered subject by subject in order of first appearance in
# df, and within a subject by first appearance of the sequence. The rows are
# reordered accordingly with one stable sort instead of one subset + rbind of
# the full data frame per walk.
# NOTE(review): assumes id and seq contain no missing values.
walk.key = paste(df$id, df$seq, sep = "_")
first.of.id = match(df$id, df$id)            # first row of each subject
first.of.walk = match(walk.key, walk.key)    # first row of each walk

# Report subjects that have fewer than two walk sequences.
ids = unique(df$id)
walk.ids = df$id[!duplicated(walk.key)]
seq.count = tabulate(match(walk.ids, ids), nbins = length(ids))
for (i in ids[seq.count < 2]) print(paste("sq < 2, id = ", i))

# order() leaves ties in their original order, so the readings of one walk
# keep their sequence.
walk.order = order(first.of.id, first.of.walk)
mydf = df[walk.order, ]
sorted.key = walk.key[walk.order]
mydf$idx = match(sorted.key, unique(sorted.key))
# Store enhanced set
write.csv(mydf, file = paste0(root, "/gait.sensor.center.csv"), row.names = FALSE)
Data set ready for gaitpy module application
# Reload the enhanced data set, dropping its 2nd and 4th columns by position.
my_height_age = read.csv(file = paste0(root, "/gait.sensor.center.csv"))[, -c(2, 4) ]
# First row of every subject id, so each subject enters the plots once.
subject.rows = my_height_age[!duplicated(my_height_age$id), ]

# Height against age, coloured by gender.
p.age.height.distribution = ggplot(
  data = subject.rows,
  aes(x = age, y = height, color = factor(gender))
) +
  geom_point() +
  ggtitle("") +
  scale_color_discrete(labels = c("female", "male")) +
  labs(color = "Average height per age") +
  guides(fill = guide_legend(title = "")) +
  theme(
    legend.position = c(0.6, 0.4),
    legend.background = element_rect(fill = alpha('white', 0.1))
  )

# Histogram of height per gender (1-unit bins, overlaid, no legend).
p.height.distribution = ggplot(
  data = subject.rows,
  aes(x = height, fill = as.factor(gender))
) +
  geom_histogram(alpha = 0.5, position = 'identity', binwidth = 1) +
  ggtitle("") +
  scale_fill_discrete(labels = "") +
  labs(color = "Height Distribution") +
  guides(fill = guide_legend(title = "")) +
  theme(
    legend.position = "none"
    #legend.background = element_rect(fill=alpha('white', 0.0))
  )

# Histogram of age per gender (1-unit bins, overlaid, no legend).
p.age.distribution = ggplot(
  data = subject.rows,
  aes(x = age, fill = as.factor(gender))
) +
  geom_histogram(alpha = 0.5, position = 'identity', binwidth = 1) +
  ggtitle("") +
  scale_fill_discrete(labels = "") +
  labs(color = "Age Distribution") +
  guides(fill = guide_legend(title = "")) +
  theme(
    legend.position = "none"
    #legend.background = element_rect(fill=alpha('white', 0.0))
  )
Determining for how many walks no bouts could be detected
# All ids: parse subject id and walk sequence from the raw sensor file names.
fls_all = list.files(path = paste0(root, center.data))
raw.parts = strsplit(fls_all, "_")
# Fields 2 and 4 of the "_"-separated name carry id and sequence; every
# non-digit character is stripped before the numeric conversion.
raw.id = vapply(raw.parts, function(p) p[2], character(1))
raw.seq = vapply(raw.parts, function(p) p[4], character(1))
all_ids = data.frame(
  id = as.numeric(gsub("[^[:digit:]]", "", raw.id)),
  seq = as.numeric(gsub("[^[:digit:]]", "", raw.seq))
)
# Walk key: id immediately followed by the sequence.
all_ids$ids = paste0(all_ids$id, all_ids$seq)

# Bout ids: result file names start with id and sequence, separated by "_".
fls_bouts = list.files(path = paste0(root, "/Results"))
bout.parts = strsplit(fls_bouts, "_")
bout_ids = data.frame(
  id = as.numeric(vapply(bout.parts, function(p) p[1], character(1))),
  seq = as.numeric(vapply(bout.parts, function(p) p[2], character(1)))
)
bout_ids$ids = paste0(bout_ids$id, bout_ids$seq)

# Walks that have a raw sensor file but no result file.
id_no_bout = all_ids[!all_ids$ids %in% bout_ids$ids, ]
# One row per walk index.
mydf = my_height_age[!duplicated(my_height_age$idx), ]
# Hard-coded list of walks (subject id, sequence) for which no bout was
# detected; the rows of these walks are selected from mydf.
undetected.walks = data.frame(
  id = c(9859, 223743, 250428, 259559, 370121, 469147, 469147),
  seq = c(1, 1, 1, 1, 1, 1, 0)
)
is.undetected = paste(mydf$id, mydf$seq) %in%
  paste(undetected.walks$id, undetected.walks$seq)
mydf.bout.undetected = mydf[is.undetected, ]
Based on the detected bouts, here I determine how many cycles went undetected within the bouts.
# Classify the gaitpy result files by size: a file of at most 1 byte is
# treated as "no cycles detected", a larger file as "cycles detected".
fls.cycles = list.files(path = paste0(root, "/Results"))
result.size = vapply(
  fls.cycles,
  function(f) file.info(paste0(root, "/Results/", f))$size,
  numeric(1),
  USE.NAMES = FALSE
)
# Result file names are split on "_": field 1 is the subject id, field 2 the
# walk sequence and field 3 the walk index.
cycle.parts = strsplit(fls.cycles, "_")
cycle.id = vapply(cycle.parts, function(p) p[1], character(1))
cycle.seq = vapply(cycle.parts, function(p) p[2], character(1))
cycle.idx = vapply(cycle.parts, function(p) p[3], character(1))

# Files without detected cycles (which() skips files whose size is unknown).
no.cycles = which(result.size <= 1)
File_no_cy_det = fls.cycles[no.cycles]
ID_no_cy_det = cycle.id[no.cycles]
Seq_no_cy_det = cycle.seq[no.cycles]
Idx_no_cy_det = cycle.idx[no.cycles]
df_no_cy_det = data.frame(file = File_no_cy_det,
                          ID = as.integer(ID_no_cy_det),
                          Seq = as.integer(Seq_no_cy_det),
                          Idx = as.integer(Idx_no_cy_det))

# Preparing data frame to plot: one row per walk index.
mydf = my_height_age[!duplicated(my_height_age$idx), ]
mydf.cycled.detection.failed = mydf[mydf$idx %in% df_no_cy_det$Idx, ]

# Walk-level metadata, read once and reduced to one row per walk index.
walk.meta = read.csv(file = paste0(root, "/gait.sensor.center.csv"))[, c("id", "idx", "gender", "age")]
myall = walk.meta[!duplicated(walk.meta$idx), c("idx", "gender", "age")]
# walks per age and gender
ag.sum = myall %>%
  group_by(gender, age) %>%
  summarise(n.all = length(age), .groups = 'drop')
# walks without detected cycles per age and gender
my.ag.not = myall[myall$idx %in% df_no_cy_det$Idx, c("gender", "age")]
ag.not.sum = my.ag.not %>%
  group_by(gender, age) %>%
  summarise(n.not = length(age), .groups = 'drop')

# Merge the undetected counts with the counts of all walks; groups without
# any undetected walk get a count of 0. The assignment targets the column
# directly, so it also works when no group is missing.
df = merge(ag.not.sum, ag.sum, by = c("age", 'gender'), all = TRUE)
df$n.not[is.na(df$n.not)] = 0
# Normalization: undetected walks relative to all walks of the group
# (stored as a fraction between 0 and 1).
df$percent = df$n.not / df$n.all

p.cycles.undetedected = ggplot(data = df, aes(x = as.factor(age), fill = as.factor(gender))) +
  geom_bar(aes(y = percent), stat = "identity", width = 0.9,
           position = position_dodge2(preserve = "total")) +
  ggtitle("") +
  scale_fill_hue(name = "", labels = c("female", "male")) +
  scale_x_discrete("age", expand = c(0, 0)) +
  theme(legend.position = c(0.05, 0.85),
        legend.background = element_rect(fill = alpha('white', 0.1)),
        axis.text.x = element_text(angle = 90, vjust = 0.5, size = 9))

## Files > 1 byte: cycles detected
with.cycles = which(result.size > 1)
File_cy_det = fls.cycles[with.cycles]
ID_cy_det = cycle.id[with.cycles]
Seq_cy_det = cycle.seq[with.cycles]
Idx_cy_det = cycle.idx[with.cycles]
df_cy_det = data.frame(file = File_cy_det,
                       ID = as.integer(ID_cy_det),
                       Seq = as.integer(Seq_cy_det),
                       Idx = as.integer(Idx_cy_det))
mydf.cycles.detected = mydf[mydf$idx %in% df_cy_det$Idx, ]
In total, 1146 gait cycles were detected, and their gait parameters are aggregated into one file.
## Takes the results generated by the gaitpy module and aggregates all of
## them into one data frame for analysis.
# get all file names in the results directory
fls = list.files(path = paste0(root, results))
# Read every result file and tag it with the metadata encoded in its name.
# The pieces are bound once at the end instead of growing the data frame
# inside a loop.
feature.parts = lapply(fls, function(f) {
  pth = paste0(root, results, "/", f)
  # Files of at most 1 byte are skipped.
  if (file.info(pth)$size <= 1) {
    return(NULL)
  }
  df = read.csv(pth)
  # File name fields, split on "_": subject id, walk sequence, walk index.
  meta.data = unlist(strsplit(f, "_"))
  df$id = meta.data[1]
  df$seq = meta.data[2]
  df$idx = meta.data[3]
  df
})
feature.parts = Filter(Negate(is.null), feature.parts)
if (length(feature.parts) > 0) {
  gait.features = do.call(rbind, feature.parts)
} else {
  gait.features = data.frame()
}
# including age and height for each subject
gait.features$idx = as.integer(gait.features$idx)
age.height.gender = my_height_age[!duplicated(my_height_age$idx), c( "idx", "age", "gender", "height")]
df.features = merge(gait.features, age.height.gender, by = "idx")
# Put the walk metadata first, followed by the gait characteristics in their
# original order. Columns are selected by name, so the result no longer
# depends on the result files having a fixed number of columns.
meta.cols = c("id", "seq", "age", "gender", "height", "idx")
osaka.features = df.features[, c(meta.cols, setdiff(names(df.features), meta.cols))]
osaka.features = subset(osaka.features, select = -c(bout_start_time, bout_number, IC, FC))
write.csv(osaka.features, paste0(root, "/gait.characteristics.csv"), row.names = FALSE)
Each subject was encouraged to perform 2 flat ground-level walks. In the following, the output of the module is assessed by comparing the two level walks (seq 0 and 1).
gt.char = read.csv(paste0(root, "/gait.characteristics.csv"))
# Mean of every gait characteristic per walk. The grouping columns come first
# in the result; "drop_last" keeps the grouping that summarise_all() returned.
age.gender = gt.char %>%
  group_by(id, seq, idx, age, gender, height) %>%
  summarise(across(everything(), mean), .groups = "drop_last")
# Subjects that occur more than once, i.e. that have a second walk.
vec.2walks = age.gender[duplicated(age.gender$id), ]$id
df.2walks = age.gender[ age.gender$id %in% vec.2walks, ]
# split data in level walk 1 and level walk 2
lev.walk.0 = subset(df.2walks, seq == 0)
lev.walk.1 = subset(df.2walks, seq == 1)
# The arithmetic below pairs the two walks row by row, so both tables must
# list the same subjects in the same order.
stopifnot(identical(lev.walk.0$id, lev.walk.1$id))
# Spread between the two walks: square root of the summed squared deviations
# from the pair mean, which equals the sample standard deviation of the two
# values. No division by the average is applied.
# NOTE(review): -c(1:7) drops the six grouping columns plus the first
# summarised column - confirm that dropping the 7th column is intended.
av = (lev.walk.0[,-c(1:7)] + lev.walk.1[,-c(1:7)])/2
sdt = sqrt((lev.walk.0[,-c(1:7)] - av)^2 + (lev.walk.1[,-c(1:7)] - av)^2)
# One metadata row per subject (its second occurrence) next to the spread.
df.2walks.sd = cbind(df.2walks[duplicated(df.2walks$id),c(1:6)], sdt)
df.2walks.sd.long = melt(df.2walks.sd,
                         id.vars = c("id", "age", "gender", "seq", "idx", "height"),
                         variable.name="characteristic")
# Split the long table into gait cycles / steps / cadence and everything else.
count.like = c("gait_cycles", "steps", "cadence")
stp.cad.gait = subset(df.2walks.sd.long, characteristic %in% count.like)
no.stp.cad.gait = subset(df.2walks.sd.long, !(characteristic %in% count.like))
| subjects | age | gender | walks | rows | col |
|---|---|---|---|---|---|
| 744 | 3-78 years | m = 359, f = 385 | 2 | 805945 | 8 |
The “id = 469147” is available in the data set but not in the age, gender table. The entire data set shall have, according to the description 744 subjects. I excluded subject 469147. There are 744 subjects, each subject does 2 walks (seq = 0 or 1), that amounts to 1488 unique idx per walk. 805945 records.
Osaka Data Set & Augmentation

| id | y | seq | gender | age |
|---|---|---|---|---|
| 103114 | -0.830 | 0 | 0 | 33 |
| 156237 | -0.684 | 0 | 0 | 33 |
| 200938 | -1.122 | 0 | 0 | 33 |
| 227547 | -0.960 | 0 | 0 | 33 |
| 250220 | -1.028 | 0 | 0 | 33 |
| id | y | seq | gender | age | time | height |
|---|---|---|---|---|---|---|
| 103114 | -0.830 | 0 | 0 | 33 | 2000-01-01 00:00:00.000 | 158 |
| 156237 | -0.684 | 0 | 0 | 33 | 2000-01-01 00:00:00.000 | 158 |
| 200938 | -1.122 | 0 | 0 | 33 | 2000-01-01 00:00:00.000 | 158 |
| 227547 | -0.960 | 0 | 0 | 33 | 2000-01-01 00:00:00.000 | 158 |
| 250220 | -1.028 | 0 | 0 | 33 | 2000-01-01 00:00:00.000 | 158 |
time: The gaitpy module requires a Unix timestamp with a resolution of 10 ms, e.g. 2000-01-01 00:00:00.000, 2000-01-01 00:00:00.010, 2000-01-01 00:00:00.020, … The timestamp is used to synchronize and normalize the data sampling rate within the module.
height: The module requires also the height of each subject. The Osaka data set doesn’t provide the height. Therefore I sampled from the average age-height distribution, issued by Japan’s sport agency in 2018. The graphics below show the age-height distribution generated.
Figure 1: Age Height Distribution
Determining:
Bout detection rate
Cycle detection rate
Bias towards age, or gender in regard to bouts and cycle detection
Gait characteristics
. Reproducing results from (1)
. Something else
The gaitpy module entails 2 pre-trained algorithms to identify bouts and cycles. For the 744 subjects with 2 walks each, the module identified the following numbers of bouts and cycles. The tables below show how many bouts and cycles were detected. The graphic below shows the age and gender distribution of the undetected cycles. A limitation of the database is that all walk sequences of a subject are relatively short; they were captured within a session of about 1 min. Level and slope walk paths were about 9 m and 3 m long, respectively.
| characteristic | walks | detected | undetected | percent |
|---|---|---|---|---|
| bouts | 1490 | 1483 | 7 | 99.53 |
| bouts with cycles | 1483 | 1146 | 337 | 77.28 |
Figure 2: Distribution of undetected gait cycles
Conclusion
The normalized age distribution shows undetected gait cycles in percent for each age. The average of
Gait variability, or fluctuations in gait characteristics from one step to the next, is evident even when task demands and environmental conditions are constant. The generalized motor program theory (GMPT) states that motor programs are embedded in the central nervous system for the purpose of rapid and skilled movements. A healthy person performing well-learned tasks is able to apply the motor program efficiently and accurately to fulfill a task. Any variability is considered an error in the selection of stored motor programs for gait. Gait characteristics such as step length or stride duration vary across multiple cycles in two distinct yet complementary ways: variability of gait, expressed by linear, predictable fluctuations described by the statistical parameters sd, var and range. Variability depends on age, pathology, and task and environmental demands. The amount of variability is very low for adults, and high for children in the learning phase.
Variability of stride duration asymmetry | age groups 9-16, 17-65 and 66-78
The variability of the gait characteristic can be explained by age. Very young subject are still learning to walk (here in a new setup, being a part of an experiment). The stride duration asymmetry is
Title: The test–retest reliability and minimal detectable change of spatial and temporal gait variability during usual over-ground walking for younger and older adults, Maha Almarwani, Subashan Perera, Jessie M. VanSwearingen, Patrick J. Sparto, Jennifer S. Brach, 2015
Assessing test–retest reliability and determine minimal detectable change (MDC) of spatial and temporal gait variability in younger and older adults. 40 younger (mean age = 26.6 +- 6.0 years) and 46 older adults (mean age = 78.1 +- 6.2 years) were included in the study.
| id | y | seq | time | gender | age | height | idx |
|---|---|---|---|---|---|---|---|
| 103114 | -0.830 | 0 | 2000-01-01 00:00:00.000 | 0 | 33 | 158 | 1 |
| 103114 | -0.730 | 1 | 2000-01-01 00:00:00.000 | 0 | 33 | 158 | 2 |
| 156237 | -0.684 | 0 | 2000-01-01 00:00:00.000 | 0 | 33 | 158 | 3 |
| 156237 | -0.846 | 1 | 2000-01-01 00:00:00.000 | 0 | 33 | 158 | 4 |
| 200938 | -1.122 | 0 | 2000-01-01 00:00:00.000 | 0 | 33 | 158 | 5 |
| 200938 | -0.702 | 1 | 2000-01-01 00:00:00.000 | 0 | 33 | 158 | 6 |
| 227547 | -0.960 | 0 | 2000-01-01 00:00:00.000 | 0 | 33 | 158 | 7 |
| 227547 | -0.644 | 1 | 2000-01-01 00:00:00.000 | 0 | 33 | 158 | 8 |
| 250220 | -1.028 | 0 | 2000-01-01 00:00:00.000 | 0 | 33 | 158 | 9 |
| 250220 | -1.220 | 1 | 2000-01-01 00:00:00.000 | 0 | 33 | 158 | 10 |
| Variable | Class | Values |
|---|---|---|
| id | character | 10002, 101212, 101628 |
| seq | character | 0, 1, NA |
| age | integer | 2, 3, 4 |
| gender | integer | 0, 1, NA |
| height | integer | 87, 94, 95 |
| idx | integer | 1, 3, 4 |
| bout_length_sec | double | 2.86, 2.96, 3 |
| gait_cycles | integer | 1, 2, 3 |
| steps | integer | 6, 7, 8 |
| stride_duration | double | 0.7, 0.72, 0.74 |
| stride_duration_asymmetry | double | 0, 0.02, 0.04 |
| step_duration | double | 0.3, 0.34, 0.36 |
| step_duration_asymmetry | double | 0, 0.02, 0.04 |
| cadence | double | 41.67, 42.86, 56.6 |
| initial_double_support | double | 0.04, 0.06, 0.08 |
| initial_double_support_asymmetry | double | 0, 0.02, 0.04 |
| terminal_double_support | double | -0.96, -0.44, -0.26 |
| terminal_double_support_asymmetry | double | 0, 0.02, 0.04 |
| double_support | double | -0.86, -0.24, -0.16 |
| double_support_asymmetry | double | 0, 0.02, 0.04 |
| single_limb_support | double | -0.42, -0.12, -0.1 |
| single_limb_support_asymmetry | double | 0, 0.02, 0.04 |
| stance | double | 0.46, 0.48, 0.5 |
| stance_asymmetry | double | 0, 0.02, 0.04 |
| swing | double | -0.12, -0.1, -0.08 |
| swing_asymmetry | double | 0, 0.02, 0.04 |
| step_length | double | 0.11, 0.21, 0.25 |
| step_length_asymmetry | double | 0, 0.01, 0.02 |
| stride_length | double | 0.42, 0.46, 0.47 |
| stride_length_asymmetry | double | 0, 0.01, 0.02 |
| gait_speed | double | 0.26, 0.28, 0.29 |
The histogram shows no bout lengths between 6 and 7.5 sec, or between 3 and 4.5 sec. Both gaps span 1.5 seconds. The distribution looks weird. Is there an explanation, or is further investigation needed?
Gait Cycles, Steps, Cadence
Other gait characterstics
Remarks:
Filtering outliers: I filtered on age > 12 and stride duration asymmetry < 0.26 (I still need to figure out which subject that is …)