knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE,
fig.path='figures/', fig.ext='jpeg', fig.width=6,
fig.height=4, dpi=300)
# set working directory
setwd("~/Desktop")
# libraries
library(readxl)
library(tidyverse)
library(magrittr)
library(ggplot2)
library(dplyr)
library(caret)
data <- read_excel("ScapClavFull.xlsx")
data %<>% select(-Notes) %>% na.omit()
data$AgeAtDeath <- as.numeric(data$AgeAtDeath)
data$Sex <- as.factor(data$Sex)
data$Race <- as.factor(data$Race)
data$HispanicOrigin <- as.factor(data$HispanicOrigin)
data$`Side Used` <- as.factor(data$`Side Used`)
data %<>% mutate(across(7:20, as.numeric))
data_m <- data %>% filter(Sex == 'Male') %>% select(-AgeAtDeath)
vars_to_test_m <- names(data_m)[sapply(data_m, is.numeric)]
shapiro_results_m <- list()
summary_table_m <- data.frame(Variable = character(),
W_Statistic = numeric(),
P_Value = numeric(),
stringsAsFactors = FALSE)
for (var in vars_to_test_m) {
test <- shapiro.test(data_m[[var]])
summary_table_m <- rbind(summary_table_m, data.frame(
Variable = var,
W_Statistic = test$statistic,
P_Value = test$p.value
))
}
print(summary_table_m)
## Variable W_Statistic P_Value
## W GH 0.9683449 0.0811894066
## W1 GW 0.9712547 0.1174624870
## W2 SPL 0.9438377 0.0041225377
## W3 SH 0.9809394 0.3836900545
## W4 SW 0.9559658 0.0172704186
## W5 ISH 0.9642835 0.0485391539
## W6 SSH 0.9851546 0.5974740848
## W7 AW 0.9848588 0.5808133544
## W8 CL 0.9750312 0.1888916382
## W9 MSC 0.9472704 0.0061235832
## W10 MSMAX 0.9301946 0.0009228846
## W11 MSMIN 0.9319212 0.0011081157
## W12 STERN 0.9767232 0.2328035740
## W13 ACROM 0.9707890 0.1107281645
data_f <- data %>% filter(Sex == 'Female') %>% select(-AgeAtDeath)
vars_to_test_f <- names(data_f)[sapply(data_f, is.numeric)]
shapiro_results_f <- list()
summary_table_f <- data.frame(Variable = character(),
W_Statistic = numeric(),
P_Value = numeric(),
stringsAsFactors = FALSE)
for (var in vars_to_test_f) {
test <- shapiro.test(data_f[[var]])
summary_table_f <- rbind(summary_table_f, data.frame(
Variable = var,
W_Statistic = test$statistic,
P_Value = test$p.value
))
}
print(summary_table_f)
## Variable W_Statistic P_Value
## W GH 0.9468462 3.109518e-03
## W1 GW 0.9547980 8.610363e-03
## W2 SPL 0.9905390 8.496573e-01
## W3 SH 0.9757310 1.531178e-01
## W4 SW 0.9854887 5.399955e-01
## W5 ISH 0.9707688 7.641485e-02
## W6 SSH 0.9768788 1.794921e-01
## W7 AW 0.9913695 8.917136e-01
## W8 CL 0.9720303 9.122867e-02
## W9 MSC 0.9283038 3.497245e-04
## W10 MSMAX 0.8644230 8.689984e-07
## W11 MSMIN 0.8856263 5.164714e-06
## W12 STERN 0.9774936 1.953301e-01
## W13 ACROM 0.9811448 3.185746e-01
# Normal for some variables - outliers to remove
up1m <- mvoutlier::uni.plot(as.data.frame(data_m[6:13]), symb=TRUE)
data1 <- data_m[!up1m$outliers, ]
up2m <- mvoutlier::uni.plot(as.data.frame(data1[14:19]), symb=TRUE)
data2 <- data1[!up2m$outliers, ]
up1f <- mvoutlier::uni.plot(as.data.frame(data_f[6:13]), symb=TRUE)
data3 <- data_f[!up1f$outliers, ]
up2f <- mvoutlier::uni.plot(as.data.frame(data3[14:19]), symb=TRUE)
data4 <- data3[!up2f$outliers, ]
data_down <- rbind(data2, data4)
ggplot(data_down, aes(sample = MSMAX)) + stat_qq() + stat_qq_line(color = "red") + facet_wrap(~Sex) + theme_minimal()
ggplot(data, aes(sample = MSMAX)) + stat_qq() + stat_qq_line(color = "red") + facet_wrap(~Sex) + theme_minimal()
The problem is is that for some of the outliers it is claiming, there are some variables that have a lot of repeat values and that is seeming to cause issues like with MSMAX. It is identifying a lot of outliers for this variable when it is just natural variation. Therefore, I won’t be removing outliers for this sample. Additionally, after plotting some graphs that I did not include (trust me bro), a lot of the breaking from normality is on the upper/lower ends, which I would say is fairly normal for variation. Based on the two plots above that show the spread for MSMAX without outliers and with outliers, there are A LOT of females that are scoring higher maximum midshaft clavicular diameters that are being excluded from the sample.
set.seed(666)
data_sub <- downSample(data[-c(1:6)], factor(data$Sex)) %>% rename(Sex = Class)
data_sub %>% group_by(Sex) %>% summary(n = n())
## GH GW SPL SH
## Min. :30.00 Min. :22.00 Min. :111.0 Min. :125.0
## 1st Qu.:34.00 1st Qu.:26.00 1st Qu.:128.0 1st Qu.:143.0
## Median :36.00 Median :28.00 Median :135.5 Median :153.0
## Mean :36.65 Mean :27.96 Mean :136.2 Mean :152.5
## 3rd Qu.:39.00 3rd Qu.:30.00 3rd Qu.:143.2 3rd Qu.:161.0
## Max. :45.00 Max. :35.00 Max. :158.0 Max. :180.0
## SW ISH SSH AW
## Min. : 83.0 Min. : 87.0 Min. :36.00 Min. :32.00
## 1st Qu.: 97.0 1st Qu.:107.5 1st Qu.:50.00 1st Qu.:41.75
## Median :102.0 Median :113.0 Median :54.50 Median :45.00
## Mean :103.1 Mean :112.4 Mean :54.43 Mean :45.46
## 3rd Qu.:108.2 3rd Qu.:117.0 3rd Qu.:59.00 3rd Qu.:49.00
## Max. :123.0 Max. :138.0 Max. :73.00 Max. :60.00
## CL MSC MSMAX MSMIN
## Min. :128.0 Min. :29.00 Min. :10.00 Min. : 7.000
## 1st Qu.:143.0 1st Qu.:34.00 1st Qu.:12.00 1st Qu.: 9.000
## Median :149.0 Median :36.00 Median :13.00 Median :10.000
## Mean :150.6 Mean :36.93 Mean :13.17 Mean : 9.993
## 3rd Qu.:157.2 3rd Qu.:40.00 3rd Qu.:14.00 3rd Qu.:11.000
## Max. :179.0 Max. :49.00 Max. :19.00 Max. :13.000
## STERN ACROM Sex
## Min. :19.00 Min. :14.00 Female:68
## 1st Qu.:25.00 1st Qu.:22.75 Male :68
## Median :28.00 Median :25.50
## Mean :27.63 Mean :25.53
## 3rd Qu.:30.00 3rd Qu.:29.00
## Max. :39.00 Max. :34.00
Data selection - 25-60 (fusion-extreme lipping), not selecting for Pop Affinity, L and R measurements taken unless unable, did not measure if there was pathology or trauma
Used a 90/10 split of the original dataset of 139 males and 139 females (124/15).
cols_to_test <- names(data_sub)[1:14]
p_values <- numeric(length(cols_to_test))
for (i in seq_along(cols_to_test)) {
var_name <- cols_to_test[i]
test_result <- t.test(data_sub[[var_name]] ~ data_sub$Sex)
p_values[i] <- test_result$p.value
}
print(p_values) # It would be much more helpful to see which p-value belongs to which variable but they are less than 0.05, which means significant difference between sexes!
## [1] 4.384582e-20 5.485770e-25 7.980352e-20 3.146321e-23 7.106555e-17
## [6] 5.542960e-14 5.069001e-16 2.670584e-15 1.828695e-12 7.773521e-24
## [11] 1.587719e-19 8.987695e-16 1.076125e-07 1.321741e-09
data_sub %<>% mutate(DF1 = 0.046*GH + 0.029*GW + 0.111*SPL + 0.296*SH - 0.064*SW - 0.025*ISH + 0.135*SSH + 0.058*AW + 0.014*CL + 0.024*MSC - 0.006*MSMAX - 0.005*MSMIN + 0.018*STERN + 0.042*ACROM - 22.437)
# Something is wrong with this one... I think it involves the coefficient for SH but can't confirm because I am not the authors
data_sub %<>% mutate(DF2 = 0.166*GH + 0.029*SH + 0.045*AW + 0.052*CL + 0.121*MSC - 22.836)
data_sub %<>% mutate(DF3 = 0.104*GH + 0.121*GW + 0.042*SPL + 0.029*SH + 0.000*SW + 0.025*ISH + 0.022*SSH + 0.036*AW - 20.900)
data_sub %<>% mutate(DF4 = 0.112*GH + 0.134*GW + 0.049*SPL + 0.056*SH - 21.096)
data_sub %<>% mutate(DF5 = 0.074*CL + 0.003*MSC + 0.279*MSMAX + 0.416*MSMIN - 0.011*STERN + 0.027*ACROM - 18.153)
data_sub %<>% mutate(DF6 = 0.076*CL + 0.290*MSMAX + 0.434*MSMIN - 18.223)
data_sub$DF1_Pred <- ifelse(data_sub$DF1 >= 0, "Male", "Female")
data_sub$DF2_Pred <- ifelse(data_sub$DF2 >= 0, "Male", "Female")
data_sub$DF3_Pred <- ifelse(data_sub$DF3 >= 0, "Male", "Female")
data_sub$DF4_Pred <- ifelse(data_sub$DF4 >= 0, "Male", "Female")
data_sub$DF5_Pred <- ifelse(data_sub$DF5 >= 0, "Male", "Female")
data_sub$DF6_Pred <- ifelse(data_sub$DF6 >= 0, "Male", "Female")
ggplot(data_sub, aes(x = DF1, fill = Sex)) +
geom_density(alpha = 0.5) +
labs(y = "Density", x = "Discriminant Function 1") +
theme_minimal() + geom_vline(xintercept=0, linetype="dashed") + scale_fill_manual(values = c("#bf987c","#de6612"))
ggplot(data_sub, aes(x = DF2, fill = Sex)) +
geom_density(alpha = 0.5) +
labs(y = "Density", x = "Discriminant Function 2") +
theme_minimal() + geom_vline(xintercept=0, linetype="dashed") + scale_fill_manual(values = c("#c9be97","#e6ba1c"))
ggplot(data_sub, aes(x = DF3, fill = Sex)) +
geom_density(alpha = 0.5) +
labs(y = "Density", x = "Discriminant Function 3") +
theme_minimal() + geom_vline(xintercept=0, linetype="dashed") + scale_fill_manual(values = c("#958ebd","#3e23d9"))
ggplot(data_sub, aes(x = DF4, fill = Sex)) +
geom_density(alpha = 0.5) +
labs(y = "Density", x = "Discriminant Function 4") +
theme_minimal() + geom_vline(xintercept=0, linetype="dashed") + scale_fill_manual(values = c("#9780ad","#7030b0"))
ggplot(data_sub, aes(x = DF5, fill = Sex)) +
geom_density(alpha = 0.5) +
labs(y = "Density", x = "Discriminant Function 5") +
theme_minimal() + geom_vline(xintercept=0, linetype="dashed") + scale_fill_manual(values = c("#ba93ba","#b82ab8"))
ggplot(data_sub, aes(x = DF6, fill = Sex)) +
geom_density(alpha = 0.5) +
labs(y = "Density", x = "Discriminant Function 6") +
theme_minimal() + geom_vline(xintercept=0, linetype="dashed") + scale_fill_manual(values = c("#bf889c","#d9306e"))
| Method | Overall Accuracy Rate | Female Accuracy Rate | Male Accuracy Rate |
|---|---|---|---|
| All Variables | 50% | 0% | 100% |
| Stepwise All Variables | 65.44% | 32.35% | 98.53% |
| All Scapular Variables | 61.03% | 22.06% | 100% |
| Stepwise Scapular Variables | 63.24% | 26.47% | 100% |
| All Clavicular Variables | 73.53% | 50.00% | 97.06% |
| Stepwise Clavicular Variables | 72.06% | 47.06% | 97.06% |
confusionMatrix(factor(data_sub$DF1_Pred), factor(data_sub$Sex))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Female Male
## Female 0 0
## Male 68 68
##
## Accuracy : 0.5
## 95% CI : (0.4131, 0.5869)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 0.5341
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 4.476e-16
##
## Sensitivity : 0.0
## Specificity : 1.0
## Pos Pred Value : NaN
## Neg Pred Value : 0.5
## Prevalence : 0.5
## Detection Rate : 0.0
## Detection Prevalence : 0.0
## Balanced Accuracy : 0.5
##
## 'Positive' Class : Female
##
confusionMatrix(factor(data_sub$DF2_Pred), factor(data_sub$Sex))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Female Male
## Female 22 1
## Male 46 67
##
## Accuracy : 0.6544
## 95% CI : (0.5681, 0.7338)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 0.0001993
##
## Kappa : 0.3088
##
## Mcnemar's Test P-Value : 1.38e-10
##
## Sensitivity : 0.3235
## Specificity : 0.9853
## Pos Pred Value : 0.9565
## Neg Pred Value : 0.5929
## Prevalence : 0.5000
## Detection Rate : 0.1618
## Detection Prevalence : 0.1691
## Balanced Accuracy : 0.6544
##
## 'Positive' Class : Female
##
confusionMatrix(factor(data_sub$DF3_Pred), factor(data_sub$Sex))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Female Male
## Female 15 0
## Male 53 68
##
## Accuracy : 0.6103
## 95% CI : (0.523, 0.6927)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 0.006302
##
## Kappa : 0.2206
##
## Mcnemar's Test P-Value : 9.148e-13
##
## Sensitivity : 0.2206
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.5620
## Prevalence : 0.5000
## Detection Rate : 0.1103
## Detection Prevalence : 0.1103
## Balanced Accuracy : 0.6103
##
## 'Positive' Class : Female
##
confusionMatrix(factor(data_sub$DF4_Pred), factor(data_sub$Sex))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Female Male
## Female 18 0
## Male 50 68
##
## Accuracy : 0.6324
## 95% CI : (0.5455, 0.7133)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 0.001279
##
## Kappa : 0.2647
##
## Mcnemar's Test P-Value : 4.219e-12
##
## Sensitivity : 0.2647
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.5763
## Prevalence : 0.5000
## Detection Rate : 0.1324
## Detection Prevalence : 0.1324
## Balanced Accuracy : 0.6324
##
## 'Positive' Class : Female
##
confusionMatrix(factor(data_sub$DF5_Pred), factor(data_sub$Sex))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Female Male
## Female 34 2
## Male 34 66
##
## Accuracy : 0.7353
## 95% CI : (0.6528, 0.8072)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 1.860e-08
##
## Kappa : 0.4706
##
## Mcnemar's Test P-Value : 2.383e-07
##
## Sensitivity : 0.5000
## Specificity : 0.9706
## Pos Pred Value : 0.9444
## Neg Pred Value : 0.6600
## Prevalence : 0.5000
## Detection Rate : 0.2500
## Detection Prevalence : 0.2647
## Balanced Accuracy : 0.7353
##
## 'Positive' Class : Female
##
confusionMatrix(factor(data_sub$DF6_Pred), factor(data_sub$Sex))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Female Male
## Female 32 2
## Male 36 66
##
## Accuracy : 0.7206
## 95% CI : (0.6372, 0.7941)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 1.365e-07
##
## Kappa : 0.4412
##
## Mcnemar's Test P-Value : 8.636e-08
##
## Sensitivity : 0.4706
## Specificity : 0.9706
## Pos Pred Value : 0.9412
## Neg Pred Value : 0.6471
## Prevalence : 0.5000
## Detection Rate : 0.2353
## Detection Prevalence : 0.2500
## Balanced Accuracy : 0.7206
##
## 'Positive' Class : Female
##
idx <- createDataPartition(data_sub$Sex, p=0.8, list=F)
train <- data_sub[idx, -(16:27)]
test <- data_sub[-idx, -(16:27)]
log2 <- glm(Sex ~ ., train, family="binomial")
df2 <- MASS::stepAIC(log2, direction="both")
## Start: AIC=55
## Sex ~ GH + GW + SPL + SH + SW + ISH + SSH + AW + CL + MSC + MSMAX +
## MSMIN + STERN + ACROM
##
## Df Deviance AIC
## - ACROM 1 24.997 52.997
## - MSMIN 1 25.008 53.008
## - AW 1 25.356 53.356
## - MSMAX 1 25.689 53.689
## - SSH 1 25.764 53.764
## - GH 1 26.787 54.787
## - MSC 1 26.789 54.789
## - CL 1 26.897 54.897
## <none> 24.997 54.997
## - SW 1 27.119 55.119
## - GW 1 27.404 55.404
## - SPL 1 29.200 57.200
## - ISH 1 29.822 57.822
## - STERN 1 31.678 59.678
## - SH 1 33.885 61.885
##
## Step: AIC=53
## Sex ~ GH + GW + SPL + SH + SW + ISH + SSH + AW + CL + MSC + MSMAX +
## MSMIN + STERN
##
## Df Deviance AIC
## - MSMIN 1 25.009 51.009
## - AW 1 25.377 51.377
## - MSMAX 1 25.733 51.733
## - SSH 1 25.765 51.765
## - GH 1 26.811 52.811
## - CL 1 26.944 52.944
## <none> 24.997 52.997
## - MSC 1 27.078 53.078
## - SW 1 27.196 53.196
## - GW 1 27.430 53.430
## + ACROM 1 24.997 54.997
## - SPL 1 29.607 55.607
## - ISH 1 30.099 56.099
## - STERN 1 32.495 58.495
## - SH 1 33.951 59.951
##
## Step: AIC=51.01
## Sex ~ GH + GW + SPL + SH + SW + ISH + SSH + AW + CL + MSC + MSMAX +
## STERN
##
## Df Deviance AIC
## - AW 1 25.459 49.459
## - MSMAX 1 25.736 49.736
## - SSH 1 25.828 49.828
## - GH 1 26.847 50.847
## - CL 1 26.969 50.969
## <none> 25.009 51.009
## - SW 1 27.394 51.394
## - GW 1 27.461 51.461
## - MSC 1 28.291 52.291
## + MSMIN 1 24.997 52.997
## + ACROM 1 25.008 53.008
## - SPL 1 29.896 53.896
## - ISH 1 31.121 55.121
## - STERN 1 32.518 56.518
## - SH 1 34.229 58.229
##
## Step: AIC=49.46
## Sex ~ GH + GW + SPL + SH + SW + ISH + SSH + CL + MSC + MSMAX +
## STERN
##
## Df Deviance AIC
## - SSH 1 25.910 47.910
## - MSMAX 1 25.950 47.950
## - GH 1 27.077 49.077
## - CL 1 27.337 49.337
## <none> 25.459 49.459
## - SW 1 27.470 49.470
## - GW 1 27.592 49.592
## + AW 1 25.009 51.009
## - MSC 1 29.259 51.259
## + MSMIN 1 25.377 51.377
## + ACROM 1 25.455 51.455
## - SPL 1 29.927 51.927
## - ISH 1 31.868 53.868
## - STERN 1 33.228 55.228
## - SH 1 34.428 56.428
##
## Step: AIC=47.91
## Sex ~ GH + GW + SPL + SH + SW + ISH + CL + MSC + MSMAX + STERN
##
## Df Deviance AIC
## - MSMAX 1 26.127 46.127
## - CL 1 27.347 47.347
## - GH 1 27.405 47.405
## - SW 1 27.606 47.606
## - GW 1 27.669 47.669
## <none> 25.910 47.910
## + SSH 1 25.459 49.459
## + MSMIN 1 25.820 49.820
## + AW 1 25.828 49.828
## + ACROM 1 25.910 49.910
## - MSC 1 30.018 50.018
## - SPL 1 30.065 50.065
## - STERN 1 33.234 53.234
## - ISH 1 33.552 53.552
## - SH 1 38.263 58.263
##
## Step: AIC=46.13
## Sex ~ GH + GW + SPL + SH + SW + ISH + CL + MSC + STERN
##
## Df Deviance AIC
## - CL 1 27.370 45.370
## - GH 1 27.422 45.422
## - GW 1 27.670 45.670
## <none> 26.127 46.127
## - SW 1 28.138 46.138
## + MSMAX 1 25.910 47.910
## + SSH 1 25.950 47.950
## + AW 1 26.047 48.047
## + MSMIN 1 26.081 48.081
## + ACROM 1 26.121 48.121
## - SPL 1 30.239 48.239
## - ISH 1 33.554 51.554
## - STERN 1 34.745 52.745
## - SH 1 38.569 56.569
## - MSC 1 49.326 67.326
##
## Step: AIC=45.37
## Sex ~ GH + GW + SPL + SH + SW + ISH + MSC + STERN
##
## Df Deviance AIC
## - GH 1 28.550 44.550
## <none> 27.370 45.370
## - GW 1 29.436 45.436
## - SW 1 29.801 45.801
## + CL 1 26.127 46.127
## - SPL 1 30.561 46.561
## + AW 1 27.157 47.157
## + ACROM 1 27.301 47.301
## + MSMAX 1 27.347 47.347
## + SSH 1 27.368 47.368
## + MSMIN 1 27.370 47.370
## - ISH 1 34.064 50.064
## - STERN 1 34.762 50.762
## - SH 1 38.688 54.688
## - MSC 1 49.475 65.475
##
## Step: AIC=44.55
## Sex ~ GW + SPL + SH + SW + ISH + MSC + STERN
##
## Df Deviance AIC
## - GW 1 29.763 43.763
## <none> 28.550 44.550
## - SPL 1 30.808 44.808
## - SW 1 31.047 45.047
## + GH 1 27.370 45.370
## + CL 1 27.422 45.422
## + ACROM 1 28.266 46.266
## + AW 1 28.391 46.391
## + MSMAX 1 28.451 46.451
## + SSH 1 28.507 46.507
## + MSMIN 1 28.548 46.548
## - STERN 1 35.102 49.102
## - ISH 1 35.395 49.395
## - SH 1 39.239 53.239
## - MSC 1 49.475 63.475
##
## Step: AIC=43.76
## Sex ~ SPL + SH + SW + ISH + MSC + STERN
##
## Df Deviance AIC
## <none> 29.763 43.763
## - SW 1 31.911 43.911
## + CL 1 28.187 44.187
## - SPL 1 32.536 44.536
## + GW 1 28.550 44.550
## + GH 1 29.436 45.436
## + MSMAX 1 29.562 45.562
## + ACROM 1 29.687 45.687
## + AW 1 29.724 45.724
## + SSH 1 29.755 45.755
## + MSMIN 1 29.763 45.763
## - STERN 1 35.777 47.777
## - ISH 1 38.042 50.042
## - SH 1 42.553 54.553
## - MSC 1 63.665 75.665
summary(df2)
##
## Call:
## glm(formula = Sex ~ SPL + SH + SW + ISH + MSC + STERN, family = "binomial",
## data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -70.1147 17.0416 -4.114 3.88e-05 ***
## SPL 0.3382 0.2152 1.572 0.11601
## SH 0.4791 0.1749 2.739 0.00616 **
## SW -0.3500 0.2462 -1.422 0.15515
## ISH -0.3272 0.1375 -2.379 0.01737 *
## MSC 1.0934 0.3330 3.284 0.00103 **
## STERN -0.6217 0.2901 -2.143 0.03211 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 152.492 on 109 degrees of freedom
## Residual deviance: 29.763 on 103 degrees of freedom
## AIC: 43.763
##
## Number of Fisher Scoring iterations: 8
log4 <- glm(Sex ~ GH+GW+SPL+SH+SW+ISH+SSH+AW, train, family="binomial")
df4 <- MASS::stepAIC(log4, direction="both")
## Start: AIC=68.24
## Sex ~ GH + GW + SPL + SH + SW + ISH + SSH + AW
##
## Df Deviance AIC
## - AW 1 50.246 66.246
## - SW 1 50.250 66.250
## - ISH 1 50.302 66.302
## - GH 1 50.319 66.319
## - SPL 1 50.347 66.347
## - SH 1 51.110 67.110
## - SSH 1 52.012 68.012
## <none> 50.243 68.243
## - GW 1 59.907 75.907
##
## Step: AIC=66.25
## Sex ~ GH + GW + SPL + SH + SW + ISH + SSH
##
## Df Deviance AIC
## - SW 1 50.254 64.254
## - ISH 1 50.304 64.304
## - GH 1 50.319 64.319
## - SPL 1 50.361 64.361
## - SH 1 51.133 65.133
## - SSH 1 52.024 66.024
## <none> 50.246 66.246
## + AW 1 50.243 68.243
## - GW 1 60.936 74.936
##
## Step: AIC=64.25
## Sex ~ GH + GW + SPL + SH + ISH + SSH
##
## Df Deviance AIC
## - ISH 1 50.316 62.316
## - GH 1 50.347 62.347
## - SPL 1 50.468 62.468
## - SH 1 51.157 63.157
## - SSH 1 52.025 64.025
## <none> 50.254 64.254
## + SW 1 50.246 66.246
## + AW 1 50.250 66.250
## - GW 1 61.145 73.145
##
## Step: AIC=62.32
## Sex ~ GH + GW + SPL + SH + SSH
##
## Df Deviance AIC
## - GH 1 50.431 60.431
## - SPL 1 50.667 60.667
## <none> 50.316 62.316
## - SSH 1 52.799 62.799
## - SH 1 52.987 62.987
## + ISH 1 50.254 64.254
## + SW 1 50.304 64.304
## + AW 1 50.312 64.312
## - GW 1 61.504 71.504
##
## Step: AIC=60.43
## Sex ~ GW + SPL + SH + SSH
##
## Df Deviance AIC
## - SPL 1 50.671 58.671
## <none> 50.431 60.431
## - SSH 1 52.849 60.849
## - SH 1 52.991 60.991
## + GH 1 50.316 62.316
## + ISH 1 50.347 62.347
## + SW 1 50.395 62.395
## + AW 1 50.430 62.430
## - GW 1 64.094 72.094
##
## Step: AIC=58.67
## Sex ~ GW + SH + SSH
##
## Df Deviance AIC
## <none> 50.671 58.671
## - SH 1 53.489 59.489
## - SSH 1 53.723 59.723
## + SPL 1 50.431 60.431
## + ISH 1 50.476 60.476
## + SW 1 50.556 60.556
## + AW 1 50.653 60.653
## + GH 1 50.667 60.667
## - GW 1 73.282 79.282
summary(df4)
##
## Call:
## glm(formula = Sex ~ GW + SH + SSH, family = "binomial", data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -46.83546 8.95010 -5.233 1.67e-07 ***
## GW 0.84765 0.23434 3.617 0.000298 ***
## SH 0.08804 0.05634 1.563 0.118110
## SSH 0.17537 0.10620 1.651 0.098678 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 152.492 on 109 degrees of freedom
## Residual deviance: 50.671 on 106 degrees of freedom
## AIC: 58.671
##
## Number of Fisher Scoring iterations: 6
log6 <- glm(Sex ~ CL+MSC+MSMAX+MSMIN+STERN+ACROM, train, family="binomial")
df6 <- MASS::stepAIC(log6, direction="both")
## Start: AIC=69.91
## Sex ~ CL + MSC + MSMAX + MSMIN + STERN + ACROM
##
## Df Deviance AIC
## - MSMIN 1 55.952 67.952
## - ACROM 1 56.109 68.109
## - MSMAX 1 56.378 68.378
## - STERN 1 56.866 68.866
## <none> 55.909 69.909
## - MSC 1 62.949 74.949
## - CL 1 67.760 79.760
##
## Step: AIC=67.95
## Sex ~ CL + MSC + MSMAX + STERN + ACROM
##
## Df Deviance AIC
## - ACROM 1 56.227 66.227
## - MSMAX 1 56.544 66.544
## - STERN 1 56.892 66.892
## <none> 55.952 67.952
## + MSMIN 1 55.909 69.909
## - CL 1 67.775 77.775
## - MSC 1 68.015 78.015
##
## Step: AIC=66.23
## Sex ~ CL + MSC + MSMAX + STERN
##
## Df Deviance AIC
## - MSMAX 1 56.648 64.648
## - STERN 1 56.964 64.964
## <none> 56.227 66.227
## + ACROM 1 55.952 67.952
## + MSMIN 1 56.109 68.109
## - MSC 1 68.290 76.290
## - CL 1 68.508 76.508
##
## Step: AIC=64.65
## Sex ~ CL + MSC + STERN
##
## Df Deviance AIC
## - STERN 1 57.130 63.130
## <none> 56.648 64.648
## + MSMAX 1 56.227 66.227
## + MSMIN 1 56.434 66.434
## + ACROM 1 56.544 66.544
## - CL 1 68.510 74.510
## - MSC 1 104.375 110.375
##
## Step: AIC=63.13
## Sex ~ CL + MSC
##
## Df Deviance AIC
## <none> 57.130 63.130
## + STERN 1 56.648 64.648
## + MSMAX 1 56.964 64.964
## + MSMIN 1 57.019 65.019
## + ACROM 1 57.095 65.095
## - CL 1 68.544 72.544
## - MSC 1 109.090 113.090
summary(df6)
##
## Call:
## glm(formula = Sex ~ CL + MSC, family = "binomial", data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -39.93459 7.35790 -5.427 5.72e-08 ***
## CL 0.10461 0.03422 3.057 0.00223 **
## MSC 0.65374 0.13645 4.791 1.66e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 152.49 on 109 degrees of freedom
## Residual deviance: 57.13 on 107 degrees of freedom
## AIC: 63.13
##
## Number of Fisher Scoring iterations: 6
Table is using the Testing Set with the Models Created Using the Training Set
| Method | Overall Accuracy Rate | Female Accuracy Rate | Male Accuracy Rate |
|---|---|---|---|
| All Variables | 96.15% | 100% | 92.31% |
| Stepwise All Variables | 92.31% | 100% | 84.62% |
| All Scapular Variables | 92.31% | 92.31% | 92.31% |
| Stepwise Scapular Variables | 96.15% | 100% | 92.31% |
| All Clavicular Variables | 84.62% | 100% | 69.23% |
| Stepwise Clavicular Variables | 76.92% | 84.62% | 69.23% |
Overall, much higher classification rates compared to the method created using the Modern Thai population (no surprise there that the best reference sample is the one that the data shares)! The highest classification rate is from the Stepwise Scapular Variables :D and the lowest is from the Stepwise Clavicular Variables. I do think this stems from there being a lot of variation between males and females for some of the clavicular measurements, as well as a lot of outliers. When I was testing whether I wanted to remove outliers, I did have to split up the variables because it was too many for MVOutlier. With just the scapular measurements, there were only 7-10 for males and females that were deemed outliers but with the clavicular there were over 20 for each sex. I can vouch for the measurements as I did them and I doublechecked them if they seemed too high or low, so I can say that the clavicle had a very wide range of measurements recorded for midshaft diameter, regardless of sex. This is why I think that the accuracy rates are that much lower that the other models.
ld1 <- MASS::lda(Sex ~ ., train)
print(ld1)
## Call:
## lda(Sex ~ ., data = train)
##
## Prior probabilities of groups:
## Female Male
## 0.5 0.5
##
## Group means:
## GH GW SPL SH SW ISH SSH AW
## Female 34.58182 26.05455 129.3818 143.9273 98.21818 106.6000 50.74545 41.85455
## Male 38.92727 30.16364 143.5273 161.5455 108.80000 118.2909 58.54545 48.78182
## CL MSC MSMAX MSMIN STERN ACROM
## Female 143.7455 33.85455 12.03636 9.163636 26.40000 23.58182
## Male 157.1273 40.45455 14.54545 10.890909 29.32727 27.74545
##
## Coefficients of linear discriminants:
## LD1
## GH 0.035039770
## GW 0.188200172
## SPL 0.054955225
## SH 0.120137611
## SW -0.068753282
## ISH -0.064017674
## SSH -0.001368898
## AW -0.005120696
## CL 0.005003882
## MSC 0.174984371
## MSMAX 0.085270194
## MSMIN 0.035277469
## STERN -0.158540771
## ACROM 0.022008443
predld1 <- predict(ld1, test)
confusionMatrix(predld1$class, test$Sex)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Female Male
## Female 13 1
## Male 0 12
##
## Accuracy : 0.9615
## 95% CI : (0.8036, 0.999)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 4.023e-07
##
## Kappa : 0.9231
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 1.0000
## Specificity : 0.9231
## Pos Pred Value : 0.9286
## Neg Pred Value : 1.0000
## Prevalence : 0.5000
## Detection Rate : 0.5000
## Detection Prevalence : 0.5385
## Balanced Accuracy : 0.9615
##
## 'Positive' Class : Female
##
beta1 <- ld1$scaling
mu1 <- ld1$means
intercept1 <- -0.5 * sum((mu1[1,] + mu1[2,]) * beta1)
print(intercept1)
## [1] -22.69291
# -22.69291
test_pred <- test %>% mutate(train_pred_DF1 = 0.035039770*GH + 0.188200172*GW + 0.054955225*SPL + 0.120137611*SH - 0.068753282*SW - 0.064017674*ISH - 0.001368898*SSH - 0.005120696*AW + 0.005003882*CL + 0.174984371*MSC + 0.085270194*MSMAX + 0.035277469*MSMIN - 0.158540771*STERN + 0.022008443*ACROM - 22.69291)
test_pred$mine <- predld1$x
# This is the whole discriminant function!
# Wanted to double check that this actually worked, which it does. - slightly off but by like 3 thousandths of a decimal
ld2<- MASS::lda(Sex ~ SPL+SH+SW+ISH+MSC+STERN, train)
print(ld2)
## Call:
## lda(Sex ~ SPL + SH + SW + ISH + MSC + STERN, data = train)
##
## Prior probabilities of groups:
## Female Male
## 0.5 0.5
##
## Group means:
## SPL SH SW ISH MSC STERN
## Female 129.3818 143.9273 98.21818 106.6000 33.85455 26.40000
## Male 143.5273 161.5455 108.80000 118.2909 40.45455 29.32727
##
## Coefficients of linear discriminants:
## LD1
## SPL 0.05838126
## SH 0.13203403
## SW -0.04055852
## ISH -0.07038991
## MSC 0.26580926
## STERN -0.15128057
predld2 <- predict(ld2, test)
confusionMatrix(predld2$class, test$Sex)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Female Male
## Female 13 2
## Male 0 11
##
## Accuracy : 0.9231
## 95% CI : (0.7487, 0.9905)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 5.245e-06
##
## Kappa : 0.8462
##
## Mcnemar's Test P-Value : 0.4795
##
## Sensitivity : 1.0000
## Specificity : 0.8462
## Pos Pred Value : 0.8667
## Neg Pred Value : 1.0000
## Prevalence : 0.5000
## Detection Rate : 0.5000
## Detection Prevalence : 0.5769
## Balanced Accuracy : 0.9231
##
## 'Positive' Class : Female
##
beta2 <- ld2$scaling
mu2 <- ld2$means
intercept2 <- -0.5 * sum((mu2[1,] + mu2[2,]) * beta2)
print(intercept2)
## [1] -21.68038
# -21.69038
test_pred <- test %>% mutate(train_pred_DF2 = 0.05838126*SPL + 0.13203403*SH - 0.04055852*SW - 0.07038991*ISH + 0.26580926*MSC - 0.15128057*STERN - 21.69038)
test_pred$mine2 <- predld2$x
ld3 <- MASS::lda(Sex ~ GH+GW+SPL+SH+SW+ISH+SSH+AW, train)
print(ld3)
## Call:
## lda(Sex ~ GH + GW + SPL + SH + SW + ISH + SSH + AW, data = train)
##
## Prior probabilities of groups:
## Female Male
## 0.5 0.5
##
## Group means:
## GH GW SPL SH SW ISH SSH AW
## Female 34.58182 26.05455 129.3818 143.9273 98.21818 106.6000 50.74545 41.85455
## Male 38.92727 30.16364 143.5273 161.5455 108.80000 118.2909 58.54545 48.78182
##
## Coefficients of linear discriminants:
## LD1
## GH 0.07236345
## GW 0.28181855
## SPL 0.02927809
## SH 0.07017650
## SW -0.03937201
## ISH -0.02409284
## SSH 0.03657640
## AW 0.01323900
predld3 <- predict(ld3, test)
confusionMatrix(predld3$class, test$Sex)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Female Male
## Female 12 1
## Male 1 12
##
## Accuracy : 0.9231
## 95% CI : (0.7487, 0.9905)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 5.245e-06
##
## Kappa : 0.8462
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9231
## Specificity : 0.9231
## Pos Pred Value : 0.9231
## Neg Pred Value : 0.9231
## Prevalence : 0.5000
## Detection Rate : 0.4615
## Detection Prevalence : 0.5000
## Balanced Accuracy : 0.9231
##
## 'Positive' Class : Female
##
beta3 <- ld3$scaling
mu3 <- ld3$means
intercept3 <- -0.5 * sum((mu3[1,] + mu3[2,]) * beta3)
print(intercept3)
## [1] -21.10919
# -21.10919
test_pred <- test %>% mutate(train_pred_DF3 = 0.07236345*GH + 0.28181855*GW + 0.02927809*SPL + 0.07017650*SH - 0.03937201*SW - 0.02409284*ISH + 0.03657640*SSH + 0.01323900*AW - 21.10919)
test_pred$mine3 <- predld3$x
ld4 <- MASS::lda(Sex ~ GW+SH+SSH, train)
print(ld4)
## Call:
## lda(Sex ~ GW + SH + SSH, data = train)
##
## Prior probabilities of groups:
## Female Male
## 0.5 0.5
##
## Group means:
## GW SH SSH
## Female 26.05455 143.9273 50.74545
## Male 30.16364 161.5455 58.54545
##
## Coefficients of linear discriminants:
## LD1
## GW 0.34689180
## SH 0.05285804
## SSH 0.04938214
predld4 <- predict(ld4, test)
confusionMatrix(predld4$class, test$Sex)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Female Male
## Female 13 1
## Male 0 12
##
## Accuracy : 0.9615
## 95% CI : (0.8036, 0.999)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 4.023e-07
##
## Kappa : 0.9231
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 1.0000
## Specificity : 0.9231
## Pos Pred Value : 0.9286
## Neg Pred Value : 1.0000
## Prevalence : 0.5000
## Detection Rate : 0.5000
## Detection Prevalence : 0.5385
## Balanced Accuracy : 0.9615
##
## 'Positive' Class : Female
##
beta4 <- ld4$scaling
mu4 <- ld4$means
intercept4 <- -0.5 * sum((mu4[1,] + mu4[2,]) * beta4)
print(intercept4)
## [1] -20.52267
# -20.52267
test_pred <- test %>% mutate(train_pred_DF4 = 0.34689180*GW + 0.05285804*SH + 0.04938214*SSH - 20.52267)
test_pred$mine4 <- predld4$x
ld5 <- MASS::lda(Sex ~ CL+MSC+MSMAX+MSMIN+STERN+ACROM, train)
print(ld5)
## Call:
## lda(Sex ~ CL + MSC + MSMAX + MSMIN + STERN + ACROM, data = train)
##
## Prior probabilities of groups:
## Female Male
## 0.5 0.5
##
## Group means:
## CL MSC MSMAX MSMIN STERN ACROM
## Female 143.7455 33.85455 12.03636 9.163636 26.40000 23.58182
## Male 157.1273 40.45455 14.54545 10.890909 29.32727 27.74545
##
## Coefficients of linear discriminants:
## LD1
## CL 0.05298943
## MSC 0.26307584
## MSMAX -0.01241628
## MSMIN 0.08983644
## STERN -0.07804072
## ACROM 0.04018838
predld5 <- predict(ld5, test)
confusionMatrix(predld5$class, test$Sex)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Female Male
## Female 13 4
## Male 0 9
##
## Accuracy : 0.8462
## 95% CI : (0.6513, 0.9564)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 0.0002668
##
## Kappa : 0.6923
##
## Mcnemar's Test P-Value : 0.1336144
##
## Sensitivity : 1.0000
## Specificity : 0.6923
## Pos Pred Value : 0.7647
## Neg Pred Value : 1.0000
## Prevalence : 0.5000
## Detection Rate : 0.5000
## Detection Prevalence : 0.6538
## Balanced Accuracy : 0.8462
##
## 'Positive' Class : Female
##
beta5 <- ld5$scaling
mu5 <- ld5$means
intercept5 <- -0.5 * sum((mu5[1,] + mu5[2,]) * beta5)
print(intercept5)
## [1] -17.33867
# -17.33867
test_pred <- test %>% mutate(train_pred_DF5 = 0.05298943*CL + 0.26307584*MSC - 0.01241628*MSMAX + 0.08983644*MSMIN - 0.07804072*STERN + 0.04018838*ACROM - 17.33867)
test_pred$mine <- predld5$x
ld6 <- MASS::lda(Sex ~ CL+MSC, train)
print(ld6)
## Call:
## lda(Sex ~ CL + MSC, data = train)
##
## Prior probabilities of groups:
## Female Male
## 0.5 0.5
##
## Group means:
## CL MSC
## Female 143.7455 33.85455
## Male 157.1273 40.45455
##
## Coefficients of linear discriminants:
## LD1
## CL 0.04989081
## MSC 0.27010965
predld6 <- predict(ld6, test)
confusionMatrix(predld6$class, test$Sex)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Female Male
## Female 11 4
## Male 2 9
##
## Accuracy : 0.7692
## 95% CI : (0.5635, 0.9103)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 0.004678
##
## Kappa : 0.5385
##
## Mcnemar's Test P-Value : 0.683091
##
## Sensitivity : 0.8462
## Specificity : 0.6923
## Pos Pred Value : 0.7333
## Neg Pred Value : 0.8182
## Prevalence : 0.5000
## Detection Rate : 0.4231
## Detection Prevalence : 0.5769
## Balanced Accuracy : 0.7692
##
## 'Positive' Class : Female
##
beta6 <- ld6$scaling
mu6 <- ld6$means
intercept6 <- -0.5 * sum((mu6[1,] + mu6[2,]) * beta6)
print(intercept6)
## [1] -17.54119
# -17.54119
test_pred <- test %>% mutate(train_pred_DF6 = 0.04989081*CL + 0.27010965*MSC - 17.54119)
test_pred$mine <- predld6$x
plot_data1 <- data.frame(LD1 = predld1$x[,1], Group = test$Sex)
ggplot(plot_data1, aes(x = LD1, fill = Group)) +
geom_density(alpha = 0.5) +
labs(y = "Density", x = "Discriminant Function 1", fill="Sex") +
theme_minimal() + geom_vline(xintercept=0, linetype="dashed") + scale_fill_manual(values = c("#bf987c","#de6612"))
plot_data2 <- data.frame(LD2 = predld2$x[,1], Group = test$Sex)
ggplot(plot_data2, aes(x = LD2, fill = Group)) +
geom_density(alpha = 0.5) +
labs(y = "Density", x = "Discriminant Function 2", fill="Sex") +
theme_minimal() + geom_vline(xintercept=0, linetype="dashed") + scale_fill_manual(values = c("#c9be97","#e6ba1c"))
plot_data3 <- data.frame(LD3 = predld3$x[,1], Group = test$Sex)
ggplot(plot_data3, aes(x = LD3, fill = Group)) +
geom_density(alpha = 0.5) +
labs(y = "Density", x = "Discriminant Function 3", fill="Sex") +
theme_minimal() + geom_vline(xintercept=0, linetype="dashed") + scale_fill_manual(values = c("#958ebd","#3e23d9"))
plot_data4 <- data.frame(LD4 = predld4$x[,1], Group = test$Sex)
ggplot(plot_data4, aes(x = LD4, fill = Group)) +
geom_density(alpha = 0.5) +
labs(y = "Density", x = "Discriminant Function 4", fill="Sex") +
theme_minimal() + geom_vline(xintercept=0, linetype="dashed") + scale_fill_manual(values = c("#9780ad","#7030b0"))
plot_data5 <- data.frame(LD5 = predld5$x[,1], Group = test$Sex)
ggplot(plot_data5, aes(x = LD5, fill = Group)) +
geom_density(alpha = 0.5) +
labs(y = "Density", x = "Discriminant Function 5", fill="Sex") +
theme_minimal() + geom_vline(xintercept=0, linetype="dashed") + scale_fill_manual(values = c("#ba93ba","#b82ab8"))
plot_data6 <- data.frame(LD6 = predld6$x[,1], Group = test$Sex)
ggplot(plot_data6, aes(x = LD6, fill = Group)) +
geom_density(alpha = 0.5) +
labs(y = "Density", x = "Discriminant Function 6", fill="Sex") +
theme_minimal() + geom_vline(xintercept=0, linetype="dashed") + scale_fill_manual(values = c("#bf889c","#d9306e"))
[1] Kharuhadetch, P., Wattanawaragorn, S., Tiamtongon, C., Wathanyutakon, T., Navic, P., & Mahakkanukrauh, P. (2022). Sex determination using scapular and clavicular parameters in modern Thai population. International Journal of Morphology, 40(3), 768–773. https://doi.org/10.4067/S0717-95022022000300768
[2] Gocha, T. P., Mavroudas, S. R., & Wescott, D. J. (2022). The Texas State Donated Skeletal Collection at the Forensic Anthropology Center at Texas State. Forensic Sciences, 2(1), 7-19. https://doi.org/10.3390/forensicsci2010002
[3] Posit team (2025). RStudio: Integrated Development Environment for R. Posit Software, PBC, Boston, MA. URL http://www.posit.co/.
[4] R Core Team (2017). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. https://www.R-project.prg/.