Global Chunk

knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE,
                      fig.path='figures/', fig.ext='jpeg', fig.width=6,
                      fig.height=4, dpi=300)

Setup

# set working directory
setwd("~/Desktop")

# libraries
library(readxl)
library(tidyverse)
library(magrittr)
library(ggplot2)
library(dplyr)
library(caret)

Import Data + Munge

data <- read_excel("ScapClavFull.xlsx")
data %<>% select(-Notes) %>% na.omit()
data$AgeAtDeath <- as.numeric(data$AgeAtDeath)
data$Sex <- as.factor(data$Sex)
data$Race <- as.factor(data$Race)
data$HispanicOrigin <- as.factor(data$HispanicOrigin)
data$`Side Used` <- as.factor(data$`Side Used`)
data %<>% mutate(across(7:20, as.numeric))

Assumptions

Normality

data_m <- data %>% filter(Sex == 'Male') %>% select(-AgeAtDeath)

vars_to_test_m <- names(data_m)[sapply(data_m, is.numeric)]
shapiro_results_m <- list()
summary_table_m <- data.frame(Variable = character(), 
                            W_Statistic = numeric(), 
                            P_Value = numeric(), 
                            stringsAsFactors = FALSE)

for (var in vars_to_test_m) {
  test <- shapiro.test(data_m[[var]])
  summary_table_m <- rbind(summary_table_m, data.frame(
    Variable = var,
    W_Statistic = test$statistic,
    P_Value = test$p.value
  ))
}
print(summary_table_m)

##     Variable W_Statistic      P_Value
## W         GH   0.9683449 0.0811894066
## W1        GW   0.9712547 0.1174624870
## W2       SPL   0.9438377 0.0041225377
## W3        SH   0.9809394 0.3836900545
## W4        SW   0.9559658 0.0172704186
## W5       ISH   0.9642835 0.0485391539
## W6       SSH   0.9851546 0.5974740848
## W7        AW   0.9848588 0.5808133544
## W8        CL   0.9750312 0.1888916382
## W9       MSC   0.9472704 0.0061235832
## W10    MSMAX   0.9301946 0.0009228846
## W11    MSMIN   0.9319212 0.0011081157
## W12    STERN   0.9767232 0.2328035740
## W13    ACROM   0.9707890 0.1107281645

data_f <- data %>% filter(Sex == 'Female') %>% select(-AgeAtDeath)

vars_to_test_f <- names(data_f)[sapply(data_f, is.numeric)]
shapiro_results_f <- list()
summary_table_f <- data.frame(Variable = character(), 
                            W_Statistic = numeric(), 
                            P_Value = numeric(), 
                            stringsAsFactors = FALSE)

for (var in vars_to_test_f) {
  test <- shapiro.test(data_f[[var]])
  summary_table_f <- rbind(summary_table_f, data.frame(
    Variable = var,
    W_Statistic = test$statistic,
    P_Value = test$p.value
  ))
}
print(summary_table_f)

##     Variable W_Statistic      P_Value
## W         GH   0.9468462 3.109518e-03
## W1        GW   0.9547980 8.610363e-03
## W2       SPL   0.9905390 8.496573e-01
## W3        SH   0.9757310 1.531178e-01
## W4        SW   0.9854887 5.399955e-01
## W5       ISH   0.9707688 7.641485e-02
## W6       SSH   0.9768788 1.794921e-01
## W7        AW   0.9913695 8.917136e-01
## W8        CL   0.9720303 9.122867e-02
## W9       MSC   0.9283038 3.497245e-04
## W10    MSMAX   0.8644230 8.689984e-07
## W11    MSMIN   0.8856263 5.164714e-06
## W12    STERN   0.9774936 1.953301e-01
## W13    ACROM   0.9811448 3.185746e-01

# Normal for some variables - outliers to remove

Outliers

Males

up1m <- mvoutlier::uni.plot(as.data.frame(data_m[6:13]), symb=TRUE)

data1 <- data_m[!up1m$outliers, ]

up2m <- mvoutlier::uni.plot(as.data.frame(data1[14:19]), symb=TRUE)

data2 <- data1[!up2m$outliers, ]

up1f <- mvoutlier::uni.plot(as.data.frame(data_f[6:13]), symb=TRUE)

data3 <- data_f[!up1f$outliers, ]

up2f <- mvoutlier::uni.plot(as.data.frame(data3[14:19]), symb=TRUE)

data4 <- data3[!up2f$outliers, ]

data_down <- rbind(data2, data4)

ggplot(data_down, aes(sample = MSMAX)) + stat_qq() + stat_qq_line(color = "red") + facet_wrap(~Sex) + theme_minimal()

ggplot(data, aes(sample = MSMAX)) + stat_qq() + stat_qq_line(color = "red") + facet_wrap(~Sex) + theme_minimal()

The problem is is that for some of the outliers it is claiming, there are some variables that have a lot of repeat values and that is seeming to cause issues like with MSMAX. It is identifying a lot of outliers for this variable when it is just natural variation. Therefore, I won’t be removing outliers for this sample. Additionally, after plotting some graphs that I did not include (trust me bro), a lot of the breaking from normality is on the upper/lower ends, which I would say is fairly normal for variation. Based on the two plots above that show the spread for MSMAX without outliers and with outliers, there are A LOT of females that are scoring higher maximum midshaft clavicular diameters that are being excluded from the sample.

Downsizing to Equal Groups (With Outliers)

set.seed(666)
data_sub <- downSample(data[-c(1:6)], factor(data$Sex)) %>% rename(Sex = Class)
data_sub %>% group_by(Sex) %>% summary(n = n())

##        GH              GW             SPL              SH       
##  Min.   :30.00   Min.   :22.00   Min.   :111.0   Min.   :125.0  
##  1st Qu.:34.00   1st Qu.:26.00   1st Qu.:128.0   1st Qu.:143.0  
##  Median :36.00   Median :28.00   Median :135.5   Median :153.0  
##  Mean   :36.65   Mean   :27.96   Mean   :136.2   Mean   :152.5  
##  3rd Qu.:39.00   3rd Qu.:30.00   3rd Qu.:143.2   3rd Qu.:161.0  
##  Max.   :45.00   Max.   :35.00   Max.   :158.0   Max.   :180.0  
##        SW             ISH             SSH              AW       
##  Min.   : 83.0   Min.   : 87.0   Min.   :36.00   Min.   :32.00  
##  1st Qu.: 97.0   1st Qu.:107.5   1st Qu.:50.00   1st Qu.:41.75  
##  Median :102.0   Median :113.0   Median :54.50   Median :45.00  
##  Mean   :103.1   Mean   :112.4   Mean   :54.43   Mean   :45.46  
##  3rd Qu.:108.2   3rd Qu.:117.0   3rd Qu.:59.00   3rd Qu.:49.00  
##  Max.   :123.0   Max.   :138.0   Max.   :73.00   Max.   :60.00  
##        CL             MSC            MSMAX           MSMIN       
##  Min.   :128.0   Min.   :29.00   Min.   :10.00   Min.   : 7.000  
##  1st Qu.:143.0   1st Qu.:34.00   1st Qu.:12.00   1st Qu.: 9.000  
##  Median :149.0   Median :36.00   Median :13.00   Median :10.000  
##  Mean   :150.6   Mean   :36.93   Mean   :13.17   Mean   : 9.993  
##  3rd Qu.:157.2   3rd Qu.:40.00   3rd Qu.:14.00   3rd Qu.:11.000  
##  Max.   :179.0   Max.   :49.00   Max.   :19.00   Max.   :13.000  
##      STERN           ACROM           Sex    
##  Min.   :19.00   Min.   :14.00   Female:68  
##  1st Qu.:25.00   1st Qu.:22.75   Male  :68  
##  Median :28.00   Median :25.50              
##  Mean   :27.63   Mean   :25.53              
##  3rd Qu.:30.00   3rd Qu.:29.00              
##  Max.   :39.00   Max.   :34.00

Data selection - 25-60 (fusion-extreme lipping), not selecting for Pop Affinity, L and R measurements taken unless unable, did not measure if there was pathology or trauma

Implementing Kharuhadetch et al.’s Equations

Used a 90/10 split of the original dataset of 139 males and 139 females (124/15).

Sexual Dimorphism

cols_to_test <- names(data_sub)[1:14]
p_values <- numeric(length(cols_to_test))
for (i in seq_along(cols_to_test)) {
  var_name <- cols_to_test[i]
  test_result <- t.test(data_sub[[var_name]] ~ data_sub$Sex)
  p_values[i] <- test_result$p.value
}
print(p_values) # It would be much more helpful to see which p-value belongs to which variable but they are less than 0.05, which means significant difference between sexes!

##  [1] 4.384582e-20 5.485770e-25 7.980352e-20 3.146321e-23 7.106555e-17
##  [6] 5.542960e-14 5.069001e-16 2.670584e-15 1.828695e-12 7.773521e-24
## [11] 1.587719e-19 8.987695e-16 1.076125e-07 1.321741e-09

Dicriminant Equations

data_sub %<>% mutate(DF1 = 0.046*GH + 0.029*GW + 0.111*SPL + 0.296*SH - 0.064*SW - 0.025*ISH + 0.135*SSH + 0.058*AW + 0.014*CL + 0.024*MSC - 0.006*MSMAX - 0.005*MSMIN + 0.018*STERN + 0.042*ACROM - 22.437)
# Something is wrong with this one... I think it involves the coefficient for SH but can't confirm because I am not the authors

data_sub %<>% mutate(DF2 = 0.166*GH + 0.029*SH + 0.045*AW + 0.052*CL + 0.121*MSC - 22.836)

data_sub %<>% mutate(DF3 = 0.104*GH + 0.121*GW + 0.042*SPL + 0.029*SH + 0.000*SW + 0.025*ISH + 0.022*SSH + 0.036*AW - 20.900)

data_sub %<>% mutate(DF4 = 0.112*GH + 0.134*GW + 0.049*SPL + 0.056*SH - 21.096)

data_sub %<>% mutate(DF5 = 0.074*CL + 0.003*MSC + 0.279*MSMAX + 0.416*MSMIN - 0.011*STERN + 0.027*ACROM - 18.153)

data_sub %<>% mutate(DF6 = 0.076*CL + 0.290*MSMAX + 0.434*MSMIN - 18.223)

Predictions

data_sub$DF1_Pred <- ifelse(data_sub$DF1 >= 0, "Male", "Female")
data_sub$DF2_Pred <- ifelse(data_sub$DF2 >= 0, "Male", "Female")
data_sub$DF3_Pred <- ifelse(data_sub$DF3 >= 0, "Male", "Female")
data_sub$DF4_Pred <- ifelse(data_sub$DF4 >= 0, "Male", "Female")
data_sub$DF5_Pred <- ifelse(data_sub$DF5 >= 0, "Male", "Female")
data_sub$DF6_Pred <- ifelse(data_sub$DF6 >= 0, "Male", "Female")

Graphs for Visualizing Classification Rates - from Paper on my Data

All Variables

ggplot(data_sub, aes(x = DF1, fill = Sex)) +
  geom_density(alpha = 0.5) +
  labs(y = "Density", x = "Discriminant Function 1") +
  theme_minimal() + geom_vline(xintercept=0, linetype="dashed") + scale_fill_manual(values = c("#bf987c","#de6612"))

Stepwise All Variables

ggplot(data_sub, aes(x = DF2, fill = Sex)) +
  geom_density(alpha = 0.5) +
  labs(y = "Density", x = "Discriminant Function 2") +
  theme_minimal() + geom_vline(xintercept=0, linetype="dashed") + scale_fill_manual(values = c("#c9be97","#e6ba1c"))

All Scapular

ggplot(data_sub, aes(x = DF3, fill = Sex)) +
  geom_density(alpha = 0.5) +
  labs(y = "Density", x = "Discriminant Function 3") +
  theme_minimal() + geom_vline(xintercept=0, linetype="dashed") + scale_fill_manual(values = c("#958ebd","#3e23d9"))

Stepwise Scapular

ggplot(data_sub, aes(x = DF4, fill = Sex)) +
  geom_density(alpha = 0.5) +
  labs(y = "Density", x = "Discriminant Function 4") +
  theme_minimal() + geom_vline(xintercept=0, linetype="dashed") + scale_fill_manual(values = c("#9780ad","#7030b0"))

All Clavicular

ggplot(data_sub, aes(x = DF5, fill = Sex)) +
  geom_density(alpha = 0.5) +
  labs(y = "Density", x = "Discriminant Function 5") +
  theme_minimal() + geom_vline(xintercept=0, linetype="dashed") + scale_fill_manual(values = c("#ba93ba","#b82ab8"))

Stepwise Clavicular

ggplot(data_sub, aes(x = DF6, fill = Sex)) +
  geom_density(alpha = 0.5) +
  labs(y = "Density", x = "Discriminant Function 6") +
  theme_minimal() + geom_vline(xintercept=0, linetype="dashed") + scale_fill_manual(values = c("#bf889c","#d9306e"))

Classification Matrices

Method	Overall Accuracy Rate	Female Accuracy Rate	Male Accuracy Rate
All Variables	50%	0%	100%
Stepwise All Variables	65.44%	32.35%	98.53%
All Scapular Variables	61.03%	22.06%	100%
Stepwise Scapular Variables	63.24%	26.47%	100%
All Clavicular Variables	73.53%	50.00%	97.06%
Stepwise Clavicular Variables	72.06%	47.06%	97.06%

All Variables

confusionMatrix(factor(data_sub$DF1_Pred), factor(data_sub$Sex))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Female Male
##     Female      0    0
##     Male       68   68
##                                           
##                Accuracy : 0.5             
##                  95% CI : (0.4131, 0.5869)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 0.5341          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : 4.476e-16       
##                                           
##             Sensitivity : 0.0             
##             Specificity : 1.0             
##          Pos Pred Value : NaN             
##          Neg Pred Value : 0.5             
##              Prevalence : 0.5             
##          Detection Rate : 0.0             
##    Detection Prevalence : 0.0             
##       Balanced Accuracy : 0.5             
##                                           
##        'Positive' Class : Female          
##

Stepwise All Variables

confusionMatrix(factor(data_sub$DF2_Pred), factor(data_sub$Sex))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Female Male
##     Female     22    1
##     Male       46   67
##                                           
##                Accuracy : 0.6544          
##                  95% CI : (0.5681, 0.7338)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 0.0001993       
##                                           
##                   Kappa : 0.3088          
##                                           
##  Mcnemar's Test P-Value : 1.38e-10        
##                                           
##             Sensitivity : 0.3235          
##             Specificity : 0.9853          
##          Pos Pred Value : 0.9565          
##          Neg Pred Value : 0.5929          
##              Prevalence : 0.5000          
##          Detection Rate : 0.1618          
##    Detection Prevalence : 0.1691          
##       Balanced Accuracy : 0.6544          
##                                           
##        'Positive' Class : Female          
##

All Scapular Variables

confusionMatrix(factor(data_sub$DF3_Pred), factor(data_sub$Sex))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Female Male
##     Female     15    0
##     Male       53   68
##                                          
##                Accuracy : 0.6103         
##                  95% CI : (0.523, 0.6927)
##     No Information Rate : 0.5            
##     P-Value [Acc > NIR] : 0.006302       
##                                          
##                   Kappa : 0.2206         
##                                          
##  Mcnemar's Test P-Value : 9.148e-13      
##                                          
##             Sensitivity : 0.2206         
##             Specificity : 1.0000         
##          Pos Pred Value : 1.0000         
##          Neg Pred Value : 0.5620         
##              Prevalence : 0.5000         
##          Detection Rate : 0.1103         
##    Detection Prevalence : 0.1103         
##       Balanced Accuracy : 0.6103         
##                                          
##        'Positive' Class : Female         
##

Stepwise Scapular Variables

confusionMatrix(factor(data_sub$DF4_Pred), factor(data_sub$Sex))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Female Male
##     Female     18    0
##     Male       50   68
##                                           
##                Accuracy : 0.6324          
##                  95% CI : (0.5455, 0.7133)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 0.001279        
##                                           
##                   Kappa : 0.2647          
##                                           
##  Mcnemar's Test P-Value : 4.219e-12       
##                                           
##             Sensitivity : 0.2647          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.5763          
##              Prevalence : 0.5000          
##          Detection Rate : 0.1324          
##    Detection Prevalence : 0.1324          
##       Balanced Accuracy : 0.6324          
##                                           
##        'Positive' Class : Female          
##

All Clavicular Variables

confusionMatrix(factor(data_sub$DF5_Pred), factor(data_sub$Sex))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Female Male
##     Female     34    2
##     Male       34   66
##                                           
##                Accuracy : 0.7353          
##                  95% CI : (0.6528, 0.8072)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 1.860e-08       
##                                           
##                   Kappa : 0.4706          
##                                           
##  Mcnemar's Test P-Value : 2.383e-07       
##                                           
##             Sensitivity : 0.5000          
##             Specificity : 0.9706          
##          Pos Pred Value : 0.9444          
##          Neg Pred Value : 0.6600          
##              Prevalence : 0.5000          
##          Detection Rate : 0.2500          
##    Detection Prevalence : 0.2647          
##       Balanced Accuracy : 0.7353          
##                                           
##        'Positive' Class : Female          
##

Stepwise Clavicular Variables

confusionMatrix(factor(data_sub$DF6_Pred), factor(data_sub$Sex))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Female Male
##     Female     32    2
##     Male       36   66
##                                           
##                Accuracy : 0.7206          
##                  95% CI : (0.6372, 0.7941)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 1.365e-07       
##                                           
##                   Kappa : 0.4412          
##                                           
##  Mcnemar's Test P-Value : 8.636e-08       
##                                           
##             Sensitivity : 0.4706          
##             Specificity : 0.9706          
##          Pos Pred Value : 0.9412          
##          Neg Pred Value : 0.6471          
##              Prevalence : 0.5000          
##          Detection Rate : 0.2353          
##    Detection Prevalence : 0.2500          
##       Balanced Accuracy : 0.7206          
##                                           
##        'Positive' Class : Female          
##

Creation of My Own Equations

Training and Testing

idx <- createDataPartition(data_sub$Sex, p=0.8, list=F)
train <- data_sub[idx, -(16:27)]
test <- data_sub[-idx, -(16:27)]

Stepwise Selection

All Variables

log2 <- glm(Sex ~ ., train, family="binomial")
df2 <- MASS::stepAIC(log2, direction="both")

## Start:  AIC=55
## Sex ~ GH + GW + SPL + SH + SW + ISH + SSH + AW + CL + MSC + MSMAX + 
##     MSMIN + STERN + ACROM
## 
##         Df Deviance    AIC
## - ACROM  1   24.997 52.997
## - MSMIN  1   25.008 53.008
## - AW     1   25.356 53.356
## - MSMAX  1   25.689 53.689
## - SSH    1   25.764 53.764
## - GH     1   26.787 54.787
## - MSC    1   26.789 54.789
## - CL     1   26.897 54.897
## <none>       24.997 54.997
## - SW     1   27.119 55.119
## - GW     1   27.404 55.404
## - SPL    1   29.200 57.200
## - ISH    1   29.822 57.822
## - STERN  1   31.678 59.678
## - SH     1   33.885 61.885
## 
## Step:  AIC=53
## Sex ~ GH + GW + SPL + SH + SW + ISH + SSH + AW + CL + MSC + MSMAX + 
##     MSMIN + STERN
## 
##         Df Deviance    AIC
## - MSMIN  1   25.009 51.009
## - AW     1   25.377 51.377
## - MSMAX  1   25.733 51.733
## - SSH    1   25.765 51.765
## - GH     1   26.811 52.811
## - CL     1   26.944 52.944
## <none>       24.997 52.997
## - MSC    1   27.078 53.078
## - SW     1   27.196 53.196
## - GW     1   27.430 53.430
## + ACROM  1   24.997 54.997
## - SPL    1   29.607 55.607
## - ISH    1   30.099 56.099
## - STERN  1   32.495 58.495
## - SH     1   33.951 59.951
## 
## Step:  AIC=51.01
## Sex ~ GH + GW + SPL + SH + SW + ISH + SSH + AW + CL + MSC + MSMAX + 
##     STERN
## 
##         Df Deviance    AIC
## - AW     1   25.459 49.459
## - MSMAX  1   25.736 49.736
## - SSH    1   25.828 49.828
## - GH     1   26.847 50.847
## - CL     1   26.969 50.969
## <none>       25.009 51.009
## - SW     1   27.394 51.394
## - GW     1   27.461 51.461
## - MSC    1   28.291 52.291
## + MSMIN  1   24.997 52.997
## + ACROM  1   25.008 53.008
## - SPL    1   29.896 53.896
## - ISH    1   31.121 55.121
## - STERN  1   32.518 56.518
## - SH     1   34.229 58.229
## 
## Step:  AIC=49.46
## Sex ~ GH + GW + SPL + SH + SW + ISH + SSH + CL + MSC + MSMAX + 
##     STERN
## 
##         Df Deviance    AIC
## - SSH    1   25.910 47.910
## - MSMAX  1   25.950 47.950
## - GH     1   27.077 49.077
## - CL     1   27.337 49.337
## <none>       25.459 49.459
## - SW     1   27.470 49.470
## - GW     1   27.592 49.592
## + AW     1   25.009 51.009
## - MSC    1   29.259 51.259
## + MSMIN  1   25.377 51.377
## + ACROM  1   25.455 51.455
## - SPL    1   29.927 51.927
## - ISH    1   31.868 53.868
## - STERN  1   33.228 55.228
## - SH     1   34.428 56.428
## 
## Step:  AIC=47.91
## Sex ~ GH + GW + SPL + SH + SW + ISH + CL + MSC + MSMAX + STERN
## 
##         Df Deviance    AIC
## - MSMAX  1   26.127 46.127
## - CL     1   27.347 47.347
## - GH     1   27.405 47.405
## - SW     1   27.606 47.606
## - GW     1   27.669 47.669
## <none>       25.910 47.910
## + SSH    1   25.459 49.459
## + MSMIN  1   25.820 49.820
## + AW     1   25.828 49.828
## + ACROM  1   25.910 49.910
## - MSC    1   30.018 50.018
## - SPL    1   30.065 50.065
## - STERN  1   33.234 53.234
## - ISH    1   33.552 53.552
## - SH     1   38.263 58.263
## 
## Step:  AIC=46.13
## Sex ~ GH + GW + SPL + SH + SW + ISH + CL + MSC + STERN
## 
##         Df Deviance    AIC
## - CL     1   27.370 45.370
## - GH     1   27.422 45.422
## - GW     1   27.670 45.670
## <none>       26.127 46.127
## - SW     1   28.138 46.138
## + MSMAX  1   25.910 47.910
## + SSH    1   25.950 47.950
## + AW     1   26.047 48.047
## + MSMIN  1   26.081 48.081
## + ACROM  1   26.121 48.121
## - SPL    1   30.239 48.239
## - ISH    1   33.554 51.554
## - STERN  1   34.745 52.745
## - SH     1   38.569 56.569
## - MSC    1   49.326 67.326
## 
## Step:  AIC=45.37
## Sex ~ GH + GW + SPL + SH + SW + ISH + MSC + STERN
## 
##         Df Deviance    AIC
## - GH     1   28.550 44.550
## <none>       27.370 45.370
## - GW     1   29.436 45.436
## - SW     1   29.801 45.801
## + CL     1   26.127 46.127
## - SPL    1   30.561 46.561
## + AW     1   27.157 47.157
## + ACROM  1   27.301 47.301
## + MSMAX  1   27.347 47.347
## + SSH    1   27.368 47.368
## + MSMIN  1   27.370 47.370
## - ISH    1   34.064 50.064
## - STERN  1   34.762 50.762
## - SH     1   38.688 54.688
## - MSC    1   49.475 65.475
## 
## Step:  AIC=44.55
## Sex ~ GW + SPL + SH + SW + ISH + MSC + STERN
## 
##         Df Deviance    AIC
## - GW     1   29.763 43.763
## <none>       28.550 44.550
## - SPL    1   30.808 44.808
## - SW     1   31.047 45.047
## + GH     1   27.370 45.370
## + CL     1   27.422 45.422
## + ACROM  1   28.266 46.266
## + AW     1   28.391 46.391
## + MSMAX  1   28.451 46.451
## + SSH    1   28.507 46.507
## + MSMIN  1   28.548 46.548
## - STERN  1   35.102 49.102
## - ISH    1   35.395 49.395
## - SH     1   39.239 53.239
## - MSC    1   49.475 63.475
## 
## Step:  AIC=43.76
## Sex ~ SPL + SH + SW + ISH + MSC + STERN
## 
##         Df Deviance    AIC
## <none>       29.763 43.763
## - SW     1   31.911 43.911
## + CL     1   28.187 44.187
## - SPL    1   32.536 44.536
## + GW     1   28.550 44.550
## + GH     1   29.436 45.436
## + MSMAX  1   29.562 45.562
## + ACROM  1   29.687 45.687
## + AW     1   29.724 45.724
## + SSH    1   29.755 45.755
## + MSMIN  1   29.763 45.763
## - STERN  1   35.777 47.777
## - ISH    1   38.042 50.042
## - SH     1   42.553 54.553
## - MSC    1   63.665 75.665

summary(df2)

## 
## Call:
## glm(formula = Sex ~ SPL + SH + SW + ISH + MSC + STERN, family = "binomial", 
##     data = train)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -70.1147    17.0416  -4.114 3.88e-05 ***
## SPL           0.3382     0.2152   1.572  0.11601    
## SH            0.4791     0.1749   2.739  0.00616 ** 
## SW           -0.3500     0.2462  -1.422  0.15515    
## ISH          -0.3272     0.1375  -2.379  0.01737 *  
## MSC           1.0934     0.3330   3.284  0.00103 ** 
## STERN        -0.6217     0.2901  -2.143  0.03211 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 152.492  on 109  degrees of freedom
## Residual deviance:  29.763  on 103  degrees of freedom
## AIC: 43.763
## 
## Number of Fisher Scoring iterations: 8

Scapular Variables

log4 <- glm(Sex ~ GH+GW+SPL+SH+SW+ISH+SSH+AW, train, family="binomial")
df4 <- MASS::stepAIC(log4, direction="both")

## Start:  AIC=68.24
## Sex ~ GH + GW + SPL + SH + SW + ISH + SSH + AW
## 
##        Df Deviance    AIC
## - AW    1   50.246 66.246
## - SW    1   50.250 66.250
## - ISH   1   50.302 66.302
## - GH    1   50.319 66.319
## - SPL   1   50.347 66.347
## - SH    1   51.110 67.110
## - SSH   1   52.012 68.012
## <none>      50.243 68.243
## - GW    1   59.907 75.907
## 
## Step:  AIC=66.25
## Sex ~ GH + GW + SPL + SH + SW + ISH + SSH
## 
##        Df Deviance    AIC
## - SW    1   50.254 64.254
## - ISH   1   50.304 64.304
## - GH    1   50.319 64.319
## - SPL   1   50.361 64.361
## - SH    1   51.133 65.133
## - SSH   1   52.024 66.024
## <none>      50.246 66.246
## + AW    1   50.243 68.243
## - GW    1   60.936 74.936
## 
## Step:  AIC=64.25
## Sex ~ GH + GW + SPL + SH + ISH + SSH
## 
##        Df Deviance    AIC
## - ISH   1   50.316 62.316
## - GH    1   50.347 62.347
## - SPL   1   50.468 62.468
## - SH    1   51.157 63.157
## - SSH   1   52.025 64.025
## <none>      50.254 64.254
## + SW    1   50.246 66.246
## + AW    1   50.250 66.250
## - GW    1   61.145 73.145
## 
## Step:  AIC=62.32
## Sex ~ GH + GW + SPL + SH + SSH
## 
##        Df Deviance    AIC
## - GH    1   50.431 60.431
## - SPL   1   50.667 60.667
## <none>      50.316 62.316
## - SSH   1   52.799 62.799
## - SH    1   52.987 62.987
## + ISH   1   50.254 64.254
## + SW    1   50.304 64.304
## + AW    1   50.312 64.312
## - GW    1   61.504 71.504
## 
## Step:  AIC=60.43
## Sex ~ GW + SPL + SH + SSH
## 
##        Df Deviance    AIC
## - SPL   1   50.671 58.671
## <none>      50.431 60.431
## - SSH   1   52.849 60.849
## - SH    1   52.991 60.991
## + GH    1   50.316 62.316
## + ISH   1   50.347 62.347
## + SW    1   50.395 62.395
## + AW    1   50.430 62.430
## - GW    1   64.094 72.094
## 
## Step:  AIC=58.67
## Sex ~ GW + SH + SSH
## 
##        Df Deviance    AIC
## <none>      50.671 58.671
## - SH    1   53.489 59.489
## - SSH   1   53.723 59.723
## + SPL   1   50.431 60.431
## + ISH   1   50.476 60.476
## + SW    1   50.556 60.556
## + AW    1   50.653 60.653
## + GH    1   50.667 60.667
## - GW    1   73.282 79.282

summary(df4)

## 
## Call:
## glm(formula = Sex ~ GW + SH + SSH, family = "binomial", data = train)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -46.83546    8.95010  -5.233 1.67e-07 ***
## GW            0.84765    0.23434   3.617 0.000298 ***
## SH            0.08804    0.05634   1.563 0.118110    
## SSH           0.17537    0.10620   1.651 0.098678 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 152.492  on 109  degrees of freedom
## Residual deviance:  50.671  on 106  degrees of freedom
## AIC: 58.671
## 
## Number of Fisher Scoring iterations: 6

Clavicular Variables

log6 <- glm(Sex ~ CL+MSC+MSMAX+MSMIN+STERN+ACROM, train, family="binomial")
df6 <- MASS::stepAIC(log6, direction="both")

## Start:  AIC=69.91
## Sex ~ CL + MSC + MSMAX + MSMIN + STERN + ACROM
## 
##         Df Deviance    AIC
## - MSMIN  1   55.952 67.952
## - ACROM  1   56.109 68.109
## - MSMAX  1   56.378 68.378
## - STERN  1   56.866 68.866
## <none>       55.909 69.909
## - MSC    1   62.949 74.949
## - CL     1   67.760 79.760
## 
## Step:  AIC=67.95
## Sex ~ CL + MSC + MSMAX + STERN + ACROM
## 
##         Df Deviance    AIC
## - ACROM  1   56.227 66.227
## - MSMAX  1   56.544 66.544
## - STERN  1   56.892 66.892
## <none>       55.952 67.952
## + MSMIN  1   55.909 69.909
## - CL     1   67.775 77.775
## - MSC    1   68.015 78.015
## 
## Step:  AIC=66.23
## Sex ~ CL + MSC + MSMAX + STERN
## 
##         Df Deviance    AIC
## - MSMAX  1   56.648 64.648
## - STERN  1   56.964 64.964
## <none>       56.227 66.227
## + ACROM  1   55.952 67.952
## + MSMIN  1   56.109 68.109
## - MSC    1   68.290 76.290
## - CL     1   68.508 76.508
## 
## Step:  AIC=64.65
## Sex ~ CL + MSC + STERN
## 
##         Df Deviance     AIC
## - STERN  1   57.130  63.130
## <none>       56.648  64.648
## + MSMAX  1   56.227  66.227
## + MSMIN  1   56.434  66.434
## + ACROM  1   56.544  66.544
## - CL     1   68.510  74.510
## - MSC    1  104.375 110.375
## 
## Step:  AIC=63.13
## Sex ~ CL + MSC
## 
##         Df Deviance     AIC
## <none>       57.130  63.130
## + STERN  1   56.648  64.648
## + MSMAX  1   56.964  64.964
## + MSMIN  1   57.019  65.019
## + ACROM  1   57.095  65.095
## - CL     1   68.544  72.544
## - MSC    1  109.090 113.090

summary(df6)

## 
## Call:
## glm(formula = Sex ~ CL + MSC, family = "binomial", data = train)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -39.93459    7.35790  -5.427 5.72e-08 ***
## CL            0.10461    0.03422   3.057  0.00223 ** 
## MSC           0.65374    0.13645   4.791 1.66e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 152.49  on 109  degrees of freedom
## Residual deviance:  57.13  on 107  degrees of freedom
## AIC: 63.13
## 
## Number of Fisher Scoring iterations: 6

Discriminant Functions

Table is using the Testing Set with the Models Created Using the Training Set

Method	Overall Accuracy Rate	Female Accuracy Rate	Male Accuracy Rate
All Variables	96.15%	100%	92.31%
Stepwise All Variables	92.31%	100%	84.62%
All Scapular Variables	92.31%	92.31%	92.31%
Stepwise Scapular Variables	96.15%	100%	92.31%
All Clavicular Variables	84.62%	100%	69.23%
Stepwise Clavicular Variables	76.92%	84.62%	69.23%

Overall, much higher classification rates compared to the method created using the Modern Thai population (no surprise there that the best reference sample is the one that the data shares)! The highest classification rate is from the Stepwise Scapular Variables :D and the lowest is from the Stepwise Clavicular Variables. I do think this stems from there being a lot of variation between males and females for some of the clavicular measurements, as well as a lot of outliers. When I was testing whether I wanted to remove outliers, I did have to split up the variables because it was too many for MVOutlier. With just the scapular measurements, there were only 7-10 for males and females that were deemed outliers but with the clavicular there were over 20 for each sex. I can vouch for the measurements as I did them and I doublechecked them if they seemed too high or low, so I can say that the clavicle had a very wide range of measurements recorded for midshaft diameter, regardless of sex. This is why I think that the accuracy rates are that much lower that the other models.

All Variables

ld1 <- MASS::lda(Sex ~ ., train)
print(ld1)

## Call:
## lda(Sex ~ ., data = train)
## 
## Prior probabilities of groups:
## Female   Male 
##    0.5    0.5 
## 
## Group means:
##              GH       GW      SPL       SH        SW      ISH      SSH       AW
## Female 34.58182 26.05455 129.3818 143.9273  98.21818 106.6000 50.74545 41.85455
## Male   38.92727 30.16364 143.5273 161.5455 108.80000 118.2909 58.54545 48.78182
##              CL      MSC    MSMAX     MSMIN    STERN    ACROM
## Female 143.7455 33.85455 12.03636  9.163636 26.40000 23.58182
## Male   157.1273 40.45455 14.54545 10.890909 29.32727 27.74545
## 
## Coefficients of linear discriminants:
##                LD1
## GH     0.035039770
## GW     0.188200172
## SPL    0.054955225
## SH     0.120137611
## SW    -0.068753282
## ISH   -0.064017674
## SSH   -0.001368898
## AW    -0.005120696
## CL     0.005003882
## MSC    0.174984371
## MSMAX  0.085270194
## MSMIN  0.035277469
## STERN -0.158540771
## ACROM  0.022008443

predld1 <- predict(ld1, test)

confusionMatrix(predld1$class, test$Sex)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Female Male
##     Female     13    1
##     Male        0   12
##                                          
##                Accuracy : 0.9615         
##                  95% CI : (0.8036, 0.999)
##     No Information Rate : 0.5            
##     P-Value [Acc > NIR] : 4.023e-07      
##                                          
##                   Kappa : 0.9231         
##                                          
##  Mcnemar's Test P-Value : 1              
##                                          
##             Sensitivity : 1.0000         
##             Specificity : 0.9231         
##          Pos Pred Value : 0.9286         
##          Neg Pred Value : 1.0000         
##              Prevalence : 0.5000         
##          Detection Rate : 0.5000         
##    Detection Prevalence : 0.5385         
##       Balanced Accuracy : 0.9615         
##                                          
##        'Positive' Class : Female         
##

beta1 <- ld1$scaling
mu1 <- ld1$means
intercept1 <- -0.5 * sum((mu1[1,] + mu1[2,]) * beta1)
print(intercept1)

## [1] -22.69291

# -22.69291

test_pred <- test %>% mutate(train_pred_DF1 = 0.035039770*GH + 0.188200172*GW + 0.054955225*SPL + 0.120137611*SH - 0.068753282*SW - 0.064017674*ISH - 0.001368898*SSH - 0.005120696*AW + 0.005003882*CL + 0.174984371*MSC + 0.085270194*MSMAX + 0.035277469*MSMIN - 0.158540771*STERN + 0.022008443*ACROM - 22.69291)

test_pred$mine <- predld1$x
# This is the whole discriminant function!
# Wanted to double check that this actually worked, which it does. - slightly off but by like 3 thousandths of a decimal

Stepwise All Variables

ld2<- MASS::lda(Sex ~ SPL+SH+SW+ISH+MSC+STERN, train)
print(ld2)

## Call:
## lda(Sex ~ SPL + SH + SW + ISH + MSC + STERN, data = train)
## 
## Prior probabilities of groups:
## Female   Male 
##    0.5    0.5 
## 
## Group means:
##             SPL       SH        SW      ISH      MSC    STERN
## Female 129.3818 143.9273  98.21818 106.6000 33.85455 26.40000
## Male   143.5273 161.5455 108.80000 118.2909 40.45455 29.32727
## 
## Coefficients of linear discriminants:
##               LD1
## SPL    0.05838126
## SH     0.13203403
## SW    -0.04055852
## ISH   -0.07038991
## MSC    0.26580926
## STERN -0.15128057

predld2 <- predict(ld2, test)

confusionMatrix(predld2$class, test$Sex)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Female Male
##     Female     13    2
##     Male        0   11
##                                           
##                Accuracy : 0.9231          
##                  95% CI : (0.7487, 0.9905)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 5.245e-06       
##                                           
##                   Kappa : 0.8462          
##                                           
##  Mcnemar's Test P-Value : 0.4795          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.8462          
##          Pos Pred Value : 0.8667          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.5000          
##          Detection Rate : 0.5000          
##    Detection Prevalence : 0.5769          
##       Balanced Accuracy : 0.9231          
##                                           
##        'Positive' Class : Female          
##

beta2 <- ld2$scaling
mu2 <- ld2$means
intercept2 <- -0.5 * sum((mu2[1,] + mu2[2,]) * beta2)
print(intercept2)

## [1] -21.68038

# -21.69038

test_pred <- test %>% mutate(train_pred_DF2 = 0.05838126*SPL + 0.13203403*SH - 0.04055852*SW - 0.07038991*ISH + 0.26580926*MSC - 0.15128057*STERN - 21.69038)

test_pred$mine2 <- predld2$x

Scapular Variables

ld3 <- MASS::lda(Sex ~ GH+GW+SPL+SH+SW+ISH+SSH+AW, train)
print(ld3)

## Call:
## lda(Sex ~ GH + GW + SPL + SH + SW + ISH + SSH + AW, data = train)
## 
## Prior probabilities of groups:
## Female   Male 
##    0.5    0.5 
## 
## Group means:
##              GH       GW      SPL       SH        SW      ISH      SSH       AW
## Female 34.58182 26.05455 129.3818 143.9273  98.21818 106.6000 50.74545 41.85455
## Male   38.92727 30.16364 143.5273 161.5455 108.80000 118.2909 58.54545 48.78182
## 
## Coefficients of linear discriminants:
##             LD1
## GH   0.07236345
## GW   0.28181855
## SPL  0.02927809
## SH   0.07017650
## SW  -0.03937201
## ISH -0.02409284
## SSH  0.03657640
## AW   0.01323900

predld3 <- predict(ld3, test)

confusionMatrix(predld3$class, test$Sex)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Female Male
##     Female     12    1
##     Male        1   12
##                                           
##                Accuracy : 0.9231          
##                  95% CI : (0.7487, 0.9905)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 5.245e-06       
##                                           
##                   Kappa : 0.8462          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9231          
##             Specificity : 0.9231          
##          Pos Pred Value : 0.9231          
##          Neg Pred Value : 0.9231          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4615          
##    Detection Prevalence : 0.5000          
##       Balanced Accuracy : 0.9231          
##                                           
##        'Positive' Class : Female          
##

beta3 <- ld3$scaling
mu3 <- ld3$means
intercept3 <- -0.5 * sum((mu3[1,] + mu3[2,]) * beta3)
print(intercept3)

## [1] -21.10919

# -21.10919

test_pred <- test %>% mutate(train_pred_DF3 = 0.07236345*GH + 0.28181855*GW + 0.02927809*SPL + 0.07017650*SH - 0.03937201*SW - 0.02409284*ISH + 0.03657640*SSH + 0.01323900*AW - 21.10919)

test_pred$mine3 <- predld3$x

Stepwise Scapular Variables

ld4 <- MASS::lda(Sex ~ GW+SH+SSH, train)
print(ld4)

## Call:
## lda(Sex ~ GW + SH + SSH, data = train)
## 
## Prior probabilities of groups:
## Female   Male 
##    0.5    0.5 
## 
## Group means:
##              GW       SH      SSH
## Female 26.05455 143.9273 50.74545
## Male   30.16364 161.5455 58.54545
## 
## Coefficients of linear discriminants:
##            LD1
## GW  0.34689180
## SH  0.05285804
## SSH 0.04938214

predld4 <- predict(ld4, test)

confusionMatrix(predld4$class, test$Sex)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Female Male
##     Female     13    1
##     Male        0   12
##                                          
##                Accuracy : 0.9615         
##                  95% CI : (0.8036, 0.999)
##     No Information Rate : 0.5            
##     P-Value [Acc > NIR] : 4.023e-07      
##                                          
##                   Kappa : 0.9231         
##                                          
##  Mcnemar's Test P-Value : 1              
##                                          
##             Sensitivity : 1.0000         
##             Specificity : 0.9231         
##          Pos Pred Value : 0.9286         
##          Neg Pred Value : 1.0000         
##              Prevalence : 0.5000         
##          Detection Rate : 0.5000         
##    Detection Prevalence : 0.5385         
##       Balanced Accuracy : 0.9615         
##                                          
##        'Positive' Class : Female         
##

beta4 <- ld4$scaling
mu4 <- ld4$means
intercept4 <- -0.5 * sum((mu4[1,] + mu4[2,]) * beta4)
print(intercept4)

## [1] -20.52267

# -20.52267

test_pred <- test %>% mutate(train_pred_DF4 = 0.34689180*GW + 0.05285804*SH + 0.04938214*SSH - 20.52267)

test_pred$mine4 <- predld4$x

Clavicular Variables

ld5 <- MASS::lda(Sex ~ CL+MSC+MSMAX+MSMIN+STERN+ACROM, train)
print(ld5)

## Call:
## lda(Sex ~ CL + MSC + MSMAX + MSMIN + STERN + ACROM, data = train)
## 
## Prior probabilities of groups:
## Female   Male 
##    0.5    0.5 
## 
## Group means:
##              CL      MSC    MSMAX     MSMIN    STERN    ACROM
## Female 143.7455 33.85455 12.03636  9.163636 26.40000 23.58182
## Male   157.1273 40.45455 14.54545 10.890909 29.32727 27.74545
## 
## Coefficients of linear discriminants:
##               LD1
## CL     0.05298943
## MSC    0.26307584
## MSMAX -0.01241628
## MSMIN  0.08983644
## STERN -0.07804072
## ACROM  0.04018838

predld5 <- predict(ld5, test)

confusionMatrix(predld5$class, test$Sex)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Female Male
##     Female     13    4
##     Male        0    9
##                                           
##                Accuracy : 0.8462          
##                  95% CI : (0.6513, 0.9564)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 0.0002668       
##                                           
##                   Kappa : 0.6923          
##                                           
##  Mcnemar's Test P-Value : 0.1336144       
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.6923          
##          Pos Pred Value : 0.7647          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.5000          
##          Detection Rate : 0.5000          
##    Detection Prevalence : 0.6538          
##       Balanced Accuracy : 0.8462          
##                                           
##        'Positive' Class : Female          
##

beta5 <- ld5$scaling
mu5 <- ld5$means
intercept5 <- -0.5 * sum((mu5[1,] + mu5[2,]) * beta5)
print(intercept5)

## [1] -17.33867

# -17.33867

test_pred <- test %>% mutate(train_pred_DF5 = 0.05298943*CL + 0.26307584*MSC - 0.01241628*MSMAX + 0.08983644*MSMIN - 0.07804072*STERN + 0.04018838*ACROM - 17.33867)

test_pred$mine <- predld5$x

Stepwise Clavicular Variables

ld6 <- MASS::lda(Sex ~ CL+MSC, train)
print(ld6)

## Call:
## lda(Sex ~ CL + MSC, data = train)
## 
## Prior probabilities of groups:
## Female   Male 
##    0.5    0.5 
## 
## Group means:
##              CL      MSC
## Female 143.7455 33.85455
## Male   157.1273 40.45455
## 
## Coefficients of linear discriminants:
##            LD1
## CL  0.04989081
## MSC 0.27010965

predld6 <- predict(ld6, test)

confusionMatrix(predld6$class, test$Sex)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Female Male
##     Female     11    4
##     Male        2    9
##                                           
##                Accuracy : 0.7692          
##                  95% CI : (0.5635, 0.9103)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 0.004678        
##                                           
##                   Kappa : 0.5385          
##                                           
##  Mcnemar's Test P-Value : 0.683091        
##                                           
##             Sensitivity : 0.8462          
##             Specificity : 0.6923          
##          Pos Pred Value : 0.7333          
##          Neg Pred Value : 0.8182          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4231          
##    Detection Prevalence : 0.5769          
##       Balanced Accuracy : 0.7692          
##                                           
##        'Positive' Class : Female          
##

beta6 <- ld6$scaling
mu6 <- ld6$means
intercept6 <- -0.5 * sum((mu6[1,] + mu6[2,]) * beta6)
print(intercept6)

## [1] -17.54119

# -17.54119

test_pred <- test %>% mutate(train_pred_DF6 = 0.04989081*CL + 0.27010965*MSC - 17.54119)

test_pred$mine <- predld6$x

Graphs for Visualizing Classification Rates - from Test

All Variables

plot_data1 <- data.frame(LD1 = predld1$x[,1], Group = test$Sex)

ggplot(plot_data1, aes(x = LD1, fill = Group)) +
  geom_density(alpha = 0.5) +
  labs(y = "Density", x = "Discriminant Function 1", fill="Sex") +
  theme_minimal() + geom_vline(xintercept=0, linetype="dashed") + scale_fill_manual(values = c("#bf987c","#de6612"))

Stepwise All Variables

plot_data2 <- data.frame(LD2 = predld2$x[,1], Group = test$Sex)

ggplot(plot_data2, aes(x = LD2, fill = Group)) +
  geom_density(alpha = 0.5) +
  labs(y = "Density", x = "Discriminant Function 2", fill="Sex") +
  theme_minimal() + geom_vline(xintercept=0, linetype="dashed") + scale_fill_manual(values = c("#c9be97","#e6ba1c"))

All Scapular

plot_data3 <- data.frame(LD3 = predld3$x[,1], Group = test$Sex)

ggplot(plot_data3, aes(x = LD3, fill = Group)) +
  geom_density(alpha = 0.5) +
  labs(y = "Density", x = "Discriminant Function 3", fill="Sex") +
  theme_minimal() + geom_vline(xintercept=0, linetype="dashed") + scale_fill_manual(values = c("#958ebd","#3e23d9"))

Stepwise Scapular

plot_data4 <- data.frame(LD4 = predld4$x[,1], Group = test$Sex)

ggplot(plot_data4, aes(x = LD4, fill = Group)) +
  geom_density(alpha = 0.5) +
  labs(y = "Density", x = "Discriminant Function 4", fill="Sex") +
  theme_minimal() + geom_vline(xintercept=0, linetype="dashed") + scale_fill_manual(values = c("#9780ad","#7030b0"))

All Clavicular

plot_data5 <- data.frame(LD5 = predld5$x[,1], Group = test$Sex)

ggplot(plot_data5, aes(x = LD5, fill = Group)) +
  geom_density(alpha = 0.5) +
  labs(y = "Density", x = "Discriminant Function 5", fill="Sex") +
  theme_minimal() + geom_vline(xintercept=0, linetype="dashed") + scale_fill_manual(values = c("#ba93ba","#b82ab8"))

Stepwise Clavicular

plot_data6 <- data.frame(LD6 = predld6$x[,1], Group = test$Sex)

ggplot(plot_data6, aes(x = LD6, fill = Group)) +
  geom_density(alpha = 0.5) +
  labs(y = "Density", x = "Discriminant Function 6", fill="Sex") +
  theme_minimal() + geom_vline(xintercept=0, linetype="dashed") + scale_fill_manual(values = c("#bf889c","#d9306e"))

References

[1] Kharuhadetch, P., Wattanawaragorn, S., Tiamtongon, C., Wathanyutakon, T., Navic, P., & Mahakkanukrauh, P. (2022). Sex determination using scapular and clavicular parameters in modern Thai population. International Journal of Morphology, 40(3), 768–773. https://doi.org/10.4067/S0717-95022022000300768

[2] Gocha, T. P., Mavroudas, S. R., & Wescott, D. J. (2022). The Texas State Donated Skeletal Collection at the Forensic Anthropology Center at Texas State. Forensic Sciences, 2(1), 7-19. https://doi.org/10.3390/forensicsci2010002

[3] Posit team (2025). RStudio: Integrated Development Environment for R. Posit Software, PBC, Boston, MA. URL http://www.posit.co/.

[4] R Core Team (2017). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. https://www.R-project.prg/.

Skel Methods Final

Mackenzie Murray

2026-05-12

Global Chunk

Setup

Import Data + Munge

Assumptions

Normality

Outliers

Males

Downsizing to Equal Groups (With Outliers)

Implementing Kharuhadetch et al.’s Equations

Sexual Dimorphism

Dicriminant Equations

Predictions

Graphs for Visualizing Classification Rates - from Paper on my Data

All Variables

Stepwise All Variables

All Scapular

Stepwise Scapular

All Clavicular

Stepwise Clavicular

Classification Matrices

All Variables

Stepwise All Variables

All Scapular Variables

Stepwise Scapular Variables

All Clavicular Variables

Stepwise Clavicular Variables

Creation of My Own Equations

Training and Testing

Stepwise Selection

All Variables

Scapular Variables

Clavicular Variables

Discriminant Functions

All Variables

Stepwise All Variables

Scapular Variables

Stepwise Scapular Variables

Clavicular Variables

Stepwise Clavicular Variables

Graphs for Visualizing Classification Rates - from Test

All Variables

Stepwise All Variables

All Scapular

Stepwise Scapular

All Clavicular

Stepwise Clavicular

References