For each of the following questions, provide R commands used to solve the problem.
Suppose you are a biologist interested in developing metrics to determine the gender of fossilized kangaroo specimen based on skull measurements. Load the kanga data set from the faraway package. A number of the elements have missing data, and so select out the subset that have all observed measures:
library(faraway)
## Warning: package 'faraway' was built under R version 4.0.3
data(kanga)
kanga2 <- kanga[!is.na(rowSums(kanga[,3:20])),] #filter the dataset adding up cranial size and not including NAs
kanga2$sexnumeric <- as.numeric(kanga2$sex)
#install.packages("scales")
library(scales)
## Warning: package 'scales' was built under R version 4.0.3
kanga2$sexnumeric = rescale(kanga2$sexnumeric) #rescale it to 0 for F and 1 for M
kanga2$cranial.size = rowSums(kanga2[,3:20])
# linear model taking into consideration total cranial size
model = lm(sexnumeric~cranial.size,data=kanga2)
summary(model)
##
## Call:
## lm(formula = sexnumeric ~ cranial.size, data = kanga2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.6842 -0.4075 -0.2155 0.4260 0.8555
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.597e+00 5.566e-01 -2.869 0.00503 **
## cranial.size 1.944e-04 5.303e-05 3.665 0.00040 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.47 on 99 degrees of freedom
## Multiple R-squared: 0.1195, Adjusted R-squared: 0.1106
## F-statistic: 13.43 on 1 and 99 DF, p-value: 4e-04
model$coefficients
## (Intercept) cranial.size
## -1.5970735231 0.0001943523
plot(kanga2$sexnumeric,kanga2$cranial.size)
#abline(10191.2, 614.7) we can't do an abline in this way
kangaprediction = model$fitted.values>0.5
predicted_males = sum(kangaprediction)
real_males = sum(kanga2$sexnumeric)
# linear model taking into consideration different cranial zones
model2 = lm(sexnumeric~basilar.length+occipitonasal.length+palate.length+palate.width+nasal.length+nasal.width+squamosal.depth+lacrymal.width+zygomatic.width+orbital.width+.rostral.width+occipital.depth+crest.width+foramina.length+mandible.length+mandible.width+mandible.depth+ramus.height, data=kanga2)
summary(model2)
##
## Call:
## lm(formula = sexnumeric ~ basilar.length + occipitonasal.length +
## palate.length + palate.width + nasal.length + nasal.width +
## squamosal.depth + lacrymal.width + zygomatic.width + orbital.width +
## .rostral.width + occipital.depth + crest.width + foramina.length +
## mandible.length + mandible.width + mandible.depth + ramus.height,
## data = kanga2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.71675 -0.31868 -0.09668 0.30148 0.91041
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.9801219 0.9848991 -3.026 0.003312 **
## basilar.length 0.0010878 0.0026435 0.412 0.681779
## occipitonasal.length 0.0051656 0.0021280 2.427 0.017394 *
## palate.length 0.0021677 0.0026806 0.809 0.421046
## palate.width -0.0008720 0.0028873 -0.302 0.763400
## nasal.length -0.0081358 0.0020655 -3.939 0.000171 ***
## nasal.width 0.0080635 0.0044446 1.814 0.073301 .
## squamosal.depth 0.0001512 0.0041188 0.037 0.970808
## lacrymal.width -0.0061008 0.0040909 -1.491 0.139718
## zygomatic.width 0.0027272 0.0027402 0.995 0.322539
## orbital.width 0.0054834 0.0034168 1.605 0.112368
## .rostral.width -0.0017068 0.0030700 -0.556 0.579753
## occipital.depth 0.0016855 0.0026351 0.640 0.524196
## crest.width -0.0049138 0.0020774 -2.365 0.020371 *
## foramina.length -0.0020106 0.0033836 -0.594 0.553991
## mandible.length -0.0033901 0.0025522 -1.328 0.187762
## mandible.width 0.0100053 0.0078866 1.269 0.208155
## mandible.depth -0.0026807 0.0049569 -0.541 0.590114
## ramus.height -0.0031911 0.0026440 -1.207 0.230935
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4166 on 82 degrees of freedom
## Multiple R-squared: 0.4269, Adjusted R-squared: 0.3011
## F-statistic: 3.394 on 18 and 82 DF, p-value: 8.044e-05
#colnames(kanga2)
kangaprediction_2 = model2$fitted.values > 0.5
predicted_males_2 = sum(kangaprediction_2)
model3 = lm(sexnumeric~occipitonasal.length+palate.length+nasal.length+nasal.width+crest.width, data=kanga2) #linear model of only significant slopes of model 2
summary(model3)
##
## Call:
## lm(formula = sexnumeric ~ occipitonasal.length + palate.length +
## nasal.length + nasal.width + crest.width, data = kanga2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8605 -0.3563 -0.1024 0.3797 1.0180
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.952758 0.831711 -2.348 0.02095 *
## occipitonasal.length 0.005261 0.001560 3.372 0.00108 **
## palate.length -0.001297 0.001359 -0.954 0.34246
## nasal.length -0.007810 0.001633 -4.783 6.31e-06 ***
## nasal.width 0.004863 0.003123 1.557 0.12278
## crest.width -0.003661 0.001628 -2.249 0.02685 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4267 on 95 degrees of freedom
## Multiple R-squared: 0.3034, Adjusted R-squared: 0.2667
## F-statistic: 8.274 on 5 and 95 DF, p-value: 1.6e-06
kangaprediction_3 = model3$fitted.values > 0.5
predicted_males_3 = sum(kangaprediction_3)
kanga2$predicted = kangaprediction_2 + 0
df = data.frame(kangaprediction1 = kangaprediction + 0, kangaprediction2 = kangaprediction_2+0,kangaprediction3 = kangaprediction_3 + 0, fitted_values1 = model$fitted.values, fitted_values2 = model2$fitted.values,fitted_values3 = model3$fitted.values)
library(vioplot)
## Warning: package 'vioplot' was built under R version 4.0.2
## Loading required package: sm
## Warning: package 'sm' was built under R version 4.0.2
## Package 'sm', version 2.2-5.6: type help(sm) for summary information
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.0.2
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
par(mfrow=c(2,1))
vioplot(df[,1],df[,2],df[,3], kanga2$sexnumeric,
col="gold",names=c('prediction','2nd prediction ','3rd prediction', 'real_distribution'))
title(main="Predicted distributions")
vioplot(df[,4],df[,5],df[,6],col="lightblue",names=c('fitted 1st prediction','fitted 2nd prediction','fitted 3rd prediction'))
title(main="Likelihood of being female (<0.5) or male (>0.5) ")
plot(kanga2$sexnumeric, col = c('red','blue')[df[,3]+1], pch = 24,cex=1.1)
x = 1:length(kanga2$sexnumeric)
par(mfrow=c(2,2))
cdplot(as.factor(kanga2$sexnumeric) ~ as.factor(x),xlab = 'Number of cases',ylab = 'Male Female', ylim = c(0,1),col = c('lightblue','red'),main='True cases distribution')
cdplot(as.factor(df[,1]) ~ as.factor(x),xlab = 'Number of cases',ylab = 'Male Female', ylim = c(0,1),col = c('lightblue','red'),main='Predicted cases skull size only')
cdplot(as.factor(df[,2]) ~ as.factor(x),xlab = 'Number of cases',ylab = 'Male Female', ylim = c(0,1),col = c('lightblue','red'),main='Predicted cases all variables')
cdplot(as.factor(df[,3]) ~ as.factor(x),xlab = 'Number of cases',ylab = 'Male Female', ylim = c(0,1),col = c('lightblue','red'),main='Predicted cases optimal variables')
# here we can appreciate how the first predicted distribution that only takes into account the sum of all cranial measurements (which could roughly be the size of the skull) tends to predict more females (males = 34) compared to the real data (males = 44) with a 22.73% error (100 - 34/44*100). On the other hand, when we take into account all cranial measurements we should hypothetically get a better model that estimates distributions closer to the true values, indeed we can see that the error is only 15.90 (100 - 37/44 * 100).Lastly, model 3 takes into account only those variables that were significant in model 2 and predicts 38 out of 44 cases, however if we check the last graph we see that the distribution is "flatter" than model 2 and may be more erroneous. Thus, it is not only important to accout for rough estimation in numbers but also to precision of those estimates!
library(car)
## Warning: package 'car' was built under R version 4.0.2
## Loading required package: carData
## Registered S3 methods overwritten by 'car':
## method from
## influence.merMod lme4
## cooks.distance.influence.merMod lme4
## dfbeta.influence.merMod lme4
## dfbetas.influence.merMod lme4
##
## Attaching package: 'car'
## The following objects are masked from 'package:faraway':
##
## logit, vif
res = model2$residuals
qqPlot(res,main='Normally distributed residuals',ylab = 'best model residuals')
## 19 56
## 14 41
#infact if we do some descriptive statistics we see that for the real cases distribution and the predicted values with all variables 52 Females were predicted optimally out of 57 and 32 out of 44 Males!
table(kanga2$sex,kanga2$predicted) #where 0 = female 1 = male
##
## 0 1
## Female 52 5
## Male 12 32
accuracy_males_percent = 32/44 * 100
accuracy_females_percent = 52/57 * 100
cat(paste('Percentage accuracy for males:',round(accuracy_males_percent,2),'\nPercentage accuracy for females:',round(accuracy_females_percent,2)))
## Percentage accuracy for males: 72.73
## Percentage accuracy for females: 91.23
# Predicting a categorical variable with a continuous one seems to be challenging because it's very prone to error being either 0 or 1. This means that even though the predictor might be a good one a slight error can result in a totally wrong prediction due to it's binary nature.
For this problem, you will build a new model, predicting palate.width all the other variables, like this:
k.lm2 <- lm(palate.width~basilar.length+occipitonasal.length+palate.length+
nasal.length+nasal.width + squamosal.depth+
lacrymal.width + zygomatic.width + orbital.width +
.rostral.width + occipital.depth + crest.width+
foramina.length+ mandible.length +mandible.width+
mandible.depth + ramus.height ,data=kanga2)
pal.lmodel = lm(palate.width~basilar.length + nasal.width + zygomatic.width + crest.width ,data=kanga2)
summary(k.lm2)
##
## Call:
## lm(formula = palate.width ~ basilar.length + occipitonasal.length +
## palate.length + nasal.length + nasal.width + squamosal.depth +
## lacrymal.width + zygomatic.width + orbital.width + .rostral.width +
## occipital.depth + crest.width + foramina.length + mandible.length +
## mandible.width + mandible.depth + ramus.height, data = kanga2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31.106 -9.908 -0.329 10.341 41.875
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 21.23209 37.36968 0.568 0.57146
## basilar.length -0.20908 0.09784 -2.137 0.03555 *
## occipitonasal.length 0.01861 0.08087 0.230 0.81855
## palate.length 0.08261 0.10150 0.814 0.41804
## nasal.length -0.06187 0.07823 -0.791 0.43126
## nasal.width 0.43610 0.16204 2.691 0.00861 **
## squamosal.depth 0.03677 0.15653 0.235 0.81488
## lacrymal.width -0.17336 0.15435 -1.123 0.26461
## zygomatic.width 0.27819 0.09960 2.793 0.00648 **
## orbital.width -0.02816 0.12986 -0.217 0.82885
## .rostral.width 0.02716 0.11667 0.233 0.81647
## occipital.depth 0.08008 0.09979 0.802 0.42460
## crest.width -0.15513 0.07712 -2.012 0.04750 *
## foramina.length 0.03151 0.12858 0.245 0.80704
## mandible.length 0.18689 0.09483 1.971 0.05209 .
## mandible.width -0.10714 0.29959 -0.358 0.72153
## mandible.depth -0.19189 0.18726 -1.025 0.30848
## ramus.height -0.03194 0.10046 -0.318 0.75133
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.84 on 83 degrees of freedom
## Multiple R-squared: 0.7763, Adjusted R-squared: 0.7305
## F-statistic: 16.94 on 17 and 83 DF, p-value: < 2.2e-16
r_squared = 0.7763
adj_r_squared = 0.7305
F_ = 16.94
dfs = 17
summary(pal.lmodel)
##
## Call:
## lm(formula = palate.width ~ basilar.length + nasal.width + zygomatic.width +
## crest.width, data = kanga2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.340 -9.419 0.027 10.749 40.712
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.24319 25.91294 0.280 0.78045
## basilar.length -0.09585 0.03515 -2.727 0.00759 **
## nasal.width 0.40538 0.09045 4.482 2.05e-05 ***
## zygomatic.width 0.37491 0.05781 6.486 3.83e-09 ***
## crest.width -0.26861 0.05998 -4.479 2.07e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.81 on 96 degrees of freedom
## Multiple R-squared: 0.742, Adjusted R-squared: 0.7312
## F-statistic: 69.01 on 4 and 96 DF, p-value: < 2.2e-16
r_squared_two = 0.742
adj_r_squared_two = 0.7312
F_two = 69.01
dfs_two = 4
cat('Multiple R^2 first model:', r_squared,' Adjusted R^2 first model:', adj_r_squared,'df:',dfs,'F:',F_,'\nMultiple R^2 second model:',r_squared_two,' Adjusted R^2 second model:', adj_r_squared_two,'df:',dfs_two,'F:',F_two)
## Multiple R^2 first model: 0.7763 Adjusted R^2 first model: 0.7305 df: 17 F: 16.94
## Multiple R^2 second model: 0.742 Adjusted R^2 second model: 0.7312 df: 4 F: 69.01
# the R squared in the first model tells us that 77.63% of variability in palate.width can be explained by our model that takes into consideration those 17 variables and t's a measure of fit of the chosen linear regression model. However, the adjusted R squared is adjusted to the number of independent variables taken into account and acts as a more comparative tool that one could use to choose a linear model that may work well enough or even better with less (more suited) variables. R squared tends to increase with more variables even though those variables may not be really relevant to our prediction model, thus we need to adjust it to face that "overestimation". In our case here picking up less variables seems to be better because our adjusted R squared is slightly better and after all we need to collect less variables and as a result our model is optimized.
# The F statistic value reports whether you can reject or not the null hypothesis that all of the regression coefficients are equal to zero. Basically, it tells you if the coefficients you are using have improved the model's fit. In our example for the first model we can report a pretty low F value which is lower compared to it's degrees of freedom, which means that there are many unrelevant variables that you should get rid of. The F value of our second model is pretty significant being more than 17 times larger than the degrees of freedom, which means that the coefficients of the variables we have selected largely improved model's fit.
cat('t-basilar.length:',-2.727,'\nt-nasal.width:', 4.482,'\nt-zygomatic.width:',6.486,'\nt-ncrest.width:',-4.479)
## t-basilar.length: -2.727
## t-nasal.width: 4.482
## t-zygomatic.width: 6.486
## t-ncrest.width: -4.479
# T-values here are the coefficients divided by their standard errors. This means that if the magnitude of the coefficients are even but if one has higher standard error then the T value would be lower. The T-value can be interpred as a measure of precision with which beta (the regression coefficient) is measured. Generally, a t-value bigger than 2 or smaller than 2 means good estimates. The basilar length and crest width are negative, indicating a change in the directionality of the effect (smaller basilar length and crest width --> bigger palate width) and the rest are positive.
library(BayesFactor)
## Warning: package 'BayesFactor' was built under R version 4.0.2
## Loading required package: coda
## Warning: package 'coda' was built under R version 4.0.2
## Loading required package: Matrix
## ************
## Welcome to BayesFactor 0.9.12-4.2. If you have questions, please contact Richard Morey (richarddmorey@gmail.com).
##
## Type BFManual() to open the manual.
## ************
regressionBF = regressionBF(palate.width~basilar.length + nasal.width + zygomatic.width + crest.width ,data=kanga2)
regressionBF
## Bayes factor analysis
## --------------
## [1] basilar.length : 5.35961e+16 ±0.01%
## [2] nasal.width : 7.061832e+13 ±0.01%
## [3] zygomatic.width : 2.307045e+17 ±0.01%
## [4] crest.width : 2484187 ±0.01%
## [5] basilar.length + nasal.width : 2.289361e+18 ±0%
## [6] basilar.length + zygomatic.width : 8.474499e+17 ±0%
## [7] basilar.length + crest.width : 3.743802e+16 ±0%
## [8] nasal.width + zygomatic.width : 1.997991e+22 ±0%
## [9] nasal.width + crest.width : 1.943763e+14 ±0%
## [10] zygomatic.width + crest.width : 1.314541e+22 ±0%
## [11] basilar.length + nasal.width + zygomatic.width : 2.232303e+21 ±0%
## [12] basilar.length + nasal.width + crest.width : 6.234527e+17 ±0%
## [13] basilar.length + zygomatic.width + crest.width : 2.205269e+21 ±0%
## [14] nasal.width + zygomatic.width + crest.width : 5.162141e+23 ±0%
## [15] basilar.length + nasal.width + zygomatic.width + crest.width : 1.980533e+24 ±0%
##
## Against denominator:
## Intercept only
## ---
## Bayes factor type: BFlinearModel, JZS
plot(regressionBF)
# as you can see here in the plot our Bayesian regression indicates that the best model is the one that takes into consideration all our passed variables, with the Bayes factor vs a null model = 1.980533e+24. Meanin that these 4 variables as predictors can account for the data 1.980533e+24 times better than a model with no predictor variables.
single_predictor = lm(palate.width ~ zygomatic.width,data=kanga2)
summary(single_predictor)
##
## Call:
## lm(formula = palate.width ~ zygomatic.width, data = kanga2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.403 -13.889 0.321 11.648 52.921
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -46.56515 25.78112 -1.806 0.0739 .
## zygomatic.width 0.34387 0.02913 11.806 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.76 on 99 degrees of freedom
## Multiple R-squared: 0.5847, Adjusted R-squared: 0.5805
## F-statistic: 139.4 on 1 and 99 DF, p-value: < 2.2e-16
sex_interaction = lm(palate.width ~ zygomatic.width:sex,data=kanga2)
summary(sex_interaction)
##
## Call:
## lm(formula = palate.width ~ zygomatic.width:sex, data = kanga2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40.344 -14.470 -0.003 11.255 52.387
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -43.61111 27.71149 -1.574 0.119
## zygomatic.width:sexFemale 0.33988 0.03215 10.573 <2e-16 ***
## zygomatic.width:sexMale 0.34131 0.03048 11.199 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.85 on 98 degrees of freedom
## Multiple R-squared: 0.5851, Adjusted R-squared: 0.5766
## F-statistic: 69.09 on 2 and 98 DF, p-value: < 2.2e-16
distinctive_variables = lm(palate.width ~ zygomatic.width + sexnumeric + zygomatic.width:sexnumeric ,data=kanga2)
summary(distinctive_variables) #here I used sex numeric because when I tried to use sex it would erroneously only display sexMale (even though it was accounting for both)
##
## Call:
## lm(formula = palate.width ~ zygomatic.width + sexnumeric + zygomatic.width:sexnumeric,
## data = kanga2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -44.566 -12.005 0.166 10.649 53.469
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -107.68232 48.40199 -2.225 0.0284 *
## zygomatic.width 0.41387 0.05598 7.393 5.1e-11 ***
## sexnumeric 94.57924 58.80709 1.608 0.1110
## zygomatic.width:sexnumeric -0.10592 0.06692 -1.583 0.1167
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.69 on 97 degrees of freedom
## Multiple R-squared: 0.5958, Adjusted R-squared: 0.5833
## F-statistic: 47.67 on 3 and 97 DF, p-value: < 2.2e-16
# If we factor in zygomatic width, sex, and their interaction we notice that sex does not play a significant role in estimation of palate width, not even when in interaction with the zygomatic width. However, one must be careful in choosing the appropriate model.In fact, if we were to only directly factor in the interaction between sex and zygomatic width for palate width estimation we would have found it to be a good predictor because the zygomatic width is such a strong variable predictor that it would "drag" and shade the sex variable in it (which is not a good predictor).