# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 245 Part 1 Day 3 (steven.grambow@duke.edu)
# ~ Hypoxanthine Data
# ~ from: Analysis of
# ~ Biological Data Ch 17
# ~ Problem 14
# ~ Reference:
# ~ James, R. A., Hoadley, P. A., & Sampson, B. G. (1997).
# ~ Determination of postmortem interval by sampling vitreous
# ~ humour. American Journal of Forensic Medicine and
# ~ Pathology, 18(2), 158-162.
# ~ https://doi.org/10.1097/00000433-199706000-00010
#
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Estimation of postmortem interval (PMI) by analyses
# of vitreous humor has certain advantages over analyses
# of blood and cerebrospinal fluid (CSF). Certain
# substances, including potassium and hypoxanthine (Hx),
# have been shown to exhibit postmortem increase in
# concentration in vitreous humor in a linear fashion.
# In the present study, potassium and Hx concentrations
# were measured in 100 subjects with known PMIs.
# Simple linear regression analyses were performed on
# the data collected, and new equations for estimation
# of PMI were constructed.
#
# Data Dictionary -- TimeOfDeath.RData
#
# hours -- Hours post mortem (since death)
# hypoxanthine -- Hypoxanthine concentration (micromol/l)
#
## Download and load the data file = TimeOfDeath.RData
load(url("http://www.duke.edu/~sgrambow/crp241data/TimeOfDeath.RData"))
# Structure of Data
str(TimeOfDeath)
## 'data.frame': 48 obs. of 2 variables:
## $ hours : num 1.5 2.1 2.8 3.9 3.9 3.4 2.9 5.4 7.9 10 ...
## $ hypoxanthine: int 29 28 26 18 11 5 0 3 31 5 ...
# Descriptive Summary of Data
summary(TimeOfDeath)
## hours hypoxanthine
## Min. : 1.50 Min. : 0.00
## 1st Qu.:10.47 1st Qu.: 21.50
## Median :17.70 Median : 52.50
## Mean :20.78 Mean : 67.77
## 3rd Qu.:27.07 3rd Qu.: 97.50
## Max. :81.20 Max. :357.00
hist(TimeOfDeath$hypoxanthine)

boxplot(TimeOfDeath$hypoxanthine)

stripchart(TimeOfDeath$hypoxanthine,
main="Postmortem Hypoxanthine in the Vitreous Humour",
xlab="concentration (micromol/l)",
ylab="Hypoxanthine",
method="jitter",
col="blue",
pch=1)

# Scatter Plot of Hours since death vs Hypoxanthine
plot(TimeOfDeath$hypoxanthine,TimeOfDeath$hours,
main="Time of Death (hours) by Postmortem
Hypoxanthine in the Vitreous Humour",
xlab="Hypoxanthine concentration (micromol/l)",
ylab="Time of Death in hours",
pch=19,col="blue")
grid()
# Here is a simple linear Regression
# modeling hours post mortem as a function of
# hypoxanthine concentration.
hypox.fit <- lm(hours ~ hypoxanthine,data=TimeOfDeath)
summary(hypox.fit)
##
## Call:
## lm(formula = hours ~ hypoxanthine, data = TimeOfDeath)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.2581 -5.6408 0.0767 7.0838 20.6384
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.82552 1.98866 3.432 0.00128 **
## hypoxanthine 0.20596 0.02165 9.512 1.95e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.299 on 46 degrees of freedom
## Multiple R-squared: 0.663, Adjusted R-squared: 0.6556
## F-statistic: 90.48 on 1 and 46 DF, p-value: 1.949e-12
# model yields slope coefficient for hypoxanthine
# = 0.21 with p <0.0001
# Interpretation
# For each 1 unit increase in hypoxanthine concentration,
# the mean time since death increases by .21 hours.
# we can use confint function to generate confidence
# interval for slope estimate
confint(hypox.fit)
## 2.5 % 97.5 %
## (Intercept) 2.8225485 10.8284913
## hypoxanthine 0.1623725 0.2495396
# This yields
# 2.5 % 97.5 %
# (Intercept) 2.8225485 10.8284913
# hypoxanthine 0.1623725 0.2495396
#
# so we have a point estimate for the slope
# with 95% confidence interval
# 0.21 (0.16, .25)
# Here is a fancier scatter plot with regression line
# and (1) confidence bands and (2) prediction bands
# source code can be found here: https://rpubs.com/Bio-Geek/71339
# install.packages("ggplot2")
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.3

# Plotting 95% Confidence Interval
ggplot(TimeOfDeath, aes(x=hypoxanthine, y=hours))+
geom_point()+
geom_smooth(method=lm, se=TRUE)
## `geom_smooth()` using formula 'y ~ x'

# Plotting 95% Confidence Interval and Prediction Interval
# Notice that the confidence bands get wider for larger values
# of x reflecting the density of observed data points across the x axis
temp_var <- predict(hypox.fit, interval="prediction")
## Warning in predict.lm(hypox.fit, interval = "prediction"): predictions on current data refer to _future_ responses
newdata <- cbind(TimeOfDeath,temp_var)
ggplot(newdata, aes(hypoxanthine, hours))+
geom_point() +
geom_line(aes(y=lwr), color = "red", linetype = "dashed")+
geom_line(aes(y=upr), color = "red", linetype = "dashed")+
geom_smooth(method=lm, se=TRUE)
## `geom_smooth()` using formula 'y ~ x'

# Generating predictions
# As we have done with previous data, we can
# generate predictions for the model using the
# predict function
# for predicted means with confidence intervals
# say for hypoxanthine=50
predict(hypox.fit,data.frame(hypoxanthine=50),interval = "confidence")
## fit lwr upr
## 1 17.12332 14.31274 19.9339
# Generates a point estimate and associated
# confidence interval
# fit lwr upr
# 1 17.12332 14.31274 19.9339
# for predicted values with prediction intervals
# say for hypoxanthine=50
predict(hypox.fit,data.frame(hypoxanthine=50),interval = "predict")
## fit lwr upr
## 1 17.12332 -1.804827 36.05147
# fit lwr upr
# 1 17.12332 -1.804827 36.05147
# Notice how much wider the prediction interval is!
# If to be used for a particular murder case, then the
# prediction interval would be the most appropriate
# measure of precision/uncertainty.
# What happens if we remove the outlying point?
no_out.TimeOfDeath <- subset(TimeOfDeath,hypoxanthine<300)
no_out.hypox.fit <- lm(hours ~ hypoxanthine,data=no_out.TimeOfDeath)
summary(no_out.hypox.fit)
##
## Call:
## lm(formula = hours ~ hypoxanthine, data = no_out.TimeOfDeath)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.0848 -6.0189 -0.0512 7.1602 20.7283
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.96337 2.29527 3.034 0.004 **
## hypoxanthine 0.20343 0.02987 6.810 1.95e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.4 on 45 degrees of freedom
## Multiple R-squared: 0.5075, Adjusted R-squared: 0.4966
## F-statistic: 46.37 on 1 and 45 DF, p-value: 1.953e-08
# How do the regression coefficients compare?
# reduced model (w/out outlier) slope = 0.203, p-value = 1.95e-08
# full model slope = 0.206, p-value = 1.95e-12
# very little difference!
# lets plot both regression lines
ggplot(no_out.TimeOfDeath, aes(x=hypoxanthine, y=hours))+
geom_point()+ # data points
geom_smooth(method=lm, se=FALSE) + #regression line for reduced data
geom_smooth(data = TimeOfDeath,method=lm,color="red",linetype=3,se=FALSE) # full data
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

# they are essentially coincident!
# End of Program