# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 245 Part 1 Day 3 (steven.grambow@duke.edu)
# ~ Hypoxanthine Data    
# ~ from: Analysis of 
# ~ Biological Data Ch 17
# ~ Problem 14
# ~ Reference:
# ~ James, R. A., Hoadley, P. A., & Sampson, B. G. (1997). 
# ~ Determination of postmortem interval by sampling vitreous 
# ~ humour. American Journal of Forensic Medicine and 
# ~ Pathology, 18(2), 158-162. 
# ~ https://doi.org/10.1097/00000433-199706000-00010
# 
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Estimation of postmortem interval (PMI) by analyses 
# of vitreous humor has certain advantages over analyses 
# of blood and cerebrospinal fluid (CSF). Certain 
# substances, including potassium and hypoxanthine (Hx), 
# have been shown to exhibit postmortem increase in 
# concentration in vitreous humor in a linear fashion. 
# In the present study, potassium and Hx concentrations 
# were measured in 100 subjects with known PMIs. 
# Simple linear regression analyses were performed on 
# the data collected, and new equations for estimation 
# of PMI were constructed. 
# 
# Data Dictionary -- TimeOfDeath.RData
# 
# hours --          Hours post mortem (since death)
# hypoxanthine --   Hypoxanthine concentration (micromol/l) 
#
## Download and load the data file = TimeOfDeath.RData
load(url("http://www.duke.edu/~sgrambow/crp241data/TimeOfDeath.RData"))

# Structure of Data
str(TimeOfDeath)
## 'data.frame':    48 obs. of  2 variables:
##  $ hours       : num  1.5 2.1 2.8 3.9 3.9 3.4 2.9 5.4 7.9 10 ...
##  $ hypoxanthine: int  29 28 26 18 11 5 0 3 31 5 ...
# Descriptive Summary of Data
summary(TimeOfDeath)
##      hours        hypoxanthine   
##  Min.   : 1.50   Min.   :  0.00  
##  1st Qu.:10.47   1st Qu.: 21.50  
##  Median :17.70   Median : 52.50  
##  Mean   :20.78   Mean   : 67.77  
##  3rd Qu.:27.07   3rd Qu.: 97.50  
##  Max.   :81.20   Max.   :357.00
hist(TimeOfDeath$hypoxanthine)

boxplot(TimeOfDeath$hypoxanthine)

stripchart(TimeOfDeath$hypoxanthine,
           main="Postmortem Hypoxanthine in the Vitreous Humour",
           xlab="concentration (micromol/l)",
           ylab="Hypoxanthine",
           method="jitter",
           col="blue",
           pch=1)

# Scatter Plot of Hours since death vs Hypoxanthine

plot(TimeOfDeath$hypoxanthine,TimeOfDeath$hours,
     main="Time of Death (hours) by Postmortem 
     Hypoxanthine in the Vitreous Humour",
     xlab="Hypoxanthine concentration (micromol/l)",
     ylab="Time of Death in hours",
     pch=19,col="blue")
grid()

# Here is a simple linear Regression
# modeling hours post mortem as a function of 
# hypoxanthine concentration.

hypox.fit <- lm(hours ~ hypoxanthine,data=TimeOfDeath)
summary(hypox.fit)
## 
## Call:
## lm(formula = hours ~ hypoxanthine, data = TimeOfDeath)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21.2581  -5.6408   0.0767   7.0838  20.6384 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   6.82552    1.98866   3.432  0.00128 ** 
## hypoxanthine  0.20596    0.02165   9.512 1.95e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.299 on 46 degrees of freedom
## Multiple R-squared:  0.663,  Adjusted R-squared:  0.6556 
## F-statistic: 90.48 on 1 and 46 DF,  p-value: 1.949e-12
# model yields slope coefficient for hypoxanthine 
# = 0.21 with p <0.0001
# Interpretation
# For each 1 unit increase in hypoxanthine concentration, 
# the mean time since death increases by .21 hours.
# we can use confint function to generate confidence 
# interval for slope estimate

confint(hypox.fit)
##                  2.5 %     97.5 %
## (Intercept)  2.8225485 10.8284913
## hypoxanthine 0.1623725  0.2495396
# This yields 
#                2.5 %     97.5 %
# (Intercept)  2.8225485 10.8284913
# hypoxanthine 0.1623725  0.2495396
# 
# so we have a point estimate for the slope
# with 95% confidence interval
# 0.21 (0.16, .25)

# Here is a fancier scatter plot with regression line 
# and (1) confidence bands and (2) prediction bands
# source code can be found here: https://rpubs.com/Bio-Geek/71339

# install.packages("ggplot2")
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.3

# Plotting 95% Confidence Interval
ggplot(TimeOfDeath, aes(x=hypoxanthine, y=hours))+
  geom_point()+
  geom_smooth(method=lm, se=TRUE)
## `geom_smooth()` using formula 'y ~ x'

# Plotting 95% Confidence Interval and Prediction Interval
# Notice that the confidence bands get wider for larger values
# of x reflecting the density of observed data points across the x axis

temp_var <- predict(hypox.fit, interval="prediction")
## Warning in predict.lm(hypox.fit, interval = "prediction"): predictions on current data refer to _future_ responses
newdata <- cbind(TimeOfDeath,temp_var)

ggplot(newdata, aes(hypoxanthine, hours))+
  geom_point() +
  geom_line(aes(y=lwr), color = "red", linetype = "dashed")+
  geom_line(aes(y=upr), color = "red", linetype = "dashed")+
  geom_smooth(method=lm, se=TRUE)
## `geom_smooth()` using formula 'y ~ x'

# Generating predictions
# As we have done with previous data, we can
# generate predictions for the model using the 
# predict function

# for predicted means with confidence intervals
# say for hypoxanthine=50
predict(hypox.fit,data.frame(hypoxanthine=50),interval = "confidence")
##        fit      lwr     upr
## 1 17.12332 14.31274 19.9339
# Generates a point estimate and associated 
# confidence interval
#      fit      lwr     upr
# 1 17.12332 14.31274 19.9339

# for predicted values with prediction intervals
# say for hypoxanthine=50

predict(hypox.fit,data.frame(hypoxanthine=50),interval = "predict")
##        fit       lwr      upr
## 1 17.12332 -1.804827 36.05147
#       fit       lwr      upr
# 1 17.12332 -1.804827 36.05147

# Notice how much wider the prediction interval is!
# If to be used for a particular murder case, then the 
# prediction interval would be the most appropriate 
# measure of precision/uncertainty.

# What happens if we remove the outlying point?
no_out.TimeOfDeath <- subset(TimeOfDeath,hypoxanthine<300)
no_out.hypox.fit <- lm(hours ~ hypoxanthine,data=no_out.TimeOfDeath)
summary(no_out.hypox.fit)
## 
## Call:
## lm(formula = hours ~ hypoxanthine, data = no_out.TimeOfDeath)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21.0848  -6.0189  -0.0512   7.1602  20.7283 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   6.96337    2.29527   3.034    0.004 ** 
## hypoxanthine  0.20343    0.02987   6.810 1.95e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.4 on 45 degrees of freedom
## Multiple R-squared:  0.5075, Adjusted R-squared:  0.4966 
## F-statistic: 46.37 on 1 and 45 DF,  p-value: 1.953e-08
# How do the regression coefficients compare?
# reduced model (w/out outlier) slope = 0.203, p-value = 1.95e-08
# full model                    slope = 0.206, p-value = 1.95e-12
# very little difference!

# lets plot both regression lines
 
ggplot(no_out.TimeOfDeath, aes(x=hypoxanthine, y=hours))+
  geom_point()+ # data points
  geom_smooth(method=lm, se=FALSE) + #regression line for reduced data
  geom_smooth(data = TimeOfDeath,method=lm,color="red",linetype=3,se=FALSE) # full data
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

# they are essentially coincident!
# End of Program