# ref: Intro to data analysis with R for Forensic Scientists Michael Currant
library(dafs)
## Loading required package: s20x
library(help=dafs)
data("bottle.df")
head(bottle.df)
##   Number Part   Mn    Ba    Sr   Zr    Ti
## 1      1 Neck 56.1 170.7 145.1 77.4 267.4
## 2      1 Neck 53.8 166.2 143.3 71.6 270.0
## 3      1 Neck 58.7 184.2 156.5 78.2 286.4
## 4      1 Neck 54.6 170.5 158.1 75.3 273.6
## 5      1 Neck 58.6 185.2 161.3 83.9 289.9
## 6      1 Body 56.8 180.5 146.7 79.2 274.0
tail(bottle.df)
##     Number     Part   Mn    Ba    Sr    Zr    Ti
## 115      6     Base 67.2 208.9 180.2 125.7 326.1
## 116      6 Shoulder 64.9 197.8 172.1  81.5 304.2
## 117      6 Shoulder 52.4 158.9 137.1  67.8 254.6
## 118      6 Shoulder 61.9 189.8 165.4  80.8 302.1
## 119      6 Shoulder 57.8 174.9 160.4  77.9 278.2
## 120      6 Shoulder 63.9 198.2 176.4  90.9 309.5
str(bottle.df)
## 'data.frame':    120 obs. of  7 variables:
##  $ Number: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Part  : Factor w/ 4 levels "Base","Body",..: 3 3 3 3 3 2 2 2 2 2 ...
##  $ Mn    : num  56.1 53.8 58.7 54.6 58.6 56.8 54.6 59.2 54.9 61.1 ...
##  $ Ba    : num  171 166 184 170 185 ...
##  $ Sr    : num  145 143 156 158 161 ...
##  $ Zr    : num  77.4 71.6 78.2 75.3 83.9 79.2 74.6 78.9 80.2 90.2 ...
##  $ Ti    : num  267 270 286 274 290 ...
bottle.s <- split(bottle.df$Ba,bottle.df$Number)
sapply(bottle.s, mean)
##       1       2       3       4       5       6 
## 181.920 183.310 172.555 187.150 138.730 189.880
data(gc.df)
str(gc.df)
## 'data.frame':    612 obs. of  5 variables:
##  $ id      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ gc1     : Factor w/ 3 levels "A","B","C": 2 2 2 2 3 2 1 1 3 2 ...
##  $ gc2     : Factor w/ 3 levels "A","B","C": 2 2 3 3 3 3 2 1 3 2 ...
##  $ racecode: Factor w/ 3 levels "A","C","H": 1 1 1 1 1 1 1 1 1 1 ...
##  $ genotype: Factor w/ 6 levels "AA","AB","AC",..: 4 4 5 5 6 5 2 1 6 4 ...
head(gc.df)
##   id gc1 gc2 racecode genotype
## 1  1   B   B        A       BB
## 2  2   B   B        A       BB
## 3  3   B   C        A       BC
## 4  4   B   C        A       BC
## 5  5   C   C        A       CC
## 6  6   B   C        A       BC
# ?gc.df
tbl <- table(gc.df$genotype)
tbl
## 
##  AA  AB  AC  BB  BC  CC 
##  29  72 114 136 137 124
midpts <- barplot(tbl,ylim = c(0,1),xlab = "Genotype",
        ylab = "Proportion")
cumppn <- cumsum(tbl)
lines(midpts,cumppn)
box()

tbl1 <- xtabs(~racecode + genotype,data = gc.df);tbl1
##         genotype
## racecode  AA  AB  AC  BB  BC  CC
##        A   1  28   6 117  46   7
##        C  19  24  60   4  29  63
##        H   9  20  48  15  62  54
rowSums(tbl1)
##   A   C   H 
## 205 199 208
tbl2 <- tbl1/rowSums(tbl1);round(tbl2,4)
##         genotype
## racecode     AA     AB     AC     BB     BC     CC
##        A 0.0049 0.1366 0.0293 0.5707 0.2244 0.0341
##        C 0.0955 0.1206 0.3015 0.0201 0.1457 0.3166
##        H 0.0433 0.0962 0.2308 0.0721 0.2981 0.2596
barplot(tbl, beside = T, ylim = c(0,1), xlab = "Genotype", ylab = "Proportion", col = c("blue", "yellow", "red")) 
box()

# chapter 5 linear regression
data("dpd.df")
head(dpd.df)
##   sample age sex molar dpd.ratio est.age assoc.error real.error
## 1      1  15   F    46      0.62    24.0        15.1       -9.0
## 2      2  17   F    46      0.64    24.4        15.1       -7.4
## 3      3  17   F    48      1.14    35.0        14.7       -1.8
## 4      4  17   M    48      1.04    32.9        14.7      -15.9
## 5      5  19   F    28      1.07    33.5        14.7      -14.5
## 6      6  20   F    48      1.83    49.7        14.8      -29.7
model <- lm(age~dpd.ratio,data = dpd.df)
summary(model)
## 
## Call:
## lm(formula = age ~ dpd.ratio, data = dpd.df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -29.8056  -8.6384  -0.0186   8.4226  25.8918 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   10.826      7.778   1.392 0.179272    
## dpd.ratio     21.301      5.011   4.251 0.000391 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.34 on 20 degrees of freedom
## Multiple R-squared:  0.4746, Adjusted R-squared:  0.4484 
## F-statistic: 18.07 on 1 and 20 DF,  p-value: 0.0003914
plot(x=dpd.df$dpd.ratio,y=dpd.df$age);abline(model)

plot(model)

# 5.3.4 Tutorial
# Load the bottle data and ???t a simple linear regression model for manganese as a function of barium
data(bottle.df)
names(bottle.df)
## [1] "Number" "Part"   "Mn"     "Ba"     "Sr"     "Zr"     "Ti"
mn.fit <- lm(Mn~Ba,data = bottle.df)
summary(mn.fit)
## 
## Call:
## lm(formula = Mn ~ Ba, data = bottle.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.5117 -1.5623 -0.2248  1.2480  5.3408 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  12.0683     1.7865   6.755  5.7e-10 ***
## Ba            0.2627     0.0101  26.002  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.308 on 118 degrees of freedom
## Multiple R-squared:  0.8514, Adjusted R-squared:  0.8501 
## F-statistic: 676.1 on 1 and 118 DF,  p-value: < 2.2e-16
# 2. Plot the ???tted line on the data. 
plot(Mn~Ba,data = bottle.df)
abline(mn.fit)

plot(mn.fit)

# 3. Add a smoothing line to the data as well, and color it blue.
with(bottle.df,lines(lowess(Ba,Mn),col="blue"))

# 4. Plot the residuals versus the ???tted values and a normal Q-Q plot of the residuals. 
plot(mn.fit,which = 1:2)

# 5. Repeat the previous step, but this time extract the residuals and ???tted values
# from the ???tted object mn.fit using the residuals, fitted and qqnorm commands.
res <- residuals(mn.fit)
pred <- fitted(mn.fit)
plot(pred,res,xlab = "Fitted Values",
     ylab = "Residuals")
abline(h=0,lty=2)

qqnorm(res)
# 6. We use the qqline function in conjuction with qqnorm to construct
# a normal Q-Q plot with a straight line on it. 
qqnorm(res)
qqline(res)

# 7. It is instructive to do the previous task by hand. To do this,
# we need to sort the residuals into order, and get the corresponding
# quantiles from the normal distribution. We can do this with the ppoints and qnorm commands. 
res <- sort(residuals(mn.fit))
n <- length(res)
z <- qnorm(ppoints(n))
plot(z,res,xlab = "Theoretical Quantiles",ylab = "Sample Quantiles",
     main = "Normal QQ plot")
abline(lm(res~z))

# 8. Plot a histogram of the residuals and superimpose the normal distribution on it. 
res <- residuals(mn.fit)
mx <- mean(res)
coefficients(mn.fit)
## (Intercept)          Ba 
##  12.0682759   0.2626991
sx <- summary(mn.fit)$sigma
sx
## [1] 2.307636
ls(mn.fit)#Extract the estimated standard deviation of the errors, the "residual standard deviation"
##  [1] "assign"        "call"          "coefficients"  "df.residual"  
##  [5] "effects"       "fitted.values" "model"         "qr"           
##  [9] "rank"          "residuals"     "terms"         "xlevels"
sigma(mn.fit)
## [1] 2.307636
hist(res,probability = TRUE,xlab = "Residuals",
     main = "")
x <- seq(min(res) - 0.5*sx,max(res) + 0.5*sx,length=200)
y <- dnorm(x,mx,sx)
lines(x,y,lty=2)
box()

# 9. The s20x packagehasafunctionthatsimpli???esthelasttwostepscalled normcheck.
# normcheck can take either a single data set, a formula, or an lm object.
# So, for example, we can type
normcheck(mn.fit)

# 10. Plot a 95% con???dence interval for the regression line for the mean and
# then for a new predicted value. 
plot(Mn~Ba,data = bottle.df)
new.Ba <- seq(min(bottle.df$Ba),max(bottle.df$Ba),length=200)
mn.pred.ci <- predict(mn.fit,data.frame(Ba=new.Ba),
                      interval = "confidence")
mn.pred.pred <- predict(mn.fit,data.frame(Ba=new.Ba),
                        interval = "prediction")
lines(new.Ba,mn.pred.ci[,1])
lines(new.Ba,mn.pred.ci[,2],lty=2,col="blue")
lines(new.Ba,mn.pred.ci[,3],lty=2,col="blue")
lines(new.Ba,mn.pred.pred[,2],lty=2,col="red")
lines(new.Ba,mn.pred.pred[,3],lty=2,col="red")

# "What is the average concentration of manganese in samples
# where the barium concentration is 180?" then we should give
# a prediction with a con???dence interval for the mean, i.e., 
new.Ba <- 180
mn.pred <- predict(mn.fit,data.frame(Ba=new.Ba),
                   interval = "confidence")
mn.pred
##        fit      lwr     upr
## 1 59.35412 58.92773 59.7805
# If the question is, "I have just collected a new sample and 
# the barium value is 180. What is a reasonable predicted value
# and range for the manganese concentration?" then we should 
# give a prediction interval for a new value, i.e., 
new.Ba <- 180
mn.pred <- predict(mn.fit,data.frame(Ba=new.Ba),
                   interval = "prediction")
mn.pred
##        fit      lwr      upr
## 1 59.35412 54.76452 63.94371
# We would say "given the barium concentration is 180, 
# we expect the manganese concentration in the new sample
# to be about 59.4 and it will lie between 54.8 and 63.9 with
# 95% con???dence." Notice that both situations have the same predicted values
# This says the expected value(mean) of a single value froma distribution 
# and expected value (mean) of the sample mean of a sample
# from the same distribution are both the same. 
# 11. Plot age versus the deoxypyridinoline (DPD) ratio in the DPD data set.
# Add a smoothing line to the plot. We will change the amount of smoothing 
# because we have a relatively small amount of data.
data("dpd.df")
plot(age~dpd.ratio, data = dpd.df)
lines(with(dpd.df,lowess(dpd.ratio,age,f=0.95)))

# 12. Fit a regression model of age on dpd.ratio and use it to make a prediction
# for someone who as a dpd.ratio of 1.83. 
dpd.fit <- lm(age~dpd.ratio,data = dpd.df)
dpd.pred <- predict(dpd.fit,data.frame(dpd.ratio=1.83),
                    interval = "prediction");dpd.pred
##        fit      lwr      upr
## 1 49.80559 18.92944 80.68174
# Is this prediction useful? I think the answer is no.
# The range of the prediction interval(19-81)is almost wider than 
# the range of the observed age data (11-73).