# ref: Intro to data analysis with R for Forensic Scientists Michael Currant
library(dafs)
## Loading required package: s20x
library(help=dafs)
data("bottle.df")
head(bottle.df)
## Number Part Mn Ba Sr Zr Ti
## 1 1 Neck 56.1 170.7 145.1 77.4 267.4
## 2 1 Neck 53.8 166.2 143.3 71.6 270.0
## 3 1 Neck 58.7 184.2 156.5 78.2 286.4
## 4 1 Neck 54.6 170.5 158.1 75.3 273.6
## 5 1 Neck 58.6 185.2 161.3 83.9 289.9
## 6 1 Body 56.8 180.5 146.7 79.2 274.0
tail(bottle.df)
## Number Part Mn Ba Sr Zr Ti
## 115 6 Base 67.2 208.9 180.2 125.7 326.1
## 116 6 Shoulder 64.9 197.8 172.1 81.5 304.2
## 117 6 Shoulder 52.4 158.9 137.1 67.8 254.6
## 118 6 Shoulder 61.9 189.8 165.4 80.8 302.1
## 119 6 Shoulder 57.8 174.9 160.4 77.9 278.2
## 120 6 Shoulder 63.9 198.2 176.4 90.9 309.5
str(bottle.df)
## 'data.frame': 120 obs. of 7 variables:
## $ Number: int 1 1 1 1 1 1 1 1 1 1 ...
## $ Part : Factor w/ 4 levels "Base","Body",..: 3 3 3 3 3 2 2 2 2 2 ...
## $ Mn : num 56.1 53.8 58.7 54.6 58.6 56.8 54.6 59.2 54.9 61.1 ...
## $ Ba : num 171 166 184 170 185 ...
## $ Sr : num 145 143 156 158 161 ...
## $ Zr : num 77.4 71.6 78.2 75.3 83.9 79.2 74.6 78.9 80.2 90.2 ...
## $ Ti : num 267 270 286 274 290 ...
bottle.s <- split(bottle.df$Ba,bottle.df$Number)
sapply(bottle.s, mean)
## 1 2 3 4 5 6
## 181.920 183.310 172.555 187.150 138.730 189.880
data(gc.df)
str(gc.df)
## 'data.frame': 612 obs. of 5 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ gc1 : Factor w/ 3 levels "A","B","C": 2 2 2 2 3 2 1 1 3 2 ...
## $ gc2 : Factor w/ 3 levels "A","B","C": 2 2 3 3 3 3 2 1 3 2 ...
## $ racecode: Factor w/ 3 levels "A","C","H": 1 1 1 1 1 1 1 1 1 1 ...
## $ genotype: Factor w/ 6 levels "AA","AB","AC",..: 4 4 5 5 6 5 2 1 6 4 ...
head(gc.df)
## id gc1 gc2 racecode genotype
## 1 1 B B A BB
## 2 2 B B A BB
## 3 3 B C A BC
## 4 4 B C A BC
## 5 5 C C A CC
## 6 6 B C A BC
# ?gc.df
tbl <- table(gc.df$genotype)
tbl
##
## AA AB AC BB BC CC
## 29 72 114 136 137 124
midpts <- barplot(tbl,ylim = c(0,1),xlab = "Genotype",
ylab = "Proportion")
cumppn <- cumsum(tbl)
lines(midpts,cumppn)
box()

tbl1 <- xtabs(~racecode + genotype,data = gc.df);tbl1
## genotype
## racecode AA AB AC BB BC CC
## A 1 28 6 117 46 7
## C 19 24 60 4 29 63
## H 9 20 48 15 62 54
rowSums(tbl1)
## A C H
## 205 199 208
tbl2 <- tbl1/rowSums(tbl1);round(tbl2,4)
## genotype
## racecode AA AB AC BB BC CC
## A 0.0049 0.1366 0.0293 0.5707 0.2244 0.0341
## C 0.0955 0.1206 0.3015 0.0201 0.1457 0.3166
## H 0.0433 0.0962 0.2308 0.0721 0.2981 0.2596
barplot(tbl, beside = T, ylim = c(0,1), xlab = "Genotype", ylab = "Proportion", col = c("blue", "yellow", "red"))
box()

# chapter 5 linear regression
data("dpd.df")
head(dpd.df)
## sample age sex molar dpd.ratio est.age assoc.error real.error
## 1 1 15 F 46 0.62 24.0 15.1 -9.0
## 2 2 17 F 46 0.64 24.4 15.1 -7.4
## 3 3 17 F 48 1.14 35.0 14.7 -1.8
## 4 4 17 M 48 1.04 32.9 14.7 -15.9
## 5 5 19 F 28 1.07 33.5 14.7 -14.5
## 6 6 20 F 48 1.83 49.7 14.8 -29.7
model <- lm(age~dpd.ratio,data = dpd.df)
summary(model)
##
## Call:
## lm(formula = age ~ dpd.ratio, data = dpd.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29.8056 -8.6384 -0.0186 8.4226 25.8918
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.826 7.778 1.392 0.179272
## dpd.ratio 21.301 5.011 4.251 0.000391 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.34 on 20 degrees of freedom
## Multiple R-squared: 0.4746, Adjusted R-squared: 0.4484
## F-statistic: 18.07 on 1 and 20 DF, p-value: 0.0003914
plot(x=dpd.df$dpd.ratio,y=dpd.df$age);abline(model)

plot(model)




# 5.3.4 Tutorial
# Load the bottle data and ???t a simple linear regression model for manganese as a function of barium
data(bottle.df)
names(bottle.df)
## [1] "Number" "Part" "Mn" "Ba" "Sr" "Zr" "Ti"
mn.fit <- lm(Mn~Ba,data = bottle.df)
summary(mn.fit)
##
## Call:
## lm(formula = Mn ~ Ba, data = bottle.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.5117 -1.5623 -0.2248 1.2480 5.3408
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.0683 1.7865 6.755 5.7e-10 ***
## Ba 0.2627 0.0101 26.002 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.308 on 118 degrees of freedom
## Multiple R-squared: 0.8514, Adjusted R-squared: 0.8501
## F-statistic: 676.1 on 1 and 118 DF, p-value: < 2.2e-16
# 2. Plot the ???tted line on the data.
plot(Mn~Ba,data = bottle.df)
abline(mn.fit)

plot(mn.fit)



# 3. Add a smoothing line to the data as well, and color it blue.
with(bottle.df,lines(lowess(Ba,Mn),col="blue"))

# 4. Plot the residuals versus the ???tted values and a normal Q-Q plot of the residuals.
plot(mn.fit,which = 1:2)


# 5. Repeat the previous step, but this time extract the residuals and ???tted values
# from the ???tted object mn.fit using the residuals, fitted and qqnorm commands.
res <- residuals(mn.fit)
pred <- fitted(mn.fit)
plot(pred,res,xlab = "Fitted Values",
ylab = "Residuals")
abline(h=0,lty=2)

qqnorm(res)
# 6. We use the qqline function in conjuction with qqnorm to construct
# a normal Q-Q plot with a straight line on it.
qqnorm(res)
qqline(res)

# 7. It is instructive to do the previous task by hand. To do this,
# we need to sort the residuals into order, and get the corresponding
# quantiles from the normal distribution. We can do this with the ppoints and qnorm commands.
res <- sort(residuals(mn.fit))
n <- length(res)
z <- qnorm(ppoints(n))
plot(z,res,xlab = "Theoretical Quantiles",ylab = "Sample Quantiles",
main = "Normal QQ plot")
abline(lm(res~z))

# 8. Plot a histogram of the residuals and superimpose the normal distribution on it.
res <- residuals(mn.fit)
mx <- mean(res)
coefficients(mn.fit)
## (Intercept) Ba
## 12.0682759 0.2626991
sx <- summary(mn.fit)$sigma
sx
## [1] 2.307636
ls(mn.fit)#Extract the estimated standard deviation of the errors, the "residual standard deviation"
## [1] "assign" "call" "coefficients" "df.residual"
## [5] "effects" "fitted.values" "model" "qr"
## [9] "rank" "residuals" "terms" "xlevels"
sigma(mn.fit)
## [1] 2.307636
hist(res,probability = TRUE,xlab = "Residuals",
main = "")
x <- seq(min(res) - 0.5*sx,max(res) + 0.5*sx,length=200)
y <- dnorm(x,mx,sx)
lines(x,y,lty=2)
box()

# 9. The s20x packagehasafunctionthatsimpli???esthelasttwostepscalled normcheck.
# normcheck can take either a single data set, a formula, or an lm object.
# So, for example, we can type
normcheck(mn.fit)

# 10. Plot a 95% con???dence interval for the regression line for the mean and
# then for a new predicted value.
plot(Mn~Ba,data = bottle.df)
new.Ba <- seq(min(bottle.df$Ba),max(bottle.df$Ba),length=200)
mn.pred.ci <- predict(mn.fit,data.frame(Ba=new.Ba),
interval = "confidence")
mn.pred.pred <- predict(mn.fit,data.frame(Ba=new.Ba),
interval = "prediction")
lines(new.Ba,mn.pred.ci[,1])
lines(new.Ba,mn.pred.ci[,2],lty=2,col="blue")
lines(new.Ba,mn.pred.ci[,3],lty=2,col="blue")
lines(new.Ba,mn.pred.pred[,2],lty=2,col="red")
lines(new.Ba,mn.pred.pred[,3],lty=2,col="red")

# "What is the average concentration of manganese in samples
# where the barium concentration is 180?" then we should give
# a prediction with a con???dence interval for the mean, i.e.,
new.Ba <- 180
mn.pred <- predict(mn.fit,data.frame(Ba=new.Ba),
interval = "confidence")
mn.pred
## fit lwr upr
## 1 59.35412 58.92773 59.7805
# If the question is, "I have just collected a new sample and
# the barium value is 180. What is a reasonable predicted value
# and range for the manganese concentration?" then we should
# give a prediction interval for a new value, i.e.,
new.Ba <- 180
mn.pred <- predict(mn.fit,data.frame(Ba=new.Ba),
interval = "prediction")
mn.pred
## fit lwr upr
## 1 59.35412 54.76452 63.94371
# We would say "given the barium concentration is 180,
# we expect the manganese concentration in the new sample
# to be about 59.4 and it will lie between 54.8 and 63.9 with
# 95% con???dence." Notice that both situations have the same predicted values
# This says the expected value(mean) of a single value froma distribution
# and expected value (mean) of the sample mean of a sample
# from the same distribution are both the same.
# 11. Plot age versus the deoxypyridinoline (DPD) ratio in the DPD data set.
# Add a smoothing line to the plot. We will change the amount of smoothing
# because we have a relatively small amount of data.
data("dpd.df")
plot(age~dpd.ratio, data = dpd.df)
lines(with(dpd.df,lowess(dpd.ratio,age,f=0.95)))

# 12. Fit a regression model of age on dpd.ratio and use it to make a prediction
# for someone who as a dpd.ratio of 1.83.
dpd.fit <- lm(age~dpd.ratio,data = dpd.df)
dpd.pred <- predict(dpd.fit,data.frame(dpd.ratio=1.83),
interval = "prediction");dpd.pred
## fit lwr upr
## 1 49.80559 18.92944 80.68174
# Is this prediction useful? I think the answer is no.
# The range of the prediction interval(19-81)is almost wider than
# the range of the observed age data (11-73).