##data analysis
fig<-read.csv("C:/Users/ywang/Dropbox/R/Deptford/Soil_Reformat.csv")
#obtain summary statistics
#install.packages("psych")
library("psych")
## Warning: package 'psych' was built under R version 3.2.4
describe(fig) # all variables
## vars n mean sd median trimmed mad min
## Type* 1 136 1.88 0.32 2.00 1.97 0.00 1.00
## FieldID* 2 136 68.50 39.40 68.50 68.50 50.41 1.00
## Depth* 3 136 7.68 6.30 5.00 7.06 4.45 1.00
## Date* 4 136 42.96 17.76 47.00 43.68 8.15 1.00
## Silver 5 128 0.63 0.27 0.56 0.57 0.07 0.45
## Aluminum 6 128 8005.23 4654.55 6595.00 7572.40 5144.62 1900.00
## Arsenic 7 128 14.19 16.05 9.25 11.59 7.41 2.50
## Barium 8 128 25.39 7.44 24.00 24.06 2.97 18.00
## Beryllium 9 130 0.54 0.29 0.50 0.52 0.38 0.20
## Calcium 10 128 4048.77 7912.23 640.00 2031.66 192.74 450.00
## Cadmium 11 129 0.65 0.28 0.57 0.58 0.07 0.45
## Cobalt 12 128 6.00 1.40 5.60 5.75 0.74 4.50
## Chromium 13 128 44.98 32.44 38.10 39.82 25.35 10.80
## Copper 14 128 11.66 14.70 4.90 8.35 3.11 2.20
## Iron 15 128 20464.22 12940.27 17900.00 19102.50 13187.73 4140.00
## Mercury 16 131 0.05 0.02 0.04 0.04 0.01 0.03
## Potassium 17 128 3155.94 2375.91 1905.00 2900.38 1341.75 890.00
## Magnesium 18 128 3197.16 4601.93 2020.00 2090.36 1716.11 450.00
## Manganese 19 128 70.15 99.11 32.85 47.53 25.06 5.00
## Sodium 20 128 1272.81 658.60 1100.00 1154.42 148.26 890.00
## Nickel 21 128 8.24 10.15 5.10 6.44 1.33 3.60
## Lead 22 131 86.31 182.78 14.00 41.08 15.12 2.00
## Antimony 23 128 2.36 0.59 2.20 2.27 0.30 1.80
## Selenium 24 128 2.42 0.72 2.20 2.28 0.30 1.80
## Thallium 25 128 1.20 0.36 1.10 1.13 0.15 0.89
## Vanadium 26 128 40.79 22.48 38.35 38.33 24.91 13.50
## Zinc 27 128 66.69 92.37 29.10 46.57 20.39 5.90
## max range skew kurtosis se
## Type* 2.00 1.00 -2.35 3.54 0.03
## FieldID* 136.00 135.00 0.00 -1.23 3.38
## Depth* 22.00 21.00 0.68 -1.01 0.54
## Date* 77.00 76.00 -0.49 -0.04 1.52
## Silver 2.70 2.25 4.89 29.61 0.02
## Aluminum 22800.00 20900.00 0.72 -0.30 411.41
## Arsenic 148.00 145.50 4.97 36.00 1.42
## Barium 77.40 59.40 4.28 23.41 0.66
## Beryllium 1.20 1.00 0.44 -1.12 0.03
## Calcium 48000.00 47550.00 3.10 10.43 699.35
## Cadmium 2.50 2.05 3.82 16.81 0.02
## Cobalt 13.30 8.80 3.16 11.47 0.12
## Chromium 236.00 225.20 2.27 8.44 2.87
## Copper 70.60 68.40 2.06 3.67 1.30
## Iron 72000.00 67860.00 1.04 1.36 1143.77
## Mercury 0.16 0.13 2.23 5.63 0.00
## Potassium 9210.00 8320.00 0.68 -1.02 210.00
## Magnesium 29500.00 29050.00 3.38 12.47 406.76
## Manganese 555.00 550.00 3.13 10.12 8.76
## Sodium 5910.00 5020.00 5.55 32.59 58.21
## Nickel 107.00 103.40 7.38 67.27 0.90
## Lead 1240.00 1238.00 3.77 16.53 15.97
## Antimony 6.70 4.90 4.94 29.12 0.05
## Selenium 6.70 4.90 3.84 15.73 0.06
## Thallium 3.30 2.41 3.80 15.29 0.03
## Vanadium 118.00 104.50 0.88 0.31 1.99
## Zinc 647.00 641.10 3.22 13.25 8.16
describeBy(fig$Arsenic, fig$Type) #Arsenic by sample type
## group: Background
## vars n mean sd median trimmed mad min max range skew kurtosis se
## 1 1 16 5.99 1.92 5.7 5.83 2 3.5 10.8 7.3 0.92 0.12 0.48
## --------------------------------------------------------
## group: Site
## vars n mean sd median trimmed mad min max range skew kurtosis
## 1 1 112 15.36 16.83 11.4 12.68 9.79 2.5 148 145.5 4.74 32.52
## se
## 1 1.59
describeBy(fig$Vanadium, fig$Type) #Arsenic by sample type
## group: Background
## vars n mean sd median trimmed mad min max range skew kurtosis
## 1 1 16 16.82 2.58 16.7 16.64 1.93 13.5 22.6 9.1 0.62 -0.46
## se
## 1 0.65
## --------------------------------------------------------
## group: Site
## vars n mean sd median trimmed mad min max range skew kurtosis
## 1 1 112 44.22 21.97 41.8 42.03 26.61 14.7 118 103.3 0.8 0.28
## se
## 1 2.08
#independent t-test for COC in site sample vs. background
t.test(fig$Arsenic ~ fig$Type)
##
## Welch Two Sample t-test
##
## data: fig$Arsenic by fig$Type
## t = -5.6384, df = 124.48, p-value = 1.093e-07
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -12.653775 -6.078368
## sample estimates:
## mean in group Background mean in group Site
## 5.99375 15.35982
t.test(fig$Vanadium ~ fig$Type)
##
## Welch Two Sample t-test
##
## data: fig$Vanadium by fig$Type
## t = -12.6, df = 124.86, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -31.70078 -23.09387
## sample estimates:
## mean in group Background mean in group Site
## 16.81875 44.21607
#plot to see the inter-relationship among elements
pairs.panels(fig[c(7,26,22,6, 15)], gap = 0)

#there are good correlations between aluminum and Arsenic and Vanadium
cor.test(fig$Arsenic,fig$Aluminum)
##
## Pearson's product-moment correlation
##
## data: fig$Arsenic and fig$Aluminum
## t = 5.4588, df = 126, p-value = 2.447e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2854677 0.5677754
## sample estimates:
## cor
## 0.4373339
cor.test(fig$Vanadium,fig$Aluminum)
##
## Pearson's product-moment correlation
##
## data: fig$Vanadium and fig$Aluminum
## t = 18.197, df = 126, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7949755 0.8927715
## sample estimates:
## cor
## 0.8510953
cor.test(fig$Arsenic,fig$Iron)
##
## Pearson's product-moment correlation
##
## data: fig$Arsenic and fig$Iron
## t = 5.8069, df = 126, p-value = 4.88e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3107238 0.5862646
## sample estimates:
## cor
## 0.4594791
cor.test(fig$Vanadium,fig$Iron)
##
## Pearson's product-moment correlation
##
## data: fig$Vanadium and fig$Iron
## t = 15.584, df = 126, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7424319 0.8633835
## sample estimates:
## cor
## 0.8114231
#perform a linear fit and plot
#------------Arsenic vs. Aluminum---------------------
fit.as<-lm(Arsenic~Aluminum,fig)
anova(fit.as)
## Analysis of Variance Table
##
## Response: Arsenic
## Df Sum Sq Mean Sq F value Pr(>F)
## Aluminum 1 6260.1 6260.1 29.798 2.447e-07 ***
## Residuals 126 26470.5 210.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modsum.as = summary(fit.as)
modsum.as
##
## Call:
## lm(formula = Arsenic ~ Aluminum, data = fig)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.572 -4.303 -2.146 0.047 140.365
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.1141412 2.5562339 0.827 0.41
## Aluminum 0.0015084 0.0002763 5.459 2.45e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.49 on 126 degrees of freedom
## (8 observations deleted due to missingness)
## Multiple R-squared: 0.1913, Adjusted R-squared: 0.1848
## F-statistic: 29.8 on 1 and 126 DF, p-value: 2.447e-07
opar <- par(mfrow = c(2,2), oma = c(0, 0, 1.1, 0))
plot(fit.as, las = 1)
# Boxplot stats: hinges, n, CI (of the median, noch of the boxplot), outliers
boxplot.stats(fig$Arsenic)
## $stats
## [1] 2.50 5.20 9.25 18.60 34.50
##
## $n
## [1] 128
##
## $conf
## [1] 7.378642 11.121358
##
## $out
## [1] 52.1 47.2 48.6 148.0 61.6
#using the boxplot function in car library to make the plot showing the outliers as row 94, 95, 14,91,28; sample MW-5A, MW-5B, AOC-1-14B,AOC-1-22, MW-3C
library(car)
## Warning: package 'car' was built under R version 3.2.4
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit

par(mfrow=c(1,1),mar=c(5,5,2,1))
Boxplot(fig$Arsenic, id.method="y",ylab="Arsenic")

## [1] 14 28 91 94 95
#boxplot(fig$Arsenic)
fig.as<-fig[-c(94,95,14,91,28),]
fit.as<-lm(Arsenic~Aluminum,fig.as)
anova(fit.as)
## Analysis of Variance Table
##
## Response: Arsenic
## Df Sum Sq Mean Sq F value Pr(>F)
## Aluminum 1 6379.3 6379.3 424.26 < 2.2e-16 ***
## Residuals 121 1819.4 15.0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modsum.as = summary(fit.as)
modsum.as
##
## Call:
## lm(formula = Arsenic ~ Aluminum, data = fig.as)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.5848 -2.2195 -0.0393 2.0117 11.8092
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.3959795 0.6901152 -0.574 0.567
## Aluminum 0.0015489 0.0000752 20.598 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.878 on 121 degrees of freedom
## (8 observations deleted due to missingness)
## Multiple R-squared: 0.7781, Adjusted R-squared: 0.7763
## F-statistic: 424.3 on 1 and 121 DF, p-value: < 2.2e-16
opar <- par(mfrow = c(2,2), oma = c(0, 0, 1.1, 0))
plot(fit.as, las = 1)

par(mfrow=c(1,1),mar=c(5,5,2,1))
plot(Arsenic~Aluminum,fig.as,main="Soil Sample Results",col=as.numeric(Type),pch=as.numeric(Type),xlab="Aluminum (mg/Kg)",ylab="Arsenic (mg/Kg)",xlim=c(1500,25000),ylim=c(0,50))
abline(fit.as,col="red",lwd=1.5)
legend("topright",as.vector(unique(fig.as$Type)),pch=unique(as.numeric(fig.as$Type)),ncol=1,col=unique(as.numeric(fig.as$Type)),text.col=unique(as.numeric(fig.as$Type)),bty="n")
#Putting 95% confidence interval
newx<-seq(1500,25000,by=1000)
prd<-predict(fit.as,newdata=data.frame(Aluminum=newx),interval = c("confidence"),
level = 0.95,type="response")
lines(newx,prd[,2],col="blue",lty=2)
lines(newx,prd[,3],col="blue",lty=2)
#putting linear equation, adjusted r2 and p-value on the plot
lm_coef <- round(coef(fit.as), 4) # extract coefficients
text(7000,45,bquote(As == .(lm_coef[2]) * Al + .(lm_coef[1])))
r2 = modsum.as$adj.r.squared
my.p = modsum.as$coefficients[2,4]
mylabel = bquote(italic(R)^2 == .(format(r2, digits = 3)))
text(7000,40, labels = mylabel)
plabel = bquote(italic(p) == .(format(my.p, digits = 3)))
text(7000,35, plabel)

#------------Vanadium vs. Aluminum----------------------
fit.v<-lm(Vanadium~Aluminum,fig)
anova(fit.v)
## Analysis of Variance Table
##
## Response: Vanadium
## Df Sum Sq Mean Sq F value Pr(>F)
## Aluminum 1 46509 46509 331.12 < 2.2e-16 ***
## Residuals 126 17698 140
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modsum.v = summary(fit.v)
modsum.v
##
## Call:
## lm(formula = Vanadium ~ Aluminum, data = fig)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.473 -7.367 -1.368 5.637 38.096
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.8786450 2.0901616 3.769 0.00025 ***
## Aluminum 0.0041114 0.0002259 18.197 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.85 on 126 degrees of freedom
## (8 observations deleted due to missingness)
## Multiple R-squared: 0.7244, Adjusted R-squared: 0.7222
## F-statistic: 331.1 on 1 and 126 DF, p-value: < 2.2e-16
opar <- par(mfrow = c(2,2), oma = c(0, 0, 1.1, 0))
plot(fit.v, las = 1)

# Boxplot stats: hinges, n, CI (of the median, noch of the boxplot), outliers
boxplot.stats(fig$Vanadium)
## $stats
## [1] 13.50 21.30 38.35 55.10 103.00
##
## $n
## [1] 128
##
## $conf
## [1] 33.62971 43.07029
##
## $out
## [1] 118
#using the boxplot function in car library to make the plot showing the outliers as row 62; sample AOC-1-SB-3B.
library(car)
par(mfrow=c(1,1),mar=c(5,5,2,1))
Boxplot(fig$Vanadium, id.method="y",ylab="Vanadium")

## [1] 62
fig.v<-fig[-c(62),]
fit.v<-lm(Vanadium~Aluminum,fig.v)
anova(fit.v)
## Analysis of Variance Table
##
## Response: Vanadium
## Df Sum Sq Mean Sq F value Pr(>F)
## Aluminum 1 40795 40795 293 < 2.2e-16 ***
## Residuals 125 17404 139
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modsum.v = summary(fit.v)
modsum.v
##
## Call:
## lm(formula = Vanadium ~ Aluminum, data = fig.v)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.464 -7.315 -1.496 5.426 38.525
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.5110512 2.1260145 4.003 0.000106 ***
## Aluminum 0.0040149 0.0002346 17.117 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.8 on 125 degrees of freedom
## (8 observations deleted due to missingness)
## Multiple R-squared: 0.701, Adjusted R-squared: 0.6986
## F-statistic: 293 on 1 and 125 DF, p-value: < 2.2e-16
opar <- par(mfrow = c(2,2), oma = c(0, 0, 1.1, 0))
plot(fit.v, las = 1)

par(mfrow=c(1,1),mar=c(5,5,2,1))
plot(Vanadium~Aluminum,fig.v,main="Soil Sample Results",col=as.numeric(Type),pch=as.numeric(Type),xlab="Aluminum (mg/Kg)",ylab="Vanadium (mg/Kg)",xlim=c(1500,25000),ylim=c(0,150))
abline(fit.v,col="red",lwd=1.5)
legend("topright",as.vector(unique(fig$Type)),pch=unique(as.numeric(fig$Type)),ncol=1,col=unique(as.numeric(fig$Type)),text.col=unique(as.numeric(fig$Type)),bty="n")
#Putting 95% confidence interval
newx<-seq(1500,25000,by=1000)
prd<-predict(fit.v,newdata=data.frame(Aluminum=newx),interval = c("confidence"),
level = 0.95,type="response")
lines(newx,prd[,2],col="blue",lty=2)
lines(newx,prd[,3],col="blue",lty=2)
#putting linear equation, adjusted r2 and p-value on the plot
lm_coef <- round(coef(fit.v), 4) # extract coefficients
text(7000,120,bquote(V == .(lm_coef[2]) * Al + .(lm_coef[1])))
r2 = modsum.v$adj.r.squared
my.p = modsum.v$coefficients[2,4]
mylabel = bquote(italic(R)^2 == .(format(r2, digits = 3)))
text(7000,105, labels = mylabel)
plabel = bquote(italic(p) == .(format(my.p, digits = 3)))
text(7000,90, plabel)

#perform a linear fit and plot
#------------Arsenic vs. Iron----------------------
fit.as<-lm(Arsenic~Iron,fig.as)
anova(fit.as)
## Analysis of Variance Table
##
## Response: Arsenic
## Df Sum Sq Mean Sq F value Pr(>F)
## Iron 1 4911.3 4911.3 180.77 < 2.2e-16 ***
## Residuals 121 3287.4 27.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modsum.as = summary(fit.as)
modsum.as
##
## Call:
## lm(formula = Arsenic ~ Iron, data = fig.as)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.5292 -2.8029 -0.6017 2.6230 14.6284
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.550e+00 8.993e-01 1.723 0.0874 .
## Iron 5.205e-04 3.871e-05 13.445 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.212 on 121 degrees of freedom
## (8 observations deleted due to missingness)
## Multiple R-squared: 0.599, Adjusted R-squared: 0.5957
## F-statistic: 180.8 on 1 and 121 DF, p-value: < 2.2e-16
opar <- par(mfrow = c(2,2), oma = c(0, 0, 1.1, 0))
plot(fit.as, las = 1)

par(mfrow=c(1,1),mar=c(5,5,2,1))
plot(Arsenic~Iron,fig.as,main="Soil Sample Results",col=as.numeric(Type),pch=as.numeric(Type),xlab="Iron (mg/Kg)",ylab="Arsenic (mg/Kg)",xlim=c(4000,72000),ylim=c(0,50))
abline(fit.as,col="red",lwd=1.5)
legend("topright",as.vector(unique(fig.as$Type)),pch=unique(as.numeric(fig.as$Type)),ncol=1,col=unique(as.numeric(fig.as$Type)),text.col=unique(as.numeric(fig.as$Type)),bty="n")
#Putting 95% confidence interval
newx<-seq(4000,72000,by=5000)
prd<-predict(fit.as,newdata=data.frame(Iron=newx),interval = c("confidence"),
level = 0.95,type="response")
lines(newx,prd[,2],col="blue",lty=2)
lines(newx,prd[,3],col="blue",lty=2)
#putting linear equation, adjusted r2 and p-value on the plot
lm_coef <- round(coef(fit.as), 4) # extract coefficients
text(20000,45,bquote(As == .(lm_coef[2]) * Fe + .(lm_coef[1])))
r2 = modsum.as$adj.r.squared
my.p = modsum.as$coefficients[2,4]
mylabel = bquote(italic(R)^2 == .(format(r2, digits = 3)))
text(20000,40, labels = mylabel)
plabel = bquote(italic(p) == .(format(my.p, digits = 3)))
text(20000,35, plabel)

#------------Vanadium vs. Iron----------------------
fit.v<-lm(Vanadium~Iron,fig.v)
anova(fit.v)
## Analysis of Variance Table
##
## Response: Vanadium
## Df Sum Sq Mean Sq F value Pr(>F)
## Iron 1 37658 37658 229.16 < 2.2e-16 ***
## Residuals 125 20541 164
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modsum.v = summary(fit.v)
modsum.v
##
## Call:
## lm(formula = Vanadium ~ Iron, data = fig.v)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40.666 -6.299 -1.510 5.515 40.271
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.270e+01 2.142e+00 5.93 2.76e-08 ***
## Iron 1.358e-03 8.970e-05 15.14 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.82 on 125 degrees of freedom
## (8 observations deleted due to missingness)
## Multiple R-squared: 0.6471, Adjusted R-squared: 0.6442
## F-statistic: 229.2 on 1 and 125 DF, p-value: < 2.2e-16
opar <- par(mfrow = c(2,2), oma = c(0, 0, 1.1, 0))
plot(fit.v, las = 1)

par(mfrow=c(1,1),mar=c(5,5,2,1))
plot(Vanadium~Iron,fig,main="Soil Sample Results",col=as.numeric(Type),pch=as.numeric(Type),xlab="Iron (mg/Kg)",ylab="Vanadium (mg/Kg)",xlim=c(4000,72000),ylim=c(0,150))
abline(fit.v,col="red",lwd=1.5)
legend("topright",as.vector(unique(fig$Type)),pch=unique(as.numeric(fig$Type)),ncol=1,col=unique(as.numeric(fig$Type)),text.col=unique(as.numeric(fig$Type)),bty="n")
#Putting 95% confidence interval
newx<-seq(4000,72000,by=5000)
prd<-predict(fit.v,newdata=data.frame(Iron=newx),interval = c("confidence"),
level = 0.95,type="response")
lines(newx,prd[,2],col="blue",lty=2)
lines(newx,prd[,3],col="blue",lty=2)
#putting linear equation, adjusted r2 and p-value on the plot
lm_coef <- round(coef(fit.v), 4) # extract coefficients
text(20000,130,bquote(V == .(lm_coef[2]) * Fe + .(lm_coef[1])))
r2 = modsum.v$adj.r.squared
my.p = modsum.v$coefficients[2,4]
mylabel = bquote(italic(R)^2 == .(format(r2, digits = 3)))
text(20000,115, labels = mylabel)
plabel = bquote(italic(p) == .(format(my.p, digits = 3)))
text(20000,100, plabel)
