library(gvlma,quietly=T)
library(car,quietly=T)
library(gamlss,quietly=T)
##
## Attaching package: 'gamlss.data'
## The following object is masked from 'package:datasets':
##
## sleep
## ********** GAMLSS Version 5.1-6 **********
## For more on GAMLSS look at http://www.gamlss.org/
## Type gamlssNews() to see new features/changes/bug fixes.
library(gamlss.add,quietly=T)
## This is mgcv 1.8-31. For overview type 'help("mgcv-package")'.
##
## Attaching package: 'nnet'
## The following object is masked from 'package:mgcv':
##
## multinom
library(quantreg,quietly=T)
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
library(BlandAltmanLeh,quietly=T)
library(irr,quietly=T)
##
## Attaching package: 'lpSolve'
## The following object is masked from 'package:gamlss':
##
## lp
library(mcr,quietly=T)
##
## Attaching package: 'mcr'
## The following object is masked from 'package:nlme':
##
## getData
data<-read.csv("data.csv",header=T)
plot(x=data$qvalue,y=data$svalue,xlab="Quanterix",ylab="Simens")
text(x=data$qvalue,y=data$svalue,labels = rownames(data))
Seemingly there are outliers.
m1=lm(svalue~qvalue,data)
summary(m1)
##
## Call:
## lm(formula = svalue ~ qvalue, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -61.695 -1.925 -0.292 1.493 117.136
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.758926 0.163803 22.95 <2e-16 ***
## qvalue 0.781013 0.006939 112.56 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.589 on 1729 degrees of freedom
## Multiple R-squared: 0.8799, Adjusted R-squared: 0.8798
## F-statistic: 1.267e+04 on 1 and 1729 DF, p-value: < 2.2e-16
outlierTest(m1)
## rstudent unadjusted p-value Bonferroni p
## 723 24.295513 2.0663e-112 3.5768e-109
## 408 16.808845 7.6526e-59 1.3247e-55
## 1144 -11.500787 1.5082e-29 2.6107e-26
## 671 -10.553496 2.8225e-25 4.8858e-22
## 390 7.711890 2.0791e-14 3.5989e-11
## 1192 5.978218 2.7345e-09 4.7335e-06
## 367 -5.599679 2.4936e-08 4.3164e-05
## 1507 5.026766 5.5053e-07 9.5296e-04
## 1653 5.013146 5.9038e-07 1.0219e-03
## 1509 4.289462 1.8901e-05 3.2718e-02
cutoff <- 4/((nrow(data)-length(m1$coefficients)-2))
plot(m1, which=4, cook.levels=cutoff)
outlier=c(723, 408, 1144, 671, 390, 1192, 367, 1507, 1653, 1509)
influ=c(367,408,671)
outlierdata=data[outlier,c("svalue","qvalue")]
infludata=data[influ,c("svalue","qvalue")]
plot(x=data$qvalue,y=data$svalue,xlab="Quanterix",ylab="Simens")
text(outlierdata$qvalue,outlierdata$svalue+5,labels = as.character(outlier),col=2)
text(infludata$qvalue,infludata$svalue-5,labels = as.character(influ),col=3)
Observations 723, 408, 1144, 671, 390, 1192, 367, 1507, 1653, 1509 are tested as outliers and observations 367, 408,671 are influential. Drop observations with >400 in quanterix, and drop other outliers while retain influential points. That is deleting observations 723,1144, 390, 1192, 367, 1507, 1653, 1509 and 1724.
data1<-data[-c(723,1144, 390, 1192, 367, 1507, 1653, 1509,1724),]
m2=lm(svalue~qvalue,data1)
summary(m2)
##
## Call:
## lm(formula = svalue ~ qvalue, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.837 -1.800 -0.145 1.617 84.011
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.487830 0.150405 23.19 <2e-16 ***
## qvalue 0.793086 0.008617 92.04 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.214 on 1720 degrees of freedom
## Multiple R-squared: 0.8312, Adjusted R-squared: 0.8311
## F-statistic: 8471 on 1 and 1720 DF, p-value: < 2.2e-16
plot(x=data$qvalue,y=data$svalue,xlab="Quanterix",ylab="Simens")
points(x=data1$qvalue,y=data1$svalue,pch=15)
abline(m1,col=2)
abline(m2,col=3)
legend(0,390,bty="n",legend = c("OLS with outlier","OLS without outlier"),lty=1,col=c(2,3),cex=0.8)
The difference between OLS with and without outliers are subtle.
gvmodel=gvlma(m2)
summary(gvmodel)
##
## Call:
## lm(formula = svalue ~ qvalue, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.837 -1.800 -0.145 1.617 84.011
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.487830 0.150405 23.19 <2e-16 ***
## qvalue 0.793086 0.008617 92.04 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.214 on 1720 degrees of freedom
## Multiple R-squared: 0.8312, Adjusted R-squared: 0.8311
## F-statistic: 8471 on 1 and 1720 DF, p-value: < 2.2e-16
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = m2)
##
## Value p-value Decision
## Global Stat 9.375e+05 0.000e+00 Assumptions NOT satisfied!
## Skewness 3.472e+03 0.000e+00 Assumptions NOT satisfied!
## Kurtosis 9.339e+05 0.000e+00 Assumptions NOT satisfied!
## Link Function 4.222e-01 5.158e-01 Assumptions acceptable.
## Heteroscedasticity 6.879e+01 1.110e-16 Assumptions NOT satisfied!
plot(gvmodel)
Linearity, normality and equal variance assumption are not satisied. Model should be further tuned. Try transformation to fix problems.
par(mfrow=c(1,3))
data$logsvalue=log(base=10,data$svalue)
data$logqvalue=log(base=10,data$qvalue)
plot(x=data$logqvalue,y=data$logsvalue,xlab="log Quanterix",ylab="log Simens")
densityPlot(data$logqvalue,xlab="log Quanterix")
densityPlot(data$logsvalue,xlab="log Simens")
Log of Simens value is slightly right skewed.
m3=lm(logsvalue~logqvalue,data)
summary(m3)
##
## Call:
## lm(formula = logsvalue ~ logqvalue, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.74030 -0.06718 0.00257 0.06411 0.66217
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.35700 0.01065 33.52 <2e-16 ***
## logqvalue 0.70713 0.01004 70.46 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1061 on 1729 degrees of freedom
## Multiple R-squared: 0.7417, Adjusted R-squared: 0.7415
## F-statistic: 4965 on 1 and 1729 DF, p-value: < 2.2e-16
outlierTest(m3)
## rstudent unadjusted p-value Bonferroni p
## 1144 -7.104708 1.7559e-12 3.0395e-09
## 723 6.324125 3.2350e-10 5.5998e-07
## 390 5.851296 5.8225e-09 1.0079e-05
## 1192 5.762465 9.7952e-09 1.6955e-05
## 931 4.988139 6.7090e-07 1.1613e-03
## 472 4.576775 5.0588e-06 8.7567e-03
## 1507 4.374019 1.2926e-05 2.2375e-02
## 1509 4.258524 2.1685e-05 3.7536e-02
plot(m3,which = 4,cook.levels = cutoff)
outlier=c(1144,723,390,1192,931,472,1507,1509)
influ=c(408,1144,1724)
outlierdata=data[outlier,c("logsvalue","logqvalue")]
infludata=data[influ,c("logsvalue","logqvalue")]
plot(x=data$logqvalue,y=data$logsvalue,xlab="log Quanterix",ylab="log Simens")
text(outlierdata$logqvalue,outlierdata$logsvalue+0.05,labels = as.character(outlier),col=2)
text(infludata$logqvalue,infludata$logsvalue-0.05,labels = as.character(influ),col=3)
After transformation, observation 408 and 367 are not outliers any more. Now observations will be deleted are 1144,723,390,1192,931, 472, 1507 and 1509.
data2=data[-c(1144,723,390,1192,931, 472, 1507,1509),]
m4=lm(logsvalue~logqvalue,data2)
summary(m4)
##
## Call:
## lm(formula = logsvalue ~ logqvalue, data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.31631 -0.06559 0.00402 0.06504 0.41298
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.351064 0.009975 35.20 <2e-16 ***
## logqvalue 0.711166 0.009406 75.61 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09881 on 1721 degrees of freedom
## Multiple R-squared: 0.7686, Adjusted R-squared: 0.7685
## F-statistic: 5716 on 1 and 1721 DF, p-value: < 2.2e-16
plot(x=data$logqvalue,y=data$logsvalue,xlab="log Quanterix",ylab="log Simens")
points(x=data2$logqvalue,y=data2$logsvalue,pch=15)
abline(m3,col=2)
abline(m4,col=3)
legend(0,2.5,bty="n",legend = c("OLS with outlier","OLS without outlier"),lty=1,col=c(2,3),cex=0.8)
Difference between two regression lines is very insignificant too.
gvmodel4=gvlma(m4)
summary(gvmodel4)
##
## Call:
## lm(formula = logsvalue ~ logqvalue, data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.31631 -0.06559 0.00402 0.06504 0.41298
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.351064 0.009975 35.20 <2e-16 ***
## logqvalue 0.711166 0.009406 75.61 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09881 on 1721 degrees of freedom
## Multiple R-squared: 0.7686, Adjusted R-squared: 0.7685
## F-statistic: 5716 on 1 and 1721 DF, p-value: < 2.2e-16
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = m4)
##
## Value p-value Decision
## Global Stat 143.7351 0.000000 Assumptions NOT satisfied!
## Skewness 0.5759 0.447928 Assumptions acceptable.
## Kurtosis 11.5403 0.000681 Assumptions NOT satisfied!
## Link Function 130.1231 0.000000 Assumptions NOT satisfied!
## Heteroscedasticity 1.4959 0.221310 Assumptions acceptable.
plot(gvmodel4)
Heteroscedasticity is satisfied after log tansformation and drop outliers. Since the gloabl test imply x-varible may not continous, I try to add knot to see if x-variable can be cut.
m5=gamlss(logsvalue~fk(logqvalue,degree=1,start=median(logqvalue)),data=data2,trace=F)
summary(m5)
## ******************************************************************
## Family: c("NO", "Normal")
##
## Call: gamlss(formula = logsvalue ~ fk(logqvalue, degree = 1,
## start = median(logqvalue)), data = data2, trace = F)
##
## Fitting method: RS()
##
## ------------------------------------------------------------------
## Mu link function: identity
## Mu Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.083455 0.002294 472.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## ------------------------------------------------------------------
## Sigma link function: log
## Sigma Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.35174 0.01703 -138.1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## ------------------------------------------------------------------
## NOTE: Additive smoothing terms exist in the formulas:
## i) Std. Error for smoothers are for the linear effect only.
## ii) Std. Error for the linear terms maybe are not accurate.
## ------------------------------------------------------------------
## No. of observations in the fit: 1723
## Degrees of Freedom for the fit: 5
## Residual Deg. of Freedom: 1718
## at cycle: 2
##
## Global Deviance: -3214.43
## AIC: -3204.43
## SBC: -3177.171
## ******************************************************************
getSmo(m5)
##
## Call: fitFreeKnots(y = y, x = xvar, weights = w, degree = degree,
## knots = lambda, fixed = control$fixed, base = control$base)
##
## Coefficients:
## (Intercept) x XatBP1
## -0.6059 0.5710 0.3218
## Estimated Knots:
## BP1
## 1.138
plot(logsvalue~logqvalue,data=data2)
lines(data2$logqvalue[order(data2$logqvalue)],fitted(m5)[order(data2$logqvalue)],col=2)
abline(v=knots(getSmo(m5)),col=3)
There will be a cut off on 1.138 When logQ<1.138,logS=1.0835-0.6059+0.5710logQ=0.4776+0.5710logQ When logQ>=1.138,logS=0.4776+0.571logQ+0.3218(logQ-1.1380)=0.1114+0.8928logQ
plot(m5)
## ******************************************************************
## Summary of the Quantile Residuals
## mean = 1.401581e-16
## variance = 1.000581
## coef. of skewness = -0.08084673
## coef. of kurtosis = 3.126842
## Filliben correlation coefficient = 0.9988639
## ******************************************************************
Residuals perform pretty well as i.i.d normal distribution. This model that cut log quanterix value and fit with different slope and intercept work well. ## Qunatile regression by Quantreg on raw data ### Model
m6<-rq(svalue~qvalue,data=data1,tau=0.5)
plot(data1$qvalue,data1$svalue,xlab="Quanterix",ylab="Simens")
abline(lm(svalue~qvalue,data=data1),col=2)
abline(m6,col=3)
legend(0,220,col=c(2,3),lty=1,legend = c("mean fit","median fit"),bty="n")
wp(m6)
## Warning in wp(m6): Some points are missed out
## increase the y limits using ylim.all
Very poor behavior of quantile regression model using rq. Have a try on LMS, which will use BOX-Cox to force y to be nomalized.
m7<-lms(svalue,qvalue,data=data1,k=2,trans.x = T,cent=c(5,25,50,75,95))
## *** Checking for transformation for x ***
## *** power parameters 0.9044451 ***
## *** Initial fit***
## GAMLSS-RS iteration 1: Global Deviance = 8980.889
## GAMLSS-RS iteration 2: Global Deviance = 8980.857
## GAMLSS-RS iteration 3: Global Deviance = 8980.857
## *** Fitting BCCGo ***
## GAMLSS-RS iteration 1: Global Deviance = 8297.084
## GAMLSS-RS iteration 2: Global Deviance = 8260.855
## GAMLSS-RS iteration 3: Global Deviance = 8260.143
## GAMLSS-RS iteration 4: Global Deviance = 8260.135
## GAMLSS-RS iteration 5: Global Deviance = 8260.218
## GAMLSS-RS iteration 6: Global Deviance = 8260.326
## GAMLSS-RS iteration 7: Global Deviance = 8260.419
## GAMLSS-RS iteration 8: Global Deviance = 8260.475
## GAMLSS-RS iteration 9: Global Deviance = 8260.499
## GAMLSS-RS iteration 10: Global Deviance = 8260.514
## GAMLSS-RS iteration 11: Global Deviance = 8260.531
## GAMLSS-RS iteration 12: Global Deviance = 8260.544
## GAMLSS-RS iteration 13: Global Deviance = 8260.557
## GAMLSS-RS iteration 14: Global Deviance = 8260.567
## GAMLSS-RS iteration 15: Global Deviance = 8260.576
## GAMLSS-RS iteration 16: Global Deviance = 8260.583
## GAMLSS-RS iteration 17: Global Deviance = 8260.59
## GAMLSS-RS iteration 18: Global Deviance = 8260.595
## GAMLSS-RS iteration 19: Global Deviance = 8260.6
## GAMLSS-RS iteration 20: Global Deviance = 8260.603
## *** Fitting BCPEo ***
## GAMLSS-RS iteration 1: Global Deviance = 8395.698
## GAMLSS-RS iteration 2: Global Deviance = 8253.875
## GAMLSS-RS iteration 3: Global Deviance = 8253.033
## GAMLSS-RS iteration 4: Global Deviance = 8253.176
## GAMLSS-RS iteration 5: Global Deviance = 8253.241
## GAMLSS-RS iteration 6: Global Deviance = 8253.291
## GAMLSS-RS iteration 7: Global Deviance = 8253.328
## GAMLSS-RS iteration 8: Global Deviance = 8253.357
## GAMLSS-RS iteration 9: Global Deviance = 8253.382
## GAMLSS-RS iteration 10: Global Deviance = 8253.405
## GAMLSS-RS iteration 11: Global Deviance = 8253.424
## GAMLSS-RS iteration 12: Global Deviance = 8253.442
## GAMLSS-RS iteration 13: Global Deviance = 8253.459
## GAMLSS-RS iteration 14: Global Deviance = 8253.475
## GAMLSS-RS iteration 15: Global Deviance = 8253.489
## GAMLSS-RS iteration 16: Global Deviance = 8253.504
## GAMLSS-RS iteration 17: Global Deviance = 8253.516
## GAMLSS-RS iteration 18: Global Deviance = 8253.529
## GAMLSS-RS iteration 19: Global Deviance = 8253.54
## GAMLSS-RS iteration 20: Global Deviance = 8253.551
## *** Fitting BCTo ***
## GAMLSS-RS iteration 1: Global Deviance = 8258.694
## GAMLSS-RS iteration 2: Global Deviance = 8244.409
## GAMLSS-RS iteration 3: Global Deviance = 8242.89
## GAMLSS-RS iteration 4: Global Deviance = 8242.407
## GAMLSS-RS iteration 5: Global Deviance = 8242.255
## GAMLSS-RS iteration 6: Global Deviance = 8242.208
## GAMLSS-RS iteration 7: Global Deviance = 8242.201
## GAMLSS-RS iteration 8: Global Deviance = 8242.204
## GAMLSS-RS iteration 9: Global Deviance = 8242.21
## GAMLSS-RS iteration 10: Global Deviance = 8242.215
## GAMLSS-RS iteration 11: Global Deviance = 8242.218
## GAMLSS-RS iteration 12: Global Deviance = 8242.219
## GAMLSS-RS iteration 13: Global Deviance = 8242.22
## % of cases below 4.838121 centile is 5.052265
## % of cases below 23.81761 centile is 25.02904
## % of cases below 50.24998 centile is 50
## % of cases below 76.13121 centile is 74.97096
## % of cases below 93.89878 centile is 94.94774
m7$power
## [1] 0.9044451
wp(m7,xvar=data1$qvalue,n.inter = 5)
## number of missing points from plot= 0 out of 346
## number of missing points from plot= 1 out of 343
## number of missing points from plot= 0 out of 349
## number of missing points from plot= 0 out of 342
## number of missing points from plot= 1 out of 342
Worm plot shows residuals look good since most of points are between 95% elliptical confidence band. So quantile regression on y^0.9 is reasonable. But attendence should be attached that the apllied range of Quanterix value is 0-180 as value>=180 are exluded in model fit.
bland.altman.plot(data$qvalue,data$svalue, main="Bland Altman Plot on raw data", xlab="Means", ylab="Differences")
## NULL
bland.altman.plot(data$logqvalue,data$logsvalue, main="Bland Altman Plot on transformed data", xlab="Means", ylab="Differences")
## NULL
data_icc1=data[,c("qvalue","svalue")]
icc(data_icc1,model="oneway",unit="single")
## Single Score Intraclass Correlation
##
## Model: oneway
## Type : consistency
##
## Subjects = 1731
## Raters = 2
## ICC(1) = 0.922
##
## F-Test, H0: r0 = 0 ; H1: r0 > 0
## F(1730,1731) = 24.5 , p = 0
##
## 95%-Confidence Interval for ICC Population Values:
## 0.914 < ICC < 0.928
data_icc2=data[,c("logqvalue","logsvalue")]
icc(data_icc2,model="oneway",unit="single")
## Single Score Intraclass Correlation
##
## Model: oneway
## Type : consistency
##
## Subjects = 1731
## Raters = 2
## ICC(1) = 0.819
##
## F-Test, H0: r0 = 0 ; H1: r0 > 0
## F(1730,1731) = 10.1 , p = 0
##
## 95%-Confidence Interval for ICC Population Values:
## 0.803 < ICC < 0.834
Very high ICC of both raw and log data, so values from two assay are similar. ## Passing Bablok model on raw data
m8 <- mcreg(data$qvalue,data$svalue, method.reg = "PaBa")
m8@para
## EST SE LCI UCI
## Intercept 2.5588235 NA 2.2774192 2.8109380
## Slope 0.8823529 NA 0.8531929 0.9136358
MCResult.plot(m8, equal.axis = TRUE, x.lab = "Quanterix", y.lab = "Simens", add.grid = FALSE, points.cex = 1)
To summary, three models can be considered to fit on raw Simens value, log Simens value and 0.9 power to Simens value individually.
Passing Bablok Regression on raw value. \[S=0.8824*Q+2.5588\] It is demonstrated that there are a few outliers in OLS model of raw data. Since Passing Bablok regression is robust to outliers, it is reasonable to fit a Passing Bablok regression. Quanterix value >400 can be applied to this model.
Free knot model on log values. \[logS=0.5710logQ+0.4776,\ logQ<1.1380\] \[logS=0.8928logQ+0.1114,\ logQ >= 1.1380\] This model has excellent diagnositic result and can incoporate Quanterix value greater than 400.
Qunatile regression model on 0.9 power to original Simens vale. \[S^{0.9}=2.0204+0.0477pb(Q)\], where pb means p-spline. The residuals perfom well, but the model is rather complicated, and can only be applied when Quanterix value <=200.