library(gvlma,quietly=T)
library(car,quietly=T)
library(gamlss,quietly=T)
##
## Attaching package: 'gamlss.data'
## The following object is masked from 'package:datasets':
##
## sleep
## ********** GAMLSS Version 5.1-6 **********
## For more on GAMLSS look at http://www.gamlss.org/
## Type gamlssNews() to see new features/changes/bug fixes.
library(gamlss.add,quietly=T)
## This is mgcv 1.8-31. For overview type 'help("mgcv-package")'.
##
## Attaching package: 'nnet'
## The following object is masked from 'package:mgcv':
##
## multinom
library(quantreg,quietly=T)
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
library(BlandAltmanLeh,quietly=T)
library(irr,quietly=T)
##
## Attaching package: 'lpSolve'
## The following object is masked from 'package:gamlss':
##
## lp
library(mcr,quietly=T)
##
## Attaching package: 'mcr'
## The following object is masked from 'package:nlme':
##
## getData
data<-read.csv("data_v1.csv",header=T)
plot(x=data$qvalue,y=data$svalue,xlab="Quanterix",ylab="Siemens")
text(x=data$qvalue,y=data$svalue,labels = rownames(data))
Seemingly there are outliers.
m1=lm(svalue~qvalue,data)
summary(m1)
##
## Call:
## lm(formula = svalue ~ qvalue, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -61.695 -1.925 -0.292 1.493 117.136
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.758926 0.163803 22.95 <2e-16 ***
## qvalue 0.781013 0.006939 112.56 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.589 on 1729 degrees of freedom
## Multiple R-squared: 0.8799, Adjusted R-squared: 0.8798
## F-statistic: 1.267e+04 on 1 and 1729 DF, p-value: < 2.2e-16
outlierTest(m1)
## rstudent unadjusted p-value Bonferroni p
## 723 24.295513 2.0663e-112 3.5768e-109
## 408 16.808845 7.6526e-59 1.3247e-55
## 1144 -11.500787 1.5082e-29 2.6107e-26
## 671 -10.553496 2.8225e-25 4.8858e-22
## 390 7.711890 2.0791e-14 3.5989e-11
## 1192 5.978218 2.7345e-09 4.7335e-06
## 367 -5.599679 2.4936e-08 4.3164e-05
## 1507 5.026766 5.5053e-07 9.5296e-04
## 1653 5.013146 5.9038e-07 1.0219e-03
## 1509 4.289462 1.8901e-05 3.2718e-02
cutoff <- 4/((nrow(data)-length(m1$coefficients)-2))
plot(m1, which=4, cook.levels=cutoff)
outlier=c(723, 408, 1144, 671, 390, 1192, 367, 1507, 1653, 1509)
influ=c(367,408,671)
outlierdata=data[outlier,c("svalue","qvalue")]
infludata=data[influ,c("svalue","qvalue")]
plot(x=data$qvalue,y=data$svalue,xlab="Quanterix",ylab="Siemens")
text(outlierdata$qvalue,outlierdata$svalue+5,labels = as.character(outlier),col=2)
text(infludata$qvalue,infludata$svalue-5,labels = as.character(influ),col=3)
data1<-data[-c(723,1144, 390, 1192, 367, 1507, 1653, 1509,1724),]
m2=lm(svalue~qvalue,data1)
summary(m2)
##
## Call:
## lm(formula = svalue ~ qvalue, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.837 -1.800 -0.145 1.617 84.011
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.487830 0.150405 23.19 <2e-16 ***
## qvalue 0.793086 0.008617 92.04 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.214 on 1720 degrees of freedom
## Multiple R-squared: 0.8312, Adjusted R-squared: 0.8311
## F-statistic: 8471 on 1 and 1720 DF, p-value: < 2.2e-16
plot(x=data$qvalue,y=data$svalue,xlab="Quanterix",ylab="Siemens")
points(x=data1$qvalue,y=data1$svalue,pch=15)
abline(m1,col=2)
abline(m2,col=3)
legend(0,390,bty="n",legend = c("OLS with outlier","OLS without outlier"),lty=1,col=c(2,3),cex=0.8)
The difference between OLS with and without outliers are subtle.
gvmodel=gvlma(m2)
summary(gvmodel)
##
## Call:
## lm(formula = svalue ~ qvalue, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.837 -1.800 -0.145 1.617 84.011
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.487830 0.150405 23.19 <2e-16 ***
## qvalue 0.793086 0.008617 92.04 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.214 on 1720 degrees of freedom
## Multiple R-squared: 0.8312, Adjusted R-squared: 0.8311
## F-statistic: 8471 on 1 and 1720 DF, p-value: < 2.2e-16
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = m2)
##
## Value p-value Decision
## Global Stat 9.375e+05 0.000e+00 Assumptions NOT satisfied!
## Skewness 3.472e+03 0.000e+00 Assumptions NOT satisfied!
## Kurtosis 9.339e+05 0.000e+00 Assumptions NOT satisfied!
## Link Function 4.222e-01 5.158e-01 Assumptions acceptable.
## Heteroscedasticity 6.879e+01 1.110e-16 Assumptions NOT satisfied!
plot(gvmodel)
Linearity, normality, equal variance are not satisied. Model should be further tuned. Try transformation to fix problems.
par(mfrow=c(1,3))
data$logsvalue=log(base=10,data$svalue)
data$logqvalue=log(base=10,data$qvalue)
plot(x=data$logqvalue,y=data$logsvalue,xlab="log Quanterix",ylab="log Siemens")
densityPlot(data$logqvalue,xlab="log Quanterix")
densityPlot(data$logsvalue,xlab="log Siemens")
Log of Siemens value is slightly right skewed.
m3=lm(logsvalue~logqvalue,data)
summary(m3)
##
## Call:
## lm(formula = logsvalue ~ logqvalue, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.74030 -0.06718 0.00257 0.06411 0.66217
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.35700 0.01065 33.52 <2e-16 ***
## logqvalue 0.70713 0.01004 70.46 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1061 on 1729 degrees of freedom
## Multiple R-squared: 0.7417, Adjusted R-squared: 0.7415
## F-statistic: 4965 on 1 and 1729 DF, p-value: < 2.2e-16
outlierTest(m3)
## rstudent unadjusted p-value Bonferroni p
## 1144 -7.104708 1.7559e-12 3.0395e-09
## 723 6.324125 3.2350e-10 5.5998e-07
## 390 5.851296 5.8225e-09 1.0079e-05
## 1192 5.762465 9.7952e-09 1.6955e-05
## 931 4.988139 6.7090e-07 1.1613e-03
## 472 4.576775 5.0588e-06 8.7567e-03
## 1507 4.374019 1.2926e-05 2.2375e-02
## 1509 4.258524 2.1685e-05 3.7536e-02
plot(m3,which = 4,cook.levels = cutoff)
outlier=c(1144,723,390,1192,931,472,1507,1509)
influ=c(408,1144,1724)
outlierdata=data[outlier,c("logsvalue","logqvalue")]
infludata=data[influ,c("logsvalue","logqvalue")]
plot(x=data$logqvalue,y=data$logsvalue,xlab="log Quanterix",ylab="log Siemens")
text(outlierdata$logqvalue,outlierdata$logsvalue+0.05,labels = as.character(outlier),col=2)
text(infludata$logqvalue,infludata$logsvalue-0.05,labels = as.character(influ),col=3)
After transformation, observation 408 and 367 are not outliers any more. Now observations will be deleted are 1144,723,390,1192,931, 472, 1507 and 1509.
data2=data[-c(1144,723,390,1192,931, 472, 1507,1509),]
m4=lm(logsvalue~logqvalue,data2)
summary(m4)
##
## Call:
## lm(formula = logsvalue ~ logqvalue, data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.31631 -0.06559 0.00402 0.06504 0.41298
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.351064 0.009975 35.20 <2e-16 ***
## logqvalue 0.711166 0.009406 75.61 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09881 on 1721 degrees of freedom
## Multiple R-squared: 0.7686, Adjusted R-squared: 0.7685
## F-statistic: 5716 on 1 and 1721 DF, p-value: < 2.2e-16
plot(x=data$logqvalue,y=data$logsvalue,xlab="log Quanterix",ylab="log Siemens")
points(x=data2$logqvalue,y=data2$logsvalue,pch=15)
abline(m3,col=2)
abline(m4,col=3)
legend(0,2.5,bty="n",legend = c("OLS with outlier","OLS without outlier"),lty=1,col=c(2,3),cex=0.8)
Difference between two regression lines is very insignificant.
gvmodel4=gvlma(m4)
summary(gvmodel4)
##
## Call:
## lm(formula = logsvalue ~ logqvalue, data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.31631 -0.06559 0.00402 0.06504 0.41298
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.351064 0.009975 35.20 <2e-16 ***
## logqvalue 0.711166 0.009406 75.61 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09881 on 1721 degrees of freedom
## Multiple R-squared: 0.7686, Adjusted R-squared: 0.7685
## F-statistic: 5716 on 1 and 1721 DF, p-value: < 2.2e-16
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = m4)
##
## Value p-value Decision
## Global Stat 143.7351 0.000000 Assumptions NOT satisfied!
## Skewness 0.5759 0.447928 Assumptions acceptable.
## Kurtosis 11.5403 0.000681 Assumptions NOT satisfied!
## Link Function 130.1231 0.000000 Assumptions NOT satisfied!
## Heteroscedasticity 1.4959 0.221310 Assumptions acceptable.
plot(gvmodel4)
Heteroscedasticity is satisfied after log tansformation and drop outliers. Since the gloabl test imply x-varible may not continous, I try to add knot to see if x-variable can be cut.
m5=gamlss(logsvalue~fk(logqvalue,degree=1,start=median(logqvalue)),data=data2,trace=F)
summary(m5)
## ******************************************************************
## Family: c("NO", "Normal")
##
## Call: gamlss(formula = logsvalue ~ fk(logqvalue, degree = 1,
## start = median(logqvalue)), data = data2, trace = F)
##
## Fitting method: RS()
##
## ------------------------------------------------------------------
## Mu link function: identity
## Mu Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.083455 0.002294 472.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## ------------------------------------------------------------------
## Sigma link function: log
## Sigma Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.35174 0.01703 -138.1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## ------------------------------------------------------------------
## NOTE: Additive smoothing terms exist in the formulas:
## i) Std. Error for smoothers are for the linear effect only.
## ii) Std. Error for the linear terms maybe are not accurate.
## ------------------------------------------------------------------
## No. of observations in the fit: 1723
## Degrees of Freedom for the fit: 5
## Residual Deg. of Freedom: 1718
## at cycle: 2
##
## Global Deviance: -3214.43
## AIC: -3204.43
## SBC: -3177.171
## ******************************************************************
getSmo(m5)
##
## Call: fitFreeKnots(y = y, x = xvar, weights = w, degree = degree,
## knots = lambda, fixed = control$fixed, base = control$base)
##
## Coefficients:
## (Intercept) x XatBP1
## -0.6059 0.5710 0.3218
## Estimated Knots:
## BP1
## 1.138
plot(logsvalue~logqvalue,data=data2)
lines(data2$logqvalue[order(data2$logqvalue)],fitted(m5)[order(data2$logqvalue)],col=2)
abline(v=knots(getSmo(m5)),col=3)
There will be a cut off on 1.138 When logQ<1.138,logS=1.0835-0.6059+0.5710logQ=0.4776+0.5710logQ When logQ>=1.138,logS=0.4776+0.571logQ+0.3218(logQ-1.1380)=0.1114+0.8928logQ
plot(m5)
## ******************************************************************
## Summary of the Quantile Residuals
## mean = 1.401581e-16
## variance = 1.000581
## coef. of skewness = -0.08084673
## coef. of kurtosis = 3.126842
## Filliben correlation coefficient = 0.9988639
## ******************************************************************
Residuals perform pretty well as i.i.d normal distribution. This model that cut log quanterix value and fit with different slope and intercept work well.
m6<-rq(svalue~qvalue,data=data1,tau=0.5)
plot(data1$qvalue,data1$svalue,xlab="Quanterix",ylab="Siemens")
abline(lm(svalue~qvalue,data=data1),col=2)
abline(m6,col=3)
legend(0,220,col=c(2,3),lty=1,legend = c("mean fit","median fit"),bty="n")
wp(m6)
## Warning in wp(m6): Some points are missed out
## increase the y limits using ylim.all
Very poor behavior of quantile regression model using rq. Have a try on LMS, which will use BOX-Cox to force y to be nomalized.
data1_1<-data1[-472,]
m7<-lms(svalue,qvalue,data=data1_1,k=2,trans.x = T,cent=c(5,25,50,75,95))
## *** Checking for transformation for x ***
## *** power parameters 0.9044375 ***
## *** Initial fit***
## GAMLSS-RS iteration 1: Global Deviance = 8974.035
## GAMLSS-RS iteration 2: Global Deviance = 8974.002
## GAMLSS-RS iteration 3: Global Deviance = 8974.002
## *** Fitting BCCGo ***
## GAMLSS-RS iteration 1: Global Deviance = 8288.317
## GAMLSS-RS iteration 2: Global Deviance = 8252.368
## GAMLSS-RS iteration 3: Global Deviance = 8251.675
## GAMLSS-RS iteration 4: Global Deviance = 8251.664
## GAMLSS-RS iteration 5: Global Deviance = 8251.737
## GAMLSS-RS iteration 6: Global Deviance = 8251.835
## GAMLSS-RS iteration 7: Global Deviance = 8251.92
## GAMLSS-RS iteration 8: Global Deviance = 8251.964
## GAMLSS-RS iteration 9: Global Deviance = 8251.985
## GAMLSS-RS iteration 10: Global Deviance = 8252
## GAMLSS-RS iteration 11: Global Deviance = 8252.016
## GAMLSS-RS iteration 12: Global Deviance = 8252.029
## GAMLSS-RS iteration 13: Global Deviance = 8252.04
## GAMLSS-RS iteration 14: Global Deviance = 8252.05
## GAMLSS-RS iteration 15: Global Deviance = 8252.058
## GAMLSS-RS iteration 16: Global Deviance = 8252.065
## GAMLSS-RS iteration 17: Global Deviance = 8252.071
## GAMLSS-RS iteration 18: Global Deviance = 8252.076
## GAMLSS-RS iteration 19: Global Deviance = 8252.081
## GAMLSS-RS iteration 20: Global Deviance = 8252.084
## *** Fitting BCPEo ***
## GAMLSS-RS iteration 1: Global Deviance = 8386.306
## GAMLSS-RS iteration 2: Global Deviance = 8245.327
## GAMLSS-RS iteration 3: Global Deviance = 8244.495
## GAMLSS-RS iteration 4: Global Deviance = 8244.634
## GAMLSS-RS iteration 5: Global Deviance = 8244.697
## GAMLSS-RS iteration 6: Global Deviance = 8244.744
## GAMLSS-RS iteration 7: Global Deviance = 8244.779
## GAMLSS-RS iteration 8: Global Deviance = 8244.808
## GAMLSS-RS iteration 9: Global Deviance = 8244.832
## GAMLSS-RS iteration 10: Global Deviance = 8244.853
## GAMLSS-RS iteration 11: Global Deviance = 8244.872
## GAMLSS-RS iteration 12: Global Deviance = 8244.89
## GAMLSS-RS iteration 13: Global Deviance = 8244.905
## GAMLSS-RS iteration 14: Global Deviance = 8244.92
## GAMLSS-RS iteration 15: Global Deviance = 8244.934
## GAMLSS-RS iteration 16: Global Deviance = 8244.947
## GAMLSS-RS iteration 17: Global Deviance = 8244.959
## GAMLSS-RS iteration 18: Global Deviance = 8244.971
## GAMLSS-RS iteration 19: Global Deviance = 8244.982
## GAMLSS-RS iteration 20: Global Deviance = 8244.992
## *** Fitting BCTo ***
## GAMLSS-RS iteration 1: Global Deviance = 8249.985
## GAMLSS-RS iteration 2: Global Deviance = 8235.63
## GAMLSS-RS iteration 3: Global Deviance = 8234.065
## GAMLSS-RS iteration 4: Global Deviance = 8233.542
## GAMLSS-RS iteration 5: Global Deviance = 8233.381
## GAMLSS-RS iteration 6: Global Deviance = 8233.335
## GAMLSS-RS iteration 7: Global Deviance = 8233.327
## GAMLSS-RS iteration 8: Global Deviance = 8233.327
## % of cases below 4.837445 centile is 5.0552
## % of cases below 23.79436 centile is 24.98547
## % of cases below 50.24635 centile is 49.97095
## % of cases below 76.07596 centile is 74.95642
## % of cases below 93.89626 centile is 94.9448
m7$power
## [1] 0.9044375
summary(m7)
## ******************************************************************
## Family: c("BCTo", "Box-Cox-t-orig.")
##
## Call: gamlss(formula = svalue ~ pb(qvalue), sigma.formula = ~pb(qvalue),
## nu.formula = ~pb(qvalue), , tau.formula = ~pb(qvalue), familsvalue = "BCTo",
## data = data1_1)
##
## Fitting method: RS()
##
## ------------------------------------------------------------------
## Mu link function: log
## Mu Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.0203785 0.0081631 247.50 <2e-16 ***
## pb(x, df = mu.df) 0.0477081 0.0006267 76.13 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## ------------------------------------------------------------------
## Sigma link function: log
## Sigma Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.588143 0.029665 -53.536 <2e-16 ***
## pb(x, df = sigma.df) 0.002271 0.002289 0.992 0.321
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## ------------------------------------------------------------------
## Nu link function: identity
## Nu Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.165024 0.143358 1.151 0.250
## pb(x, df = nu.df) -0.008032 0.011179 -0.718 0.473
##
## ------------------------------------------------------------------
## Tau link function: log
## Tau Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.92474 0.30514 6.308 3.59e-10 ***
## pb(x, df = tau.df) 0.13568 0.01613 8.412 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## ------------------------------------------------------------------
## NOTE: Additive smoothing terms exist in the formulas:
## i) Std. Error for smoothers are for the linear effect only.
## ii) Std. Error for the linear terms may not be reliable.
## ------------------------------------------------------------------
## No. of observations in the fit: 1721
## Degrees of Freedom for the fit: 16.16914
## Residual Deg. of Freedom: 1704.831
## at cycle: 8
##
## Global Deviance: 8233.327
## AIC: 8265.665
## SBC: 8353.798
## ******************************************************************
wp(m7,xvar=data1_1$qvalue,n.inter = 5)
## number of missing points from plot= 0 out of 346
## number of missing points from plot= 1 out of 342
## number of missing points from plot= 0 out of 349
## number of missing points from plot= 0 out of 342
## number of missing points from plot= 1 out of 342
Worm plot shows residuals look good since most of points are between 95% elliptical confidence band. So quantile regression on y^0.9 is reasonable. But attendence should be attached that the apllied range of Quanterix value is 0-180 as value>=180 are exluded in model fit.
bland.altman.plot(data$qvalue,data$svalue, main="Bland Altman Plot on raw data", xlab="Means", ylab="Differences")
## NULL
bland.altman.plot(data$logqvalue,data$logsvalue, main="Bland Altman Plot on transformed data", xlab="Means", ylab="Differences")
## NULL
data_icc1=data[,c("qvalue","svalue")]
icc(data_icc1,model="oneway",unit="single")
## Single Score Intraclass Correlation
##
## Model: oneway
## Type : consistency
##
## Subjects = 1731
## Raters = 2
## ICC(1) = 0.922
##
## F-Test, H0: r0 = 0 ; H1: r0 > 0
## F(1730,1731) = 24.5 , p = 0
##
## 95%-Confidence Interval for ICC Population Values:
## 0.914 < ICC < 0.928
data_icc2=data[,c("logqvalue","logsvalue")]
icc(data_icc2,model="oneway",unit="single")
## Single Score Intraclass Correlation
##
## Model: oneway
## Type : consistency
##
## Subjects = 1731
## Raters = 2
## ICC(1) = 0.819
##
## F-Test, H0: r0 = 0 ; H1: r0 > 0
## F(1730,1731) = 10.1 , p = 0
##
## 95%-Confidence Interval for ICC Population Values:
## 0.803 < ICC < 0.834
Very high ICC of both raw and log data, so values from two assay are similar. ## Passing Bablok model on raw data
m8 <- mcreg(data$qvalue,data$svalue, method.reg = "PaBa")
m8@para
## EST SE LCI UCI
## Intercept 2.5588235 NA 2.2713708 2.8230687
## Slope 0.8823529 NA 0.8508124 0.9143638
MCResult.plot(m8, equal.axis = TRUE, x.lab = "Quanterix", y.lab = "Siemens", add.grid = FALSE, points.cex = 1)
To summary, three models can be considered to fit on raw Simens value, log Simens value and 0.86 power to Simens value individually.
Passing Bablok Regression on raw value. \[S=0.8824*Q+2.5588\] It is demonstrated that there are a few outliers in OLS model of raw data. Since Passing Bablok regression is robust to outliers, it is reasonable to fit a Passing Bablok regression. Quanterix value >400 can be applied to this model.
Free knot model on log values. \[logS=0.5710logQ+0.4776,\ logQ<1.1380\] \[logS=0.8928logQ+0.1114,\ logQ >= 1.1380\] This model has excellent diagnositic result and can incoporate Quanterix value greater than 400.
Qunatile regression model on 0.9 power to original Simens vale. \[S^{0.9}=2.0204+0.0477pb(Q)\], where pb means p-spline. The residuals perfom well, but the model is rather complicated, and can only be applied when Quanterix value <= 180.
This part use models from data_v1 (training dataset), evaluate them on the additional data_v2 (validation dataset) by MSE
data_v2<-read.csv("data_v2.csv",header=T)
index=which(data_v2$mpi %in% data$mpi)
data_val=data_v2[-index,]
Q=data_val$qvalue
Spred=0.8824*Q+2.5588
S=data_val$svalue
dd1=(Spred-S)^2
plot(x=data_val$qvalue,y=data_val$svalue,xlab="Quanterix",ylab="Siemens",col=3)
points(x=data_val$qvalue,y=Spred,col=2,pch=2)
legend(x=0,y=70,legend = c("predicted value","true value"),col=c(2,3),pch=c(2,1),bty="n")
mean(dd1)
## [1] 18.60058
The MSE of predicted value is 18.600
Spred=ifelse(data_val$qvalue< 13.74042,0.5710*log(base=10,data_val$qvalue)+0.4776,0.8928*log(base=10,data_val$qvalue)+0.1114)
Spred=10^Spred
dd2=(Spred-data_val$svalue)^2
plot(x=data_val$qvalue,y=data_val$svalue,xlab="Quanterix",ylab="Siemens",col=3)
points(x=data_val$qvalue,y=Spred,col=2,pch=2)
legend(x=0,y=70,legend = c("predicted value","true value"),col=c(2,3),pch=c(2,1),bty="n")
mean(dd2)
## [1] 13.87906
The MSE of predicted value is 13.879
data_val2<-data_val[which(data_val$qvalue<=180),]
Spred<-predict(m7,percentiles=50,newdata = data_val,type="response")
Spred=Spred$mu
dd3=(Spred-data_val$svalue)^2
plot(x=data_val$qvalue,y=data_val$svalue,xlab="Quanterix",ylab="Siemens",col=3)
points(x=data_val$qvalue,y=Spred,col=2,pch=2)
legend(x=0,y=70,legend = c("predicted value","true value"),col=c(2,3),pch=c(2,1),bty="n")
mean(dd3)
## [1] 13.42466
The MSE of predicted value is 13.424
The perfomance on MSE of predicted value for Free knot model and Quantile regression model are comparative good, better than Passing Bablok Regression. However, the Quanterix value used to validate range from 2-70, which is much narrower than the range that model applies, leads to biased validation performance. Also, Free knot model can be applied when Quanterix value > 400 while Quantile regression model can't be applied when Quanterix value > 180.
data_v2$logsvalue=log(base=10,data_v2$svalue)
data_v2$logqvalue=log(base=10,data_v2$qvalue)
par(mfrow=c(1,3))
plot(x=data_v2$logqvalue,y=data_v2$logsvalue,xlab="log Quanterix",ylab="log Siemens")
densityPlot(data_v2$logqvalue,xlab="log Quanterix")
densityPlot(data_v2$logsvalue,xlab="log Simens")
data9<-data_v2[-c(1340,845,452,1394,1420,1088,556,1753,1756), ]
m9=gamlss(logsvalue~fk(logqvalue,degree=1,start=median(logqvalue)),data=data9,trace=F)
summary(m9)
## ******************************************************************
## Family: c("NO", "Normal")
##
## Call: gamlss(formula = logsvalue ~ fk(logqvalue, degree = 1,
## start = median(logqvalue)), data = data9, trace = F)
##
## Fitting method: RS()
##
## ------------------------------------------------------------------
## Mu link function: identity
## Mu Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.07822 0.00212 508.7 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## ------------------------------------------------------------------
## Sigma link function: log
## Sigma Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.35260 0.01576 -149.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## ------------------------------------------------------------------
## NOTE: Additive smoothing terms exist in the formulas:
## i) Std. Error for smoothers are for the linear effect only.
## ii) Std. Error for the linear terms maybe are not accurate.
## ------------------------------------------------------------------
## No. of observations in the fit: 2014
## Degrees of Freedom for the fit: 5
## Residual Deg. of Freedom: 2009
## at cycle: 2
##
## Global Deviance: -3760.787
## AIC: -3750.787
## SBC: -3722.747
## ******************************************************************
getSmo(m9)
##
## Call: fitFreeKnots(y = y, x = xvar, weights = w, degree = degree,
## knots = lambda, fixed = control$fixed, base = control$base)
##
## Coefficients:
## (Intercept) x XatBP1
## -0.6065 0.5728 0.3186
## Estimated Knots:
## BP1
## 1.157
plot(logsvalue~logqvalue,data=data9)
lines(data9$logqvalue[order(data9$logqvalue)],fitted(m9)[order(data9$logqvalue)],col=2)
abline(v=knots(getSmo(m9)),col=3)
The model will cut on 1.157. When logQ<1.157,logS=1.0782-0.6065+0.5728logQ=0.4717+0.5728logQ When logQ>=1.157,logS=0.4717+0.5728logQ+0.3186(logQ-1.157)=0.1031+0.8914logQ
plot(m9)
## ******************************************************************
## Summary of the Quantile Residuals
## mean = -1.137725e-15
## variance = 1.000497
## coef. of skewness = -0.05110977
## coef. of kurtosis = 3.067327
## Filliben correlation coefficient = 0.9990816
## ******************************************************************
data10<-data_v2[-c(423,2013,845,1340,788,452,1394,1420,1924,1753,474),]
m10<-lms(svalue,qvalue,data=data10,k=2,trans.x = T,cent=c(5,25,50,75,95))
## *** Checking for transformation for x ***
## Warning in additive.fit(x = X, y = wv, w = wt * w, s = s, who = who,
## smooth.frame, : additive.fit convergence not obtained in 30 iterations
## Warning in additive.fit(x = X, y = wv, w = wt * w, s = s, who = who,
## smooth.frame, : additive.fit convergence not obtained in 30 iterations
## Warning in additive.fit(x = X, y = wv, w = wt * w, s = s, who = who,
## smooth.frame, : additive.fit convergence not obtained in 30 iterations
## *** power parameters 0.8639339 ***
## *** Initial fit***
## GAMLSS-RS iteration 1: Global Deviance = 10566.88
## GAMLSS-RS iteration 2: Global Deviance = 10566.88
## *** Fitting BCCGo ***
## GAMLSS-RS iteration 1: Global Deviance = 9645.949
## GAMLSS-RS iteration 2: Global Deviance = 9600.755
## GAMLSS-RS iteration 3: Global Deviance = 9600.105
## GAMLSS-RS iteration 4: Global Deviance = 9600.097
## GAMLSS-RS iteration 5: Global Deviance = 9600.098
## *** Fitting BCPEo ***
## GAMLSS-RS iteration 1: Global Deviance = 9764.144
## GAMLSS-RS iteration 2: Global Deviance = 9595.839
## GAMLSS-RS iteration 3: Global Deviance = 9593.68
## GAMLSS-RS iteration 4: Global Deviance = 9593.355
## GAMLSS-RS iteration 5: Global Deviance = 9593.281
## GAMLSS-RS iteration 6: Global Deviance = 9593.254
## GAMLSS-RS iteration 7: Global Deviance = 9593.247
## GAMLSS-RS iteration 8: Global Deviance = 9593.246
## *** Fitting BCTo ***
## GAMLSS-RS iteration 1: Global Deviance = 9594.343
## GAMLSS-RS iteration 2: Global Deviance = 9577.299
## GAMLSS-RS iteration 3: Global Deviance = 9576.248
## GAMLSS-RS iteration 4: Global Deviance = 9576.097
## GAMLSS-RS iteration 5: Global Deviance = 9576.089
## GAMLSS-RS iteration 6: Global Deviance = 9576.097
## GAMLSS-RS iteration 7: Global Deviance = 9576.105
## GAMLSS-RS iteration 8: Global Deviance = 9576.111
## GAMLSS-RS iteration 9: Global Deviance = 9576.113
## GAMLSS-RS iteration 10: Global Deviance = 9576.114
## GAMLSS-RS iteration 11: Global Deviance = 9576.115
## % of cases below 4.717731 centile is 5.019881
## % of cases below 24.23895 centile is 25
## % of cases below 50.55349 centile is 50
## % of cases below 76.28243 centile is 75
## % of cases below 93.97069 centile is 94.98012
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values
The model fail to converge in limited steps. Model fit is obviously insensible for Quanterix value greater than 120 .Try to limit the Quanterix range to <=120
data10<-data_v2[-c(423,2013,845,1340,788,452,1394,1420,1924,1753,474,358,599,646,1172),]
m11<-lms(svalue,qvalue,data=data10,k=2,trans.x = T,cent=c(5,25,50,75,95))
## *** Checking for transformation for x ***
## *** power parameters 0.9325167 ***
## *** Initial fit***
## GAMLSS-RS iteration 1: Global Deviance = 10545.95
## GAMLSS-RS iteration 2: Global Deviance = 10545.95
## *** Fitting BCCGo ***
## GAMLSS-RS iteration 1: Global Deviance = 9612.369
## GAMLSS-RS iteration 2: Global Deviance = 9567.087
## GAMLSS-RS iteration 3: Global Deviance = 9566.397
## GAMLSS-RS iteration 4: Global Deviance = 9566.378
## GAMLSS-RS iteration 5: Global Deviance = 9566.376
## GAMLSS-RS iteration 6: Global Deviance = 9566.376
## *** Fitting BCPEo ***
## GAMLSS-RS iteration 1: Global Deviance = 9730.048
## GAMLSS-RS iteration 2: Global Deviance = 9561.651
## GAMLSS-RS iteration 3: Global Deviance = 9559.416
## GAMLSS-RS iteration 4: Global Deviance = 9559.184
## GAMLSS-RS iteration 5: Global Deviance = 9559.137
## GAMLSS-RS iteration 6: Global Deviance = 9559.121
## GAMLSS-RS iteration 7: Global Deviance = 9559.118
## GAMLSS-RS iteration 8: Global Deviance = 9559.117
## GAMLSS-RS iteration 9: Global Deviance = 9559.116
## *** Fitting BCTo ***
## GAMLSS-RS iteration 1: Global Deviance = 9569.332
## GAMLSS-RS iteration 2: Global Deviance = 9552.411
## GAMLSS-RS iteration 3: Global Deviance = 9550.693
## GAMLSS-RS iteration 4: Global Deviance = 9550.378
## GAMLSS-RS iteration 5: Global Deviance = 9550.352
## GAMLSS-RS iteration 6: Global Deviance = 9550.377
## GAMLSS-RS iteration 7: Global Deviance = 9550.394
## GAMLSS-RS iteration 8: Global Deviance = 9550.403
## GAMLSS-RS iteration 9: Global Deviance = 9550.413
## GAMLSS-RS iteration 10: Global Deviance = 9550.418
## GAMLSS-RS iteration 11: Global Deviance = 9550.421
## GAMLSS-RS iteration 12: Global Deviance = 9550.423
## GAMLSS-RS iteration 13: Global Deviance = 9550.424
## GAMLSS-RS iteration 14: Global Deviance = 9550.425
## % of cases below 4.867578 centile is 5.02988
## % of cases below 24.09728 centile is 25
## % of cases below 50.69009 centile is 50
## % of cases below 76.13163 centile is 75
## % of cases below 94.05664 centile is 94.97012
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values
m11$power
## [1] 0.9325167
wp(m11,xvar=data10$qvalue,n.inter = 5)
## number of missing points from plot= 0 out of 403
## number of missing points from plot= 1 out of 400
## Warning in panel(x[id], y[id], col = col[id], pch = pch[id], ...): Some points are missed out
## increase the y limits using ylim.worm
## number of missing points from plot= 0 out of 409
## number of missing points from plot= 1 out of 396
## Warning in panel(x[id], y[id], col = col[id], pch = pch[id], ...): Some points are missed out
## increase the y limits using ylim.worm
## number of missing points from plot= 1 out of 400
## Warning in panel(x[id], y[id], col = col[id], pch = pch[id], ...): Some points are missed out
## increase the y limits using ylim.worm
Diagnostic plot is OK but not perfect. So this quantile regression model on y^0.9 is reasonable but very sensitive to the X-variable range.
First, the validation result using additional data_v2 on models modeled from data_v1 show that Free knot model and Quantile regression model are good. Then both model are built based on complete data_v2. Free knot model works still well while Quantile regeression model are very sensitive to the model parameter and the range of X-variable. Free knot model \[logS=0.4717+0.5725\times logQ,\ if logQ<1.157\], \[logS=0.1031+0.8914\times logQ, \ logQ>=1.157\] are applicable even for Quanterix value >400. Quantile regression model \[S^{0.93}=2.0411+0.1050pb(Q)\] are applicable when Quanterix <=120.