library(foreign)
data=read.dta("C:/Users/BINH THANG-TRAN/Dropbox/PhD/Research/PhD thesis/Methods/morkov/Utility_study/Graphs/data_5DEC2019.dta")
require(ggplot2)
## Loading required package: ggplot2
require(GGally)
## Loading required package: GGally
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
require(VGAM)
## Loading required package: VGAM
## Loading required package: stats4
## Loading required package: splines
library(ggplot2)
library(grid)
library(gridExtra)
# histogram of the EQ-5D-5L utility score with a normal density overlay
hist(data$score2sh, freq=FALSE, col="gray", xlab="EQ-5D-5L utility score", main="Distribution of EQ-5D-5L utility scores")
curve(dnorm(x, mean=mean(data$score2sh), sd=sd(data$score2sh)), add=TRUE, col="red") # normal density line
# histogram of the VAS score with a normal density overlay
hist(data$c2, freq=FALSE, col="gray", xlab="VAS score", main="Distribution of VAS scores")
curve(dnorm(x, mean=mean(data$c2), sd=sd(data$c2)), add=TRUE, col="red") # normal density line
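A ggplot2 version of the same histogram can also be sketched, since ggplot2 is already loaded; the bin width below is an assumed choice and the labels mirror those used above.
# ggplot2 sketch: histogram of the utility score with a normal density overlay
# (binwidth of 0.05 is an assumption; adjust as needed)
ggplot(data, aes(x = score2sh)) +
  geom_histogram(aes(y = ..density..), binwidth = 0.05, fill = "gray", colour = "black") +
  stat_function(fun = dnorm,
                args = list(mean = mean(data$score2sh, na.rm = TRUE),
                            sd = sd(data$score2sh, na.rm = TRUE)),
                colour = "red") +
  labs(x = "EQ-5D-5L utility score", y = "Density")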
summary(m <- vglm(score2sh ~ factor(a3_gioi)+ factor(a7honnhan1) + factor(age_code)+ factor(a41_edu1)+ factor(job) + factor(a6_kinhte)+ factor(a8benh_code) + factor(timetoK) + factor(a11_stage)+ factor(tr_status), tobit(Lower = -0.566, Upper = 1), data = data))
##
## Call:
## vglm(formula = score2sh ~ factor(a3_gioi) + factor(a7honnhan1) +
## factor(age_code) + factor(a41_edu1) + factor(job) + factor(a6_kinhte) +
## factor(a8benh_code) + factor(timetoK) + factor(a11_stage) +
## factor(tr_status), family = tobit(Lower = -0.566, Upper = 1),
## data = data)
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## mu -3.9495 -0.5044 0.06878 0.572947 3.574
## loglink(sd) -0.9642 -0.7010 -0.49292 0.007119 10.383
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept):1 0.692933 0.181448 3.819 0.000134 ***
## (Intercept):2 -1.502176 0.052802 -28.449 < 2e-16 ***
## factor(a3_gioi)2 0.025408 0.034496 0.737 0.461389
## factor(a7honnhan1)Married -0.053661 0.056594 -0.948 0.343046
## factor(age_code)2 0.253854 0.098222 2.584 0.009752 **
## factor(age_code)3 0.327966 0.093170 3.520 0.000431 ***
## factor(age_code)4 0.355364 0.094874 3.746 0.000180 ***
## factor(age_code)5 0.204475 0.093471 2.188 0.028700 *
## factor(a41_edu1)2 0.077719 0.042065 1.848 0.064663 .
## factor(a41_edu1)3 0.273611 0.074105 3.692 0.000222 ***
## factor(a41_edu1)4 0.134380 0.050682 2.651 0.008015 **
## factor(a41_edu1)5 0.213319 0.076055 2.805 0.005035 **
## factor(job)2 -0.019298 0.057824 -0.334 0.738586
## factor(job)3 -0.061382 0.040944 -1.499 0.133830
## factor(job)4 -0.067976 0.056469 -1.204 0.228674
## factor(a6_kinhte)2 -0.074705 0.144871 -0.516 0.606090
## factor(a6_kinhte)3 -0.038010 0.144893 -0.262 0.793064
## factor(a8benh_code)C20 -0.047294 0.035686 -1.325 0.185077
## factor(timetoK)1 -0.003268 0.048562 -0.067 0.946352
## factor(timetoK)2 0.065508 0.067747 0.967 0.333566
## factor(a11_stage)2 -0.303298 0.076537 -3.963 7.41e-05 ***
## factor(a11_stage)3 -0.305307 0.067528 -4.521 6.15e-06 ***
## factor(a11_stage)4 -0.456580 0.075180 -6.073 1.25e-09 ***
## factor(a11_stage)NA -0.369006 0.174929 -2.109 0.034904 *
## factor(tr_status)2 -0.054429 0.050821 -1.071 0.284170
## factor(tr_status)3 -0.113894 0.048215 -2.362 0.018166 *
## factor(tr_status)4 0.039229 0.065526 0.599 0.549385
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Names of linear predictors: mu, loglink(sd)
##
## Log-likelihood: 12.8071 on 367 degrees of freedom
##
## Number of Fisher scoring iterations: 8
##
## No Hauck-Donner effect found in any of the estimates
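Because the loglink(sd) part of the model is intercept-only here, the estimated standard deviation of the latent outcome can be recovered by back-transforming (Intercept):2; a minimal sketch:
# back-transform the second intercept to the sd scale
# (valid here because no covariates act on the loglink(sd) predictor)
exp(coef(m)["(Intercept):2"])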
# recompute two-sided p-values from the z values using a t reference distribution
ctable <- coef(summary(m))
pvals <- 2 * pt(abs(ctable[, "z value"]), df.residual(m), lower.tail = FALSE)
cbind(ctable, pvals)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept):1 0.692932565 0.18144794 3.81890563 1.340450e-04
## (Intercept):2 -1.502176412 0.05280209 -28.44918447 4.987603e-178
## factor(a3_gioi)2 0.025408286 0.03449580 0.73656179 4.613889e-01
## factor(a7honnhan1)Married -0.053660906 0.05659450 -0.94816470 3.430456e-01
## factor(age_code)2 0.253854430 0.09822216 2.58449248 9.752243e-03
## factor(age_code)3 0.327966306 0.09317027 3.52007463 4.314254e-04
## factor(age_code)4 0.355364000 0.09487391 3.74564526 1.799307e-04
## factor(age_code)5 0.204475311 0.09347097 2.18758084 2.870015e-02
## factor(a41_edu1)2 0.077719213 0.04206532 1.84758416 6.466252e-02
## factor(a41_edu1)3 0.273610682 0.07410506 3.69219948 2.223230e-04
## factor(a41_edu1)4 0.134379855 0.05068150 2.65145753 8.014519e-03
## factor(a41_edu1)5 0.213318702 0.07605495 2.80479721 5.034825e-03
## factor(job)2 -0.019297602 0.05782450 -0.33372710 7.385855e-01
## factor(job)3 -0.061381853 0.04094395 -1.49916798 1.338301e-01
## factor(job)4 -0.067976248 0.05646894 -1.20378125 2.286741e-01
## factor(a6_kinhte)2 -0.074704751 0.14487137 -0.51566264 6.060900e-01
## factor(a6_kinhte)3 -0.038010419 0.14489331 -0.26233384 7.930641e-01
## factor(a8benh_code)C20 -0.047294397 0.03568627 -1.32528265 1.850774e-01
## factor(timetoK)1 -0.003267649 0.04856151 -0.06728886 9.463517e-01
## factor(timetoK)2 0.065507985 0.06774656 0.96695659 3.335657e-01
## factor(a11_stage)2 -0.303297682 0.07653725 -3.96274619 7.409254e-05
## factor(a11_stage)3 -0.305306829 0.06752785 -4.52119846 6.149050e-06
## factor(a11_stage)4 -0.456580246 0.07518032 -6.07313513 1.254368e-09
## factor(a11_stage)NA -0.369006408 0.17492870 -2.10946753 3.490425e-02
## factor(tr_status)2 -0.054428904 0.05082074 -1.07099791 2.841704e-01
## factor(tr_status)3 -0.113893750 0.04821463 -2.36222398 1.816566e-02
## factor(tr_status)4 0.039229439 0.06552643 0.59868120 5.493855e-01
## pvals
## (Intercept):1 1.573363e-04
## (Intercept):2 7.452985e-95
## factor(a3_gioi)2 4.618594e-01
## factor(a7honnhan1)Married 3.436695e-01
## factor(age_code)2 1.013767e-02
## factor(age_code)3 4.856961e-04
## factor(age_code)4 2.088602e-04
## factor(age_code)5 2.933046e-02
## factor(a41_edu1)2 6.546708e-02
## factor(a41_edu1)3 2.560797e-04
## factor(a41_edu1)4 8.361924e-03
## factor(a41_edu1)5 5.302765e-03
## factor(job)2 7.387761e-01
## factor(job)3 1.346897e-01
## factor(job)4 2.294500e-01
## factor(a6_kinhte)2 6.064005e-01
## factor(a6_kinhte)3 7.932113e-01
## factor(a8benh_code)C20 1.859019e-01
## factor(timetoK)1 9.463884e-01
## factor(timetoK)2 3.342025e-01
## factor(a11_stage)2 8.904617e-05
## factor(a11_stage)3 8.309508e-06
## factor(a11_stage)4 3.135761e-09
## factor(a11_stage)NA 3.558095e-02
## factor(tr_status)2 2.848742e-01
## factor(tr_status)3 1.868696e-02
## factor(tr_status)4 5.497548e-01
# refit without CRC stage and compare the two models with a likelihood-ratio test
m2 <- vglm(score2sh ~ factor(a3_gioi)+ factor(a7honnhan1) + factor(age_code)+ factor(a41_edu1)+ factor(job) + factor(a6_kinhte)+ factor(a8benh_code) + factor(timetoK) + factor(tr_status), tobit(Lower = -0.566, Upper = 1), data = data)
(p <- pchisq(2 * (logLik(m) - logLik(m2)), df = 2, lower.tail = FALSE))
## [1] 1.600392e-08
The likelihood-ratio test with two degrees of freedom gives a p-value of 1.600392e-08, indicating that the overall effect of CRC stage on the utility score is statistically significant.
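The same nested-model comparison can also be run with VGAM's lrtest() helper, which determines the degrees of freedom from the number of coefficients dropped; a minimal sketch (output omitted):
# likelihood-ratio test of the nested Tobit models via VGAM's lrtest()
lrtest(m, m2)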
Below we calculate the lower and upper limits of the 95% confidence intervals for the coefficients.
# Wald 95% confidence intervals from the estimates and their standard errors
b <- coef(m)
se <- sqrt(diag(vcov(m)))
cbind(LL = b - qnorm(0.975) * se, UL = b + qnorm(0.975) * se)
## LL UL
## (Intercept):1 0.337301135 1.04856400
## (Intercept):2 -1.605666607 -1.39868622
## factor(a3_gioi)2 -0.042202234 0.09301881
## factor(a7honnhan1)Married -0.164584086 0.05726227
## factor(age_code)2 0.061342537 0.44636632
## factor(age_code)3 0.145355931 0.51057668
## factor(age_code)4 0.169414559 0.54131344
## factor(age_code)5 0.021275569 0.38767505
## factor(a41_edu1)2 -0.004727293 0.16016572
## factor(a41_edu1)3 0.128367424 0.41885394
## factor(a41_edu1)4 0.035045933 0.23371378
## factor(a41_edu1)5 0.064253745 0.36238366
## factor(job)2 -0.132631537 0.09403633
## factor(job)3 -0.141630513 0.01886681
## factor(job)4 -0.178653332 0.04270084
## factor(a6_kinhte)2 -0.358647410 0.20923791
## factor(a6_kinhte)3 -0.321996096 0.24597526
## factor(a8benh_code)C20 -0.117238205 0.02264941
## factor(timetoK)1 -0.098446456 0.09191116
## factor(timetoK)2 -0.067272837 0.19828881
## factor(a11_stage)2 -0.453307929 -0.15328744
## factor(a11_stage)3 -0.437658990 -0.17295467
## factor(a11_stage)4 -0.603930967 -0.30922953
## factor(a11_stage)NA -0.711860356 -0.02615246
## factor(tr_status)2 -0.154035720 0.04517791
## factor(tr_status)3 -0.208392684 -0.01939482
## factor(tr_status)4 -0.089199995 0.16765887
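Recent VGAM releases also provide a confint() method for vglm objects, which should reproduce these Wald intervals directly; a sketch, assuming such a method is available in the installed version:
# Wald confidence intervals via the confint() method for vglm objects
confint(m, level = 0.95)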
# fitted means plus response and Pearson residuals for model diagnostics
data$yhat <- fitted(m)[,1]
data$rr <- resid(m, type = "response")
data$rp <- resid(m, type = "pearson")[,1]
par(mfcol = c(2, 3))
with(data, {
plot(yhat, rr, main = "Fitted vs Residuals")
qqnorm(rr)
plot(yhat, rp, main = "Fitted vs Pearson Residuals")
qqnorm(rp)
plot(score2sh, rp, main = "Actual vs Pearson Residuals")
plot(score2sh, yhat, main = "Actual vs Fitted")
})
The graph in the bottom right plots the predicted (fitted) values against the observed values. This is particularly useful when comparing competing models. We can also calculate the correlation between the two, as well as the squared correlation, to get a sense of how accurately the model predicts the data and how much of the variance in the outcome it accounts for.
# correlation
(r <- with(data, cor(score2sh, yhat)))
## [1] 0.5823048
# variance accounted for
r^2
## [1] 0.3390789
The correlation between the predicted and observed values of score2sh is 0.5823. Squaring this value gives the multiple squared correlation, which indicates that the predicted values share about 34% of their variance with the observed EQ-5D-5L utility scores (score2sh).
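Beyond the squared correlation, simple error summaries of the fitted values give another view of predictive accuracy; a minimal sketch using the yhat column created above:
# root-mean-square error and mean absolute error of the fitted utility scores
with(data, c(RMSE = sqrt(mean((score2sh - yhat)^2, na.rm = TRUE)),
             MAE = mean(abs(score2sh - yhat), na.rm = TRUE)))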
# correlation between the EQ-5D-5L utility score and the VAS
(r <- with(data, cor(score2sh, c2)))
## [1] 0.4322855
grob1 = grobTree(textGrob(paste("Pearson Correlation : ", round(cor(data$score2sh, data$c2), 4) ), x = 0.63, y = 0.97, hjust = 0, gp = gpar(col = "red", fontsize = 11, fontface = "bold")))
g1 = ggplot(data, aes(x=c2, y=score2sh)) + geom_point() + ggtitle("EQ-5D-5L vs VAS") + geom_smooth(method=lm, se=FALSE) + scale_x_continuous(name = "VAS", limits = c(0, 100), breaks = seq(0, 100, 10)) + scale_y_continuous(name = "EQ-5D-5L", limits = c(-1, 1), breaks = seq(-1, 1, 0.5)) + annotation_custom(grob1) + theme(plot.title = element_text(hjust = 0.5))
g1
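A significance test and 95% confidence interval for the Pearson correlation annotated on this plot can be obtained with base R's cor.test(); a sketch (output omitted):
# Pearson correlation between the utility score and the VAS, with a 95% CI
cor.test(data$score2sh, data$c2, method = "pearson")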
#Binh Thang (R)
# histogram of the VAS score with a normal density overlay
hist(data$c2, freq=FALSE, col="gray", xlab="VAS score", main="Distribution of VAS scores")
curve(dnorm(x, mean=mean(data$c2), sd=sd(data$c2)), add=TRUE, col="red") # normal density line
summary(m <- vglm(c2 ~ factor(a3_gioi)+ factor(a7honnhan1) + factor(age_code)+ factor(a41_edu1)+ factor(job) + factor(a6_kinhte)+ factor(a8benh_code) + factor(timetoK) + factor(a11_stage)+ factor(tr_status), tobit(Lower = 0, Upper = 100), data = data))
##
## Call:
## vglm(formula = c2 ~ factor(a3_gioi) + factor(a7honnhan1) + factor(age_code) +
## factor(a41_edu1) + factor(job) + factor(a6_kinhte) + factor(a8benh_code) +
## factor(timetoK) + factor(a11_stage) + factor(tr_status),
## family = tobit(Lower = 0, Upper = 100), data = data)
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## mu -3.7737 -0.5794 -0.03367 0.6447 3.187
## loglink(sd) -0.7169 -0.6403 -0.44150 0.0582 9.376
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept):1 57.50393 9.41986 6.105 1.03e-09 ***
## (Intercept):2 2.46230 0.05064 48.624 < 2e-16 ***
## factor(a3_gioi)2 -1.15646 1.80775 -0.640 0.522350
## factor(a7honnhan1)Married 1.90209 2.96707 0.641 0.521479
## factor(age_code)2 0.06091 5.14903 0.012 0.990562
## factor(age_code)3 4.95729 4.88637 1.015 0.310338
## factor(age_code)4 5.11058 4.96943 1.028 0.303760
## factor(age_code)5 -3.45804 4.90356 -0.705 0.480679
## factor(a41_edu1)2 1.90411 2.21163 0.861 0.389264
## factor(a41_edu1)3 8.51597 3.85093 2.211 0.027008 *
## factor(a41_edu1)4 -1.55091 2.65608 -0.584 0.559282
## factor(a41_edu1)5 6.73020 3.98095 1.691 0.090913 .
## factor(job)2 5.31721 3.03068 1.754 0.079351 .
## factor(job)3 3.93718 2.14712 1.834 0.066698 .
## factor(job)4 2.54339 2.96002 0.859 0.390205
## factor(a6_kinhte)2 17.09298 7.52509 2.271 0.023119 *
## factor(a6_kinhte)3 17.43589 7.53071 2.315 0.020596 *
## factor(a8benh_code)C20 -2.37479 1.87069 -1.269 0.204272
## factor(timetoK)1 -2.44672 2.54226 -0.962 0.335841
## factor(timetoK)2 -7.69443 3.54829 -2.168 0.030121 *
## factor(a11_stage)2 -11.12852 3.97064 -2.803 0.005068 **
## factor(a11_stage)3 -11.53994 3.49377 -3.303 0.000957 ***
## factor(a11_stage)4 -17.52035 3.89832 -4.494 6.98e-06 ***
## factor(a11_stage)NA -7.60723 9.18515 -0.828 0.407552
## factor(tr_status)2 2.94708 2.67204 1.103 0.270056
## factor(tr_status)3 -8.67330 2.53003 -3.428 0.000608 ***
## factor(tr_status)4 -4.27483 3.43543 -1.244 0.213375
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Names of linear predictors: mu, loglink(sd)
##
## Log-likelihood: -762.1313 on 367 degrees of freedom
##
## Number of Fisher scoring iterations: 5
##
## No Hauck-Donner effect found in any of the estimates
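Before interpreting these Tobit coefficients, it can help to check how much of the VAS is actually censored at the bounds specified above (0 and 100); a minimal sketch:
# proportion of VAS observations sitting at the lower and upper Tobit bounds
with(data, c(at_lower = mean(c2 <= 0, na.rm = TRUE),
             at_upper = mean(c2 >= 100, na.rm = TRUE)))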
ctable <- coef(summary(m))
pvals <- 2 * pt(abs(ctable[, "z value"]), df.residual(m), lower.tail = FALSE)
cbind(ctable, pvals)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept):1 57.50392708 9.41986006 6.1045415 1.030962e-09
## (Intercept):2 2.46230477 0.05064015 48.6235639 0.000000e+00
## factor(a3_gioi)2 -1.15646418 1.80774677 -0.6397269 5.223502e-01
## factor(a7honnhan1)Married 1.90209307 2.96706961 0.6410679 5.214786e-01
## factor(age_code)2 0.06091049 5.14903234 0.0118295 9.905616e-01
## factor(age_code)3 4.95728559 4.88637410 1.0145121 3.103385e-01
## factor(age_code)4 5.11057517 4.96942616 1.0284035 3.037601e-01
## factor(age_code)5 -3.45804309 4.90355520 -0.7052114 4.806787e-01
## factor(a41_edu1)2 1.90411487 2.21163491 0.8609535 3.892636e-01
## factor(a41_edu1)3 8.51597062 3.85093327 2.2114044 2.700785e-02
## factor(a41_edu1)4 -1.55090572 2.65607672 -0.5839085 5.592819e-01
## factor(a41_edu1)5 6.73020051 3.98094672 1.6906030 9.091265e-02
## factor(job)2 5.31720710 3.03067677 1.7544620 7.935138e-02
## factor(job)3 3.93717937 2.14711818 1.8337041 6.669793e-02
## factor(job)4 2.54338756 2.96002162 0.8592463 3.902047e-01
## factor(a6_kinhte)2 17.09298391 7.52509481 2.2714643 2.311889e-02
## factor(a6_kinhte)3 17.43589104 7.53070805 2.3153057 2.059621e-02
## factor(a8benh_code)C20 -2.37479126 1.87068994 -1.2694735 2.042722e-01
## factor(timetoK)1 -2.44671564 2.54226500 -0.9624157 3.358409e-01
## factor(timetoK)2 -7.69442536 3.54828702 -2.1684901 3.012142e-02
## factor(a11_stage)2 -11.12852086 3.97063611 -2.8027048 5.067603e-03
## factor(a11_stage)3 -11.53994496 3.49376511 -3.3030111 9.565260e-04
## factor(a11_stage)4 -17.52034992 3.89831839 -4.4943353 6.978759e-06
## factor(a11_stage)NA -7.60722979 9.18514752 -0.8282099 4.075517e-01
## factor(tr_status)2 2.94708004 2.67203999 1.1029326 2.700564e-01
## factor(tr_status)3 -8.67329725 2.53003280 -3.4281363 6.077403e-04
## factor(tr_status)4 -4.27483238 3.43542603 -1.2443384 2.133751e-01
## pvals
## (Intercept):1 2.624130e-09
## (Intercept):2 4.946868e-162
## factor(a3_gioi)2 5.227493e-01
## factor(a7honnhan1)Married 5.218787e-01
## factor(age_code)2 9.905681e-01
## factor(age_code)3 3.110068e-01
## factor(age_code)4 3.044374e-01
## factor(age_code)5 4.811260e-01
## factor(a41_edu1)2 3.898258e-01
## factor(a41_edu1)3 2.762359e-02
## factor(a41_edu1)4 5.596406e-01
## factor(a41_edu1)5 9.176171e-02
## factor(job)2 8.018591e-02
## factor(job)3 6.750757e-02
## factor(job)4 3.907655e-01
## factor(a6_kinhte)2 2.369732e-02
## factor(a6_kinhte)3 2.114708e-02
## factor(a8benh_code)C20 2.050767e-01
## factor(timetoK)1 3.364745e-01
## factor(timetoK)2 3.076328e-02
## factor(a11_stage)2 5.336550e-03
## factor(a11_stage)3 1.050562e-03
## factor(a11_stage)4 9.368097e-06
## factor(a11_stage)NA 4.080899e-01
## factor(tr_status)2 2.707791e-01
## factor(tr_status)3 6.767025e-04
## factor(tr_status)4 2.141692e-01
m2 <- vglm(c2 ~ factor(a3_gioi)+ factor(a7honnhan1) + factor(age_code)+ factor(a41_edu1)+ factor(job) + factor(a6_kinhte)+ factor(a8benh_code) + factor(timetoK) + factor(tr_status), tobit(Lower = 0, Upper = 100), data = data)
(p <- pchisq(2 * (logLik(m) - logLik(m2)), df = 2, lower.tail = FALSE))
## [1] 5.002571e-05
The LRT with two degrees of freedom gives a p-value of 5.002571e-05, indicating that the overall effect of CRC stage on the VAS score is statistically significant.
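An information-criterion comparison of the two VAS models points in the same direction; a sketch using the AIC method that VGAM provides for vglm objects (smaller is better):
# compare the full and reduced VAS models by AIC
AIC(m)
AIC(m2)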
Below we calculate the lower and upper limits of the 95% confidence intervals for the coefficients.
b <- coef(m)
se <- sqrt(diag(vcov(m)))
cbind(LL = b - qnorm(0.975) * se, UL = b + qnorm(0.975) * se)
## LL UL
## (Intercept):1 39.0413406 75.9665135
## (Intercept):2 2.3630519 2.5615576
## factor(a3_gioi)2 -4.6995827 2.3866544
## factor(a7honnhan1)Married -3.9132565 7.7174426
## factor(age_code)2 -10.0310074 10.1528284
## factor(age_code)3 -4.6198317 14.5344028
## factor(age_code)4 -4.6293211 14.8504715
## factor(age_code)5 -13.0688347 6.1527485
## factor(a41_edu1)2 -2.4306099 6.2388396
## factor(a41_edu1)3 0.9682801 16.0636611
## factor(a41_edu1)4 -6.7567204 3.6549090
## factor(a41_edu1)5 -1.0723117 14.5327127
## factor(job)2 -0.6228102 11.2572244
## factor(job)3 -0.2710949 8.1454537
## factor(job)4 -3.2581482 8.3449233
## factor(a6_kinhte)2 2.3440691 31.8418987
## factor(a6_kinhte)3 2.6759745 32.1958076
## factor(a8benh_code)C20 -6.0412762 1.2916937
## factor(timetoK)1 -7.4294635 2.5360322
## factor(timetoK)2 -14.6489401 -0.7399106
## factor(a11_stage)2 -18.9108246 -3.3462171
## factor(a11_stage)3 -18.3875987 -4.6922912
## factor(a11_stage)4 -25.1609136 -9.8797863
## factor(a11_stage)NA -25.6097881 10.3953285
## factor(tr_status)2 -2.2900221 8.1841822
## factor(tr_status)3 -13.6320704 -3.7145241
## factor(tr_status)4 -11.0081437 2.4584789
data$yhat1 <- fitted(m)[,1]
data$rr <- resid(m, type = "response")
data$rp <- resid(m, type = "pearson")[,1]
par(mfcol = c(2, 3))
with(data, {
plot(yhat1, rr, main = "Fitted vs Residuals")
qqnorm(rr)
plot(yhat1, rp, main = "Fitted vs Pearson Residuals")
qqnorm(rp)
plot(c2, rp, main = "Actual vs Pearson Residuals")
plot(c2, yhat1, main = "Actual vs Fitted")
})
The graph in the bottom right plots the predicted (fitted) values against the observed values. This is particularly useful when comparing competing models. We can also calculate the correlation between the two, as well as the squared correlation, to get a sense of how accurately the model predicts the data and how much of the variance in the outcome it accounts for.
# correlation
(r <- with(data, cor(c2, yhat1)))
## [1] 0.5669197
# variance accounted for
r^2
## [1] 0.321398
The correlation between the predicted and observed values of c2 is 0.5669. Squaring this value gives the multiple squared correlation, which indicates that the predicted values share about 32% of their variance with the observed VAS scores (c2).
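As a closing summary, the fitted-versus-observed correlations of the two Tobit models can be placed side by side using the yhat and yhat1 columns created above; a minimal sketch:
# fitted-vs-observed correlation for the utility-score and VAS Tobit models
c(utility = with(data, cor(score2sh, yhat)),
  vas = with(data, cor(c2, yhat1)))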