packageVersion("gRbase")
## [1] '1.7.2'
packageVersion("gRain")
## [1] '1.2.4'
packageVersion("gRim")
## [1] '0.1.17'
source("http://bioconductor.org/biocLite.R");
## Bioconductor version 3.1 (BiocInstaller 1.18.5), ?biocLite for help
## A newer version of Bioconductor is available for this version of R,
## ?BiocUpgrade for help
#biocLite(c("graph","RBGL","Rgraphviz"))
#3 Conditional probability tables (CPTs)
#CPTs are just multiway arrays WITH dimnames attribute. For
#example p(tja):
library(gRain)
## Loading required package: gRbase
yn <- c("yes","no");
x <- c(5,95,1,99)
# Vanilla R
t.a <- array(x, dim=c(2,2),
dimnames=list(tub=yn,asia=yn))
t.a
## asia
## tub yes no
## yes 5 1
## no 95 99
# Alternative specification: parray() from gRbase
t.a <- parray(c("tub","asia"), levels=list(yn,yn),
values=x)
t.a
## asia
## tub yes no
## yes 5 1
## no 95 99
# with a formula interface
t.a <- parray(~tub:asia, levels=list(yn,yn), values=x)
t.a
## asia
## tub yes no
## yes 5 1
## no 95 99
# Alternative (partial) specification
t.a <- cptable(~tub | asia, values=c(5,95,1,99),
levels=yn)
t.a
## {v,pa(v)} : chr [1:2] "tub" "asia"
## <NA> <NA>
## yes 5 1
## no 95 99
#An introduction to the gRain package
#Specify chest clinic network. Can be done in many ways; one is
#from a list of CPTs:
library(gRain)
yn <- c("yes","no")
a <- cptable(~asia, values=c(1,99), levels=yn)
t.a <- cptable(~tub | asia, values=c(5,95,1,99), levels=yn)
s <- cptable(~smoke, values=c(5,5), levels=yn)
l.s <- cptable(~lung | smoke, values=c(1,9,1,99), levels=yn)
b.s <- cptable(~bronc | smoke, values=c(6,4,3,7), levels=yn)
e.lt <- cptable(~either | lung:tub,values=c(1,0,1,0,1,0,0,1),
levels=yn)
x.e <- cptable(~xray | either, values=c(98,2,5,95), levels=yn)
d.be <- cptable(~dysp | bronc:either, values=c(9,1,7,3,8,2,1,9),
levels=yn)
cpt.list <- compileCPT(list(a, t.a, s, l.s, b.s, e.lt, x.e, d.be))
cpt.list
## CPTspec with probabilities:
## P( asia )
## P( tub | asia )
## P( smoke )
## P( lung | smoke )
## P( bronc | smoke )
## P( either | lung tub )
## P( xray | either )
## P( dysp | bronc either )
cpt.list$asia
## asia
## yes no
## 0.01 0.99
cpt.list$tub
## asia
## tub yes no
## yes 0.05 0.01
## no 0.95 0.99
ftable(cpt.list$either, row.vars=1) # Notice: logical variable
## lung yes no
## tub yes no yes no
## either
## yes 1 1 1 0
## no 0 0 0 1
# Create network from CPT list:
bnet <- grain(cpt.list)
# Compile network (details follow)
bnet <- compile(bnet)
bnet
## Independence network: Compiled: TRUE Propagated: FALSE
## Nodes: chr [1:8] "asia" "tub" "smoke" "lung" "bronc" "either" ...
#Querying the network
# Query network to find marginal probabilities of diseases
querygrain(bnet, nodes=c("tub","lung","bronc"))
## $tub
## tub
## yes no
## 0.0104 0.9896
##
## $lung
## lung
## yes no
## 0.055 0.945
##
## $bronc
## bronc
## yes no
## 0.45 0.55
#6 Setting evidence
# Set evidence and query network again
bnet.ev<-setEvidence(bnet, nodes = c("asia","dysp"),
states = c("yes","yes"))
querygrain(bnet.ev, nodes=c("tub","lung","bronc"))
## $tub
## tub
## yes no
## 0.08775096 0.91224904
##
## $lung
## lung
## yes no
## 0.09952515 0.90047485
##
## $bronc
## bronc
## yes no
## 0.8114021 0.1885979
# Set additional evidence and query again
bnet.ev<-setEvidence(bnet.ev, nodes = "xray", states = "yes")
querygrain(bnet.ev, nodes=c("tub","lung","bronc"))
## $tub
## tub
## yes no
## 0.3917117 0.6082883
##
## $lung
## lung
## yes no
## 0.4442705 0.5557295
##
## $bronc
## bronc
## yes no
## 0.6288218 0.3711782
# Probability of observing the evidence (the normalizing constant)
pEvidence(bnet.ev)
## [1] 0.0009882268
# Get joint dist of tub, lung, bronc given evidence
x<-querygrain(bnet.ev, nodes=c("tub","lung","bronc"),
type="joint")
ftable(x, row.vars=1)
## lung yes no
## bronc yes no yes no
## tub
## yes 0.014056997 0.008156529 0.186757240 0.182740955
## no 0.267082934 0.154974048 0.160924606 0.025306692
# Get distribution of tub given lung, bronc and evidence
x<-querygrain(bnet.ev, nodes=c("tub","lung","bronc"),
type="conditional")
ftable(x, row.vars=1)
## bronc yes no
## tub yes no yes no
## lung
## yes 0.0500000 0.9500000 0.0500000 0.9500000
## no 0.5371498 0.4628502 0.8783611 0.1216389
# Remove evidence
bnet.ev<-retractEvidence(bnet.ev, nodes="asia")
bnet.ev
## Independence network: Compiled: TRUE Propagated: TRUE
## Nodes: chr [1:8] "asia" "tub" "smoke" "lung" "bronc" "either" ...
## Evidence:
## nodes is.hard.evidence hard.state
## 1 dysp TRUE yes
## 2 xray TRUE yes
## pEvidence: 0.070670
## collection of CPTs: p(v|pa(v))
cpt.list
## CPTspec with probabilities:
## P( asia )
## P( tub | asia )
## P( smoke )
## P( lung | smoke )
## P( bronc | smoke )
## P( either | lung tub )
## P( xray | either )
## P( dysp | bronc either )
## form joint p(V)= prod p(v|pa(v))
joint <- cpt.list$asia
for (i in 2:length(cpt.list)){
joint <- tableMult( joint, cpt.list[[i]] )
}
dim(joint)
## dysp bronc either xray lung tub smoke asia
## 2 2 2 2 2 2 2 2
head( as.data.frame.table( joint ) )
## dysp bronc either xray lung tub smoke asia Freq
## 1 yes yes yes yes yes yes yes yes 1.323e-05
## 2 no yes yes yes yes yes yes yes 1.470e-06
## 3 yes no yes yes yes yes yes yes 6.860e-06
## 4 no no yes yes yes yes yes yes 2.940e-06
## 5 yes yes no yes yes yes yes yes 0.000e+00
## 6 no yes no yes yes yes yes yes 0.000e+00
## form marginal p(a,b,d) by marginalization
marg <- tableMargin(joint, ~asia+bronc+dysp)
dim( marg )
## asia bronc dysp
## 2 2 2
ftable( marg )
## dysp yes no
## asia bronc
## yes yes 0.003652425 0.000847575
## no 0.000848950 0.004651050
## no yes 0.359932815 0.085567185
## no 0.071536410 0.472963590
## Set entries not consistent with asia=yes and dysp=yes
## equal to zero
marg <- tableSetSliceValue(marg, c("asia","dysp"), c("yes","yes"),
complement=T)
ftable(marg)
## dysp yes no
## asia bronc
## yes yes 0.003652425 0.000000000
## no 0.000848950 0.000000000
## no yes 0.000000000 0.000000000
## no 0.000000000 0.000000000
result <- tableMargin(marg, ~bronc);
result <- result / sum( result ); result
## bronc
## yes no
## 0.8114021 0.1885979
#8 Message passing a small example
require(gRbase); require(Rgraphviz)
## Loading required package: Rgraphviz
## Loading required package: graph
## Loading required package: grid
d<-dag( ~smoke + bronc|smoke + dysp|bronc ); plot(d)

library(gRain)
yn <- c("yes","no")
s <- parray("smoke", list(yn), c(.5, .5))
b.s <- parray(c("bronc","smoke"), list(yn,yn), c(6,4, 3,7))
d.b <- parray(c("dysp","bronc"), list(yn, yn), c(9,1, 2,8))
s; b.s; d.b
## smoke
## yes no
## 0.5 0.5
## smoke
## bronc yes no
## yes 6 3
## no 4 7
## bronc
## dysp yes no
## yes 9 2
## no 1 8
#p(s; b; d) = p(s)p(bjs)p(djb)
joint <- tableMult( tableMult(s, b.s), d.b) ; ftable(joint)
## smoke yes no
## dysp bronc
## yes yes 27.0 13.5
## no 4.0 7.0
## no yes 3.0 1.5
## no 16.0 28.0
dm <-moralize(d);
jtree<-ug(~smoke.bronc:bronc.dysp);
par(mfrow=c(1,3)); plot(d); plot(dm); plot(jtree)
par(mfrow=c(1,3)); plot(d); plot(dm); plot(jtree)

q1.sb <- tableMult(s, b.s); q1.sb
## smoke
## bronc yes no
## yes 3 1.5
## no 2 3.5
q2.bd <- d.b; q2.bd
## bronc
## dysp yes no
## yes 9 2
## no 1 8
#Collect Evidence
plot( jtree )

q1.b <- tableMargin(q1.sb, "bronc"); q1.b
## bronc
## yes no
## 4.5 5.5
q2.bd <- tableMult(q2.bd, q1.b); q2.bd
## dysp
## bronc yes no
## yes 40.5 4.5
## no 11.0 44.0
q1.sb <- tableDiv(q1.sb, q1.b); q1.sb
## smoke
## bronc yes no
## yes 0.6666667 0.3333333
## no 0.3636364 0.6363636
#Recall that the joint distribution is
joint
## , , smoke = yes
##
## bronc
## dysp yes no
## yes 27 4
## no 3 16
##
## , , smoke = no
##
## bronc
## dysp yes no
## yes 13.5 7
## no 1.5 28
q1.sb
## smoke
## bronc yes no
## yes 0.6666667 0.3333333
## no 0.3636364 0.6363636
tableMargin(joint, c("smoke","bronc"))
## bronc
## smoke yes no
## yes 30 20
## no 15 35
q2.bd
## dysp
## bronc yes no
## yes 40.5 4.5
## no 11.0 44.0
tableMargin(joint, c("bronc","dysp"))
## dysp
## bronc yes no
## yes 40.5 4.5
## no 11.0 44.0
tableMargin(q1.sb, "bronc") # or
## bronc
## yes no
## 1 1
tableMargin(q2.bd, "bronc")
## bronc
## yes no
## 45 55
#8.3 Setting evidence
#Next consider the case where we have the evidence that dysp=yes.
q1.sb <- tableMult(s, b.s)
q2.bd <- d.b
q2.bd <- tableSetSliceValue(q2.bd, "dysp", "yes", complement=T)#; q2.bronc??
# Repeat all the same steps as before
q1.b <- tableMargin(q1.sb, "bronc"); q1.b
## bronc
## yes no
## 4.5 5.5
q2.bd <- tableMult(q2.bd, q1.b); q2.bd
## dysp
## bronc yes no
## yes 40.5 0
## no 11.0 0
q1.sb <- tableDiv(q1.sb, q1.b); q1.sb
## smoke
## bronc yes no
## yes 0.6666667 0.3333333
## no 0.3636364 0.6363636
q2.b <- tableMargin(q2.bd, "bronc"); q2.b
## bronc
## yes no
## 40.5 11.0
q1.sb <- tableMult(q1.sb, q2.b); q1.sb
## smoke
## bronc yes no
## yes 27 13.5
## no 4 7.0
ftable( joint )
## smoke yes no
## dysp bronc
## yes yes 27.0 13.5
## no 4.0 7.0
## no yes 3.0 1.5
## no 16.0 28.0
q1.sb
## smoke
## bronc yes no
## yes 27 13.5
## no 4 7.0
tableMargin(joint, c("smoke","bronc"))
## bronc
## smoke yes no
## yes 30 20
## no 15 35
q2.bd
## dysp
## bronc yes no
## yes 40.5 0
## no 11.0 0
tableMargin(joint, c("bronc","dysp"))
## dysp
## bronc yes no
## yes 40.5 4.5
## no 11.0 44.0
##9 Message passing the bigger picture
#Moral graph: marry parents and drop directions:
par(mfrow=c(1,2)); plot(bnet$dag); plot(moralize(bnet$dag))

par(mfrow=c(1,2)); plot(moralize(bnet$dag));
plot(triangulate(moralize(bnet$dag)))

par(mfrow=c(1,2)); plot(bnet$ug); plot(jTree( bnet$ug ))

str( jTree( bnet$ug )$cliques )
## List of 6
## $ : chr [1:2] "asia" "tub"
## $ : chr [1:3] "either" "lung" "tub"
## $ : chr [1:3] "either" "lung" "bronc"
## $ : chr [1:3] "smoke" "lung" "bronc"
## $ : chr [1:3] "either" "dysp" "bronc"
## $ : chr [1:2] "either" "xray"
#10 Conditional independence
#Consider again the toy example:
plot(dag(~smoke+bronc|smoke+dysp|bronc))
#p(s; b; d) = p(s)p(b|s)p(d|b)
#Clear that s??djb under all these models:
par(mfrow = c(1,4))

plot(dag(~smoke+bronc|smoke+dysp|bronc))
plot(dag(~bronc+smoke|bronc+dysp|bronc))
plot(dag(~dysp+smoke|bronc+bronc|dysp))
plot(ug(~smoke:bronc+bronc:dysp))

plot(dag( ~smoke + dysp + bronc|smoke:dysp ))

#11 Towards data
#Building CPTs from data:
## Example: Simulated data from chest network
data(chestSim1000, package="gRbase")
head(chestSim1000)
## asia tub smoke lung bronc either xray dysp
## 1 no no no no yes no no yes
## 2 no no yes no yes no no yes
## 3 no no yes no no no no no
## 4 no no no no no no no no
## 5 no no yes no yes no no yes
## 6 no no yes yes yes yes yes yes
#11.1 Extracting CPTs
## Extract empirical distributions
s <- xtabs(~smoke, chestSim1000); s
## smoke
## yes no
## 465 535
b.s <- xtabs(~bronc+smoke, chestSim1000); b.s
## smoke
## bronc yes no
## yes 276 160
## no 189 375
d.b <- xtabs(~dysp+bronc, chestSim1000); d.b
## bronc
## dysp yes no
## yes 360 68
## no 76 496
## Normalize to CPTs if desired (not necessary because
## we can always normalize at the end)
s <- as.parray(s, normalize="first"); s
## smoke
## yes no
## 0.465 0.535
b.s <- as.parray(b.s, normalize="first"); b.s
## smoke
## bronc yes no
## yes 0.5935484 0.2990654
## no 0.4064516 0.7009346
d.b <- as.parray(d.b, normalize="first"); d.b
## bronc
## dysp yes no
## yes 0.8256881 0.1205674
## no 0.1743119 0.8794326
cpt.list <- compileCPT(list(s, b.s, d.b)); cpt.list
## CPTspec with probabilities:
## P( smoke )
## P( bronc | smoke )
## P( dysp | bronc )
net <- grain( cpt.list ); net
## Independence network: Compiled: FALSE Propagated: FALSE
## Nodes: chr [1:3] "smoke" "bronc" "dysp"
#But we could just as well extract CPTs for this model,
plot(dag(~bronc + smoke|bronc + dysp|bronc))

#in the sense that the joint distribution will become the same:
## Extract empirical distributions
b <- xtabs(~bronc, chestSim1000);
s.b <- xtabs(~smoke+bronc, chestSim1000);
d.b <- xtabs(~dysp+bronc, chestSim1000);
#Notice, that in this case
plot(dag( ~smoke + dysp + bronc|smoke:dysp ))

s <- xtabs(~smoke, chestSim1000);
d <- xtabs(~dysp, chestSim1000);
b.sd <- xtabs(~bronc+smoke+dysp, chestSim1000);
#11.2 Extracting clique marginals
#Alternatively, we consider the undirected graph
plot(ug( ~smoke:bronc+bronc:dysp ))

#We might as well extract clique marginals directly:
q1.sb <- xtabs(~smoke+bronc, data=chestSim1000); q1.sb
## bronc
## smoke yes no
## yes 276 189
## no 160 375
q2.db <- xtabs(~bronc+dysp, data=chestSim1000); q2.db
## dysp
## bronc yes no
## yes 360 76
## no 68 496
q1.sb <- tableDiv(q1.sb, tableMargin(q1.sb, ~smoke)); q1.sb
## bronc
## smoke yes no
## yes 0.5935484 0.4064516
## no 0.2990654 0.7009346
#12 Learning the model structure
data(lizardRAW, package="gRbase")
dim(lizardRAW)
## [1] 409 3
head(lizardRAW, 4)
## diam height species
## 1 >4 >4.75 dist
## 2 >4 >4.75 dist
## 3 <=4 <=4.75 anoli
## 4 >4 <=4.75 anoli
lizard<-xtabs(~., data=lizardRAW)
dim( lizard )
## [1] 2 2 2
ftable( lizard )
## species anoli dist
## diam height
## <=4 <=4.75 86 73
## >4.75 32 61
## >4 <=4.75 35 70
## >4.75 11 41
#One estimate of the probabilities is by the relative frquencies:
lizardProb <- lizard/sum(lizard); ftable(lizardProb)
## species anoli dist
## diam height
## <=4 <=4.75 0.21026895 0.17848411
## >4.75 0.07823961 0.14914425
## >4 <=4.75 0.08557457 0.17114914
## >4.75 0.02689487 0.10024450
tableMargin(lizard, ~height+species)
## species
## height anoli dist
## <=4.75 121 143
## >4.75 43 102
tableMargin(lizardProb, ~height+species)
## species
## height anoli dist
## <=4.75 0.2958435 0.3496333
## >4.75 0.1051345 0.2493888
lizard
## , , species = anoli
##
## height
## diam <=4.75 >4.75
## <=4 86 32
## >4 35 11
##
## , , species = dist
##
## height
## diam <=4.75 >4.75
## <=4 73 61
## >4 70 41
#12.3 Hierarchical loglinear models
#12.4 Dependence graphs
par(mfrow=c(1,4))
plot( ug(~D:S + H:S ))
plot( ug(~D:S + H:S + D:H ))
plot( ug(~D:H:S ))
plot( ug(~D + H:S ))

par(mfrow=c(1,1))
#12.5 The Global Markov property
plot( ug(~A:B:C+B:C:D+D:E+E:F ))

#12.7 Fitting loglinear models
#Iterative proportional scaling is implemented in loglin():
ll1 <- loglin(lizard, list(c("species","diam"),
c("species","height")))
## 2 iterations: deviation 0
#2 iterations: deviation 0
str( ll1 )
## List of 4
## $ lrt : num 2.03
## $ pearson: num 2.02
## $ df : num 2
## $ margin :List of 2
## ..$ : chr [1:2] "species" "diam"
## ..$ : chr [1:2] "species" "height"
#A formula based interface to loglin() is provided by loglm():
library(MASS)
ll2 <- loglm(~species:diam + species:height, data=lizard); ll2
## Call:
## loglm(formula = ~species:diam + species:height, data = lizard)
##
## Statistics:
## X^2 df P(> X^2)
## Likelihood Ratio 2.025647 2 0.3631921
## Pearson 2.017364 2 0.3646994
coef( ll2 )
## $`(Intercept)`
## [1] 3.787314
##
## $diam
## <=4 >4
## 0.2825882 -0.2825882
##
## $height
## <=4.75 >4.75
## 0.3431156 -0.3431156
##
## $species
## anoli dist
## -0.3090218 0.3090218
##
## $diam.species
## species
## diam anoli dist
## <=4 0.1884334 -0.1884334
## >4 -0.1884334 0.1884334
##
## $height.species
## species
## height anoli dist
## <=4.75 0.1741797 -0.1741797
## >4.75 -0.1741797 0.1741797
#The dmod() function also provides an interface to loglin(), and
#dmod() oers much more; see later.
library(gRim)
ll3 <- dmod(~species:diam + species:height, data=lizard); ll3
## Model: A dModel with 3 variables
## graphical : TRUE decomposable : TRUE
## -2logL : 1604.43 mdim : 5 aic : 1614.43
## ideviance : 23.01 idf : 2 bic : 1634.49
## deviance : 2.03 df : 2
par(mfrow=c(1,4))
plot( ug(~D:S + H:S )) ## graphical
plot( ug(~D:S + H:S + D:H )) ## not graphical
plot( ug(~D:H:S )) ## graphical
plot( ug(~D + H:S )) ## graphical

par(mfrow=c(1,1))
par(mfrow=c(1,3))
plot(ug(~A:B:C + B:C:D)) ## graphical, decomposable
plot(ug(~A:B + A:C + B:C:D)) ## not graphical, not decomposable
plot(ug(~A:B + A:C + B:D + C:D)) ## graphical, not decomposable

par(mfrow=c(1,1))
#12.9 ML estimation in decomposable models
n.ds <- tableMargin(lizard, ~diam+species); n.ds
## species
## diam anoli dist
## <=4 118 134
## >4 46 111
n.ds <- tableMargin(lizard, c("diam", "species"))
n.hs <- tableMargin(lizard, c("height", "species"))
n.s <- tableMargin(lizard, c("species"))
ec <- tableDiv( tableMult(n.ds, n.hs), n.s) ## expected counts
ftable( ec )
## diam <=4 >4
## species height
## anoli <=4.75 87.06098 33.93902
## >4.75 30.93902 12.06098
## dist <=4.75 78.21224 64.78776
## >4.75 55.78776 46.21224
ftable( fitted(ll2) )
## Re-fitting to get fitted values
## species anoli dist
## diam height
## <=4 <=4.75 87.06098 78.21224
## >4.75 30.93902 55.78776
## >4 <=4.75 33.93902 64.78776
## >4.75 12.06098 46.21224
#13 Decomposable models and Bayesian networks
library(gRim)
args(ciTest_table)
## function (x, set = NULL, statistic = "dev", method = "chisq",
## adjust.df = TRUE, slice.info = TRUE, L = 20, B = 200, ...)
## NULL
ciTest(lizard, set=c("diam","height","species"))
## Testing diam _|_ height | species
## Statistic (DEV): 2.026 df: 2 p-value: 0.3632 method: CHISQ
cit <- ciTest(lizard, set=~diam+height+species, slice.info=T)
cit
## Testing diam _|_ height | species
## Statistic (DEV): 2.026 df: 2 p-value: 0.3632 method: CHISQ
names(cit)
## [1] "statistic" "p.value" "df" "statname" "method" "adjust.df"
## [7] "varNames" "slice"
cit$slice
## statistic p.value df species
## 1 0.1779795 0.6731154 1 anoli
## 2 1.8476671 0.1740550 1 dist
#14.2 Example: University admissions
#Example: Admission to graduate school at UC at Berkley in 1973
#for the six largest departments classied by sex and gender.
ftable(UCBAdmissions)
## Dept A B C D E F
## Admit Gender
## Admitted Male 512 353 120 138 53 22
## Female 89 17 202 131 94 24
## Rejected Male 313 207 205 279 138 351
## Female 19 8 391 244 299 317
#Is there evidence of sexual discrimination?
ag <- tableMargin(UCBAdmissions, ~Admit+Gender); ag
## Gender
## Admit Male Female
## Admitted 1198 557
## Rejected 1493 1278
as.parray( ag, normalize="first" )
## Gender
## Admit Male Female
## Admitted 0.4451877 0.3035422
## Rejected 0.5548123 0.6964578
s<-ciTest(UCBAdmissions, ~Admit+Gender+Dept, slice.info=T); s
## Testing Admit _|_ Gender | Dept
## Statistic (DEV): 21.736 df: 6 p-value: 0.0014 method: CHISQ
#Hence, admit and gender are not independent within each Dept.
#However, most contribution to the deviance comes from
#department A:
s$slice
## statistic p.value df Dept
## 1 19.0540099 1.270705e-05 1 A
## 2 0.2586429 6.110540e-01 1 B
## 3 0.7509844 3.861648e-01 1 C
## 4 0.2978665 5.852230e-01 1 D
## 5 0.9903864 3.196480e-01 1 E
## 6 0.3836167 5.356740e-01 1 F
#So what happens in department A?
x <- tableSlice(UCBAdmissions, margin="Dept", level="A"); x
## Gender
## Admit Male Female
## Admitted 512 89
## Rejected 313 19
as.parray(x, normalize="first")
## Gender
## Admit Male Female
## Admitted 0.6206061 0.8240741
## Rejected 0.3793939 0.1759259
#Why were we mislead at the beginning?
x <- tableMargin(UCBAdmissions, ~Admit+Dept);
x
## Dept
## Admit A B C D E F
## Admitted 601 370 322 269 147 46
## Rejected 332 215 596 523 437 668
as.parray(x, norm="first")
## Dept
## Admit A B C D E F
## Admitted 0.6441586 0.6324786 0.3507625 0.3396465 0.2517123 0.06442577
## Rejected 0.3558414 0.3675214 0.6492375 0.6603535 0.7482877 0.93557423
#15 Loglinear models the gRim package
#Coronary artery disease data:
data(cad1, package="gRbase")
head(cad1)
## Sex AngPec AMI QWave QWavecode STcode STchange SuffHeartF
## 1 Male None NotCertain No Usable Usable No No
## 2 Male Atypical NotCertain No Usable Usable No No
## 3 Female None Definite No Usable Usable No No
## 4 Male None NotCertain No Usable Nonusable No No
## 5 Male None NotCertain No Usable Nonusable No No
## 6 Male None NotCertain No Usable Nonusable No No
## Hypertrophi Hyperchol Smoker Inherit Heartfail CAD
## 1 No No No No No No
## 2 No No No No No No
## 3 No No No No No No
## 4 No No No No No No
## 5 No No No No No No
## 6 No No No No No No
use <- c(1,2,3,9:14)
cad1 <- cad1[,use]
head( cad1, 4 )
## Sex AngPec AMI Hypertrophi Hyperchol Smoker Inherit
## 1 Male None NotCertain No No No No
## 2 Male Atypical NotCertain No No No No
## 3 Female None Definite No No No No
## 4 Male None NotCertain No No No No
## Heartfail CAD
## 1 No No
## 2 No No
## 3 No No
## 4 No No
#Some (random) model:
m1 <- dmod(~Sex:Smoker:CAD + CAD:Hyperchol:AMI, data=cad1); m1
## Model: A dModel with 5 variables
## graphical : TRUE decomposable : TRUE
## -2logL : 1293.88 mdim : 13 aic : 1319.88
## ideviance : 112.54 idf : 8 bic : 1364.91
## deviance : 16.38 df : 18
par(mfrow=c(1,1))
plot( m1 )

str( terms( m1 ) )
## List of 2
## $ : chr [1:3] "Sex" "Smoker" "CAD"
## $ : chr [1:3] "CAD" "Hyperchol" "AMI"
formula( m1 )
## ~Sex * Smoker * CAD + CAD * Hyperchol * AMI
# Default: a graphNEL object
DG <- ugList( terms( m1 ) ); DG
## A graphNEL graph with undirected edges
## Number of Nodes = 5
## Number of Edges = 6
# Alternative: an adjacency matrix
a <- ugList( terms( m1 ), result="matrix" ); a
## Sex Smoker CAD Hyperchol AMI
## Sex 0 1 1 0 0
## Smoker 1 0 1 0 0
## CAD 1 1 0 1 1
## Hyperchol 0 0 1 0 1
## AMI 0 0 1 1 0
A <- ugList( terms( m1 ), result="dgCMatrix" )
#15.1 Model specication shortcuts
#Shortcuts for specifying some models
mar <- c("Sex","AngPec","AMI","CAD")
str(terms(dmod(~.^., data=cad1, margin=mar))) ## Saturated model
## List of 1
## $ : chr [1:4] "Sex" "AngPec" "AMI" "CAD"
str(terms(dmod(~.^1, data=cad1, margin=mar))) ## Independence model
## List of 4
## $ : chr "Sex"
## $ : chr "AngPec"
## $ : chr "AMI"
## $ : chr "CAD"
str(terms(dmod(~.^3, data=cad1, margin=mar))) ## All 3-factor model
## List of 4
## $ : chr [1:3] "Sex" "AngPec" "AMI"
## $ : chr [1:3] "Sex" "AngPec" "CAD"
## $ : chr [1:3] "Sex" "AMI" "CAD"
## $ : chr [1:3] "AngPec" "AMI" "CAD"
#15.2 Altering graphical models
#Natural operations on graphical models: add and delete edges
m1 <- dmod(~Sex:Smoker:CAD + CAD:Hyperchol:AMI, data=cad1); m1
## Model: A dModel with 5 variables
## graphical : TRUE decomposable : TRUE
## -2logL : 1293.88 mdim : 13 aic : 1319.88
## ideviance : 112.54 idf : 8 bic : 1364.91
## deviance : 16.38 df : 18
m2 <- update(m1,items =
list(dedge=~Hyperchol:CAD, # drop edge
aedge=~Smoker:AMI)) # add edge
par(mfrow=c(1,2)); plot( m1 ); plot( m2 )

#15.3 Model comparison
#Models are compared with compareModels().
m1 <- dmod(~Sex:Smoker:CAD + CAD:Hyperchol:AMI, data=cad1); m1
## Model: A dModel with 5 variables
## graphical : TRUE decomposable : TRUE
## -2logL : 1293.88 mdim : 13 aic : 1319.88
## ideviance : 112.54 idf : 8 bic : 1364.91
## deviance : 16.38 df : 18
m3 <- update(m1, items=list(dedge=~Sex:Smoker+Hyperchol:AMI))
compareModels( m1, m3 )
## Large:
## :"Sex" "Smoker" "CAD"
## :"CAD" "Hyperchol" "AMI"
## Small:
## :"Sex" "CAD"
## :"Smoker" "CAD"
## :"CAD" "Hyperchol"
## :"CAD" "AMI"
## -2logL: 8.93 df: 4 AIC(k= 2.0): 0.93 p.value: 0.346446
par(mfrow=c(1,2)); plot( m1 ); plot( m3 )

#15.4 Decomposable models deleting edges
#Result: If A1 is a decompsable model and we remove an edge
#e = fu; vg which is contained in one clique C only, then the new
#model A2 will also be decomposable.
par(mfrow=c(1,3))
plot(ug(~A:B:C+B:C:D))
plot(ug(~A:C+B:C+B:C:D))
plot(ug(~A:B+A:C+B:D+C:D))

par(mfrow=c(1,1))
#More tricky when adding edge to a decomposable model
plot(ug(~A:B+B:C+C:D), "circo")

UG <- ug(~A:B+B:C+C:D)
mcs(UG)
## [1] "A" "B" "C" "D"
UG1 <- addEdge("A","D",UG)
mcs(UG1)
## character(0)
UG2 <- addEdge("A","C",UG)
mcs(UG2)
## [1] "A" "B" "C" "D"
#15.6 Test for adding and deleting edges
#Done with testdelete() and testadd()
m1 <- dmod(~Sex:Smoker:CAD + CAD:Hyperchol:AMI, data=cad1)
plot( m1 )
testdelete( m1, edge=c("Hyperchol", "AMI") )
## dev: 4.981 df: 2 p.value: 0.08288 AIC(k=2.0): 1.0 edge: Hyperchol:AMI
## host: CAD Hyperchol AMI
## Notice: Test performed in saturated marginal model
m1 <- dmod(~Sex:Smoker:CAD + CAD:Hyperchol:AMI, data=cad1)
plot( m1 )

testadd( m1, edge=c("Smoker", "Hyperchol"))
## dev: 1.658 df: 2 p.value: 0.43654 AIC(k=2.0): 2.3 edge: Smoker:Hyperchol
## host: CAD Smoker Hyperchol
## Notice: Test performed in saturated marginal model
args(stepwise.iModel)
## function (object, criterion = "aic", alpha = NULL, type = "decomposable",
## search = "all", steps = 1000, k = 2, direction = "backward",
## fixinMAT = NULL, fixoutMAT = NULL, details = 0, trace = 2,
## ...)
## NULL
msat <- dmod( ~.^., data=cad1 )
mnew1 <- stepwise( msat, details=1, k=2 ) # use aic
## STEPWISE:
## criterion: aic ( k = 2 )
## direction: backward
## type : decomposable
## search : all
## steps : 1000
## . BACKWARD: type=decomposable search=all, criterion=aic(2.00), alpha=0.00
## . Initial model: is graphical=TRUE is decomposable=TRUE
## change.AIC -10.1543 Edge deleted: Sex CAD
## change.AIC -10.8104 Edge deleted: Sex AngPec
## change.AIC -18.3658 Edge deleted: AngPec Smoker
## change.AIC -13.6019 Edge deleted: Hyperchol AngPec
## change.AIC -10.1275 Edge deleted: Sex Heartfail
## change.AIC -10.3829 Edge deleted: Hyperchol Heartfail
## change.AIC -7.1000 Edge deleted: AMI Sex
## change.AIC -9.2019 Edge deleted: Hyperchol Sex
## change.AIC -9.0764 Edge deleted: Inherit Hyperchol
## change.AIC -5.1589 Edge deleted: Heartfail Smoker
## change.AIC -4.6758 Edge deleted: Inherit Heartfail
## change.AIC -1.7378 Edge deleted: Sex Smoker
## change.AIC -6.3261 Edge deleted: Smoker Inherit
## change.AIC -6.2579 Edge deleted: CAD Inherit
plot( mnew1 )

msat <- dmod( ~.^., data=cad1 )
mnew2 <- stepwise( msat, details=1, k=log(nrow(cad1)) ) # use bic
## STEPWISE:
## criterion: aic ( k = 5.46 )
## direction: backward
## type : decomposable
## search : all
## steps : 1000
## . BACKWARD: type=decomposable search=all, criterion=aic(5.46), alpha=0.00
## . Initial model: is graphical=TRUE is decomposable=TRUE
## change.AIC -100.0382 Edge deleted: Sex AngPec
## change.AIC -103.1520 Edge deleted: Hyperchol AngPec
## change.AIC -74.2967 Edge deleted: Smoker AngPec
## change.AIC -67.8590 Edge deleted: Sex Hyperchol
## change.AIC -60.3907 Edge deleted: AngPec Hypertrophi
## change.AIC -51.9489 Edge deleted: Heartfail Hyperchol
## change.AIC -50.8580 Edge deleted: Sex CAD
## change.AIC -43.8873 Edge deleted: AngPec Heartfail
## change.AIC -41.3702 Edge deleted: AMI Sex
## change.AIC -43.6158 Edge deleted: AMI Heartfail
## change.AIC -40.2509 Edge deleted: Hyperchol Inherit
## change.AIC -26.3511 Edge deleted: AngPec AMI
## change.AIC -31.4947 Edge deleted: Inherit AMI
## change.AIC -25.5315 Edge deleted: Heartfail CAD
## change.AIC -31.2732 Edge deleted: Inherit Heartfail
## change.AIC -22.9457 Edge deleted: AMI Hypertrophi
## change.AIC -17.9850 Edge deleted: Smoker AMI
## change.AIC -15.7814 Edge deleted: Sex Heartfail
## change.AIC -15.5931 Edge deleted: Smoker Sex
## change.AIC -18.5186 Edge deleted: Inherit Smoker
## change.AIC -13.8092 Edge deleted: Hyperchol Smoker
## change.AIC -12.4648 Edge deleted: AngPec Inherit
## change.AIC -6.5068 Edge deleted: Smoker Heartfail
## change.AIC -9.2031 Edge deleted: Hypertrophi Smoker
## change.AIC -5.9470 Edge deleted: AMI Hyperchol
## change.AIC -5.0227 Edge deleted: Hypertrophi Hyperchol
## change.AIC -4.0234 Edge deleted: Sex Inherit
## change.AIC -6.8882 Edge deleted: Hypertrophi Inherit
## change.AIC -3.1347 Edge deleted: Hypertrophi Sex
plot( mnew2 )

#16 From graph and data to network
#Create graphs from models:
ug1 <- ugList( terms( mnew1 ) )
ug2 <- ugList( terms( mnew2 ) )
par(mfrow=c(1,2)); plot( ug1 ); plot( ug2 )

#Create Bayesian networks from (graph, data):
bn1 <- compile( grain( ug1, data=cad1, smooth=0.1 )); bn1
## Independence network: Compiled: TRUE Propagated: FALSE
## Nodes: chr [1:9] "Hypertrophi" "AMI" "CAD" "Smoker" "Hyperchol" ...
bn2 <- compile( grain( ug2, data=cad1, smooth=0.1 )); bn2
## Independence network: Compiled: TRUE Propagated: FALSE
## Nodes: chr [1:9] "CAD" "AngPec" "Hypertrophi" "Heartfail" "Smoker" ...
querygrain( bn1, "CAD")
## $CAD
## CAD
## No Yes
## 0.5459866 0.4540134
z<-setEvidence( bn1, nodes=c("AngPec", "Hypertrophi"),
c("Typical","Yes"))
# alternative form
z<-setEvidence( bn1,
nslist=list(AngPec="Typical", Hypertrophi="Yes"))
querygrain( z, "CAD")
#17 Prediction
#Dataset with missing values
data(cad2, package="gRbase")
dim( cad2 )
## [1] 67 14
head( cad2, 4 )
## Sex AngPec AMI QWave QWavecode STcode STchange SuffHeartF
## 1 Male None NotCertain No Usable Usable Yes Yes
## 2 Female None NotCertain No Usable Usable Yes Yes
## 3 Female None NotCertain No Nonusable Nonusable No No
## 4 Male Atypical Definite No Usable Usable No Yes
## Hypertrophi Hyperchol Smoker Inherit Heartfail CAD
## 1 No No <NA> No No No
## 2 No No <NA> No No No
## 3 No Yes <NA> No No No
## 4 No Yes <NA> No No No
args(predict.grain)
## function (object, response, predictors = setdiff(names(newdata),
## response), newdata, type = "class", ...)
## NULL
#function (object, response, predictors = setdiff(names(newdata),
# response), newdata, type = "class", ...)
# NULL
p1 <- predict(bn1, newdata=cad2, response="CAD")
head( p1$pred$CAD )
## [1] "No" "No" "No" "No" "No" "Yes"
z <- data.frame(CAD.obs=cad2$CAD, CAD.pred=p1$pred$CAD)
head( z ) # class assigned by highest probability
## CAD.obs CAD.pred
## 1 No No
## 2 No No
## 3 No No
## 4 No No
## 5 No No
## 6 No Yes
xtabs(~., data=z)
## CAD.pred
## CAD.obs No Yes
## No 32 9
## Yes 9 17
#Can be more informative to look at conditional probabilities:
q1 <- predict(bn1, newdata=cad2, response="CAD",
type="distribution")
head( q1$pred$CAD )
## No Yes
## [1,] 0.9741595 0.02584049
## [2,] 0.9741595 0.02584049
## [3,] 0.8982885 0.10171154
## [4,] 0.5348625 0.46513750
## [5,] 0.7865913 0.21340871
## [6,] 0.4509980 0.54900198
head( p1$pred$CAD )
## [1] "No" "No" "No" "No" "No" "Yes"
head( cad2$CAD)
## [1] No No No No No No
## Levels: No Yes
par(mfrow=c(1,1))
#18 Other packages
require( bnlearn )
## Loading required package: bnlearn
##
## Attaching package: 'bnlearn'
##
## 以下のオブジェクトは 'package:gRbase' からマスクされています:
##
## children, parents
a = bn.fit(hc( cad1 ), cad1)
bn = as.grain(a)
plot(bn)

require( bnlearn )
a = bn.fit(hc(learning.test ),learning.test)
bn = as.grain(a)
plot(bn)

library(bnlearn)
norm <- rnorm(4000)
Data <- matrix(norm, nrow=1000, ncol=4, byrow=T)
colnames(Data) <- c("Height", "BMI", "SBP", "FBS")
Data <- data.frame(Data)
head(Data)
## Height BMI SBP FBS
## 1 0.3152757 -1.6255617 0.2820571 -0.3756693
## 2 0.1862342 1.0407168 -1.6418698 -0.1539259
## 3 -1.1549326 0.6071451 0.3628388 -0.7618045
## 4 -0.2965907 0.1610033 -0.2913580 -1.0498726
## 5 1.4597698 0.3122938 0.1030056 0.8036521
## 6 -0.8637087 -0.6718771 0.1441947 0.1156066
Data2 <- Data
Data2$Height <- 170 + Data$Height*10
Data2$SBP <- 120 + Data$SBP*10
Data2$BMI <- 22 + Data$Height*2 + Data$BMI*2 + Data$SBP*2
Data2$FBS <- 90 + (Data2$BMI-22)*5 + Data$FBS*10
Data2 <- data.frame(Data2)
head(Data2)
## Height BMI SBP FBS
## 1 173.1528 19.94354 122.8206 75.96102
## 2 171.8623 21.17016 103.5813 84.31155
## 3 158.4507 21.63010 123.6284 80.53247
## 4 167.0341 21.14611 117.0864 75.23182
## 5 184.5977 25.75014 121.0301 116.78721
## 6 161.3629 19.21722 121.4419 77.24216
res.Data2 = gs(Data2)
plot(res.Data2)

res.arc2 <- set.arc(res.Data2, "FBS", "BMI") #←HeightをFBSに修正
arc.strength(res.arc2, Data2)
## from to strength
## 1 Height BMI 7.923986e-66
## 2 SBP BMI 8.244726e-63
## 3 FBS BMI 2.701750e-135
#---回帰係数を求める
fitted = bn.fit(res.arc2, Data2)
(Coef <- coefficients(fitted))
## $Height
## (Intercept)
## 170.4514
##
## $BMI
## (Intercept) Height SBP FBS
## -16.86251210 0.10439402 0.10260526 0.09747203
##
## $SBP
## (Intercept)
## 120.4283
##
## $FBS
## (Intercept)
## 91.40537
#---絵に上書き
XMax <- max(axTicks(1))
YMax <- max(axTicks(2))
#---手動で位置を設定する
plot(res.Data2)
#切片
text(XMax*0.05, YMax*0.6, round(Coef$Height, digits=2), adj=0)
text(XMax*0.85, YMax*0.6, round(Coef$SBP, digits=2), adj=0)
text(XMax*0.35, YMax*0.9, round(Coef$FBS, digits=2), adj=0)
text(XMax*0.35, YMax*0.05, round(Coef$BMI[1], digits=2), adj=0)
#回帰係数
text(XMax*0.3, YMax*0.3, round(Coef$BMI[2], digits=3), adj=0)
text(XMax*0.65, YMax*0.3, round(Coef$BMI[3], digits=3), adj=0)
text(XMax*0.45, YMax*0.5, round(Coef$BMI[4], digits=3), adj=0)
