## I now further sharpen the results by estimating the 8 parameters
## of the multinomial (No-Signalling) model by maximum likelihood, and also estimating
## the 7 parameters of the submodel obtained by restriction to Local Realism.
##
## Statistical testing is probably improved by using the Wilks likelihood ratio test
## (minus twice the log likelihood ratio, compared to the chi-squared distribution
## with one degree of freedom), instead of a Wald test based on an estimated
## parameter and its estimated asymptotic variance.

## First we simply re-run the generalised least squares program in order to get initial
## estimates of the parameters under the unrestricted model, and we re-run a modification
## of it to get initial estimates under the restriction that S = 2 (J = 0).

setwd("~/Desktop/Optimal Bell/Experiments/Weihs")
source("OptimizedWeihs.R")

# Here I express my eight parameters as functions of the 16 true probabilities
# I take as parameters the four marginal probability distributions of each party's
# outcomes given each party's own setting, and the four correlations, one for
# each of the four sub-experiments. More precisely, I parametrize the
# probability distribution of each binary variable with twice the probability of
# a reference outcome, minus one. This parameter lies in the interval [-1, 1].
# The correlations also lie in the interval [-1, 1]
# I ignore the linear constraints on the parameter space [-1, 1]^8 which ensure that
# each of the 16 resulting "observable" probabilities is non-negative.

a1x4 <- c(c(1, 1, -1, -1), zero, zero, zero)
a2x4 <- c(zero, zero, c(1, 1, -1, -1), zero)
b1x4 <- c(c(1, -1, 1, -1), zero, zero, zero)
b2x4 <- c(zero, c(1, -1, 1, -1), zero, zero)
r11x4 <- c(c(1, -1, -1, 1), zero, zero, zero)
r12x4 <- c(zero, c(1, -1, -1, 1), zero, zero)
r21x4 <- c(zero, zero, c(1, -1, -1, 1), zero)
r22x4 <- c(zero, zero, zero, c(1, -1, -1, 1))
Thetax4 <- cbind(a1 = a1x4, a2 = a2x4, b1 = b1x4, b2 = b2x4, r11 = r11x4, r12 = r12x4, r21 = r21x4, r22 = r22x4)
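
# As a quick illustration (assuming rawProbsVec, the vector of 16 observed relative
# frequencies from OptimizedWeihs.R), the naive, unadjusted estimates of the eight
# parameters are just linear combinations of those frequencies:
ThetaNaive <- t(Thetax4) %*% rawProbsVec  # naive estimates, before any GLS adjustment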

# Here I compute covariance matrices for the set of 4 observed deviations from
# the no-signalling equalities, and for the set of 8 naive estimates
# of the 8 parameters which we have just constructed

CovTN <- t(Thetax4) %*% Cov %*% NS
CovNT <- t(CovTN)
VarT <- t(Thetax4) %*% Cov %*% Thetax4

# Here are the generalised least squares (GLS) estimates of the eight parameters
# and the resulting observed value of the Bell-CHSH statistic S
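# (The naive estimates are adjusted by regressing out the observed deviations from
# the no-signalling equalities, which have expectation zero under the model.)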

ThetaOpt <- (t(Thetax4) - CovTN %*% InvCovNN %*% t(NS)) %*% rawProbsVec
ThetaOpt
##             [,1]
## a1  -0.001196617
## a2   0.004059086
## b1  -0.005757908
## b2  -0.102855310
## r11  0.615390173
## r12  0.701027951
## r21  0.696273486
## r22 -0.698305017
t(c(0, 0, 0, 0, 1, 1, 1, -1)) %*% ThetaOpt
##          [,1]
## [1,] 2.710997
VarTopt <- VarT - CovTN %*% InvCovNN %*% CovNT
sqrt(t(c(0, 0, 0, 0, 1, 1, 1, -1)) %*% VarTopt %*% c(0, 0, 0, 0, 1, 1, 1, -1))
##           [,1]
## [1,] 0.0243444
# Now I impose local realism by adding the condition that Eberhard's J equals zero.
# We then estimate 7 parameters: the 16 true conditional probabilities in our four
# 2x2 tables satisfy 9 constraints (4 normalisations, the 4 no-signalling equalities,
# and J = 0), leaving 16 - 9 = 7 free parameters.

LR <- cbind(NS, J)
CovTL <- t(Thetax4) %*% Cov %*% LR
CovLT <- t(CovTL)
CovLL <- t(LR) %*% Cov %*% LR
InvCovLL <- solve(CovLL)
ThetaOptL <- (t(Thetax4) - CovTL %*% InvCovLL %*% t(LR)) %*% rawProbsVec
ThetaOptL
##             [,1]
## a1   0.018745752
## a2  -0.009885852
## b1  -0.006459564
## b2  -0.128770328
## r11  0.430252319
## r12  0.497975508
## r21  0.554771339
## r22 -0.517000833
t(c(0, 0, 0, 0, 1, 1, 1, -1)) %*% ThetaOptL
##      [,1]
## [1,]    2
# Now I want to improve the GLS estimates under the two models
# by computing the MLE under each model
# First the log likelihood function for the unconstrained 
# (i.e. no assumption of LR) model, 8 parameters
#
# But don't forget the data! A vector of 16 raw counts.

dataVec <- as.vector(tables)

negloglik <- function (Theta) {
    A1 <- Theta[1]
    A2 <- Theta[2]
    B1 <- Theta[3]
    B2 <- Theta[4]
    P11 <- Theta[5]
    P12 <- Theta[6]
    P21 <- Theta[7]
    P22 <- Theta[8]
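    # Cell probabilities for each setting pair: (1 +/- A +/- B +/- correlation)/4,
    # with cells ordered (+,+), (+,-), (-,+), (-,-)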
    P11vecx4 <- c(1 + A1 + B1 + P11, 1 + A1 - B1 - P11, 1 - A1 + B1 - P11, 1 - A1 - B1 + P11)
    P12vecx4 <- c(1 + A1 + B2 + P12, 1 + A1 - B2 - P12, 1 - A1 + B2 - P12, 1 - A1 - B2 + P12)
    P21vecx4 <- c(1 + A2 + B1 + P21, 1 + A2 - B1 - P21, 1 - A2 + B1 - P21, 1 - A2 - B1 + P21)
    P22vecx4 <- c(1 + A2 + B2 + P22, 1 + A2 - B2 - P22, 1 - A2 + B2 - P22, 1 - A2 - B2 + P22)
    Pvec <- c(P11vecx4, P12vecx4, P21vecx4, P22vecx4)/4
    - sum(dataVec * log(Pvec))
}

# Next the log likelihood function for the constrained 
# (assume true S = 2) model, 7 parameters
# I simply drop the 4th correlation. 
# It is now a function of the other three.

negloglikL <- function (Theta) {
    A1 <- Theta[1]
    A2 <- Theta[2]
    B1 <- Theta[3]
    B2 <- Theta[4]
    P11 <- Theta[5]
    P12 <- Theta[6]
    P21 <- Theta[7]
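    # Impose the local realism boundary S = P11 + P12 + P21 - P22 = 2,
    # so the fourth correlation is determined by the other three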
    P22 <- P11 + P12 + P21 - 2
    P11vecx4 <- c(1 + A1 + B1 + P11, 1 + A1 - B1 - P11, 1 - A1 + B1 - P11, 1 - A1 - B1 + P11)
    P12vecx4 <- c(1 + A1 + B2 + P12, 1 + A1 - B2 - P12, 1 - A1 + B2 - P12, 1 - A1 - B2 + P12)
    P21vecx4 <- c(1 + A2 + B1 + P21, 1 + A2 - B1 - P21, 1 - A2 + B1 - P21, 1 - A2 - B1 + P21)
    P22vecx4 <- c(1 + A2 + B2 + P22, 1 + A2 - B2 - P22, 1 - A2 + B2 - P22, 1 - A2 - B2 + P22)
    Pvec <- c(P11vecx4, P12vecx4, P21vecx4, P22vecx4)/4
    - sum(dataVec * log(Pvec))
}

# Suppress warnings - during the search "optim" may evaluate the likelihood at
# parameter values giving negative cell probabilities, so log produces NaNs and
# many warnings. You might want to remove this line and see what they are...
options(warn = -1)

# Minus the log likelihood at the initial (unconstrained, GLS) estimate
negloglik(ThetaOpt)
## [1] 16439.85
# Minus the log likelihood at the (unconstrained, MLE) estimate
optim(ThetaOpt, negloglik)$value
## [1] 16439.84
# Minus the log likelihood at the initial (constrained, GLS) estimate,
# evaluated with the unconstrained 8-parameter function

negloglik(ThetaOptL)
## [1] 16771.47
# The same value via the 7-parameter constrained function (a consistency check)
negloglikL(ThetaOptL[1:7])
## [1] 16771.47
# Minus the log likelihood at the (constrained, MLE) estimate
optim(ThetaOptL[1:7], negloglikL)$value
## [1] 16767.07
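# To avoid re-running the optimisation several times below, one could store the two
# fits once and also check convergence (optim's $convergence is 0 on success);
# a minimal sketch:
fitNS <- optim(ThetaOpt, negloglik)          # unconstrained (no-signalling) MLE
fitLR <- optim(ThetaOptL[1:7], negloglikL)   # constrained (local realism) MLE
c(fitNS$convergence, fitLR$convergence)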
# Wilks' test statistic based on comparison of log likelihoods at the GLS estimates
2 * (- negloglik(ThetaOpt) + negloglik(ThetaOptL))
## [1] 663.2217
# Wilks' test statistic based on comparison of log likelihoods at the MLEs
2 * (- optim(ThetaOpt, negloglik)$value + optim(ThetaOptL[1:7], negloglikL)$value )
## [1] 654.4548
# Wilks' p-value based on comparison of log likelihoods at the GLS estimates
pchisq(2 * (- negloglik(ThetaOpt) + negloglik(ThetaOptL)), 1, lower.tail = FALSE)
## [1] 2.97649e-146
# Wilks' p-value based on comparison of log likelihoods at the MLEs
pchisq(2 * (- optim(ThetaOpt, negloglik)$value + optim(ThetaOptL[1:7], negloglikL)$value), 1, lower.tail = FALSE)
## [1] 2.400423e-144
W <- 2 * (- optim(ThetaOpt, negloglik)$value + optim(ThetaOptL[1:7], negloglikL)$value); W
## [1] 654.4548
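# Convert the Wilks statistic to an equivalent z-value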
zW <- sqrt(W); zW
## [1] 25.58231
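# Chebyshev's inequality gives a conservative bound on the p-value: P(|Z| >= zW) <= 1/zW^2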
1/zW^2
## [1] 0.001527989
## The Wilks p-values need to be halved, since the null hypothesis MLE is on the
## boundary of the LR polytope while the alternative hypothesis MLE is far outside
## of it: on the boundary, the null distribution of the statistic is a 50:50 mixture
## of a point mass at zero and chi-squared with one degree of freedom.
## We end up with an effective z-value of 25.6. The Wald z-value based on the GLS
## estimate of CHSH was too optimistic.
##
## A sensible, conservative p-value based on Chebyshev's inequality is 1/zW^2,
## about 1.5 per mille.