## I now further sharpen the results by estimating the 8 parameters
## of the multinomial (No-Signalling) model by maximum likelihood, and also estimating
## the 7 parameters of the submodel obtained by restriction to Local Realism.
##
## Statistical testing is probably improved by using the Wilks minus two log likelihood
## ratio test (compared to the chi-squared distribution with one degree of freedom), 
## instead of a Wald test based on an estimated parameter and estimated
## asymptotic variance.

## First we simply re-run the generalised least squares program in order to get initial
## estimates of the parameters under the unrestricted model, and we rerun a modification
## thereof to get initial estimates under the restriction that S = 2 (J = 0)

setwd("~/Desktop/Clean up/Clean up/Bell/Experiments/Munich")
source("OptimizedMunich.R")

# Here I express my eight parameters as functions of the 16 true probabilities
# I take as parameters the four marginal probability distributions of each party's
# outcomes given each party's own setting, and the four correlations, one for
# each of the four sub-experiments. More precisely, I parametrize the
# probability distribution of each binary variable with twice the probability of
# a reference outcome, minus one. This parameter lies in the interval [-1, 1].
# The correlations also lie in the interval [-1, 1]
# I ignore the linear constraints on the parameter space [-1, 1]^8, that each of
# the 16 resulting "observable" probabilies must be non-negative.

a1x4 <- c(c(1, 1, -1, -1), zero, zero, zero)
a2x4 <- c(zero, zero, c(1, 1, -1, -1), zero)
b1x4 <- c(c(1, -1, 1, -1), zero, zero, zero)
b2x4 <- c(zero, c(1, -1, 1, -1), zero, zero)
r11x4 <- c(c(1, -1, -1, 1), zero, zero, zero)
r12x4 <- c(zero, c(1, -1, -1, 1), zero, zero)
r21x4 <- c(zero, zero, c(1, -1, -1, 1), zero)
r22x4 <- c(zero, zero, zero, c(1, -1, -1, 1))
Thetax4 <- cbind(a1 = a1x4, a2 = a2x4, b1 = b1x4, b2 = b2x4, r11 = r11x4, r12 = r12x4, r21 = r21x4, r22 = r22x4)

# Here I compute covariance matries for the set of 4 observed deviations from
# the no-signalling equalities, and the set of 8 naive estimates 
# of the 8 parameters which we have just constructed

CovTN <- t(Thetax4) %*% Cov %*% NS
CovNT <- t(CovTN)
VarT <- t(Thetax4) %*% Cov %*% Thetax4

# Here are the generalised least squares (GLS) estimates of the eight parameters
# and the resulting observed value of the Bell-CHSH statistic S

ThetaOpt <- (t(Thetax4) - CovTN %*% InvCovNN %*% t(NS)) %*% rawProbsVec
ThetaOpt
##            [,1]
## a1   0.02970269
## a2   0.20334674
## b1   0.07056341
## b2  -0.14718933
## r11  0.62090278
## r12  0.62220041
## r21  0.65835739
## r22 -0.68080089
t(c(0, 0, 0, 0, 1, 1, 1, -1)) %*% ThetaOpt
##          [,1]
## [1,] 2.582261
VarTopt <- VarT - CovTN %*% InvCovNN %*% CovNT
sqrt(t(c(0, 0, 0, 0, 1, 1, 1, -1)) %*% VarTopt %*% c(0, 0, 0, 0, 1, 1, 1, -1))
##           [,1]
## [1,] 0.2451967
# Now I am going to impose local realism, by imposing the condition that
# Eberhard's J equals zero
# Now we want to estimate 7 parameters under 9 constraints on the
# true conditional probabilities in our 4 2x2 tables

LR <- cbind(NS, J)
CovTL <- t(Thetax4) %*% Cov %*% LR
CovLT <- t(CovTL)
covLL <- t(LR) %*% Cov %*% LR
InvCovLL <- solve(covLL)
ThetaOptL <- (t(Thetax4) - CovTL %*% InvCovLL %*% t(LR)) %*% rawProbsVec
ThetaOptL
##            [,1]
## a1   0.06646120
## a2   0.21859921
## b1   0.06652586
## b2  -0.13497585
## r11  0.45142288
## r12  0.46465061
## r21  0.53044765
## r22 -0.55347886
t(c(0, 0, 0, 0, 1, 1, 1, -1)) %*% ThetaOptL
##      [,1]
## [1,]    2
# Now I want to improve the GLS estimates under the two models
# by computing the MLE under each model
# First the log likelihood function for the unconstrained 
# (i.e. no assumption of LR) model, 8 parameters
#
# But don't forget the data! A vector of 16 raw counts.

dataVec <- as.vector(tables)

negloglik <- function (Theta) {
    A1 <- Theta[1]
    A2 <- Theta[2]
    B1 <- Theta[3]
    B2 <- Theta[4]
    P11 <- Theta[5]
    P12 <- Theta[6]
    P21 <- Theta[7]
    P22 <- Theta[8]
    P11vecx4 <- c(1 + A1 + B1 + P11, 1 + A1 - B1 - P11, 1 - A1 + B1 - P11, 1 - A1 - B1 + P11)
    P12vecx4 <- c(1 + A1 + B2 + P12, 1 + A1 - B2 - P12, 1 - A1 + B2 - P12, 1 - A1 - B2 + P12)
    P21vecx4 <- c(1 + A2 + B1 + P21, 1 + A2 - B1 - P21, 1 - A2 + B1 - P21, 1 - A2 - B1 + P21)
    P22vecx4 <- c(1 + A2 + B2 + P22, 1 + A2 - B2 - P22, 1 - A2 + B2 - P22, 1 - A2 - B2 + P22)
    Pvec <- c(P11vecx4, P12vecx4, P21vecx4, P22vecx4)/4
    - sum(dataVec * log(Pvec))
}

# Next the log likelihood function for the constrained 
# (assume true S = 2) model, 7 parameters
# I simply drop the 4th correlation. 
# It is now a function of the other three.

negloglikL <- function (Theta) {
    A1 <- Theta[1]
    A2 <- Theta[2]
    B1 <- Theta[3]
    B2 <- Theta[4]
    P11 <- Theta[5]
    P12 <- Theta[6]
    P21 <- Theta[7]
    P22 <- P11 + P12 + P21 - 2
    P11vecx4 <- c(1 + A1 + B1 + P11, 1 + A1 - B1 - P11, 1 - A1 + B1 - P11, 1 - A1 - B1 + P11)
    P12vecx4 <- c(1 + A1 + B2 + P12, 1 + A1 - B2 - P12, 1 - A1 + B2 - P12, 1 - A1 - B2 + P12)
    P21vecx4 <- c(1 + A2 + B1 + P21, 1 + A2 - B1 - P21, 1 - A2 + B1 - P21, 1 - A2 - B1 + P21)
    P22vecx4 <- c(1 + A2 + B2 + P22, 1 + A2 - B2 - P22, 1 - A2 + B2 - P22, 1 - A2 - B2 + P22)
    Pvec <- c(P11vecx4, P12vecx4, P21vecx4, P22vecx4)/4
    - sum(dataVec * log(Pvec))
}

# Suppress warnings - "optim" may be going to produce a lot!
# You might want to remove this line and see what they are...
options(warn = -1)

# Minus the log likelihood at the initial (unconstrained, GLS) estimate
negloglik(ThetaOpt)
## [1] 170.5121
# Minus the log likelihood at the (unconstrained, MLE) estimate
optim(ThetaOpt, negloglik)$value
## [1] 170.4891
# Minus the log likelihood at the initial (constrained, GLS) estimate

negloglik(ThetaOptL)
## [1] 172.7129
# Minus the log likelihood at the initial (constrained, GLS) estimate
negloglikL(ThetaOptL[1:7])
## [1] 172.7129
# Minus the log likelihood at the (constrained, MLE) estimate
optim(ThetaOptL[1:7], negloglikL)$value
## [1] 172.5761
# Wilk's test statistic based on comparison of log likelihood at GLS estimates
2 * (- negloglik(ThetaOpt) + negloglik(ThetaOptL))
## [1] 4.401702
# Wilk's test statistic based on comparison of log likelihood at MLE estimates
2 * (- optim(ThetaOpt, negloglik)$value + optim(ThetaOptL[1:7], negloglikL)$value )
## [1] 4.174018
# Wilk's p-value based on comparison of log likelihood at GLS estimates
pchisq(2 * (- negloglik(ThetaOpt) + negloglik(ThetaOptL)), 1, lower.tail = FALSE)
## [1] 0.03590308
# Wilk's p-value based on comparison of log likelihood at MLE estimates
pchisq(2 * (- optim(ThetaOpt, negloglik)$value + optim(ThetaOptL[1:7], negloglikL)$value), 1, lower.tail = FALSE)
## [1] 0.04104834