## I now further sharpen the results by estimating the 8 parameters
## of the multinomial (No-Signalling) model by maximum likelihood, and also estimating
## the 7 parameters of the submodel obtained by restriction to Local Realism.
##
## Statistical testing is probably improved by using the Wilks minus two log likelihood
## ratio test (compared to the chi-squared distribution with one degree of freedom), 
## instead of a Wald test based on an estimated parameter and estimated
## asymptotic variance.

## First we simply re-run the generalised least squares program in order to get initial
## estimates of the parameters under the unrestricted model, and we rerun a modification
## thereof to get initial estimates under the restriction that S = 2 (J = 0)

setwd("~/Desktop/Clean up/Clean up/Bell/Experiments/Delft")
source("OptimizedDelft.R")

# Here I express my eight parameters as functions of the 16 true probabilities
# I take as parameters the four marginal probability distributions of each party's
# outcomes given each party's own setting, and the four correlations, one for
# each of the four sub-experiments. More precisely, I parametrize the
# probability distribution of each binary variable with twice the probability of
# a reference outcome, minus one. This parameter lies in the interval [-1, 1].
# The correlations also lie in the interval [-1, 1]
# I ignore the linear constraints on the parameter space [-1, 1]^8, that each of
# the 16 resulting "observable" probabilies must be non-negative.

a1x4 <- c(c(1, 1, -1, -1), zero, zero, zero)
a2x4 <- c(zero, zero, c(1, 1, -1, -1), zero)
b1x4 <- c(c(1, -1, 1, -1), zero, zero, zero)
b2x4 <- c(zero, c(1, -1, 1, -1), zero, zero)
r11x4 <- c(c(1, -1, -1, 1), zero, zero, zero)
r12x4 <- c(zero, c(1, -1, -1, 1), zero, zero)
r21x4 <- c(zero, zero, c(1, -1, -1, 1), zero)
r22x4 <- c(zero, zero, zero, c(1, -1, -1, 1))
Thetax4 <- cbind(a1 = a1x4, a2 = a2x4, b1 = b1x4, b2 = b2x4, r11 = r11x4, r12 = r12x4, r21 = r21x4, r22 = r22x4)

# Here I compute covariance matries for the set of 4 observed deviations from
# the no-signalling equalities, and the set of 8 naive estimates 
# of the 8 parameters which we have just constructed

CovTN <- t(Thetax4) %*% Cov %*% NS
CovNT <- t(CovTN)
VarT <- t(Thetax4) %*% Cov %*% Thetax4

# Here are the generalised least squares (GLS) estimates of the eight parameters
# and the resulting observed value of the Bell-CHSH statistic S

ThetaOpt <- (t(Thetax4) - CovTN %*% InvCovNN %*% t(NS)) %*% rawProbsVec
ThetaOpt
##            [,1]
## a1   0.02528835
## a2   0.02744135
## b1  -0.01112831
## b2  -0.08409985
## r11  0.74506802
## r12  0.60686507
## r21  0.50005204
## r22 -0.61067320
t(c(0, 0, 0, 0, 1, 1, 1, -1)) %*% ThetaOpt
##          [,1]
## [1,] 2.462658
VarTopt <- VarT - CovTN %*% InvCovNN %*% CovNT
sqrt(t(c(0, 0, 0, 0, 1, 1, 1, -1)) %*% VarTopt %*% c(0, 0, 0, 0, 1, 1, 1, -1))
##           [,1]
## [1,] 0.2018883
# Now I am going to impose local realism, by imposing the condition that
# Eberhard's J equals zero
# Now we want to estimate 7 parameters under 9 constraints on the
# true conditional probabilities in our 4 2x2 tables

LR <- cbind(NS, J)
CovTL <- t(Thetax4) %*% Cov %*% LR
CovLT <- t(CovTL)
covLL <- t(LR) %*% Cov %*% LR
InvCovLL <- solve(covLL)
ThetaOptL <- (t(Thetax4) - CovTL %*% InvCovLL %*% t(LR)) %*% rawProbsVec
ThetaOptL
##            [,1]
## a1   0.02358929
## a2   0.03915829
## b1  -0.01254638
## b2  -0.11419484
## r11  0.64857384
## r12  0.51577710
## r21  0.36392035
## r22 -0.47172870
t(c(0, 0, 0, 0, 1, 1, 1, -1)) %*% ThetaOptL
##      [,1]
## [1,]    2
# Now I want to improve the GLS estimates under the two models
# by computing the MLE under each model
# First the log likelihood function for the unconstrained 
# (i.e. no assumption of LR) model, 8 parameters
#
# But don't forget the data! A vector of 16 raw counts.

dataVec <- as.vector(tables)

negloglik <- function (Theta) {
    A1 <- Theta[1]
    A2 <- Theta[2]
    B1 <- Theta[3]
    B2 <- Theta[4]
    P11 <- Theta[5]
    P12 <- Theta[6]
    P21 <- Theta[7]
    P22 <- Theta[8]
    P11vecx4 <- c(1 + A1 + B1 + P11, 1 + A1 - B1 - P11, 1 - A1 + B1 - P11, 1 - A1 - B1 + P11)
    P12vecx4 <- c(1 + A1 + B2 + P12, 1 + A1 - B2 - P12, 1 - A1 + B2 - P12, 1 - A1 - B2 + P12)
    P21vecx4 <- c(1 + A2 + B1 + P21, 1 + A2 - B1 - P21, 1 - A2 + B1 - P21, 1 - A2 - B1 + P21)
    P22vecx4 <- c(1 + A2 + B2 + P22, 1 + A2 - B2 - P22, 1 - A2 + B2 - P22, 1 - A2 - B2 + P22)
    Pvec <- c(P11vecx4, P12vecx4, P21vecx4, P22vecx4)/4
    - sum(dataVec * log(Pvec))
}

# Next the log likelihood function for the constrained 
# (assume true S = 2) model, 7 parameters
# I simply drop the 4th correlation. 
# It is now a function of the other three.

negloglikL <- function (Theta) {
    A1 <- Theta[1]
    A2 <- Theta[2]
    B1 <- Theta[3]
    B2 <- Theta[4]
    P11 <- Theta[5]
    P12 <- Theta[6]
    P21 <- Theta[7]
    P22 <- P11 + P12 + P21 - 2
    P11vecx4 <- c(1 + A1 + B1 + P11, 1 + A1 - B1 - P11, 1 - A1 + B1 - P11, 1 - A1 - B1 + P11)
    P12vecx4 <- c(1 + A1 + B2 + P12, 1 + A1 - B2 - P12, 1 - A1 + B2 - P12, 1 - A1 - B2 + P12)
    P21vecx4 <- c(1 + A2 + B1 + P21, 1 + A2 - B1 - P21, 1 - A2 + B1 - P21, 1 - A2 - B1 + P21)
    P22vecx4 <- c(1 + A2 + B2 + P22, 1 + A2 - B2 - P22, 1 - A2 + B2 - P22, 1 - A2 - B2 + P22)
    Pvec <- c(P11vecx4, P12vecx4, P21vecx4, P22vecx4)/4
    - sum(dataVec * log(Pvec))
}

# Suppress warnings - "optim" may be going to produce a lot!
# You might want to remove this line and see what they are...
options(warn = -1)

# Minus the log likelihood at the initial (unconstrained, GLS) estimate
negloglik(ThetaOpt)
## [1] 290.0286
# Minus the log likelihood at the (unconstrained, MLE) estimate
optim(ThetaOpt, negloglik)$value
## [1] 290.004
# Minus the log likelihood at the initial (constrained, GLS) estimate

negloglik(ThetaOptL)
## [1] 292.1388
# Minus the log likelihood at the initial (constrained, GLS) estimate
negloglikL(ThetaOptL[1:7])
## [1] 292.1388
# Minus the log likelihood at the (constrained, MLE) estimate
optim(ThetaOptL[1:7], negloglikL)$value
## [1] 291.9759
# Wilk's test statistic based on comparison of log likelihood at GLS estimates
2 * (- negloglik(ThetaOpt) + negloglik(ThetaOptL))
## [1] 4.220286
# Wilk's test statistic based on comparison of log likelihood at MLE estimates
2 * (- optim(ThetaOpt, negloglik)$value + optim(ThetaOptL[1:7], negloglikL)$value )
## [1] 3.943899
# Wilk's p-value based on comparison of log likelihood at GLS estimates
pchisq(2 * (- negloglik(ThetaOpt) + negloglik(ThetaOptL)), 1, lower.tail = FALSE)
## [1] 0.03994343
# Wilk's p-value based on comparison of log likelihood at MLE estimates
pchisq(2 * (- optim(ThetaOpt, negloglik)$value + optim(ThetaOptL[1:7], negloglikL)$value), 1, lower.tail = FALSE)
## [1] 0.04704162