Time <- c(0, 8, 28, 41, 63, 79, 97, 117, 135, 154)
Beetles <- c(2, 47, 192, 256, 768, 896, 1120, 896, 1184, 1024)
func <- function(t) {
#fitted logistic curve with the nls estimates of k and r plugged in
k = 1033.5148145
r = 0.1179587
num = 2*k
denom = 2 + (k-2)*exp(-r*t)
return(num/denom)
}
model1 <- function(k, r, t) {
#model function f(t) = 2k/(2 + (k-2)exp(-r*t)) with an analytic gradient attribute for nls
num = 2*k
denom = 2 + (k-2)*exp(-r*t)
f = num/denom
#partials of f with respect to k and r, written with the common denominator
#k + 2*exp(r*t) - 2 (numerator and denominator of f multiplied by exp(r*t))
deriv.denom = k + 2*exp(r*t) - 2
dk = 4*exp(r*t)*(exp(r*t) - 1)/(deriv.denom)^2
dr = 2*(k-2)*k*t*exp(r*t)/(deriv.denom)^2
grad = cbind(dk, dr)
attr(f, 'gradient') <- grad
f
}
result = nls(Beetles ~ model1(k, r, Time), start=list(k=1200, r=0.20), trace = TRUE,
control = nls.control(maxiter = 50, tol = 1e-5, minFactor = 1/1024,
printEval=TRUE))
## 1040978 : 1200.0 0.2
## 154182.1 : 953.3604719 0.1463231
## 91604.31 : 1002.3053517 0.1264523
## 83852.16 : 1025.5543132 0.1201219
## 83281.01 : 1031.8357460 0.1185218
## 83243.3 : 1033.1060566 0.1181069
## 83240.68 : 1033.4094887 0.1179977
## 83240.5 : 1033.4875273 0.1179689
## 83240.49 : 1033.5079944 0.1179613
## 83240.49 : 1033.5133901 0.1179593
## 83240.49 : 1033.5148145 0.1179587
plot(Time, Beetles)
t = seq(0,max(Time), by = 1)
lines(t,func(t), col='blue')
From the nls trace output, we can see that the final parameter estimates are:
\[ \hat{k} = 1033.5148145, \qquad \hat{r} = 0.1179587 \]
In the context of this problem, the graph shows that the beetle population levels off at an equilibrium of roughly 1,000 beetles, with the observed counts oscillating above and below that level.
Connecting this to the estimate \(\hat{k} = 1033.5148145\), we can infer that \(k\) controls the location of the model's upper horizontal asymptote.
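Indeed, letting \(t \to \infty\) in the fitted model makes the role of \(k\) explicit:
\[ \lim_{t\to\infty} \frac{2k}{2 + (k-2)e^{-rt}} = \frac{2k}{2 + 0} = k, \]
so the upper horizontal asymptote of the fitted curve sits exactly at \(\hat{k} \approx 1033.5\), consistent with the plateau of roughly 1,000 beetles seen in the data.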
The log-likelihood function is the following:
\[ \ell(\beta) = \sum_{i=1}^n \left[y_i\log\big(\pi_i(\beta)\big) + (1-y_i)\log\big(1-\pi_i(\beta)\big)\right], \qquad \text{where } \pi_i(\beta) = \phi(x_i^{T}\beta) \text{ under the probit model.} \]
The gradient \(\nabla\ell(\beta) = (\frac{\partial\ell}{\partial\beta_0},\frac{\partial\ell}{\partial\beta_1}, \frac{\partial\ell}{\partial\beta_2}, \frac{\partial\ell}{\partial\beta_3})^{T}\) can be seen below. We first define some notation to make our work more legible.
\(f(x)\) is the standard normal density. \[ f(x) = \frac{1}{\sqrt{2\pi}}e^{\frac{-x^2}{2}} \]
\(\phi(x)\) is the standard normal cumulative distribution function (as defined in the problem statement)
Then, for \(j = 0, 1, 2, 3\), using \(\frac{\partial}{\partial\beta_j}\phi(x_i^{T}\beta) = f(x_i^{T}\beta)\,x_{ij}\), the \(j\)th partial derivative is
\[ \frac{\partial \ell}{\partial \beta_j} = \sum_{i=1}^n x_{ij}\left[y_i \frac{f(x_i^{T}\beta)}{\phi(x_i^{T}\beta)} - (1-y_i)\frac{f(x_i^{T}\beta)}{1-\phi(x_i^{T}\beta)}\right] \]
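As a cross-check on this formula, the entire gradient can be computed in one vectorized expression. The sketch below is only illustrative (the name probit_grad is ours, not part of the assignment); with y, X, and beta defined as in the code that follows, it should agree with the element-wise sums in the like.grad function further below.
#vectorized probit log-likelihood gradient: t(X) %*% (per-observation score)
probit_grad <- function(y, X, beta) {
xb = X %*% beta
dens = dnorm(xb) #standard normal density f(x_i^T beta)
cdf = pnorm(xb) #standard normal CDF
score = y*(dens/cdf) - (1 - y)*(dens/(1 - cdf))
t(X) %*% score #4 x 1 gradient vector
}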
data = read.csv('prob2_data.csv')
data = as.matrix(data)
n = nrow(data)
y = as.matrix(data[,1]) #admit column
X = cbind(matrix(1, n, ncol=1), data[,2], data[,3], data[,4]) #intercept, gre, gpa, rank columns
#calculates function approximation, Jacobian, and Weight matrix
fJW <- function(beta, X) {
xb = X %*% beta
mu = pnorm(xb, mean=0, sd=1)
sigma = mu*(1-mu)
#Jacobian
dens.norm = dnorm(xb, mean=0, sd=1)
col1 = dens.norm * X[,1]
col2 = dens.norm * X[,2]
col3 = dens.norm * X[,3]
col4 = dens.norm * X[,4]
J = cbind(col1, col2, col3, col4)
#Weight matrix
W = diag(1/as.vector(sigma))
list(f = mu, J = J, W = W)
}
like.grad <- function(y, X, beta) {
xb = X %*% beta
cdf = pnorm(xb, mean=0, sd=1)
dens.norm = dnorm(xb, mean=0, sd=1)
db0 = sum(y*(dens.norm/cdf) + (1-y)*(-1*dens.norm/(1-cdf)))
db1 = sum(X[,2]*(y*(dens.norm/cdf)+ (1-y)*(-1*dens.norm/(1-cdf))))
db2 = sum(X[,3]*(y*(dens.norm/cdf)+ (1-y)*(-1*dens.norm/(1-cdf))))
db3 = sum(X[,4]*(y*(dens.norm/cdf)+ (1-y)*(-1*dens.norm/(1-cdf))))
gradient = matrix(c(db0, db1, db2, db3), ncol=1)
return(gradient)
}
#Gauss-Newton algorithm
GN = function(y, X, beta, maxit, tolerr, tolgrad) {
for(it in 1:maxit) {
a = fJW(beta, X)
f = a$f
J = a$J
Wt = a$W
JW = t(J) %*% Wt
#calculate grad.like (gradient of log likelihood)
grad.like = like.grad(y, X, beta)
norm.grad = norm(grad.like, type='2')
#print(sprintf("it = %3.0f b0 = %6.6f b1 = %6.6f b2 = %6.6f b3=%6.6f grad=%2.2e",
#it, beta[1, ], beta[2, ], beta[3, ], beta[4,],norm(JW %*% (y - f))))
print(sprintf("it = %3.0f b0 = %6.6f b1 = %6.6f b2 = %6.6f b3=%6.6f grad.like=%2.2e",
it, beta[1, ], beta[2, ], beta[3, ], beta[4,],norm.grad))
dir = solve(JW %*% J) %*% JW %*% (y - f)
beta.new = beta + dir
c1 = pmax(abs(beta.new), 1) #modified relative error: scale each component by max(|beta_j|, 1)
c2 = abs(beta.new - beta)
mre = max(c2/c1)
if (mre < tolerr) {
print("We reached convergence with modified relative error < tolerr")
break
}
if (norm.grad < tolgrad) {
print("We reached convergence with gradient < tolgrad")
break
}
# update variable for next iteration
beta = beta.new
}
return(beta.new)
}
beta.init = as.matrix(c(0, 0, 0, 0))
beta.mle = GN(y, X, beta = beta.init, maxit=1000, tolerr=10^-6, tolgrad=10^-9)
## [1] "it = 1 b0 = 0.000000 b1 = 0.000000 b2 = 0.000000 b3=0.000000 grad.like=6.21e+04"
## [1] "it = 2 b0 = -1.710555 b1 = 0.001109 b2 = 0.378602 b3=-0.274481 grad.like=5.58e+03"
## [1] "it = 3 b0 = -2.060455 b1 = 0.001371 b2 = 0.458720 b3=-0.328551 grad.like=2.94e+02"
## [1] "it = 4 b0 = -2.090900 b1 = 0.001398 b2 = 0.464321 b3=-0.331741 grad.like=3.25e-01"
## [1] "it = 5 b0 = -2.091497 b1 = 0.001398 b2 = 0.464358 b3=-0.331711 grad.like=5.56e-02"
## [1] "it = 6 b0 = -2.091504 b1 = 0.001398 b2 = 0.464360 b3=-0.331712 grad.like=2.08e-04"
## [1] "We reached convergence with modified relative error < tolerr"
We can see the MLE of \(\beta\) in the R console printout below.
beta.mle
## [,1]
## [1,] -2.091503917
## [2,] 0.001398222
## [3,] 0.464359847
## [4,] -0.331711692
probability <- function(X, beta) {
xb = X %*% beta
return(pnorm(xb, mean=0, sd=1))
}
gre <- seq(220, 880, by=1)
X.rank1 = cbind(matrix(1, length(gre), ncol=1), gre, rep(3.8, length(gre)), rep(1, length(gre)))
X.rank2 = cbind(matrix(1, length(gre), ncol=1), gre, rep(3.8, length(gre)), rep(2, length(gre)))
X.rank3 = cbind(matrix(1, length(gre), ncol=1), gre, rep(3.8, length(gre)), rep(3, length(gre)))
X.rank4 = cbind(matrix(1, length(gre), ncol=1), gre, rep(3.8, length(gre)), rep(4, length(gre)))
plot(1, type='n', xlab='GRE scores 220-880', xlim=c(220, 880), ylim=c(0,1), ylab='Probability of admit')
lines(gre, probability(X.rank1, beta.mle), col='blue')
lines(gre, probability(X.rank2, beta.mle), col='green')
lines(gre, probability(X.rank3, beta.mle), col='red')
lines(gre, probability(X.rank4, beta.mle), col='black')
legend('topleft', legend=c("Rank1", "Rank2", "Rank3", "Rank4"), text.col=c('blue', 'green', 'red', 'black'))
As GRE score increases, the predicted probability of admission increases. Holding GRE and GPA fixed, applicants from the most prestigious institutions (rank 1) have a higher predicted probability of admission than those from lower-ranked institutions (rank 4), consistent with the negative sign of \(\hat{\beta}_3\).
X.partE = matrix(c(1, 700, 3.5, 4), nrow=1)
this.student.prob = probability(X.partE, beta.mle)
this.student.prob
## [,1]
## [1,] 0.2077263
As we can see from the R console printout above, a student with a GRE score of 700, a GPA of 3.5, and an undergraduate institution of rank 4 has an estimated probability of 0.2077263 of being admitted.
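This matches a direct hand computation with the fitted coefficients:
\[ \hat{p} = \phi(x^{T}\hat{\beta}) = \phi\big(-2.0915 + 0.0013982(700) + 0.46436(3.5) - 0.33171(4)\big) = \phi(-0.8143) \approx 0.2077 \]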
The likelihood function is the following:
\[ L(\theta) = \prod_{i=1}^n \frac{\Gamma(\frac{v+p}{2})\, v^{v/2}\, |\Sigma|^{-1/2}}{\pi^{p/2}\,\Gamma(v/2)\,[v + (y_i-\mu)^{T}\Sigma^{-1}(y_i-\mu)]^{\frac{v+p}{2}}} \]
Hence the log-likelihood function is the following:
\[ \ell(\theta)= n\log\Gamma(\tfrac{v+p}{2}) - \frac{n}{2}\log|\Sigma| - \frac{np}{2}\log\pi + \frac{nv}{2}\log v - n\log\Gamma(v/2) - \frac{v+p}{2}\sum_{i=1}^n\log[v + (y_i-\mu)^{T}\Sigma^{-1}(y_i-\mu)] \]
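For reference, this observed-data log-likelihood translates directly into R. The sketch below is only illustrative: the names t_loglik, Y, mu, Sigma, and v are ours (the assignment does not fix them), with Y an \(n \times p\) data matrix, mu a length-\(p\) mean vector, Sigma a \(p \times p\) scale matrix, and v the degrees of freedom.
#observed-data log-likelihood of the multivariate t model, following the formula above
t_loglik <- function(Y, mu, Sigma, v) {
n = nrow(Y); p = ncol(Y)
Sinv = solve(Sigma)
centered = sweep(Y, 2, mu) #rows y_i - mu
delta = rowSums((centered %*% Sinv) * centered) #delta(y_i; mu, Sigma)
n*lgamma((v + p)/2) - (n/2)*log(det(Sigma)) - (n*p/2)*log(pi) +
(n*v/2)*log(v) - n*lgamma(v/2) - ((v + p)/2)*sum(log(v + delta))
}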
The complete likelihood function (by product of the \(n\) multivariate densities) is the following:
\[ \begin{align} L_c(\theta) &= \prod_{i=1}^n f(x_i; \mu, \Sigma)\\ &= \prod_{i=1}^nf((y_i,u_i)^{T}; \mu, \Sigma)\\ &= \prod_{i=1}^{n}f(y_i|u_i; \mu, \Sigma) *f(u_i; \alpha, \beta)\\ &= \prod_{i=1}^n (\frac{u_i}{2\pi})^{p/2}|\Sigma|^{-1/2}\exp[-\frac{u_i}{2}\delta(y_i; \mu, \Sigma)](\frac{1}{\Gamma(v/2)(\frac{2}{v})^{v/2}}u_i^{\frac{v}{2} -1}\exp(-\frac{u_iv}{{2}}))\\ &=(2\pi)^{-np/2}\Gamma(v/2)^{-n}(2/v)^{-nv/2}|\Sigma|^{-n/2}\exp(-\frac{1}{2}[\sum_{i=1}^n u_i\delta(y_i;\mu, \Sigma) + v\sum_{i=1}^nu_i])\prod_{i=1}^nu_i^{\frac{p+v}{2}-1}\\ &=K*|\Sigma|^{-n/2}\exp(-\frac{1}{2}[\sum_{i=1}^n u_i\delta(y_i;\mu, \Sigma) + v\sum_{i=1}^nu_i])\prod_{i=1}^nu_i^{\frac{p+v}{2}-1} \end{align} \]
where, in the last line, \(K\) denotes the constant collecting the terms that do not depend on \(y_i\), \(u_i\), \(\mu\), or \(\Sigma\). Now we take the log of each side to obtain the complete-data log-likelihood.
\[ \begin{align} \ell_c(\theta) &= \log(K) -\frac{n}{2}\log(|\Sigma|) -\frac{1}{2}\left[\sum_{i=1}^n u_i\delta(y_i;\mu, \Sigma) + v\sum_{i=1}^nu_i\right] + \Big(\frac{p+v}{2} - 1\Big)\sum_{i=1}^n\log(u_i) \end{align} \]
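The complete-data log-likelihood can likewise be evaluated directly once the latent weights are known. A minimal sketch (up to the additive constant \(\log K\)), reusing the hypothetical Y, mu, Sigma, v names above together with a length-\(n\) vector u of latent weights:
#complete-data log-likelihood l_c(theta), up to the additive constant log(K)
complete_loglik <- function(Y, u, mu, Sigma, v) {
n = nrow(Y); p = ncol(Y)
centered = sweep(Y, 2, mu)
delta = rowSums((centered %*% solve(Sigma)) * centered)
-(n/2)*log(det(Sigma)) - (1/2)*(sum(u*delta) + v*sum(u)) + ((p + v)/2 - 1)*sum(log(u))
}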
By definition, \(Q(\theta^{\prime}|\theta)\) is the following:
\[ \begin{align} Q(\theta^\prime|\theta) &= E[\ell_c(\theta^{\prime})\mid y, \theta]\\ &=E\left[\log(K) -\frac{n}{2}\log(|\Sigma^{\prime}|) -\frac{1}{2}\left[\sum_{i=1}^n u_i\delta(y_i;\mu^{\prime}, \Sigma^{\prime}) + v\sum_{i=1}^nu_i\right] + \Big(\frac{p+v}{2} - 1\Big)\sum_{i=1}^n\log(u_i) \,\Big|\, y, \theta\right] \end{align} \]
We adopt the notation \(E^{*}(\cdot)\) := \(E(\cdot|y, \theta)\). We now bring the expectation inside the sum.
\[ \begin{align} Q(\theta^\prime|\theta) &=\log(K) -\frac{n}{2}\log(|\Sigma^{\prime}|) -\frac{1}{2}\left[\sum_{i=1}^n E^{*}(u_i)\,\delta(y_i;\mu^{\prime}, \Sigma^{\prime}) + v\sum_{i=1}^n E^{*}(u_i)\right] + \Big(\frac{p+v}{2} - 1\Big)\sum_{i=1}^n E^{*}(\log(u_i)) \end{align} \]
We now use some facts about the random variable \(U_i \mid y_i\). By fact (ii) in the problem statement we know that \[ U_i \mid y_i \sim \mathrm{Gamma}\Big(\alpha=\frac{p+v}{2},\ \beta=\frac{2}{v+\delta(y_i; \mu , \Sigma)}\Big) \] Therefore, we have the following two facts: \[ E^{*}(u_i) = \frac{p+v}{v+\delta(y_i;\mu, \Sigma)}\\ E^{*}(\log(u_i)) = \log\Big(\frac{2}{v+\delta(y_i; \mu, \Sigma)}\Big) + \psi\Big(\frac{p+v}{2}\Big) \]
We are now ready to plug these values into our \(Q(\theta^{\prime}|\theta)\) function.
\[ \begin{align} Q(\theta^\prime|\theta) &=\log(K) -\frac{n}{2}\log(|\Sigma^{\prime}|) -\frac{1}{2}\left[\sum_{i=1}^n E^{*}(u_i)\,\delta(y_i;\mu^{\prime}, \Sigma^{\prime}) + v\sum_{i=1}^n E^{*}(u_i)\right] + \Big(\frac{p+v}{2} - 1\Big)\sum_{i=1}^n E^{*}(\log(u_i))\\ &=\log(K) -\frac{n}{2}\log(|\Sigma^{\prime}|)-\frac{1}{2}\sum_{i=1}^n \frac{p+v}{v+\delta(y_i;\mu, \Sigma)}\Big[\delta(y_i;\mu^{\prime}, \Sigma^{\prime}) + v\Big] + \Big(\frac{p+v}{2} - 1\Big)\sum_{i=1}^n\Big[\log\Big(\frac{2}{v+\delta(y_i; \mu, \Sigma)}\Big)+ \psi\Big(\frac{p+v}{2}\Big)\Big]\\ &=\log(K_2) -\frac{n}{2}\log(|\Sigma^{\prime}|)-\frac{p+v}{2}\sum_{i=1}^n\frac{v+\delta(y_i; \mu^{\prime},\Sigma^{\prime})}{v+\delta(y_i;\mu,\Sigma)} - \Big(\frac{p+v}{2} - 1\Big)\sum_{i=1}^n\log\big(v+\delta(y_i; \mu, \Sigma)\big) \end{align} \]
The above result is a function of \(\theta\), \(\theta^{\prime}\), and the observed values \(y_1, ..., y_n\).
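In code, the E-step therefore reduces to computing one weight per observation at the current parameter values. A minimal sketch, again with the hypothetical Y, mu, Sigma, v names used above:
#E-step: expected latent weights E*(u_i) = (p + v)/(v + delta(y_i; mu, Sigma))
e_step_weights <- function(Y, mu, Sigma, v) {
p = ncol(Y)
centered = sweep(Y, 2, mu)
delta = rowSums((centered %*% solve(Sigma)) * centered)
(p + v)/(v + delta)
}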
The M-Step will update the values \(\tilde{\mu}\) and \(\tilde{\Sigma}\) using the values of \(\mu^{\prime}\) and \(\Sigma^{\prime}\) that maximize our \(Q(\theta^{\prime}|\theta)\) function with respect to \(\theta^{\prime}\).
We will rewrite \(Q(\theta^{\prime}|\theta)\) so that all terms that are not functions of \(\theta^{\prime}\) are treated as constants that will vanish when we take derivatives. Note that \(E^{*}(u_i)\) and \(E^{*}(\log(u_i))\) are functions of \(\theta\), not \(\theta^{\prime}\), so we can treat them as constants. We will denote the combination of all constant terms by \(K_3\).
\[ \begin{align} Q(\theta^\prime|\theta) &=\log(K) -\frac{n}{2}\log(|\Sigma^{\prime}|) -\frac{1}{2}\left[\sum_{i=1}^n E^{*}(u_i)\,\delta(y_i;\mu^{\prime}, \Sigma^{\prime}) + v\sum_{i=1}^n E^{*}(u_i)\right] + \Big(\frac{p+v}{2} - 1\Big)\sum_{i=1}^n E^{*}(\log(u_i))\\ &=\log(K)-\frac{v}{2}\sum_{i=1}^nE^{*}(u_i) + \Big(\frac{p+v}{2} - 1\Big)\sum_{i=1}^n E^{*}(\log(u_i))-\frac{n}{2}\log(|\Sigma^{\prime}|)-\frac{1}{2}\sum_{i=1}^n E^{*}(u_i)\,\delta(y_i;\mu^{\prime}, \Sigma^{\prime})\\ &=K_3 -\frac{n}{2}\log(|\Sigma^{\prime}|)-\frac{1}{2}\sum_{i=1}^n E^{*}(u_i)\,\delta(y_i;\mu^{\prime}, \Sigma^{\prime}) \end{align} \]
We now take the derivative with respect to \(\mu^{\prime}\) using differential notation.
\[ \begin{align} dQ(d\mu^{\prime}) &= 0 - 0 -\frac{1}{2}\sum_{i=1}^nE^{*}(u_i)\left[(-d\mu^{\prime})^{T}{\Sigma^{\prime}}^{-1}(y_i-\mu^{\prime}) + (y_i -\mu^{\prime})^{T}{\Sigma^{\prime}}^{-1}(-d\mu^{\prime})\right]\\ &=-\frac{1}{2}\sum_{i=1}^nE^{*}(u_i)\left[-2(y_i -\mu^{\prime})^{T}{\Sigma^{\prime}}^{-1}(d\mu^{\prime})\right]\\ &=\sum_{i=1}^nE^{*}(u_i)(y_i -\mu^{\prime})^{T}{\Sigma^{\prime}}^{-1}(d\mu^{\prime}) \end{align} \]
Now let \(V=\sum_{i=1}^nE^{*}(u_i)(y_i -\mu^{\prime})^{T}{\Sigma^{\prime}}^{-1}\). Note that \(V\) is a \(1 \times p\) row vector.
Then we have \(dQ(d\mu^{\prime})=V(d\mu^{\prime})\). It then follows that the \(j\)th partial derivative with respect to \(\mu^{\prime}_j\) is the \(j\)th entry of \(V\). In other words,
\[ \frac{\partial Q}{\partial \mu^{\prime}_j} = V_j\Rightarrow \nabla^{T}_{\mu^{\prime}}Q = \sum_{i=1}^nE^{*}(u_i)(y_i -\mu^{\prime})^{T}{\Sigma^{\prime}}^{-1} \]
All that remains to obtain the gradient with respect to \(\mu^{\prime}\) is to take the transpose of each side. Therefore: \[ \begin{align} \nabla_{\mu^{\prime}}Q &=\sum_{i=1}^n{\Sigma^{\prime}}^{-1}E^{*}(u_i)(y_i-\mu^{\prime}) \\ &={\Sigma^{\prime}}^{-1}\sum_{i=1}^nE^{*}(u_i)(y_i-\mu^{\prime}) \end{align} \]
Setting this equal to zero and solving for \(\mu^{\prime}\) gives us the M-step update for \(\tilde{\mu}\):
\[ \nabla_{\mu^{\prime}}Q = 0 \Rightarrow\tilde{\mu} = \frac{\sum_{i=1}^nE^{*}(u_i)y_i}{\sum_{i=1}^nE^{*}(u_i)} \]
We first remind ourselves of the \(Q\) function we will be working with. In the third line below we use the identity \((y_i-\mu^{\prime})^{T}{\Sigma^{\prime}}^{-1}(y_i-\mu^{\prime}) = \operatorname{trace}\left[{\Sigma^{\prime}}^{-1}(y_i-\mu^{\prime})(y_i-\mu^{\prime})^{T}\right]\).
\[ \begin{align} Q(\theta^\prime|\theta) &= K_3 -\frac{n}{2}\log(|\Sigma^{\prime}|)-\frac{1}{2}\sum_{i=1}^n E^{*}(u_i)\,\delta(y_i;\mu^{\prime}, \Sigma^{\prime})\\ &=K_3 -\frac{n}{2}\log(|\Sigma^{\prime}|)-\frac{1}{2}\sum_{i=1}^n E^{*}(u_i)(y_i-\mu^{\prime})^{T}{\Sigma^{\prime}}^{-1}(y_i-\mu^{\prime})\\ &=K_3 -\frac{n}{2}\log(|\Sigma^{\prime}|)-\frac{1}{2}\operatorname{trace}\left[{\Sigma^{\prime}}^{-1}\sum_{i=1}^n E^{*}(u_i)(y_i-\mu^{\prime})(y_i-\mu^{\prime})^{T}\right]\\ &=K_3 -\frac{n}{2}\log(|\Sigma^{\prime}|)-\frac{n}{2}\operatorname{trace}\left[{\Sigma^{\prime}}^{-1}\,\frac{1}{n}\sum_{i=1}^n E^{*}(u_i)(y_i-\mu^{\prime})(y_i-\mu^{\prime})^{T}\right] \end{align} \]
In the class lecture of April 6, a maximizing value of \(\Sigma\) was derived for a likelihood with this same structure: the maximizer is the matrix that appears inside the trace, immediately after \({\Sigma^{\prime}}^{-1}\) (see the 04/06 lecture on maximizing the complete-data log-likelihood of a multivariate normal).
Hence, we expect the maximizing value of \(\Sigma^{\prime}\) to be given by the following:
\[ \Sigma^{\prime}_{max} = \frac{1}{n}[\sum_{i=1}^n E^{*}(u_i)(y_i-\mu^{\prime})(y_i-\mu^{\prime})^{T}] \]
Therefore, replacing \(\mu^{\prime}\) with \(\tilde{\mu} =\frac{\sum_{i=1}^nE^{*}(u_i)y_i}{\sum_{i=1}^nE^{*}(u_i)}\) (our M-step update for \(\mu^{\prime}\)) in the above equation, we see that our M-step update for \(\Sigma^{\prime}\) is the following:
\[ \tilde{\Sigma} = \frac{1}{n}[\sum_{i=1}^n E^{*}(u_i)(y_i-\tilde{\mu})(y_i-\tilde{\mu})^{T}] \] This concludes our M-step.
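Putting the E-step and the two M-step formulas together, one full EM update could be sketched as follows (the function names are ours, not from the assignment, and \(v\) is treated as known, as in the derivation above):
#one EM iteration: E-step weights at the current (mu, Sigma), then the M-step updates
em_update <- function(Y, mu, Sigma, v) {
w = e_step_weights(Y, mu, Sigma, v) #E*(u_i)
mu.new = colSums(w * Y)/sum(w) #weighted mean, our M-step for mu
centered = sweep(Y, 2, mu.new)
Sigma.new = crossprod(sqrt(w) * centered)/nrow(Y) #(1/n) sum_i E*(u_i)(y_i - mu)(y_i - mu)^T
list(mu = mu.new, Sigma = Sigma.new)
}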