GLM set choosen for this question is a binomial response variable with a single independent variable (X). We use the typical logit link function for modeling.
#Examining Deviance test
set.seed(1366)
NREP = 500
sample_n = 100 #This variable shows number of steps for sample size. each steps will increase by 5
beta_A = c(0.1,0.2,0) #null hypothesis is true
beta_B = c(0.1, 0.2,-0.07) #null hypothesis is false
Wrong_Dec_matrix = matrix(0,sample_n,5)
colnames(Wrong_Dec_matrix) <- c("Sample Size","Wrong_Rejections", "Wrong_acceptance", "Wrong_Decision", "Wrong_Decisions_Ratio")
for (k in 1:sample_n){
sample_size = k * 5
Null_Rej_DevianceT = matrix(0, NREP, 2)
for (i in 1:NREP){
x1 <- runif(sample_size,min = -2, max=2)
x2 <- rnorm(sample_size, 0, 2)
y_A <- rbinom(sample_size, size = 20, prob = 1/(1+exp(-(cbind(1,x1,x2) %*% beta_A))))
y_B <- rbinom(sample_size, size = 20, prob = 1/(1+exp(-(cbind(1,x1,x2) %*% beta_B))))
Full_model_A <- glm(cbind(y_A , 20-y_A) ~ cbind(x1,x2), family = binomial())
Red_model_A <- glm(cbind(y_A , 20-y_A) ~ x1, family = binomial())
Full_model_B <- glm(cbind(y_B , 20-y_B) ~ cbind(x1,x2), family = binomial())
Red_model_B <- glm(cbind(y_B , 20-y_B) ~ x1, family = binomial())
Null_Rej_DevianceT[i,1] <- as.numeric(deviance(Red_model_A) - deviance(Full_model_A) > qchisq(0.95, 1))
Null_Rej_DevianceT[i,2] <- as.numeric((deviance(Red_model_B) - deviance(Full_model_B)) > qchisq(0.95, 1))
}
Wrong_Dec_matrix[k,1] = sample_size
Wrong_Dec_matrix[k,2] = sum(Null_Rej_DevianceT[,1])
Wrong_Dec_matrix[k,3] = NREP - sum(Null_Rej_DevianceT[,2])
Wrong_Dec_matrix[k,4] = Wrong_Dec_matrix[k,2] + Wrong_Dec_matrix[k,3]
Wrong_Dec_matrix[k,5] = Wrong_Dec_matrix[k,4] / NREP
}
print(head(Wrong_Dec_matrix,10))
## Sample Size Wrong_Rejections Wrong_acceptance Wrong_Decision
## [1,] 5 34 445 479
## [2,] 10 26 429 455
## [3,] 15 25 414 439
## [4,] 20 26 375 401
## [5,] 25 24 368 392
## [6,] 30 23 304 327
## [7,] 35 35 295 330
## [8,] 40 19 268 287
## [9,] 45 29 233 262
## [10,] 50 16 207 223
## Wrong_Decisions_Ratio
## [1,] 0.958
## [2,] 0.910
## [3,] 0.878
## [4,] 0.802
## [5,] 0.784
## [6,] 0.654
## [7,] 0.660
## [8,] 0.574
## [9,] 0.524
## [10,] 0.446
print(tail(Wrong_Dec_matrix,10))
## Sample Size Wrong_Rejections Wrong_acceptance Wrong_Decision
## [91,] 455 18 0 18
## [92,] 460 21 0 21
## [93,] 465 20 0 20
## [94,] 470 26 0 26
## [95,] 475 20 0 20
## [96,] 480 29 0 29
## [97,] 485 19 0 19
## [98,] 490 21 0 21
## [99,] 495 27 0 27
## [100,] 500 20 0 20
## Wrong_Decisions_Ratio
## [91,] 0.036
## [92,] 0.042
## [93,] 0.040
## [94,] 0.052
## [95,] 0.040
## [96,] 0.058
## [97,] 0.038
## [98,] 0.042
## [99,] 0.054
## [100,] 0.040
In the following plot, we can see how accuracy of LR test increases with regard to sample size:
plot(Wrong_Dec_matrix[,1], Wrong_Dec_matrix[,5], type = "l", lty=1, lwd=2, xlab = "Sample Size", ylab = "Wrong decision ratio", main = "Accuracy of LR test in different sample sizes")
lines(Wrong_Dec_matrix[,1], Wrong_Dec_matrix[,2]/NREP, col="red",lty=2)
lines(Wrong_Dec_matrix[,1], Wrong_Dec_matrix[,3]/NREP, col="blue",lty=3)
legend("topright",c("Wrong decision ratio","Wrong rejection ratio","Wrong acceptance ratio"),lty=1:3,lwd=c(2,1,1),col = c("black","red","blue"))
As we can see in the plot above when sample size increases, accuracy for LR test also increases and converges to its best amount (i.e. about %5 of wrong decision). We can see that for sample sizes greater than 200 (approximately) we do not see considerable improvement in accuracy. Also we should note that the factor that improves the accuracy of decisions is decrease in the rate of wrong acceptance ratio when the reduced model is not adequate but we do not reject it in favor of full model. Wrong rejection of reduced model does not get significantly better by increaseing sample size.
As we have shown in the second question of this assignment, wald and score statistic are the same for binomial model. So in the following chunk, I am implementing these two tests at the same time.
Wrong_Dec_matrix2 = matrix(0,sample_n,5)
colnames(Wrong_Dec_matrix2) <- c("Sample Size","Wrong_Rejections", "Wrong_acceptance", "Wrong_Decision", "Wrong_Decisions_Ratio")
for (k in 1:sample_n){
sample_size = k * 5
Null_Rej_Wald = matrix(0, NREP, 2)
for (i in 1:NREP){
x1 <- runif(sample_size,min = -2, max=2)
x2 <- rnorm(sample_size, 0, 2)
y_A <- rbinom(sample_size, size = 20, prob = 1/(1+exp(-(cbind(1,x1,x2) %*% beta_A))))
y_B <- rbinom(sample_size, size = 20, prob = 1/(1+exp(-(cbind(1,x1,x2) %*% beta_B))))
Full_model_A <- glm(cbind(y_A , 20-y_A) ~ cbind(x1,x2), family = binomial())
Red_model_A <- glm(cbind(y_A , 20-y_A) ~ x1, family = binomial())
Full_model_B <- glm(cbind(y_B , 20-y_B) ~ cbind(x1,x2), family = binomial())
Red_model_B <- glm(cbind(y_B , 20-y_B) ~ x1, family = binomial())
Null_Rej_Wald[i,1] <- as.numeric(abs(Full_model_A$coefficients[3]/sqrt(summary(Full_model_A)$cov.unscaled[3,3])) > qnorm(0.975))
Null_Rej_Wald[i,2] <- as.numeric(abs(Full_model_B$coefficients[3]/sqrt(summary(Full_model_B)$cov.unscaled[3,3])) > qnorm(0.975))
}
Wrong_Dec_matrix2[k,1] = sample_size
Wrong_Dec_matrix2[k,2] = sum(Null_Rej_Wald[,1])
Wrong_Dec_matrix2[k,3] = NREP - sum(Null_Rej_Wald[,2])
Wrong_Dec_matrix2[k,4] = Wrong_Dec_matrix2[k,2] + Wrong_Dec_matrix2[k,3]
Wrong_Dec_matrix2[k,5] = Wrong_Dec_matrix2[k,4] / NREP
}
print(head(Wrong_Dec_matrix2,10))
## Sample Size Wrong_Rejections Wrong_acceptance Wrong_Decision
## [1,] 5 21 458 479
## [2,] 10 25 451 476
## [3,] 15 22 393 415
## [4,] 20 27 365 392
## [5,] 25 19 350 369
## [6,] 30 25 307 332
## [7,] 35 26 289 315
## [8,] 40 26 262 288
## [9,] 45 25 243 268
## [10,] 50 21 214 235
## Wrong_Decisions_Ratio
## [1,] 0.958
## [2,] 0.952
## [3,] 0.830
## [4,] 0.784
## [5,] 0.738
## [6,] 0.664
## [7,] 0.630
## [8,] 0.576
## [9,] 0.536
## [10,] 0.470
print(tail(Wrong_Dec_matrix2,10))
## Sample Size Wrong_Rejections Wrong_acceptance Wrong_Decision
## [91,] 455 14 0 14
## [92,] 460 25 0 25
## [93,] 465 30 0 30
## [94,] 470 20 0 20
## [95,] 475 26 0 26
## [96,] 480 18 0 18
## [97,] 485 23 0 23
## [98,] 490 19 0 19
## [99,] 495 20 0 20
## [100,] 500 31 0 31
## Wrong_Decisions_Ratio
## [91,] 0.028
## [92,] 0.050
## [93,] 0.060
## [94,] 0.040
## [95,] 0.052
## [96,] 0.036
## [97,] 0.046
## [98,] 0.038
## [99,] 0.040
## [100,] 0.062
plot(Wrong_Dec_matrix2[,1], Wrong_Dec_matrix2[,5], type = "l", lty=1, lwd=2, xlab = "Sample Size", ylab = "Wrong decision ratio", main = "Accuracy of Wald test in different sample sizes")
lines(Wrong_Dec_matrix2[,1], Wrong_Dec_matrix2[,2]/NREP, col="red",lty=2)
lines(Wrong_Dec_matrix2[,1], Wrong_Dec_matrix2[,3]/NREP, col="blue",lty=3)
legend("topright",c("Wrong decision ratio","Wrong rejection ratio","Wrong acceptance ratio"),lty=1:3,lwd=c(2,1,1),col = c("black","red","blue"))
In the plot below, I compare LR and Wald test and we can see that there is no significant differnce between them in terms of convergence to best accuracy by sample size.
plot(Wrong_Dec_matrix[,1], Wrong_Dec_matrix[,5], type = "l", lty=1, col="blue", xlab = "Sample Size", ylab = "Wrong decision ratio", main = "Comparison of LR and Wald test")
lines(Wrong_Dec_matrix2[,1], Wrong_Dec_matrix2[,5], col="red",lty=2)
legend("topright", c("LR Test", "Wald test"), lty = c(1,2), col=c("blue","red"))
##Question 2- Exercise 5.1 (a) \[ f(y_i; n, \pi)=\binom{n}{y_i}\pi^{y_i}(1-\pi)^{n-y_i} \,\,\, for \,\,\,i=1,...,N \\ l(\pi;y)=\sum_{i=1}^N y_ilog\,\pi + (n-y_i)\,log\,(1-\pi)+log\,\binom{n}{y_i}\\ U=\frac{\partial l}{\partial \pi} = \sum_{i=1}^N \frac{y_i}\pi - (n-y_i)\frac 1{1-\pi} \\ U=0 \Rightarrow \hat \pi = \frac{\overline y}{n} \]
And for information matrix we have: \[ I=-E(U')\\ U'= \sum_{i=1}^N -\frac{y_i}{\pi^2}-(n-y_i)\frac 1{(1-\pi)^2} \\E(U') = -\frac{nN\pi}{\pi^2}-\frac{nN-nN\pi}{(1-\pi)^2} = -nN\bigg(\frac1\pi+\frac1{1-\pi}\bigg) \\ \Rightarrow I=nN\bigg(\frac1{\pi(1-\pi)}\bigg) \]
Then Wald statitic equals to: \[
(\hat \pi - \pi)^2(nN\frac1{\pi(1-\pi)})\\
= (\frac{\overline y}n - \pi)^2(nN\frac1{\pi(1-\pi)})\\
= (\overline y - n\pi)^2(N\frac1{n\pi(1-\pi)})\\
=\frac{(\overline y - n\pi)^2}{\frac{n\pi(1-\pi)}{N}}
\] (b) \[
U=\frac{N\overline y}{\pi}-\frac{nN-N\overline y}{1-\pi}=N\bigg(\frac{\overline y -n\pi}{\pi(1-\pi)}\bigg)\\
\Rightarrow U^TI^{-1}U = N^2 \times\frac{(\overline y - n\pi)^2}{\pi^2(1-\pi)^2}\times\frac{\pi(1-\pi)}{nN}\\
= \frac{(\overline y - n\pi)^2}{\frac{n\pi(1-\pi)}{N}}
\] (c) \[
D=2[l(\hat \pi;y)-l(\pi;y)]=2\bigg[\sum_{i=1}^N y_ilog\frac{\hat \pi}{\pi}+(n-y_i)log\frac{1-\hat \pi}{1-\pi}\bigg]=2\bigg [nN\hat\pi log\frac{\hat \pi}{\pi}+nN(1-\hat\pi) log\frac{1-\hat \pi}{1-\pi}\bigg]=\\
2nN\bigg [\hat\pi log\frac{\hat \pi}{\pi}+(1-\hat\pi) log\frac{1-\hat \pi}{1-\pi}\bigg]
\] (d)
(i) \(\pi=0.1\)
Wald Statistic:
rej1_W <- ((3-10*0.1)^2) / (10*0.1*0.9) > qchisq(0.95,1)
print(rej1_W)
## [1] TRUE
Deviance Statistic:
rej1_D <- 2*10*(0.3*log(0.3/0.1)+0.7*log(0.7/0.9)) > qchisq(0.95,1)
print(rej1_D)
## [1] FALSE
(ii) \(\pi=0.3\)
Wald Statistic:
rej2_W <- ((3-10*0.3)^2) / (10*0.3*0.7) > qchisq(0.95,1)
print(rej2_W)
## [1] FALSE
Deviance Statistic:
rej2_D <- 2*10*(0.3*log(0.3/0.3)+0.7*log(0.7/0.7)) > qchisq(0.95,1)
print(rej2_D)
## [1] FALSE
(iii) \(\pi=0.5\)
Wald Statistic:
rej3_W <- ((3-10*0.5)^2) / (10*0.5*0.5) > qchisq(0.95,1)
print(rej3_W)
## [1] FALSE
Deviance Statistic:
rej3_D <- 2*10*(0.3*log(0.3/0.5)+0.7*log(0.7/0.5)) > qchisq(0.95,1)
print(rej3_D)
## [1] FALSE
###First
dose <- c(1.6907, 1.7242, 1.7552, 1.7842, 1.8113, 1.8369, 1.8610, 1.8839)
n <- c(59, 60, 62, 56, 63, 59, 62, 60)
y <- c(6, 13, 18, 28, 52, 53, 61, 60)
p <- y / n
fit1 <- glm(cbind(y,n-y) ~ dose, family = binomial(link = "logit"))
fit2 <- glm(cbind(y,n-y) ~ dose, family = binomial(link = "probit"))
fit3 <- glm(cbind(y,n-y) ~ dose, family = binomial(link = "cloglog"))
plot(dose,p, pch=19)
points(dose, fit1$fitted.values, pch=22)
points(dose, fit2$fitted.values, pch=23)
points(dose, fit3$fitted.values, pch=24)
legend("topleft", c("True values", "Logit model", "Probit Model", "Cloglog model"), pch = c(19, 22, 23, 24))
Examining the plot, cloglog models seems a little bit better. Although, it is not evident and in my opinion we cannot graphically infere in this regard
###Second
Logit Link For the second part, we build a linear model which relates Doses (as the dependant variable) to the logit of odds (as the independant variable). Thus, the model will be like this: \[ Dose = \beta_0+\beta_1\,log\bigg(\frac{\pi}{1-\pi}\bigg) \] At first, I am going to visually examine the linear relationship between these two:
plot(log(p/(1-p)),dose)
As expected, the plot confirms the linear relationship.
x_new <- log(p/(1-p))
dose_model <- lm(dose ~ x_new, subset = c(1:7))
summary(dose_model)
##
## Call:
## lm(formula = dose ~ x_new, subset = c(1:7))
##
## Residuals:
## 1 2 3 4 5 6 7
## -0.017971 -0.008446 0.012050 0.017062 0.002474 0.011295 -0.016465
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.767138 0.006113 289.067 9.4e-12 ***
## x_new 0.026838 0.002893 9.276 0.000245 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01572 on 5 degrees of freedom
## Multiple R-squared: 0.9451, Adjusted R-squared: 0.9341
## F-statistic: 86.05 on 1 and 5 DF, p-value: 0.0002448
Since Residuals standard error is the estimate of standard deviation of the dependant variable (dose), we will have point estimate and confidence interval for %50 and %5 as follows:
dose_50 = as.numeric(dose_model$coefficients[1] + dose_model$coefficients[2] * log(1))
dose_50_CI <- c(dose_50-1.96*summary(dose_model)$sigma, dose_50+1.96*summary(dose_model)$sigma)
print("point estimation for %50 mortality dose is:")
## [1] "point estimation for %50 mortality dose is:"
dose_50
## [1] 1.767138
print("%95 confidence interval for %50 mortality dose is:")
## [1] "%95 confidence interval for %50 mortality dose is:"
dose_50_CI
## [1] 1.736329 1.797947
dose_5 = as.numeric(dose_model$coefficients[1] + dose_model$coefficients[2] * log(0.05/0.95))
dose_5_CI <- c(dose_5-1.96*summary(dose_model)$sigma, dose_5+1.96*summary(dose_model)$sigma)
print("point estimation for %5 mortality dose is:")
## [1] "point estimation for %5 mortality dose is:"
dose_5
## [1] 1.688116
print("%95 confidence interval for %5 mortality dose is:")
## [1] "%95 confidence interval for %5 mortality dose is:"
dose_5_CI
## [1] 1.657307 1.718924
Probit Link We repeat all the above steps for probit model. \[ Dose = \beta_0+\beta_1\Phi^{-1}(\pi) \]
dose_model2 <- lm(dose ~ qnorm(p), subset = c(1:7))
summary(dose_model2)
##
## Call:
## lm(formula = dose ~ qnorm(p), subset = c(1:7))
##
## Residuals:
## 1 2 3 4 5 6 7
## -0.016064 -0.006248 0.013549 0.015762 -0.002529 0.006788 -0.011260
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.768438 0.005213 339.2 4.22e-12 ***
## qnorm(p) 0.048488 0.004448 10.9 0.000113 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01348 on 5 degrees of freedom
## Multiple R-squared: 0.9596, Adjusted R-squared: 0.9515
## F-statistic: 118.8 on 1 and 5 DF, p-value: 0.0001129
dose_50 = as.numeric(dose_model2$coefficients[1] + dose_model2$coefficients[2] * qnorm(0.5))
dose_50_CI <- c(dose_50-1.96*summary(dose_model2)$sigma, dose_50+1.96*summary(dose_model2)$sigma)
print("point estimation for %50 mortality dose is:")
## [1] "point estimation for %50 mortality dose is:"
dose_50
## [1] 1.768438
print("%95 confidence interval for %50 mortality dose is:")
## [1] "%95 confidence interval for %50 mortality dose is:"
dose_50_CI
## [1] 1.742020 1.794856
dose_5 = as.numeric(dose_model2$coefficients[1] + dose_model2$coefficients[2] * qnorm(0.05))
dose_5_CI <- c(dose_5-1.96*summary(dose_model2)$sigma, dose_5+1.96*summary(dose_model2)$sigma)
print("point estimation for %5 mortality dose is:")
## [1] "point estimation for %5 mortality dose is:"
dose_5
## [1] 1.688682
print("%95 confidence interval for %5 mortality dose is:")
## [1] "%95 confidence interval for %5 mortality dose is:"
dose_5_CI
## [1] 1.662265 1.715100
Cloglog Link We repeat all the above steps for cloglog model. \[ Dose = \beta_0+\beta_1\,log[-log(1-\pi)] \]
dose_model3 <- lm(dose ~ log(-log(1-p)), subset = c(1:7))
summary(dose_model3)
##
## Call:
## lm(formula = dose ~ log(-log(1 - p)), subset = c(1:7))
##
## Residuals:
## 1 2 3 4 5 6 7
## -0.0020954 -0.0064349 0.0089486 0.0055897 -0.0097736 0.0034188 0.0003469
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.795465 0.002844 631.3 1.89e-13 ***
## log(-log(1 - p)) 0.045986 0.002243 20.5 5.11e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.007273 on 5 degrees of freedom
## Multiple R-squared: 0.9882, Adjusted R-squared: 0.9859
## F-statistic: 420.4 on 1 and 5 DF, p-value: 5.108e-06
dose_50 = as.numeric(dose_model3$coefficients[1] + dose_model3$coefficients[2] * log(-log(0.5)))
dose_50_CI <- c(dose_50-1.96*summary(dose_model3)$sigma, dose_50+1.96*summary(dose_model3)$sigma)
print("point estimation for %50 mortality dose is:")
## [1] "point estimation for %50 mortality dose is:"
dose_50
## [1] 1.77861
print("%95 confidence interval for %50 mortality dose is:")
## [1] "%95 confidence interval for %50 mortality dose is:"
dose_50_CI
## [1] 1.764356 1.792865
dose_5 = as.numeric(dose_model3$coefficients[1] + dose_model3$coefficients[2] * log(-log(0.95)))
dose_5_CI <- c(dose_5-1.96*summary(dose_model3)$sigma, dose_5+1.96*summary(dose_model3)$sigma)
print("point estimation for %5 mortality dose is:")
## [1] "point estimation for %5 mortality dose is:"
dose_5
## [1] 1.658878
print("%95 confidence interval for %5 mortality dose is:")
## [1] "%95 confidence interval for %5 mortality dose is:"
dose_5_CI
## [1] 1.644624 1.673133
*Comparing three models Length of confidence intervals for three models are as follows:
## [1] "logit:"
## [1] 0.03143749
## [1] "probit:"
## [1] 0.02695694
## [1] "cloglog:"
## [1] 0.01454521
Regarding length of CI we can see that cloglog model is more accurate. If we consider \(R^2\), we see that cloglog model has highest amounts which shows that this model explains highest amount of variance in values of dose.
library(generalhoslem)
## Loading required package: reshape
## Loading required package: MASS
set.seed(50)
NREP = 500
n_obs = 100
statistic_values <- array(0,NREP)
for (i in 1:NREP){
x <- rnorm(n_obs, 1, 1)
beta <- c(0.1,0.2)
y <- rbinom(n_obs, size = 1, prob = (1/(1+exp(-(beta[1]+beta[2]*x)))))
fit4 <- glm(y ~ x, family = binomial())
hoslem <- logitgof(y,fit4$fitted.values)
statistic_values[i] = hoslem$statistic
}
mean(statistic_values)
## [1] 8.168053
var(statistic_values)
## [1] 13.93862
qqplot(statistic_values, rchisq(500,8))
abline(0,1,lty=2)
This qqpolt compares values of the statistic with chi-square(df=8) and we can confirm that this statistic has distribution of chi-square with (g-2) degrees of freedom.
##Question 5 To implement EM algorithm for probit regression, at first I am going to prepare simulated data for modeling.
x <- rnorm(2000,0,2)
betas <- c(0.1 , 0.2)
z <- rnorm(2000, mean = cbind(1,x) %*% betas, sd=1)
y <- z
y[y>=0] <- 1
y[y<0] <- 0
t <- glm(y~x, family = binomial(link = "probit"))
summary(t)
##
## Call:
## glm(formula = y ~ x, family = binomial(link = "probit"))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0109 -1.1485 0.7254 1.0386 1.8799
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.1564 0.0290 5.394 6.89e-08 ***
## x 0.2047 0.0155 13.208 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2749.2 on 1999 degrees of freedom
## Residual deviance: 2563.9 on 1998 degrees of freedom
## AIC: 2567.9
##
## Number of Fisher Scoring iterations: 4
As the next step, we should calculate \(\gamma_i\): \[ \gamma_i=E(z_i|y=y_i) \] \(z_i\)s conditioned on \(y_i\) have truncated normal distribution. So the expected value can be calculated by the equations below. If \(y_i=1\), \[ E(z_i|z_i \geq 0)=x^T\beta+\frac{\phi(x^T\beta)}{\Phi(x^T\beta)} \]
And If \(y_i=0\), \[ E(z_i|z_i<0) =x^T\beta-\frac{\phi(x^T\beta)}{\Phi(-x^T\beta)} \]
b <- c(10,2) #initial values for betas
ex <- cbind(1,x)
mu <- ex %*% b
it=1
converged = FALSE
maxits = 100000
tol=0.0001
gammas=array(0,20)
while ((!converged) & (it < maxits)) {
b_old = b
gammas = ifelse(y==1,mu+dnorm(mu)/pnorm(mu), mu-dnorm(mu)/pnorm(-mu)) #E-step
b = solve(t(ex)%*%ex)%*%t(ex)%*%gammas #Mstep
mu = ex %*% b
it = it + 1
converged = max(abs(b_old - b)) <= tol
}
b
## [,1]
## 0.1564594
## x 0.2047532
it
## [1] 19
We can see that the estimations are acceptable and it is not at all sensitive to initial values.
library(readr)
housedata <- read_csv("/Users/hamed/Documents/Academics/Generalized Linear Models/housedata.csv")
## Parsed with column specification:
## cols(
## Sat_L = col_double(),
## Sat_M = col_double(),
## Sat_H = col_double(),
## Is_apartment = col_double(),
## Is_house = col_double(),
## Contact = col_double()
## )
(a)
table_SC <-data.frame(
Low_Satisfaction = rep(0,2),
Medium_Satisfaction = rep(0,2),
High_Satisfaction = rep(0,2)
)
row.names(table_SC) <- c("Low Contact","High Contact")
temp <- subset(housedata, Contact==0)
table_SC$Low_Satisfaction[1] = sum(temp$Sat_L)/(sum(temp$Sat_L)+sum(temp$Sat_M)+sum(temp$Sat_H))
table_SC$Medium_Satisfaction[1] = sum(temp$Sat_M)/(sum(temp$Sat_L)+sum(temp$Sat_M)+sum(temp$Sat_H))
table_SC$High_Satisfaction[1] = sum(temp$Sat_H)/(sum(temp$Sat_L)+sum(temp$Sat_M)+sum(temp$Sat_H))
temp <- subset(housedata, Contact==1)
table_SC$Low_Satisfaction[2] = sum(temp$Sat_L)/(sum(temp$Sat_L)+sum(temp$Sat_M)+sum(temp$Sat_H))
table_SC$Medium_Satisfaction[2] = sum(temp$Sat_M)/(sum(temp$Sat_L)+sum(temp$Sat_M)+sum(temp$Sat_H))
table_SC$High_Satisfaction[2] = sum(temp$Sat_H)/(sum(temp$Sat_L)+sum(temp$Sat_M)+sum(temp$Sat_H))
table_SC
## Low_Satisfaction Medium_Satisfaction High_Satisfaction
## Low Contact 0.3674614 0.2496494 0.3828892
## High Contact 0.3150826 0.2768595 0.4080579
table_TC <-data.frame(
Tower_block = rep(0,2),
Apartment = rep(0,2),
House = rep(0,2)
)
row.names(table_TC) <- c("Low Contact","High Contact")
temp <- subset(housedata, Is_apartment==0 & Is_house==0)
table_TC$Tower_block[1] = sum(subset(temp,Contact==0,select = c("Sat_L","Sat_M", "Sat_H"))) /sum(subset(temp,select = c("Sat_L","Sat_M", "Sat_H")))
table_TC$Tower_block[2] = sum(subset(temp,Contact==1,select = c("Sat_L","Sat_M", "Sat_H"))) /sum(subset(temp,select = c("Sat_L","Sat_M", "Sat_H")))
temp <- subset(housedata, Is_apartment==1)
table_TC$Apartment[1] = sum(subset(temp,Contact==0,select = c("Sat_L","Sat_M", "Sat_H"))) /sum(subset(temp,select = c("Sat_L","Sat_M", "Sat_H")))
table_TC$Apartment[2] = sum(subset(temp,Contact==1,select = c("Sat_L","Sat_M", "Sat_H"))) /sum(subset(temp,select = c("Sat_L","Sat_M", "Sat_H")))
temp <- subset(housedata, Is_house==1)
table_TC$House[1] = sum(subset(temp,Contact==0,select = c("Sat_L","Sat_M", "Sat_H"))) /sum(subset(temp,select = c("Sat_L","Sat_M", "Sat_H")))
table_TC$House[2] = sum(subset(temp,Contact==1,select = c("Sat_L","Sat_M", "Sat_H"))) /sum(subset(temp,select = c("Sat_L","Sat_M", "Sat_H")))
table_TC
## Tower_block Apartment House
## Low Contact 0.5475 0.4143791 0.3430233
## High Contact 0.4525 0.5856209 0.6569767
table_TS <-data.frame(
Low_Satisfaction = rep(0,3),
Medium_Satisfaction = rep(0,3),
High_Satisfaction = rep(0,3)
)
row.names(table_TS) <- c("Tower_block","Apartment","House")
temp <- subset(housedata, Is_apartment==0 & Is_house==0)
table_TS$Low_Satisfaction[1]=sum(subset(temp,select = "Sat_L"))/sum(subset(temp,select = c("Sat_L","Sat_M", "Sat_H")))
table_TS$Medium_Satisfaction[1]=sum(subset(temp,select = "Sat_M"))/sum(subset(temp,select = c("Sat_L","Sat_M", "Sat_H")))
table_TS$High_Satisfaction[1]=sum(subset(temp,select = "Sat_H"))/sum(subset(temp,select = c("Sat_L","Sat_M", "Sat_H")))
temp <- subset(housedata, Is_apartment==1)
table_TS$Low_Satisfaction[2]=sum(subset(temp,select = "Sat_L"))/sum(subset(temp,select = c("Sat_L","Sat_M", "Sat_H")))
table_TS$Medium_Satisfaction[2]=sum(subset(temp,select = "Sat_M"))/sum(subset(temp,select = c("Sat_L","Sat_M", "Sat_H")))
table_TS$High_Satisfaction[2]=sum(subset(temp,select = "Sat_H"))/sum(subset(temp,select = c("Sat_L","Sat_M", "Sat_H")))
temp <- subset(housedata, Is_house==1)
table_TS$Low_Satisfaction[3]=sum(subset(temp,select = "Sat_L"))/sum(subset(temp,select = c("Sat_L","Sat_M", "Sat_H")))
table_TS$Medium_Satisfaction[3]=sum(subset(temp,select = "Sat_M"))/sum(subset(temp,select = c("Sat_L","Sat_M", "Sat_H")))
table_TS$High_Satisfaction[3]=sum(subset(temp,select = "Sat_H"))/sum(subset(temp,select = c("Sat_L","Sat_M", "Sat_H")))
table_TS
## Low_Satisfaction Medium_Satisfaction High_Satisfaction
## Tower_block 0.2475000 0.2525000 0.5000000
## Apartment 0.3542484 0.2509804 0.3947712
## House 0.3817829 0.2965116 0.3217054
(b)
library(nnet)
library(readr)
newhousedata <- read_delim("/Users/hamed/Documents/Academics/Generalized Linear Models/newhousedata.csv",
"\t", escape_double = FALSE, trim_ws = TRUE)
## Parsed with column specification:
## cols(
## n = col_double(),
## Satisfaction = col_character(),
## Is_apartment = col_double(),
## Is_house = col_double(),
## housetype = col_character(),
## Contact = col_character(),
## N = col_double()
## )
newhousedata$Satisfaction <- relevel(as.factor(newhousedata$Satisfaction), ref = "low")
newhousedata$housetype <- relevel(as.factor(newhousedata$housetype), ref = "Towerblock")
newhousedata$Contact <- relevel(as.factor(newhousedata$Contact), ref = "low")
nom_model <- multinom(Satisfaction~housetype+Contact, data = newhousedata, weights = n, Hess = TRUE)
## # weights: 15 (8 variable)
## initial value 1846.767257
## iter 10 value 1803.278543
## final value 1802.740161
## converged
summary(nom_model)
## Call:
## multinom(formula = Satisfaction ~ housetype + Contact, data = newhousedata,
## weights = n, Hess = TRUE)
##
## Coefficients:
## (Intercept) housetypeApartment housetypeHouse Contacthigh
## high 0.5607737 -0.6415967 -0.9456177 0.3282263
## medium -0.1072644 -0.4067537 -0.3370771 0.2959803
##
## Std. Errors:
## (Intercept) housetypeApartment housetypeHouse Contacthigh
## high 0.1329301 0.1500773 0.1644850 0.1181870
## medium 0.1524077 0.1713011 0.1803577 0.1301046
##
## Residual Deviance: 3605.48
## AIC: 3621.48
(c) Yes, Ordinal Model might be more appropriate, because satisfaction has an ordinal nature.
library(MASS)
newhousedata <- read_delim("newhousedata.csv",
"\t", escape_double = FALSE, trim_ws = TRUE)
## Parsed with column specification:
## cols(
## n = col_double(),
## Satisfaction = col_character(),
## Is_apartment = col_double(),
## Is_house = col_double(),
## housetype = col_character(),
## Contact = col_character(),
## N = col_double()
## )
newhousedata$Satisfaction <- as.ordered(newhousedata$Satisfaction)
ord_model <- polr(Satisfaction ~ housetype + Contact, data = newhousedata, weights = n)
summary(ord_model)
##
## Re-fitting to get Hessian
## Call:
## polr(formula = Satisfaction ~ housetype + Contact, data = newhousedata,
## weights = n)
##
## Coefficients:
## Value Std. Error t value
## housetypeHouse 0.27590 0.10434 2.6442
## housetypeTowerblock -0.29241 0.11786 -2.4810
## Contactlow 0.07453 0.09235 0.8071
##
## Intercepts:
## Value Std. Error t value
## high|low -0.3671 0.0802 -4.5796
## low|medium 1.0817 0.0843 12.8294
##
## Residual Deviance: 3628.55
## AIC: 3638.55
In the lines below, I am going to calculate p-values for coefficients and intercepts:
ctable <- coef(summary(ord_model))
##
## Re-fitting to get Hessian
p <- pnorm(abs(ctable[,"t value"]), lower.tail = FALSE) * 2
ctable <- cbind(ctable, "p-value" = p)
ctable
## Value Std. Error t value p-value
## housetypeHouse 0.27590184 0.10434090 2.644235 8.187583e-03
## housetypeTowerblock -0.29240512 0.11785638 -2.481029 1.310037e-02
## Contactlow 0.07453312 0.09235173 0.807057 4.196336e-01
## high|low -0.36711750 0.08016366 -4.579600 4.658661e-06
## low|medium 1.08168216 0.08431290 12.829379 1.122608e-37
According to the results, all the coefficients and intercepts are significant (at %95 level) except for intercept for medium to high which can be accepted to equal to zero at this level of significance. This code calculates standardized residuals for this model:
obs <- newhousedata$n / newhousedata$N
e <- predict(ord_model, data = newhousedata$Satisfaction, type = "prob")
est <- array(0,18)
for (i in 1:nrow(newhousedata)){
est[i] = e[i,as.character(newhousedata$Satisfaction[i])]
}
resids <- (obs - est) / sqrt(est)
resids
## [1] -0.046097209 0.069859635 -0.009021106 -0.228981667 0.128411728
## [6] 0.102562205 0.118107280 -0.053728395 -0.065843122 -0.039314745
## [11] 0.011409002 0.026733023 0.053266795 -0.094268267 0.039029369
## [16] 0.062564040 0.001698972 -0.064361035
plot(obs,resids)
abline(0,0,lty=2)
We can see in the plot that residuals do not show specific pattern against observed probability in lower probabilities. For probabilities higher than 0.4 we can see some kind of increasing pattern.