Податоците од задачате се:
raw_data = c(
1, 1, 65, 317, 2, 20,
1, 2, 65, 476, 5, 33,
1, 3, 52, 486, 4, 40,
1, 4, 310, 3259, 36, 316,
2, 1, 98, 486, 7, 31,
2, 2, 159, 1004, 10, 81,
2, 3, 175, 1355, 22, 122,
2, 4, 877, 7660, 102, 724,
3, 1, 41, 223, 5, 18,
3, 2, 117, 539, 7, 39,
3, 3, 137, 697, 16, 68,
3, 4, 477, 3442, 63, 344,
4, 1, 11, 40, 0, 3,
4, 2, 35, 148, 6, 16,
4, 3, 39, 214, 8, 25,
4, 4, 167, 1019, 33, 114
)
car = as.factor(rep(raw_data[seq(1, length(raw_data), 6)], 2))
age = as.factor(rep(raw_data[seq(2, length(raw_data), 6)], 2))
dist = as.factor(c(rep(0, length(raw_data) / 6), rep(1, length(raw_data) / 6)))
y = c(
raw_data[seq(3, length(raw_data), 6)],
raw_data[seq(5, length(raw_data), 6)]
)
n = c(
raw_data[seq(4, length(raw_data), 6)],
raw_data[seq(6, length(raw_data), 6)]
)
claims_df = data.frame(car, age, dist, y, n)
claims_df
Како што може да се забележи од графиците, дистриктот и колата влијаат позитивно на соодносот y/n, додека староста влијае негативно.
by_car = aggregate(cbind(y, n) ~ car, data=claims_df, sum)
with(by_car, plot(car, y / n))
by_age = aggregate(cbind(y, n) ~ age, data=claims_df, sum)
with(by_age, plot(age, y / n))
by_dist = aggregate(cbind(y, n) ~ dist, data=claims_df, sum)
with(by_dist, plot(dist, y / n))
Најзначајни си самиот offset и староста. Има одредена значајност и во интеракцијата помеѓу староста и колата, но не е голема.
summary(
glm(
y ~ offset(log(y + n)) + car * age * dist,
family=poisson,
data=claims_df
)
)
Call:
glm(formula = y ~ offset(log(y + n)) + car * age * dist, family = poisson,
data = claims_df)
Deviance Residuals:
[1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[25] 0 0 0 0 0 0 0 0
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.771e+00 1.240e-01 -14.279 < 2e-16 ***
car2 -1.390e-02 1.600e-01 -0.087 0.93075
car3 -9.134e-02 1.994e-01 -0.458 0.64695
car4 2.371e-01 3.260e-01 0.727 0.46707
age2 -3.480e-01 1.754e-01 -1.984 0.04727 *
age3 -5.656e-01 1.860e-01 -3.040 0.00237 **
age4 -6.724e-01 1.364e-01 -4.929 8.26e-07 ***
dist1 -6.269e-01 7.179e-01 -0.873 0.38256
car2:age2 1.431e-01 2.174e-01 0.658 0.51045
car3:age2 4.864e-01 2.524e-01 1.927 0.05397 .
car4:age2 2.278e-01 3.876e-01 0.588 0.55676
car2:age3 1.823e-01 2.248e-01 0.811 0.41745
car3:age3 6.217e-01 2.575e-01 2.414 0.01576 *
car4:age3 2.297e-01 3.888e-01 0.591 0.55469
car2:age4 1.817e-01 1.731e-01 1.050 0.29376
car3:age4 4.287e-01 2.124e-01 2.019 0.04350 *
car4:age4 2.460e-01 3.399e-01 0.724 0.46914
car2:dist1 7.201e-01 8.176e-01 0.881 0.37843
car3:dist1 9.632e-01 8.601e-01 1.120 0.26278
car4:dist1 -2.124e+01 4.225e+04 -0.001 0.99960
age2:dist1 7.177e-01 8.548e-01 0.840 0.40112
age3:dist1 5.656e-01 8.858e-01 0.639 0.52314
age4:dist1 7.902e-01 7.392e-01 1.069 0.28505
car2:age2:dist1 -1.029e+00 9.950e-01 -1.035 0.30088
car3:age2:dist1 -1.213e+00 1.052e+00 -1.153 0.24894
car4:age2:dist1 2.150e+01 4.225e+04 0.001 0.99959
car2:age3:dist1 -3.694e-01 9.944e-01 -0.371 0.71030
car3:age3:dist1 -7.539e-01 1.039e+00 -0.726 0.46795
car4:age3:dist1 2.175e+01 4.225e+04 0.001 0.99959
car2:age4:dist1 -6.994e-01 8.428e-01 -0.830 0.40662
car3:age4:dist1 -8.861e-01 8.881e-01 -0.998 0.31839
car4:age4:dist1 2.154e+01 4.225e+04 0.001 0.99959
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for poisson family taken to be 1)
Null deviance: 1.5791e+02 on 31 degrees of freedom
Residual deviance: 4.1223e-10 on 0 degrees of freedom
AIC: 232.36
Number of Fisher Scoring iterations: 20
Овој модел е нешто полош од тој во б (има повисоки резидуали), но повторно добро ги моделира податоците.
summary(
glm(
y ~ offset(log(y + n)) + as.double(car) + as.double(age),
family=poisson,
data=claims_df
)
)
Call:
glm(formula = y ~ offset(log(y + n)) + as.double(car) + as.double(age),
family = poisson, data = claims_df)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.5881 -0.5929 -0.1024 0.8757 2.2134
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.99485 0.07978 -25.004 <2e-16 ***
as.double(car) 0.17418 0.02078 8.383 <2e-16 ***
as.double(age) -0.15188 0.01848 -8.219 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for poisson family taken to be 1)
Null deviance: 157.912 on 31 degrees of freedom
Residual deviance: 27.709 on 29 degrees of freedom
AIC: 202.07
Number of Fisher Scoring iterations: 4
Податоците од задачата се:
raw_data = c(25, 8, 5, 6, 18, 11)
small = raw_data[seq(1, length(raw_data), 3)]
moderate = raw_data[seq(2, length(raw_data), 3)]
large = raw_data[seq(3, length(raw_data), 3)]
total = small + moderate + large
vaccine = as.factor(c("placebo", "vaccine"))
flu_df = data.frame(vaccine, small, moderate, large, total)
flu_df
p-вредноста на хи-квадрат тестот е доволно ниска за да се заклучи дека има значајна разлика во двете групи.
chisq.test(matrix(c(small, moderate, large), ncol=3))
Pearson's Chi-squared test
data: matrix(c(small, moderate, large), ncol = 3)
X-squared = 17.648, df = 2, p-value = 0.0001472
За да направиме лог-линеарен модел, ги реструктурираме податоците. Моделот одлично ги учи податоците и има значајна интеракција помеѓу вакцинацијата и response, што не води до истиот заклучок како и хи-квадрат тестот, а тоа е дека има значајна разлика помеѓу плацебо и вакцината.
cnt = raw_data
response = as.factor(rep(c("small", "moderate", "large"), 2))
vaccine = as.factor(c(rep("placebo", 3), rep("vaccine", 3)))
new_flu_df = data.frame(vaccine, response, cnt)
new_flu_df
log_linear_lm = glm(cnt ~ response * vaccine, family=poisson, data=new_flu_df)
summary(log_linear_lm)
Call:
glm(formula = cnt ~ response * vaccine, family = poisson, data = new_flu_df)
Deviance Residuals:
[1] 0 0 0 0 0 0
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.60944 0.44721 3.599 0.00032 ***
responsemoderate 0.47000 0.57009 0.824 0.40969
responsesmall 1.60944 0.48990 3.285 0.00102 **
vaccinevaccine 0.78846 0.53936 1.462 0.14379
responsemoderate:vaccinevaccine 0.02247 0.68663 0.033 0.97389
responsesmall:vaccinevaccine -2.21557 0.70539 -3.141 0.00168 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for poisson family taken to be 1)
Null deviance: 2.3807e+01 on 5 degrees of freedom
Residual deviance: -6.6613e-16 on 0 degrees of freedom
AIC: 37.128
Number of Fisher Scoring iterations: 3
Моделот на хомогеност треба да не содржи интеракции (значи + наместо *).
Откако го пресметуваме хи-квадратот преку пирсоновите резидуали, јасно е дека оние пирсонови резидуали кои се најголеми по апсолутна вредност допринесуваат најмногу. Тоа се редиците 1 и 4 (имаат вредноста над 2, па речиси половина од хи-квадратот е поради нив), кои соодветствуваа на small response.
На сличен начин заклучуваме дека истите тие две редици најмногу допринесуваат и до D.
log_linear_lm = glm(cnt ~ response + vaccine, family=poisson, data=new_flu_df)
fitted_values = fitted(log_linear_lm)
pearson_resid = (new_flu_df$cnt - fitted_values) / sqrt(fitted_values)
print(pearson_resid)
1 2 3 4 5 6
2.206329 -1.504324 -1.153435 -2.298942 1.567470 1.201852
chi_sq = sum(pearson_resid * pearson_resid)
print(chi_sq)
[1] 17.64783
dev_res = sign(new_flu_df$cnt - fitted_values) * (sqrt(
2 * (
new_flu_df$cnt * log(new_flu_df$cnt / fitted_values) -
(new_flu_df$cnt - fitted_values)
)
))
print(dev_res)
1 2 3 4 5 6
2.040115 -1.629720 -1.246900 -2.615460 1.468817 1.127679
deviance = sum(dev_res * dev_res)
print(deviance)
[1] 18.64253
Ординалниот модел ни посочува дека кај плацебо групата, пресечните точки се 0.08011702 (large), 0.28226302 (moderate) и 0.63761996 (small), додека кај вакцинираната група, пресечните точки се 0.3535527 (large), 0.4275750 (moderate) и 0.2188723 (small).
library(MASS)
vaccine = c()
response = c()
for (idx in 1:nrow(new_flu_df)) {
row = new_flu_df[idx, ]
vaccine = append(vaccine, rep(as.character(row$vaccine), row$cnt))
response = append(response, rep(as.character(row$response), row$cnt))
}
ordinal_model = polr(
factor(response, ordered=TRUE) ~ as.factor(vaccine),
Hess=TRUE
)
summary(ordinal_model)
Call:
polr(formula = factor(response, ordered = TRUE) ~ as.factor(vaccine),
Hess = TRUE)
Coefficients:
Value Std. Error t value
as.factor(vaccine)vaccine -1.837 0.4882 -3.764
Intercepts:
Value Std. Error t value
large|moderate -2.4408 0.4525 -5.3939
moderate|small -0.5650 0.3434 -1.6456
Residual Deviance: 139.6736
AIC: 145.6736
fitted_values = fitted(ordinal_model)
print(fitted_values[1,])
large moderate small
0.08011702 0.28226302 0.63761996
print(fitted_values[50,])
large moderate small
0.3535527 0.4275750 0.2188723
Податоците од задачата се:
satisfaction = c(
65, 34, 54, 47, 100, 100,
130, 141, 76, 116, 111, 191,
67, 130, 48, 105, 62, 104
)
level = as.factor(rep(c(rep("low", 2), rep("medium", 2), rep("high", 2)), 3))
housing = as.factor(c(rep("tower", 6), rep("apartment", 6), rep("house", 6)))
contact = as.factor(rep(c("low", "high"), 9))
sat_df = data.frame(housing, contact, level, satisfaction)
sat_df
За tower, постои значајна интеракција помеѓу задоволството и контактот.
tower_log_linear_lm = glm(
satisfaction ~ contact * level,
family=poisson,
data=sat_df[sat_df$housing == "tower", ]
)
summary(tower_log_linear_lm)
Call:
glm(formula = satisfaction ~ contact * level, family = poisson,
data = sat_df[sat_df$housing == "tower", ])
Deviance Residuals:
[1] 0 0 0 0 0 0
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 4.605e+00 1.000e-01 46.052 < 2e-16 ***
contactlow -1.338e-16 1.414e-01 0.000 1.0000
levellow -1.079e+00 1.985e-01 -5.434 5.51e-08 ***
levelmedium -7.550e-01 1.769e-01 -4.269 1.96e-05 ***
contactlow:levellow 6.480e-01 2.546e-01 2.546 0.0109 *
contactlow:levelmedium 1.388e-01 2.445e-01 0.568 0.5702
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for poisson family taken to be 1)
Null deviance: 5.7491e+01 on 5 degrees of freedom
Residual deviance: 2.3537e-14 on 0 degrees of freedom
AIC: 47.795
Number of Fisher Scoring iterations: 3
За apartment, постои значајна интеракција помеѓу задоволството и контактот (дури и повисока отколку за tower).
apartment_log_linear_lm = glm(
satisfaction ~ contact * level,
family=poisson,
data=sat_df[sat_df$housing == "apartment", ]
)
summary(apartment_log_linear_lm)
Call:
glm(formula = satisfaction ~ contact * level, family = poisson,
data = sat_df[sat_df$housing == "apartment", ])
Deviance Residuals:
[1] 0 0 0 0 0 0
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 5.25227 0.07236 72.588 < 2e-16 ***
contactlow -0.54274 0.11935 -4.547 5.43e-06 ***
levellow -0.30351 0.11103 -2.734 0.00626 **
levelmedium -0.49868 0.11771 -4.236 2.27e-05 ***
contactlow:levellow 0.46152 0.17038 2.709 0.00675 **
contactlow:levelmedium 0.11989 0.18980 0.632 0.52761
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for poisson family taken to be 1)
Null deviance: 5.6480e+01 on 5 degrees of freedom
Residual deviance: 2.4869e-14 on 0 degrees of freedom
AIC: 51.898
Number of Fisher Scoring iterations: 2
За house, не постои значајна интеракција помеѓу задоволството и контактот.
house_log_linear_lm = glm(
satisfaction ~ contact * level,
family=poisson,
data=sat_df[sat_df$housing == "house", ]
)
summary(house_log_linear_lm)
Call:
glm(formula = satisfaction ~ contact * level, family = poisson,
data = sat_df[sat_df$housing == "house", ])
Deviance Residuals:
[1] 0 0 0 0 0 0
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 4.644391 0.098058 47.364 < 2e-16 ***
contactlow -0.517257 0.160451 -3.224 0.00127 **
levellow 0.223144 0.131559 1.696 0.08986 .
levelmedium 0.009569 0.138344 0.069 0.94485
contactlow:levellow -0.145585 0.219914 -0.662 0.50796
contactlow:levelmedium -0.265503 0.236858 -1.121 0.26231
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for poisson family taken to be 1)
Null deviance: 5.8866e+01 on 5 degrees of freedom
Residual deviance: -1.1546e-14 on 0 degrees of freedom
AIC: 49.409
Number of Fisher Scoring iterations: 2
Значајноста на интеракцијата е значително помала кога симултано се креира моделот.
complete_log_linear_lm = glm(
satisfaction ~ housing * contact * level,
family=poisson,
data=sat_df
)
summary(complete_log_linear_lm)
Call:
glm(formula = satisfaction ~ housing * contact * level, family = poisson,
data = sat_df)
Deviance Residuals:
[1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 5.25227 0.07236 72.588 < 2e-16
housinghouse -0.60788 0.12186 -4.988 6.10e-07
housingtower -0.64710 0.12343 -5.243 1.58e-07
contactlow -0.54274 0.11935 -4.547 5.43e-06
levellow -0.30351 0.11103 -2.734 0.006265
levelmedium -0.49868 0.11771 -4.236 2.27e-05
housinghouse:contactlow 0.02549 0.19997 0.127 0.898583
housingtower:contactlow 0.54274 0.18505 2.933 0.003358
housinghouse:levellow 0.52666 0.17215 3.059 0.002219
housingtower:levellow -0.77530 0.22746 -3.408 0.000653
housinghouse:levelmedium 0.50825 0.18165 2.798 0.005142
housingtower:levelmedium -0.25634 0.21245 -1.207 0.227580
contactlow:levellow 0.46152 0.17038 2.709 0.006753
contactlow:levelmedium 0.11989 0.18980 0.632 0.527614
housinghouse:contactlow:levellow -0.60710 0.27819 -2.182 0.029087
housingtower:contactlow:levellow 0.18651 0.30631 0.609 0.542597
housinghouse:contactlow:levelmedium -0.38539 0.30352 -1.270 0.204181
housingtower:contactlow:levelmedium 0.01895 0.30955 0.061 0.951185
(Intercept) ***
housinghouse ***
housingtower ***
contactlow ***
levellow **
levelmedium ***
housinghouse:contactlow
housingtower:contactlow **
housinghouse:levellow **
housingtower:levellow ***
housinghouse:levelmedium **
housingtower:levelmedium
contactlow:levellow **
contactlow:levelmedium
housinghouse:contactlow:levellow *
housingtower:contactlow:levellow
housinghouse:contactlow:levelmedium
housingtower:contactlow:levelmedium
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for poisson family taken to be 1)
Null deviance: 2.9448e+02 on 17 degrees of freedom
Residual deviance: -2.2649e-14 on 0 degrees of freedom
AIC: 149.1
Number of Fisher Scoring iterations: 3
Резултатот е речиси ист како и кај номиналниот модел од претходната глава.