Податоците од задачата се:
satisfaction = c(
65, 34, 54, 47, 100, 100,
130, 141, 76, 116, 111, 191,
67, 130, 48, 105, 62, 104
)
level = as.factor(rep(c(rep("low", 2), rep("medium", 2), rep("high", 2)), 3))
housing = as.factor(c(rep("tower", 6), rep("apartment", 6), rep("house", 6)))
contact = as.factor(rep(c("low", "high"), 9))
sat_df = data.frame(housing, contact, level, satisfaction)
sat_df
Онаму каде што имало повисок контакт, имало и поголемо задоволство.
contact = c("low", "high")
low_sat_sum = c(
sum(sat_df[sat_df$contact == "low" & sat_df$level == "low", ]$satisfaction),
sum(sat_df[sat_df$contact == "high" & sat_df$level == "low", ]$satisfaction)
)
medium_sat_sum = c(
sum(sat_df[sat_df$contact == "low" & sat_df$level == "medium", ]$satisfaction),
sum(sat_df[sat_df$contact == "high" & sat_df$level == "medium", ]$satisfaction)
)
high_sat_sum = c(
sum(sat_df[sat_df$contact == "low" & sat_df$level == "high", ]$satisfaction),
sum(sat_df[sat_df$contact == "high" & sat_df$level == "high", ]$satisfaction)
)
low_sat_pct = low_sat_sum / (low_sat_sum + medium_sat_sum + high_sat_sum)
medium_sat_pct = medium_sat_sum / (low_sat_sum + medium_sat_sum + high_sat_sum)
high_sat_pct = high_sat_sum / (low_sat_sum + medium_sat_sum + high_sat_sum)
data.frame(contact, low_sat_pct, medium_sat_pct, high_sat_pct)
Најголемо задоволство имало во tower.
housing = c("tower", "apartment", "house")
low_sat_sum = c(
sum(sat_df[sat_df$housing == "tower" & sat_df$level == "low", ]$satisfaction),
sum(sat_df[sat_df$housing == "apartment" & sat_df$level == "low", ]$satisfaction),
sum(sat_df[sat_df$housing == "house" & sat_df$level == "low", ]$satisfaction)
)
medium_sat_sum = c(
sum(sat_df[sat_df$housing == "tower" & sat_df$level == "medium", ]$satisfaction),
sum(sat_df[sat_df$housing == "apartment" & sat_df$level == "medium", ]$satisfaction),
sum(sat_df[sat_df$housing == "house" & sat_df$level == "medium", ]$satisfaction)
)
high_sat_sum = c(
sum(sat_df[sat_df$housing == "tower" & sat_df$level == "high", ]$satisfaction),
sum(sat_df[sat_df$housing == "apartment" & sat_df$level == "high", ]$satisfaction),
sum(sat_df[sat_df$housing == "house" & sat_df$level == "high", ]$satisfaction)
)
low_sat_pct = low_sat_sum / (low_sat_sum + medium_sat_sum + high_sat_sum)
medium_sat_pct = medium_sat_sum / (low_sat_sum + medium_sat_sum + high_sat_sum)
high_sat_pct = high_sat_sum / (low_sat_sum + medium_sat_sum + high_sat_sum)
data.frame(housing, low_sat_pct, medium_sat_pct, high_sat_pct)
Во tower имало најнизок контакт, а во house имало највисок контакт.
housing = c("tower", "apartment", "house")
low_contact_sum = c(
sum(sat_df[sat_df$housing == "tower" & sat_df$contact == "low", ]$satisfaction),
sum(sat_df[sat_df$housing == "apartment" & sat_df$contact == "low", ]$satisfaction),
sum(sat_df[sat_df$housing == "house" & sat_df$contact == "low", ]$satisfaction)
)
high_contact_sum = c(
sum(sat_df[sat_df$housing == "tower" & sat_df$contact == "high", ]$satisfaction),
sum(sat_df[sat_df$housing == "apartment" & sat_df$contact == "high", ]$satisfaction),
sum(sat_df[sat_df$housing == "house" & sat_df$contact == "high", ]$satisfaction)
)
low_contact_pct = low_contact_sum / (low_contact_sum + high_contact_sum)
high_contact_pct = high_contact_sum / (low_contact_sum + high_contact_sum)
data.frame(housing, low_contact_pct, high_contact_pct)
library(nnet)
low_satisfaction = sat_df[sat_df$level == "low", ]$satisfaction
medium_satisfaction = sat_df[sat_df$level == "medium", ]$satisfaction
high_satisfaction = sat_df[sat_df$level == "high", ]$satisfaction
satisfaction = cbind(low_satisfaction, medium_satisfaction, high_satisfaction)
contact = sat_df[sat_df$level == "low", ]$contact
housing = sat_df[sat_df$level == "low", ]$housing
multinom_sat_df = data.frame(contact, housing, satisfaction)
multinom_sat_df
multinom_model = multinom(
satisfaction ~ contact * housing,
data=multinom_sat_df
)
# weights: 21 (12 variable)
initial value 1846.767257
iter 10 value 1800.128659
final value 1799.293647
converged
summary(multinom_model)
Call:
multinom(formula = satisfaction ~ contact * housing, data = multinom_sat_df)
Coefficients:
(Intercept) contactlow housinghouse housingtower
medium_satisfaction -0.1951677 -0.341634 -0.01840665 0.5189502
high_satisfaction 0.3035139 -0.461520 -0.52665690 0.7752913
contactlow:housinghouse contactlow:housingtower
medium_satisfaction 0.2217172 -0.1675522
high_satisfaction 0.6071035 -0.1865006
Std. Errors:
(Intercept) contactlow housinghouse housingtower
medium_satisfaction 0.1253510 0.1912147 0.1814635 0.2576842
high_satisfaction 0.1110307 0.1703794 0.1721496 0.2274631
contactlow:housinghouse contactlow:housingtower
medium_satisfaction 0.2992288 0.3480726
high_satisfaction 0.2781928 0.3063093
Residual Deviance: 3598.587
AIC: 3622.587
Да, ординален модел би одговарал во ситуацијава, бидејќи нивоата на задоволство се подредени. Мораме да ги обработиме податоците за да можеме да креираме ординален модел.
library(MASS)
contact = c()
housing = c()
satisfaction = c()
for (idx in 1:nrow(multinom_sat_df)) {
row = multinom_sat_df[idx, ]
for (sat in c("low", "medium", "high")) {
col_name = paste(sat, "satisfaction", sep="_")
cnt = row[[col_name]]
contact = append(contact, rep(row$contact, cnt))
housing = append(housing, rep(row$housing, cnt))
satisfaction = append(satisfaction, rep(sat, cnt))
}
}
satisfaction = factor(
satisfaction,
ordered=TRUE,
levels=c("low", "medium", "high")
)
ordered_sat_df = data.frame(housing, contact, satisfaction)
ordered_sat_df
polr_model = polr(
satisfaction ~ housing * contact,
data=ordered_sat_df
)
summary(polr_model)
Re-fitting to get Hessian
Call:
polr(formula = satisfaction ~ housing * contact, data = ordered_sat_df)
Coefficients:
Value Std. Error t value
housing 0.08112 0.1769 0.4585
contact -0.32911 0.2234 -1.4731
housing:contact 0.07621 0.1133 0.6728
Intercepts:
Value Std. Error t value
low|medium -0.8064 0.3409 -2.3655
medium|high 0.2937 0.3403 0.8629
Residual Deviance: 3633.392
AIC: 3643.392
За номиналниот модел (тој е делумно подобар според резултатите, иако нема некоја разлика), можеме да забележиме дека резидуалите се доста ниски (помали од 0.0001, што е навистина добро).
multinom_sat_df["sums"] =
multinom_sat_df["low_satisfaction"] +
multinom_sat_df["medium_satisfaction"] +
multinom_sat_df["high_satisfaction"]
fitted_values = fitted(multinom_model)
multinom_sat_df["low_residual"] =
as.vector(fitted_values[, 1]) * multinom_sat_df["sums"] -
multinom_sat_df["low_satisfaction"]
multinom_sat_df["medium_residual"] =
as.vector(fitted_values[, 2]) * multinom_sat_df["sums"] -
multinom_sat_df["medium_satisfaction"]
multinom_sat_df["high_residual"] =
as.vector(fitted_values[, 3]) * multinom_sat_df["sums"]-
multinom_sat_df["high_satisfaction"]
Податоците од задалата се:
cnt = c(28, 45, 29, 26, 4, 12, 5, 2, 41, 44, 20, 20, 12, 7, 3, 1)
response = rep(c("progressive", "no_change", "partial", "complete"), 4)
sex = rep(c(rep("male", 4), rep("female", 4)), 2)
treatment = c(rep("sequential", 8), rep("alternating", 8))
tumor_df = data.frame(sex, treatment, response, cnt)
tumor_df
Ги обработуваме податоците слично како во втората задача:
sex = c()
treatment = c()
response = c()
for (idx in 1:nrow(tumor_df)) {
row = tumor_df[idx, ]
sex = append(sex, rep(row$sex, row$cnt))
treatment = append(treatment, rep(row$treatment, row$cnt))
response = append(response, rep(row$response, row$cnt))
}
sex = as.factor(sex)
treatment = as.factor(treatment)
response = factor(
response,
levels=c("progressive", "no_change", "partial", "complete"),
ordered=TRUE
)
ordered_tumor_df = data.frame(sex, treatment, response)
ordered_tumor_df
Ова е моделот:
tumor_model = polr(
response ~ sex + treatment,
data=ordered_tumor_df,
Hess=TRUE
)
summary(tumor_model)
Call:
polr(formula = response ~ sex + treatment, data = ordered_tumor_df,
Hess = TRUE)
Coefficients:
Value Std. Error t value
sexmale 0.5414 0.2872 1.885
treatmentsequential 0.5807 0.2121 2.737
Intercepts:
Value Std. Error t value
progressive|no_change -0.1960 0.2893 -0.6774
no_change|partial 1.3713 0.3000 4.5706
partial|complete 2.4221 0.3224 7.5119
Residual Deviance: 789.0566
AIC: 799.0566
Гледајќи ги резидуалите, false postives и false negatives за секоја класа, и правејќи го хи-квадрат тестот за точните и предвидените фреквенции, заклучуваме дека моделот не ги учи добро податоците.
y_pred = c(predict(tumor_model))
y_true = c(ordered_tumor_df$response)
correct_freqs = c(0, 0, 0, 0)
predicted_freqs = c(0, 0, 0, 0)
residuals = c(0, 0, 0, 0)
fps = c(0, 0, 0, 0)
fns = c(0, 0, 0, 0)
for (i in 1:length(y_true)) {
correct_freqs[y_true[i]] = correct_freqs[y_true[i]] + 1
predicted_freqs[y_pred[i]] = predicted_freqs[y_pred[i]] + 1
if (y_pred[i] != y_true[i]) {
residuals[y_pred[i]] = residuals[y_pred[i]] + 1
residuals[y_true[i]] = residuals[y_true[i]] + 1
fps[y_pred[i]] = fps[y_pred[i]] + 1
fns[y_true[i]] = fns[y_true[i]] + 1
}
}
data.frame(correct_freqs, predicted_freqs, residuals, fps, fns)
chisq.test(correct_freqs, predicted_freqs)
Chi-squared approximation may be incorrect
Pearson's Chi-squared test
data: correct_freqs and predicted_freqs
X-squared = 8, df = 6, p-value = 0.2381
Од големата p-вредност тестот на Валд сугерира дека има разлика во различните третмани.
library(lmtest)
treatment_model = polr(response ~ treatment, data=ordered_sat_df, Hess=TRUE)
waldtest(treatment_model)
Wald test
Model 1: response ~ treatment
Model 2: response ~ 1
Res.Df Df Chisq Pr(>Chisq)
1 295
2 296 -1 7.2424 0.00712 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
И тестот на Wald и обичното summary на моделот каде што можеме да ги споредиме AIC вредностите сугерираат дека моделот кој што зема предвид treatment е подобар и ја поддржуваат тезата дека има разлика помеѓу различните третмани.
no_treatment = polr(response ~ sex, data=ordered_sat_df, Hess=TRUE)
summary(no_treatment)
Call:
polr(formula = response ~ sex, data = ordered_sat_df, Hess = TRUE)
Coefficients:
Value Std. Error t value
sexmale 0.5219 0.2871 1.818
Intercepts:
Value Std. Error t value
progressive|no_change -0.4932 0.2682 -1.8385
no_change|partial 1.0407 0.2729 3.8137
partial|complete 2.0790 0.2945 7.0600
Residual Deviance: 796.6268
AIC: 804.6268
waldtest(no_treatment)
Wald test
Model 1: response ~ sex
Model 2: response ~ 1
Res.Df Df Chisq Pr(>Chisq)
1 295
2 296 -1 3.3048 0.06908 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
yes_treatment = polr(
response ~ sex + treatment,
data=ordered_sat_df,
Hess=TRUE
)
summary(yes_treatment)
Call:
polr(formula = response ~ sex + treatment, data = ordered_sat_df,
Hess = TRUE)
Coefficients:
Value Std. Error t value
sexmale 0.5414 0.2872 1.885
treatmentsequential 0.5807 0.2121 2.737
Intercepts:
Value Std. Error t value
progressive|no_change -0.1960 0.2893 -0.6774
no_change|partial 1.3713 0.3000 4.5706
partial|complete 2.4221 0.3224 7.5119
Residual Deviance: 789.0566
AIC: 799.0566
waldtest(yes_treatment)
Wald test
Model 1: response ~ sex + treatment
Model 2: response ~ 1
Res.Df Df Chisq Pr(>Chisq)
1 294
2 296 -2 10.738 0.004659 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1