library(readr)
library(readxl)
library(AID)
library(stats)
library(magrittr)
library(caret)
library(reshape2)
library(dplyr)
library(InformationValue)
library(ROCR)
library(regclass)
data1 <- read_excel("data1.xls") # task 1: UK chicken consumption data
data2 <- read_excel("Telegram Desktop/задание2.xlsx") # task 2: photo/video service firms
data3 <- read_excel("Telegram Desktop/задание3.xlsx", skip = 2) # task 3: university performance indicators
data4 <- read_excel("Telegram Desktop/задание4.xls") # task 4: employee attrition
Using UK data on chicken consumption (Y), per-capita income (X1), the price of a pound of chicken (X2), the price of a pound of pork (X3), and the price of a pound of beef (X4): a) build, compare, and interpret the regression equations:
logData1 = log(data1) # log-log specification: the coefficients are interpretable as elasticities
The regression of log(Y) on log(X2) alone fits only moderately well (R^2 ≈ 0.74): the chicken price by itself explains a limited share of the variation in consumption.
lm1 = lm(Y~X2, data=logData1)
summary(lm1)
##
## Call:
## lm(formula = Y ~ X2, data = logData1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.137580 -0.049196 0.008714 0.058689 0.120432
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.57772 0.31051 5.081 9.25e-05 ***
## X2 0.55272 0.07989 6.918 2.49e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.07812 on 17 degrees of freedom
## Multiple R-squared: 0.7379, Adjusted R-squared: 0.7225
## F-statistic: 47.86 on 1 and 17 DF, p-value: 2.485e-06
The regression of log(Y) on log(X1) (income) fits well (R^2 ≈ 0.94): income is a much stronger single predictor of consumption than the chicken price.
lm2 = lm(Y~X1, data=logData1)
summary(lm2)
##
## Call:
## lm(formula = Y ~ X1, data = logData1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.072152 -0.020452 0.003223 0.025825 0.059829
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.74599 0.12292 14.20 7.33e-11 ***
## X1 0.28492 0.01768 16.12 9.84e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03781 on 17 degrees of freedom
## Multiple R-squared: 0.9386, Adjusted R-squared: 0.935
## F-statistic: 259.8 on 1 and 17 DF, p-value: 9.837e-12
As expected, the model with both income and the chicken price fits even better (R^2 ≈ 0.96); note that once income is controlled for, the chicken-price coefficient turns negative, consistent with a demand interpretation.
lm3 = lm(Y~X1 + X2, data=logData1)
summary(lm3)
##
## Call:
## lm(formula = Y ~ X1 + X2, data = logData1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.055142 -0.018277 -0.005682 0.022947 0.051872
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.01848 0.12665 15.937 3.07e-11 ***
## X1 0.41614 0.04157 10.011 2.70e-08 ***
## X2 -0.30482 0.09094 -3.352 0.00405 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02988 on 16 degrees of freedom
## Multiple R-squared: 0.9639, Adjusted R-squared: 0.9594
## F-statistic: 213.7 on 2 and 16 DF, p-value: 2.872e-12
The price-only model (chicken, pork, beef) also fits the data quite well (R^2 ≈ 0.95) and agrees with common-sense expectations: the own price X2 enters negatively, the beef price X4 (a substitute) enters positively, while the pork price X3 is not significant.
lm4 = lm(Y~X2 + X3 + X4, data=logData1)
summary(lm4)
##
## Call:
## lm(formula = Y ~ X2 + X3 + X4, data = logData1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.06833 -0.02295 0.01050 0.02525 0.04585
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.31519 0.22155 10.450 2.79e-08 ***
## X2 -0.48745 0.21100 -2.310 0.0355 *
## X3 0.23742 0.15562 1.526 0.1479
## X4 0.46005 0.07527 6.112 1.99e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03772 on 15 degrees of freedom
## Multiple R-squared: 0.9461, Adjusted R-squared: 0.9353
## F-statistic: 87.74 on 3 and 15 DF, p-value: 9.73e-10
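Beyond R^2, the four specifications can also be compared with information criteria (a minimal sketch using base stats; lower values indicate a better trade-off between fit and complexity, and all four models are fit on the same observations):
AIC(lm1, lm2, lm3, lm4) # Akaike information criterion for each specification
BIC(lm1, lm2, lm3, lm4) # Schwarz criterion penalises extra parameters more heavily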
Additionally, we apply a Box-Cox transformation where it is needed, i.e. to X1, X2, X3 and X4, whose Shapiro-Wilk p-values fall below the 0.05 critical level.
shapiro.test(data1$X1)
##
## Shapiro-Wilk normality test
##
## data: data1$X1
## W = 0.89644, p-value = 0.04197
boxcoxnc(data1$X1)
##
## Box-Cox power transformation
## -------------------------------------------------------------------
## data : data1$X1
##
## lambda.hat : -0.38
##
##
## Shapiro-Wilk normality test for transformed data (alpha = 0.05)
## -------------------------------------------------------------------
##
## statistic : 0.9600685
## p.value : 0.5737312
##
## Result : Transformed data are normal.
## -------------------------------------------------------------------
shapiro.test(data1$X2)
##
## Shapiro-Wilk normality test
##
## data: data1$X2
## W = 0.85485, p-value = 0.008055
boxcoxnc(data1$X2)
##
## Box-Cox power transformation
## -------------------------------------------------------------------
## data : data1$X2
##
## lambda.hat : 1.8
##
##
## Shapiro-Wilk normality test for transformed data (alpha = 0.05)
## -------------------------------------------------------------------
##
## statistic : 0.8567855
## p.value : 0.008667564
##
## Result : Transformed data are not normal.
## -------------------------------------------------------------------
shapiro.test(data1$X3)
##
## Shapiro-Wilk normality test
##
## data: data1$X3
## W = 0.8953, p-value = 0.04004
boxcoxnc(data1$X3)
##
## Box-Cox power transformation
## -------------------------------------------------------------------
## data : data1$X3
##
## lambda.hat : -0.42
##
##
## Shapiro-Wilk normality test for transformed data (alpha = 0.05)
## -------------------------------------------------------------------
##
## statistic : 0.9086766
## p.value : 0.07000484
##
## Result : Transformed data are normal.
## -------------------------------------------------------------------
shapiro.test(data1$X4)
##
## Shapiro-Wilk normality test
##
## data: data1$X4
## W = 0.87763, p-value = 0.01953
boxcoxnc(data1$X4)
##
## Box-Cox power transformation
## -------------------------------------------------------------------
## data : data1$X4
##
## lambda.hat : -0.7
##
##
## Shapiro-Wilk normality test for transformed data (alpha = 0.05)
## -------------------------------------------------------------------
##
## statistic : 0.9306845
## p.value : 0.178292
##
## Result : Transformed data are normal.
## -------------------------------------------------------------------
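To actually use these transformations in a regression, the estimated powers can be applied by hand (a sketch; the lambda values are taken from the boxcoxnc output above, and the usual Box-Cox convention (x^lambda - 1)/lambda is assumed):
# Box-Cox power transform for a given lambda (log(x) when lambda = 0)
bc <- function(x, lambda) if (lambda == 0) log(x) else (x^lambda - 1) / lambda
data1bc <- data1
data1bc$X1 <- bc(data1$X1, -0.38)
data1bc$X3 <- bc(data1$X3, -0.42)
data1bc$X4 <- bc(data1$X4, -0.70)
# X2 is left untransformed: its Box-Cox version still failed the Shapiro-Wilk test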
wilcox.test(data1$X3, data1$X4)
##
## Wilcoxon rank sum exact test
##
## data: data1$X3 and data1$X4
## W = 102, p-value = 0.02152
## alternative hypothesis: true location shift is not equal to 0
select(data1, X2, X3)
## # A tibble: 19 x 2
## X2 X3
## <dbl> <dbl>
## 1 37.3 54.7
## 2 38.1 63.7
## 3 39.3 69.8
## 4 37.8 65.9
## 5 38.4 64.5
## 6 40.1 70
## 7 38.6 73.2
## 8 39.8 67.8
## 9 39.7 79.1
## 10 52.1 95.4
## 11 48.9 94.2
## 12 58.3 124.
## 13 57.9 130.
## 14 56.5 118.
## 15 63.7 131.
## 16 61.6 130.
## 17 58.9 128
## 18 66.4 141
## 19 70.4 168.
fligner.test(data1)
##
## Fligner-Killeen test of homogeneity of variances
##
## data: data1
## Fligner-Killeen:med chi-squared = 62.099, df = 4, p-value = 1.05e-12
t.test(data1$X2, data1$X3)
##
## Welch Two Sample t-test
##
## data: data1$X2 and data1$X3
## t = -5.9376, df = 22.16, p-value = 5.475e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -65.56778 -31.63222
## sample estimates:
## mean of x mean of y
## 49.67368 98.27368
Using the data in «задание2.xlsx» on 50 firms providing photo and video services in city N in 2019, consider estimating a nonlinear (in particular, polynomial) regression of the profitability of investments in equipment (profitability, %) on the average number of staff per project (staff), the average monthly number of projects (projects), and the share of expenses on advertising and brand promotion (adverts, %). a) Find the polynomial degree at which the coefficient of determination becomes close to 1. b) Show that this result is achieved by overfitting the model. c) Choose the optimal polynomial degree, including from a substantive point of view, assuming that the firms can be considered homogeneous with respect to all other aspects of their activity.
polynomSize = 10
arrayOfErrors = rep(0, polynomSize) # in-sample R^2 for each polynomial degree
for (i in 1:polynomSize) {
lm.fit = lm(profitability~poly(staff, i) + poly(adverts, i) + poly(projects, i), data=data2)
arrayOfErrors[i] = summary(lm.fit)$r.squared
}
plot(1:polynomSize, arrayOfErrors, xlab="Polynomial degree", ylab="R^2", type="l")
Degree 1: Adjusted R-squared = 0.7406
linearModel2_1 = lm(profitability~poly(staff, 1) + poly(adverts, 1) + poly(projects, 1), data=data2)
summary(linearModel2_1)
##
## Call:
## lm(formula = profitability ~ poly(staff, 1) + poly(adverts, 1) +
## poly(projects, 1), data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.7006 -4.1140 -0.3395 4.9787 18.9472
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 101.239 1.177 86.047 < 2e-16 ***
## poly(staff, 1) -57.252 8.447 -6.777 1.97e-08 ***
## poly(adverts, 1) 45.966 8.458 5.435 2.02e-06 ***
## poly(projects, 1) 58.376 8.489 6.877 1.39e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.32 on 46 degrees of freedom
## Multiple R-squared: 0.7565, Adjusted R-squared: 0.7406
## F-statistic: 47.64 on 3 and 46 DF, p-value: 3.733e-14
Degree 2: Adjusted R-squared = 0.896
linearModel2_2 = lm(profitability~poly(staff, 2) + poly(adverts, 2) + poly(projects, 2), data=data2)
summary(linearModel2_2)
##
## Call:
## lm(formula = profitability ~ poly(staff, 2) + poly(adverts, 2) +
## poly(projects, 2), data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.0820 -3.7249 -0.2015 3.1046 10.1281
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 101.2392 0.7451 135.878 < 2e-16 ***
## poly(staff, 2)1 -55.1866 5.4473 -10.131 5.81e-13 ***
## poly(staff, 2)2 -37.0286 5.3484 -6.923 1.66e-08 ***
## poly(adverts, 2)1 47.7111 5.3718 8.882 2.78e-11 ***
## poly(adverts, 2)2 -25.5261 5.7106 -4.470 5.62e-05 ***
## poly(projects, 2)1 59.8028 5.5384 10.798 7.98e-14 ***
## poly(projects, 2)2 23.6518 5.4756 4.320 9.06e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.268 on 43 degrees of freedom
## Multiple R-squared: 0.9087, Adjusted R-squared: 0.896
## F-statistic: 71.34 on 6 and 43 DF, p-value: < 2.2e-16
Degree 3: Adjusted R-squared = 0.9139
linearModel2_3 = lm(profitability~poly(staff, 3) + poly(adverts, 3) + poly(projects, 3), data=data2)
summary(linearModel2_3)
##
## Call:
## lm(formula = profitability ~ poly(staff, 3) + poly(adverts, 3) +
## poly(projects, 3), data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.4649 -2.5857 0.3781 2.9326 10.3508
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 101.2392 0.6781 149.307 < 2e-16 ***
## poly(staff, 3)1 -52.1851 5.1000 -10.232 9.92e-13 ***
## poly(staff, 3)2 -34.5825 4.9925 -6.927 2.37e-08 ***
## poly(staff, 3)3 12.2187 4.9894 2.449 0.0188 *
## poly(adverts, 3)1 44.9252 4.9766 9.027 3.39e-11 ***
## poly(adverts, 3)2 -24.9179 5.3712 -4.639 3.71e-05 ***
## poly(adverts, 3)3 11.5985 5.2740 2.199 0.0337 *
## poly(projects, 3)1 60.5199 5.0816 11.910 9.93e-15 ***
## poly(projects, 3)2 24.8145 5.3797 4.613 4.03e-05 ***
## poly(projects, 3)3 7.8554 5.1174 1.535 0.1326
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.795 on 40 degrees of freedom
## Multiple R-squared: 0.9297, Adjusted R-squared: 0.9139
## F-statistic: 58.75 on 9 and 40 DF, p-value: < 2.2e-16
Degree 5: Adjusted R-squared = 0.916
linearModel2_5 = lm(profitability~poly(staff, 5) + poly(adverts, 5) + poly(projects, 5), data=data2)
summary(linearModel2_5)
##
## Call:
## lm(formula = profitability ~ poly(staff, 5) + poly(adverts, 5) +
## poly(projects, 5), data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.5217 -1.8580 0.0309 2.8685 8.1066
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 101.2392 0.6697 151.174 < 2e-16 ***
## poly(staff, 5)1 -52.8428 5.3298 -9.915 1.45e-11 ***
## poly(staff, 5)2 -35.2626 5.1847 -6.801 8.00e-08 ***
## poly(staff, 5)3 14.9078 5.2621 2.833 0.007698 **
## poly(staff, 5)4 3.6594 4.9802 0.735 0.467499
## poly(staff, 5)5 -8.9644 5.0804 -1.764 0.086632 .
## poly(adverts, 5)1 42.2136 5.2711 8.009 2.47e-09 ***
## poly(adverts, 5)2 -24.5382 5.5084 -4.455 8.64e-05 ***
## poly(adverts, 5)3 8.6692 5.5511 1.562 0.127617
## poly(adverts, 5)4 1.8918 5.0351 0.376 0.709450
## poly(adverts, 5)5 3.9627 5.4797 0.723 0.474525
## poly(projects, 5)1 60.3096 5.2908 11.399 3.68e-13 ***
## poly(projects, 5)2 20.9593 5.7121 3.669 0.000826 ***
## poly(projects, 5)3 9.6043 5.3098 1.809 0.079336 .
## poly(projects, 5)4 -10.4826 5.4233 -1.933 0.061617 .
## poly(projects, 5)5 -2.0767 5.3737 -0.386 0.701567
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.735 on 34 degrees of freedom
## Multiple R-squared: 0.9417, Adjusted R-squared: 0.916
## F-statistic: 36.61 on 15 and 34 DF, p-value: < 2.2e-16
Degree 7: Adjusted R-squared = 0.9146
linearModel2_7 = lm(profitability~poly(staff, 7) + poly(adverts, 7) + poly(projects, 7), data=data2)
summary(linearModel2_7)
##
## Call:
## lm(formula = profitability ~ poly(staff, 7) + poly(adverts, 7) +
## poly(projects, 7), data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.1100 -2.1741 0.2706 1.8177 7.7701
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 101.2392 0.6752 149.933 < 2e-16 ***
## poly(staff, 7)1 -52.5766 5.5888 -9.408 3.64e-10 ***
## poly(staff, 7)2 -37.8759 5.4525 -6.947 1.49e-07 ***
## poly(staff, 7)3 15.6114 5.6661 2.755 0.010195 *
## poly(staff, 7)4 3.7711 5.3210 0.709 0.484359
## poly(staff, 7)5 -10.2932 5.3069 -1.940 0.062566 .
## poly(staff, 7)6 1.0543 6.4381 0.164 0.871100
## poly(staff, 7)7 10.7904 6.3414 1.702 0.099913 .
## poly(adverts, 7)1 41.7997 5.9936 6.974 1.39e-07 ***
## poly(adverts, 7)2 -26.9347 5.9576 -4.521 0.000103 ***
## poly(adverts, 7)3 7.4740 6.2622 1.194 0.242686
## poly(adverts, 7)4 6.4035 5.7133 1.121 0.271887
## poly(adverts, 7)5 -1.8235 6.5825 -0.277 0.783796
## poly(adverts, 7)6 0.7344 6.2522 0.117 0.907327
## poly(adverts, 7)7 1.4045 5.4616 0.257 0.798940
## poly(projects, 7)1 56.9922 5.8205 9.792 1.53e-10 ***
## poly(projects, 7)2 20.8905 6.1294 3.408 0.001999 **
## poly(projects, 7)3 5.0908 6.5847 0.773 0.445927
## poly(projects, 7)4 -9.0865 6.1440 -1.479 0.150323
## poly(projects, 7)5 1.4624 6.3672 0.230 0.820017
## poly(projects, 7)6 5.5896 6.3057 0.886 0.382932
## poly(projects, 7)7 0.8812 6.3895 0.138 0.891291
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.775 on 28 degrees of freedom
## Multiple R-squared: 0.9512, Adjusted R-squared: 0.9146
## F-statistic: 25.98 on 21 and 28 DF, p-value: 3.576e-13
Degree 12: Adjusted R-squared = 0.9395 (Multiple R-squared = 0.9839, close to 1, yet almost all higher-order terms are individually insignificant, an early sign of overfitting)
linearModel2_12 = lm(profitability~poly(staff, 12) + poly(adverts, 12) + poly(projects, 12), data=data2)
summary(linearModel2_12)
##
## Call:
## lm(formula = profitability ~ poly(staff, 12) + poly(adverts,
## 12) + poly(projects, 12), data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.3405 -1.1910 0.0033 0.9440 5.5871
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 101.2392 0.5683 178.129 <2e-16 ***
## poly(staff, 12)1 -58.1254 23.8471 -2.437 0.0299 *
## poly(staff, 12)2 -50.7891 28.8292 -1.762 0.1016
## poly(staff, 12)3 4.5251 22.1203 0.205 0.8411
## poly(staff, 12)4 -3.4767 9.7871 -0.355 0.7281
## poly(staff, 12)5 -6.6681 9.7780 -0.682 0.5072
## poly(staff, 12)6 17.0681 27.0147 0.632 0.5385
## poly(staff, 12)7 27.3567 40.6149 0.674 0.5124
## poly(staff, 12)8 33.2720 44.1977 0.753 0.4650
## poly(staff, 12)9 24.8612 33.3421 0.746 0.4692
## poly(staff, 12)10 16.2163 23.5360 0.689 0.5029
## poly(staff, 12)11 10.4506 12.9597 0.806 0.4345
## poly(staff, 12)12 13.0371 7.0749 1.843 0.0883 .
## poly(adverts, 12)1 30.9949 11.8063 2.625 0.0210 *
## poly(adverts, 12)2 -20.3340 12.5022 -1.626 0.1278
## poly(adverts, 12)3 3.6125 11.2255 0.322 0.7527
## poly(adverts, 12)4 16.6422 14.3460 1.160 0.2669
## poly(adverts, 12)5 -11.2481 13.3632 -0.842 0.4152
## poly(adverts, 12)6 6.2490 10.6809 0.585 0.5685
## poly(adverts, 12)7 -11.1094 8.2808 -1.342 0.2027
## poly(adverts, 12)8 -0.9260 7.1050 -0.130 0.8983
## poly(adverts, 12)9 4.8295 6.2239 0.776 0.4517
## poly(adverts, 12)10 5.2088 6.6276 0.786 0.4460
## poly(adverts, 12)11 -1.0857 6.6004 -0.164 0.8719
## poly(adverts, 12)12 -10.7248 5.7301 -1.872 0.0839 .
## poly(projects, 12)1 38.1452 28.9492 1.318 0.2104
## poly(projects, 12)2 42.7443 30.3712 1.407 0.1828
## poly(projects, 12)3 -7.0393 18.8245 -0.374 0.7145
## poly(projects, 12)4 -8.6137 7.4026 -1.164 0.2655
## poly(projects, 12)5 11.1659 20.4627 0.546 0.5945
## poly(projects, 12)6 -0.6829 33.9853 -0.020 0.9843
## poly(projects, 12)7 25.8796 29.8659 0.867 0.4019
## poly(projects, 12)8 -7.4325 31.3320 -0.237 0.8162
## poly(projects, 12)9 26.3695 39.2778 0.671 0.5137
## poly(projects, 12)10 -0.7220 32.4296 -0.022 0.9826
## poly(projects, 12)11 11.9014 25.3010 0.470 0.6459
## poly(projects, 12)12 0.7141 13.6222 0.052 0.9590
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.019 on 13 degrees of freedom
## Multiple R-squared: 0.9839, Adjusted R-squared: 0.9395
## F-statistic: 22.13 on 36 and 13 DF, p-value: 3.242e-07
To demonstrate the overfitting and choose a sensible degree, we compare model quality with and without cross-validation, and then compare training and test errors across polynomial degrees.
set.seed(1)
training.samples <- data2$profitability %>% createDataPartition(p = 0.8, list = FALSE)
train.data <- data2[training.samples, ]
test.data <- data2[-training.samples, ]
# plain linear model: fit on the 80% training split, evaluated on the 20% hold-out
model <- lm(profitability ~., data = train.data)
predictions <- model %>% predict(test.data)
R2(predictions, test.data$profitability)
## [1] 0.7429733
set.seed(1)
train.control <- trainControl(method = "cv", number = 10) # 10-fold cross-validation
modelCV <- train(profitability ~., data = data2, method = "lm", trControl = train.control)
# note: modelCV is trained on the full sample, so test.data is not strictly out of sample here
predictionsCV <- modelCV %>% predict(test.data)
R2(predictionsCV, test.data$profitability)
## [1] 0.8021626
set.seed(1)
maxPolynomSize = 5
train = sample(50, 25)
train.errors = rep(0, maxPolynomSize) # in-sample MSE for each degree
test.errors = rep(0, maxPolynomSize) # hold-out MSE for each degree
for (i in 1:maxPolynomSize) {
lm.fit=lm(profitability~poly(staff, i) + poly(adverts, i) + poly(projects, i),
data=data2, subset = train)
train.errors[i] = mean((data2$profitability - predict(lm.fit, data2))[train]^2)
test.errors[i] = mean((data2$profitability - predict(lm.fit, data2))[-train]^2)
}
qplot(1:maxPolynomSize, test.errors, xlab = "Polynomial degree", ylab = "Test MSE")
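A more systematic way to pick the degree is k-fold cross-validation per degree (a sketch assuming the boot package is available; cv.glm returns the cross-validated estimate of the mean squared prediction error):
library(boot)
cv.errors = rep(0, maxPolynomSize)
for (i in 1:maxPolynomSize) {
glm.fit = glm(profitability~poly(staff, i) + poly(adverts, i) + poly(projects, i), data=data2)
cv.errors[i] = cv.glm(data2, glm.fit, K = 10)$delta[1] # 10-fold CV error for degree i
}
which.min(cv.errors) # degree with the smallest cross-validated error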
Apply principal component analysis to reduce the dimensionality of the data on the performance of Russian universities (file «задание3.xlsx»). a) Determine the minimum number of components needed to retain 75% of the original information. b) Write out the formulas expressing the principal components from a) in terms of the original variables.
data3[, 1] <- NULL # drop the first column (not used in the PCA)
pcal = prcomp(data3, scale. = TRUE) # PCA on the standardised indicators
summary(pcal)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.4059 1.4092 1.14417 0.96359 0.94776 0.82369 0.74918
## Proportion of Variance 0.4134 0.1419 0.09351 0.06632 0.06416 0.04846 0.04009
## Cumulative Proportion 0.4134 0.5553 0.64880 0.71513 0.77929 0.82775 0.86784
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.72619 0.69719 0.56762 0.53358 0.35872 0.2592 0.18459
## Proportion of Variance 0.03767 0.03472 0.02301 0.02034 0.00919 0.0048 0.00243
## Cumulative Proportion 0.90551 0.94023 0.96324 0.98358 0.99277 0.9976 1.00000
pcal$rotation[, c(1, 2, 3, 4, 5)]
## PC1 PC2 PC3 PC4 PC5
## x1 0.28607737 0.02481524 -0.006826135 0.394031346 -0.28151793
## x2 0.13099801 0.28601704 0.552065419 -0.089233427 -0.25887789
## x3 0.34119480 -0.22033491 0.067549833 -0.257805906 0.31404117
## x4 0.32879358 -0.20743165 0.083679503 -0.195291296 0.33994000
## x5 0.37793811 -0.10949390 0.071679815 -0.065173350 0.05997512
## x6 0.38460810 -0.08495492 0.037909166 -0.002734889 0.07193408
## x7 0.32176089 0.13651277 -0.193201958 0.383669548 0.11159878
## x8 0.20206610 0.16075792 -0.355388519 -0.073756200 -0.51233973
## x9 0.15501420 -0.16134255 -0.346102966 -0.514354997 -0.40257112
## x10 0.18264310 0.49870019 0.209974986 -0.023377115 0.09227097
## x11 0.28062705 -0.09218763 0.186320464 -0.188699215 -0.18320373
## x12 0.09893056 0.59425588 0.012686508 -0.251929909 0.06123391
## x13 0.30772848 -0.03930942 -0.170180509 0.435781133 -0.02174976
## x14 0.02711555 0.35270505 -0.534503896 -0.145560825 0.38401476
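The minimum number of components needed for 75% of the variance, and the formulas of the components in terms of the standardised original variables, can be read directly off the prcomp object (a short sketch):
expl = cumsum(pcal$sdev^2) / sum(pcal$sdev^2) # cumulative share of explained variance
which(expl >= 0.75)[1] # first component count reaching 75% (5, per the summary above)
# each PC_k is a linear combination of the standardised indicators:
# PC_k = sum over j of rotation[j, k] * (x_j - mean(x_j)) / sd(x_j)
round(pcal$rotation[, 1:5], 3) # the loadings that enter these formulas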
Using the data in «задание4.xls», train a logistic regression model to predict employee attrition. a) Select the variables to include in the model and justify the choice; fit the logistic regression and interpret the results. b) Report the confusion matrix and consider various quality metrics for the model. c) Plot the ROC curve.
cormat <- round(cor(data4), 2) # pairwise Pearson correlations, for a first look at the predictors
melted_cormat <- melt(cormat)
ggplot(data = melted_cormat, aes(Var2, Var1, fill = value))+
geom_tile(color = "white")+
scale_fill_gradient2(low = "yellow", high = "green", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Pearson\nCorrelation") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 12, hjust = 1))+
coord_fixed()
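To justify the variable selection numerically, the predictors can also be ranked by the absolute value of their correlation with the outcome (a simple sketch based on the cormat computed above; the target column itself is dropped from the ranking):
sort(abs(cormat[, "target"]), decreasing = TRUE)[-1] # predictors ordered by |correlation| with target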
Full model with all predictors: AIC = 896.44
logReg = glm(target~., data=data4, family=binomial(link="logit"))
summary(logReg) # full model, no variable selection
##
## Call:
## glm(formula = target ~ ., family = binomial(link = "logit"),
## data = data4)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3510 -0.6561 -0.3200 0.7438 2.4227
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.6275744 1.9920181 -0.817 0.413901
## age 0.0115989 0.0267336 0.434 0.664383
## educ 0.0047014 0.0742355 0.063 0.949503
## work_exp -0.0022471 0.0263544 -0.085 0.932052
## interest -0.1647039 0.0302454 -5.446 5.16e-08 ***
## coffee -0.9211480 0.1052395 -8.753 < 2e-16 ***
## boss_educ 0.1076321 0.0644938 1.669 0.095142 .
## passport 0.5882873 0.1747101 3.367 0.000759 ***
## green 0.0180053 0.0663929 0.271 0.786243
## floor -0.0403283 0.0212694 -1.896 0.057950 .
## children -0.1120272 0.0686061 -1.633 0.102489
## climate -0.5085813 0.0691901 -7.350 1.97e-13 ***
## offhour 1.1655882 0.1757012 6.634 3.27e-11 ***
## dist 0.1400555 0.0216791 6.460 1.04e-10 ***
## salary -0.3852382 0.1728999 -2.228 0.025874 *
## heigh -0.0003100 0.0103481 -0.030 0.976100
## lunch 0.0005173 0.0016067 0.322 0.747459
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1233.86 on 983 degrees of freedom
## Residual deviance: 862.44 on 967 degrees of freedom
## AIC: 896.44
##
## Number of Fisher Scoring iterations: 5
After keeping only the significant predictors: AIC = 891.59 (slightly better, though the improvement is marginal; the main gain is a much more parsimonious model with essentially the same fit).
data4b <- select(data4, target, interest, coffee, passport, climate, offhour, dist, salary)
logRegB = glm(target~., data=data4b, family=binomial(link="logit"))
summary(logRegB)
##
## Call:
## glm(formula = target ~ ., family = binomial(link = "logit"),
## data = data4b)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4074 -0.6631 -0.3397 0.7435 2.3894
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.87382 0.36857 -2.371 0.017747 *
## interest -0.15136 0.02939 -5.151 2.60e-07 ***
## coffee -0.93676 0.10257 -9.133 < 2e-16 ***
## passport 0.57356 0.17242 3.327 0.000879 ***
## climate -0.49392 0.06752 -7.315 2.57e-13 ***
## offhour 1.14208 0.17281 6.609 3.87e-11 ***
## dist 0.12774 0.02062 6.196 5.80e-10 ***
## salary -0.37200 0.17090 -2.177 0.029504 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1233.86 on 983 degrees of freedom
## Residual deviance: 875.59 on 976 degrees of freedom
## AIC: 891.59
##
## Number of Fisher Scoring iterations: 5
mean(VIF(logRegB)) # average variance inflation factor across the selected predictors
## [1] 1.047769
Acceptable: the mean VIF is close to 1, so there is no noticeable multicollinearity among the selected predictors.
training.samples <- data4b$target %>% createDataPartition(p = 0.8, list = FALSE)
train.data <- data4b[training.samples, ]
test.data <- data4b[-training.samples, ]
predictions <- logRegB %>% predict(test.data) # linear predictors for the hold-out observations
Concordance(test.data$target, predictions)$Concordance
## [1] 0.8480867
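Note that logRegB was fitted on the full sample, so this concordance is partly in-sample; a stricter out-of-sample check would refit the model on train.data only (a sketch):
logRegTrain <- glm(target~., data = train.data, family = binomial(link = "logit")) # refit on the 80% split
predHoldout <- predict(logRegTrain, newdata = test.data, type = "response")
Concordance(test.data$target, predHoldout)$Concordance # concordance on truly unseen data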
fitted.res <- ifelse(predict(logRegB, newdata = data4b, type = "response") > 0.5, 1, 0) # class labels at the 0.5 cutoff; predict() takes newdata, not data
confusionMatrix(fitted.res, data4b$target)
## 0 1
## 0 575 94
## 1 135 180
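In addition to the raw confusion matrix, several threshold-based quality metrics can be computed (a sketch assuming the InformationValue helpers misClassError, sensitivity, specificity and precision, all at the default 0.5 cutoff):
probs <- predict(logRegB, newdata = data4b, type = "response")
1 - misClassError(data4b$target, probs, threshold = 0.5) # accuracy
sensitivity(data4b$target, probs, threshold = 0.5) # true positive rate (recall)
specificity(data4b$target, probs, threshold = 0.5) # true negative rate
precision(data4b$target, probs, threshold = 0.5) # positive predictive value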
predicted = prediction(predict(logRegB, newdata = data4b, type = "response"), data4b$target) # ROCR prediction object
evaluatedPerformance = performance(predicted, "tpr", "fpr") # true positive rate vs false positive rate
plot(evaluatedPerformance)
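The area under the ROC curve can be obtained from the same ROCR objects (a one-line sketch):
performance(predicted, measure = "auc")@y.values[[1]] # AUC of the fitted logistic regression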