First run of logit model – Znga
## Read Data (Step 2)
znga <- read.csv("Website_VIP_User_data_10000.csv", stringsAsFactors = FALSE)
# Logisitic regression
# Drop ID, and product_like_rate columns
znga <- znga[-1]
znga <- znga[-8]
# Set up trainning and test data sets
indx <- sample(1:nrow(znga), as.integer(0.9*nrow(znga)))
indx
[1] 9441 8951 6210 656 5582 565 701 3666 291 613 4011 123 7166 1478 7844 1426 4819 1883
[19] 1170 3849 7679 7096 4855 7358 2768 1997 7120 1267 4540 4622 4702 9101 7770 2267 9281 1983
[37] 3815 6878 1797 1827 5306 6949 1155 7993 740 3356 5110 1372 6385 8458 5085 7036 3551 7116
[55] 9467 2775 9723 9324 6334 8197 1463 4116 6458 1671 1117 6137 3260 9951 8938 2837 3555 9145
[73] 5663 9980 2005 5428 7540 5870 3649 5591 1605 4258 23 3760 8188 7705 6413 299 1971 6269
[91] 1703 8631 4331 8711 7003 5061 1176 6616 8157 7253 8643 9695 1501 4290 5190 1388 1308 7655
[109] 3312 3545 8249 5605 6847 541 2680 4374 1529 4297 9022 7940 3953 8263 2374 3509 7905 6468
[127] 1912 3984 4543 921 161 4722 1149 4329 5680 9464 2354 8172 6573 1582 4357 239 1141 6985
[145] 8369 2574 3933 8075 8146 1715 5091 757 5434 3354 9302 9757 2934 9134 7294 3491 2612 2315
[163] 7829 4879 7669 7405 4932 5221 5036 5973 1725 3781 5966 4781 3870 9603 281 6155 579 4626
[181] 3548 8588 3281 3499 9519 102 2208 4199 5571 8930 5689 5037 3386 244 9119 6290 9499 5601
[199] 8104 8071 3476 2480 813 7193 3920 5233 3082 6965 5518 8426 4252 5533 8614 7412 6292 7281
[217] 7741 6004 1988 4471 6401 4153 4788 6979 476 8751 2230 1134 8638 1596 7975 5758 661 3274
[235] 1301 7121 1045 3393 1049 2148 826 9930 2494 8887 9790 5744 1811 8676 5865 8414 4430 5335
[253] 6029 9739 1350 7517 1928 4666 5566 7817 6666 975 111 6756 6318 6489 37 3773 2239 365
[271] 1245 1688 8459 9374 7696 8754 563 839 8647 2507 4333 960 9796 5821 4257 4044 2295 4336
[289] 9368 1128 3081 6091 3037 7744 414 7506 4962 9208 6280 8704 6271 766 8719 9373 4483 5035
[307] 2699 2708 8710 8202 3489 5271 2618 2478 1823 7434 3099 4215 6914 4528 5640 2060 9455 5698
[325] 1295 5860 3328 4139 226 2532 2103 920 5228 4704 136 8065 8650 2723 9894 7266 8789 8613
[343] 4889 5231 9933 2958 9289 6759 6710 1697 6120 8796 2372 1926 9952 8370 3074 7919 5423 1518
[361] 9794 679 4907 1856 8623 140 4893 7366 1757 6904 6802 205 869 7032 7054 2957 2814 4868
[379] 9184 4403 3590 851 1180 3433 5259 3616 2562 6405 3621 4362 5943 8504 4187 3197 8100 4596
[397] 6244 8911 9030 7077 6415 2089 8557 4934 93 5893 6597 4777 7643 8528 1066 2014 9731 8465
[415] 5165 6493 744 6368 6680 1614 8477 9296 791 2639 3502 9477 8323 3012 9722 381 8029 6375
[433] 761 94 7481 7403 8248 1644 8982 6776 7750 4608 2817 1844 6667 7187 2090 5462 9196 2455
[451] 6009 717 7191 502 9556 1225 7663 9681 6466 6614 7460 800 3959 8932 2102 9343 6342 1831
[469] 7349 2575 4894 2840 6165 930 3295 6731 4243 2283 9040 247 1317 4005 8723 3347 1091 3397
[487] 1914 4196 6173 7295 6316 191 321 5772 1783 1837 3597 1919 7125 7774 7526 203 8062 1892
[505] 9492 8639 382 3308 5750 4248 2273 2767 5621 8475 9301 6814 6863 2140 9855 283 4604 9522
[523] 5673 3382 5069 2965 3801 9699 6476 9594 6845 8058 48 9317 7458 75 8922 7071 367 9391
[541] 1623 981 6637 3628 1718 1685 6530 4818 4506 1007 8441 8381 7922 9272 4785 6254 4657 2898
[559] 8509 6217 5824 6526 471 4409 2766 3568 5540 9466 5486 5731 6868 3495 348 9430 7626 8655
[577] 5088 1258 7443 6064 5554 5018 8610 3439 7063 544 5648 3752 5027 350 1133 7820 3025 3765
[595] 8099 9588 4231 274 1289 5143 3862 5445 2966 9862 4025 3564 3966 535 2353 5780 5262 7838
[613] 7502 9 9230 2327 4771 7890 5606 9647 8576 5818 5001 4315 8858 5749 473 6623 3659 9096
[631] 6505 5097 4700 2475 4408 7269 4175 1310 1660 8506 3663 5513 3123 8333 5656 5348 9419 1961
[649] 3901 4412 4295 1348 8256 7346 3053 3226 9426 9479 843 8743 9901 5909 6153 4720 9687 3151
[667] 3838 3878 7780 5106 4799 8254 5250 7702 1102 9142 7391 5585 2235 5817 3171 9924 6689 5365
[685] 5411 6031 5690 7235 4145 860 9294 7765 4919 3592 7916 5053 3623 4691 1281 4043 2771 6959
[703] 8609 2320 6867 4813 3559 4385 3172 2855 2635 1886 351 7459 3335 4024 5833 5177 2798 6026
[721] 7647 7188 8198 4462 6054 8355 5452 3186 2894 3708 7841 1603 1775 3606 5318 8189 8698 39
[739] 3579 778 3318 8036 5222 1683 3741 3889 7371 867 7217 7803 7730 4646 8185 2167 6193 5727
[757] 4079 9341 1003 3181 1754 2012 611 5859 6839 2568 3414 9051 6097 7792 3469 6506 2644 3979
[775] 4065 9303 6148 2153 6651 7582 169 4486 5993 8624 8889 2011 3924 4814 8914 8310 8604 2293
[793] 6966 3333 5921 1938 8193 8076 4725 228 6261 1150 8221 5045 1306 8810 8240 364 3243 8068
[811] 2977 7904 1687 2889 1336 9321 8549 9298 5286 4239 5938 86 6002 9648 5552 4767 5111 1808
[829] 6181 6682 341 4568 8876 1812 4854 6911 7973 7432 3438 8013 727 3067 1963 9394 7554 2569
[847] 8004 5444 7871 1847 3906 6819 890 2338 6595 443 8329 2248 1325 598 7956 6656 1395 4156
[865] 4739 3050 7179 9836 4341 7024 9906 1065 3165 8307 1957 3162 8507 7775 7515 989 1334 131
[883] 1949 6838 3842 3485 6346 5159 6786 6574 6964 4324 8692 6581 7612 7115 6101 5292 9118 9984
[901] 7482 1711 9658 3991 3673 152 9192 8792 2237 1890 5524 758 551 7542 5667 7896 6259 5570
[919] 8943 9078 9400 4829 9968 1207 4518 3119 448 4400 6220 9840 5430 6813 1507 9217 9792 7522
[937] 415 1005 2363 7694 9259 8606 5021 8991 290 4705 7571 6829 1554 5132 4037 586 6621 4155
[955] 204 7954 7992 2389 4567 4556 6740 7140 1092 6480 209 6336 9692 1807 2976 5180 4294 7402
[973] 7342 3814 5119 6063 8086 6012 2753 4222 1123 460 2811 3418 8720 5686 823 5543 370 6507
[991] 648 2522 2061 3236 2566 6787 8063 8235 4244 2300
[ reached getOption("max.print") -- omitted 8000 entries ]
znga_train <- znga[indx,]
znga_test <- znga[-indx,]
znga_train_labels <- znga[indx,1]
znga_test_labels <- znga[-indx,1]
# Check if there are any missing values
# install.packages('Amelia')
library(Amelia)
missmap(znga, main = "Missing values vs observed")

# number of missing values in each column
sapply(znga, function(x) sum(is.na(x)))
IsVIP_500 payment_7_day dau_days
0 0 0
days_between_install_first_pay total_txns_7_day total_page_views
0 0 0
total_product_liked total_free_coupon_got total_bonus_xp_points
0 0 0
# number of unique values in each column
sapply(znga, function(x) length(unique(x)))
IsVIP_500 payment_7_day dau_days
2 1505 7
days_between_install_first_pay total_txns_7_day total_page_views
2312 30 1656
total_product_liked total_free_coupon_got total_bonus_xp_points
731 53 4164
# fit the logistic regression model, with all predictor variables
model <- glm(IsVIP_500 ~.,family=binomial(link='logit'),data=znga_train)
summary(model)
Call:
glm(formula = IsVIP_500 ~ ., family = binomial(link = "logit"),
data = znga_train)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.8529 -0.1525 -0.1057 -0.0673 3.5425
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -7.411e+00 3.934e-01 -18.840 < 2e-16 ***
payment_7_day 2.772e-02 1.697e-03 16.327 < 2e-16 ***
dau_days 3.547e-01 6.381e-02 5.560 2.71e-08 ***
days_between_install_first_pay -4.342e-04 1.217e-04 -3.566 0.000362 ***
total_txns_7_day 3.161e-02 1.541e-02 2.052 0.040208 *
total_page_views 7.270e-04 4.012e-04 1.812 0.069957 .
total_product_liked -1.196e-03 1.373e-03 -0.872 0.383427
total_free_coupon_got -2.599e-03 2.328e-02 -0.112 0.911138
total_bonus_xp_points 5.495e-08 1.895e-07 0.290 0.771835
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1476.6 on 8999 degrees of freedom
Residual deviance: 1066.8 on 8991 degrees of freedom
AIC: 1084.8
Number of Fisher Scoring iterations: 8
anova(model)
Analysis of Deviance Table
Model: binomial, link: logit
Response: IsVIP_500
Terms added sequentially (first to last)
Df Deviance Resid. Df Resid. Dev
NULL 8999 1476.6
payment_7_day 1 290.476 8998 1186.1
dau_days 1 87.153 8997 1099.0
days_between_install_first_pay 1 15.448 8996 1083.5
total_txns_7_day 1 4.868 8995 1078.7
total_page_views 1 10.948 8994 1067.7
total_product_liked 1 0.672 8993 1067.0
total_free_coupon_got 1 0.135 8992 1066.9
total_bonus_xp_points 1 0.082 8991 1066.8
Dropping insignificant terms – Znga
# drop the insignificant predictors, alpha = 0.10
model <- glm(IsVIP_500 ~ payment_7_day+dau_days+days_between_install_first_pay+total_txns_7_day+total_page_views, family = binomial(link='logit'), data = znga_train)
summary(model)
Call:
glm(formula = IsVIP_500 ~ payment_7_day + dau_days + days_between_install_first_pay +
total_txns_7_day + total_page_views, family = binomial(link = "logit"),
data = znga_train)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.8397 -0.1531 -0.1061 -0.0677 3.5400
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -7.3865930 0.3907609 -18.903 < 2e-16 ***
payment_7_day 0.0274334 0.0016646 16.481 < 2e-16 ***
dau_days 0.3601509 0.0616972 5.837 5.3e-09 ***
days_between_install_first_pay -0.0004339 0.0001216 -3.569 0.000358 ***
total_txns_7_day 0.0309698 0.0153615 2.016 0.043793 *
total_page_views 0.0003938 0.0001107 3.559 0.000372 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1476.6 on 8999 degrees of freedom
Residual deviance: 1067.7 on 8994 degrees of freedom
AIC: 1079.7
Number of Fisher Scoring iterations: 8
anova(model, test="Chisq")
Analysis of Deviance Table
Model: binomial, link: logit
Response: IsVIP_500
Terms added sequentially (first to last)
Df Deviance Resid. Df Resid. Dev Pr(>Chi)
NULL 8999 1476.6
payment_7_day 1 290.476 8998 1186.1 < 2.2e-16 ***
dau_days 1 87.153 8997 1099.0 < 2.2e-16 ***
days_between_install_first_pay 1 15.448 8996 1083.5 8.48e-05 ***
total_txns_7_day 1 4.868 8995 1078.7 0.0273570 *
total_page_views 1 10.948 8994 1067.7 0.0009369 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Check Accuracy – Znga
fitted.results <- predict(model,newdata=znga_test,type='response')
fitted.results <- ifelse(fitted.results > 0.5,1,0)
misClasificError <- mean(fitted.results != znga_test$IsVIP_500)
print(paste('Accuracy',1-misClasificError))
[1] "Accuracy 0.988011988011988"
ROC – Template code
# ROC
# Because this data set is so small, it is possible that the test data set
# does not contain both 0 and 1 values. If this happens the code will not
# run. And since the test data set is so small the ROC is not useful here
# but the code is provided.
# install.packages('ROCR')
# library(ROCR)
# p <- predict(model, newdata=launch_test, type="response")
# pr <- prediction(p, launch_test$distress_ct)
# prf <- performance(pr, measure = "tpr", x.measure = "fpr")
# plot(prf)
#
# auc <- performance(pr, measure = "auc")
# auc <- auc@y.values[[1]]
# auc
ROC Accuracy – Znga
# ROC
library(ROCR)
p <- predict(model, newdata=znga_test, type="response")
pr <- prediction(p, znga_test$IsVIP_500)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)

auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
[1] 0.9195592
An Example: Credit Data – Logit
credit <- read.csv("http://www.sci.csueastbay.edu/~esuess/classes/Statistics_6620/Presentations/ml7/credit.csv")
## Fix the default variable to be 0 or 1
credit$default <- as.numeric(credit$default) # Don't need this for znga
credit$default <- credit$default - 1 # Don't need this for znga
# examine the launch data
str(credit)
'data.frame': 1000 obs. of 17 variables:
$ checking_balance : Factor w/ 4 levels "< 0 DM","> 200 DM",..: 1 3 4 1 1 4 4 3 4 3 ...
$ months_loan_duration: int 6 48 12 42 24 36 24 36 12 30 ...
$ credit_history : Factor w/ 5 levels "critical","good",..: 1 2 1 2 4 2 2 2 2 1 ...
$ purpose : Factor w/ 6 levels "business","car",..: 5 5 4 5 2 4 5 2 5 2 ...
$ amount : int 1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
$ savings_balance : Factor w/ 5 levels "< 100 DM","> 1000 DM",..: 5 1 1 1 1 5 4 1 2 1 ...
$ employment_duration : Factor w/ 5 levels "< 1 year","> 7 years",..: 2 3 4 4 3 3 2 3 4 5 ...
$ percent_of_income : int 4 2 2 2 3 2 3 2 2 4 ...
$ years_at_residence : int 4 2 3 4 4 4 4 2 4 2 ...
$ age : int 67 22 49 45 53 35 53 35 61 28 ...
$ other_credit : Factor w/ 3 levels "bank","none",..: 2 2 2 2 2 2 2 2 2 2 ...
$ housing : Factor w/ 3 levels "other","own",..: 2 2 2 1 1 1 2 3 2 2 ...
$ existing_loans_count: int 2 1 1 1 2 1 1 1 1 2 ...
$ job : Factor w/ 4 levels "management","skilled",..: 2 2 4 2 2 4 2 1 4 1 ...
$ dependents : int 1 1 2 2 2 2 1 1 1 1 ...
$ phone : Factor w/ 2 levels "no","yes": 2 1 1 1 1 2 1 2 1 1 ...
$ default : num 0 1 0 0 1 0 0 0 0 1 ...
# logisitic regression
# set up trainning and test data sets
indx <- sample(1:nrow(credit), as.integer(0.9*nrow(credit)))
indx
[1] 937 487 238 479 153 143 988 886 556 661 935 178 306 284 630 313 708 122 697
[20] 694 146 111 875 628 797 712 248 567 939 390 972 31 530 852 411 158 990 139
[39] 657 624 125 369 745 357 761 594 270 753 235 905 184 148 370 459 559 333 500
[58] 880 434 634 872 84 877 845 632 145 160 864 181 524 186 509 752 177 97 2
[77] 463 958 429 803 129 954 909 394 564 703 617 779 455 380 182 308 725 247 537
[96] 998 278 633 293 608 583 172 811 216 395 291 610 46 344 726 616 67 915 245
[115] 788 888 893 721 645 932 917 192 919 263 600 115 197 557 170 436 419 940 101
[134] 759 773 995 665 782 314 944 368 709 505 198 154 446 765 386 466 228 441 498
[153] 808 912 541 743 277 331 294 581 528 502 561 334 656 654 928 233 59 956 123
[172] 710 847 12 887 93 957 482 13 103 513 539 543 347 385 73 906 644 705 55
[191] 533 432 272 255 107 473 786 806 379 421 117 535 666 365 999 376 456 472 693
[210] 105 982 733 345 321 606 798 346 547 252 470 690 239 243 740 215 899 217 689
[229] 894 196 728 555 605 585 670 476 439 35 273 777 813 469 859 237 173 426 90
[248] 977 841 844 134 23 361 942 1000 249 232 891 538 516 378 724 443 923 409 682
[267] 289 78 161 80 49 856 26 109 822 653 819 77 261 166 416 596 565 701 353
[286] 601 211 735 58 962 483 37 414 227 730 126 688 979 422 602 214 462 202 679
[305] 130 920 850 318 54 974 637 41 929 722 521 805 597 337 44 171 542 623 40
[324] 831 444 100 641 350 757 406 322 652 391 33 75 453 484 914 400 514 499 994
[343] 88 488 651 512 236 866 667 458 840 938 793 976 152 468 714 352 658 423 784
[362] 168 163 116 892 618 203 558 371 865 792 450 61 881 598 970 562 24 218 953
[381] 833 244 165 47 591 229 303 946 454 191 642 969 45 930 901 548 338 271 183
[400] 327 62 510 863 404 821 295 489 493 576 837 936 824 827 156 433 687 11 800
[419] 839 588 174 68 405 855 748 343 790 580 449 987 794 282 326 169 360 895 769
[438] 795 635 529 222 941 234 810 723 889 382 374 159 310 815 854 501 200 829 620
[457] 570 857 447 267 907 964 70 317 526 818 307 65 25 862 355 925 496 341 648
[476] 388 729 5 428 69 816 467 812 22 212 702 640 448 383 534 325 738 842 36
[495] 589 762 778 927 952 230 663 949 397 34 515 911 739 356 660 372 302 973 413
[514] 94 315 231 254 76 377 425 934 85 399 768 636 639 57 497 826 460 106 586
[533] 764 749 698 883 814 960 102 664 677 692 104 286 896 506 135 155 417 288 963
[552] 137 396 457 649 574 853 655 290 674 579 328 74 375 696 868 32 646 846 836
[571] 997 647 716 190 96 796 50 747 486 133 713 224 680 783 364 20 775 471 193
[590] 392 7 627 195 563 304 330 902 508 746 774 913 464 552 445 522 6 132 8
[609] 546 851 324 18 830 519 767 572 336 686 64 398 89 573 437 185 834 402 242
[628] 742 540 758 475 91 961 832 799 955 898 79 3 415 240 19 717 771 366 275
[647] 751 201 407 993 113 849 719 772 298 309 188 885 358 503 144 83 532 787 731
[666] 706 975 403 599 131 431 118 147 53 367 823 867 362 205 878 678 175 430 495
[685] 560 274 95 452 82 279 119 16 312 685 744 108 550 527 319 494 860 691 569
[704] 42 301 571 621 536 718 531 910 967 699 1 86 253 480 922 604 296 668 577
[723] 870 904 619 128 971 981 684 440 582 28 157 704 523 820 638 672 755 250 389
[742] 438 384 785 614 802 983 874 858 592 257 60 766 265 966 791 933 194 162 607
[761] 631 985 140 554 879 320 71 838 780 890 285 615 659 566 204 17 329 72 873
[780] 220 551 14 801 918 549 199 504 613 9 809 989 287 943 835 770 206 110 517
[799] 127 114 763 98 142 223 595 711 732 474 991 511 316 387 817 734 882 241 947
[818] 219 465 669 179 707 568 207 56 410 986 27 92 676 209 492 737 828 138 280
[837] 43 626 897 750 650 978 673 683 984 264 575 609 323 63 807 141 804 544 87
[856] 992 490 381 342 99 112 269 268 931 213 408 427 373 151 339 671 478 412 948
[875] 776 351 136 349 662 226 553 251 52 340 149 485 789 260 359 603 262 900 518
[894] 418 924 48 120 335 332 525
credit_train <- credit[indx,]
credit_test <- credit[-indx,]
credit_train_labels <- credit[indx,17]
credit_test_labels <- credit[-indx,17]
# Check if there are any missing values
library(Amelia)
missmap(credit, main = "Missing values vs observed")

# number of missing values in each column
sapply(credit,function(x) sum(is.na(x)))
checking_balance months_loan_duration credit_history purpose
0 0 0 0
amount savings_balance employment_duration percent_of_income
0 0 0 0
years_at_residence age other_credit housing
0 0 0 0
existing_loans_count job dependents phone
0 0 0 0
default
0
# number of unique values in each column
sapply(credit, function(x) length(unique(x)))
checking_balance months_loan_duration credit_history purpose
4 33 5 6
amount savings_balance employment_duration percent_of_income
921 5 5 4
years_at_residence age other_credit housing
4 53 3 3
existing_loans_count job dependents phone
4 4 2 2
default
2
# fit the logistic regression model, with all predictor variables
creditModel <- glm(default ~., family=binomial(link='logit'), data = credit_train)
creditModel
Call: glm(formula = default ~ ., family = binomial(link = "logit"),
data = credit_train)
Coefficients:
(Intercept) checking_balance> 200 DM checking_balance1 - 200 DM
-1.454e+00 -9.034e-01 -3.243e-01
checking_balanceunknown months_loan_duration credit_historygood
-1.755e+00 3.155e-02 8.531e-01
credit_historyperfect credit_historypoor credit_historyvery good
1.448e+00 7.433e-01 1.333e+00
purposecar purposecar0 purposeeducation
2.569e-01 -4.931e-01 7.098e-01
purposefurniture/appliances purposerenovations amount
-1.334e-01 3.455e-01 8.337e-05
savings_balance> 1000 DM savings_balance100 - 500 DM savings_balance500 - 1000 DM
-1.031e+00 -3.843e-01 -5.040e-01
savings_balanceunknown employment_duration> 7 years employment_duration1 - 4 years
-9.277e-01 -4.541e-01 -3.002e-01
employment_duration4 - 7 years employment_durationunemployed percent_of_income
-9.441e-01 -2.707e-01 2.687e-01
years_at_residence age other_creditnone
6.473e-02 -1.789e-02 -6.555e-01
other_creditstore housingown housingrent
-1.690e-01 -8.252e-02 3.832e-01
existing_loans_count jobskilled jobunemployed
2.134e-01 2.393e-02 5.874e-01
jobunskilled dependents phoneyes
-2.497e-02 3.753e-02 -1.665e-01
Degrees of Freedom: 899 Total (i.e. Null); 864 Residual
Null Deviance: 1094
Residual Deviance: 843.3 AIC: 915.3
summary(creditModel)
Call:
glm(formula = default ~ ., family = binomial(link = "logit"),
data = credit_train)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.9276 -0.7444 -0.4016 0.7978 2.6275
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.454e+00 9.435e-01 -1.541 0.123347
checking_balance> 200 DM -9.034e-01 3.879e-01 -2.329 0.019850 *
checking_balance1 - 200 DM -3.243e-01 2.143e-01 -1.513 0.130271
checking_balanceunknown -1.755e+00 2.385e-01 -7.360 1.83e-13 ***
months_loan_duration 3.155e-02 9.420e-03 3.349 0.000810 ***
credit_historygood 8.531e-01 2.676e-01 3.187 0.001436 **
credit_historyperfect 1.448e+00 4.415e-01 3.279 0.001041 **
credit_historypoor 7.433e-01 3.381e-01 2.198 0.027938 *
credit_historyvery good 1.333e+00 4.470e-01 2.982 0.002862 **
purposecar 2.569e-01 3.254e-01 0.789 0.429914
purposecar0 -4.931e-01 7.962e-01 -0.619 0.535710
purposeeducation 7.098e-01 4.485e-01 1.583 0.113481
purposefurniture/appliances -1.334e-01 3.181e-01 -0.419 0.674910
purposerenovations 3.455e-01 6.391e-01 0.541 0.588808
amount 8.337e-05 4.306e-05 1.936 0.052862 .
savings_balance> 1000 DM -1.031e+00 5.123e-01 -2.013 0.044163 *
savings_balance100 - 500 DM -3.843e-01 2.924e-01 -1.314 0.188682
savings_balance500 - 1000 DM -5.040e-01 4.403e-01 -1.145 0.252411
savings_balanceunknown -9.277e-01 2.667e-01 -3.479 0.000504 ***
employment_duration> 7 years -4.541e-01 2.939e-01 -1.545 0.122362
employment_duration1 - 4 years -3.002e-01 2.401e-01 -1.250 0.211215
employment_duration4 - 7 years -9.441e-01 3.022e-01 -3.124 0.001785 **
employment_durationunemployed -2.707e-01 4.444e-01 -0.609 0.542348
percent_of_income 2.687e-01 8.813e-02 3.049 0.002297 **
years_at_residence 6.473e-02 8.821e-02 0.734 0.463039
age -1.789e-02 9.512e-03 -1.880 0.060041 .
other_creditnone -6.555e-01 2.437e-01 -2.690 0.007147 **
other_creditstore -1.690e-01 4.204e-01 -0.402 0.687721
housingown -8.252e-02 3.044e-01 -0.271 0.786323
housingrent 3.832e-01 3.485e-01 1.099 0.271574
existing_loans_count 2.134e-01 1.948e-01 1.096 0.273239
jobskilled 2.393e-02 2.880e-01 0.083 0.933779
jobunemployed 5.874e-01 6.897e-01 0.852 0.394418
jobunskilled -2.497e-02 3.486e-01 -0.072 0.942895
dependents 3.753e-02 2.453e-01 0.153 0.878412
phoneyes -1.665e-01 2.029e-01 -0.820 0.412037
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1094.42 on 899 degrees of freedom
Residual deviance: 843.33 on 864 degrees of freedom
AIC: 915.33
Number of Fisher Scoring iterations: 5
anova(creditModel, test="Chisq")
Analysis of Deviance Table
Model: binomial, link: logit
Response: default
Terms added sequentially (first to last)
Df Deviance Resid. Df Resid. Dev Pr(>Chi)
NULL 899 1094.42
checking_balance 3 123.045 896 971.38 < 2.2e-16 ***
months_loan_duration 1 38.101 895 933.28 6.718e-10 ***
credit_history 4 25.481 891 907.80 4.025e-05 ***
purpose 5 6.305 886 901.49 0.277638
amount 1 0.096 885 901.40 0.757098
savings_balance 4 17.311 881 884.08 0.001681 **
employment_duration 4 11.705 877 872.38 0.019686 *
percent_of_income 1 7.343 876 865.04 0.006734 **
years_at_residence 1 0.896 875 864.14 0.343989
age 1 5.475 874 858.67 0.019295 *
other_credit 2 7.964 872 850.70 0.018647 *
housing 2 4.324 870 846.38 0.115105
existing_loans_count 1 1.374 869 845.01 0.241169
job 3 0.972 866 844.03 0.807968
dependents 1 0.024 865 844.01 0.876113
phone 1 0.676 864 843.33 0.411137
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# drop the insignificant predictors, alpha = 0.10
creditModel <- glm(default ~ checking_balance + months_loan_duration + credit_history + percent_of_income + age, family = binomial(link='logit'), data = credit_train)
creditModel
Call: glm(formula = default ~ checking_balance + months_loan_duration +
credit_history + percent_of_income + age, family = binomial(link = "logit"),
data = credit_train)
Coefficients:
(Intercept) checking_balance> 200 DM checking_balance1 - 200 DM
-1.23454 -1.07359 -0.47676
checking_balanceunknown months_loan_duration credit_historygood
-1.97625 0.03556 0.57723
credit_historyperfect credit_historypoor credit_historyvery good
1.61432 0.67220 1.35226
percent_of_income age
0.16530 -0.01822
Degrees of Freedom: 899 Total (i.e. Null); 889 Residual
Null Deviance: 1094
Residual Deviance: 897.9 AIC: 919.9
summary(creditModel)
Call:
glm(formula = default ~ checking_balance + months_loan_duration +
credit_history + percent_of_income + age, family = binomial(link = "logit"),
data = credit_train)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.8279 -0.8071 -0.4582 0.8897 2.4312
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.234544 0.436468 -2.828 0.004677 **
checking_balance> 200 DM -1.073590 0.364743 -2.943 0.003246 **
checking_balance1 - 200 DM -0.476760 0.195911 -2.434 0.014952 *
checking_balanceunknown -1.976250 0.223510 -8.842 < 2e-16 ***
months_loan_duration 0.035558 0.006789 5.237 1.63e-07 ***
credit_historygood 0.577230 0.210606 2.741 0.006129 **
credit_historyperfect 1.614321 0.415943 3.881 0.000104 ***
credit_historypoor 0.672202 0.318760 2.109 0.034962 *
credit_historyvery good 1.352261 0.392121 3.449 0.000564 ***
percent_of_income 0.165302 0.074969 2.205 0.027458 *
age -0.018222 0.007776 -2.343 0.019107 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1094.42 on 899 degrees of freedom
Residual deviance: 897.87 on 889 degrees of freedom
AIC: 919.87
Number of Fisher Scoring iterations: 5
anova(creditModel, test="Chisq")
Analysis of Deviance Table
Model: binomial, link: logit
Response: default
Terms added sequentially (first to last)
Df Deviance Resid. Df Resid. Dev Pr(>Chi)
NULL 899 1094.42
checking_balance 3 123.045 896 971.38 < 2.2e-16 ***
months_loan_duration 1 38.101 895 933.28 6.718e-10 ***
credit_history 4 25.481 891 907.80 4.025e-05 ***
percent_of_income 1 4.253 890 903.54 0.03917 *
age 1 5.676 889 897.87 0.01719 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# check Accuracy
fitted.results <- predict(creditModel, newdata = credit_test, type = 'response')
fitted.results <- ifelse(fitted.results > 0.5,1,0)
misClasificError <- mean(fitted.results != credit_test$default)
print(paste('Accuracy',1-misClasificError))
[1] "Accuracy 0.73"
Credit Data – ROC
# ROC
library(ROCR)
p <- predict(creditModel, newdata=credit_test, type="response")
pr <- prediction(p, credit_test$default)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)

auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
[1] 0.7195839
LS0tDQp0aXRsZTogIkxvZ2lzdGljIHJlZ3Jlc3Npb24gLS0gWm5nYSINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQoNCiMjIEZpcnN0IHJ1biBvZiBsb2dpdCBtb2RlbCAtLSBabmdhDQpgYGB7cn0NCiMjIFJlYWQgRGF0YSAoU3RlcCAyKQ0Kem5nYSA8LSByZWFkLmNzdigiV2Vic2l0ZV9WSVBfVXNlcl9kYXRhXzEwMDAwLmNzdiIsIHN0cmluZ3NBc0ZhY3RvcnMgPSBGQUxTRSkNCg0KIyBMb2dpc2l0aWMgcmVncmVzc2lvbg0KDQojIERyb3AgSUQsIGFuZCBwcm9kdWN0X2xpa2VfcmF0ZSBjb2x1bW5zDQp6bmdhIDwtIHpuZ2FbLTFdDQp6bmdhIDwtIHpuZ2FbLThdDQoNCiMgU2V0IHVwIHRyYWlubmluZyBhbmQgdGVzdCBkYXRhIHNldHMNCmluZHggPC0gc2FtcGxlKDE6bnJvdyh6bmdhKSwgYXMuaW50ZWdlcigwLjkqbnJvdyh6bmdhKSkpDQppbmR4DQoNCnpuZ2FfdHJhaW4gPC0gem5nYVtpbmR4LF0NCnpuZ2FfdGVzdCA8LSB6bmdhWy1pbmR4LF0NCg0Kem5nYV90cmFpbl9sYWJlbHMgPC0gem5nYVtpbmR4LDFdDQp6bmdhX3Rlc3RfbGFiZWxzIDwtIHpuZ2FbLWluZHgsMV0gICANCg0KIyBDaGVjayBpZiB0aGVyZSBhcmUgYW55IG1pc3NpbmcgdmFsdWVzDQojIGluc3RhbGwucGFja2FnZXMoJ0FtZWxpYScpDQpsaWJyYXJ5KEFtZWxpYSkNCm1pc3NtYXAoem5nYSwgbWFpbiA9ICJNaXNzaW5nIHZhbHVlcyB2cyBvYnNlcnZlZCIpDQoNCiMgbnVtYmVyIG9mIG1pc3NpbmcgdmFsdWVzIGluIGVhY2ggY29sdW1uDQoNCnNhcHBseSh6bmdhLCBmdW5jdGlvbih4KSBzdW0oaXMubmEoeCkpKQ0KDQojIG51bWJlciBvZiB1bmlxdWUgdmFsdWVzIGluIGVhY2ggY29sdW1uDQoNCnNhcHBseSh6bmdhLCBmdW5jdGlvbih4KSBsZW5ndGgodW5pcXVlKHgpKSkNCg0KIyBmaXQgdGhlIGxvZ2lzdGljIHJlZ3Jlc3Npb24gbW9kZWwsIHdpdGggYWxsIHByZWRpY3RvciB2YXJpYWJsZXMNCg0KbW9kZWwgPC0gZ2xtKElzVklQXzUwMCB+LixmYW1pbHk9Ymlub21pYWwobGluaz0nbG9naXQnKSxkYXRhPXpuZ2FfdHJhaW4pDQoNCnN1bW1hcnkobW9kZWwpDQphbm92YShtb2RlbCkNCmBgYA0KDQojIyBEcm9wcGluZyBpbnNpZ25pZmljYW50IHRlcm1zIC0tIFpuZ2ENCmBgYHtyfQ0KIyBkcm9wIHRoZSBpbnNpZ25pZmljYW50IHByZWRpY3RvcnMsIGFscGhhID0gMC4xMA0KDQptb2RlbCA8LSBnbG0oSXNWSVBfNTAwIH4gcGF5bWVudF83X2RheStkYXVfZGF5cytkYXlzX2JldHdlZW5faW5zdGFsbF9maXJzdF9wYXkrdG90YWxfdHhuc183X2RheSt0b3RhbF9wYWdlX3ZpZXdzLCBmYW1pbHkgPSBiaW5vbWlhbChsaW5rPSdsb2dpdCcpLCBkYXRhID0gem5nYV90cmFpbikNCg0Kc3VtbWFyeShtb2RlbCkNCg0KYW5vdmEobW9kZWwsIHRlc3Q9IkNoaXNxIikNCmBgYA0KDQojIyBDaGVjayBBY2N1cmFjeSAtLSBabmdhDQpgYGB7cn0NCmZpdHRlZC5yZXN1bHRzIDwtIHByZWRpY3QobW9kZWwsbmV3ZGF0YT16bmdhX3Rlc3QsdHlwZT0ncmVzcG9uc2UnKQ0KZml0dGVkLnJlc3VsdHMgPC0gaWZlbHNlKGZpdHRlZC5yZXN1bHRzID4gMC41LDEsMCkNCg0KbWlzQ2xhc2lmaWNFcnJvciA8LSBtZWFuKGZpdHRlZC5yZXN1bHRzICE9IHpuZ2FfdGVzdCRJc1ZJUF81MDApDQpwcmludChwYXN0ZSgnQWNjdXJhY3knLDEtbWlzQ2xhc2lmaWNFcnJvcikpDQpgYGANCg0KIyMgUk9DIC0tIFRlbXBsYXRlIGNvZGUNCmBgYHtyfQ0KIyBST0MNCiMgQmVjYXVzZSB0aGlzIGRhdGEgc2V0IGlzIHNvIHNtYWxsLCBpdCBpcyBwb3NzaWJsZSB0aGF0IHRoZSB0ZXN0IGRhdGEgc2V0DQojIGRvZXMgbm90IGNvbnRhaW4gYm90aCAwIGFuZCAxIHZhbHVlcy4gIElmIHRoaXMgaGFwcGVucyB0aGUgY29kZSB3aWxsIG5vdA0KIyBydW4uICBBbmQgc2luY2UgdGhlIHRlc3QgZGF0YSBzZXQgaXMgc28gc21hbGwgdGhlIFJPQyBpcyBub3QgdXNlZnVsIGhlcmUNCiMgYnV0IHRoZSBjb2RlIGlzIHByb3ZpZGVkLg0KIyBpbnN0YWxsLnBhY2thZ2VzKCdST0NSJykNCiMgbGlicmFyeShST0NSKQ0KIyBwIDwtIHByZWRpY3QobW9kZWwsIG5ld2RhdGE9bGF1bmNoX3Rlc3QsIHR5cGU9InJlc3BvbnNlIikNCiMgcHIgPC0gcHJlZGljdGlvbihwLCBsYXVuY2hfdGVzdCRkaXN0cmVzc19jdCkNCiMgcHJmIDwtIHBlcmZvcm1hbmNlKHByLCBtZWFzdXJlID0gInRwciIsIHgubWVhc3VyZSA9ICJmcHIiKQ0KIyBwbG90KHByZikNCiMgDQojIGF1YyA8LSBwZXJmb3JtYW5jZShwciwgbWVhc3VyZSA9ICJhdWMiKQ0KIyBhdWMgPC0gYXVjQHkudmFsdWVzW1sxXV0NCiMgYXVjDQpgYGANCg0KIyMgUk9DIEFjY3VyYWN5IC0tIFpuZ2ENCmBgYHtyfQ0KIyBST0MNCmxpYnJhcnkoUk9DUikNCnAgPC0gcHJlZGljdChtb2RlbCwgbmV3ZGF0YT16bmdhX3Rlc3QsIHR5cGU9InJlc3BvbnNlIikNCnByIDwtIHByZWRpY3Rpb24ocCwgem5nYV90ZXN0JElzVklQXzUwMCkNCnByZiA8LSBwZXJmb3JtYW5jZShwciwgbWVhc3VyZSA9ICJ0cHIiLCB4Lm1lYXN1cmUgPSAiZnByIikNCnBsb3QocHJmKQ0KDQphdWMgPC0gcGVyZm9ybWFuY2UocHIsIG1lYXN1cmUgPSAiYXVjIikNCmF1YyA8LSBhdWNAeS52YWx1ZXNbWzFdXQ0KYXVjDQpgYGANCg0KDQojIyBBbiBFeGFtcGxlOiBDcmVkaXQgRGF0YSAtLSBMb2dpdA0KYGBge3J9DQpjcmVkaXQgPC0gcmVhZC5jc3YoImh0dHA6Ly93d3cuc2NpLmNzdWVhc3RiYXkuZWR1L35lc3Vlc3MvY2xhc3Nlcy9TdGF0aXN0aWNzXzY2MjAvUHJlc2VudGF0aW9ucy9tbDcvY3JlZGl0LmNzdiIpDQoNCiMjIEZpeCB0aGUgZGVmYXVsdCB2YXJpYWJsZSB0byBiZSAwIG9yIDENCmNyZWRpdCRkZWZhdWx0IDwtIGFzLm51bWVyaWMoY3JlZGl0JGRlZmF1bHQpICMgRG9uJ3QgbmVlZCB0aGlzIGZvciB6bmdhDQpjcmVkaXQkZGVmYXVsdCA8LSBjcmVkaXQkZGVmYXVsdCAtIDEgICAgICAgICAjIERvbid0IG5lZWQgdGhpcyBmb3Igem5nYQ0KDQojIGV4YW1pbmUgdGhlIGxhdW5jaCBkYXRhDQpzdHIoY3JlZGl0KQ0KDQojIGxvZ2lzaXRpYyByZWdyZXNzaW9uDQoNCiMgc2V0IHVwIHRyYWlubmluZyBhbmQgdGVzdCBkYXRhIHNldHMNCg0KaW5keCA8LSBzYW1wbGUoMTpucm93KGNyZWRpdCksIGFzLmludGVnZXIoMC45Km5yb3coY3JlZGl0KSkpDQppbmR4DQoNCmNyZWRpdF90cmFpbiA8LSBjcmVkaXRbaW5keCxdDQpjcmVkaXRfdGVzdCA8LSBjcmVkaXRbLWluZHgsXQ0KDQpjcmVkaXRfdHJhaW5fbGFiZWxzIDwtIGNyZWRpdFtpbmR4LDE3XQ0KY3JlZGl0X3Rlc3RfbGFiZWxzIDwtIGNyZWRpdFstaW5keCwxN10gICANCg0KIyBDaGVjayBpZiB0aGVyZSBhcmUgYW55IG1pc3NpbmcgdmFsdWVzDQoNCmxpYnJhcnkoQW1lbGlhKQ0KbWlzc21hcChjcmVkaXQsIG1haW4gPSAiTWlzc2luZyB2YWx1ZXMgdnMgb2JzZXJ2ZWQiKQ0KDQojIG51bWJlciBvZiBtaXNzaW5nIHZhbHVlcyBpbiBlYWNoIGNvbHVtbg0KDQpzYXBwbHkoY3JlZGl0LGZ1bmN0aW9uKHgpIHN1bShpcy5uYSh4KSkpDQoNCiMgbnVtYmVyIG9mIHVuaXF1ZSB2YWx1ZXMgaW4gZWFjaCBjb2x1bW4NCg0Kc2FwcGx5KGNyZWRpdCwgZnVuY3Rpb24oeCkgbGVuZ3RoKHVuaXF1ZSh4KSkpDQoNCiMgZml0IHRoZSBsb2dpc3RpYyByZWdyZXNzaW9uIG1vZGVsLCB3aXRoIGFsbCBwcmVkaWN0b3IgdmFyaWFibGVzDQoNCmNyZWRpdE1vZGVsIDwtIGdsbShkZWZhdWx0IH4uLCBmYW1pbHk9Ymlub21pYWwobGluaz0nbG9naXQnKSwgZGF0YSA9IGNyZWRpdF90cmFpbikNCmNyZWRpdE1vZGVsDQoNCnN1bW1hcnkoY3JlZGl0TW9kZWwpDQoNCmFub3ZhKGNyZWRpdE1vZGVsLCB0ZXN0PSJDaGlzcSIpDQoNCiMgZHJvcCB0aGUgaW5zaWduaWZpY2FudCBwcmVkaWN0b3JzLCBhbHBoYSA9IDAuMTANCg0KY3JlZGl0TW9kZWwgPC0gZ2xtKGRlZmF1bHQgfiBjaGVja2luZ19iYWxhbmNlICsgbW9udGhzX2xvYW5fZHVyYXRpb24gKyBjcmVkaXRfaGlzdG9yeSArICBwZXJjZW50X29mX2luY29tZSArIGFnZSwgZmFtaWx5ID0gYmlub21pYWwobGluaz0nbG9naXQnKSwgZGF0YSA9IGNyZWRpdF90cmFpbikNCmNyZWRpdE1vZGVsDQoNCnN1bW1hcnkoY3JlZGl0TW9kZWwpDQoNCmFub3ZhKGNyZWRpdE1vZGVsLCB0ZXN0PSJDaGlzcSIpDQoNCiMgY2hlY2sgQWNjdXJhY3kNCg0KZml0dGVkLnJlc3VsdHMgPC0gcHJlZGljdChjcmVkaXRNb2RlbCwgbmV3ZGF0YSA9IGNyZWRpdF90ZXN0LCB0eXBlID0gJ3Jlc3BvbnNlJykNCmZpdHRlZC5yZXN1bHRzIDwtIGlmZWxzZShmaXR0ZWQucmVzdWx0cyA+IDAuNSwxLDApDQoNCm1pc0NsYXNpZmljRXJyb3IgPC0gbWVhbihmaXR0ZWQucmVzdWx0cyAhPSBjcmVkaXRfdGVzdCRkZWZhdWx0KQ0KcHJpbnQocGFzdGUoJ0FjY3VyYWN5JywxLW1pc0NsYXNpZmljRXJyb3IpKQ0KYGBgDQoNCiMjIENyZWRpdCBEYXRhIC0tIFJPQw0KYGBge3J9DQojIFJPQw0KbGlicmFyeShST0NSKQ0KcCA8LSBwcmVkaWN0KGNyZWRpdE1vZGVsLCBuZXdkYXRhPWNyZWRpdF90ZXN0LCB0eXBlPSJyZXNwb25zZSIpDQpwciA8LSBwcmVkaWN0aW9uKHAsIGNyZWRpdF90ZXN0JGRlZmF1bHQpDQpwcmYgPC0gcGVyZm9ybWFuY2UocHIsIG1lYXN1cmUgPSAidHByIiwgeC5tZWFzdXJlID0gImZwciIpDQpwbG90KHByZikNCg0KYXVjIDwtIHBlcmZvcm1hbmNlKHByLCBtZWFzdXJlID0gImF1YyIpDQphdWMgPC0gYXVjQHkudmFsdWVzW1sxXV0NCmF1Yw0KYGBgDQoNCg==