EXAMPLE: CREDIT DATA.
STEP 1: DATA COLLECTION.
STEP 2: EXPLORING AND PREPARING DATA.
credit <- read.csv("http://www.sci.csueastbay.edu/~esuess/classes/Statistics_6620/Presentations/ml7/credit.csv")
- Fixing the default variable to be 0 or 1.
credit$default = as.numeric(credit$default)
credit$default = credit$default - 1
- Examing the structiure of the data.
str(credit)
'data.frame': 1000 obs. of 17 variables:
$ checking_balance : Factor w/ 4 levels "< 0 DM","> 200 DM",..: 1 3 4 1 1 4 4 3 4 3 ...
$ months_loan_duration: int 6 48 12 42 24 36 24 36 12 30 ...
$ credit_history : Factor w/ 5 levels "critical","good",..: 1 2 1 2 4 2 2 2 2 1 ...
$ purpose : Factor w/ 6 levels "business","car",..: 5 5 4 5 2 4 5 2 5 2 ...
$ amount : int 1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
$ savings_balance : Factor w/ 5 levels "< 100 DM","> 1000 DM",..: 5 1 1 1 1 5 4 1 2 1 ...
$ employment_duration : Factor w/ 5 levels "< 1 year","> 7 years",..: 2 3 4 4 3 3 2 3 4 5 ...
$ percent_of_income : int 4 2 2 2 3 2 3 2 2 4 ...
$ years_at_residence : int 4 2 3 4 4 4 4 2 4 2 ...
$ age : int 67 22 49 45 53 35 53 35 61 28 ...
$ other_credit : Factor w/ 3 levels "bank","none",..: 2 2 2 2 2 2 2 2 2 2 ...
$ housing : Factor w/ 3 levels "other","own",..: 2 2 2 1 1 1 2 3 2 2 ...
$ existing_loans_count: int 2 1 1 1 2 1 1 1 1 2 ...
$ job : Factor w/ 4 levels "management","skilled",..: 2 2 4 2 2 4 2 1 4 1 ...
$ dependents : int 1 1 2 2 2 2 1 1 1 1 ...
$ phone : Factor w/ 2 levels "no","yes": 2 1 1 1 1 2 1 2 1 1 ...
$ default : num 0 1 0 0 1 0 0 0 0 1 ...
STEP 3: TRAINING THE MODEL ON THE DATA.
- Set up trainning and test data sets.
indx = sample(1:nrow(credit), as.integer(0.9*nrow(credit)))
indx
[1] 92 933 179 8 556 37 835 612 673 553 999 529 481 918 876 332 98 725
[19] 780 643 383 176 564 905 291 421 820 659 901 625 194 734 327 104 784 708
[37] 138 154 343 962 416 195 307 27 526 758 655 90 681 9 334 366 82 493
[55] 814 209 464 250 696 806 792 585 976 286 477 133 24 706 927 783 598 213
[73] 45 571 33 592 47 106 122 300 651 819 465 654 634 233 251 495 380 647
[91] 508 483 63 971 202 279 10 977 278 369 815 550 888 676 59 538 979 626
[109] 690 235 552 984 769 48 641 469 325 206 982 97 504 653 379 408 323 880
[127] 7 316 423 798 915 167 810 863 258 318 433 14 439 117 521 764 936 435
[145] 358 480 707 878 422 399 50 862 225 338 315 331 232 107 850 490 341 628
[163] 57 94 460 457 438 788 127 661 100 129 821 397 528 487 750 506 38 155
[181] 728 697 321 609 572 128 378 885 363 458 305 344 241 756 71 543 12 394
[199] 105 662 311 456 994 672 972 231 960 896 636 389 680 563 393 812 742 776
[217] 637 677 671 296 142 259 800 412 228 373 306 381 99 486 618 13 425 568
[235] 714 978 990 953 357 77 342 779 757 467 669 356 721 988 372 516 346 16
[253] 736 949 955 751 583 32 263 542 288 193 549 744 384 74 678 594 667 123
[271] 199 729 700 266 70 283 492 165 196 367 243 803 540 51 295 197 290 432
[289] 510 836 610 5 136 987 811 943 236 69 816 470 919 974 402 25 877 281
[307] 340 260 892 595 276 298 737 146 86 238 867 132 244 853 134 752 293 559
[325] 640 620 170 959 294 353 333 849 55 427 753 795 785 829 845 865 190 499
[343] 823 539 703 224 424 621 34 437 967 699 223 789 826 537 622 54 945 868
[361] 890 158 940 308 587 420 509 738 937 711 114 666 72 909 139 580 174 827
[379] 414 715 208 337 929 864 545 705 498 544 973 173 638 578 222 22 36 601
[397] 852 428 891 172 997 466 463 287 444 370 330 523 623 520 361 511 212 694
[415] 302 665 93 319 1000 184 716 245 116 522 186 924 975 575 3 579 409 210
[433] 513 730 52 749 687 203 931 387 874 426 790 181 277 759 770 831 124 913
[451] 514 713 860 934 824 35 951 942 858 908 264 642 257 989 461 312 718 89
[469] 442 180 479 773 519 452 182 482 755 872 96 284 324 698 660 763 474 695
[487] 966 574 140 255 156 395 664 992 49 822 921 961 531 56 775 137 692 993
[505] 551 411 548 15 280 79 624 68 948 965 603 969 685 39 169 419 60 410
[523] 149 627 958 534 833 606 78 491 793 230 599 371 407 588 740 567 84 436
[541] 787 322 326 920 19 162 911 6 837 873 406 848 147 765 701 658 886 120
[559] 261 554 351 405 216 429 897 741 558 847 786 856 518 265 884 507 576 525
[577] 778 434 221 688 869 43 644 396 175 985 772 771 489 902 413 722 870 404
[595] 102 807 166 144 613 187 485 314 471 282 668 838 229 160 871 360 586 497
[613] 454 478 900 150 577 657 17 903 617 40 178 218 488 774 475 377 382 390
[631] 58 631 679 103 731 152 11 766 830 760 825 566 143 935 581 605 923 839
[649] 267 201 501 415 46 183 185 401 846 240 809 157 782 724 310 349 717 562
[667] 932 392 565 494 899 536 675 462 561 273 615 28 791 44 689 145 670 247
[685] 18 970 188 192 691 879 237 108 219 912 904 385 268 151 473 446 968 41
[703] 723 768 354 472 234 686 593 443 53 248 289 164 719 254 828 468 682 191
[721] 941 73 799 88 710 914 205 702 818 374 270 445 557 777 246 926 541 589
[739] 113 991 794 449 947 441 512 857 650 861 704 882 484 616 906 76 450 2
[757] 67 26 31 141 747 808 61 859 418 4 220 928 121 131 569 440 796 226
[775] 375 85 350 841 328 303 910 227 272 645 983 866 600 893 732 944 855 126
[793] 101 611 762 843 834 502 320 352 996 177 726 115 629 533 256 632 735 591
[811] 805 345 447 745 608 110 649 986 417 646 292 854 297 386 801 546 339 573
[829] 275 840 83 527 946 148 62 347 907 743 496 64 80 663 964 388 917 530
[847] 832 207 887 125 804 430 532 189 950 733 329 400 802 547 720 451 956 359
[865] 239 200 555 505 317 42 693 597 582 683 781 797 925 602 761 21 119 684
[883] 570 65 30 362 171 754 630 161 249 91 524 748 584 376 204 299 163 20
credit_train = credit[indx,]
credit_test = credit[-indx,]
credit_train_labels = credit[indx,17]
credit_test_labels = credit[-indx,17]
- Check if there are any missing values.
library(Amelia)
missmap(credit, main = "Missing vs observed Values")

- Number of missing values in each column. There are no missing values in this dataset.
sapply(credit,function(x) sum(is.na(x)))
checking_balance months_loan_duration credit_history purpose
0 0 0 0
amount savings_balance employment_duration percent_of_income
0 0 0 0
years_at_residence age other_credit housing
0 0 0 0
existing_loans_count job dependents phone
0 0 0 0
default
0
- Checking the number of unique values in each column.
sapply(credit, function(x) length(unique(x)))
checking_balance months_loan_duration credit_history purpose
4 33 5 6
amount savings_balance employment_duration percent_of_income
921 5 5 4
years_at_residence age other_credit housing
4 53 3 3
existing_loans_count job dependents phone
4 4 2 2
default
2
STEP 5: IMPROVING THE MODEL PERFORMANCE.
- Drop the insignificant predictors, alpha = 0.10.
model <- glm(default ~ checking_balance + months_loan_duration + credit_history + percent_of_income + age,family=binomial(link='logit'),data=credit_train)
- *This seems to be a better performing model, as seen by the improved AIC value, compared to the previous model’s AIC value.
model
Call: glm(formula = default ~ checking_balance + months_loan_duration +
credit_history + percent_of_income + age, family = binomial(link = "logit"),
data = credit_train)
Coefficients:
(Intercept) checking_balance> 200 DM checking_balance1 - 200 DM
-1.46259 -0.94685 -0.53320
checking_balanceunknown months_loan_duration credit_historygood
-1.90335 0.03253 0.60990
credit_historyperfect credit_historypoor credit_historyvery good
1.66589 0.61757 1.38926
percent_of_income age
0.20898 -0.01398
Degrees of Freedom: 899 Total (i.e. Null); 889 Residual
Null Deviance: 1094
Residual Deviance: 906.2 AIC: 928.2
- The Z-value is determined by getting the ratio between the estimated value & the standard error values.
summary(model)
Call:
glm(formula = default ~ checking_balance + months_loan_duration +
credit_history + percent_of_income + age, family = binomial(link = "logit"),
data = credit_train)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.7875 -0.7953 -0.4715 0.8801 2.3909
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.462592 0.445994 -3.279 0.001040 **
checking_balance> 200 DM -0.946854 0.357448 -2.649 0.008075 **
checking_balance1 - 200 DM -0.533202 0.198579 -2.685 0.007251 **
checking_balanceunknown -1.903347 0.219191 -8.684 < 2e-16 ***
months_loan_duration 0.032527 0.006746 4.822 1.42e-06 ***
credit_historygood 0.609904 0.210337 2.900 0.003736 **
credit_historyperfect 1.665890 0.421369 3.954 7.70e-05 ***
credit_historypoor 0.617574 0.320679 1.926 0.054125 .
credit_historyvery good 1.389261 0.379684 3.659 0.000253 ***
percent_of_income 0.208977 0.076118 2.745 0.006043 **
age -0.013977 0.007565 -1.848 0.064665 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1094.42 on 899 degrees of freedom
Residual deviance: 906.16 on 889 degrees of freedom
AIC: 928.16
Number of Fisher Scoring iterations: 4
anova(model, test="Chisq")
Analysis of Deviance Table
Model: binomial, link: logit
Response: default
Terms added sequentially (first to last)
Df Deviance Resid. Df Resid. Dev Pr(>Chi)
NULL 899 1094.42
checking_balance 3 119.537 896 974.89 < 2.2e-16 ***
months_loan_duration 1 31.658 895 943.23 1.838e-08 ***
credit_history 4 26.390 891 916.84 2.640e-05 ***
percent_of_income 1 7.186 890 909.65 0.007349 **
age 1 3.495 889 906.16 0.061568 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
fitted.results <- predict(model,newdata=credit_test,type='response')
fitted.results <- ifelse(fitted.results > 0.5,1,0)
misClasificError <- mean(fitted.results != credit_test$default)
print(paste('Accuracy',1-misClasificError))
[1] "Accuracy 0.77"
library(ROCR)
p <- predict(model, newdata=credit_test, type="response")
pr <- prediction(p, credit_test$default)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)

auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
[1] 0.7548621
LS0tDQp0aXRsZTogIlIgTm90ZWJvb2siDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCmF1dGhvcjogIkFkaGlzbGFjeSINCi0tLQ0KDQojIyBFWEFNUExFOiBDUkVESVQgREFUQS4NCg0KIyNTVEVQIDE6IERBVEEgQ09MTEVDVElPTi4NCg0KDQojI1NURVAgMjogRVhQTE9SSU5HIEFORCBQUkVQQVJJTkcgREFUQS4NCg0KYGBge3J9DQpjcmVkaXQgPC0gcmVhZC5jc3YoImh0dHA6Ly93d3cuc2NpLmNzdWVhc3RiYXkuZWR1L35lc3Vlc3MvY2xhc3Nlcy9TdGF0aXN0aWNzXzY2MjAvUHJlc2VudGF0aW9ucy9tbDcvY3JlZGl0LmNzdiIpDQpgYGANCg0KDQo+LSAqRml4aW5nIHRoZSBkZWZhdWx0IHZhcmlhYmxlIHRvIGJlIDAgb3IgMS4qDQoNCmBgYHtyfQ0KY3JlZGl0JGRlZmF1bHQgPSBhcy5udW1lcmljKGNyZWRpdCRkZWZhdWx0KQ0KYGBgDQoNCg0KYGBge3J9DQpjcmVkaXQkZGVmYXVsdCA9IGNyZWRpdCRkZWZhdWx0IC0gMQ0KYGBgDQoNCg0KPi0gKkV4YW1pbmcgdGhlIHN0cnVjdGl1cmUgb2YgdGhlIGRhdGEuKg0KDQpgYGB7cn0NCnN0cihjcmVkaXQpDQpgYGANCg0KDQojI1NURVAgMzogVFJBSU5JTkcgVEhFIE1PREVMIE9OIFRIRSBEQVRBLg0KDQo+LSAqTE9HSVNUSUMgUkVHUkVTU0lPTi4qDQoNCj4tICpTZXQgdXAgdHJhaW5uaW5nIGFuZCB0ZXN0IGRhdGEgc2V0cy4qDQoNCmBgYHtyfQ0KaW5keCA9IHNhbXBsZSgxOm5yb3coY3JlZGl0KSwgYXMuaW50ZWdlcigwLjkqbnJvdyhjcmVkaXQpKSkNCmBgYA0KDQoNCmBgYHtyfQ0KaW5keA0KYGBgDQoNCg0KYGBge3J9DQpjcmVkaXRfdHJhaW4gPSBjcmVkaXRbaW5keCxdDQpgYGANCg0KYGBge3J9DQpjcmVkaXRfdGVzdCA9IGNyZWRpdFstaW5keCxdDQpgYGANCg0KDQpgYGB7cn0NCmNyZWRpdF90cmFpbl9sYWJlbHMgPSBjcmVkaXRbaW5keCwxN10NCmBgYA0KDQogICANCmBgYHtyfQ0KY3JlZGl0X3Rlc3RfbGFiZWxzID0gY3JlZGl0Wy1pbmR4LDE3XQ0KYGBgDQoNCg0KPi0gKkNoZWNrIGlmIHRoZXJlIGFyZSBhbnkgbWlzc2luZyB2YWx1ZXMuKg0KDQpgYGB7cn0NCmxpYnJhcnkoQW1lbGlhKQ0KYGBgDQoNCg0KYGBge3J9DQptaXNzbWFwKGNyZWRpdCwgbWFpbiA9ICJNaXNzaW5nIHZzIG9ic2VydmVkIFZhbHVlcyIpDQpgYGANCg0KDQo+LSAqTnVtYmVyIG9mIG1pc3NpbmcgdmFsdWVzIGluIGVhY2ggY29sdW1uLioNCiAgICAqVGhlcmUgYXJlIG5vIG1pc3NpbmcgdmFsdWVzIGluIHRoaXMgZGF0YXNldC4qDQoNCmBgYHtyfQ0Kc2FwcGx5KGNyZWRpdCxmdW5jdGlvbih4KSBzdW0oaXMubmEoeCkpKQ0KYGBgDQoNCg0KPi0gKkNoZWNraW5nIHRoZSBudW1iZXIgb2YgdW5pcXVlIHZhbHVlcyBpbiBlYWNoIGNvbHVtbi4qDQoNCmBgYHtyfQ0Kc2FwcGx5KGNyZWRpdCwgZnVuY3Rpb24oeCkgbGVuZ3RoKHVuaXF1ZSh4KSkpDQpgYGANCg0KDQojI1NURVAgNDogRVZBTFVBVElORyBNT0RFTCBQRVJGT1JNQU5DRS4NCg0KPi0gKkZpdHRpbmcgdGhlIGxvZ2lzdGljIHJlZ3Jlc3Npb24gbW9kZWwsIHdpdGggYWxsIHByZWRpY3RvciB2YXJpYWJsZXMuKg0KDQpgYGB7cn0NCm1vZGVsIDwtIGdsbShkZWZhdWx0IH4uLGZhbWlseT1iaW5vbWlhbChsaW5rPSdsb2dpdCcpLGRhdGE9Y3JlZGl0X3RyYWluKQ0KYGBgDQoNCmBgYHtyfQ0KbW9kZWwNCmBgYA0KDQpgYGB7cn0NCnN1bW1hcnkobW9kZWwpDQpgYGANCg0KPi0gKkhlcmUgd2UgZ2V0IHRvIHNlZSB0aGUgdmFyaWFibGVzIHRoYXQgZGV0ZXJtaW5lIHdoZXRoZXIgYW4gaW5kaXZpZHVhbCB3aWxsIGdldCBhIGxvYW4gb3Igbm90LCBpbiAgICAgIHRoZSBvcmRlciBvZiB0aGUgaW1wb3J0YW5jZS4qDQoNCmBgYHtyfQ0KYW5vdmEobW9kZWwsIHRlc3Q9IkNoaXNxIikNCmBgYA0KDQojI1NURVAgNTogSU1QUk9WSU5HIFRIRSBNT0RFTCBQRVJGT1JNQU5DRS4NCg0KPi0gKkRyb3AgdGhlIGluc2lnbmlmaWNhbnQgcHJlZGljdG9ycywgYWxwaGEgPSAwLjEwLioNCg0KYGBge3J9DQptb2RlbCA8LSBnbG0oZGVmYXVsdCB+IGNoZWNraW5nX2JhbGFuY2UgKyBtb250aHNfbG9hbl9kdXJhdGlvbiArIGNyZWRpdF9oaXN0b3J5ICsgIHBlcmNlbnRfb2ZfaW5jb21lICsgYWdlLGZhbWlseT1iaW5vbWlhbChsaW5rPSdsb2dpdCcpLGRhdGE9Y3JlZGl0X3RyYWluKQ0KYGBgDQoNCj4tICpUaGlzIHNlZW1zIHRvIGJlIGEgYmV0dGVyIHBlcmZvcm1pbmcgbW9kZWwsIGFzIHNlZW4gYnkgdGhlIGltcHJvdmVkIEFJQyB2YWx1ZSwgY29tcGFyZWQgdG8gdGhlICAgICAgICBwcmV2aW91cyBtb2RlbCdzIEFJQyB2YWx1ZS4NCg0KYGBge3J9DQptb2RlbA0KYGBgDQoNCj4tICpUaGUgWi12YWx1ZSBpcyBkZXRlcm1pbmVkIGJ5IGdldHRpbmcgdGhlIHJhdGlvIGJldHdlZW4gdGhlIGVzdGltYXRlZCB2YWx1ZSAmIHRoZSBzdGFuZGFyZCBlcnJvciAgICAgICB2YWx1ZXMuKg0KDQpgYGB7cn0NCnN1bW1hcnkobW9kZWwpDQpgYGANCg0KDQpgYGB7cn0NCmFub3ZhKG1vZGVsLCB0ZXN0PSJDaGlzcSIpDQpgYGANCg0KDQo+LSAqQ2hlY2tpbmcgdGhlIGFjY3VyYWN5LioNCg0KYGBge3J9DQpmaXR0ZWQucmVzdWx0cyA8LSBwcmVkaWN0KG1vZGVsLG5ld2RhdGE9Y3JlZGl0X3Rlc3QsdHlwZT0ncmVzcG9uc2UnKQ0KYGBgDQoNCmBgYHtyfQ0KZml0dGVkLnJlc3VsdHMgPC0gaWZlbHNlKGZpdHRlZC5yZXN1bHRzID4gMC41LDEsMCkNCmBgYA0KDQoNCmBgYHtyfQ0KbWlzQ2xhc2lmaWNFcnJvciA8LSBtZWFuKGZpdHRlZC5yZXN1bHRzICE9IGNyZWRpdF90ZXN0JGRlZmF1bHQpDQpgYGANCg0KDQpgYGB7cn0NCnByaW50KHBhc3RlKCdBY2N1cmFjeScsMS1taXNDbGFzaWZpY0Vycm9yKSkNCmBgYA0KDQoNCmBgYHtyfQ0KbGlicmFyeShST0NSKQ0KYGBgDQoNCg0KYGBge3J9DQpwIDwtIHByZWRpY3QobW9kZWwsIG5ld2RhdGE9Y3JlZGl0X3Rlc3QsIHR5cGU9InJlc3BvbnNlIikNCmBgYA0KDQpgYGB7cn0NCnByIDwtIHByZWRpY3Rpb24ocCwgY3JlZGl0X3Rlc3QkZGVmYXVsdCkNCmBgYA0KDQpgYGB7cn0NCnByZiA8LSBwZXJmb3JtYW5jZShwciwgbWVhc3VyZSA9ICJ0cHIiLCB4Lm1lYXN1cmUgPSAiZnByIikNCmBgYA0KDQpgYGB7cn0NCnBsb3QocHJmKQ0KYGBgDQoNCmBgYHtyfQ0KYXVjIDwtIHBlcmZvcm1hbmNlKHByLCBtZWFzdXJlID0gImF1YyIpDQpgYGANCg0KYGBge3J9DQphdWMgPC0gYXVjQHkudmFsdWVzW1sxXV0NCmBgYA0KDQpgYGB7cn0NCmF1Yw0KYGBgDQoNCg0K