First run of logit model – Znga

## Read Data (Step 2)
znga <- read.csv("Website_VIP_User_data_10000.csv", stringsAsFactors = FALSE)
# Logisitic regression
# Drop ID, and product_like_rate columns
znga <- znga[-1]
znga <- znga[-8]
# Set up trainning and test data sets
indx <- sample(1:nrow(znga), as.integer(0.9*nrow(znga)))
indx
   [1] 9441 8951 6210  656 5582  565  701 3666  291  613 4011  123 7166 1478 7844 1426 4819 1883
  [19] 1170 3849 7679 7096 4855 7358 2768 1997 7120 1267 4540 4622 4702 9101 7770 2267 9281 1983
  [37] 3815 6878 1797 1827 5306 6949 1155 7993  740 3356 5110 1372 6385 8458 5085 7036 3551 7116
  [55] 9467 2775 9723 9324 6334 8197 1463 4116 6458 1671 1117 6137 3260 9951 8938 2837 3555 9145
  [73] 5663 9980 2005 5428 7540 5870 3649 5591 1605 4258   23 3760 8188 7705 6413  299 1971 6269
  [91] 1703 8631 4331 8711 7003 5061 1176 6616 8157 7253 8643 9695 1501 4290 5190 1388 1308 7655
 [109] 3312 3545 8249 5605 6847  541 2680 4374 1529 4297 9022 7940 3953 8263 2374 3509 7905 6468
 [127] 1912 3984 4543  921  161 4722 1149 4329 5680 9464 2354 8172 6573 1582 4357  239 1141 6985
 [145] 8369 2574 3933 8075 8146 1715 5091  757 5434 3354 9302 9757 2934 9134 7294 3491 2612 2315
 [163] 7829 4879 7669 7405 4932 5221 5036 5973 1725 3781 5966 4781 3870 9603  281 6155  579 4626
 [181] 3548 8588 3281 3499 9519  102 2208 4199 5571 8930 5689 5037 3386  244 9119 6290 9499 5601
 [199] 8104 8071 3476 2480  813 7193 3920 5233 3082 6965 5518 8426 4252 5533 8614 7412 6292 7281
 [217] 7741 6004 1988 4471 6401 4153 4788 6979  476 8751 2230 1134 8638 1596 7975 5758  661 3274
 [235] 1301 7121 1045 3393 1049 2148  826 9930 2494 8887 9790 5744 1811 8676 5865 8414 4430 5335
 [253] 6029 9739 1350 7517 1928 4666 5566 7817 6666  975  111 6756 6318 6489   37 3773 2239  365
 [271] 1245 1688 8459 9374 7696 8754  563  839 8647 2507 4333  960 9796 5821 4257 4044 2295 4336
 [289] 9368 1128 3081 6091 3037 7744  414 7506 4962 9208 6280 8704 6271  766 8719 9373 4483 5035
 [307] 2699 2708 8710 8202 3489 5271 2618 2478 1823 7434 3099 4215 6914 4528 5640 2060 9455 5698
 [325] 1295 5860 3328 4139  226 2532 2103  920 5228 4704  136 8065 8650 2723 9894 7266 8789 8613
 [343] 4889 5231 9933 2958 9289 6759 6710 1697 6120 8796 2372 1926 9952 8370 3074 7919 5423 1518
 [361] 9794  679 4907 1856 8623  140 4893 7366 1757 6904 6802  205  869 7032 7054 2957 2814 4868
 [379] 9184 4403 3590  851 1180 3433 5259 3616 2562 6405 3621 4362 5943 8504 4187 3197 8100 4596
 [397] 6244 8911 9030 7077 6415 2089 8557 4934   93 5893 6597 4777 7643 8528 1066 2014 9731 8465
 [415] 5165 6493  744 6368 6680 1614 8477 9296  791 2639 3502 9477 8323 3012 9722  381 8029 6375
 [433]  761   94 7481 7403 8248 1644 8982 6776 7750 4608 2817 1844 6667 7187 2090 5462 9196 2455
 [451] 6009  717 7191  502 9556 1225 7663 9681 6466 6614 7460  800 3959 8932 2102 9343 6342 1831
 [469] 7349 2575 4894 2840 6165  930 3295 6731 4243 2283 9040  247 1317 4005 8723 3347 1091 3397
 [487] 1914 4196 6173 7295 6316  191  321 5772 1783 1837 3597 1919 7125 7774 7526  203 8062 1892
 [505] 9492 8639  382 3308 5750 4248 2273 2767 5621 8475 9301 6814 6863 2140 9855  283 4604 9522
 [523] 5673 3382 5069 2965 3801 9699 6476 9594 6845 8058   48 9317 7458   75 8922 7071  367 9391
 [541] 1623  981 6637 3628 1718 1685 6530 4818 4506 1007 8441 8381 7922 9272 4785 6254 4657 2898
 [559] 8509 6217 5824 6526  471 4409 2766 3568 5540 9466 5486 5731 6868 3495  348 9430 7626 8655
 [577] 5088 1258 7443 6064 5554 5018 8610 3439 7063  544 5648 3752 5027  350 1133 7820 3025 3765
 [595] 8099 9588 4231  274 1289 5143 3862 5445 2966 9862 4025 3564 3966  535 2353 5780 5262 7838
 [613] 7502    9 9230 2327 4771 7890 5606 9647 8576 5818 5001 4315 8858 5749  473 6623 3659 9096
 [631] 6505 5097 4700 2475 4408 7269 4175 1310 1660 8506 3663 5513 3123 8333 5656 5348 9419 1961
 [649] 3901 4412 4295 1348 8256 7346 3053 3226 9426 9479  843 8743 9901 5909 6153 4720 9687 3151
 [667] 3838 3878 7780 5106 4799 8254 5250 7702 1102 9142 7391 5585 2235 5817 3171 9924 6689 5365
 [685] 5411 6031 5690 7235 4145  860 9294 7765 4919 3592 7916 5053 3623 4691 1281 4043 2771 6959
 [703] 8609 2320 6867 4813 3559 4385 3172 2855 2635 1886  351 7459 3335 4024 5833 5177 2798 6026
 [721] 7647 7188 8198 4462 6054 8355 5452 3186 2894 3708 7841 1603 1775 3606 5318 8189 8698   39
 [739] 3579  778 3318 8036 5222 1683 3741 3889 7371  867 7217 7803 7730 4646 8185 2167 6193 5727
 [757] 4079 9341 1003 3181 1754 2012  611 5859 6839 2568 3414 9051 6097 7792 3469 6506 2644 3979
 [775] 4065 9303 6148 2153 6651 7582  169 4486 5993 8624 8889 2011 3924 4814 8914 8310 8604 2293
 [793] 6966 3333 5921 1938 8193 8076 4725  228 6261 1150 8221 5045 1306 8810 8240  364 3243 8068
 [811] 2977 7904 1687 2889 1336 9321 8549 9298 5286 4239 5938   86 6002 9648 5552 4767 5111 1808
 [829] 6181 6682  341 4568 8876 1812 4854 6911 7973 7432 3438 8013  727 3067 1963 9394 7554 2569
 [847] 8004 5444 7871 1847 3906 6819  890 2338 6595  443 8329 2248 1325  598 7956 6656 1395 4156
 [865] 4739 3050 7179 9836 4341 7024 9906 1065 3165 8307 1957 3162 8507 7775 7515  989 1334  131
 [883] 1949 6838 3842 3485 6346 5159 6786 6574 6964 4324 8692 6581 7612 7115 6101 5292 9118 9984
 [901] 7482 1711 9658 3991 3673  152 9192 8792 2237 1890 5524  758  551 7542 5667 7896 6259 5570
 [919] 8943 9078 9400 4829 9968 1207 4518 3119  448 4400 6220 9840 5430 6813 1507 9217 9792 7522
 [937]  415 1005 2363 7694 9259 8606 5021 8991  290 4705 7571 6829 1554 5132 4037  586 6621 4155
 [955]  204 7954 7992 2389 4567 4556 6740 7140 1092 6480  209 6336 9692 1807 2976 5180 4294 7402
 [973] 7342 3814 5119 6063 8086 6012 2753 4222 1123  460 2811 3418 8720 5686  823 5543  370 6507
 [991]  648 2522 2061 3236 2566 6787 8063 8235 4244 2300
 [ reached getOption("max.print") -- omitted 8000 entries ]
znga_train <- znga[indx,]
znga_test <- znga[-indx,]
znga_train_labels <- znga[indx,1]
znga_test_labels <- znga[-indx,1]   
# Check if there are any missing values
# install.packages('Amelia')
library(Amelia)
missmap(znga, main = "Missing values vs observed")

# number of missing values in each column
sapply(znga, function(x) sum(is.na(x)))
                     IsVIP_500                  payment_7_day                       dau_days 
                             0                              0                              0 
days_between_install_first_pay               total_txns_7_day               total_page_views 
                             0                              0                              0 
           total_product_liked          total_free_coupon_got          total_bonus_xp_points 
                             0                              0                              0 
# number of unique values in each column
sapply(znga, function(x) length(unique(x)))
                     IsVIP_500                  payment_7_day                       dau_days 
                             2                           1505                              7 
days_between_install_first_pay               total_txns_7_day               total_page_views 
                          2312                             30                           1656 
           total_product_liked          total_free_coupon_got          total_bonus_xp_points 
                           731                             53                           4164 
# fit the logistic regression model, with all predictor variables
model <- glm(IsVIP_500 ~.,family=binomial(link='logit'),data=znga_train)
summary(model)

Call:
glm(formula = IsVIP_500 ~ ., family = binomial(link = "logit"), 
    data = znga_train)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8529  -0.1525  -0.1057  -0.0673   3.5425  

Coefficients:
                                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)                    -7.411e+00  3.934e-01 -18.840  < 2e-16 ***
payment_7_day                   2.772e-02  1.697e-03  16.327  < 2e-16 ***
dau_days                        3.547e-01  6.381e-02   5.560 2.71e-08 ***
days_between_install_first_pay -4.342e-04  1.217e-04  -3.566 0.000362 ***
total_txns_7_day                3.161e-02  1.541e-02   2.052 0.040208 *  
total_page_views                7.270e-04  4.012e-04   1.812 0.069957 .  
total_product_liked            -1.196e-03  1.373e-03  -0.872 0.383427    
total_free_coupon_got          -2.599e-03  2.328e-02  -0.112 0.911138    
total_bonus_xp_points           5.495e-08  1.895e-07   0.290 0.771835    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1476.6  on 8999  degrees of freedom
Residual deviance: 1066.8  on 8991  degrees of freedom
AIC: 1084.8

Number of Fisher Scoring iterations: 8
anova(model)
Analysis of Deviance Table

Model: binomial, link: logit

Response: IsVIP_500

Terms added sequentially (first to last)

                               Df Deviance Resid. Df Resid. Dev
NULL                                            8999     1476.6
payment_7_day                   1  290.476      8998     1186.1
dau_days                        1   87.153      8997     1099.0
days_between_install_first_pay  1   15.448      8996     1083.5
total_txns_7_day                1    4.868      8995     1078.7
total_page_views                1   10.948      8994     1067.7
total_product_liked             1    0.672      8993     1067.0
total_free_coupon_got           1    0.135      8992     1066.9
total_bonus_xp_points           1    0.082      8991     1066.8

Dropping insignificant terms – Znga

# drop the insignificant predictors, alpha = 0.10
model <- glm(IsVIP_500 ~ payment_7_day+dau_days+days_between_install_first_pay+total_txns_7_day+total_page_views, family = binomial(link='logit'), data = znga_train)
summary(model)

Call:
glm(formula = IsVIP_500 ~ payment_7_day + dau_days + days_between_install_first_pay + 
    total_txns_7_day + total_page_views, family = binomial(link = "logit"), 
    data = znga_train)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8397  -0.1531  -0.1061  -0.0677   3.5400  

Coefficients:
                                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)                    -7.3865930  0.3907609 -18.903  < 2e-16 ***
payment_7_day                   0.0274334  0.0016646  16.481  < 2e-16 ***
dau_days                        0.3601509  0.0616972   5.837  5.3e-09 ***
days_between_install_first_pay -0.0004339  0.0001216  -3.569 0.000358 ***
total_txns_7_day                0.0309698  0.0153615   2.016 0.043793 *  
total_page_views                0.0003938  0.0001107   3.559 0.000372 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1476.6  on 8999  degrees of freedom
Residual deviance: 1067.7  on 8994  degrees of freedom
AIC: 1079.7

Number of Fisher Scoring iterations: 8
anova(model, test="Chisq")
Analysis of Deviance Table

Model: binomial, link: logit

Response: IsVIP_500

Terms added sequentially (first to last)

                               Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
NULL                                            8999     1476.6              
payment_7_day                   1  290.476      8998     1186.1 < 2.2e-16 ***
dau_days                        1   87.153      8997     1099.0 < 2.2e-16 ***
days_between_install_first_pay  1   15.448      8996     1083.5  8.48e-05 ***
total_txns_7_day                1    4.868      8995     1078.7 0.0273570 *  
total_page_views                1   10.948      8994     1067.7 0.0009369 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Check Accuracy – Znga

fitted.results <- predict(model,newdata=znga_test,type='response')
fitted.results <- ifelse(fitted.results > 0.5,1,0)
misClasificError <- mean(fitted.results != znga_test$IsVIP_500)
print(paste('Accuracy',1-misClasificError))
[1] "Accuracy 0.988011988011988"

ROC – Template code

# ROC
# Because this data set is so small, it is possible that the test data set
# does not contain both 0 and 1 values.  If this happens the code will not
# run.  And since the test data set is so small the ROC is not useful here
# but the code is provided.
# install.packages('ROCR')
# library(ROCR)
# p <- predict(model, newdata=launch_test, type="response")
# pr <- prediction(p, launch_test$distress_ct)
# prf <- performance(pr, measure = "tpr", x.measure = "fpr")
# plot(prf)
# 
# auc <- performance(pr, measure = "auc")
# auc <- auc@y.values[[1]]
# auc

ROC Accuracy – Znga

# ROC
library(ROCR)
p <- predict(model, newdata=znga_test, type="response")
pr <- prediction(p, znga_test$IsVIP_500)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)

auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
[1] 0.9195592

An Example: Credit Data – Logit

credit <- read.csv("http://www.sci.csueastbay.edu/~esuess/classes/Statistics_6620/Presentations/ml7/credit.csv")
## Fix the default variable to be 0 or 1
credit$default <- as.numeric(credit$default) # Don't need this for znga
credit$default <- credit$default - 1         # Don't need this for znga
# examine the launch data
str(credit)
'data.frame':   1000 obs. of  17 variables:
 $ checking_balance    : Factor w/ 4 levels "< 0 DM","> 200 DM",..: 1 3 4 1 1 4 4 3 4 3 ...
 $ months_loan_duration: int  6 48 12 42 24 36 24 36 12 30 ...
 $ credit_history      : Factor w/ 5 levels "critical","good",..: 1 2 1 2 4 2 2 2 2 1 ...
 $ purpose             : Factor w/ 6 levels "business","car",..: 5 5 4 5 2 4 5 2 5 2 ...
 $ amount              : int  1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
 $ savings_balance     : Factor w/ 5 levels "< 100 DM","> 1000 DM",..: 5 1 1 1 1 5 4 1 2 1 ...
 $ employment_duration : Factor w/ 5 levels "< 1 year","> 7 years",..: 2 3 4 4 3 3 2 3 4 5 ...
 $ percent_of_income   : int  4 2 2 2 3 2 3 2 2 4 ...
 $ years_at_residence  : int  4 2 3 4 4 4 4 2 4 2 ...
 $ age                 : int  67 22 49 45 53 35 53 35 61 28 ...
 $ other_credit        : Factor w/ 3 levels "bank","none",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ housing             : Factor w/ 3 levels "other","own",..: 2 2 2 1 1 1 2 3 2 2 ...
 $ existing_loans_count: int  2 1 1 1 2 1 1 1 1 2 ...
 $ job                 : Factor w/ 4 levels "management","skilled",..: 2 2 4 2 2 4 2 1 4 1 ...
 $ dependents          : int  1 1 2 2 2 2 1 1 1 1 ...
 $ phone               : Factor w/ 2 levels "no","yes": 2 1 1 1 1 2 1 2 1 1 ...
 $ default             : num  0 1 0 0 1 0 0 0 0 1 ...
# logisitic regression
# set up trainning and test data sets
indx <- sample(1:nrow(credit), as.integer(0.9*nrow(credit)))
indx
  [1]  937  487  238  479  153  143  988  886  556  661  935  178  306  284  630  313  708  122  697
 [20]  694  146  111  875  628  797  712  248  567  939  390  972   31  530  852  411  158  990  139
 [39]  657  624  125  369  745  357  761  594  270  753  235  905  184  148  370  459  559  333  500
 [58]  880  434  634  872   84  877  845  632  145  160  864  181  524  186  509  752  177   97    2
 [77]  463  958  429  803  129  954  909  394  564  703  617  779  455  380  182  308  725  247  537
 [96]  998  278  633  293  608  583  172  811  216  395  291  610   46  344  726  616   67  915  245
[115]  788  888  893  721  645  932  917  192  919  263  600  115  197  557  170  436  419  940  101
[134]  759  773  995  665  782  314  944  368  709  505  198  154  446  765  386  466  228  441  498
[153]  808  912  541  743  277  331  294  581  528  502  561  334  656  654  928  233   59  956  123
[172]  710  847   12  887   93  957  482   13  103  513  539  543  347  385   73  906  644  705   55
[191]  533  432  272  255  107  473  786  806  379  421  117  535  666  365  999  376  456  472  693
[210]  105  982  733  345  321  606  798  346  547  252  470  690  239  243  740  215  899  217  689
[229]  894  196  728  555  605  585  670  476  439   35  273  777  813  469  859  237  173  426   90
[248]  977  841  844  134   23  361  942 1000  249  232  891  538  516  378  724  443  923  409  682
[267]  289   78  161   80   49  856   26  109  822  653  819   77  261  166  416  596  565  701  353
[286]  601  211  735   58  962  483   37  414  227  730  126  688  979  422  602  214  462  202  679
[305]  130  920  850  318   54  974  637   41  929  722  521  805  597  337   44  171  542  623   40
[324]  831  444  100  641  350  757  406  322  652  391   33   75  453  484  914  400  514  499  994
[343]   88  488  651  512  236  866  667  458  840  938  793  976  152  468  714  352  658  423  784
[362]  168  163  116  892  618  203  558  371  865  792  450   61  881  598  970  562   24  218  953
[381]  833  244  165   47  591  229  303  946  454  191  642  969   45  930  901  548  338  271  183
[400]  327   62  510  863  404  821  295  489  493  576  837  936  824  827  156  433  687   11  800
[419]  839  588  174   68  405  855  748  343  790  580  449  987  794  282  326  169  360  895  769
[438]  795  635  529  222  941  234  810  723  889  382  374  159  310  815  854  501  200  829  620
[457]  570  857  447  267  907  964   70  317  526  818  307   65   25  862  355  925  496  341  648
[476]  388  729    5  428   69  816  467  812   22  212  702  640  448  383  534  325  738  842   36
[495]  589  762  778  927  952  230  663  949  397   34  515  911  739  356  660  372  302  973  413
[514]   94  315  231  254   76  377  425  934   85  399  768  636  639   57  497  826  460  106  586
[533]  764  749  698  883  814  960  102  664  677  692  104  286  896  506  135  155  417  288  963
[552]  137  396  457  649  574  853  655  290  674  579  328   74  375  696  868   32  646  846  836
[571]  997  647  716  190   96  796   50  747  486  133  713  224  680  783  364   20  775  471  193
[590]  392    7  627  195  563  304  330  902  508  746  774  913  464  552  445  522    6  132    8
[609]  546  851  324   18  830  519  767  572  336  686   64  398   89  573  437  185  834  402  242
[628]  742  540  758  475   91  961  832  799  955  898   79    3  415  240   19  717  771  366  275
[647]  751  201  407  993  113  849  719  772  298  309  188  885  358  503  144   83  532  787  731
[666]  706  975  403  599  131  431  118  147   53  367  823  867  362  205  878  678  175  430  495
[685]  560  274   95  452   82  279  119   16  312  685  744  108  550  527  319  494  860  691  569
[704]   42  301  571  621  536  718  531  910  967  699    1   86  253  480  922  604  296  668  577
[723]  870  904  619  128  971  981  684  440  582   28  157  704  523  820  638  672  755  250  389
[742]  438  384  785  614  802  983  874  858  592  257   60  766  265  966  791  933  194  162  607
[761]  631  985  140  554  879  320   71  838  780  890  285  615  659  566  204   17  329   72  873
[780]  220  551   14  801  918  549  199  504  613    9  809  989  287  943  835  770  206  110  517
[799]  127  114  763   98  142  223  595  711  732  474  991  511  316  387  817  734  882  241  947
[818]  219  465  669  179  707  568  207   56  410  986   27   92  676  209  492  737  828  138  280
[837]   43  626  897  750  650  978  673  683  984  264  575  609  323   63  807  141  804  544   87
[856]  992  490  381  342   99  112  269  268  931  213  408  427  373  151  339  671  478  412  948
[875]  776  351  136  349  662  226  553  251   52  340  149  485  789  260  359  603  262  900  518
[894]  418  924   48  120  335  332  525
credit_train <- credit[indx,]
credit_test <- credit[-indx,]
credit_train_labels <- credit[indx,17]
credit_test_labels <- credit[-indx,17]   
# Check if there are any missing values
library(Amelia)
missmap(credit, main = "Missing values vs observed")

# number of missing values in each column
sapply(credit,function(x) sum(is.na(x)))
    checking_balance months_loan_duration       credit_history              purpose 
                   0                    0                    0                    0 
              amount      savings_balance  employment_duration    percent_of_income 
                   0                    0                    0                    0 
  years_at_residence                  age         other_credit              housing 
                   0                    0                    0                    0 
existing_loans_count                  job           dependents                phone 
                   0                    0                    0                    0 
             default 
                   0 
# number of unique values in each column
sapply(credit, function(x) length(unique(x)))
    checking_balance months_loan_duration       credit_history              purpose 
                   4                   33                    5                    6 
              amount      savings_balance  employment_duration    percent_of_income 
                 921                    5                    5                    4 
  years_at_residence                  age         other_credit              housing 
                   4                   53                    3                    3 
existing_loans_count                  job           dependents                phone 
                   4                    4                    2                    2 
             default 
                   2 
# fit the logistic regression model, with all predictor variables
creditModel <- glm(default ~., family=binomial(link='logit'), data = credit_train)
creditModel

Call:  glm(formula = default ~ ., family = binomial(link = "logit"), 
    data = credit_train)

Coefficients:
                   (Intercept)        checking_balance> 200 DM      checking_balance1 - 200 DM  
                    -1.454e+00                      -9.034e-01                      -3.243e-01  
       checking_balanceunknown            months_loan_duration              credit_historygood  
                    -1.755e+00                       3.155e-02                       8.531e-01  
         credit_historyperfect              credit_historypoor         credit_historyvery good  
                     1.448e+00                       7.433e-01                       1.333e+00  
                    purposecar                     purposecar0                purposeeducation  
                     2.569e-01                      -4.931e-01                       7.098e-01  
   purposefurniture/appliances              purposerenovations                          amount  
                    -1.334e-01                       3.455e-01                       8.337e-05  
      savings_balance> 1000 DM     savings_balance100 - 500 DM    savings_balance500 - 1000 DM  
                    -1.031e+00                      -3.843e-01                      -5.040e-01  
        savings_balanceunknown    employment_duration> 7 years  employment_duration1 - 4 years  
                    -9.277e-01                      -4.541e-01                      -3.002e-01  
employment_duration4 - 7 years   employment_durationunemployed               percent_of_income  
                    -9.441e-01                      -2.707e-01                       2.687e-01  
            years_at_residence                             age                other_creditnone  
                     6.473e-02                      -1.789e-02                      -6.555e-01  
             other_creditstore                      housingown                     housingrent  
                    -1.690e-01                      -8.252e-02                       3.832e-01  
          existing_loans_count                      jobskilled                   jobunemployed  
                     2.134e-01                       2.393e-02                       5.874e-01  
                  jobunskilled                      dependents                        phoneyes  
                    -2.497e-02                       3.753e-02                      -1.665e-01  

Degrees of Freedom: 899 Total (i.e. Null);  864 Residual
Null Deviance:      1094 
Residual Deviance: 843.3    AIC: 915.3
summary(creditModel)

Call:
glm(formula = default ~ ., family = binomial(link = "logit"), 
    data = credit_train)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9276  -0.7444  -0.4016   0.7978   2.6275  

Coefficients:
                                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)                    -1.454e+00  9.435e-01  -1.541 0.123347    
checking_balance> 200 DM       -9.034e-01  3.879e-01  -2.329 0.019850 *  
checking_balance1 - 200 DM     -3.243e-01  2.143e-01  -1.513 0.130271    
checking_balanceunknown        -1.755e+00  2.385e-01  -7.360 1.83e-13 ***
months_loan_duration            3.155e-02  9.420e-03   3.349 0.000810 ***
credit_historygood              8.531e-01  2.676e-01   3.187 0.001436 ** 
credit_historyperfect           1.448e+00  4.415e-01   3.279 0.001041 ** 
credit_historypoor              7.433e-01  3.381e-01   2.198 0.027938 *  
credit_historyvery good         1.333e+00  4.470e-01   2.982 0.002862 ** 
purposecar                      2.569e-01  3.254e-01   0.789 0.429914    
purposecar0                    -4.931e-01  7.962e-01  -0.619 0.535710    
purposeeducation                7.098e-01  4.485e-01   1.583 0.113481    
purposefurniture/appliances    -1.334e-01  3.181e-01  -0.419 0.674910    
purposerenovations              3.455e-01  6.391e-01   0.541 0.588808    
amount                          8.337e-05  4.306e-05   1.936 0.052862 .  
savings_balance> 1000 DM       -1.031e+00  5.123e-01  -2.013 0.044163 *  
savings_balance100 - 500 DM    -3.843e-01  2.924e-01  -1.314 0.188682    
savings_balance500 - 1000 DM   -5.040e-01  4.403e-01  -1.145 0.252411    
savings_balanceunknown         -9.277e-01  2.667e-01  -3.479 0.000504 ***
employment_duration> 7 years   -4.541e-01  2.939e-01  -1.545 0.122362    
employment_duration1 - 4 years -3.002e-01  2.401e-01  -1.250 0.211215    
employment_duration4 - 7 years -9.441e-01  3.022e-01  -3.124 0.001785 ** 
employment_durationunemployed  -2.707e-01  4.444e-01  -0.609 0.542348    
percent_of_income               2.687e-01  8.813e-02   3.049 0.002297 ** 
years_at_residence              6.473e-02  8.821e-02   0.734 0.463039    
age                            -1.789e-02  9.512e-03  -1.880 0.060041 .  
other_creditnone               -6.555e-01  2.437e-01  -2.690 0.007147 ** 
other_creditstore              -1.690e-01  4.204e-01  -0.402 0.687721    
housingown                     -8.252e-02  3.044e-01  -0.271 0.786323    
housingrent                     3.832e-01  3.485e-01   1.099 0.271574    
existing_loans_count            2.134e-01  1.948e-01   1.096 0.273239    
jobskilled                      2.393e-02  2.880e-01   0.083 0.933779    
jobunemployed                   5.874e-01  6.897e-01   0.852 0.394418    
jobunskilled                   -2.497e-02  3.486e-01  -0.072 0.942895    
dependents                      3.753e-02  2.453e-01   0.153 0.878412    
phoneyes                       -1.665e-01  2.029e-01  -0.820 0.412037    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1094.42  on 899  degrees of freedom
Residual deviance:  843.33  on 864  degrees of freedom
AIC: 915.33

Number of Fisher Scoring iterations: 5
anova(creditModel, test="Chisq")
Analysis of Deviance Table

Model: binomial, link: logit

Response: default

Terms added sequentially (first to last)

                     Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
NULL                                   899    1094.42              
checking_balance      3  123.045       896     971.38 < 2.2e-16 ***
months_loan_duration  1   38.101       895     933.28 6.718e-10 ***
credit_history        4   25.481       891     907.80 4.025e-05 ***
purpose               5    6.305       886     901.49  0.277638    
amount                1    0.096       885     901.40  0.757098    
savings_balance       4   17.311       881     884.08  0.001681 ** 
employment_duration   4   11.705       877     872.38  0.019686 *  
percent_of_income     1    7.343       876     865.04  0.006734 ** 
years_at_residence    1    0.896       875     864.14  0.343989    
age                   1    5.475       874     858.67  0.019295 *  
other_credit          2    7.964       872     850.70  0.018647 *  
housing               2    4.324       870     846.38  0.115105    
existing_loans_count  1    1.374       869     845.01  0.241169    
job                   3    0.972       866     844.03  0.807968    
dependents            1    0.024       865     844.01  0.876113    
phone                 1    0.676       864     843.33  0.411137    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# drop the insignificant predictors, alpha = 0.10
creditModel <- glm(default ~ checking_balance + months_loan_duration + credit_history +  percent_of_income + age, family = binomial(link='logit'), data = credit_train)
creditModel

Call:  glm(formula = default ~ checking_balance + months_loan_duration + 
    credit_history + percent_of_income + age, family = binomial(link = "logit"), 
    data = credit_train)

Coefficients:
               (Intercept)    checking_balance> 200 DM  checking_balance1 - 200 DM  
                  -1.23454                    -1.07359                    -0.47676  
   checking_balanceunknown        months_loan_duration          credit_historygood  
                  -1.97625                     0.03556                     0.57723  
     credit_historyperfect          credit_historypoor     credit_historyvery good  
                   1.61432                     0.67220                     1.35226  
         percent_of_income                         age  
                   0.16530                    -0.01822  

Degrees of Freedom: 899 Total (i.e. Null);  889 Residual
Null Deviance:      1094 
Residual Deviance: 897.9    AIC: 919.9
summary(creditModel)

Call:
glm(formula = default ~ checking_balance + months_loan_duration + 
    credit_history + percent_of_income + age, family = binomial(link = "logit"), 
    data = credit_train)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8279  -0.8071  -0.4582   0.8897   2.4312  

Coefficients:
                            Estimate Std. Error z value Pr(>|z|)    
(Intercept)                -1.234544   0.436468  -2.828 0.004677 ** 
checking_balance> 200 DM   -1.073590   0.364743  -2.943 0.003246 ** 
checking_balance1 - 200 DM -0.476760   0.195911  -2.434 0.014952 *  
checking_balanceunknown    -1.976250   0.223510  -8.842  < 2e-16 ***
months_loan_duration        0.035558   0.006789   5.237 1.63e-07 ***
credit_historygood          0.577230   0.210606   2.741 0.006129 ** 
credit_historyperfect       1.614321   0.415943   3.881 0.000104 ***
credit_historypoor          0.672202   0.318760   2.109 0.034962 *  
credit_historyvery good     1.352261   0.392121   3.449 0.000564 ***
percent_of_income           0.165302   0.074969   2.205 0.027458 *  
age                        -0.018222   0.007776  -2.343 0.019107 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1094.42  on 899  degrees of freedom
Residual deviance:  897.87  on 889  degrees of freedom
AIC: 919.87

Number of Fisher Scoring iterations: 5
anova(creditModel, test="Chisq")
Analysis of Deviance Table

Model: binomial, link: logit

Response: default

Terms added sequentially (first to last)

                     Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
NULL                                   899    1094.42              
checking_balance      3  123.045       896     971.38 < 2.2e-16 ***
months_loan_duration  1   38.101       895     933.28 6.718e-10 ***
credit_history        4   25.481       891     907.80 4.025e-05 ***
percent_of_income     1    4.253       890     903.54   0.03917 *  
age                   1    5.676       889     897.87   0.01719 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# check Accuracy
fitted.results <- predict(creditModel, newdata = credit_test, type = 'response')
fitted.results <- ifelse(fitted.results > 0.5,1,0)
misClasificError <- mean(fitted.results != credit_test$default)
print(paste('Accuracy',1-misClasificError))
[1] "Accuracy 0.73"

Credit Data – ROC

# ROC
library(ROCR)
p <- predict(creditModel, newdata=credit_test, type="response")
pr <- prediction(p, credit_test$default)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)

auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
[1] 0.7195839
LS0tDQp0aXRsZTogIkxvZ2lzdGljIHJlZ3Jlc3Npb24gLS0gWm5nYSINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQoNCiMjIEZpcnN0IHJ1biBvZiBsb2dpdCBtb2RlbCAtLSBabmdhDQpgYGB7cn0NCiMjIFJlYWQgRGF0YSAoU3RlcCAyKQ0Kem5nYSA8LSByZWFkLmNzdigiV2Vic2l0ZV9WSVBfVXNlcl9kYXRhXzEwMDAwLmNzdiIsIHN0cmluZ3NBc0ZhY3RvcnMgPSBGQUxTRSkNCg0KIyBMb2dpc2l0aWMgcmVncmVzc2lvbg0KDQojIERyb3AgSUQsIGFuZCBwcm9kdWN0X2xpa2VfcmF0ZSBjb2x1bW5zDQp6bmdhIDwtIHpuZ2FbLTFdDQp6bmdhIDwtIHpuZ2FbLThdDQoNCiMgU2V0IHVwIHRyYWlubmluZyBhbmQgdGVzdCBkYXRhIHNldHMNCmluZHggPC0gc2FtcGxlKDE6bnJvdyh6bmdhKSwgYXMuaW50ZWdlcigwLjkqbnJvdyh6bmdhKSkpDQppbmR4DQoNCnpuZ2FfdHJhaW4gPC0gem5nYVtpbmR4LF0NCnpuZ2FfdGVzdCA8LSB6bmdhWy1pbmR4LF0NCg0Kem5nYV90cmFpbl9sYWJlbHMgPC0gem5nYVtpbmR4LDFdDQp6bmdhX3Rlc3RfbGFiZWxzIDwtIHpuZ2FbLWluZHgsMV0gICANCg0KIyBDaGVjayBpZiB0aGVyZSBhcmUgYW55IG1pc3NpbmcgdmFsdWVzDQojIGluc3RhbGwucGFja2FnZXMoJ0FtZWxpYScpDQpsaWJyYXJ5KEFtZWxpYSkNCm1pc3NtYXAoem5nYSwgbWFpbiA9ICJNaXNzaW5nIHZhbHVlcyB2cyBvYnNlcnZlZCIpDQoNCiMgbnVtYmVyIG9mIG1pc3NpbmcgdmFsdWVzIGluIGVhY2ggY29sdW1uDQoNCnNhcHBseSh6bmdhLCBmdW5jdGlvbih4KSBzdW0oaXMubmEoeCkpKQ0KDQojIG51bWJlciBvZiB1bmlxdWUgdmFsdWVzIGluIGVhY2ggY29sdW1uDQoNCnNhcHBseSh6bmdhLCBmdW5jdGlvbih4KSBsZW5ndGgodW5pcXVlKHgpKSkNCg0KIyBmaXQgdGhlIGxvZ2lzdGljIHJlZ3Jlc3Npb24gbW9kZWwsIHdpdGggYWxsIHByZWRpY3RvciB2YXJpYWJsZXMNCg0KbW9kZWwgPC0gZ2xtKElzVklQXzUwMCB+LixmYW1pbHk9Ymlub21pYWwobGluaz0nbG9naXQnKSxkYXRhPXpuZ2FfdHJhaW4pDQoNCnN1bW1hcnkobW9kZWwpDQphbm92YShtb2RlbCkNCmBgYA0KDQojIyBEcm9wcGluZyBpbnNpZ25pZmljYW50IHRlcm1zIC0tIFpuZ2ENCmBgYHtyfQ0KIyBkcm9wIHRoZSBpbnNpZ25pZmljYW50IHByZWRpY3RvcnMsIGFscGhhID0gMC4xMA0KDQptb2RlbCA8LSBnbG0oSXNWSVBfNTAwIH4gcGF5bWVudF83X2RheStkYXVfZGF5cytkYXlzX2JldHdlZW5faW5zdGFsbF9maXJzdF9wYXkrdG90YWxfdHhuc183X2RheSt0b3RhbF9wYWdlX3ZpZXdzLCBmYW1pbHkgPSBiaW5vbWlhbChsaW5rPSdsb2dpdCcpLCBkYXRhID0gem5nYV90cmFpbikNCg0Kc3VtbWFyeShtb2RlbCkNCg0KYW5vdmEobW9kZWwsIHRlc3Q9IkNoaXNxIikNCmBgYA0KDQojIyBDaGVjayBBY2N1cmFjeSAtLSBabmdhDQpgYGB7cn0NCmZpdHRlZC5yZXN1bHRzIDwtIHByZWRpY3QobW9kZWwsbmV3ZGF0YT16bmdhX3Rlc3QsdHlwZT0ncmVzcG9uc2UnKQ0KZml0dGVkLnJlc3VsdHMgPC0gaWZlbHNlKGZpdHRlZC5yZXN1bHRzID4gMC41LDEsMCkNCg0KbWlzQ2xhc2lmaWNFcnJvciA8LSBtZWFuKGZpdHRlZC5yZXN1bHRzICE9IHpuZ2FfdGVzdCRJc1ZJUF81MDApDQpwcmludChwYXN0ZSgnQWNjdXJhY3knLDEtbWlzQ2xhc2lmaWNFcnJvcikpDQpgYGANCg0KIyMgUk9DIC0tIFRlbXBsYXRlIGNvZGUNCmBgYHtyfQ0KIyBST0MNCiMgQmVjYXVzZSB0aGlzIGRhdGEgc2V0IGlzIHNvIHNtYWxsLCBpdCBpcyBwb3NzaWJsZSB0aGF0IHRoZSB0ZXN0IGRhdGEgc2V0DQojIGRvZXMgbm90IGNvbnRhaW4gYm90aCAwIGFuZCAxIHZhbHVlcy4gIElmIHRoaXMgaGFwcGVucyB0aGUgY29kZSB3aWxsIG5vdA0KIyBydW4uICBBbmQgc2luY2UgdGhlIHRlc3QgZGF0YSBzZXQgaXMgc28gc21hbGwgdGhlIFJPQyBpcyBub3QgdXNlZnVsIGhlcmUNCiMgYnV0IHRoZSBjb2RlIGlzIHByb3ZpZGVkLg0KIyBpbnN0YWxsLnBhY2thZ2VzKCdST0NSJykNCiMgbGlicmFyeShST0NSKQ0KIyBwIDwtIHByZWRpY3QobW9kZWwsIG5ld2RhdGE9bGF1bmNoX3Rlc3QsIHR5cGU9InJlc3BvbnNlIikNCiMgcHIgPC0gcHJlZGljdGlvbihwLCBsYXVuY2hfdGVzdCRkaXN0cmVzc19jdCkNCiMgcHJmIDwtIHBlcmZvcm1hbmNlKHByLCBtZWFzdXJlID0gInRwciIsIHgubWVhc3VyZSA9ICJmcHIiKQ0KIyBwbG90KHByZikNCiMgDQojIGF1YyA8LSBwZXJmb3JtYW5jZShwciwgbWVhc3VyZSA9ICJhdWMiKQ0KIyBhdWMgPC0gYXVjQHkudmFsdWVzW1sxXV0NCiMgYXVjDQpgYGANCg0KIyMgUk9DIEFjY3VyYWN5IC0tIFpuZ2ENCmBgYHtyfQ0KIyBST0MNCmxpYnJhcnkoUk9DUikNCnAgPC0gcHJlZGljdChtb2RlbCwgbmV3ZGF0YT16bmdhX3Rlc3QsIHR5cGU9InJlc3BvbnNlIikNCnByIDwtIHByZWRpY3Rpb24ocCwgem5nYV90ZXN0JElzVklQXzUwMCkNCnByZiA8LSBwZXJmb3JtYW5jZShwciwgbWVhc3VyZSA9ICJ0cHIiLCB4Lm1lYXN1cmUgPSAiZnByIikNCnBsb3QocHJmKQ0KDQphdWMgPC0gcGVyZm9ybWFuY2UocHIsIG1lYXN1cmUgPSAiYXVjIikNCmF1YyA8LSBhdWNAeS52YWx1ZXNbWzFdXQ0KYXVjDQpgYGANCg0KDQojIyBBbiBFeGFtcGxlOiBDcmVkaXQgRGF0YSAtLSBMb2dpdA0KYGBge3J9DQpjcmVkaXQgPC0gcmVhZC5jc3YoImh0dHA6Ly93d3cuc2NpLmNzdWVhc3RiYXkuZWR1L35lc3Vlc3MvY2xhc3Nlcy9TdGF0aXN0aWNzXzY2MjAvUHJlc2VudGF0aW9ucy9tbDcvY3JlZGl0LmNzdiIpDQoNCiMjIEZpeCB0aGUgZGVmYXVsdCB2YXJpYWJsZSB0byBiZSAwIG9yIDENCmNyZWRpdCRkZWZhdWx0IDwtIGFzLm51bWVyaWMoY3JlZGl0JGRlZmF1bHQpICMgRG9uJ3QgbmVlZCB0aGlzIGZvciB6bmdhDQpjcmVkaXQkZGVmYXVsdCA8LSBjcmVkaXQkZGVmYXVsdCAtIDEgICAgICAgICAjIERvbid0IG5lZWQgdGhpcyBmb3Igem5nYQ0KDQojIGV4YW1pbmUgdGhlIGxhdW5jaCBkYXRhDQpzdHIoY3JlZGl0KQ0KDQojIGxvZ2lzaXRpYyByZWdyZXNzaW9uDQoNCiMgc2V0IHVwIHRyYWlubmluZyBhbmQgdGVzdCBkYXRhIHNldHMNCg0KaW5keCA8LSBzYW1wbGUoMTpucm93KGNyZWRpdCksIGFzLmludGVnZXIoMC45Km5yb3coY3JlZGl0KSkpDQppbmR4DQoNCmNyZWRpdF90cmFpbiA8LSBjcmVkaXRbaW5keCxdDQpjcmVkaXRfdGVzdCA8LSBjcmVkaXRbLWluZHgsXQ0KDQpjcmVkaXRfdHJhaW5fbGFiZWxzIDwtIGNyZWRpdFtpbmR4LDE3XQ0KY3JlZGl0X3Rlc3RfbGFiZWxzIDwtIGNyZWRpdFstaW5keCwxN10gICANCg0KIyBDaGVjayBpZiB0aGVyZSBhcmUgYW55IG1pc3NpbmcgdmFsdWVzDQoNCmxpYnJhcnkoQW1lbGlhKQ0KbWlzc21hcChjcmVkaXQsIG1haW4gPSAiTWlzc2luZyB2YWx1ZXMgdnMgb2JzZXJ2ZWQiKQ0KDQojIG51bWJlciBvZiBtaXNzaW5nIHZhbHVlcyBpbiBlYWNoIGNvbHVtbg0KDQpzYXBwbHkoY3JlZGl0LGZ1bmN0aW9uKHgpIHN1bShpcy5uYSh4KSkpDQoNCiMgbnVtYmVyIG9mIHVuaXF1ZSB2YWx1ZXMgaW4gZWFjaCBjb2x1bW4NCg0Kc2FwcGx5KGNyZWRpdCwgZnVuY3Rpb24oeCkgbGVuZ3RoKHVuaXF1ZSh4KSkpDQoNCiMgZml0IHRoZSBsb2dpc3RpYyByZWdyZXNzaW9uIG1vZGVsLCB3aXRoIGFsbCBwcmVkaWN0b3IgdmFyaWFibGVzDQoNCmNyZWRpdE1vZGVsIDwtIGdsbShkZWZhdWx0IH4uLCBmYW1pbHk9Ymlub21pYWwobGluaz0nbG9naXQnKSwgZGF0YSA9IGNyZWRpdF90cmFpbikNCmNyZWRpdE1vZGVsDQoNCnN1bW1hcnkoY3JlZGl0TW9kZWwpDQoNCmFub3ZhKGNyZWRpdE1vZGVsLCB0ZXN0PSJDaGlzcSIpDQoNCiMgZHJvcCB0aGUgaW5zaWduaWZpY2FudCBwcmVkaWN0b3JzLCBhbHBoYSA9IDAuMTANCg0KY3JlZGl0TW9kZWwgPC0gZ2xtKGRlZmF1bHQgfiBjaGVja2luZ19iYWxhbmNlICsgbW9udGhzX2xvYW5fZHVyYXRpb24gKyBjcmVkaXRfaGlzdG9yeSArICBwZXJjZW50X29mX2luY29tZSArIGFnZSwgZmFtaWx5ID0gYmlub21pYWwobGluaz0nbG9naXQnKSwgZGF0YSA9IGNyZWRpdF90cmFpbikNCmNyZWRpdE1vZGVsDQoNCnN1bW1hcnkoY3JlZGl0TW9kZWwpDQoNCmFub3ZhKGNyZWRpdE1vZGVsLCB0ZXN0PSJDaGlzcSIpDQoNCiMgY2hlY2sgQWNjdXJhY3kNCg0KZml0dGVkLnJlc3VsdHMgPC0gcHJlZGljdChjcmVkaXRNb2RlbCwgbmV3ZGF0YSA9IGNyZWRpdF90ZXN0LCB0eXBlID0gJ3Jlc3BvbnNlJykNCmZpdHRlZC5yZXN1bHRzIDwtIGlmZWxzZShmaXR0ZWQucmVzdWx0cyA+IDAuNSwxLDApDQoNCm1pc0NsYXNpZmljRXJyb3IgPC0gbWVhbihmaXR0ZWQucmVzdWx0cyAhPSBjcmVkaXRfdGVzdCRkZWZhdWx0KQ0KcHJpbnQocGFzdGUoJ0FjY3VyYWN5JywxLW1pc0NsYXNpZmljRXJyb3IpKQ0KYGBgDQoNCiMjIENyZWRpdCBEYXRhIC0tIFJPQw0KYGBge3J9DQojIFJPQw0KbGlicmFyeShST0NSKQ0KcCA8LSBwcmVkaWN0KGNyZWRpdE1vZGVsLCBuZXdkYXRhPWNyZWRpdF90ZXN0LCB0eXBlPSJyZXNwb25zZSIpDQpwciA8LSBwcmVkaWN0aW9uKHAsIGNyZWRpdF90ZXN0JGRlZmF1bHQpDQpwcmYgPC0gcGVyZm9ybWFuY2UocHIsIG1lYXN1cmUgPSAidHByIiwgeC5tZWFzdXJlID0gImZwciIpDQpwbG90KHByZikNCg0KYXVjIDwtIHBlcmZvcm1hbmNlKHByLCBtZWFzdXJlID0gImF1YyIpDQphdWMgPC0gYXVjQHkudmFsdWVzW1sxXV0NCmF1Yw0KYGBgDQoNCg==