##### CSUEB STATS 6620 - Machine Learning with R - Spring 2017 -
##### Gui Larangeira - HW 6
##### Chapter 6: Regression Methods -------------------

## Logistic Regression ------------------
## Example: Space Shuttle Launch Data ----

# We load the data directly from the class website. This is a small data set, containing only 23 rows:

launch <- read.csv("http://www.sci.csueastbay.edu/~esuess/classes/Statistics_6620/Presentations/ml10/challenger.csv")

# First recode the distress_ct variable into 0 and 1, with 1 representing at least one distress event during a launch.

launch$distress_ct = ifelse(launch$distress_ct<1,0,1)
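
# A quick sanity check that the recoding worked: distress_ct should now contain only 0s and 1s.

table(launch$distress_ct)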

# Set up training and test data sets

indx = sample(1:nrow(launch), as.integer(0.9*nrow(launch)))

launch_train = launch[indx,]
launch_test = launch[-indx,]

launch_train_labels = launch[indx,1]
launch_test_labels = launch[-indx,1]   
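
# Note that sample() is not seeded, so the split (and every result below) changes
# from run to run; with only 23 rows, a 90/10 split also leaves just 3 launches in
# the test set. A quick check of the split sizes:

nrow(launch_train)
nrow(launch_test)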

# Check for missing values and the number of unique values in each column

sapply(launch,function(x) sum(is.na(x)))
##          distress_ct          temperature field_check_pressure 
##                    0                    0                    0 
##           flight_num 
##                    0
sapply(launch, function(x) length(unique(x)))
##          distress_ct          temperature field_check_pressure 
##                    2                   16                    3 
##           flight_num 
##                   23
# Fit the logistic regression model, with all predictor variables

model <- glm(distress_ct ~.,family=binomial(link='logit'),data=launch_train)

summary(model)
## 
## Call:
## glm(formula = distress_ct ~ ., family = binomial(link = "logit"), 
##     data = launch_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3499  -0.4376   0.0000   0.2248   1.7588  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)  
## (Intercept)          -1.891e+01  1.039e+04  -0.002   0.9985  
## temperature          -2.534e-01  1.505e-01  -1.683   0.0924 .
## field_check_pressure  1.774e-01  5.195e+01   0.003   0.9973  
## flight_num            5.382e-02  2.162e-01   0.249   0.8034  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 24.435  on 19  degrees of freedom
## Residual deviance: 10.854  on 16  degrees of freedom
## AIC: 18.854
## 
## Number of Fisher Scoring iterations: 19
anova(model, test="Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: distress_ct
## 
## Terms added sequentially (first to last)
## 
## 
##                      Df Deviance Resid. Df Resid. Dev Pr(>Chi)   
## NULL                                    19     24.435            
## temperature           1   7.7571        18     16.677  0.00535 **
## field_check_pressure  1   5.7575        17     10.920  0.01642 * 
## flight_num            1   0.0660        16     10.854  0.79726   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
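# The Pr(>Chi) values above are chi-square tests on the drop in deviance as each
# term is added. For example, the temperature line can be reproduced from the null
# and residual deviances (values taken from the table above, so they depend on this
# particular training split):

pchisq(24.435 - 16.677, df = 1, lower.tail = FALSE)  # ~0.0054, matching 0.00535 up to rounding
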
# In the coefficient table, temperature is the only predictor significant at the
# 0.10 level (the enormous standard errors and 19 Fisher scoring iterations also
# suggest the full model is unstable on so few rows), so we drop the other
# predictors and refit with temperature alone.

model <- glm(distress_ct ~ temperature,family=binomial(link='logit'),data=launch_train)

summary(model)
## 
## Call:
## glm(formula = distress_ct ~ temperature, family = binomial(link = "logit"), 
##     data = launch_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9955  -0.7525  -0.3484   0.4054   2.3324  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  15.7453     7.9247   1.987   0.0469 *
## temperature  -0.2453     0.1176  -2.086   0.0370 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 24.435  on 19  degrees of freedom
## Residual deviance: 16.677  on 18  degrees of freedom
## AIC: 20.677
## 
## Number of Fisher Scoring iterations: 5
anova(model, test="Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: distress_ct
## 
## Terms added sequentially (first to last)
## 
## 
##             Df Deviance Resid. Df Resid. Dev Pr(>Chi)   
## NULL                           19     24.435            
## temperature  1   7.7571        18     16.677  0.00535 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
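# The refit model can be used to estimate the probability of distress at any
# temperature. As an illustration, at 31 F (well below the temperatures in the
# data; the exact value depends on the coefficients from this training split):

predict(model, newdata = data.frame(temperature = 31), type = 'response')
1 / (1 + exp(-(15.7453 - 0.2453 * 31)))  # same calculation by hand, roughly 0.9997
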
# Check Accuracy

fitted.results <- predict(model,newdata=launch_test,type='response')
fitted.results <- ifelse(fitted.results > 0.5,1,0)

accuracy <- mean(fitted.results == launch_test$distress_ct)
print(paste('Accuracy', accuracy))
## [1] "Accuracy 0.666666666666667"
## Example 2: Credit Data ----

credit <- read.csv("http://www.sci.csueastbay.edu/~esuess/classes/Statistics_6620/Presentations/ml7/credit.csv")

# Recode the default variable to be 0 or 1

credit$default = as.numeric(credit$default)
credit$default = credit$default - 1
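
# Note: as.numeric() only yields 1/2 here because read.csv() turns default into a
# factor (the default behavior before R 4.0). An equivalent recoding that does not
# rely on factor coding, assuming the column holds "no"/"yes", would be:
# credit$default <- ifelse(credit$default == "yes", 1, 0)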

# Set up training and test data sets

indx = sample(1:nrow(credit), as.integer(0.9*nrow(credit)))

credit_train = credit[indx,]
credit_test = credit[-indx,]

credit_train_labels = credit[indx,17]
credit_test_labels = credit[-indx,17]   
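
# Sanity check on the split: the proportion of defaults should be roughly similar
# in the training and test sets (exact values depend on the random split).

prop.table(table(credit_train$default))
prop.table(table(credit_test$default))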

# Number of missing values in each column

sapply(credit,function(x) sum(is.na(x)))
##     checking_balance months_loan_duration       credit_history 
##                    0                    0                    0 
##              purpose               amount      savings_balance 
##                    0                    0                    0 
##  employment_duration    percent_of_income   years_at_residence 
##                    0                    0                    0 
##                  age         other_credit              housing 
##                    0                    0                    0 
## existing_loans_count                  job           dependents 
##                    0                    0                    0 
##                phone              default 
##                    0                    0
# Fit the logistic regression model, with all predictor variables

model_credit <- glm(default ~.,family=binomial(link='logit'),data=credit_train)

summary(model_credit)
## 
## Call:
## glm(formula = default ~ ., family = binomial(link = "logit"), 
##     data = credit_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9747  -0.7458  -0.3903   0.7518   2.4554  
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                    -1.893e+00  9.644e-01  -1.963 0.049688 *  
## checking_balance> 200 DM       -9.790e-01  3.860e-01  -2.536 0.011210 *  
## checking_balance1 - 200 DM     -4.314e-01  2.177e-01  -1.982 0.047531 *  
## checking_balanceunknown        -1.889e+00  2.415e-01  -7.820 5.27e-15 ***
## months_loan_duration            2.889e-02  9.510e-03   3.038 0.002383 ** 
## credit_historygood              8.894e-01  2.665e-01   3.338 0.000844 ***
## credit_historyperfect           1.194e+00  4.532e-01   2.635 0.008411 ** 
## credit_historypoor              9.367e-01  3.482e-01   2.691 0.007134 ** 
## credit_historyvery good         1.670e+00  4.452e-01   3.750 0.000177 ***
## purposecar                      3.582e-01  3.300e-01   1.086 0.277674    
## purposecar0                    -6.434e-01  7.703e-01  -0.835 0.403559    
## purposeeducation                8.187e-01  4.531e-01   1.807 0.070765 .  
## purposefurniture/appliances    -1.680e-02  3.256e-01  -0.052 0.958858    
## purposerenovations              6.122e-01  6.170e-01   0.992 0.321129    
## amount                          1.165e-04  4.487e-05   2.597 0.009401 ** 
## savings_balance> 1000 DM       -1.136e+00  5.083e-01  -2.234 0.025458 *  
## savings_balance100 - 500 DM    -1.266e-01  2.795e-01  -0.453 0.650499    
## savings_balance500 - 1000 DM   -5.383e-01  4.497e-01  -1.197 0.231367    
## savings_balanceunknown         -9.229e-01  2.693e-01  -3.428 0.000609 ***
## employment_duration> 7 years   -4.667e-01  2.993e-01  -1.559 0.118957    
## employment_duration1 - 4 years -1.580e-01  2.430e-01  -0.650 0.515396    
## employment_duration4 - 7 years -1.077e+00  3.094e-01  -3.483 0.000496 ***
## employment_durationunemployed  -1.980e-01  4.394e-01  -0.451 0.652213    
## percent_of_income               2.679e-01  8.942e-02   2.996 0.002733 ** 
## years_at_residence             -5.095e-03  8.983e-02  -0.057 0.954766    
## age                            -9.620e-03  9.477e-03  -1.015 0.310041    
## other_creditnone               -3.480e-01  2.467e-01  -1.410 0.158398    
## other_creditstore               1.275e-01  4.353e-01   0.293 0.769674    
## housingown                     -6.244e-02  3.030e-01  -0.206 0.836710    
## housingrent                     4.213e-01  3.490e-01   1.207 0.227283    
## existing_loans_count            4.000e-01  1.940e-01   2.061 0.039282 *  
## jobskilled                      6.992e-02  2.919e-01   0.240 0.810700    
## jobunemployed                   3.458e-01  6.674e-01   0.518 0.604406    
## jobunskilled                   -1.786e-01  3.579e-01  -0.499 0.617829    
## dependents                     -2.244e-01  2.531e-01  -0.887 0.375158    
## phoneyes                       -2.818e-01  2.054e-01  -1.372 0.170212    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1094.42  on 899  degrees of freedom
## Residual deviance:  836.56  on 864  degrees of freedom
## AIC: 908.56
## 
## Number of Fisher Scoring iterations: 5
# Refit a reduced model, keeping only a subset of the most significant predictors
# from the full model: checking_balance, months_loan_duration, credit_history, and
# percent_of_income.

model_creditB <- glm(default ~ checking_balance + months_loan_duration + credit_history +  percent_of_income, family=binomial(link='logit'),data=credit_train)

summary(model_creditB)
## 
## Call:
## glm(formula = default ~ checking_balance + months_loan_duration + 
##     credit_history + percent_of_income, family = binomial(link = "logit"), 
##     data = credit_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8197  -0.8157  -0.4529   0.9153   2.4254  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -1.799127   0.339500  -5.299 1.16e-07 ***
## checking_balance> 200 DM   -1.103901   0.362474  -3.045 0.002323 ** 
## checking_balance1 - 200 DM -0.500446   0.195874  -2.555 0.010621 *  
## checking_balanceunknown    -2.012779   0.222995  -9.026  < 2e-16 ***
## months_loan_duration        0.036092   0.006699   5.387 7.15e-08 ***
## credit_historygood          0.602723   0.205460   2.934 0.003351 ** 
## credit_historyperfect       1.458346   0.423398   3.444 0.000572 ***
## credit_historypoor          0.823259   0.323276   2.547 0.010877 *  
## credit_historyvery good     1.460236   0.380989   3.833 0.000127 ***
## percent_of_income           0.137660   0.074470   1.849 0.064527 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1094.42  on 899  degrees of freedom
## Residual deviance:  902.45  on 890  degrees of freedom
## AIC: 922.45
## 
## Number of Fisher Scoring iterations: 5
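# The reduced model can be compared against the full model with a likelihood ratio
# (chi-square) test on the difference in deviance, the same idea used by the
# anova() calls above:

anova(model_creditB, model_credit, test = "Chisq")
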
# Check accuracy

fitted.results <- predict(model_creditB,newdata=credit_test,type='response')
fitted.results <- ifelse(fitted.results > 0.5,1,0)

accuracy <- mean(fitted.results == credit_test$default)
print(paste('Accuracy', accuracy))
## [1] "Accuracy 0.7"