##### CSUEB STATS 6620 - Machine Learning with R - Spring 2017 -
##### Gui Larangeira - HW 6
##### Chapter 6: Regression Methods -------------------
## Logisitic Regression ------------------
## Example: Space Shuttle Launch Data ----
# We load the data directly from the class website. This is a small data set, containing only 23 rows:
launch <- read.csv("http://www.sci.csueastbay.edu/~esuess/classes/Statistics_6620/Presentations/ml10/challenger.csv")
# First recode the distress_ct variable into 0 and 1, making 1 to represent at least one failure during a launch.
launch$distress_ct = ifelse(launch$distress_ct<1,0,1)
# Set up trainning and test data sets
indx = sample(1:nrow(launch), as.integer(0.9*nrow(launch)))
launch_train = launch[indx,]
launch_test = launch[-indx,]
launch_train_labels = launch[indx,1]
launch_test_labels = launch[-indx,1]
# Check for missing values in each column and number of unique values in each column
sapply(launch,function(x) sum(is.na(x)))
## distress_ct temperature field_check_pressure
## 0 0 0
## flight_num
## 0
sapply(launch, function(x) length(unique(x)))
## distress_ct temperature field_check_pressure
## 2 16 3
## flight_num
## 23
# Fit the logistic regression model, with all predictor variables
model <- glm(distress_ct ~.,family=binomial(link='logit'),data=launch_train)
summary(model)
##
## Call:
## glm(formula = distress_ct ~ ., family = binomial(link = "logit"),
## data = launch_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3499 -0.4376 0.0000 0.2248 1.7588
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.891e+01 1.039e+04 -0.002 0.9985
## temperature -2.534e-01 1.505e-01 -1.683 0.0924 .
## field_check_pressure 1.774e-01 5.195e+01 0.003 0.9973
## flight_num 5.382e-02 2.162e-01 0.249 0.8034
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 24.435 on 19 degrees of freedom
## Residual deviance: 10.854 on 16 degrees of freedom
## AIC: 18.854
##
## Number of Fisher Scoring iterations: 19
anova(model, test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: distress_ct
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 19 24.435
## temperature 1 7.7571 18 16.677 0.00535 **
## field_check_pressure 1 5.7575 17 10.920 0.01642 *
## flight_num 1 0.0660 16 10.854 0.79726
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# As we can see, the only significant explaining variable is the temperature. So we can drop the remaining with alpha > .10.
model <- glm(distress_ct ~ temperature,family=binomial(link='logit'),data=launch_train)
summary(model)
##
## Call:
## glm(formula = distress_ct ~ temperature, family = binomial(link = "logit"),
## data = launch_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9955 -0.7525 -0.3484 0.4054 2.3324
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 15.7453 7.9247 1.987 0.0469 *
## temperature -0.2453 0.1176 -2.086 0.0370 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 24.435 on 19 degrees of freedom
## Residual deviance: 16.677 on 18 degrees of freedom
## AIC: 20.677
##
## Number of Fisher Scoring iterations: 5
anova(model, test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: distress_ct
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 19 24.435
## temperature 1 7.7571 18 16.677 0.00535 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Check Accuracy
fitted.results <- predict(model,newdata=launch_test,type='response')
fitted.results <- ifelse(fitted.results > 0.5,1,0)
misClasificError <- mean(fitted.results == launch_test$distress_ct)
print(paste('Accuracy',misClasificError))
## [1] "Accuracy 0.666666666666667"
## Example 2: credit
credit <- read.csv("http://www.sci.csueastbay.edu/~esuess/classes/Statistics_6620/Presentations/ml7/credit.csv")
## Fix the default variable to be 0 or 1
credit$default = as.numeric(credit$default)
credit$default = credit$default - 1
# Set up trainning and test data sets
indx = sample(1:nrow(credit), as.integer(0.9*nrow(credit)))
credit_train = credit[indx,]
credit_test = credit[-indx,]
credit_train_labels = credit[indx,17]
credit_test_labels = credit[-indx,17]
# Number of missing values in each column
sapply(credit,function(x) sum(is.na(x)))
## checking_balance months_loan_duration credit_history
## 0 0 0
## purpose amount savings_balance
## 0 0 0
## employment_duration percent_of_income years_at_residence
## 0 0 0
## age other_credit housing
## 0 0 0
## existing_loans_count job dependents
## 0 0 0
## phone default
## 0 0
# fit the logistic regression model, with all predictor variables
model_credit <- glm(default ~.,family=binomial(link='logit'),data=credit_train)
summary(model_credit)
##
## Call:
## glm(formula = default ~ ., family = binomial(link = "logit"),
## data = credit_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9747 -0.7458 -0.3903 0.7518 2.4554
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.893e+00 9.644e-01 -1.963 0.049688 *
## checking_balance> 200 DM -9.790e-01 3.860e-01 -2.536 0.011210 *
## checking_balance1 - 200 DM -4.314e-01 2.177e-01 -1.982 0.047531 *
## checking_balanceunknown -1.889e+00 2.415e-01 -7.820 5.27e-15 ***
## months_loan_duration 2.889e-02 9.510e-03 3.038 0.002383 **
## credit_historygood 8.894e-01 2.665e-01 3.338 0.000844 ***
## credit_historyperfect 1.194e+00 4.532e-01 2.635 0.008411 **
## credit_historypoor 9.367e-01 3.482e-01 2.691 0.007134 **
## credit_historyvery good 1.670e+00 4.452e-01 3.750 0.000177 ***
## purposecar 3.582e-01 3.300e-01 1.086 0.277674
## purposecar0 -6.434e-01 7.703e-01 -0.835 0.403559
## purposeeducation 8.187e-01 4.531e-01 1.807 0.070765 .
## purposefurniture/appliances -1.680e-02 3.256e-01 -0.052 0.958858
## purposerenovations 6.122e-01 6.170e-01 0.992 0.321129
## amount 1.165e-04 4.487e-05 2.597 0.009401 **
## savings_balance> 1000 DM -1.136e+00 5.083e-01 -2.234 0.025458 *
## savings_balance100 - 500 DM -1.266e-01 2.795e-01 -0.453 0.650499
## savings_balance500 - 1000 DM -5.383e-01 4.497e-01 -1.197 0.231367
## savings_balanceunknown -9.229e-01 2.693e-01 -3.428 0.000609 ***
## employment_duration> 7 years -4.667e-01 2.993e-01 -1.559 0.118957
## employment_duration1 - 4 years -1.580e-01 2.430e-01 -0.650 0.515396
## employment_duration4 - 7 years -1.077e+00 3.094e-01 -3.483 0.000496 ***
## employment_durationunemployed -1.980e-01 4.394e-01 -0.451 0.652213
## percent_of_income 2.679e-01 8.942e-02 2.996 0.002733 **
## years_at_residence -5.095e-03 8.983e-02 -0.057 0.954766
## age -9.620e-03 9.477e-03 -1.015 0.310041
## other_creditnone -3.480e-01 2.467e-01 -1.410 0.158398
## other_creditstore 1.275e-01 4.353e-01 0.293 0.769674
## housingown -6.244e-02 3.030e-01 -0.206 0.836710
## housingrent 4.213e-01 3.490e-01 1.207 0.227283
## existing_loans_count 4.000e-01 1.940e-01 2.061 0.039282 *
## jobskilled 6.992e-02 2.919e-01 0.240 0.810700
## jobunemployed 3.458e-01 6.674e-01 0.518 0.604406
## jobunskilled -1.786e-01 3.579e-01 -0.499 0.617829
## dependents -2.244e-01 2.531e-01 -0.887 0.375158
## phoneyes -2.818e-01 2.054e-01 -1.372 0.170212
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1094.42 on 899 degrees of freedom
## Residual deviance: 836.56 on 864 degrees of freedom
## AIC: 908.56
##
## Number of Fisher Scoring iterations: 5
# Drop the insignificant predictors, considering alpha (significance) = ***
model_creditB <- glm(default ~ checking_balance + months_loan_duration + credit_history + percent_of_income, family=binomial(link='logit'),data=credit_train)
summary(model_creditB)
##
## Call:
## glm(formula = default ~ checking_balance + months_loan_duration +
## credit_history + percent_of_income, family = binomial(link = "logit"),
## data = credit_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8197 -0.8157 -0.4529 0.9153 2.4254
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.799127 0.339500 -5.299 1.16e-07 ***
## checking_balance> 200 DM -1.103901 0.362474 -3.045 0.002323 **
## checking_balance1 - 200 DM -0.500446 0.195874 -2.555 0.010621 *
## checking_balanceunknown -2.012779 0.222995 -9.026 < 2e-16 ***
## months_loan_duration 0.036092 0.006699 5.387 7.15e-08 ***
## credit_historygood 0.602723 0.205460 2.934 0.003351 **
## credit_historyperfect 1.458346 0.423398 3.444 0.000572 ***
## credit_historypoor 0.823259 0.323276 2.547 0.010877 *
## credit_historyvery good 1.460236 0.380989 3.833 0.000127 ***
## percent_of_income 0.137660 0.074470 1.849 0.064527 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1094.42 on 899 degrees of freedom
## Residual deviance: 902.45 on 890 degrees of freedom
## AIC: 922.45
##
## Number of Fisher Scoring iterations: 5
# check Accuracy
fitted.results <- predict(model_creditB,newdata=credit_test,type='response')
fitted.results <- ifelse(fitted.results > 0.5,1,0)
misClasificError <- mean(fitted.results == credit_test$default)
print(paste('Accuracy',misClasificError))
## [1] "Accuracy 0.7"