From Data Smart by John Foreman
http://www.wiley.com/WileyCDA/WileyTitle/productCd-111866146X.html.
We’ll need randomForest, ROCR for this lesson.
repo <- 'http://cran.us.r-project.org'
if('randomForest' %in% rownames(installed.packages()) == FALSE) {
install.packages('randomForest', repos=repo, dependencies=TRUE)
}
if('ROCR' %in% rownames(installed.packages()) == FALSE) {
install.packages('ROCR', repos=repo, dependencies=TRUE)
}
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.1.2
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.1.3
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.1.2
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
PregnancyData <- read.csv('data/Pregnancy.csv')
PregnancyData.Test <- read.csv('data/Pregnancy_Test.csv')
head(PregnancyData)
## Implied.Gender Home.Apt..PO.Box Pregnancy.Test Birth.Control
## 1 M A 1 0
## 2 M H 1 0
## 3 M H 1 0
## 4 U H 0 0
## 5 F A 0 0
## 6 F H 0 0
## Feminine.Hygiene Folic.Acid Prenatal.Vitamins Prenatal.Yoga Body.Pillow
## 1 0 0 1 0 0
## 2 0 0 1 0 0
## 3 0 0 0 0 0
## 4 0 0 0 0 0
## 5 0 0 0 1 0
## 6 0 0 1 0 0
## Ginger.Ale Sea.Bands Stopped.buying.ciggies Cigarettes Smoking.Cessation
## 1 0 0 0 0 0
## 2 0 0 0 0 0
## 3 0 1 0 0 0
## 4 1 0 0 0 0
## 5 0 0 0 0 0
## 6 0 0 1 0 0
## Stopped.buying.wine Wine Maternity.Clothes PREGNANT
## 1 0 0 0 1
## 2 0 0 0 1
## 3 0 0 0 1
## 4 0 0 0 1
## 5 1 0 0 1
## 6 0 0 0 1
str(PregnancyData)
## 'data.frame': 1000 obs. of 18 variables:
## $ Implied.Gender : Factor w/ 3 levels "F","M","U": 2 2 2 3 1 1 2 1 1 1 ...
## $ Home.Apt..PO.Box : Factor w/ 3 levels "A","H","P": 1 2 2 2 1 2 2 2 2 2 ...
## $ Pregnancy.Test : int 1 1 1 0 0 0 0 0 0 0 ...
## $ Birth.Control : int 0 0 0 0 0 0 1 0 0 0 ...
## $ Feminine.Hygiene : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Folic.Acid : int 0 0 0 0 0 0 1 0 0 0 ...
## $ Prenatal.Vitamins : int 1 1 0 0 0 1 1 0 0 1 ...
## $ Prenatal.Yoga : int 0 0 0 0 1 0 0 0 0 0 ...
## $ Body.Pillow : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Ginger.Ale : int 0 0 0 1 0 0 0 0 1 0 ...
## $ Sea.Bands : int 0 0 1 0 0 0 0 0 0 0 ...
## $ Stopped.buying.ciggies: int 0 0 0 0 0 1 0 0 0 0 ...
## $ Cigarettes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Smoking.Cessation : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Stopped.buying.wine : int 0 0 0 0 1 0 0 0 0 0 ...
## $ Wine : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Maternity.Clothes : int 0 0 0 0 0 0 0 1 0 1 ...
## $ PREGNANT : int 1 1 1 1 1 1 1 1 1 1 ...
summary(PregnancyData$PREGNANT)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 0.5 0.5 1.0 1.0
Best for randomForest() to factorize reponse variable (pregnant/no pregnant) into two classes instead of treating it as an integer.
PregnancyData$PREGNANT <- factor(PregnancyData$PREGNANT)
PregnancyData.Test$PREGNANT <- factor(PregnancyData.Test$PREGNANT)
head(PregnancyData)
## Implied.Gender Home.Apt..PO.Box Pregnancy.Test Birth.Control
## 1 M A 1 0
## 2 M H 1 0
## 3 M H 1 0
## 4 U H 0 0
## 5 F A 0 0
## 6 F H 0 0
## Feminine.Hygiene Folic.Acid Prenatal.Vitamins Prenatal.Yoga Body.Pillow
## 1 0 0 1 0 0
## 2 0 0 1 0 0
## 3 0 0 0 0 0
## 4 0 0 0 0 0
## 5 0 0 0 1 0
## 6 0 0 1 0 0
## Ginger.Ale Sea.Bands Stopped.buying.ciggies Cigarettes Smoking.Cessation
## 1 0 0 0 0 0
## 2 0 0 0 0 0
## 3 0 1 0 0 0
## 4 1 0 0 0 0
## 5 0 0 0 0 0
## 6 0 0 1 0 0
## Stopped.buying.wine Wine Maternity.Clothes PREGNANT
## 1 0 0 0 1
## 2 0 0 0 1
## 3 0 0 0 1
## 4 0 0 0 1
## 5 1 0 0 1
## 6 0 0 0 1
str(PregnancyData)
## 'data.frame': 1000 obs. of 18 variables:
## $ Implied.Gender : Factor w/ 3 levels "F","M","U": 2 2 2 3 1 1 2 1 1 1 ...
## $ Home.Apt..PO.Box : Factor w/ 3 levels "A","H","P": 1 2 2 2 1 2 2 2 2 2 ...
## $ Pregnancy.Test : int 1 1 1 0 0 0 0 0 0 0 ...
## $ Birth.Control : int 0 0 0 0 0 0 1 0 0 0 ...
## $ Feminine.Hygiene : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Folic.Acid : int 0 0 0 0 0 0 1 0 0 0 ...
## $ Prenatal.Vitamins : int 1 1 0 0 0 1 1 0 0 1 ...
## $ Prenatal.Yoga : int 0 0 0 0 1 0 0 0 0 0 ...
## $ Body.Pillow : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Ginger.Ale : int 0 0 0 1 0 0 0 0 1 0 ...
## $ Sea.Bands : int 0 0 1 0 0 0 0 0 0 0 ...
## $ Stopped.buying.ciggies: int 0 0 0 0 0 1 0 0 0 0 ...
## $ Cigarettes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Smoking.Cessation : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Stopped.buying.wine : int 0 0 0 0 1 0 0 0 0 0 ...
## $ Wine : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Maternity.Clothes : int 0 0 0 0 0 0 0 1 0 1 ...
## $ PREGNANT : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
summary(PregnancyData$PREGNANT)
## 0 1
## 500 500
Build a logistical regression with glm().
PREGNANT ~ . is a formula in R, and means ’train model to predict PREGNANT column using all other columns.
Pregnancy.lm <- glm(PREGNANT ~ ., data=PregnancyData, family=binomial('logit'))
#You could specify a subset of columns like so...
# Pregnancy.lm <- glm(PREGNANT ~
# Implied.Gender +
# Home.Apt..PO.Box +
# Pregnancy.Test +
# Birth.Control, +
# data=PregnacyData, family=binomial('logit'))
Notice the *’s in the result here. Those without one are of dubious worth.
summary(Pregnancy.lm)
##
## Call:
## glm(formula = PREGNANT ~ ., family = binomial("logit"), data = PregnancyData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2012 -0.5566 -0.0246 0.5127 2.8658
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.343597 0.180755 -1.901 0.057315 .
## Implied.GenderM -0.453880 0.197566 -2.297 0.021599 *
## Implied.GenderU 0.141939 0.307588 0.461 0.644469
## Home.Apt..PO.BoxH -0.172927 0.194591 -0.889 0.374180
## Home.Apt..PO.BoxP -0.002813 0.336432 -0.008 0.993329
## Pregnancy.Test 2.370554 0.521781 4.543 5.54e-06 ***
## Birth.Control -2.300272 0.365270 -6.297 3.03e-10 ***
## Feminine.Hygiene -2.028558 0.342398 -5.925 3.13e-09 ***
## Folic.Acid 4.077666 0.761888 5.352 8.70e-08 ***
## Prenatal.Vitamins 2.479469 0.369063 6.718 1.84e-11 ***
## Prenatal.Yoga 2.922974 1.146990 2.548 0.010822 *
## Body.Pillow 1.261037 0.860617 1.465 0.142847
## Ginger.Ale 1.938502 0.426733 4.543 5.55e-06 ***
## Sea.Bands 1.107530 0.673435 1.645 0.100053
## Stopped.buying.ciggies 1.302222 0.342347 3.804 0.000142 ***
## Cigarettes -1.443022 0.370120 -3.899 9.67e-05 ***
## Smoking.Cessation 1.790779 0.512610 3.493 0.000477 ***
## Stopped.buying.wine 1.383888 0.305883 4.524 6.06e-06 ***
## Wine -1.565539 0.348910 -4.487 7.23e-06 ***
## Maternity.Clothes 2.078202 0.329432 6.308 2.82e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1386.29 on 999 degrees of freedom
## Residual deviance: 744.11 on 980 degrees of freedom
## AIC: 784.11
##
## Number of Fisher Scoring iterations: 7
We can train using a random forest model.
Pregnancy.rf <- randomForest(PREGNANT~., data=PregnancyData, importance=TRUE)
varImpPlot(Pregnancy.rf, type=2)
summary(Pregnancy.rf)
## Length Class Mode
## call 4 -none- call
## type 1 -none- character
## predicted 1000 factor numeric
## err.rate 1500 -none- numeric
## confusion 6 -none- numeric
## votes 2000 matrix numeric
## oob.times 1000 -none- numeric
## classes 2 -none- character
## importance 68 -none- numeric
## importanceSD 51 -none- numeric
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 1000 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
Let’s get to predicting.
PregnancyData.Test.lm.Preds <- predict(Pregnancy.lm, PregnancyData.Test, type='response')
PregnancyData.Test.rf.Preds <- predict(Pregnancy.rf, PregnancyData.Test, type='prob')
summary(PregnancyData.Test.lm.Preds)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.001179 0.066190 0.239500 0.283100 0.414300 0.999200
summary(PregnancyData.Test.rf.Preds)
## 0 1
## Min. :0.0020 Min. :0.0000
## 1st Qu.:0.7460 1st Qu.:0.0100
## Median :0.9360 Median :0.0640
## Mean :0.8069 Mean :0.1931
## 3rd Qu.:0.9900 3rd Qu.:0.2540
## Max. :1.0000 Max. :0.9980
t(PregnancyData.Test[1,])
## 1
## Implied.Gender "U"
## Home.Apt..PO.Box "A"
## Pregnancy.Test "0"
## Birth.Control "0"
## Feminine.Hygiene "0"
## Folic.Acid "0"
## Prenatal.Vitamins "0"
## Prenatal.Yoga "0"
## Body.Pillow "0"
## Ginger.Ale "0"
## Sea.Bands "1"
## Stopped.buying.ciggies "0"
## Cigarettes "0"
## Smoking.Cessation "0"
## Stopped.buying.wine "1"
## Wine "1"
## Maternity.Clothes "0"
## PREGNANT "1"
t(PregnancyData.Test.lm.Preds[1])
## 1
## [1,] 0.6735358
t(PregnancyData.Test.rf.Preds[1,2])
## [,1]
## [1,] 0.51
Create two ROCR prediction objects…
pred.lm <- prediction(PregnancyData.Test.lm.Preds, PregnancyData.Test$PREGNANT)
pred.rf <- prediction(PregnancyData.Test.rf.Preds[,2], PregnancyData.Test$PREGNANT)
Turn these prediction objects into ROCR objects
perf.lm <- performance(pred.lm, 'tpr', 'fpr')
perf.rf <- performance(pred.rf, 'tpr', 'fpr')
Plot these curves now. lty make the random forest line dashed (lty = line type). Overlyay the two curves.
plot(perf.lm, xlim=c(0,1), ylim=c(0,1))
plot(perf.rf, xlim=c(0,1), ylim=c(0,1), lty=2, add=TRUE)