Doing some data science

Building AI models on Pregnancy Data

We’ll need randomForest, ROCR for this lesson.

repo <- 'http://cran.us.r-project.org'

if('randomForest' %in% rownames(installed.packages()) == FALSE) {
 install.packages('randomForest', repos=repo, dependencies=TRUE)
}

if('ROCR' %in% rownames(installed.packages()) == FALSE) {
 install.packages('ROCR', repos=repo, dependencies=TRUE)
}


library(randomForest)

## Warning: package 'randomForest' was built under R version 3.1.2

## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.

library(ROCR)

## Warning: package 'ROCR' was built under R version 3.1.3

## Loading required package: gplots

## Warning: package 'gplots' was built under R version 3.1.2

## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:stats':
## 
##     lowess

PregnancyData <- read.csv('data/Pregnancy.csv')
PregnancyData.Test <- read.csv('data/Pregnancy_Test.csv')

head(PregnancyData)

##   Implied.Gender Home.Apt..PO.Box Pregnancy.Test Birth.Control
## 1              M                A              1             0
## 2              M                H              1             0
## 3              M                H              1             0
## 4              U                H              0             0
## 5              F                A              0             0
## 6              F                H              0             0
##   Feminine.Hygiene Folic.Acid Prenatal.Vitamins Prenatal.Yoga Body.Pillow
## 1                0          0                 1             0           0
## 2                0          0                 1             0           0
## 3                0          0                 0             0           0
## 4                0          0                 0             0           0
## 5                0          0                 0             1           0
## 6                0          0                 1             0           0
##   Ginger.Ale Sea.Bands Stopped.buying.ciggies Cigarettes Smoking.Cessation
## 1          0         0                      0          0                 0
## 2          0         0                      0          0                 0
## 3          0         1                      0          0                 0
## 4          1         0                      0          0                 0
## 5          0         0                      0          0                 0
## 6          0         0                      1          0                 0
##   Stopped.buying.wine Wine Maternity.Clothes PREGNANT
## 1                   0    0                 0        1
## 2                   0    0                 0        1
## 3                   0    0                 0        1
## 4                   0    0                 0        1
## 5                   1    0                 0        1
## 6                   0    0                 0        1

str(PregnancyData)

## 'data.frame':    1000 obs. of  18 variables:
##  $ Implied.Gender        : Factor w/ 3 levels "F","M","U": 2 2 2 3 1 1 2 1 1 1 ...
##  $ Home.Apt..PO.Box      : Factor w/ 3 levels "A","H","P": 1 2 2 2 1 2 2 2 2 2 ...
##  $ Pregnancy.Test        : int  1 1 1 0 0 0 0 0 0 0 ...
##  $ Birth.Control         : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ Feminine.Hygiene      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Folic.Acid            : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ Prenatal.Vitamins     : int  1 1 0 0 0 1 1 0 0 1 ...
##  $ Prenatal.Yoga         : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ Body.Pillow           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Ginger.Ale            : int  0 0 0 1 0 0 0 0 1 0 ...
##  $ Sea.Bands             : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ Stopped.buying.ciggies: int  0 0 0 0 0 1 0 0 0 0 ...
##  $ Cigarettes            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Smoking.Cessation     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Stopped.buying.wine   : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ Wine                  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Maternity.Clothes     : int  0 0 0 0 0 0 0 1 0 1 ...
##  $ PREGNANT              : int  1 1 1 1 1 1 1 1 1 1 ...

summary(PregnancyData$PREGNANT)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     0.0     0.5     0.5     1.0     1.0

Best for randomForest() to factorize reponse variable (pregnant/no pregnant) into two classes instead of treating it as an integer.

PregnancyData$PREGNANT <- factor(PregnancyData$PREGNANT)
PregnancyData.Test$PREGNANT <- factor(PregnancyData.Test$PREGNANT)

head(PregnancyData)

##   Implied.Gender Home.Apt..PO.Box Pregnancy.Test Birth.Control
## 1              M                A              1             0
## 2              M                H              1             0
## 3              M                H              1             0
## 4              U                H              0             0
## 5              F                A              0             0
## 6              F                H              0             0
##   Feminine.Hygiene Folic.Acid Prenatal.Vitamins Prenatal.Yoga Body.Pillow
## 1                0          0                 1             0           0
## 2                0          0                 1             0           0
## 3                0          0                 0             0           0
## 4                0          0                 0             0           0
## 5                0          0                 0             1           0
## 6                0          0                 1             0           0
##   Ginger.Ale Sea.Bands Stopped.buying.ciggies Cigarettes Smoking.Cessation
## 1          0         0                      0          0                 0
## 2          0         0                      0          0                 0
## 3          0         1                      0          0                 0
## 4          1         0                      0          0                 0
## 5          0         0                      0          0                 0
## 6          0         0                      1          0                 0
##   Stopped.buying.wine Wine Maternity.Clothes PREGNANT
## 1                   0    0                 0        1
## 2                   0    0                 0        1
## 3                   0    0                 0        1
## 4                   0    0                 0        1
## 5                   1    0                 0        1
## 6                   0    0                 0        1

str(PregnancyData)

## 'data.frame':    1000 obs. of  18 variables:
##  $ Implied.Gender        : Factor w/ 3 levels "F","M","U": 2 2 2 3 1 1 2 1 1 1 ...
##  $ Home.Apt..PO.Box      : Factor w/ 3 levels "A","H","P": 1 2 2 2 1 2 2 2 2 2 ...
##  $ Pregnancy.Test        : int  1 1 1 0 0 0 0 0 0 0 ...
##  $ Birth.Control         : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ Feminine.Hygiene      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Folic.Acid            : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ Prenatal.Vitamins     : int  1 1 0 0 0 1 1 0 0 1 ...
##  $ Prenatal.Yoga         : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ Body.Pillow           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Ginger.Ale            : int  0 0 0 1 0 0 0 0 1 0 ...
##  $ Sea.Bands             : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ Stopped.buying.ciggies: int  0 0 0 0 0 1 0 0 0 0 ...
##  $ Cigarettes            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Smoking.Cessation     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Stopped.buying.wine   : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ Wine                  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Maternity.Clothes     : int  0 0 0 0 0 0 0 1 0 1 ...
##  $ PREGNANT              : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...

summary(PregnancyData$PREGNANT)

##   0   1 
## 500 500

Build a logistical regression with glm().

PREGNANT ~ . is a formula in R, and means ’train model to predict PREGNANT column using all other columns.

Pregnancy.lm <- glm(PREGNANT ~ ., data=PregnancyData, family=binomial('logit'))

#You could specify a subset of columns like so...
# Pregnancy.lm <- glm(PREGNANT ~
#                       Implied.Gender +
#                       Home.Apt..PO.Box +
#                       Pregnancy.Test +
#                       Birth.Control, +
#                       data=PregnacyData, family=binomial('logit'))

Notice the *’s in the result here. Those without one are of dubious worth.

summary(Pregnancy.lm)

## 
## Call:
## glm(formula = PREGNANT ~ ., family = binomial("logit"), data = PregnancyData)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2012  -0.5566  -0.0246   0.5127   2.8658  
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            -0.343597   0.180755  -1.901 0.057315 .  
## Implied.GenderM        -0.453880   0.197566  -2.297 0.021599 *  
## Implied.GenderU         0.141939   0.307588   0.461 0.644469    
## Home.Apt..PO.BoxH      -0.172927   0.194591  -0.889 0.374180    
## Home.Apt..PO.BoxP      -0.002813   0.336432  -0.008 0.993329    
## Pregnancy.Test          2.370554   0.521781   4.543 5.54e-06 ***
## Birth.Control          -2.300272   0.365270  -6.297 3.03e-10 ***
## Feminine.Hygiene       -2.028558   0.342398  -5.925 3.13e-09 ***
## Folic.Acid              4.077666   0.761888   5.352 8.70e-08 ***
## Prenatal.Vitamins       2.479469   0.369063   6.718 1.84e-11 ***
## Prenatal.Yoga           2.922974   1.146990   2.548 0.010822 *  
## Body.Pillow             1.261037   0.860617   1.465 0.142847    
## Ginger.Ale              1.938502   0.426733   4.543 5.55e-06 ***
## Sea.Bands               1.107530   0.673435   1.645 0.100053    
## Stopped.buying.ciggies  1.302222   0.342347   3.804 0.000142 ***
## Cigarettes             -1.443022   0.370120  -3.899 9.67e-05 ***
## Smoking.Cessation       1.790779   0.512610   3.493 0.000477 ***
## Stopped.buying.wine     1.383888   0.305883   4.524 6.06e-06 ***
## Wine                   -1.565539   0.348910  -4.487 7.23e-06 ***
## Maternity.Clothes       2.078202   0.329432   6.308 2.82e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1386.29  on 999  degrees of freedom
## Residual deviance:  744.11  on 980  degrees of freedom
## AIC: 784.11
## 
## Number of Fisher Scoring iterations: 7

We can train using a random forest model.

Pregnancy.rf <- randomForest(PREGNANT~., data=PregnancyData, importance=TRUE)
varImpPlot(Pregnancy.rf, type=2)

summary(Pregnancy.rf)

##                 Length Class  Mode     
## call               4   -none- call     
## type               1   -none- character
## predicted       1000   factor numeric  
## err.rate        1500   -none- numeric  
## confusion          6   -none- numeric  
## votes           2000   matrix numeric  
## oob.times       1000   -none- numeric  
## classes            2   -none- character
## importance        68   -none- numeric  
## importanceSD      51   -none- numeric  
## localImportance    0   -none- NULL     
## proximity          0   -none- NULL     
## ntree              1   -none- numeric  
## mtry               1   -none- numeric  
## forest            14   -none- list     
## y               1000   factor numeric  
## test               0   -none- NULL     
## inbag              0   -none- NULL     
## terms              3   terms  call

Let’s get to predicting.

PregnancyData.Test.lm.Preds <- predict(Pregnancy.lm, PregnancyData.Test, type='response')
PregnancyData.Test.rf.Preds <- predict(Pregnancy.rf, PregnancyData.Test, type='prob')

summary(PregnancyData.Test.lm.Preds)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.001179 0.066190 0.239500 0.283100 0.414300 0.999200

summary(PregnancyData.Test.rf.Preds)

##        0                1         
##  Min.   :0.0020   Min.   :0.0000  
##  1st Qu.:0.7460   1st Qu.:0.0100  
##  Median :0.9360   Median :0.0640  
##  Mean   :0.8069   Mean   :0.1931  
##  3rd Qu.:0.9900   3rd Qu.:0.2540  
##  Max.   :1.0000   Max.   :0.9980

t(PregnancyData.Test[1,])

##                        1  
## Implied.Gender         "U"
## Home.Apt..PO.Box       "A"
## Pregnancy.Test         "0"
## Birth.Control          "0"
## Feminine.Hygiene       "0"
## Folic.Acid             "0"
## Prenatal.Vitamins      "0"
## Prenatal.Yoga          "0"
## Body.Pillow            "0"
## Ginger.Ale             "0"
## Sea.Bands              "1"
## Stopped.buying.ciggies "0"
## Cigarettes             "0"
## Smoking.Cessation      "0"
## Stopped.buying.wine    "1"
## Wine                   "1"
## Maternity.Clothes      "0"
## PREGNANT               "1"

t(PregnancyData.Test.lm.Preds[1])

##              1
## [1,] 0.6735358

t(PregnancyData.Test.rf.Preds[1,2])

##      [,1]
## [1,] 0.51

Create two ROCR prediction objects…

pred.lm <- prediction(PregnancyData.Test.lm.Preds, PregnancyData.Test$PREGNANT)
pred.rf <- prediction(PregnancyData.Test.rf.Preds[,2], PregnancyData.Test$PREGNANT)

Turn these prediction objects into ROCR objects

perf.lm <- performance(pred.lm, 'tpr', 'fpr')
perf.rf <- performance(pred.rf, 'tpr', 'fpr')

Plot these curves now. lty make the random forest line dashed (lty = line type). Overlyay the two curves.

plot(perf.lm, xlim=c(0,1), ylim=c(0,1))
plot(perf.rf, xlim=c(0,1), ylim=c(0,1), lty=2, add=TRUE)