Assignment

Follow along with Foreman's instructions to download R. Follow along by recreating the models in R.

  • Work through the examples to the top of p. 370. Skip ahead to p. 378 and complete the coding for chapters 6 & 7 through to the bottom of p. 385
  • If you want more details on saving files to your working directory, reference the rest of p. 370
  • If you want more details about installing a library, reference p. 372

Export code, output and plots from R into a Word doc. Submit file titled "R_LnameFinitial".

Solution

Libraries

library(randomForest)
library(ROCR)

Some Simple Hand-Jamming

almostpi <- 355/113
almostpi
## [1] 3.141593
sqrt(almostpi)
## [1] 1.772454
?sqrt
??log
Vector Math and Factoring
someprimes <- c(1,2,3,5,7,11)
someprimes
## [1]  1  2  3  5  7 11
length(someprimes)
## [1] 6
someprimes[4]
## [1] 5
someprimes[c(4,5,6)]
## [1]  5  7 11
someprimes[c(4:6)]
## [1]  5  7 11
which(someprimes<7)
## [1] 1 2 3 4
someprimes[which(someprimes<7)]
## [1] 1 2 3 5
primestime2 <- someprimes*2
primestime2
## [1]  2  4  6 10 14 22
summary(someprimes)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.250   4.000   4.833   6.500  11.000
somecolors <- c("blue", "red", "green", "blue", "green", "yellow", "red", "red")
somecolors
## [1] "blue"   "red"    "green"  "blue"   "green"  "yellow" "red"    "red"
summary(somecolors)
##    Length     Class      Mode 
##         8 character character
somecolors <- factor(somecolors)
somecolors
## [1] blue   red    green  blue   green  yellow red    red   
## Levels: blue green red yellow
summary(somecolors)
##   blue  green    red yellow 
##      2      2      3      1
Two-Dimensional Matrices
amatrix <- matrix(data=c(someprimes, primestime2), nrow=2, ncol=6)
amatrix
##      [,1] [,2] [,3] [,4] [,5] [,6]
## [1,]    1    3    7    2    6   14
## [2,]    2    5   11    4   10   22
nrow(amatrix)
## [1] 2
ncol(amatrix)
## [1] 6
t(amatrix)
##      [,1] [,2]
## [1,]    1    2
## [2,]    3    5
## [3,]    7   11
## [4,]    2    4
## [5,]    6   10
## [6,]   14   22
amatrix[1:2,3]
## [1]  7 11
amatrix[,3]
## [1]  7 11
primestime3 <- someprimes*3
amatrix <- rbind(amatrix, primestime3)
amatrix
##             [,1] [,2] [,3] [,4] [,5] [,6]
##                1    3    7    2    6   14
##                2    5   11    4   10   22
## primestime3    3    6    9   15   21   33
Best Datatype of Them All: The Dataframe
John <- list(gender="male", age="ancient", height=72, spawn=3, spawn_ages=c(.5,2,5))
John
## $gender
## [1] "male"
## 
## $age
## [1] "ancient"
## 
## $height
## [1] 72
## 
## $spawn
## [1] 3
## 
## $spawn_ages
## [1] 0.5 2.0 5.0
bondnames <- c("connery", "lazenby", "moore", "dalton", "brosnan", "craig")
firstyear <- c(1962, 1969, 1973, 1987, 1995, 2006)
eyecolor <- c("brown", "brown", "blue", "green", "blue", "blue")
womenkissed <- c(17, 3, 20, 4, 12, 4)
countofbondjamesbonds <- c(3,2,10,2,5,1)

bonddata <- data.frame(bondnames, firstyear, eyecolor, womenkissed, countofbondjamesbonds)
bonddata
##   bondnames firstyear eyecolor womenkissed countofbondjamesbonds
## 1   connery      1962    brown          17                     3
## 2   lazenby      1969    brown           3                     2
## 3     moore      1973     blue          20                    10
## 4    dalton      1987    green           4                     2
## 5   brosnan      1995     blue          12                     5
## 6     craig      2006     blue           4                     1
str(bonddata)
## 'data.frame':    6 obs. of  5 variables:
##  $ bondnames            : Factor w/ 6 levels "brosnan","connery",..: 2 5 6 4 1 3
##  $ firstyear            : num  1962 1969 1973 1987 1995 ...
##  $ eyecolor             : Factor w/ 3 levels "blue","brown",..: 2 2 1 3 1 1
##  $ womenkissed          : num  17 3 20 4 12 4
##  $ countofbondjamesbonds: num  3 2 10 2 5 1
summary(bonddata)
##    bondnames   firstyear     eyecolor  womenkissed   
##  brosnan:1   Min.   :1962   blue :3   Min.   : 3.00  
##  connery:1   1st Qu.:1970   brown:2   1st Qu.: 4.00  
##  craig  :1   Median :1980   green:1   Median : 8.00  
##  dalton :1   Mean   :1982             Mean   :10.00  
##  lazenby:1   3rd Qu.:1993             3rd Qu.:15.75  
##  moore  :1   Max.   :2006             Max.   :20.00  
##  countofbondjamesbonds
##  Min.   : 1.000       
##  1st Qu.: 2.000       
##  Median : 2.500       
##  Mean   : 3.833       
##  3rd Qu.: 4.500       
##  Max.   :10.000
bonddata$firstyear <- factor(bonddata$firstyear)
summary(bonddata)
##    bondnames firstyear  eyecolor  womenkissed    countofbondjamesbonds
##  brosnan:1   1962:1    blue :3   Min.   : 3.00   Min.   : 1.000       
##  connery:1   1969:1    brown:2   1st Qu.: 4.00   1st Qu.: 2.000       
##  craig  :1   1973:1    green:1   Median : 8.00   Median : 2.500       
##  dalton :1   1987:1              Mean   :10.00   Mean   : 3.833       
##  lazenby:1   1995:1              3rd Qu.:15.75   3rd Qu.: 4.500       
##  moore  :1   2006:1              Max.   :20.00   Max.   :10.000

Building AI Models on the Pregnancy Data

Reading Data into R

getwd()
## [1] "/Users/emiliembolduc/Bard MBA/2019_Spring_Data and Decisions/R/DataSmart_Chapt10"

Loading data

PregnancyData <- read.csv("Pregnancy.csv")
PregnancyData.Test <- read.csv("Pregnancy_Test.csv")
summary(PregnancyData)
##  Implied.Gender Home.Apt..PO.Box Pregnancy.Test  Birth.Control 
##  F:495          A:420            Min.   :0.000   Min.   :0.00  
##  M:401          H:488            1st Qu.:0.000   1st Qu.:0.00  
##  U:104          P: 92            Median :0.000   Median :0.00  
##                                  Mean   :0.075   Mean   :0.14  
##                                  3rd Qu.:0.000   3rd Qu.:0.00  
##                                  Max.   :1.000   Max.   :1.00  
##  Feminine.Hygiene   Folic.Acid    Prenatal.Vitamins Prenatal.Yoga  
##  Min.   :0.000    Min.   :0.000   Min.   :0.000     Min.   :0.000  
##  1st Qu.:0.000    1st Qu.:0.000   1st Qu.:0.000     1st Qu.:0.000  
##  Median :0.000    Median :0.000   Median :0.000     Median :0.000  
##  Mean   :0.141    Mean   :0.106   Mean   :0.128     Mean   :0.018  
##  3rd Qu.:0.000    3rd Qu.:0.000   3rd Qu.:0.000     3rd Qu.:0.000  
##  Max.   :1.000    Max.   :1.000   Max.   :1.000     Max.   :1.000  
##   Body.Pillow      Ginger.Ale      Sea.Bands    Stopped.buying.ciggies
##  Min.   :0.000   Min.   :0.000   Min.   :0.00   Min.   :0.000         
##  1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.00   1st Qu.:0.000         
##  Median :0.000   Median :0.000   Median :0.00   Median :0.000         
##  Mean   :0.018   Mean   :0.069   Mean   :0.03   Mean   :0.092         
##  3rd Qu.:0.000   3rd Qu.:0.000   3rd Qu.:0.00   3rd Qu.:0.000         
##  Max.   :1.000   Max.   :1.000   Max.   :1.00   Max.   :1.000         
##    Cigarettes    Smoking.Cessation Stopped.buying.wine      Wine      
##  Min.   :0.000   Min.   :0.00      Min.   :0.00        Min.   :0.000  
##  1st Qu.:0.000   1st Qu.:0.00      1st Qu.:0.00        1st Qu.:0.000  
##  Median :0.000   Median :0.00      Median :0.00        Median :0.000  
##  Mean   :0.097   Mean   :0.06      Mean   :0.13        Mean   :0.123  
##  3rd Qu.:0.000   3rd Qu.:0.00      3rd Qu.:0.00        3rd Qu.:0.000  
##  Max.   :1.000   Max.   :1.00      Max.   :1.00        Max.   :1.000  
##  Maternity.Clothes    PREGNANT  
##  Min.   :0.000     Min.   :0.0  
##  1st Qu.:0.000     1st Qu.:0.0  
##  Median :0.000     Median :0.5  
##  Mean   :0.131     Mean   :0.5  
##  3rd Qu.:0.000     3rd Qu.:1.0  
##  Max.   :1.000     Max.   :1.0
summary(PregnancyData.Test)
##  Implied.Gender Home.Apt..PO.Box Pregnancy.Test  Birth.Control  
##  F:251          A:226            Min.   :0.000   Min.   :0.000  
##  M:424          H:469            1st Qu.:0.000   1st Qu.:0.000  
##  U:325          P:305            Median :0.000   Median :0.000  
##                                  Mean   :0.011   Mean   :0.216  
##                                  3rd Qu.:0.000   3rd Qu.:0.000  
##                                  Max.   :1.000   Max.   :1.000  
##  Feminine.Hygiene   Folic.Acid   Prenatal.Vitamins Prenatal.Yoga  
##  Min.   :0.000    Min.   :0.00   Min.   :0.000     Min.   :0.000  
##  1st Qu.:0.000    1st Qu.:0.00   1st Qu.:0.000     1st Qu.:0.000  
##  Median :0.000    Median :0.00   Median :0.000     Median :0.000  
##  Mean   :0.209    Mean   :0.02   Mean   :0.043     Mean   :0.005  
##  3rd Qu.:0.000    3rd Qu.:0.00   3rd Qu.:0.000     3rd Qu.:0.000  
##  Max.   :1.000    Max.   :1.00   Max.   :1.000     Max.   :1.000  
##   Body.Pillow      Ginger.Ale      Sea.Bands     Stopped.buying.ciggies
##  Min.   :0.000   Min.   :0.000   Min.   :0.000   Min.   :0.00          
##  1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.00          
##  Median :0.000   Median :0.000   Median :0.000   Median :0.00          
##  Mean   :0.008   Mean   :0.032   Mean   :0.013   Mean   :0.05          
##  3rd Qu.:0.000   3rd Qu.:0.000   3rd Qu.:0.000   3rd Qu.:0.00          
##  Max.   :1.000   Max.   :1.000   Max.   :1.000   Max.   :1.00          
##    Cigarettes    Smoking.Cessation Stopped.buying.wine      Wine      
##  Min.   :0.000   Min.   :0.000     Min.   :0.00        Min.   :0.000  
##  1st Qu.:0.000   1st Qu.:0.000     1st Qu.:0.00        1st Qu.:0.000  
##  Median :0.000   Median :0.000     Median :0.00        Median :0.000  
##  Mean   :0.148   Mean   :0.009     Mean   :0.08        Mean   :0.202  
##  3rd Qu.:0.000   3rd Qu.:0.000     3rd Qu.:0.00        3rd Qu.:0.000  
##  Max.   :1.000   Max.   :1.000     Max.   :1.00        Max.   :1.000  
##  Maternity.Clothes    PREGNANT   
##  Min.   :0.000     Min.   :0.00  
##  1st Qu.:0.000     1st Qu.:0.00  
##  Median :0.000     Median :0.00  
##  Mean   :0.052     Mean   :0.06  
##  3rd Qu.:0.000     3rd Qu.:0.00  
##  Max.   :1.000     Max.   :1.00
str(PregnancyData)
## 'data.frame':    1000 obs. of  18 variables:
##  $ Implied.Gender        : Factor w/ 3 levels "F","M","U": 2 2 2 3 1 1 2 1 1 1 ...
##  $ Home.Apt..PO.Box      : Factor w/ 3 levels "A","H","P": 1 2 2 2 1 2 2 2 2 2 ...
##  $ Pregnancy.Test        : int  1 1 1 0 0 0 0 0 0 0 ...
##  $ Birth.Control         : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ Feminine.Hygiene      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Folic.Acid            : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ Prenatal.Vitamins     : int  1 1 0 0 0 1 1 0 0 1 ...
##  $ Prenatal.Yoga         : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ Body.Pillow           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Ginger.Ale            : int  0 0 0 1 0 0 0 0 1 0 ...
##  $ Sea.Bands             : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ Stopped.buying.ciggies: int  0 0 0 0 0 1 0 0 0 0 ...
##  $ Cigarettes            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Smoking.Cessation     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Stopped.buying.wine   : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ Wine                  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Maternity.Clothes     : int  0 0 0 0 0 0 0 1 0 1 ...
##  $ PREGNANT              : int  1 1 1 1 1 1 1 1 1 1 ...
str(PregnancyData.Test)
## 'data.frame':    1000 obs. of  18 variables:
##  $ Implied.Gender        : Factor w/ 3 levels "F","M","U": 3 3 2 2 2 2 1 1 1 1 ...
##  $ Home.Apt..PO.Box      : Factor w/ 3 levels "A","H","P": 1 2 2 2 1 2 2 3 3 2 ...
##  $ Pregnancy.Test        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Birth.Control         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Feminine.Hygiene      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Folic.Acid            : int  0 0 0 0 0 1 0 0 0 0 ...
##  $ Prenatal.Vitamins     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Prenatal.Yoga         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Body.Pillow           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Ginger.Ale            : int  0 0 0 0 0 1 0 0 0 0 ...
##  $ Sea.Bands             : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ Stopped.buying.ciggies: int  0 0 1 0 0 0 0 0 0 1 ...
##  $ Cigarettes            : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ Smoking.Cessation     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Stopped.buying.wine   : int  1 0 0 1 0 1 1 0 0 0 ...
##  $ Wine                  : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ Maternity.Clothes     : int  0 0 0 0 0 0 1 0 0 1 ...
##  $ PREGNANT              : int  1 1 1 1 1 1 1 1 1 1 ...

Turn the response variable (1 for pregnant, 0 for not pregnant) into factor for Random Forest function

PregnancyData$PREGNANT <- factor(PregnancyData$PREGNANT)
PregnancyData.Test$PREGNANT <- factor(PregnancyData.Test$PREGNANT)
summary(PregnancyData$PREGNANT)
##   0   1 
## 500 500

Start with a logistic regression

Pregnancy.1m <- glm(PREGNANT ~ .,
                    data = PregnancyData, family = binomial("logit"))
summary(Pregnancy.1m)
## 
## Call:
## glm(formula = PREGNANT ~ ., family = binomial("logit"), data = PregnancyData)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2012  -0.5566  -0.0246   0.5127   2.8658  
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            -0.343597   0.180755  -1.901 0.057315 .  
## Implied.GenderM        -0.453880   0.197566  -2.297 0.021599 *  
## Implied.GenderU         0.141939   0.307588   0.461 0.644469    
## Home.Apt..PO.BoxH      -0.172927   0.194591  -0.889 0.374180    
## Home.Apt..PO.BoxP      -0.002813   0.336432  -0.008 0.993329    
## Pregnancy.Test          2.370554   0.521781   4.543 5.54e-06 ***
## Birth.Control          -2.300272   0.365270  -6.297 3.03e-10 ***
## Feminine.Hygiene       -2.028558   0.342398  -5.925 3.13e-09 ***
## Folic.Acid              4.077666   0.761888   5.352 8.70e-08 ***
## Prenatal.Vitamins       2.479469   0.369063   6.718 1.84e-11 ***
## Prenatal.Yoga           2.922974   1.146990   2.548 0.010822 *  
## Body.Pillow             1.261037   0.860617   1.465 0.142847    
## Ginger.Ale              1.938502   0.426733   4.543 5.55e-06 ***
## Sea.Bands               1.107530   0.673435   1.645 0.100053    
## Stopped.buying.ciggies  1.302222   0.342347   3.804 0.000142 ***
## Cigarettes             -1.443022   0.370120  -3.899 9.67e-05 ***
## Smoking.Cessation       1.790779   0.512610   3.493 0.000477 ***
## Stopped.buying.wine     1.383888   0.305883   4.524 6.06e-06 ***
## Wine                   -1.565539   0.348910  -4.487 7.23e-06 ***
## Maternity.Clothes       2.078202   0.329432   6.308 2.82e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1386.29  on 999  degrees of freedom
## Residual deviance:  744.11  on 980  degrees of freedom
## AIC: 784.11
## 
## Number of Fisher Scoring iterations: 7

Train a random forest model

Pregnancy.rf <- randomForest(PREGNANT ~ ., data = PregnancyData, importance = TRUE)

Looking at how much each variable contributes to decreasing node impurity on average

varImpPlot(Pregnancy.rf, type = 2)

Prediction

First the linear regression model prediction

PregnancyData.Test.1m.Preds <- predict(Pregnancy.1m, PregnancyData.Test, type = "response")
summary(PregnancyData.Test.1m.Preds)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.001179 0.066194 0.239459 0.283077 0.414253 0.999211

Second test the random forest model prediction

PregnancyData.Test.rf.Preds <- predict(Pregnancy.rf, PregnancyData.Test, type = "prob")
summary(PregnancyData.Test.rf.Preds)
##        0                1         
##  Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.7630   1st Qu.:0.0080  
##  Median :0.9620   Median :0.0380  
##  Mean   :0.8107   Mean   :0.1893  
##  3rd Qu.:0.9920   3rd Qu.:0.2370  
##  Max.   :1.0000   Max.   :1.0000

Let's check out the two models on one customer

t(PregnancyData.Test[1, ])
##                        1  
## Implied.Gender         "U"
## Home.Apt..PO.Box       "A"
## Pregnancy.Test         "0"
## Birth.Control          "0"
## Feminine.Hygiene       "0"
## Folic.Acid             "0"
## Prenatal.Vitamins      "0"
## Prenatal.Yoga          "0"
## Body.Pillow            "0"
## Ginger.Ale             "0"
## Sea.Bands              "1"
## Stopped.buying.ciggies "0"
## Cigarettes             "0"
## Smoking.Cessation      "0"
## Stopped.buying.wine    "1"
## Wine                   "1"
## Maternity.Clothes      "0"
## PREGNANT               "1"
t(PregnancyData.Test.1m.Preds[1])
##              1
## [1,] 0.6735358
t(PregnancyData.Test.rf.Preds[1,2])
##       [,1]
## [1,] 0.518

Linear regression modeling wins this one with a score of 0.67 (she was pregnant).

ROC Curves
pred.1m <- prediction(PregnancyData.Test.1m.Preds, PregnancyData.Test$PREGNANT)
pred.rf <- prediction(PregnancyData.Test.rf.Preds[ ,2], PregnancyData.Test$PREGNANT)

perf.1m <- performance(pred.1m, "tpr", "fpr")
perf.rf <- performance(pred.rf, "tpr", "fpr")

Plot the curves...

plot(perf.1m, xlim = c(0,1), ylim = c(0,1))
plot(perf.rf, xlim = c(0,1), ylim = c(0,1), lty = 2, add = TRUE)