Follow along with Foreman's instructions to download R. Follow along by recreating the models in R.
Export code, output and plots from R into a Word doc. Submit file titled "R_LnameFinitial".
library(randomForest)
library(ROCR)
almostpi <- 355/113
almostpi
## [1] 3.141593
sqrt(almostpi)
## [1] 1.772454
?sqrt
??log
someprimes <- c(1,2,3,5,7,11)
someprimes
## [1] 1 2 3 5 7 11
length(someprimes)
## [1] 6
someprimes[4]
## [1] 5
someprimes[c(4,5,6)]
## [1] 5 7 11
someprimes[c(4:6)]
## [1] 5 7 11
which(someprimes<7)
## [1] 1 2 3 4
someprimes[which(someprimes<7)]
## [1] 1 2 3 5
primestime2 <- someprimes*2
primestime2
## [1] 2 4 6 10 14 22
summary(someprimes)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.250 4.000 4.833 6.500 11.000
somecolors <- c("blue", "red", "green", "blue", "green", "yellow", "red", "red")
somecolors
## [1] "blue" "red" "green" "blue" "green" "yellow" "red" "red"
summary(somecolors)
## Length Class Mode
## 8 character character
somecolors <- factor(somecolors)
somecolors
## [1] blue red green blue green yellow red red
## Levels: blue green red yellow
summary(somecolors)
## blue green red yellow
## 2 2 3 1
amatrix <- matrix(data=c(someprimes, primestime2), nrow=2, ncol=6)
amatrix
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 1 3 7 2 6 14
## [2,] 2 5 11 4 10 22
nrow(amatrix)
## [1] 2
ncol(amatrix)
## [1] 6
t(amatrix)
## [,1] [,2]
## [1,] 1 2
## [2,] 3 5
## [3,] 7 11
## [4,] 2 4
## [5,] 6 10
## [6,] 14 22
amatrix[1:2,3]
## [1] 7 11
amatrix[,3]
## [1] 7 11
primestime3 <- someprimes*3
amatrix <- rbind(amatrix, primestime3)
amatrix
## [,1] [,2] [,3] [,4] [,5] [,6]
## 1 3 7 2 6 14
## 2 5 11 4 10 22
## primestime3 3 6 9 15 21 33
John <- list(gender="male", age="ancient", height=72, spawn=3, spawn_ages=c(.5,2,5))
John
## $gender
## [1] "male"
##
## $age
## [1] "ancient"
##
## $height
## [1] 72
##
## $spawn
## [1] 3
##
## $spawn_ages
## [1] 0.5 2.0 5.0
bondnames <- c("connery", "lazenby", "moore", "dalton", "brosnan", "craig")
firstyear <- c(1962, 1969, 1973, 1987, 1995, 2006)
eyecolor <- c("brown", "brown", "blue", "green", "blue", "blue")
womenkissed <- c(17, 3, 20, 4, 12, 4)
countofbondjamesbonds <- c(3,2,10,2,5,1)
bonddata <- data.frame(bondnames, firstyear, eyecolor, womenkissed, countofbondjamesbonds)
bonddata
## bondnames firstyear eyecolor womenkissed countofbondjamesbonds
## 1 connery 1962 brown 17 3
## 2 lazenby 1969 brown 3 2
## 3 moore 1973 blue 20 10
## 4 dalton 1987 green 4 2
## 5 brosnan 1995 blue 12 5
## 6 craig 2006 blue 4 1
str(bonddata)
## 'data.frame': 6 obs. of 5 variables:
## $ bondnames : Factor w/ 6 levels "brosnan","connery",..: 2 5 6 4 1 3
## $ firstyear : num 1962 1969 1973 1987 1995 ...
## $ eyecolor : Factor w/ 3 levels "blue","brown",..: 2 2 1 3 1 1
## $ womenkissed : num 17 3 20 4 12 4
## $ countofbondjamesbonds: num 3 2 10 2 5 1
summary(bonddata)
## bondnames firstyear eyecolor womenkissed
## brosnan:1 Min. :1962 blue :3 Min. : 3.00
## connery:1 1st Qu.:1970 brown:2 1st Qu.: 4.00
## craig :1 Median :1980 green:1 Median : 8.00
## dalton :1 Mean :1982 Mean :10.00
## lazenby:1 3rd Qu.:1993 3rd Qu.:15.75
## moore :1 Max. :2006 Max. :20.00
## countofbondjamesbonds
## Min. : 1.000
## 1st Qu.: 2.000
## Median : 2.500
## Mean : 3.833
## 3rd Qu.: 4.500
## Max. :10.000
bonddata$firstyear <- factor(bonddata$firstyear)
summary(bonddata)
## bondnames firstyear eyecolor womenkissed countofbondjamesbonds
## brosnan:1 1962:1 blue :3 Min. : 3.00 Min. : 1.000
## connery:1 1969:1 brown:2 1st Qu.: 4.00 1st Qu.: 2.000
## craig :1 1973:1 green:1 Median : 8.00 Median : 2.500
## dalton :1 1987:1 Mean :10.00 Mean : 3.833
## lazenby:1 1995:1 3rd Qu.:15.75 3rd Qu.: 4.500
## moore :1 2006:1 Max. :20.00 Max. :10.000
Reading Data into R
getwd()
## [1] "/Users/emiliembolduc/Bard MBA/2019_Spring_Data and Decisions/R/DataSmart_Chapt10"
Loading data
PregnancyData <- read.csv("Pregnancy.csv")
PregnancyData.Test <- read.csv("Pregnancy_Test.csv")
summary(PregnancyData)
## Implied.Gender Home.Apt..PO.Box Pregnancy.Test Birth.Control
## F:495 A:420 Min. :0.000 Min. :0.00
## M:401 H:488 1st Qu.:0.000 1st Qu.:0.00
## U:104 P: 92 Median :0.000 Median :0.00
## Mean :0.075 Mean :0.14
## 3rd Qu.:0.000 3rd Qu.:0.00
## Max. :1.000 Max. :1.00
## Feminine.Hygiene Folic.Acid Prenatal.Vitamins Prenatal.Yoga
## Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000
## 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000
## Median :0.000 Median :0.000 Median :0.000 Median :0.000
## Mean :0.141 Mean :0.106 Mean :0.128 Mean :0.018
## 3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:0.000
## Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.000
## Body.Pillow Ginger.Ale Sea.Bands Stopped.buying.ciggies
## Min. :0.000 Min. :0.000 Min. :0.00 Min. :0.000
## 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.00 1st Qu.:0.000
## Median :0.000 Median :0.000 Median :0.00 Median :0.000
## Mean :0.018 Mean :0.069 Mean :0.03 Mean :0.092
## 3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:0.00 3rd Qu.:0.000
## Max. :1.000 Max. :1.000 Max. :1.00 Max. :1.000
## Cigarettes Smoking.Cessation Stopped.buying.wine Wine
## Min. :0.000 Min. :0.00 Min. :0.00 Min. :0.000
## 1st Qu.:0.000 1st Qu.:0.00 1st Qu.:0.00 1st Qu.:0.000
## Median :0.000 Median :0.00 Median :0.00 Median :0.000
## Mean :0.097 Mean :0.06 Mean :0.13 Mean :0.123
## 3rd Qu.:0.000 3rd Qu.:0.00 3rd Qu.:0.00 3rd Qu.:0.000
## Max. :1.000 Max. :1.00 Max. :1.00 Max. :1.000
## Maternity.Clothes PREGNANT
## Min. :0.000 Min. :0.0
## 1st Qu.:0.000 1st Qu.:0.0
## Median :0.000 Median :0.5
## Mean :0.131 Mean :0.5
## 3rd Qu.:0.000 3rd Qu.:1.0
## Max. :1.000 Max. :1.0
summary(PregnancyData.Test)
## Implied.Gender Home.Apt..PO.Box Pregnancy.Test Birth.Control
## F:251 A:226 Min. :0.000 Min. :0.000
## M:424 H:469 1st Qu.:0.000 1st Qu.:0.000
## U:325 P:305 Median :0.000 Median :0.000
## Mean :0.011 Mean :0.216
## 3rd Qu.:0.000 3rd Qu.:0.000
## Max. :1.000 Max. :1.000
## Feminine.Hygiene Folic.Acid Prenatal.Vitamins Prenatal.Yoga
## Min. :0.000 Min. :0.00 Min. :0.000 Min. :0.000
## 1st Qu.:0.000 1st Qu.:0.00 1st Qu.:0.000 1st Qu.:0.000
## Median :0.000 Median :0.00 Median :0.000 Median :0.000
## Mean :0.209 Mean :0.02 Mean :0.043 Mean :0.005
## 3rd Qu.:0.000 3rd Qu.:0.00 3rd Qu.:0.000 3rd Qu.:0.000
## Max. :1.000 Max. :1.00 Max. :1.000 Max. :1.000
## Body.Pillow Ginger.Ale Sea.Bands Stopped.buying.ciggies
## Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.00
## 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.00
## Median :0.000 Median :0.000 Median :0.000 Median :0.00
## Mean :0.008 Mean :0.032 Mean :0.013 Mean :0.05
## 3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:0.00
## Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.00
## Cigarettes Smoking.Cessation Stopped.buying.wine Wine
## Min. :0.000 Min. :0.000 Min. :0.00 Min. :0.000
## 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.00 1st Qu.:0.000
## Median :0.000 Median :0.000 Median :0.00 Median :0.000
## Mean :0.148 Mean :0.009 Mean :0.08 Mean :0.202
## 3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:0.00 3rd Qu.:0.000
## Max. :1.000 Max. :1.000 Max. :1.00 Max. :1.000
## Maternity.Clothes PREGNANT
## Min. :0.000 Min. :0.00
## 1st Qu.:0.000 1st Qu.:0.00
## Median :0.000 Median :0.00
## Mean :0.052 Mean :0.06
## 3rd Qu.:0.000 3rd Qu.:0.00
## Max. :1.000 Max. :1.00
str(PregnancyData)
## 'data.frame': 1000 obs. of 18 variables:
## $ Implied.Gender : Factor w/ 3 levels "F","M","U": 2 2 2 3 1 1 2 1 1 1 ...
## $ Home.Apt..PO.Box : Factor w/ 3 levels "A","H","P": 1 2 2 2 1 2 2 2 2 2 ...
## $ Pregnancy.Test : int 1 1 1 0 0 0 0 0 0 0 ...
## $ Birth.Control : int 0 0 0 0 0 0 1 0 0 0 ...
## $ Feminine.Hygiene : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Folic.Acid : int 0 0 0 0 0 0 1 0 0 0 ...
## $ Prenatal.Vitamins : int 1 1 0 0 0 1 1 0 0 1 ...
## $ Prenatal.Yoga : int 0 0 0 0 1 0 0 0 0 0 ...
## $ Body.Pillow : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Ginger.Ale : int 0 0 0 1 0 0 0 0 1 0 ...
## $ Sea.Bands : int 0 0 1 0 0 0 0 0 0 0 ...
## $ Stopped.buying.ciggies: int 0 0 0 0 0 1 0 0 0 0 ...
## $ Cigarettes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Smoking.Cessation : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Stopped.buying.wine : int 0 0 0 0 1 0 0 0 0 0 ...
## $ Wine : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Maternity.Clothes : int 0 0 0 0 0 0 0 1 0 1 ...
## $ PREGNANT : int 1 1 1 1 1 1 1 1 1 1 ...
str(PregnancyData.Test)
## 'data.frame': 1000 obs. of 18 variables:
## $ Implied.Gender : Factor w/ 3 levels "F","M","U": 3 3 2 2 2 2 1 1 1 1 ...
## $ Home.Apt..PO.Box : Factor w/ 3 levels "A","H","P": 1 2 2 2 1 2 2 3 3 2 ...
## $ Pregnancy.Test : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Birth.Control : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Feminine.Hygiene : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Folic.Acid : int 0 0 0 0 0 1 0 0 0 0 ...
## $ Prenatal.Vitamins : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Prenatal.Yoga : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Body.Pillow : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Ginger.Ale : int 0 0 0 0 0 1 0 0 0 0 ...
## $ Sea.Bands : int 1 0 0 0 0 0 0 0 0 0 ...
## $ Stopped.buying.ciggies: int 0 0 1 0 0 0 0 0 0 1 ...
## $ Cigarettes : int 0 0 0 0 0 0 0 0 0 1 ...
## $ Smoking.Cessation : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Stopped.buying.wine : int 1 0 0 1 0 1 1 0 0 0 ...
## $ Wine : int 1 0 0 0 0 0 0 0 0 0 ...
## $ Maternity.Clothes : int 0 0 0 0 0 0 1 0 0 1 ...
## $ PREGNANT : int 1 1 1 1 1 1 1 1 1 1 ...
Turn the response variable (1 for pregnant, 0 for not pregnant) into factor for Random Forest function
PregnancyData$PREGNANT <- factor(PregnancyData$PREGNANT)
PregnancyData.Test$PREGNANT <- factor(PregnancyData.Test$PREGNANT)
summary(PregnancyData$PREGNANT)
## 0 1
## 500 500
Start with a logistic regression
Pregnancy.1m <- glm(PREGNANT ~ .,
data = PregnancyData, family = binomial("logit"))
summary(Pregnancy.1m)
##
## Call:
## glm(formula = PREGNANT ~ ., family = binomial("logit"), data = PregnancyData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2012 -0.5566 -0.0246 0.5127 2.8658
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.343597 0.180755 -1.901 0.057315 .
## Implied.GenderM -0.453880 0.197566 -2.297 0.021599 *
## Implied.GenderU 0.141939 0.307588 0.461 0.644469
## Home.Apt..PO.BoxH -0.172927 0.194591 -0.889 0.374180
## Home.Apt..PO.BoxP -0.002813 0.336432 -0.008 0.993329
## Pregnancy.Test 2.370554 0.521781 4.543 5.54e-06 ***
## Birth.Control -2.300272 0.365270 -6.297 3.03e-10 ***
## Feminine.Hygiene -2.028558 0.342398 -5.925 3.13e-09 ***
## Folic.Acid 4.077666 0.761888 5.352 8.70e-08 ***
## Prenatal.Vitamins 2.479469 0.369063 6.718 1.84e-11 ***
## Prenatal.Yoga 2.922974 1.146990 2.548 0.010822 *
## Body.Pillow 1.261037 0.860617 1.465 0.142847
## Ginger.Ale 1.938502 0.426733 4.543 5.55e-06 ***
## Sea.Bands 1.107530 0.673435 1.645 0.100053
## Stopped.buying.ciggies 1.302222 0.342347 3.804 0.000142 ***
## Cigarettes -1.443022 0.370120 -3.899 9.67e-05 ***
## Smoking.Cessation 1.790779 0.512610 3.493 0.000477 ***
## Stopped.buying.wine 1.383888 0.305883 4.524 6.06e-06 ***
## Wine -1.565539 0.348910 -4.487 7.23e-06 ***
## Maternity.Clothes 2.078202 0.329432 6.308 2.82e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1386.29 on 999 degrees of freedom
## Residual deviance: 744.11 on 980 degrees of freedom
## AIC: 784.11
##
## Number of Fisher Scoring iterations: 7
Train a random forest model
Pregnancy.rf <- randomForest(PREGNANT ~ ., data = PregnancyData, importance = TRUE)
Looking at how much each variable contributes to decreasing node impurity on average
varImpPlot(Pregnancy.rf, type = 2)
First the linear regression model prediction
PregnancyData.Test.1m.Preds <- predict(Pregnancy.1m, PregnancyData.Test, type = "response")
summary(PregnancyData.Test.1m.Preds)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.001179 0.066194 0.239459 0.283077 0.414253 0.999211
Second test the random forest model prediction
PregnancyData.Test.rf.Preds <- predict(Pregnancy.rf, PregnancyData.Test, type = "prob")
summary(PregnancyData.Test.rf.Preds)
## 0 1
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.7630 1st Qu.:0.0080
## Median :0.9620 Median :0.0380
## Mean :0.8107 Mean :0.1893
## 3rd Qu.:0.9920 3rd Qu.:0.2370
## Max. :1.0000 Max. :1.0000
Let's check out the two models on one customer
t(PregnancyData.Test[1, ])
## 1
## Implied.Gender "U"
## Home.Apt..PO.Box "A"
## Pregnancy.Test "0"
## Birth.Control "0"
## Feminine.Hygiene "0"
## Folic.Acid "0"
## Prenatal.Vitamins "0"
## Prenatal.Yoga "0"
## Body.Pillow "0"
## Ginger.Ale "0"
## Sea.Bands "1"
## Stopped.buying.ciggies "0"
## Cigarettes "0"
## Smoking.Cessation "0"
## Stopped.buying.wine "1"
## Wine "1"
## Maternity.Clothes "0"
## PREGNANT "1"
t(PregnancyData.Test.1m.Preds[1])
## 1
## [1,] 0.6735358
t(PregnancyData.Test.rf.Preds[1,2])
## [,1]
## [1,] 0.518
Linear regression modeling wins this one with a score of 0.67 (she was pregnant).
pred.1m <- prediction(PregnancyData.Test.1m.Preds, PregnancyData.Test$PREGNANT)
pred.rf <- prediction(PregnancyData.Test.rf.Preds[ ,2], PregnancyData.Test$PREGNANT)
perf.1m <- performance(pred.1m, "tpr", "fpr")
perf.rf <- performance(pred.rf, "tpr", "fpr")
Plot the curves...
plot(perf.1m, xlim = c(0,1), ylim = c(0,1))
plot(perf.rf, xlim = c(0,1), ylim = c(0,1), lty = 2, add = TRUE)