library(readxl)
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
TransFood <- read_xlsx("C:/Users/Lynx/Documents/MSDA/621/TransFood.xlsx")
# coerce the 0/1 incidence matrix into an arules transactions object
transfood <- as(as.matrix(TransFood), "transactions")
summary(transfood)
## transactions as itemMatrix in sparse format with
## 19076 rows (elements/itemsets/transactions) and
## 118 columns (items) and a density of 0.02230729
##
## most frequent items:
## Bottled.WaterFood Slice.of.CheeseFood Medium.DrinkFood Small.DrinkFood
## 3166 3072 2871 2769
## Slice.of.PeppFood (Other)
## 2354 35981
##
## element (itemset/transaction) length distribution:
## sizes
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 15
## 197 5675 5178 3253 2129 1293 655 351 178 95 42 14 8 7 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 2.000 2.632 4.000 15.000
##
## includes extended item information - examples:
## labels
## 1 Add.CheeseFood
## 2 BeerFood
## 3 Bottled.WaterFood
The largest total number of items purchased by a customer during one visit is 15.
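This can be checked directly with the size() function from arules, which returns the number of items in each transaction; a quick sketch:
# size() gives the item count of each transaction; the maximum should match the 15 seen in the summary
max(size(transfood))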
# plot only items with support of at least 10%; cex.names shrinks the item labels
itemFrequencyPlot(transfood, support = 0.1, cex.names = 0.8)
Bottled Water is the item customers purchase most frequently.
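As a cross-check on the plot, the item supports can be listed directly; a short sketch using itemFrequency() from arules:
# supports of the five most frequent items, in decreasing order
head(sort(itemFrequency(transfood), decreasing = TRUE), 5)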
# mine association rules with minimum support 1% and minimum confidence 50%
transfood_rules <- apriori(transfood, parameter = list(sup = 0.01, conf = 0.5, target = "rules"))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 0.01 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 190
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[115 item(s), 19076 transaction(s)] done [0.00s].
## sorting and recoding items ... [45 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 done [0.00s].
## writing ... [8 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
summary(transfood_rules)
## set of 8 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3
## 5 3
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 2.000 2.000 2.375 3.000 3.000
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.01012 Min. :0.5152 Min. :0.01735 Min. : 3.199
## 1st Qu.:0.01290 1st Qu.:0.5516 1st Qu.:0.02115 1st Qu.: 3.534
## Median :0.01594 Median :0.6398 Median :0.02432 Median : 7.687
## Mean :0.01641 Mean :0.6790 Mean :0.02387 Mean : 6.965
## 3rd Qu.:0.01776 3rd Qu.:0.7672 3rd Qu.:0.02696 3rd Qu.: 8.331
## Max. :0.02857 Max. :0.9982 Max. :0.02862 Max. :13.181
## count
## Min. :193.0
## 1st Qu.:246.0
## Median :304.0
## Mean :313.1
## 3rd Qu.:338.8
## Max. :545.0
##
## mining info:
## data ntransactions support confidence
## transfood 19076 0.01 0.5
## call
## apriori(data = transfood, parameter = list(sup = 0.01, conf = 0.5, target = "rules"))
a) How many association rules can you find that have support of at least 1% and confidence of at least 50%?
transfood_rules
## set of 8 rules
Eight association rules meet the thresholds of at least 1% support and at least 50% confidence.
b) What is the smallest value of lift obtained for the association rules calculated in this question?
The smallest lift value, per the summary above, is 3.199.
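The same value can be pulled programmatically from the rules' quality measures; a minimal sketch:
# quality() returns the data frame of rule quality measures (support, confidence, lift, ...)
min(quality(transfood_rules)$lift)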
c) Based on the values of lift for each rule, are the items in each rule frequently purchased together? Or are the items seldom purchased together?
inspect(head(transfood_rules))
## lhs rhs support confidence coverage lift count
## [1] {Hot.Chocolate.Souvenir.RefillFood} => {Hot.Chocolate.SouvenirFood} 0.01499266 0.5596869 0.02678759 13.180972 286
## [2] {ToppingFood} => {Ice.Cream.ConeFood} 0.02856993 0.9981685 0.02862235 8.947868 545
## [3] {Add.CheeseFood} => {Soft.Pretzel..3_39Food} 0.01913399 0.6965649 0.02746907 7.601643 365
## [4] {Chicken.TendersFood} => {French.Fries.BasketFood} 0.01729922 0.7586207 0.02280352 7.771992 330
## [5] {CheeseburgerFood} => {French.Fries.BasketFood} 0.01687985 0.7931034 0.02128329 8.125264 322
## [6] {GatoradeFood,
## Slice.of.PeppFood} => {Slice.of.CheeseFood} 0.01011743 0.5830816 0.01735165 3.620724 193
All eight rules have lift values well above 1 (minimum 3.199, maximum 13.181), so the items in each rule are purchased together far more often than would be expected if they were bought independently.
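To see which associations are strongest, the rules can also be ordered by lift before inspecting them:
# list all eight rules from highest to lowest lift
inspect(sort(transfood_rules, by = "lift"))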
d) What is the average confidence obtained for the association rules calculated in this question?
The average confidence is 0.679.
e) What is the median support obtained for the association rules calculated in this question?
The median support is 0.01594.
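Both figures can be computed directly from the quality measures rather than read off the summary; a quick sketch:
mean(quality(transfood_rules)$confidence)   # should agree with the summary mean of 0.679
median(quality(transfood_rules)$support)    # should agree with the summary median of 0.01594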
f) Create a plot to visualize the support, confidence, and lift of the rules calculated in this question.
library(arulesViz)
plot(transfood_rules)
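With only eight rules, a graph-based view is also readable; one alternative that arulesViz offers:
# draw the rules as a graph, with items and rules as nodes
plot(transfood_rules, method = "graph")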
g) Choose two rules, and explain how the store could use these rules to improve sales.
The store could use rule [2] ({ToppingFood} => {Ice.Cream.ConeFood}) and rule [3] ({Add.CheeseFood} => {Soft.Pretzel..3_39Food}) to guide its layout: positioning toppings near the ice cream cones and add-on cheese near the soft pretzels could improve sales, since these item pairs are frequently purchased together.
Titanic <- as.data.frame(read_xlsx("C:/Users/Lynx/Documents/MSDA/621/Titanic passenger list.xlsx"))
str(Titanic)
## 'data.frame': 1309 obs. of 7 variables:
## $ Passenger_Class : num 1 1 1 1 1 1 1 1 1 1 ...
## $ Survived : chr "Lived" "Lived" "Died" "Died" ...
## $ Sex : chr "female" "male" "female" "male" ...
## $ Sibling_or_Spouse: num 0 1 1 1 1 0 1 0 2 0 ...
## $ Parent_or_Child : num 0 2 2 2 2 0 0 0 0 0 ...
## $ Fare : num 211 152 152 152 152 ...
## $ Embarked : chr "Southhampton" "Southhampton" "Southhampton" "Southhampton" ...
# convert the categorical columns to factors
Titanic$Survived <- as.factor(Titanic$Survived)
Titanic$Sex <- as.factor(Titanic$Sex)
Titanic$Embarked <- as.factor(Titanic$Embarked)
# count the missing values in the full data set
sum(is.na(Titanic))
## [1] 3
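To see which columns contain the three missing values, the NA counts can be broken out by column:
# number of missing values per column
colSums(is.na(Titanic))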
a) Create a table and/or graph displaying the number of passengers who embarked from each location (i.e., the number of passengers who embarked from Cherbourg, the number who embarked from Queenstown, etc.). From which location did the largest number of passengers embark? Impute all missing values in the Embarked column with this location.
summary(Titanic$Embarked)
## Cherbourg Queenstown Southhampton NA's
## 270 123 914 2
barplot(table(Titanic$Embarked))
Titanic$Embarked[is.na(Titanic$Embarked)] <- "Southhampton"
table(Titanic$Embarked)
##
## Cherbourg Queenstown Southhampton
## 270 123 916
b) For any missing value(s) in a numeric column(s) of data, impute the average of the column.
summary(is.na(Titanic))
## Passenger_Class Survived Sex Sibling_or_Spouse
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:1309 FALSE:1309 FALSE:1309 FALSE:1309
##
## Parent_or_Child Fare Embarked
## Mode :logical Mode :logical Mode :logical
## FALSE:1309 FALSE:1308 FALSE:1309
## TRUE :1
# impute the column mean for the single missing Fare value
Titanic$Fare[is.na(Titanic$Fare)] <- mean(Titanic$Fare, na.rm = TRUE)
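A quick check confirms that the two imputation steps handled all of the missing values:
# after imputing Embarked and Fare, no NAs should remain
sum(is.na(Titanic))   # expected to return 0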
# randomly split the data: 90% training, 10% testing
index <- sample(nrow(Titanic), nrow(Titanic) * 0.90)
titanic_train <- Titanic[index, ]
titanic_test <- Titanic[-index, ]
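Because no seed is set, the split (and the downstream error estimates) will differ between runs. A reproducible variant would set one first; the seed value below is arbitrary and chosen only for illustration:
set.seed(621)   # arbitrary illustrative seed; any fixed value makes the split repeatable
index <- sample(nrow(Titanic), nrow(Titanic) * 0.90)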
library(ipred)
# bagged classification trees with 300 bootstrap replications
titanic_bag <- bagging(formula = Survived ~ ., data = titanic_train, nbagg = 300)
titanic_bag
##
## Bagging classification trees with 300 bootstrap replications
##
## Call: bagging.data.frame(formula = Survived ~ ., data = titanic_train,
## nbagg = 300)
a) What is the out-of-bag error for your model?
titanic_bag_oob <- bagging(formula = Survived~.,
data = titanic_train,
coob = T,
nbagg = 300)
titanic_bag_oob
##
## Bagging classification trees with 300 bootstrap replications
##
## Call: bagging.data.frame(formula = Survived ~ ., data = titanic_train,
## coob = T, nbagg = 300)
##
## Out-of-bag estimate of misclassification error: 0.2241
b) Use your model to make predictions for the observations in your testing set.
titanic_bag_pred <- predict(titanic_bag, newdata = titanic_test)
summary(titanic_bag_pred)
## Died Lived
## 88 43
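Since the testing set retains the actual outcomes, the predictions can be scored against them; a sketch of one common check:
# cross-tabulate predicted vs. actual survival on the testing set
table(Predicted = titanic_bag_pred, Actual = titanic_test$Survived)
# overall test-set accuracy
mean(titanic_bag_pred == titanic_test$Survived)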
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
# random forest with 300 trees; importance = TRUE stores variable importance measures
titanic_rf <- randomForest(Survived ~ ., data = titanic_train, importance = TRUE, ntree = 300)
a) What is the out-of-bag error for your model?
titanic_rf
##
## Call:
## randomForest(formula = Survived ~ ., data = titanic_train, importance = TRUE, ntree = 300)
## Type of random forest: classification
## Number of trees: 300
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 20.54%
## Confusion matrix:
## Died Lived class.error
## Died 653 74 0.1017882
## Lived 168 283 0.3725055
# for classification, the OOB estimate of the misclassification error rate is reported in the output above
# (the mean of squared residuals is the OOB error only for regression forests)
b) Based on the out-of-bag error, is the random forest better at predicting survival than the bagging model? Or is the bagging model better than the random forest? Or do both models seem to perform with about the same accuracy?
Based on the out-of-bag errors above, the random forest (0.2054) is slightly better at predicting survival than the bagging model (0.2241).
c) What is the false positive rate (for your random forest model developed with 300 trees)?
Treating "Lived" as the positive class, the false positive rate is the proportion of passengers who actually died but were predicted to survive: FPR = 74 / (653 + 74) ≈ 0.1018 (the class.error for Died in the confusion matrix above).
d) What is the false negative rate (for your random forest model developed with 300 trees)?
The false negative rate is the proportion of passengers who actually survived but were predicted to die: FNR = 168 / (168 + 283) ≈ 0.3725 (the class.error for Lived).
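Both rates can also be computed from the confusion matrix stored on the fitted model, again treating "Lived" as the positive class; a minimal sketch:
cm <- titanic_rf$confusion            # rows = actual class, columns = predicted class
fpr <- cm["Died", "Lived"] / sum(cm["Died", c("Died", "Lived")])    # died, predicted to live
fnr <- cm["Lived", "Died"] / sum(cm["Lived", c("Died", "Lived")])   # lived, predicted to die
c(FPR = fpr, FNR = fnr)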
e) Create a plot comparing the out-of-bag error, the false positive rate, the false negative rate, and the number of trees. Based on your plot, does it appear that a large number of trees are needed to develop a fairly accurate random forest model?
# plot.randomForest draws the OOB error and each per-class error against the number of trees;
# with "Lived" as the positive class, the Died curve corresponds to the FPR and the Lived curve to the FNR
plot(titanic_rf, lwd = rep(2, 3))
legend("right", legend = c("OOB Error", "FPR", "FNR"), lwd = rep(2, 3), lty = c(1, 2, 3), col = c("black", "red", "green"))
Error curves in plots of this kind typically level off well before 300 trees, so a very large number of trees does not appear necessary to reach a fairly accurate model.
f) Use your random forest model to make predictions for the observations in your testing set.
titanic_rf_pred <- predict(titanic_rf, newdata = titanic_test)
summary(titanic_rf_pred)
## Died Lived
## 91 40
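As with the bagging model, these predictions can be scored against the actual outcomes for a direct test-set comparison of the two models:
# cross-tabulate the random forest predictions against actual survival
table(Predicted = titanic_rf_pred, Actual = titanic_test$Survived)
mean(titanic_rf_pred == titanic_test$Survived)   # test-set accuracy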