library(reshape2)
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(caret)
## Warning: package 'caret' was built under R version 4.2.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.2.3
## Loading required package: lattice
library(ggplot2)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(class)
# svm
library(e1071)
# lm
library(leaps)
## Warning: package 'leaps' was built under R version 4.2.3
library(forecast) #for accuracy measures
## Warning: package 'forecast' was built under R version 4.2.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# decision tree
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.2.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.2.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(xgboost)
## Warning: package 'xgboost' was built under R version 4.2.3
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
library(Matrix)
library(gains)
dating <- read.csv("Speed Dating Data.csv")
which(names(dating) == "match")
## [1] 13
# Learn the column number of the beginning and end the hobbies. According to the key the begining is sports while yoga is the end.
which(names(dating) == "sports")
## [1] 51
which(names(dating) == "yoga")
## [1] 67
reduced <- dating[,c(13,51:67)]
# Start the imputing process by selecting only the rows without NA
finished_red <- reduced[complete.cases(reduced),]
# Start the imputing process
which(is.na(finished_red))
## integer(0)
# Start building a knn model
# Split Dataset!
#set.seed value as 1
set.seed(1)
# randomly select 60 percent of the row numbers finished_red in order to split.
train.index <- sample(row.names(finished_red), 0.6*dim(finished_red)[1])
# select the rest of row numbers of finished_red by using setdiff()
valid.index <- setdiff(row.names(finished_red), train.index)
# take the 60 percent of random row numbers and call it train.df
train.df <- finished_red[train.index, ]
# take the rest and call it valid.df
valid.df <- finished_red[valid.index, ]
# make sure our validating outcome variables as factor so they become a categorical variable.
valid.df$match <-as.factor(valid.df$match)
train.df$match <-as.factor(train.df$match)
# Normalize
train.norm.df <- train.df
valid.norm.df <- valid.df
norm.df <- finished_red
norm.values <- preProcess(train.df[,-1], method=c("center", "scale"))
train.norm.df[,-1] <- predict(norm.values, train.df[,-1])
valid.norm.df[,-1] <- predict(norm.values, valid.df[,-1])
norm.df[,-1] <- predict(norm.values, finished_red[,-1])
############################################ knn
# use knn with k = 3 which means "number of neighbours considered" = 3 instead of 2
knn.pred <- knn(train.norm.df[,-1], valid.norm.df[,-1],
cl = train.norm.df[, 1], k = 3)
summary(knn.pred)
## 0 1
## 3189 131
# Accuracy
length(which(knn.pred == valid.df$match))/nrow(valid.df)
## [1] 0.8337349
# initialize a data frame with two columns: k, and accuracy.
accuracy.df <- data.frame(k = seq(1, 10, 1), accuracy = rep(0, 10))
# Use a for loop to do what we did above for each row of accuracy.df
for(i in 1:10) {
knn.pred <- knn(train.norm.df[,-1], valid.norm.df[,-1],
cl = train.norm.df[, 1], k = i)
accuracy.df[i, 2] <- confusionMatrix(knn.pred, valid.norm.df[,1])$overall[1]
}
accuracy.df
## k accuracy
## 1 1 0.8307229
## 2 2 0.8316265
## 3 3 0.8331325
## 4 4 0.8343373
## 5 5 0.8325301
## 6 6 0.8334337
## 7 7 0.8352410
## 8 8 0.8322289
## 9 9 0.8337349
## 10 10 0.8346386
kvalue <- which.max(accuracy.df$accuracy)
knn.pred.new <- knn(train.norm.df[, -1], valid.norm.df[, -1],
cl = train.norm.df[, 1], k = kvalue)
summary(knn.pred.new)
## 0 1
## 3238 82
# Accuracy
length(which(knn.pred.new == valid.df$match))/nrow(valid.df)
## [1] 0.8346386
valid.df$pred <- knn.pred.new
valid.df$diff <- valid.df$match == knn.pred.new
wrong <- valid.df[which(!valid.df$diff),]
wrong <- wrong[,1:18]
melt_data <- melt(wrong, id = c("match"))
ggplot(melt_data, aes(x = as.factor(value))) +
geom_bar() +
theme_bw()+
facet_wrap(~variable, ncol=3)

## Ten fold cross validation in caret
train.df$match <- as.factor(train.df$match)
trControl <- trainControl(method = "cv",
number = 10)
fit <- train(match ~ .,
method = "knn",
tuneGrid = expand.grid(k = 1:10),
trControl = trControl,
metric = "Accuracy",
data = train.df)
fit
## k-Nearest Neighbors
##
## 4979 samples
## 17 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 4482, 4481, 4480, 4481, 4482, 4482, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.8174320 0.094075148
## 2 0.8188352 0.093775424
## 3 0.8216461 0.095016767
## 4 0.8198380 0.083330316
## 5 0.8204417 0.072037316
## 6 0.8222497 0.063888919
## 7 0.8202437 0.034645571
## 8 0.8216485 0.033187163
## 9 0.8236573 0.022672824
## 10 0.8232561 0.001285542
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
############################# logistic regression
# in creating the folds we specify the target feature (dependent variable) and # of folds
folds = createFolds(train.df$match, k = 10)
# in cv we are going to applying a created function to our 'folds'
cv = lapply(folds, function(x) { # start of function
# in the next two lines we will separate the Training set into it's 10 pieces
training_fold = train.df[-x, ] # training fold = training set minus (-) it's sub test fold
test_fold = train.df[x, ] # here we describe the test fold individually
# now apply (train) the classifer on the training_fold
classifier = svm(formula = match ~ .,
data = train.df,
type = 'C-classification',
kernel = 'radial')
# next step in the loop, we calculate the predictions and cm and we equate the accuracy
# note we are training on training_fold and testing its accuracy on the test_fold
y_pred = predict(classifier, newdata = test_fold[,-1])
cm = table(test_fold[, 1], y_pred)
accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])
return(accuracy)
})
cv
## $Fold01
## [1] 0.8293173
##
## $Fold02
## [1] 0.8313253
##
## $Fold03
## [1] 0.8370221
##
## $Fold04
## [1] 0.8350101
##
## $Fold05
## [1] 0.8333333
##
## $Fold06
## [1] 0.8373494
##
## $Fold07
## [1] 0.8336673
##
## $Fold08
## [1] 0.8333333
##
## $Fold09
## [1] 0.8333333
##
## $Fold10
## [1] 0.8373494
which.max(cv)
## Fold06
## 6
f<- folds[[which.max(cv)]]
# $Fold01 is the best in terms of accuracy (0.8409543)
##########Now lets build a model!!
set.seed(1)
# since fold 1 is most accurate we shall use folds$Fold01 to create our train and test dataset!
training_fold = train.df[-f, ]
# now apply (train) the classifer on the training_fold
classifier = svm(formula = match ~ .,
data = train.df,
type = 'C-classification',
kernel = 'radial')
summary(classifier)
##
## Call:
## svm(formula = match ~ ., data = train.df, type = "C-classification",
## kernel = "radial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 2007
##
## ( 1171 836 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
# next step in the loop, we calculate the predictions and cm and we equate the accuracy
# note we are training on training_fold and testing its accuracy on the test_fold
y_pred = predict(classifier, newdata = valid.df[,-1])
cm = table(valid.df[, 1], y_pred)
accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])
accuracy
## [1] 0.8418675
################################### Random Forest
# in creating the folds we specify the target feature (dependent variable) and # of folds
folds = createFolds(train.df$match, k = 10)
# in cv we are going to applying a created function to our 'folds'
cv = lapply(folds, function(x) { # start of function
# in the next two lines we will separate the Training set into it's 10 pieces
training_fold = train.df[-x, ] # training fold = training set minus (-) it's sub test fold
test_fold = train.df[x, ] # here we describe the test fold individually
levels(test_fold$match) <- levels(training_fold$match)
# now apply (train) the classifer on the training_fold
classifier = randomForest(match~., data=training_fold, proximity=TRUE)
# next step in the loop, we calculate the predictions and cm and we equate the accuracy
# note we are training on training_fold and testing its accuracy on the test_fold
y_pred = predict(classifier, newdata = test_fold)
cm = table(test_fold[, 9], y_pred)
accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])
return(accuracy)
})
cv
## $Fold01
## [1] 0.1023622
##
## $Fold02
## [1] 0.05109489
##
## $Fold03
## [1] 0.06153846
##
## $Fold04
## [1] 0.08547009
##
## $Fold05
## [1] 0.08270677
##
## $Fold06
## [1] 0.008928571
##
## $Fold07
## [1] 0.05594406
##
## $Fold08
## [1] 0.06422018
##
## $Fold09
## [1] 0.0754717
##
## $Fold10
## [1] 0.08064516
which.max(cv)
## Fold01
## 1
#Random Forrest cannot predict with high accuracy for this dataset
##########Now lets build a model!!
set.seed(1)
# since fold 1 is most accurate we shall use folds$Fold01 to create our train and test dataset!
training_fold = train.df[-f, ]
test_fold = valid.df # here we describe the test fold individually
# now apply (train) the classifer on the training_fold
classifier = randomForest(formula = match ~ ., data=training_fold, proximity=TRUE)
summary(classifier)
## Length Class Mode
## call 4 -none- call
## type 1 -none- character
## predicted 4481 factor numeric
## err.rate 1500 -none- numeric
## confusion 6 -none- numeric
## votes 8962 matrix numeric
## oob.times 4481 -none- numeric
## classes 2 -none- character
## importance 17 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 20079361 -none- numeric
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 4481 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
# next step in the loop, we calculate the predictions and cm and we equate the accuracy
# note we are training on training_fold and testing its accuracy on the test_fold
y_pred = predict(classifier, newdata = test_fold[,-1])
cm = table(test_fold[, 1], y_pred)
accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])
accuracy
## [1] 0.8301205
##############################################Decision Tree - 10 fold Cross validation
valid.df$match <-as.factor(valid.df$match)
train.df$match <-as.factor(train.df$match)
folds = createFolds(train.df$match, k = 10)
cv = lapply(folds, function(x) { # start of function
# in the next two lines we will separate the Training set into it's 10 pieces
training_fold = train.df[-x, ] # training fold = training set minus (-) it's sub test fold
test_fold = train.df[x, ] # here we describe the test fold individually
levels(test_fold$match) <- levels(training_fold$match)
# now apply (train) the classifer on the training_fold
tr <- rpart(match ~ ., data = training_fold, cp=0.0001, minbucket = 5, maxdepth = 7)
pfit<- prune(tr, cp = tr$cptable[which.min(tr$cptable[,"xerror"]),"CP"])
# next step in the loop, we calculate the predictions and cm and we equate the accuracy
# note we are training on training_fold and testing its accuracy on the test_fold
y_pred = predict(pfit, test_fold, type = "class")
cm = table(test_fold[, 1], y_pred)
accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])
return(accuracy)
})
cv
## $Fold01
## [1] 0.832998
##
## $Fold02
## [1] 0.8316633
##
## $Fold03
## [1] 0.832998
##
## $Fold04
## [1] 0.8313253
##
## $Fold05
## [1] 0.8313253
##
## $Fold06
## [1] 0.8313253
##
## $Fold07
## [1] 0.8313253
##
## $Fold08
## [1] 0.8333333
##
## $Fold09
## [1] 0.8236473
##
## $Fold10
## [1] 0.832998
which.max(cv)
## Fold08
## 8
f<- folds[[which.max(cv)]]
##############################################Decision Tree
#fit a classification tree model (Competitive. is the outcome variable, and all other variables are predictors).
tr <- rpart(match ~ ., data = train.df[-f], cp=0.0001, minbucket = 5, maxdepth = 7)
rpart.plot(tr, extra = 106)

#print cp of the newly fitted model.
options(scipen = 10)
printcp(tr)
##
## Classification tree:
## rpart(formula = match ~ ., data = train.df[-f], cp = 0.0001,
## minbucket = 5, maxdepth = 7)
##
## Variables actually used in tree construction:
## [1] clubbing concerts gaming music reading shopping sports theater
## [9] tvsports yoga
##
## Root node error: 836/4979 = 0.16791
##
## n= 4979
##
## CP nsplit rel error xerror xstd
## 1 0.00287081 0 1.00000 1.0000 0.031549
## 2 0.00239234 5 0.98565 1.0108 0.031684
## 3 0.00119617 8 0.97847 1.0096 0.031669
## 4 0.00029904 12 0.97368 1.0227 0.031832
## 5 0.00010000 16 0.97249 1.0239 0.031847
# determine a nested sequence of subtrees on the rpart object "tr" to be trimmed based on the complexity parameter that has the lowest standard error.
pfit<- prune(tr, cp = tr$cptable[which.min(tr$cptable[,"xerror"]),"CP"])
# print out the lowest standard error (xerror)
tr$cptable[which.min(tr$cptable[,"xerror"]),"CP"]
## [1] 0.002870813
#Q6: Based on the above calculation, which number of splits indicates the minimum complexity parameters?
prp(tr)

t(t(names(finished_red)))
## [,1]
## [1,] "match"
## [2,] "sports"
## [3,] "tvsports"
## [4,] "exercise"
## [5,] "dining"
## [6,] "museums"
## [7,] "art"
## [8,] "hiking"
## [9,] "gaming"
## [10,] "clubbing"
## [11,] "reading"
## [12,] "tv"
## [13,] "theater"
## [14,] "movies"
## [15,] "concerts"
## [16,] "music"
## [17,] "shopping"
## [18,] "yoga"
#which are the most important variables
t(t(tr$variable.importance))
## [,1]
## clubbing 13.641537
## reading 11.616833
## theater 10.476731
## gaming 10.259535
## concerts 6.367454
## shopping 6.161475
## sports 5.778132
## dining 5.561181
## music 5.088161
## tvsports 4.860007
## movies 3.661107
## tv 2.644789
## museums 2.280793
## art 2.279216
## exercise 1.600611
## yoga 1.353388
#set of rules
tr
## n= 4979
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 4979 836 0 (0.83209480 0.16790520)
## 2) reading< 5.5 767 86 0 (0.88787484 0.11212516)
## 4) tvsports>=9.5 76 2 0 (0.97368421 0.02631579) *
## 5) tvsports< 9.5 691 84 0 (0.87843705 0.12156295)
## 10) yoga< 2.5 250 20 0 (0.92000000 0.08000000)
## 20) concerts< 9 225 13 0 (0.94222222 0.05777778) *
## 21) concerts>=9 25 7 0 (0.72000000 0.28000000)
## 42) sports< 7.5 18 3 0 (0.83333333 0.16666667) *
## 43) sports>=7.5 7 3 1 (0.42857143 0.57142857) *
## 11) yoga>=2.5 441 64 0 (0.85487528 0.14512472) *
## 3) reading>=5.5 4212 750 0 (0.82193732 0.17806268)
## 6) clubbing< 5.5 1698 247 0 (0.85453475 0.14546525) *
## 7) clubbing>=5.5 2514 503 0 (0.79992045 0.20007955)
## 14) reading>=6.5 2262 432 0 (0.80901857 0.19098143)
## 28) theater>=1.5 2257 427 0 (0.81081081 0.18918919)
## 56) music< 8.5 1228 204 0 (0.83387622 0.16612378)
## 112) shopping>=6.5 602 82 0 (0.86378738 0.13621262)
## 224) clubbing< 9.5 597 78 0 (0.86934673 0.13065327) *
## 225) clubbing>=9.5 5 1 1 (0.20000000 0.80000000) *
## 113) shopping< 6.5 626 122 0 (0.80511182 0.19488818) *
## 57) music>=8.5 1029 223 0 (0.78328474 0.21671526) *
## 29) theater< 1.5 5 0 1 (0.00000000 1.00000000) *
## 15) reading< 6.5 252 71 0 (0.71825397 0.28174603)
## 30) gaming< 7.5 221 52 0 (0.76470588 0.23529412)
## 60) shopping< 7.5 163 30 0 (0.81595092 0.18404908)
## 120) sports>=2.5 155 25 0 (0.83870968 0.16129032) *
## 121) sports< 2.5 8 3 1 (0.37500000 0.62500000) *
## 61) shopping>=7.5 58 22 0 (0.62068966 0.37931034)
## 122) theater< 9.5 42 12 0 (0.71428571 0.28571429) *
## 123) theater>=9.5 16 6 1 (0.37500000 0.62500000) *
## 31) gaming>=7.5 31 12 1 (0.38709677 0.61290323)
## 62) sports< 6 11 5 0 (0.54545455 0.45454545) *
## 63) sports>=6 20 6 1 (0.30000000 0.70000000) *
valid.df$pred <- predict(pfit, valid.df, type = "class")
corr <- which(valid.df$pred == valid.df$match)
base_accuracy <- length(corr)/nrow(valid.df)
base_accuracy
## [1] 0.8406627