library(reshape2)
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(caret)
## Warning: package 'caret' was built under R version 4.2.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.2.3
## Loading required package: lattice
library(ggplot2)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(class)
# svm
library(e1071)
# lm
library(leaps)
## Warning: package 'leaps' was built under R version 4.2.3
library(forecast) #for accuracy measures
## Warning: package 'forecast' was built under R version 4.2.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# decision tree
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.2.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.2.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(xgboost)
## Warning: package 'xgboost' was built under R version 4.2.3
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
library(Matrix)
library(gains)
dating <- read.csv("Speed Dating Data.csv")
which(names(dating) == "match")
## [1] 13
# Learn the column number of the beginning and end the hobbies. According to the key the begining is sports while yoga is the end.
which(names(dating) == "sports")
## [1] 51
which(names(dating) == "yoga")
## [1] 67
reduced <- dating[,c(13,51:67)]
# Start the imputing process by selecting only the rows without NA
finished_red <- reduced[complete.cases(reduced),]
# Start the imputing process
which(is.na(finished_red))
## integer(0)
# Start building a knn model
# Split Dataset!
#set.seed value as 1
set.seed(1)
# randomly select 60 percent of the row numbers finished_red in order to split.
train.index <- sample(row.names(finished_red), 0.6*dim(finished_red)[1])
# select the rest of row numbers of finished_red by using setdiff()
valid.index <- setdiff(row.names(finished_red), train.index)
# take the 60 percent of random row numbers and call it train.df
train.df <- finished_red[train.index, ]
# take the rest and call it valid.df
valid.df <- finished_red[valid.index, ]
# make sure our validating outcome variables as factor so they become a categorical variable.
valid.df$match <-as.factor(valid.df$match)
train.df$match <-as.factor(train.df$match)
# Normalize
train.norm.df <- train.df
valid.norm.df <- valid.df
norm.df <- finished_red
norm.values <- preProcess(train.df[,-1], method=c("center", "scale"))
train.norm.df[,-1] <- predict(norm.values, train.df[,-1])
valid.norm.df[,-1] <- predict(norm.values, valid.df[,-1])
norm.df[,-1] <- predict(norm.values, finished_red[,-1])
############################################ knn
# use knn with k = 3 which means "number of neighbours considered" = 3 instead of 2
knn.pred <- knn(train.norm.df[,-1], valid.norm.df[,-1],
cl = train.norm.df[, 1], k = 3)
summary(knn.pred)
## 0 1
## 3189 131
# Accuracy
length(which(knn.pred == valid.df$match))/nrow(valid.df)
## [1] 0.8337349
# initialize a data frame with two columns: k, and accuracy.
accuracy.df <- data.frame(k = seq(1, 10, 1), accuracy = rep(0, 10))
# Use a for loop to do what we did above for each row of accuracy.df
for(i in 1:10) {
knn.pred <- knn(train.norm.df[,-1], valid.norm.df[,-1],
cl = train.norm.df[, 1], k = i)
accuracy.df[i, 2] <- confusionMatrix(knn.pred, valid.norm.df[,1])$overall[1]
}
accuracy.df
## k accuracy
## 1 1 0.8307229
## 2 2 0.8316265
## 3 3 0.8331325
## 4 4 0.8343373
## 5 5 0.8325301
## 6 6 0.8334337
## 7 7 0.8352410
## 8 8 0.8322289
## 9 9 0.8337349
## 10 10 0.8346386
# k = 10 is the best (0.8323389)
knn.pred.new <- knn(train.norm.df[, -1], valid.norm.df[, -1],
cl = train.norm.df[, 1], k = 10)
summary(knn.pred.new)
## 0 1
## 3266 54
# Accuracy
length(which(knn.pred.new == valid.df$match))/nrow(valid.df)
## [1] 0.8340361
valid.df$pred <- knn.pred.new
valid.df$diff <- valid.df$match == knn.pred.new
wrong <- valid.df[which(!valid.df$diff),]
wrong <- wrong[,1:18]
melt_data <- melt(wrong, id = c("match"))
ggplot(melt_data, aes(x = as.factor(value))) +
geom_bar() +
theme_bw()+
facet_wrap(~variable, ncol=3)

## Ten fold cross validation in caret
train.df$match <- as.factor(train.df$match)
trControl <- trainControl(method = "cv",
number = 10)
fit <- train(match ~ .,
method = "knn",
tuneGrid = expand.grid(k = 1:10),
trControl = trControl,
metric = "Accuracy",
data = train.df)
fit
## k-Nearest Neighbors
##
## 4979 samples
## 17 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 4481, 4481, 4480, 4482, 4482, 4480, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.8158268 0.09031820
## 2 0.8172324 0.09010522
## 3 0.8192380 0.08754955
## 4 0.8176328 0.08264229
## 5 0.8194429 0.06992110
## 6 0.8208477 0.05614832
## 7 0.8208497 0.04015440
## 8 0.8230557 0.04531414
## 9 0.8244622 0.03286815
## 10 0.8254674 0.01568135
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 10.
############################# logistic regression
# in creating the folds we specify the target feature (dependent variable) and # of folds
folds = createFolds(train.df$match, k = 10)
# in cv we are going to applying a created function to our 'folds'
cv = lapply(folds, function(x) { # start of function
# in the next two lines we will separate the Training set into it's 10 pieces
training_fold = train.df[-x, ] # training fold = training set minus (-) it's sub test fold
test_fold = train.df[x, ] # here we describe the test fold individually
# now apply (train) the classifer on the training_fold
classifier = svm(formula = match ~ .,
data = train.df,
type = 'C-classification',
kernel = 'radial')
# next step in the loop, we calculate the predictions and cm and we equate the accuracy
# note we are training on training_fold and testing its accuracy on the test_fold
y_pred = predict(classifier, newdata = test_fold[,-1])
cm = table(test_fold[, 1], y_pred)
accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])
return(accuracy)
})
cv
## $Fold01
## [1] 0.8356713
##
## $Fold02
## [1] 0.8353414
##
## $Fold03
## [1] 0.8333333
##
## $Fold04
## [1] 0.8336673
##
## $Fold05
## [1] 0.8350101
##
## $Fold06
## [1] 0.8353414
##
## $Fold07
## [1] 0.8373494
##
## $Fold08
## [1] 0.8309859
##
## $Fold09
## [1] 0.8293173
##
## $Fold10
## [1] 0.8350101
which.max(cv)
## Fold07
## 7
f<- folds[[which.max(cv)]]
# $Fold01 is the best in terms of accuracy (0.8409543)
##########Now lets build a model!!
set.seed(1)
# since fold 1 is most accurate we shall use folds$Fold01 to create our train and test dataset!
training_fold = train.df[-f, ]
# now apply (train) the classifer on the training_fold
classifier = svm(formula = match ~ .,
data = train.df,
type = 'C-classification',
kernel = 'radial')
summary(classifier)
##
## Call:
## svm(formula = match ~ ., data = train.df, type = "C-classification",
## kernel = "radial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 2007
##
## ( 1171 836 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
# next step in the loop, we calculate the predictions and cm and we equate the accuracy
# note we are training on training_fold and testing its accuracy on the test_fold
y_pred = predict(classifier, newdata = valid.df[,-1])
cm = table(valid.df[, 1], y_pred)
accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])
accuracy
## [1] 0.8418675
################################### Random Forest
# in creating the folds we specify the target feature (dependent variable) and # of folds
folds = createFolds(train.df$match, k = 10)
train.df[] <- lapply(train.df, as.numeric)
valid.df[] <- lapply(valid.df, as.numeric)
# in cv we are going to applying a created function to our 'folds'
cv = lapply(folds, function(x) { # start of function
# in the next two lines we will separate the Training set into it's 10 pieces
training_fold = train.df[-x, ] # training fold = training set minus (-) it's sub test fold
test_fold = train.df[x, ] # here we describe the test fold individually
levels(test_fold$match) <- levels(training_fold$match)
# now apply (train) the classifer on the training_fold
classifier = randomForest(match~., data=training_fold, proximity=TRUE)
# next step in the loop, we calculate the predictions and cm and we equate the accuracy
# note we are training on training_fold and testing its accuracy on the test_fold
y_pred = predict(classifier, newdata = test_fold)
cm = table(test_fold[, 9], y_pred)
accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])
return(accuracy)
})
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
cv
## $Fold01
## [1] 0
##
## $Fold02
## [1] 0
##
## $Fold03
## [1] 0.09677419
##
## $Fold04
## [1] 0.08695652
##
## $Fold05
## [1] 0
##
## $Fold06
## [1] 0
##
## $Fold07
## [1] 0.2173913
##
## $Fold08
## [1] 0
##
## $Fold09
## [1] 0
##
## $Fold10
## [1] 0.08333333
which.max(cv)
## Fold07
## 7
#Random Forrest cannot predict with high accuracy for this dataset
##############################################Decision Tree - 10 fold Cross validation
valid.df$match <-as.factor(valid.df$match)
train.df$match <-as.factor(train.df$match)
folds = createFolds(train.df$match, k = 10)
cv = lapply(folds, function(x) { # start of function
# in the next two lines we will separate the Training set into it's 10 pieces
training_fold = train.df[-x, ] # training fold = training set minus (-) it's sub test fold
test_fold = train.df[x, ] # here we describe the test fold individually
levels(test_fold$match) <- levels(training_fold$match)
# now apply (train) the classifer on the training_fold
tr <- rpart(match ~ ., data = training_fold, cp=0.0001, minbucket = 5, maxdepth = 7)
pfit<- prune(tr, cp = tr$cptable[which.min(tr$cptable[,"xerror"]),"CP"])
# next step in the loop, we calculate the predictions and cm and we equate the accuracy
# note we are training on training_fold and testing its accuracy on the test_fold
y_pred = predict(pfit, test_fold, type = "class")
cm = table(test_fold[, 1], y_pred)
accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])
return(accuracy)
})
cv
## $Fold01
## [1] 0.8293173
##
## $Fold02
## [1] 0.8313253
##
## $Fold03
## [1] 0.832998
##
## $Fold04
## [1] 0.8313253
##
## $Fold05
## [1] 0.8316633
##
## $Fold06
## [1] 0.8313253
##
## $Fold07
## [1] 0.8313253
##
## $Fold08
## [1] 0.832998
##
## $Fold09
## [1] 0.8316633
##
## $Fold10
## [1] 0.832998
which.max(cv)
## Fold03
## 3
f<- folds[[which.max(cv)]]
##############################################Decision Tree
#fit a classification tree model (Competitive. is the outcome variable, and all other variables are predictors).
tr <- rpart(match ~ ., data = train.df[-f], cp=0.0001, minbucket = 5, maxdepth = 7)
rpart.plot(tr, extra = 106)

#print cp of the newly fitted model.
options(scipen = 10)
printcp(tr)
##
## Classification tree:
## rpart(formula = match ~ ., data = train.df[-f], cp = 0.0001,
## minbucket = 5, maxdepth = 7)
##
## Variables actually used in tree construction:
## [1] clubbing concerts hiking museums reading shopping sports theater
## [9] tvsports yoga
##
## Root node error: 836/4979 = 0.16791
##
## n= 4979
##
## CP nsplit rel error xerror xstd
## 1 0.00199362 0 1.00000 1.0000 0.031549
## 2 0.00059809 12 0.97129 1.0060 0.031624
## 3 0.00029904 14 0.97010 1.0072 0.031639
## 4 0.00010000 18 0.96890 1.0072 0.031639
# determine a nested sequence of subtrees on the rpart object "tr" to be trimmed based on the complexity parameter that has the lowest standard error.
pfit<- prune(tr, cp = tr$cptable[which.min(tr$cptable[,"xerror"]),"CP"])
# print out the lowest standard error (xerror)
tr$cptable[which.min(tr$cptable[,"xerror"]),"CP"]
## [1] 0.00199362
#Q6: Based on the above calculation, which number of splits indicates the minimum complexity parameters?
prp(tr)

t(t(names(finished_red)))
## [,1]
## [1,] "match"
## [2,] "sports"
## [3,] "tvsports"
## [4,] "exercise"
## [5,] "dining"
## [6,] "museums"
## [7,] "art"
## [8,] "hiking"
## [9,] "gaming"
## [10,] "clubbing"
## [11,] "reading"
## [12,] "tv"
## [13,] "theater"
## [14,] "movies"
## [15,] "concerts"
## [16,] "music"
## [17,] "shopping"
## [18,] "yoga"
#which are the most important variables
t(t(tr$variable.importance))
## [,1]
## hiking 14.659248
## clubbing 13.476890
## theater 12.091390
## reading 11.616833
## shopping 10.285327
## museums 8.624560
## yoga 8.248458
## sports 7.891788
## tv 6.366825
## tvsports 6.330249
## concerts 5.801635
## exercise 4.200721
## music 3.948437
## dining 3.419929
## movies 1.743543
#set of rules
tr
## n= 4979
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 4979 836 1 (0.83209480 0.16790520)
## 2) reading< 5.5 767 86 1 (0.88787484 0.11212516)
## 4) tvsports>=9.5 76 2 1 (0.97368421 0.02631579) *
## 5) tvsports< 9.5 691 84 1 (0.87843705 0.12156295)
## 10) yoga< 2.5 250 20 1 (0.92000000 0.08000000)
## 20) concerts< 9 225 13 1 (0.94222222 0.05777778) *
## 21) concerts>=9 25 7 1 (0.72000000 0.28000000)
## 42) sports< 7.5 18 3 1 (0.83333333 0.16666667) *
## 43) sports>=7.5 7 3 2 (0.42857143 0.57142857) *
## 11) yoga>=2.5 441 64 1 (0.85487528 0.14512472) *
## 3) reading>=5.5 4212 750 1 (0.82193732 0.17806268)
## 6) clubbing< 5.5 1698 247 1 (0.85453475 0.14546525) *
## 7) clubbing>=5.5 2514 503 1 (0.79992045 0.20007955)
## 14) reading>=6.5 2262 432 1 (0.80901857 0.19098143)
## 28) theater>=1.5 2257 427 1 (0.81081081 0.18918919)
## 56) hiking< 8.5 1910 337 1 (0.82356021 0.17643979)
## 112) clubbing< 9.5 1858 319 1 (0.82831001 0.17168999) *
## 113) clubbing>=9.5 52 18 1 (0.65384615 0.34615385)
## 226) shopping< 7.5 35 6 1 (0.82857143 0.17142857) *
## 227) shopping>=7.5 17 5 2 (0.29411765 0.70588235) *
## 57) hiking>=8.5 347 90 1 (0.74063401 0.25936599)
## 114) yoga< 5.5 173 32 1 (0.81502890 0.18497110) *
## 115) yoga>=5.5 174 58 1 (0.66666667 0.33333333)
## 230) museums< 9.5 143 42 1 (0.70629371 0.29370629) *
## 231) museums>=9.5 31 15 2 (0.48387097 0.51612903) *
## 29) theater< 1.5 5 0 2 (0.00000000 1.00000000) *
## 15) reading< 6.5 252 71 1 (0.71825397 0.28174603)
## 30) tvsports>=8.5 24 2 1 (0.91666667 0.08333333) *
## 31) tvsports< 8.5 228 69 1 (0.69736842 0.30263158)
## 62) sports< 9.5 219 62 1 (0.71689498 0.28310502)
## 124) museums< 8.5 163 37 1 (0.77300613 0.22699387)
## 248) theater< 9.5 157 33 1 (0.78980892 0.21019108) *
## 249) theater>=9.5 6 2 2 (0.33333333 0.66666667) *
## 125) museums>=8.5 56 25 1 (0.55357143 0.44642857)
## 250) clubbing< 8 35 12 1 (0.65714286 0.34285714) *
## 251) clubbing>=8 21 8 2 (0.38095238 0.61904762) *
## 63) sports>=9.5 9 2 2 (0.22222222 0.77777778) *
valid.df$pred <- predict(pfit, valid.df, type = "class")
corr <- which(valid.df$pred == valid.df$match)
base_accuracy <- length(corr)/nrow(valid.df)
base_accuracy
## [1] 0.8406627