proj—match-from-hobbies.R

library(reshape2)
library(mice)

## 
## Attaching package: 'mice'

## The following object is masked from 'package:stats':
## 
##     filter

## The following objects are masked from 'package:base':
## 
##     cbind, rbind

library(caret)

## Warning: package 'caret' was built under R version 4.2.3

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 4.2.3

## Loading required package: lattice

library(ggplot2)
library(dplyr)

## Warning: package 'dplyr' was built under R version 4.2.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(class)
# svm
library(e1071)
# lm
library(leaps)

## Warning: package 'leaps' was built under R version 4.2.3

library(forecast) #for accuracy measures

## Warning: package 'forecast' was built under R version 4.2.3

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

# decision tree
library(rpart)
library(rpart.plot)

## Warning: package 'rpart.plot' was built under R version 4.2.3

library(randomForest)

## Warning: package 'randomForest' was built under R version 4.2.3

## randomForest 4.7-1.1

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

library(xgboost)

## Warning: package 'xgboost' was built under R version 4.2.3

## 
## Attaching package: 'xgboost'

## The following object is masked from 'package:dplyr':
## 
##     slice

library(Matrix)
library(gains)

dating <- read.csv("Speed Dating Data.csv")

which(names(dating) == "match")

## [1] 13

# Learn the column number of the beginning and end the hobbies. According to the key the begining is sports while yoga is the end.
which(names(dating) == "sports")

## [1] 51

which(names(dating) == "yoga")

## [1] 67

reduced <- dating[,c(13,51:67)]



# Start the imputing process by selecting only the rows without NA
finished_red <- reduced[complete.cases(reduced),]
# Start the imputing process
which(is.na(finished_red))

## integer(0)

# Start building a knn model


# Split Dataset!

#set.seed value as 1
set.seed(1)

# randomly select 60 percent of the row numbers finished_red in order to split.
train.index <- sample(row.names(finished_red), 0.6*dim(finished_red)[1])  

# select the rest of row numbers of finished_red by using setdiff()
valid.index <- setdiff(row.names(finished_red), train.index)  

# take the 60 percent of random row numbers and call it train.df
train.df <- finished_red[train.index, ]

# take the rest and call it valid.df
valid.df <- finished_red[valid.index, ]

# make sure our validating outcome variables as factor so they become a categorical variable.
valid.df$match <-as.factor(valid.df$match)
train.df$match <-as.factor(train.df$match)


# Normalize
train.norm.df <- train.df
valid.norm.df <- valid.df
norm.df <- finished_red

norm.values <- preProcess(train.df[,-1], method=c("center", "scale"))

train.norm.df[,-1] <- predict(norm.values, train.df[,-1])
valid.norm.df[,-1] <- predict(norm.values, valid.df[,-1])
norm.df[,-1] <- predict(norm.values, finished_red[,-1])






############################################ knn
# use knn with k = 3 which means "number of neighbours considered" = 3 instead of 2
knn.pred <- knn(train.norm.df[,-1], valid.norm.df[,-1],
                cl = train.norm.df[, 1], k = 3)

summary(knn.pred)

##    0    1 
## 3189  131

# Accuracy
length(which(knn.pred == valid.df$match))/nrow(valid.df)

## [1] 0.8337349

# initialize a data frame with two columns: k, and accuracy.
accuracy.df <- data.frame(k = seq(1, 10, 1), accuracy = rep(0, 10))


# Use a for loop to do what we did above for each row of accuracy.df
for(i in 1:10) {
  knn.pred <- knn(train.norm.df[,-1], valid.norm.df[,-1], 
                  cl = train.norm.df[, 1], k = i)
  accuracy.df[i, 2] <- confusionMatrix(knn.pred, valid.norm.df[,1])$overall[1] 
}

accuracy.df

##     k  accuracy
## 1   1 0.8307229
## 2   2 0.8316265
## 3   3 0.8331325
## 4   4 0.8343373
## 5   5 0.8325301
## 6   6 0.8334337
## 7   7 0.8352410
## 8   8 0.8322289
## 9   9 0.8337349
## 10 10 0.8346386

kvalue <- which.max(accuracy.df$accuracy)


knn.pred.new <- knn(train.norm.df[, -1], valid.norm.df[, -1],
                    cl = train.norm.df[, 1], k = kvalue)

summary(knn.pred.new)

##    0    1 
## 3238   82

# Accuracy
length(which(knn.pred.new == valid.df$match))/nrow(valid.df)

## [1] 0.8346386

valid.df$pred <- knn.pred.new

valid.df$diff <- valid.df$match == knn.pred.new


wrong <- valid.df[which(!valid.df$diff),]
wrong <- wrong[,1:18]

melt_data <- melt(wrong, id = c("match")) 

ggplot(melt_data, aes(x = as.factor(value))) +
  geom_bar() +
  theme_bw()+
  facet_wrap(~variable,  ncol=3)

## Ten fold cross validation in caret

train.df$match <- as.factor(train.df$match)
trControl <- trainControl(method  = "cv",
                          number  = 10)
fit <- train(match ~ .,
             method     = "knn",
             tuneGrid   = expand.grid(k = 1:10),
             trControl  = trControl,
             metric     = "Accuracy",
             data       = train.df)


fit

## k-Nearest Neighbors 
## 
## 4979 samples
##   17 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 4482, 4481, 4480, 4481, 4482, 4482, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa      
##    1  0.8174320  0.094075148
##    2  0.8188352  0.093775424
##    3  0.8216461  0.095016767
##    4  0.8198380  0.083330316
##    5  0.8204417  0.072037316
##    6  0.8222497  0.063888919
##    7  0.8202437  0.034645571
##    8  0.8216485  0.033187163
##    9  0.8236573  0.022672824
##   10  0.8232561  0.001285542
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.

############################# logistic regression


# in creating the folds we specify the target feature (dependent variable) and # of folds
folds = createFolds(train.df$match, k = 10)

# in cv we are going to applying a created function to our 'folds'
cv = lapply(folds, function(x) { # start of function
  # in the next two lines we will separate the Training set into it's 10 pieces
  training_fold = train.df[-x, ] # training fold =  training set minus (-) it's sub test fold
  test_fold = train.df[x, ] # here we describe the test fold individually
  # now apply (train) the classifer on the training_fold
  classifier = svm(formula = match ~ .,
                   data = train.df,
                   type = 'C-classification',
                   kernel = 'radial')
  # next step in the loop, we calculate the predictions and cm and we equate the accuracy
  # note we are training on training_fold and testing its accuracy on the test_fold
  y_pred = predict(classifier, newdata = test_fold[,-1])
  cm = table(test_fold[, 1], y_pred)
  accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])
  return(accuracy)
})

cv

## $Fold01
## [1] 0.8293173
## 
## $Fold02
## [1] 0.8313253
## 
## $Fold03
## [1] 0.8370221
## 
## $Fold04
## [1] 0.8350101
## 
## $Fold05
## [1] 0.8333333
## 
## $Fold06
## [1] 0.8373494
## 
## $Fold07
## [1] 0.8336673
## 
## $Fold08
## [1] 0.8333333
## 
## $Fold09
## [1] 0.8333333
## 
## $Fold10
## [1] 0.8373494

which.max(cv)

## Fold06 
##      6

f<- folds[[which.max(cv)]]

# $Fold01 is the best in terms of accuracy (0.8409543)

##########Now lets build a model!!
set.seed(1)
# since fold 1 is most accurate we shall use folds$Fold01 to create our train and test dataset!
training_fold = train.df[-f, ]
# now apply (train) the classifer on the training_fold
classifier = svm(formula = match ~ .,
                 data = train.df,
                 type = 'C-classification',
                 kernel = 'radial')

summary(classifier)

## 
## Call:
## svm(formula = match ~ ., data = train.df, type = "C-classification", 
##     kernel = "radial")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  2007
## 
##  ( 1171 836 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1

# next step in the loop, we calculate the predictions and cm and we equate the accuracy
# note we are training on training_fold and testing its accuracy on the test_fold
y_pred = predict(classifier, newdata = valid.df[,-1])
cm = table(valid.df[, 1], y_pred)
accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])

accuracy

## [1] 0.8418675

################################### Random Forest

# in creating the folds we specify the target feature (dependent variable) and # of folds
folds = createFolds(train.df$match, k = 10)

# in cv we are going to applying a created function to our 'folds'
cv = lapply(folds, function(x) { # start of function
  # in the next two lines we will separate the Training set into it's 10 pieces
  training_fold = train.df[-x, ] # training fold =  training set minus (-) it's sub test fold
  test_fold = train.df[x, ] # here we describe the test fold individually
  levels(test_fold$match) <- levels(training_fold$match)
  # now apply (train) the classifer on the training_fold
  classifier = randomForest(match~., data=training_fold, proximity=TRUE)
  # next step in the loop, we calculate the predictions and cm and we equate the accuracy
  # note we are training on training_fold and testing its accuracy on the test_fold
  y_pred = predict(classifier, newdata = test_fold)
  cm = table(test_fold[, 9], y_pred)
  accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])
  return(accuracy)
})

cv

## $Fold01
## [1] 0.1023622
## 
## $Fold02
## [1] 0.05109489
## 
## $Fold03
## [1] 0.06153846
## 
## $Fold04
## [1] 0.08547009
## 
## $Fold05
## [1] 0.08270677
## 
## $Fold06
## [1] 0.008928571
## 
## $Fold07
## [1] 0.05594406
## 
## $Fold08
## [1] 0.06422018
## 
## $Fold09
## [1] 0.0754717
## 
## $Fold10
## [1] 0.08064516

which.max(cv)

## Fold01 
##      1

#Random Forrest cannot predict with high accuracy for this dataset

##########Now lets build a model!!
set.seed(1)
# since fold 1 is most accurate we shall use folds$Fold01 to create our train and test dataset!
training_fold = train.df[-f, ]
test_fold = valid.df # here we describe the test fold individually
# now apply (train) the classifer on the training_fold
classifier = randomForest(formula = match ~ ., data=training_fold, proximity=TRUE)

summary(classifier)

##                 Length   Class  Mode     
## call                   4 -none- call     
## type                   1 -none- character
## predicted           4481 factor numeric  
## err.rate            1500 -none- numeric  
## confusion              6 -none- numeric  
## votes               8962 matrix numeric  
## oob.times           4481 -none- numeric  
## classes                2 -none- character
## importance            17 -none- numeric  
## importanceSD           0 -none- NULL     
## localImportance        0 -none- NULL     
## proximity       20079361 -none- numeric  
## ntree                  1 -none- numeric  
## mtry                   1 -none- numeric  
## forest                14 -none- list     
## y                   4481 factor numeric  
## test                   0 -none- NULL     
## inbag                  0 -none- NULL     
## terms                  3 terms  call

# next step in the loop, we calculate the predictions and cm and we equate the accuracy
# note we are training on training_fold and testing its accuracy on the test_fold
y_pred = predict(classifier, newdata = test_fold[,-1])
cm = table(test_fold[, 1], y_pred)
accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])


accuracy

## [1] 0.8301205

##############################################Decision Tree - 10 fold Cross validation


valid.df$match <-as.factor(valid.df$match)
train.df$match <-as.factor(train.df$match)

folds = createFolds(train.df$match, k = 10)


cv = lapply(folds, function(x) { # start of function
  # in the next two lines we will separate the Training set into it's 10 pieces
  training_fold = train.df[-x, ] # training fold = training set minus (-) it's sub test fold
  test_fold = train.df[x, ] # here we describe the test fold individually
  levels(test_fold$match) <- levels(training_fold$match)
  # now apply (train) the classifer on the training_fold
  tr <- rpart(match ~ ., data = training_fold, cp=0.0001, minbucket = 5, maxdepth = 7)
  
  pfit<- prune(tr, cp = tr$cptable[which.min(tr$cptable[,"xerror"]),"CP"])
  # next step in the loop, we calculate the predictions and cm and we equate the accuracy
  # note we are training on training_fold and testing its accuracy on the test_fold
  
  y_pred = predict(pfit, test_fold, type = "class")
  cm = table(test_fold[, 1], y_pred)
  accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])
  return(accuracy)
})


cv

## $Fold01
## [1] 0.832998
## 
## $Fold02
## [1] 0.8316633
## 
## $Fold03
## [1] 0.832998
## 
## $Fold04
## [1] 0.8313253
## 
## $Fold05
## [1] 0.8313253
## 
## $Fold06
## [1] 0.8313253
## 
## $Fold07
## [1] 0.8313253
## 
## $Fold08
## [1] 0.8333333
## 
## $Fold09
## [1] 0.8236473
## 
## $Fold10
## [1] 0.832998

which.max(cv)

## Fold08 
##      8

f<- folds[[which.max(cv)]]

##############################################Decision Tree

#fit a classification tree model (Competitive. is the outcome variable, and all other variables are predictors). 
tr <- rpart(match ~ ., data = train.df[-f], cp=0.0001, minbucket = 5, maxdepth = 7)
rpart.plot(tr, extra = 106)

#print cp of the newly fitted model.  
options(scipen = 10)
printcp(tr)

## 
## Classification tree:
## rpart(formula = match ~ ., data = train.df[-f], cp = 0.0001, 
##     minbucket = 5, maxdepth = 7)
## 
## Variables actually used in tree construction:
##  [1] clubbing concerts gaming   music    reading  shopping sports   theater 
##  [9] tvsports yoga    
## 
## Root node error: 836/4979 = 0.16791
## 
## n= 4979 
## 
##           CP nsplit rel error xerror     xstd
## 1 0.00287081      0   1.00000 1.0000 0.031549
## 2 0.00239234      5   0.98565 1.0108 0.031684
## 3 0.00119617      8   0.97847 1.0096 0.031669
## 4 0.00029904     12   0.97368 1.0227 0.031832
## 5 0.00010000     16   0.97249 1.0239 0.031847

# determine a nested sequence of subtrees on the rpart object "tr" to be trimmed based on the complexity parameter that has the lowest standard error.
pfit<- prune(tr, cp = tr$cptable[which.min(tr$cptable[,"xerror"]),"CP"])

# print out the lowest standard error (xerror)
tr$cptable[which.min(tr$cptable[,"xerror"]),"CP"]

## [1] 0.002870813

#Q6: Based on the above calculation, which number of splits indicates the minimum complexity parameters? 

prp(tr)

t(t(names(finished_red)))

##       [,1]      
##  [1,] "match"   
##  [2,] "sports"  
##  [3,] "tvsports"
##  [4,] "exercise"
##  [5,] "dining"  
##  [6,] "museums" 
##  [7,] "art"     
##  [8,] "hiking"  
##  [9,] "gaming"  
## [10,] "clubbing"
## [11,] "reading" 
## [12,] "tv"      
## [13,] "theater" 
## [14,] "movies"  
## [15,] "concerts"
## [16,] "music"   
## [17,] "shopping"
## [18,] "yoga"

#which are the most important variables
t(t(tr$variable.importance))

##               [,1]
## clubbing 13.641537
## reading  11.616833
## theater  10.476731
## gaming   10.259535
## concerts  6.367454
## shopping  6.161475
## sports    5.778132
## dining    5.561181
## music     5.088161
## tvsports  4.860007
## movies    3.661107
## tv        2.644789
## museums   2.280793
## art       2.279216
## exercise  1.600611
## yoga      1.353388

#set of rules
tr

## n= 4979 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##   1) root 4979 836 0 (0.83209480 0.16790520)  
##     2) reading< 5.5 767  86 0 (0.88787484 0.11212516)  
##       4) tvsports>=9.5 76   2 0 (0.97368421 0.02631579) *
##       5) tvsports< 9.5 691  84 0 (0.87843705 0.12156295)  
##        10) yoga< 2.5 250  20 0 (0.92000000 0.08000000)  
##          20) concerts< 9 225  13 0 (0.94222222 0.05777778) *
##          21) concerts>=9 25   7 0 (0.72000000 0.28000000)  
##            42) sports< 7.5 18   3 0 (0.83333333 0.16666667) *
##            43) sports>=7.5 7   3 1 (0.42857143 0.57142857) *
##        11) yoga>=2.5 441  64 0 (0.85487528 0.14512472) *
##     3) reading>=5.5 4212 750 0 (0.82193732 0.17806268)  
##       6) clubbing< 5.5 1698 247 0 (0.85453475 0.14546525) *
##       7) clubbing>=5.5 2514 503 0 (0.79992045 0.20007955)  
##        14) reading>=6.5 2262 432 0 (0.80901857 0.19098143)  
##          28) theater>=1.5 2257 427 0 (0.81081081 0.18918919)  
##            56) music< 8.5 1228 204 0 (0.83387622 0.16612378)  
##             112) shopping>=6.5 602  82 0 (0.86378738 0.13621262)  
##               224) clubbing< 9.5 597  78 0 (0.86934673 0.13065327) *
##               225) clubbing>=9.5 5   1 1 (0.20000000 0.80000000) *
##             113) shopping< 6.5 626 122 0 (0.80511182 0.19488818) *
##            57) music>=8.5 1029 223 0 (0.78328474 0.21671526) *
##          29) theater< 1.5 5   0 1 (0.00000000 1.00000000) *
##        15) reading< 6.5 252  71 0 (0.71825397 0.28174603)  
##          30) gaming< 7.5 221  52 0 (0.76470588 0.23529412)  
##            60) shopping< 7.5 163  30 0 (0.81595092 0.18404908)  
##             120) sports>=2.5 155  25 0 (0.83870968 0.16129032) *
##             121) sports< 2.5 8   3 1 (0.37500000 0.62500000) *
##            61) shopping>=7.5 58  22 0 (0.62068966 0.37931034)  
##             122) theater< 9.5 42  12 0 (0.71428571 0.28571429) *
##             123) theater>=9.5 16   6 1 (0.37500000 0.62500000) *
##          31) gaming>=7.5 31  12 1 (0.38709677 0.61290323)  
##            62) sports< 6 11   5 0 (0.54545455 0.45454545) *
##            63) sports>=6 20   6 1 (0.30000000 0.70000000) *

valid.df$pred <- predict(pfit, valid.df, type = "class")
corr <- which(valid.df$pred == valid.df$match)

base_accuracy <- length(corr)/nrow(valid.df)

base_accuracy

## [1] 0.8406627

proj—match-from-hobbies.R

amber

2023-11-22