proj-match-from-hobbies-.R

library(reshape2)
library(mice)

## 
## Attaching package: 'mice'

## The following object is masked from 'package:stats':
## 
##     filter

## The following objects are masked from 'package:base':
## 
##     cbind, rbind

library(caret)

## Warning: package 'caret' was built under R version 4.2.3

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 4.2.3

## Loading required package: lattice

library(ggplot2)
library(dplyr)

## Warning: package 'dplyr' was built under R version 4.2.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(class)
# svm
library(e1071)
# lm
library(leaps)

## Warning: package 'leaps' was built under R version 4.2.3

library(forecast) #for accuracy measures

## Warning: package 'forecast' was built under R version 4.2.3

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

# decision tree
library(rpart)
library(rpart.plot)

## Warning: package 'rpart.plot' was built under R version 4.2.3

library(randomForest)

## Warning: package 'randomForest' was built under R version 4.2.3

## randomForest 4.7-1.1

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

library(xgboost)

## Warning: package 'xgboost' was built under R version 4.2.3

## 
## Attaching package: 'xgboost'

## The following object is masked from 'package:dplyr':
## 
##     slice

library(Matrix)
library(gains)

dating <- read.csv("Speed Dating Data.csv")

which(names(dating) == "match")

## [1] 13

# Learn the column number of the beginning and end the hobbies. According to the key the begining is sports while yoga is the end.
which(names(dating) == "sports")

## [1] 51

which(names(dating) == "yoga")

## [1] 67

reduced <- dating[,c(13,51:67)]



# Start the imputing process by selecting only the rows without NA
finished_red <- reduced[complete.cases(reduced),]
# Start the imputing process
which(is.na(finished_red))

## integer(0)

# Start building a knn model


# Split Dataset!

#set.seed value as 1
set.seed(1)

# randomly select 60 percent of the row numbers finished_red in order to split.
train.index <- sample(row.names(finished_red), 0.6*dim(finished_red)[1])  

# select the rest of row numbers of finished_red by using setdiff()
valid.index <- setdiff(row.names(finished_red), train.index)  

# take the 60 percent of random row numbers and call it train.df
train.df <- finished_red[train.index, ]

# take the rest and call it valid.df
valid.df <- finished_red[valid.index, ]

# make sure our validating outcome variables as factor so they become a categorical variable.
valid.df$match <-as.factor(valid.df$match)
train.df$match <-as.factor(train.df$match)


# Normalize
train.norm.df <- train.df
valid.norm.df <- valid.df
norm.df <- finished_red

norm.values <- preProcess(train.df[,-1], method=c("center", "scale"))

train.norm.df[,-1] <- predict(norm.values, train.df[,-1])
valid.norm.df[,-1] <- predict(norm.values, valid.df[,-1])
norm.df[,-1] <- predict(norm.values, finished_red[,-1])






############################################ knn
# use knn with k = 3 which means "number of neighbours considered" = 3 instead of 2
knn.pred <- knn(train.norm.df[,-1], valid.norm.df[,-1],
                cl = train.norm.df[, 1], k = 3)

summary(knn.pred)

##    0    1 
## 3189  131

# Accuracy
length(which(knn.pred == valid.df$match))/nrow(valid.df)

## [1] 0.8337349

# initialize a data frame with two columns: k, and accuracy.
accuracy.df <- data.frame(k = seq(1, 10, 1), accuracy = rep(0, 10))


# Use a for loop to do what we did above for each row of accuracy.df
for(i in 1:10) {
  knn.pred <- knn(train.norm.df[,-1], valid.norm.df[,-1], 
                  cl = train.norm.df[, 1], k = i)
  accuracy.df[i, 2] <- confusionMatrix(knn.pred, valid.norm.df[,1])$overall[1] 
}

accuracy.df

##     k  accuracy
## 1   1 0.8307229
## 2   2 0.8316265
## 3   3 0.8331325
## 4   4 0.8343373
## 5   5 0.8325301
## 6   6 0.8334337
## 7   7 0.8352410
## 8   8 0.8322289
## 9   9 0.8337349
## 10 10 0.8346386

# k = 10 is the best (0.8323389)
knn.pred.new <- knn(train.norm.df[, -1], valid.norm.df[, -1],
                    cl = train.norm.df[, 1], k = 10)

summary(knn.pred.new)

##    0    1 
## 3266   54

# Accuracy
length(which(knn.pred.new == valid.df$match))/nrow(valid.df)

## [1] 0.8340361

valid.df$pred <- knn.pred.new

valid.df$diff <- valid.df$match == knn.pred.new


wrong <- valid.df[which(!valid.df$diff),]
wrong <- wrong[,1:18]

melt_data <- melt(wrong, id = c("match")) 

ggplot(melt_data, aes(x = as.factor(value))) +
  geom_bar() +
  theme_bw()+
  facet_wrap(~variable,  ncol=3)

## Ten fold cross validation in caret

train.df$match <- as.factor(train.df$match)
trControl <- trainControl(method  = "cv",
                          number  = 10)
fit <- train(match ~ .,
             method     = "knn",
             tuneGrid   = expand.grid(k = 1:10),
             trControl  = trControl,
             metric     = "Accuracy",
             data       = train.df)


fit

## k-Nearest Neighbors 
## 
## 4979 samples
##   17 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 4481, 4481, 4480, 4482, 4482, 4480, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa     
##    1  0.8158268  0.09031820
##    2  0.8172324  0.09010522
##    3  0.8192380  0.08754955
##    4  0.8176328  0.08264229
##    5  0.8194429  0.06992110
##    6  0.8208477  0.05614832
##    7  0.8208497  0.04015440
##    8  0.8230557  0.04531414
##    9  0.8244622  0.03286815
##   10  0.8254674  0.01568135
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 10.

############################# logistic regression


# in creating the folds we specify the target feature (dependent variable) and # of folds
folds = createFolds(train.df$match, k = 10)

# in cv we are going to applying a created function to our 'folds'
cv = lapply(folds, function(x) { # start of function
  # in the next two lines we will separate the Training set into it's 10 pieces
  training_fold = train.df[-x, ] # training fold =  training set minus (-) it's sub test fold
  test_fold = train.df[x, ] # here we describe the test fold individually
  # now apply (train) the classifer on the training_fold
  classifier = svm(formula = match ~ .,
                   data = train.df,
                   type = 'C-classification',
                   kernel = 'radial')
  # next step in the loop, we calculate the predictions and cm and we equate the accuracy
  # note we are training on training_fold and testing its accuracy on the test_fold
  y_pred = predict(classifier, newdata = test_fold[,-1])
  cm = table(test_fold[, 1], y_pred)
  accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])
  return(accuracy)
})

cv

## $Fold01
## [1] 0.8356713
## 
## $Fold02
## [1] 0.8353414
## 
## $Fold03
## [1] 0.8333333
## 
## $Fold04
## [1] 0.8336673
## 
## $Fold05
## [1] 0.8350101
## 
## $Fold06
## [1] 0.8353414
## 
## $Fold07
## [1] 0.8373494
## 
## $Fold08
## [1] 0.8309859
## 
## $Fold09
## [1] 0.8293173
## 
## $Fold10
## [1] 0.8350101

which.max(cv)

## Fold07 
##      7

f<- folds[[which.max(cv)]]

# $Fold01 is the best in terms of accuracy (0.8409543)

##########Now lets build a model!!
set.seed(1)
# since fold 1 is most accurate we shall use folds$Fold01 to create our train and test dataset!
training_fold = train.df[-f, ]
# now apply (train) the classifer on the training_fold
classifier = svm(formula = match ~ .,
                 data = train.df,
                 type = 'C-classification',
                 kernel = 'radial')

summary(classifier)

## 
## Call:
## svm(formula = match ~ ., data = train.df, type = "C-classification", 
##     kernel = "radial")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  2007
## 
##  ( 1171 836 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1

# next step in the loop, we calculate the predictions and cm and we equate the accuracy
# note we are training on training_fold and testing its accuracy on the test_fold
y_pred = predict(classifier, newdata = valid.df[,-1])
cm = table(valid.df[, 1], y_pred)
accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])

accuracy

## [1] 0.8418675

################################### Random Forest

# in creating the folds we specify the target feature (dependent variable) and # of folds
folds = createFolds(train.df$match, k = 10)

train.df[] <- lapply(train.df, as.numeric)
valid.df[] <- lapply(valid.df, as.numeric)

# in cv we are going to applying a created function to our 'folds'
cv = lapply(folds, function(x) { # start of function
  # in the next two lines we will separate the Training set into it's 10 pieces
  training_fold = train.df[-x, ] # training fold =  training set minus (-) it's sub test fold
  test_fold = train.df[x, ] # here we describe the test fold individually
  levels(test_fold$match) <- levels(training_fold$match)
  # now apply (train) the classifer on the training_fold
  classifier = randomForest(match~., data=training_fold, proximity=TRUE)
  # next step in the loop, we calculate the predictions and cm and we equate the accuracy
  # note we are training on training_fold and testing its accuracy on the test_fold
  y_pred = predict(classifier, newdata = test_fold)
  cm = table(test_fold[, 9], y_pred)
  accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])
  return(accuracy)
})

## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values.  Are you sure you want to do regression?

## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values.  Are you sure you want to do regression?

## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values.  Are you sure you want to do regression?

## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values.  Are you sure you want to do regression?

## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values.  Are you sure you want to do regression?

## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values.  Are you sure you want to do regression?

## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values.  Are you sure you want to do regression?

## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values.  Are you sure you want to do regression?

## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values.  Are you sure you want to do regression?

## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values.  Are you sure you want to do regression?

cv

## $Fold01
## [1] 0
## 
## $Fold02
## [1] 0
## 
## $Fold03
## [1] 0.09677419
## 
## $Fold04
## [1] 0.08695652
## 
## $Fold05
## [1] 0
## 
## $Fold06
## [1] 0
## 
## $Fold07
## [1] 0.2173913
## 
## $Fold08
## [1] 0
## 
## $Fold09
## [1] 0
## 
## $Fold10
## [1] 0.08333333

which.max(cv)

## Fold07 
##      7

#Random Forrest cannot predict with high accuracy for this dataset






##############################################Decision Tree - 10 fold Cross validation


valid.df$match <-as.factor(valid.df$match)
train.df$match <-as.factor(train.df$match)

folds = createFolds(train.df$match, k = 10)


cv = lapply(folds, function(x) { # start of function
  # in the next two lines we will separate the Training set into it's 10 pieces
  training_fold = train.df[-x, ] # training fold = training set minus (-) it's sub test fold
  test_fold = train.df[x, ] # here we describe the test fold individually
  levels(test_fold$match) <- levels(training_fold$match)
  # now apply (train) the classifer on the training_fold
  tr <- rpart(match ~ ., data = training_fold, cp=0.0001, minbucket = 5, maxdepth = 7)
  
  pfit<- prune(tr, cp = tr$cptable[which.min(tr$cptable[,"xerror"]),"CP"])
  # next step in the loop, we calculate the predictions and cm and we equate the accuracy
  # note we are training on training_fold and testing its accuracy on the test_fold
  
  y_pred = predict(pfit, test_fold, type = "class")
  cm = table(test_fold[, 1], y_pred)
  accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])
  return(accuracy)
})


cv

## $Fold01
## [1] 0.8293173
## 
## $Fold02
## [1] 0.8313253
## 
## $Fold03
## [1] 0.832998
## 
## $Fold04
## [1] 0.8313253
## 
## $Fold05
## [1] 0.8316633
## 
## $Fold06
## [1] 0.8313253
## 
## $Fold07
## [1] 0.8313253
## 
## $Fold08
## [1] 0.832998
## 
## $Fold09
## [1] 0.8316633
## 
## $Fold10
## [1] 0.832998

which.max(cv)

## Fold03 
##      3

f<- folds[[which.max(cv)]]

##############################################Decision Tree

#fit a classification tree model (Competitive. is the outcome variable, and all other variables are predictors). 
tr <- rpart(match ~ ., data = train.df[-f], cp=0.0001, minbucket = 5, maxdepth = 7)
rpart.plot(tr, extra = 106)

#print cp of the newly fitted model.  
options(scipen = 10)
printcp(tr)

## 
## Classification tree:
## rpart(formula = match ~ ., data = train.df[-f], cp = 0.0001, 
##     minbucket = 5, maxdepth = 7)
## 
## Variables actually used in tree construction:
##  [1] clubbing concerts hiking   museums  reading  shopping sports   theater 
##  [9] tvsports yoga    
## 
## Root node error: 836/4979 = 0.16791
## 
## n= 4979 
## 
##           CP nsplit rel error xerror     xstd
## 1 0.00199362      0   1.00000 1.0000 0.031549
## 2 0.00059809     12   0.97129 1.0060 0.031624
## 3 0.00029904     14   0.97010 1.0072 0.031639
## 4 0.00010000     18   0.96890 1.0072 0.031639

# determine a nested sequence of subtrees on the rpart object "tr" to be trimmed based on the complexity parameter that has the lowest standard error.
pfit<- prune(tr, cp = tr$cptable[which.min(tr$cptable[,"xerror"]),"CP"])

# print out the lowest standard error (xerror)
tr$cptable[which.min(tr$cptable[,"xerror"]),"CP"]

## [1] 0.00199362

#Q6: Based on the above calculation, which number of splits indicates the minimum complexity parameters? 

prp(tr)

t(t(names(finished_red)))

##       [,1]      
##  [1,] "match"   
##  [2,] "sports"  
##  [3,] "tvsports"
##  [4,] "exercise"
##  [5,] "dining"  
##  [6,] "museums" 
##  [7,] "art"     
##  [8,] "hiking"  
##  [9,] "gaming"  
## [10,] "clubbing"
## [11,] "reading" 
## [12,] "tv"      
## [13,] "theater" 
## [14,] "movies"  
## [15,] "concerts"
## [16,] "music"   
## [17,] "shopping"
## [18,] "yoga"

#which are the most important variables
t(t(tr$variable.importance))

##               [,1]
## hiking   14.659248
## clubbing 13.476890
## theater  12.091390
## reading  11.616833
## shopping 10.285327
## museums   8.624560
## yoga      8.248458
## sports    7.891788
## tv        6.366825
## tvsports  6.330249
## concerts  5.801635
## exercise  4.200721
## music     3.948437
## dining    3.419929
## movies    1.743543

#set of rules
tr

## n= 4979 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##   1) root 4979 836 1 (0.83209480 0.16790520)  
##     2) reading< 5.5 767  86 1 (0.88787484 0.11212516)  
##       4) tvsports>=9.5 76   2 1 (0.97368421 0.02631579) *
##       5) tvsports< 9.5 691  84 1 (0.87843705 0.12156295)  
##        10) yoga< 2.5 250  20 1 (0.92000000 0.08000000)  
##          20) concerts< 9 225  13 1 (0.94222222 0.05777778) *
##          21) concerts>=9 25   7 1 (0.72000000 0.28000000)  
##            42) sports< 7.5 18   3 1 (0.83333333 0.16666667) *
##            43) sports>=7.5 7   3 2 (0.42857143 0.57142857) *
##        11) yoga>=2.5 441  64 1 (0.85487528 0.14512472) *
##     3) reading>=5.5 4212 750 1 (0.82193732 0.17806268)  
##       6) clubbing< 5.5 1698 247 1 (0.85453475 0.14546525) *
##       7) clubbing>=5.5 2514 503 1 (0.79992045 0.20007955)  
##        14) reading>=6.5 2262 432 1 (0.80901857 0.19098143)  
##          28) theater>=1.5 2257 427 1 (0.81081081 0.18918919)  
##            56) hiking< 8.5 1910 337 1 (0.82356021 0.17643979)  
##             112) clubbing< 9.5 1858 319 1 (0.82831001 0.17168999) *
##             113) clubbing>=9.5 52  18 1 (0.65384615 0.34615385)  
##               226) shopping< 7.5 35   6 1 (0.82857143 0.17142857) *
##               227) shopping>=7.5 17   5 2 (0.29411765 0.70588235) *
##            57) hiking>=8.5 347  90 1 (0.74063401 0.25936599)  
##             114) yoga< 5.5 173  32 1 (0.81502890 0.18497110) *
##             115) yoga>=5.5 174  58 1 (0.66666667 0.33333333)  
##               230) museums< 9.5 143  42 1 (0.70629371 0.29370629) *
##               231) museums>=9.5 31  15 2 (0.48387097 0.51612903) *
##          29) theater< 1.5 5   0 2 (0.00000000 1.00000000) *
##        15) reading< 6.5 252  71 1 (0.71825397 0.28174603)  
##          30) tvsports>=8.5 24   2 1 (0.91666667 0.08333333) *
##          31) tvsports< 8.5 228  69 1 (0.69736842 0.30263158)  
##            62) sports< 9.5 219  62 1 (0.71689498 0.28310502)  
##             124) museums< 8.5 163  37 1 (0.77300613 0.22699387)  
##               248) theater< 9.5 157  33 1 (0.78980892 0.21019108) *
##               249) theater>=9.5 6   2 2 (0.33333333 0.66666667) *
##             125) museums>=8.5 56  25 1 (0.55357143 0.44642857)  
##               250) clubbing< 8 35  12 1 (0.65714286 0.34285714) *
##               251) clubbing>=8 21   8 2 (0.38095238 0.61904762) *
##            63) sports>=9.5 9   2 2 (0.22222222 0.77777778) *

valid.df$pred <- predict(pfit, valid.df, type = "class")
corr <- which(valid.df$pred == valid.df$match)

base_accuracy <- length(corr)/nrow(valid.df)

base_accuracy

## [1] 0.8406627

proj-match-from-hobbies-.R

amber

2023-11-22