library(reshape2)
library(mice)
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(caret)
## Warning: package 'caret' was built under R version 4.2.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.2.3
## Loading required package: lattice
library(ggplot2)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(class)
# svm
library(e1071)
# lm
library(leaps) 
## Warning: package 'leaps' was built under R version 4.2.3
library(forecast) #for accuracy measures
## Warning: package 'forecast' was built under R version 4.2.3
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
#rf
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.2.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
# decision tree
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.2.3
library(randomForest)
library(xgboost)
## Warning: package 'xgboost' was built under R version 4.2.3
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
library(Matrix)
library(gains)

dating <- read.csv("Speed Dating Data.csv")

ggplot(dating, aes(x = as.factor(race))) +
  geom_bar() +
  theme_bw() +
  scale_x_discrete(labels=c("African American", "Caucasian","Hispanic","Pacific Islander", "Native American", "Other"))

ggplot(dating, aes(x = as.factor(career_c))) +
  geom_bar() +
  theme_bw()

reduced <- dating[,c(15,51:67)]

# Start the imputing process by selecting only the rows without NA
finished_red <- reduced[complete.cases(reduced),]
# Start the imputing process
which(is.na(finished_red))
## integer(0)
# Convert to factor values
finished_red$samerace <- as.factor(finished_red$samerace)




######################################## Start building a knn model


# Split Dataset!

#set.seed value as 1
set.seed(1)

# randomly select 60 percent of the row numbers finished_red in order to split.
train.index <- sample(row.names(finished_red), 0.6*dim(finished_red)[1])  

# select the rest of row numbers of finished_red by using setdiff()
valid.index <- setdiff(row.names(finished_red), train.index)  

# take the 10 percent of random row numbers and call it train.df
train.df <- finished_red[train.index, ]

# take the rest and call it valid.df
valid.df <- finished_red[valid.index, ]

# make sure our validating outcome variables as factor so they become a categorical variable.

# Normalize
train.norm.df <- train.df
valid.norm.df <- valid.df
norm.df <- finished_red

norm.values <- preProcess(train.df[,-1], method=c("center", "scale"))

train.norm.df[,-1] <- predict(norm.values, train.df[,-1])
valid.norm.df[,-1] <- predict(norm.values, valid.df[,-1])
norm.df[,-1] <- predict(norm.values, finished_red[,-1])




############################################ knn
# use knn with k = 3 which means "number of neighbours considered" = 3 instead of 2
knn.pred <- knn(train.norm.df[,-1], valid.norm.df[,-1],
                cl = train.norm.df[,1], k = 3)

summary(knn.pred)
##    0    1 
## 1975 1345
# Accuracy
length(which(knn.pred == valid.df$samerace))/nrow(valid.df)
## [1] 0.6626506
# initialize a data frame with two columns: k, and accuracy.
accuracy.df <- data.frame(k = seq(1, 10, 1), accuracy = rep(0, 10))


# Use a for loop to do what we did above for each row of accuracy.df
for(i in 1:10) {
  knn.pred <- knn(train.norm.df[,-1], valid.norm.df[,-1], 
                  cl = train.norm.df[, 1], k = i)
  accuracy.df[i, 2] <- confusionMatrix(knn.pred, valid.norm.df[,1])$overall[1] 
}

accuracy.df
##     k  accuracy
## 1   1 0.6626506
## 2   2 0.6590361
## 3   3 0.6641566
## 4   4 0.6650602
## 5   5 0.6626506
## 6   6 0.6536145
## 7   7 0.6500000
## 8   8 0.6430723
## 9   9 0.6397590
## 10 10 0.6394578
# k = 1 is the best (0.1114471)
knn.pred.new <- knn(train.norm.df[, -1], valid.norm.df[, -1],
                    cl = train.norm.df[, 1], k = 1)

summary(knn.pred.new)
##    0    1 
## 1952 1368
# Accuracy
length(which(knn.pred.new == valid.df$samerace))/nrow(valid.df)
## [1] 0.6587349
result <- valid.df

result$pred <- knn.pred.new

result$diff <- result$samerace == knn.pred.new


wrong <- valid.df[which(!result$diff),]

melt_data <- melt(wrong, id = c("samerace")) 

melt_data$value <- as.numeric(melt_data$value)


ggplot(melt_data, aes(x = samerace, y = value)) +
  geom_boxplot() +
  facet_wrap(~variable,  scales = "free")

###################################### Ten fold cross validation in caret

train.df$samerace <- as.factor(train.df$samerace)
trControl <- trainControl(method  = "cv",
                          number  = 10)
fit <- train(samerace ~ .,
             method     = "knn",
             tuneGrid   = expand.grid(k = 1:10),
             trControl  = trControl,
             metric     = "Accuracy",
             data       = train.df)


fit
## k-Nearest Neighbors 
## 
## 4979 samples
##   17 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 4482, 4481, 4481, 4480, 4482, 4482, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    1  0.6597827  0.2841865
##    2  0.6599811  0.2840645
##    3  0.6621912  0.2883650
##    4  0.6585868  0.2801314
##    5  0.6581836  0.2786213
##    6  0.6507550  0.2632526
##    7  0.6487454  0.2580464
##    8  0.6463301  0.2491112
##    9  0.6396951  0.2308615
##   10  0.6364810  0.2224066
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 3.
################################### logistic regression

# in creating the folds we specify the target feature (dependent variable) and # of folds
folds = createFolds(train.df$samerace, k = 10)

# in cv we are going to applying a created function to our 'folds'
cv = lapply(folds, function(x) { # start of function
  # in the next two lines we will separate the Training set into it's 10 pieces
  training_fold = train.df[-x, ] # training fold =  training set minus (-) it's sub test fold
  test_fold = train.df[x, ] # here we describe the test fold individually
  levels(test_fold$samerace) <- levels(training_fold$samerace)
  # now apply (train) the classifer on the training_fold
  classifier = svm(formula = samerace ~ .,
                   data = training_fold,
                   type = 'C-classification',
                   kernel = 'radial')
  # next step in the loop, we calculate the predictions and cm and we equate the accuracy
  # note we are training on training_fold and testing its accuracy on the test_fold
  y_pred = predict(classifier, newdata = test_fold)
  cm = table(test_fold[, 1], y_pred)
  accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])
  return(accuracy)
})

cv
## $Fold01
## [1] 0.6653307
## 
## $Fold02
## [1] 0.624498
## 
## $Fold03
## [1] 0.6317907
## 
## $Fold04
## [1] 0.6177062
## 
## $Fold05
## [1] 0.6365462
## 
## $Fold06
## [1] 0.6325301
## 
## $Fold07
## [1] 0.6553106
## 
## $Fold08
## [1] 0.6526104
## 
## $Fold09
## [1] 0.6780684
## 
## $Fold10
## [1] 0.6546185
which.max(cv)
## Fold09 
##      9
f<- folds[[which.max(cv)]]



##########Now lets build a model!!
set.seed(1)
# since fold 1 is most accurate we shall use folds$Fold01 to create our train and test dataset!
training_fold = train.df[-f, ]
test_fold = valid.df # here we describe the test fold individually
# now apply (train) the classifer on the training_fold
classifier = svm(formula = samerace ~ .,
                 data = train.df,
                 type = 'C-classification',
                 kernel = 'radial')

summary(classifier)
## 
## Call:
## svm(formula = samerace ~ ., data = train.df, type = "C-classification", 
##     kernel = "radial")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  3887
## 
##  ( 2001 1886 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
# next step in the loop, we calculate the predictions and cm and we equate the accuracy
# note we are training on training_fold and testing its accuracy on the test_fold
y_pred = predict(classifier, newdata = test_fold[,-1])
cm = table(test_fold[, 1], y_pred)
accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])


accuracy 
## [1] 0.6370482
################################### Random Forest

# in creating the folds we specify the target feature (dependent variable) and # of folds
folds = createFolds(train.df$samerace, k = 10)


# in cv we are going to applying a created function to our 'folds'
cv = lapply(folds, function(x) { # start of function
  # in the next two lines we will separate the Training set into it's 10 pieces
  training_fold = train.df[-x, ] # training fold =  training set minus (-) it's sub test fold
  test_fold = train.df[x, ] # here we describe the test fold individually
  levels(test_fold$samerace) <- levels(training_fold$samerace)
  # now apply (train) the classifer on the training_fold
  classifier = randomForest(samerace~., data=training_fold, proximity=TRUE)
  # next step in the loop, we calculate the predictions and cm and we equate the accuracy
  # note we are training on training_fold and testing its accuracy on the test_fold
  y_pred = predict(classifier, newdata = test_fold)
  cm = table(test_fold[, 1], y_pred)
  accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])
  return(accuracy)
})

cv
## $Fold01
## [1] 0.6633267
## 
## $Fold02
## [1] 0.6700201
## 
## $Fold03
## [1] 0.6539235
## 
## $Fold04
## [1] 0.6553106
## 
## $Fold05
## [1] 0.694165
## 
## $Fold06
## [1] 0.6706827
## 
## $Fold07
## [1] 0.6626506
## 
## $Fold08
## [1] 0.6666667
## 
## $Fold09
## [1] 0.6593186
## 
## $Fold10
## [1] 0.6619718
which.max(cv)
## Fold05 
##      5
f<- folds[[which.max(cv)]]

##########Now lets build a model!!
set.seed(1)
# since fold 1 is most accurate we shall use folds$Fold01 to create our train and test dataset!
training_fold = train.df[-f, ]
test_fold = valid.df # here we describe the test fold individually
# now apply (train) the classifer on the training_fold
classifier = randomForest(formula = samerace ~ ., data=training_fold, proximity=TRUE)

summary(classifier)
##                 Length   Class  Mode     
## call                   4 -none- call     
## type                   1 -none- character
## predicted           4482 factor numeric  
## err.rate            1500 -none- numeric  
## confusion              6 -none- numeric  
## votes               8964 matrix numeric  
## oob.times           4482 -none- numeric  
## classes                2 -none- character
## importance            17 -none- numeric  
## importanceSD           0 -none- NULL     
## localImportance        0 -none- NULL     
## proximity       20088324 -none- numeric  
## ntree                  1 -none- numeric  
## mtry                   1 -none- numeric  
## forest                14 -none- list     
## y                   4482 factor numeric  
## test                   0 -none- NULL     
## inbag                  0 -none- NULL     
## terms                  3 terms  call
# next step in the loop, we calculate the predictions and cm and we equate the accuracy
# note we are training on training_fold and testing its accuracy on the test_fold
y_pred = predict(classifier, newdata = test_fold[,-1])
cm = table(test_fold[, 1], y_pred)
accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])


accuracy 
## [1] 0.6554217
##############################################Decision Tree - 10 fold Cross validation

folds = createFolds(train.df$samerace, k = 10)


cv = lapply(folds, function(x) { # start of function
  # in the next two lines we will separate the Training set into it's 10 pieces
  training_fold = train.df[-x, ] # training fold = training set minus (-) it's sub test fold
  test_fold = train.df[x, ] # here we describe the test fold individually
  levels(test_fold$samerace) <- levels(training_fold$samerace)
  # now apply (train) the classifer on the training_fold
  tr <- rpart(samerace ~ ., data = training_fold, cp=0.0001, minbucket = 5, maxdepth = 7)
  
  pfit<- prune(tr, cp = tr$cptable[which.min(tr$cptable[,"xerror"]),"CP"])
  # next step in the loop, we calculate the predictions and cm and we equate the accuracy
  # note we are training on training_fold and testing its accuracy on the test_fold
  
  y_pred = predict(pfit, test_fold, type = "class")
  cm = table(test_fold[, 1], y_pred)
  accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])
  return(accuracy)
})


cv
## $Fold01
## [1] 0.6124498
## 
## $Fold02
## [1] 0.6372745
## 
## $Fold03
## [1] 0.6156942
## 
## $Fold04
## [1] 0.62249
## 
## $Fold05
## [1] 0.6365462
## 
## $Fold06
## [1] 0.6112224
## 
## $Fold07
## [1] 0.6317907
## 
## $Fold08
## [1] 0.6297787
## 
## $Fold09
## [1] 0.6365462
## 
## $Fold10
## [1] 0.62249
which.max(cv)
## Fold02 
##      2
f<- folds[[which.max(cv)]]


################################################build Decision Tree
#fit a classification tree model (Competitive. is the outcome variable, and all other variables are predictors). 
tr <- rpart(samerace ~ ., data = train.df[-f,], cp=0.0001, minbucket = 5, maxdepth = 7)
rpart.plot(tr, extra = 106)

#print cp of the newly fitted model.  
options(scipen = 10)
printcp(tr)
## 
## Classification tree:
## rpart(formula = samerace ~ ., data = train.df[-f, ], cp = 0.0001, 
##     minbucket = 5, maxdepth = 7)
## 
## Variables actually used in tree construction:
##  [1] art      clubbing concerts dining   exercise gaming   movies   museums 
##  [9] music    reading  sports   theater  tv       tvsports yoga    
## 
## Root node error: 1768/4480 = 0.39464
## 
## n= 4480 
## 
##            CP nsplit rel error  xerror     xstd
## 1  0.00848416      0   1.00000 1.00000 0.018504
## 2  0.00707014      1   0.99152 0.99208 0.018478
## 3  0.00678733      6   0.95023 0.99887 0.018500
## 4  0.00641026      8   0.93665 0.99887 0.018500
## 5  0.00622172     11   0.91742 0.99830 0.018498
## 6  0.00452489     13   0.90498 0.99491 0.018487
## 7  0.00395928     14   0.90045 0.98982 0.018471
## 8  0.00226244     15   0.89649 0.98360 0.018449
## 9  0.00169683     20   0.88518 0.97964 0.018436
## 10 0.00113122     24   0.87839 0.97964 0.018436
## 11 0.00070701     27   0.87500 0.97568 0.018422
## 12 0.00056561     31   0.87217 0.97964 0.018436
## 13 0.00028281     32   0.87161 0.97907 0.018434
## 14 0.00010000     34   0.87104 0.98077 0.018440
# determine a nested sequence of subtrees on the rpart object "tr" to be trimmed based on the complexity parameter that has the lowest standard error.
pfit<- prune(tr, cp = tr$cptable[which.min(tr$cptable[,"xerror"]),"CP"])

# print out the lowest standard error (xerror)
tr$cptable[which.min(tr$cptable[,"xerror"]),"CP"]
## [1] 0.0007070136
#Q6: Based on the above calculation, which number of splits indicates the minimum complexity parameters? 

prp(tr)

t(t(names(finished_red)))
##       [,1]      
##  [1,] "samerace"
##  [2,] "sports"  
##  [3,] "tvsports"
##  [4,] "exercise"
##  [5,] "dining"  
##  [6,] "museums" 
##  [7,] "art"     
##  [8,] "hiking"  
##  [9,] "gaming"  
## [10,] "clubbing"
## [11,] "reading" 
## [12,] "tv"      
## [13,] "theater" 
## [14,] "movies"  
## [15,] "concerts"
## [16,] "music"   
## [17,] "shopping"
## [18,] "yoga"
#which are the most important variables
t(t(tr$variable.importance))
##               [,1]
## clubbing 36.098534
## art      33.536213
## museums  30.441618
## exercise 29.192094
## movies   27.914797
## theater  26.966988
## sports   26.554477
## yoga     23.389204
## reading  20.611029
## tvsports 20.405934
## dining   19.716627
## music    18.300311
## tv       17.566070
## concerts 15.026010
## gaming   10.731883
## hiking    7.825384
## shopping  7.259339
#set of rules
tr
## n= 4480 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##   1) root 4480 1768 0 (0.60535714 0.39464286)  
##     2) yoga>=0.5 4465 1753 0 (0.60739082 0.39260918)  
##       4) gaming>=4.5 1727  612 0 (0.64562826 0.35437174)  
##         8) exercise< 9.5 1592  541 0 (0.66017588 0.33982412)  
##          16) sports>=1.5 1569  523 0 (0.66666667 0.33333333)  
##            32) clubbing>=9.5 27    1 0 (0.96296296 0.03703704) *
##            33) clubbing< 9.5 1542  522 0 (0.66147860 0.33852140)  
##              66) clubbing< 7.5 903  277 0 (0.69324474 0.30675526)  
##               132) movies>=4.5 829  239 0 (0.71170084 0.28829916) *
##               133) movies< 4.5 74   36 1 (0.48648649 0.51351351) *
##              67) clubbing>=7.5 639  245 0 (0.61658842 0.38341158)  
##               134) music>=6.5 578  210 0 (0.63667820 0.36332180) *
##               135) music< 6.5 61   26 1 (0.42622951 0.57377049) *
##          17) sports< 1.5 23    5 1 (0.21739130 0.78260870)  
##            34) exercise< 3 6    2 0 (0.66666667 0.33333333) *
##            35) exercise>=3 17    1 1 (0.05882353 0.94117647) *
##         9) exercise>=9.5 135   64 1 (0.47407407 0.52592593)  
##          18) museums< 4.5 16    1 0 (0.93750000 0.06250000) *
##          19) museums>=4.5 119   49 1 (0.41176471 0.58823529)  
##            38) theater>=9.5 9    1 0 (0.88888889 0.11111111) *
##            39) theater< 9.5 110   41 1 (0.37272727 0.62727273)  
##              78) concerts< 7.5 38   19 0 (0.50000000 0.50000000)  
##               156) museums>=6 33   16 0 (0.51515152 0.48484848) *
##               157) museums< 6 5    2 1 (0.40000000 0.60000000) *
##              79) concerts>=7.5 72   22 1 (0.30555556 0.69444444) *
##       5) gaming< 4.5 2738 1141 0 (0.58327246 0.41672754)  
##        10) sports< 6.5 1280  475 0 (0.62890625 0.37109375)  
##          20) clubbing>=3.5 882  292 0 (0.66893424 0.33106576)  
##            40) reading>=9.5 182   38 0 (0.79120879 0.20879121)  
##              80) dining>=5.5 162   27 0 (0.83333333 0.16666667) *
##              81) dining< 5.5 20    9 1 (0.45000000 0.55000000)  
##               162) museums< 9.5 11    5 0 (0.54545455 0.45454545) *
##               163) museums>=9.5 9    3 1 (0.33333333 0.66666667) *
##            41) reading< 9.5 700  254 0 (0.63714286 0.36285714)  
##              82) exercise< 7.5 534  177 0 (0.66853933 0.33146067) *
##              83) exercise>=7.5 166   77 0 (0.53614458 0.46385542)  
##               166) music< 5.5 17    1 0 (0.94117647 0.05882353) *
##               167) music>=5.5 149   73 1 (0.48993289 0.51006711) *
##          21) clubbing< 3.5 398  183 0 (0.54020101 0.45979899)  
##            42) art< 2.5 16    1 0 (0.93750000 0.06250000) *
##            43) art>=2.5 382  182 0 (0.52356021 0.47643979)  
##              86) yoga>=8 46   12 0 (0.73913043 0.26086957)  
##               172) tvsports< 2.5 29    2 0 (0.93103448 0.06896552) *
##               173) tvsports>=2.5 17    7 1 (0.41176471 0.58823529) *
##              87) yoga< 8 336  166 1 (0.49404762 0.50595238)  
##               174) exercise< 1.5 22    5 0 (0.77272727 0.22727273) *
##               175) exercise>=1.5 314  149 1 (0.47452229 0.52547771) *
##        11) sports>=6.5 1458  666 0 (0.54320988 0.45679012)  
##          22) theater>=7.5 439  167 0 (0.61958998 0.38041002)  
##            44) tvsports>=3.5 261   81 0 (0.68965517 0.31034483) *
##            45) tvsports< 3.5 178   86 0 (0.51685393 0.48314607)  
##              90) movies>=8.5 120   46 0 (0.61666667 0.38333333)  
##               180) clubbing< 7.5 80   24 0 (0.70000000 0.30000000) *
##               181) clubbing>=7.5 40   18 1 (0.45000000 0.55000000) *
##              91) movies< 8.5 58   18 1 (0.31034483 0.68965517)  
##               182) sports< 7.5 14    6 0 (0.57142857 0.42857143) *
##               183) sports>=7.5 44   10 1 (0.22727273 0.77272727) *
##          23) theater< 7.5 1019  499 0 (0.51030422 0.48969578)  
##            46) dining>=7.5 529  229 0 (0.56710775 0.43289225)  
##              92) art< 7.5 368  137 0 (0.62771739 0.37228261) *
##              93) art>=7.5 161   69 1 (0.42857143 0.57142857)  
##               186) tvsports>=8 10    1 0 (0.90000000 0.10000000) *
##               187) tvsports< 8 151   60 1 (0.39735099 0.60264901) *
##            47) dining< 7.5 490  220 1 (0.44897959 0.55102041)  
##              94) tv>=8.5 29    9 0 (0.68965517 0.31034483) *
##              95) tv< 8.5 461  200 1 (0.43383948 0.56616052)  
##               190) movies>=8.5 73   30 0 (0.58904110 0.41095890) *
##               191) movies< 8.5 388  157 1 (0.40463918 0.59536082) *
##     3) yoga< 0.5 15    0 1 (0.00000000 1.00000000) *
valid.df$pred <- predict(pfit, valid.df, type = "class")
corr <- which(valid.df$pred == valid.df$samerace)

base_accuracy <- length(corr)/nrow(valid.df)

base_accuracy
## [1] 0.6144578