library(reshape2)
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(caret)
## Warning: package 'caret' was built under R version 4.2.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.2.3
## Loading required package: lattice
library(ggplot2)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(class)
# svm
library(e1071)
# lm
library(leaps)
## Warning: package 'leaps' was built under R version 4.2.3
library(forecast) #for accuracy measures
## Warning: package 'forecast' was built under R version 4.2.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
#rf
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.2.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
# decision tree
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.2.3
library(randomForest)
library(xgboost)
## Warning: package 'xgboost' was built under R version 4.2.3
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
library(Matrix)
library(gains)
dating <- read.csv("Speed Dating Data.csv")
ggplot(dating, aes(x = as.factor(race))) +
geom_bar() +
theme_bw() +
scale_x_discrete(labels=c("African American", "Caucasian","Hispanic","Pacific Islander", "Native American", "Other"))

ggplot(dating, aes(x = as.factor(career_c))) +
geom_bar() +
theme_bw()

reduced <- dating[,c(15,51:67)]
# Start the imputing process by selecting only the rows without NA
finished_red <- reduced[complete.cases(reduced),]
# Start the imputing process
which(is.na(finished_red))
## integer(0)
# Convert to factor values
finished_red$samerace <- as.factor(finished_red$samerace)
######################################## Start building a knn model
# Split Dataset!
#set.seed value as 1
set.seed(1)
# randomly select 60 percent of the row numbers finished_red in order to split.
train.index <- sample(row.names(finished_red), 0.6*dim(finished_red)[1])
# select the rest of row numbers of finished_red by using setdiff()
valid.index <- setdiff(row.names(finished_red), train.index)
# take the 10 percent of random row numbers and call it train.df
train.df <- finished_red[train.index, ]
# take the rest and call it valid.df
valid.df <- finished_red[valid.index, ]
# make sure our validating outcome variables as factor so they become a categorical variable.
# Normalize
train.norm.df <- train.df
valid.norm.df <- valid.df
norm.df <- finished_red
norm.values <- preProcess(train.df[,-1], method=c("center", "scale"))
train.norm.df[,-1] <- predict(norm.values, train.df[,-1])
valid.norm.df[,-1] <- predict(norm.values, valid.df[,-1])
norm.df[,-1] <- predict(norm.values, finished_red[,-1])
############################################ knn
# use knn with k = 3 which means "number of neighbours considered" = 3 instead of 2
knn.pred <- knn(train.norm.df[,-1], valid.norm.df[,-1],
cl = train.norm.df[,1], k = 3)
summary(knn.pred)
## 0 1
## 1975 1345
# Accuracy
length(which(knn.pred == valid.df$samerace))/nrow(valid.df)
## [1] 0.6626506
# initialize a data frame with two columns: k, and accuracy.
accuracy.df <- data.frame(k = seq(1, 10, 1), accuracy = rep(0, 10))
# Use a for loop to do what we did above for each row of accuracy.df
for(i in 1:10) {
knn.pred <- knn(train.norm.df[,-1], valid.norm.df[,-1],
cl = train.norm.df[, 1], k = i)
accuracy.df[i, 2] <- confusionMatrix(knn.pred, valid.norm.df[,1])$overall[1]
}
accuracy.df
## k accuracy
## 1 1 0.6626506
## 2 2 0.6590361
## 3 3 0.6641566
## 4 4 0.6650602
## 5 5 0.6626506
## 6 6 0.6536145
## 7 7 0.6500000
## 8 8 0.6430723
## 9 9 0.6397590
## 10 10 0.6394578
# k = 1 is the best (0.1114471)
knn.pred.new <- knn(train.norm.df[, -1], valid.norm.df[, -1],
cl = train.norm.df[, 1], k = 1)
summary(knn.pred.new)
## 0 1
## 1952 1368
# Accuracy
length(which(knn.pred.new == valid.df$samerace))/nrow(valid.df)
## [1] 0.6587349
result <- valid.df
result$pred <- knn.pred.new
result$diff <- result$samerace == knn.pred.new
wrong <- valid.df[which(!result$diff),]
melt_data <- melt(wrong, id = c("samerace"))
melt_data$value <- as.numeric(melt_data$value)
ggplot(melt_data, aes(x = samerace, y = value)) +
geom_boxplot() +
facet_wrap(~variable, scales = "free")

###################################### Ten fold cross validation in caret
train.df$samerace <- as.factor(train.df$samerace)
trControl <- trainControl(method = "cv",
number = 10)
fit <- train(samerace ~ .,
method = "knn",
tuneGrid = expand.grid(k = 1:10),
trControl = trControl,
metric = "Accuracy",
data = train.df)
fit
## k-Nearest Neighbors
##
## 4979 samples
## 17 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 4482, 4481, 4481, 4480, 4482, 4482, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.6597827 0.2841865
## 2 0.6599811 0.2840645
## 3 0.6621912 0.2883650
## 4 0.6585868 0.2801314
## 5 0.6581836 0.2786213
## 6 0.6507550 0.2632526
## 7 0.6487454 0.2580464
## 8 0.6463301 0.2491112
## 9 0.6396951 0.2308615
## 10 0.6364810 0.2224066
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 3.
################################### logistic regression
# in creating the folds we specify the target feature (dependent variable) and # of folds
folds = createFolds(train.df$samerace, k = 10)
# in cv we are going to applying a created function to our 'folds'
cv = lapply(folds, function(x) { # start of function
# in the next two lines we will separate the Training set into it's 10 pieces
training_fold = train.df[-x, ] # training fold = training set minus (-) it's sub test fold
test_fold = train.df[x, ] # here we describe the test fold individually
levels(test_fold$samerace) <- levels(training_fold$samerace)
# now apply (train) the classifer on the training_fold
classifier = svm(formula = samerace ~ .,
data = training_fold,
type = 'C-classification',
kernel = 'radial')
# next step in the loop, we calculate the predictions and cm and we equate the accuracy
# note we are training on training_fold and testing its accuracy on the test_fold
y_pred = predict(classifier, newdata = test_fold)
cm = table(test_fold[, 1], y_pred)
accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])
return(accuracy)
})
cv
## $Fold01
## [1] 0.6653307
##
## $Fold02
## [1] 0.624498
##
## $Fold03
## [1] 0.6317907
##
## $Fold04
## [1] 0.6177062
##
## $Fold05
## [1] 0.6365462
##
## $Fold06
## [1] 0.6325301
##
## $Fold07
## [1] 0.6553106
##
## $Fold08
## [1] 0.6526104
##
## $Fold09
## [1] 0.6780684
##
## $Fold10
## [1] 0.6546185
which.max(cv)
## Fold09
## 9
f<- folds[[which.max(cv)]]
##########Now lets build a model!!
set.seed(1)
# since fold 1 is most accurate we shall use folds$Fold01 to create our train and test dataset!
training_fold = train.df[-f, ]
test_fold = valid.df # here we describe the test fold individually
# now apply (train) the classifer on the training_fold
classifier = svm(formula = samerace ~ .,
data = train.df,
type = 'C-classification',
kernel = 'radial')
summary(classifier)
##
## Call:
## svm(formula = samerace ~ ., data = train.df, type = "C-classification",
## kernel = "radial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 3887
##
## ( 2001 1886 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
# next step in the loop, we calculate the predictions and cm and we equate the accuracy
# note we are training on training_fold and testing its accuracy on the test_fold
y_pred = predict(classifier, newdata = test_fold[,-1])
cm = table(test_fold[, 1], y_pred)
accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])
accuracy
## [1] 0.6370482
################################### Random Forest
# in creating the folds we specify the target feature (dependent variable) and # of folds
folds = createFolds(train.df$samerace, k = 10)
# in cv we are going to applying a created function to our 'folds'
cv = lapply(folds, function(x) { # start of function
# in the next two lines we will separate the Training set into it's 10 pieces
training_fold = train.df[-x, ] # training fold = training set minus (-) it's sub test fold
test_fold = train.df[x, ] # here we describe the test fold individually
levels(test_fold$samerace) <- levels(training_fold$samerace)
# now apply (train) the classifer on the training_fold
classifier = randomForest(samerace~., data=training_fold, proximity=TRUE)
# next step in the loop, we calculate the predictions and cm and we equate the accuracy
# note we are training on training_fold and testing its accuracy on the test_fold
y_pred = predict(classifier, newdata = test_fold)
cm = table(test_fold[, 1], y_pred)
accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])
return(accuracy)
})
cv
## $Fold01
## [1] 0.6633267
##
## $Fold02
## [1] 0.6700201
##
## $Fold03
## [1] 0.6539235
##
## $Fold04
## [1] 0.6553106
##
## $Fold05
## [1] 0.694165
##
## $Fold06
## [1] 0.6706827
##
## $Fold07
## [1] 0.6626506
##
## $Fold08
## [1] 0.6666667
##
## $Fold09
## [1] 0.6593186
##
## $Fold10
## [1] 0.6619718
which.max(cv)
## Fold05
## 5
f<- folds[[which.max(cv)]]
##########Now lets build a model!!
set.seed(1)
# since fold 1 is most accurate we shall use folds$Fold01 to create our train and test dataset!
training_fold = train.df[-f, ]
test_fold = valid.df # here we describe the test fold individually
# now apply (train) the classifer on the training_fold
classifier = randomForest(formula = samerace ~ ., data=training_fold, proximity=TRUE)
summary(classifier)
## Length Class Mode
## call 4 -none- call
## type 1 -none- character
## predicted 4482 factor numeric
## err.rate 1500 -none- numeric
## confusion 6 -none- numeric
## votes 8964 matrix numeric
## oob.times 4482 -none- numeric
## classes 2 -none- character
## importance 17 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 20088324 -none- numeric
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 4482 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
# next step in the loop, we calculate the predictions and cm and we equate the accuracy
# note we are training on training_fold and testing its accuracy on the test_fold
y_pred = predict(classifier, newdata = test_fold[,-1])
cm = table(test_fold[, 1], y_pred)
accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])
accuracy
## [1] 0.6554217
##############################################Decision Tree - 10 fold Cross validation
folds = createFolds(train.df$samerace, k = 10)
cv = lapply(folds, function(x) { # start of function
# in the next two lines we will separate the Training set into it's 10 pieces
training_fold = train.df[-x, ] # training fold = training set minus (-) it's sub test fold
test_fold = train.df[x, ] # here we describe the test fold individually
levels(test_fold$samerace) <- levels(training_fold$samerace)
# now apply (train) the classifer on the training_fold
tr <- rpart(samerace ~ ., data = training_fold, cp=0.0001, minbucket = 5, maxdepth = 7)
pfit<- prune(tr, cp = tr$cptable[which.min(tr$cptable[,"xerror"]),"CP"])
# next step in the loop, we calculate the predictions and cm and we equate the accuracy
# note we are training on training_fold and testing its accuracy on the test_fold
y_pred = predict(pfit, test_fold, type = "class")
cm = table(test_fold[, 1], y_pred)
accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])
return(accuracy)
})
cv
## $Fold01
## [1] 0.6124498
##
## $Fold02
## [1] 0.6372745
##
## $Fold03
## [1] 0.6156942
##
## $Fold04
## [1] 0.62249
##
## $Fold05
## [1] 0.6365462
##
## $Fold06
## [1] 0.6112224
##
## $Fold07
## [1] 0.6317907
##
## $Fold08
## [1] 0.6297787
##
## $Fold09
## [1] 0.6365462
##
## $Fold10
## [1] 0.62249
which.max(cv)
## Fold02
## 2
f<- folds[[which.max(cv)]]
################################################build Decision Tree
#fit a classification tree model (Competitive. is the outcome variable, and all other variables are predictors).
tr <- rpart(samerace ~ ., data = train.df[-f,], cp=0.0001, minbucket = 5, maxdepth = 7)
rpart.plot(tr, extra = 106)

#print cp of the newly fitted model.
options(scipen = 10)
printcp(tr)
##
## Classification tree:
## rpart(formula = samerace ~ ., data = train.df[-f, ], cp = 0.0001,
## minbucket = 5, maxdepth = 7)
##
## Variables actually used in tree construction:
## [1] art clubbing concerts dining exercise gaming movies museums
## [9] music reading sports theater tv tvsports yoga
##
## Root node error: 1768/4480 = 0.39464
##
## n= 4480
##
## CP nsplit rel error xerror xstd
## 1 0.00848416 0 1.00000 1.00000 0.018504
## 2 0.00707014 1 0.99152 0.99208 0.018478
## 3 0.00678733 6 0.95023 0.99887 0.018500
## 4 0.00641026 8 0.93665 0.99887 0.018500
## 5 0.00622172 11 0.91742 0.99830 0.018498
## 6 0.00452489 13 0.90498 0.99491 0.018487
## 7 0.00395928 14 0.90045 0.98982 0.018471
## 8 0.00226244 15 0.89649 0.98360 0.018449
## 9 0.00169683 20 0.88518 0.97964 0.018436
## 10 0.00113122 24 0.87839 0.97964 0.018436
## 11 0.00070701 27 0.87500 0.97568 0.018422
## 12 0.00056561 31 0.87217 0.97964 0.018436
## 13 0.00028281 32 0.87161 0.97907 0.018434
## 14 0.00010000 34 0.87104 0.98077 0.018440
# determine a nested sequence of subtrees on the rpart object "tr" to be trimmed based on the complexity parameter that has the lowest standard error.
pfit<- prune(tr, cp = tr$cptable[which.min(tr$cptable[,"xerror"]),"CP"])
# print out the lowest standard error (xerror)
tr$cptable[which.min(tr$cptable[,"xerror"]),"CP"]
## [1] 0.0007070136
#Q6: Based on the above calculation, which number of splits indicates the minimum complexity parameters?
prp(tr)

t(t(names(finished_red)))
## [,1]
## [1,] "samerace"
## [2,] "sports"
## [3,] "tvsports"
## [4,] "exercise"
## [5,] "dining"
## [6,] "museums"
## [7,] "art"
## [8,] "hiking"
## [9,] "gaming"
## [10,] "clubbing"
## [11,] "reading"
## [12,] "tv"
## [13,] "theater"
## [14,] "movies"
## [15,] "concerts"
## [16,] "music"
## [17,] "shopping"
## [18,] "yoga"
#which are the most important variables
t(t(tr$variable.importance))
## [,1]
## clubbing 36.098534
## art 33.536213
## museums 30.441618
## exercise 29.192094
## movies 27.914797
## theater 26.966988
## sports 26.554477
## yoga 23.389204
## reading 20.611029
## tvsports 20.405934
## dining 19.716627
## music 18.300311
## tv 17.566070
## concerts 15.026010
## gaming 10.731883
## hiking 7.825384
## shopping 7.259339
#set of rules
tr
## n= 4480
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 4480 1768 0 (0.60535714 0.39464286)
## 2) yoga>=0.5 4465 1753 0 (0.60739082 0.39260918)
## 4) gaming>=4.5 1727 612 0 (0.64562826 0.35437174)
## 8) exercise< 9.5 1592 541 0 (0.66017588 0.33982412)
## 16) sports>=1.5 1569 523 0 (0.66666667 0.33333333)
## 32) clubbing>=9.5 27 1 0 (0.96296296 0.03703704) *
## 33) clubbing< 9.5 1542 522 0 (0.66147860 0.33852140)
## 66) clubbing< 7.5 903 277 0 (0.69324474 0.30675526)
## 132) movies>=4.5 829 239 0 (0.71170084 0.28829916) *
## 133) movies< 4.5 74 36 1 (0.48648649 0.51351351) *
## 67) clubbing>=7.5 639 245 0 (0.61658842 0.38341158)
## 134) music>=6.5 578 210 0 (0.63667820 0.36332180) *
## 135) music< 6.5 61 26 1 (0.42622951 0.57377049) *
## 17) sports< 1.5 23 5 1 (0.21739130 0.78260870)
## 34) exercise< 3 6 2 0 (0.66666667 0.33333333) *
## 35) exercise>=3 17 1 1 (0.05882353 0.94117647) *
## 9) exercise>=9.5 135 64 1 (0.47407407 0.52592593)
## 18) museums< 4.5 16 1 0 (0.93750000 0.06250000) *
## 19) museums>=4.5 119 49 1 (0.41176471 0.58823529)
## 38) theater>=9.5 9 1 0 (0.88888889 0.11111111) *
## 39) theater< 9.5 110 41 1 (0.37272727 0.62727273)
## 78) concerts< 7.5 38 19 0 (0.50000000 0.50000000)
## 156) museums>=6 33 16 0 (0.51515152 0.48484848) *
## 157) museums< 6 5 2 1 (0.40000000 0.60000000) *
## 79) concerts>=7.5 72 22 1 (0.30555556 0.69444444) *
## 5) gaming< 4.5 2738 1141 0 (0.58327246 0.41672754)
## 10) sports< 6.5 1280 475 0 (0.62890625 0.37109375)
## 20) clubbing>=3.5 882 292 0 (0.66893424 0.33106576)
## 40) reading>=9.5 182 38 0 (0.79120879 0.20879121)
## 80) dining>=5.5 162 27 0 (0.83333333 0.16666667) *
## 81) dining< 5.5 20 9 1 (0.45000000 0.55000000)
## 162) museums< 9.5 11 5 0 (0.54545455 0.45454545) *
## 163) museums>=9.5 9 3 1 (0.33333333 0.66666667) *
## 41) reading< 9.5 700 254 0 (0.63714286 0.36285714)
## 82) exercise< 7.5 534 177 0 (0.66853933 0.33146067) *
## 83) exercise>=7.5 166 77 0 (0.53614458 0.46385542)
## 166) music< 5.5 17 1 0 (0.94117647 0.05882353) *
## 167) music>=5.5 149 73 1 (0.48993289 0.51006711) *
## 21) clubbing< 3.5 398 183 0 (0.54020101 0.45979899)
## 42) art< 2.5 16 1 0 (0.93750000 0.06250000) *
## 43) art>=2.5 382 182 0 (0.52356021 0.47643979)
## 86) yoga>=8 46 12 0 (0.73913043 0.26086957)
## 172) tvsports< 2.5 29 2 0 (0.93103448 0.06896552) *
## 173) tvsports>=2.5 17 7 1 (0.41176471 0.58823529) *
## 87) yoga< 8 336 166 1 (0.49404762 0.50595238)
## 174) exercise< 1.5 22 5 0 (0.77272727 0.22727273) *
## 175) exercise>=1.5 314 149 1 (0.47452229 0.52547771) *
## 11) sports>=6.5 1458 666 0 (0.54320988 0.45679012)
## 22) theater>=7.5 439 167 0 (0.61958998 0.38041002)
## 44) tvsports>=3.5 261 81 0 (0.68965517 0.31034483) *
## 45) tvsports< 3.5 178 86 0 (0.51685393 0.48314607)
## 90) movies>=8.5 120 46 0 (0.61666667 0.38333333)
## 180) clubbing< 7.5 80 24 0 (0.70000000 0.30000000) *
## 181) clubbing>=7.5 40 18 1 (0.45000000 0.55000000) *
## 91) movies< 8.5 58 18 1 (0.31034483 0.68965517)
## 182) sports< 7.5 14 6 0 (0.57142857 0.42857143) *
## 183) sports>=7.5 44 10 1 (0.22727273 0.77272727) *
## 23) theater< 7.5 1019 499 0 (0.51030422 0.48969578)
## 46) dining>=7.5 529 229 0 (0.56710775 0.43289225)
## 92) art< 7.5 368 137 0 (0.62771739 0.37228261) *
## 93) art>=7.5 161 69 1 (0.42857143 0.57142857)
## 186) tvsports>=8 10 1 0 (0.90000000 0.10000000) *
## 187) tvsports< 8 151 60 1 (0.39735099 0.60264901) *
## 47) dining< 7.5 490 220 1 (0.44897959 0.55102041)
## 94) tv>=8.5 29 9 0 (0.68965517 0.31034483) *
## 95) tv< 8.5 461 200 1 (0.43383948 0.56616052)
## 190) movies>=8.5 73 30 0 (0.58904110 0.41095890) *
## 191) movies< 8.5 388 157 1 (0.40463918 0.59536082) *
## 3) yoga< 0.5 15 0 1 (0.00000000 1.00000000) *
valid.df$pred <- predict(pfit, valid.df, type = "class")
corr <- which(valid.df$pred == valid.df$samerace)
base_accuracy <- length(corr)/nrow(valid.df)
base_accuracy
## [1] 0.6144578