library(reshape2)
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(caret)
## Warning: package 'caret' was built under R version 4.2.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.2.3
## Loading required package: lattice
library(ggplot2)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(class)
# svm
library(e1071)
# lm
library(leaps)
## Warning: package 'leaps' was built under R version 4.2.3
library(forecast) #for accuracy measures
## Warning: package 'forecast' was built under R version 4.2.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
#rf
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.2.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
# decision tree
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.2.3
library(randomForest)
library(xgboost)
## Warning: package 'xgboost' was built under R version 4.2.3
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
library(Matrix)
library(gains)
dating <- read.csv("Speed Dating Data.csv")
ggplot(dating, aes(x = as.factor(race))) +
geom_bar() +
theme_bw() +
scale_x_discrete(labels=c("African American", "Caucasian","Hispanic","Pacific Islander", "Native American", "Other"))

ggplot(dating, aes(x = as.factor(career_c))) +
geom_bar() +
theme_bw()

reduced <- dating[,c(38:40, 45:48, 50, 13)]
reduced[which(reduced$mn_sat == ""), 1] <- NA
reduced[which(reduced$tuition == ""), 2] <- NA
reduced[which(reduced$income == ""), 4] <- NA
unique(reduced$mn_sat)
## [1] NA "1,070.00" "1,258.00" "1,400.00" "1,290.00" "1,460.00"
## [7] "1,430.00" "1,215.00" "1,330.00" "1,450.00" "1,155.00" "1,140.00"
## [13] "1,360.00" "1,402.00" "1,250.00" "1,210.00" "1,220.00" "1,410.00"
## [19] "1,260.00" "1,380.00" "1,030.00" "1,309.00" "1,308.00" "1,050.00"
## [25] "1,100.00" "1,310.00" "1,490.00" "1,188.00" "1,097.00" "1,212.00"
## [31] "1,340.00" "1,034.00" "1,185.00" "1,242.00" "1,160.00" "1,099.00"
## [37] "1,214.00" "1,270.00" "1,110.00" "1,178.00" "1,060.00" "1,157.00"
## [43] "1,180.00" "1,014.00" "1,341.00" "990.00" "1,320.00" "1,159.00"
## [49] "1,370.00" "1,105.00" "1,365.00" "1,011.00" "1,130.00" "1,206.00"
## [55] "1,331.00" "1,191.00" "914.00" "1,200.00" "1,080.00" "1,090.00"
## [61] "1,092.00" "1,470.00" "1,149.00" "1,134.00" "1,230.00" "1,267.00"
## [67] "1,280.00" "1,227.00" "1,239.00"
unique(reduced$tuition)
## [1] NA "12,696.00" "25,020.00" "26,630.00" "15,309.00" "26,376.00"
## [7] "26,908.00" "9,451.00" "24,036.00" "26,100.00" "13,258.00" "9,168.00"
## [13] "26,062.00" "27,240.00" "13,413.00" "12,900.00" "26,019.00" "21,960.00"
## [19] "17,125.00" "24,742.00" "27,395.00" "25,917.00" "34,290.00" "27,100.00"
## [25] "27,076.00" "17,030.00" "15,162.00" "25,555.00" "14,433.00" "9,696.00"
## [31] "13,211.00" "11,360.00" "33,585.00" "25,552.00" "23,187.00" "21,645.00"
## [37] "27,728.00" "11,311.00" "10,352.00" "26,377.00" "9,210.00" "25,380.00"
## [43] "14,589.00" "14,695.00" "17,808.00" "9,620.00" "27,230.00" "14,388.00"
## [49] "10,704.00" "25,335.00" "22,170.00" "10,356.00" "14,942.00" "3,730.00"
## [55] "26,786.00" "25,425.00" "10,450.00" "10,052.00" "11,424.00" "10,222.00"
## [61] "15,260.00" "18,720.00" "11,132.00" "17,715.00" "23,500.00" "14,380.00"
## [67] "24,921.00" "25,504.00" "26,871.00" "10,800.00" "25,026.00" "8,421.00"
## [73] "10,096.00" "25,533.00" "25,839.00" "25,890.00" "14,493.00" "27,025.00"
## [79] "15,990.00" "9,729.00" "26,170.00" "13,046.00" "20,100.00" "22,481.00"
## [85] "12,350.00" "25,000.00" "24,497.00" "9,790.00" "14,600.00" "26,720.00"
## [91] "14,915.00" "26,775.00" "17,478.00" "8,100.00" "26,580.00" "19,525.00"
## [97] "11,570.00" "26,892.00" "34,300.00" "27,982.00" "24,794.00" "2,406.00"
## [103] "15,004.00" "12,444.00" "13,872.00" "23,530.00" "21,168.00" "25,847.00"
## [109] "18,656.00" "10,332.00" "27,350.00" "9,811.00" "9,162.00" "16,650.00"
## [115] "20,337.00" "26,562.00"
unique(reduced$income)
## [1] "69,487.00" "65,929.00" NA "37,754.00" "86,340.00"
## [6] "60,304.00" "54,620.00" "48,652.00" "29,237.00" "56,580.00"
## [11] "36,782.00" "38,548.00" "52,010.00" "28,418.00" "43,185.00"
## [16] "23,152.00" "43,664.00" "48,441.00" "61,152.00" "36,485.00"
## [21] "41,507.00" "17,134.00" "30,038.00" "33,772.00" "24,997.00"
## [26] "42,096.00" "28,891.00" "62,635.00" "12,063.00" "29,809.00"
## [31] "26,482.00" "30,147.00" "39,919.00" "41,466.00" "23,988.00"
## [36] "28,989.00" "50,948.00" "38,022.00" "47,559.00" "53,539.00"
## [41] "32,159.00" "53,940.00" "40,753.00" "38,207.00" "46,166.00"
## [46] "30,973.00" "28,317.00" "26,645.00" "25,589.00" "55,223.00"
## [51] "109,031.00" "40,409.00" "21,597.00" "76,624.00" "35,968.00"
## [56] "51,725.00" "55,419.00" "55,550.00" "26,682.00" "41,547.00"
## [61] "23,361.00" "74,893.00" "52,804.00" "53,923.00" "27,094.00"
## [66] "57,213.00" "42,390.00" "43,636.00" "57,887.00" "30,768.00"
## [71] "66,699.00" "45,360.00" "55,080.00" "17,378.00" "40,375.00"
## [76] "48,929.00" "78,193.00" "63,351.00" "50,745.00" "29,279.00"
## [81] "38,774.00" "58,802.00" "41,831.00" "52,186.00" "97,857.00"
## [86] "74,624.00" "21,590.00" "38,832.00" "37,248.00" "28,240.00"
## [91] "53,771.00" "56,096.00" "31,560.00" "52,467.00" "80,006.00"
## [96] "47,572.00" "22,439.00" "31,383.00" "40,749.00" "47,997.00"
## [101] "78,704.00" "31,143.00" "32,129.00" "44,195.00" "46,837.00"
## [106] "97,972.00" "35,960.00" "65,708.00" "49,466.00" "53,229.00"
## [111] "32,649.00" "35,867.00" "40,244.00" "42,640.00" "52,388.00"
## [116] "62,875.00" "30,855.00" "46,800.00" "45,695.00" "46,792.00"
## [121] "53,501.00" "64,716.00" "27,248.00" "22,805.00" "56,118.00"
## [126] "30,146.00" "39,123.00" "46,153.00" "45,300.00" "42,397.00"
## [131] "44,346.00" "42,225.00" "37,405.00" "28,524.00" "61,141.00"
## [136] "8,607.00" "41,476.00" "49,841.00" "37,240.00" "36,594.00"
## [141] "62,997.00" "46,608.00" "37,881.00" "48,944.00" "77,112.00"
## [146] "18,283.00" "31,432.00" "73,073.00" "26,706.00" "50,060.00"
## [151] "25,401.00" "80,608.00" "43,844.00" "53,196.00" "25,786.00"
## [156] "39,394.00" "40,695.00" "45,788.00" "37,315.00" "51,663.00"
## [161] "32,563.00" "54,303.00" "16,908.00" "39,729.00" "57,316.00"
## [166] "30,587.00" "57,513.00" "31,857.00" "23,207.00" "25,831.00"
## [171] "28,759.00" "19,264.00" "41,778.00" "35,963.00" "49,409.00"
## [176] "31,516.00" "36,223.00" "43,367.00" "27,503.00" "35,187.00"
## [181] "26,298.00" "31,148.00" "55,704.00" "46,138.00" "66,827.00"
## [186] "42,897.00" "31,809.00" "75,347.00" "47,005.00" "52,805.00"
## [191] "50,725.00" "65,693.00" "45,736.00" "33,906.00" "50,501.00"
## [196] "48,785.00" "52,318.00" "62,844.00" "52,586.00" "29,236.00"
## [201] "31,486.00" "31,632.00" "106,663.00" "84,043.00" "35,224.00"
## [206] "36,381.00" "65,498.00" "60,000.00" "22,669.00" "81,266.00"
## [211] "29,746.00" "47,556.00" "42,651.00" "27,794.00" "41,737.00"
## [216] "90,225.00" "52,280.00" "56,056.00" "60,835.00" "62,829.00"
## [221] "16,767.00" "42,967.00" "21,488.00" "89,977.00" "18,619.00"
## [226] "22,161.00" "82,734.00" "40,163.00" "46,185.00" "78,844.00"
## [231] "29,575.00" "34,752.00" "22,173.00" "37,994.00" "35,409.00"
## [236] "23,707.00" "57,501.00" "25,314.00" "48,876.00" "34,870.00"
## [241] "35,848.00" "45,017.00" "12,416.00" "87,789.00" "50,572.00"
## [246] "49,642.00" "20,000.00" "32,508.00" "35,627.00" "46,280.00"
## [251] "41,191.00" "71,787.00" "72,412.00" "36,510.00" "32,386.00"
## [256] "15,863.00" "46,272.00" "48,137.00" "61,686.00" "47,624.00"
## [261] "36,673.00" "55,138.00"
unique(reduced$goal)
## [1] 2 1 6 3 4 NA 5
unique(reduced$date)
## [1] 7 5 3 4 6 1 NA 2
unique(reduced$go_out)
## [1] 1 4 2 NA 3 7 5 6
unique(reduced$career_c)
## [1] NA 1 6 9 2 7 10 5 3 4 14 11 8 15 12 17 13 16
# Convert to numerical values
reduced[] <- lapply(reduced, function(x) gsub("\\,", "", x))
reduced$mn_sat <- as.numeric(reduced$mn_sat)
reduced$tuition <- as.numeric(reduced$tuition)
reduced$income <- as.numeric(reduced$income)
# Start the imputing process
finished_red <- reduced[complete.cases(reduced),]
which(is.na(finished_red))
## integer(0)
finished_red[] <- lapply(finished_red, as.numeric)
finished_red$match <- as.factor(finished_red$match)
######################################## Start building a knn model
# Split Dataset!
#set.seed value as 1
set.seed(1)
# randomly select 60 percent of the row numbers finished_red in order to split.
train.index <- sample(row.names(finished_red), 0.6*dim(finished_red)[1])
# select the rest of row numbers of finished_red by using setdiff()
valid.index <- setdiff(row.names(finished_red), train.index)
# take the 60 percent of random row numbers and call it train.df
train.df <- finished_red[train.index, ]
# take the rest and call it valid.df
valid.df <- finished_red[valid.index, ]
# make sure our validating outcome variables as factor so they become a categorical variable.
# Normalize
train.norm.df <- train.df
valid.norm.df <- valid.df
norm.df <- finished_red
norm.values <- preProcess(train.df[,-9], method=c("center", "scale"))
train.norm.df[,-9] <- predict(norm.values, train.df[,-9])
valid.norm.df[,-9] <- predict(norm.values, valid.df[,-9])
norm.df[,-9] <- predict(norm.values, finished_red[,-9])
############################################ knn
# use knn with k = 3 which means "number of neighbours considered" = 3 instead of 2
knn.pred <- knn(train.norm.df[,-9], valid.norm.df[,-9],
cl = train.norm.df[,9], k = 3)
summary(knn.pred)
## 0 1
## 715 53
# Accuracy
length(which(knn.pred == valid.df$match))/nrow(valid.df)
## [1] 0.8229167
# initialize a data frame with two columns: k, and accuracy.
accuracy.df <- data.frame(k = seq(1, 10, 1), accuracy = rep(0, 10))
# Use a for loop to do what we did above for each row of accuracy.df
for(i in 1:10) {
knn.pred <- knn(train.norm.df[,-9], valid.norm.df[,-9],
cl = train.norm.df[, 9], k = i)
accuracy.df[i, 2] <- confusionMatrix(knn.pred, valid.norm.df[,9])$overall[1]
}
accuracy.df
## k accuracy
## 1 1 0.8216146
## 2 2 0.8229167
## 3 3 0.8229167
## 4 4 0.8229167
## 5 5 0.8242188
## 6 6 0.8229167
## 7 7 0.8281250
## 8 8 0.8307292
## 9 9 0.8307292
## 10 10 0.8359375
# k = 10 is the best (0.8385417)
knn.pred.new <- knn(train.norm.df[, -9], valid.norm.df[, -9],
cl = train.norm.df[, 9], k = 10)
summary(knn.pred.new)
## 0 1
## 735 33
# Accuracy
length(which(knn.pred.new == valid.df$match))/nrow(valid.df)
## [1] 0.8385417
result <- valid.df
result$pred <- knn.pred.new
result$diff <- result$match == knn.pred.new
wrong <- valid.df[which(!result$diff),]
wrong <- wrong[,1:9]
melt_data <- melt(wrong, id = c("match"))
melt_data$value <- as.numeric(melt_data$value)
ggplot(melt_data, aes(x = match, y = value)) +
geom_boxplot() +
facet_wrap(~variable, scales = "free")

###################################### Ten fold cross validation in caret
train.df$match <- as.factor(train.df$match)
trControl <- trainControl(method = "cv",
number = 10)
fit <- train(match ~ .,
method = "knn",
tuneGrid = expand.grid(k = 1:10),
trControl = trControl,
metric = "Accuracy",
data = train.df)
fit
## k-Nearest Neighbors
##
## 1151 samples
## 8 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1036, 1035, 1035, 1036, 1037, 1035, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.8374909 0.15448235
## 2 0.8461869 0.20228512
## 3 0.8418767 0.16146609
## 4 0.8409694 0.14804248
## 5 0.8410148 0.13695364
## 6 0.8401527 0.08528767
## 7 0.8401377 0.07829305
## 8 0.8436011 0.08017681
## 9 0.8436388 0.06875562
## 10 0.8418919 0.05870907
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 2.
################################### logistic regression
# in creating the folds we specify the target feature (dependent variable) and # of folds
folds = createFolds(train.df$match, k = 10)
# in cv we are going to applying a created function to our 'folds'
cv = lapply(folds, function(x) { # start of function
# in the next two lines we will separate the Training set into it's 10 pieces
training_fold = train.df[-x, ] # training fold = training set minus (-) it's sub test fold
test_fold = train.df[x, ] # here we describe the test fold individually
levels(test_fold$match) <- levels(training_fold$match)
# now apply (train) the classifer on the training_fold
classifier = svm(formula = match ~ .,
data = training_fold,
type = 'C-classification',
kernel = 'linear')
# next step in the loop, we calculate the predictions and cm and we equate the accuracy
# note we are training on training_fold and testing its accuracy on the test_fold
y_pred = predict(classifier, newdata = test_fold)
cm = table(test_fold[, 9], y_pred)
accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])
return(accuracy)
})
cv
## $Fold01
## [1] 0.8508772
##
## $Fold02
## [1] 0.8521739
##
## $Fold03
## [1] 0.8521739
##
## $Fold04
## [1] 0.8521739
##
## $Fold05
## [1] 0.8448276
##
## $Fold06
## [1] 0.8448276
##
## $Fold07
## [1] 0.8521739
##
## $Fold08
## [1] 0.8434783
##
## $Fold09
## [1] 0.8434783
##
## $Fold10
## [1] 0.8521739
which.max(cv)
## Fold02
## 2
f<- folds[[which.max(cv)]]
##########Now lets build a model!!
set.seed(1)
# since fold 1 is most accurate we shall use folds$Fold01 to create our train and test dataset!
training_fold = train.df[-f, ]
test_fold = valid.df # here we describe the test fold individually
# now apply (train) the classifer on the training_fold
classifier = svm(formula = match ~ .,
data = train.df,
type = 'C-classification',
kernel = 'radial')
summary(classifier)
##
## Call:
## svm(formula = match ~ ., data = train.df, type = "C-classification",
## kernel = "radial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 402
##
## ( 228 174 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
# next step in the loop, we calculate the predictions and cm and we equate the accuracy
# note we are training on training_fold and testing its accuracy on the test_fold
y_pred = predict(classifier, newdata = test_fold[,-9])
cm = table(test_fold[, 9], y_pred)
accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])
accuracy
## [1] 0.8359375
################################### Random Forest
# in creating the folds we specify the target feature (dependent variable) and # of folds
folds = createFolds(train.df$match, k = 10)
# in cv we are going to applying a created function to our 'folds'
cv = lapply(folds, function(x) { # start of function
# in the next two lines we will separate the Training set into it's 10 pieces
training_fold = train.df[-x, ] # training fold = training set minus (-) it's sub test fold
test_fold = train.df[x, ] # here we describe the test fold individually
levels(test_fold$match) <- levels(training_fold$match)
# now apply (train) the classifer on the training_fold
classifier = randomForest(match~., data=training_fold, proximity=TRUE)
# next step in the loop, we calculate the predictions and cm and we equate the accuracy
# note we are training on training_fold and testing its accuracy on the test_fold
y_pred = predict(classifier, newdata = test_fold)
cm = table(test_fold[, 9], y_pred)
accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])
return(accuracy)
})
cv
## $Fold01
## [1] 0.8695652
##
## $Fold02
## [1] 0.8448276
##
## $Fold03
## [1] 0.8086957
##
## $Fold04
## [1] 0.8448276
##
## $Fold05
## [1] 0.8521739
##
## $Fold06
## [1] 0.8421053
##
## $Fold07
## [1] 0.8362069
##
## $Fold08
## [1] 0.8157895
##
## $Fold09
## [1] 0.826087
##
## $Fold10
## [1] 0.8347826
which.max(cv)
## Fold01
## 1
#Random Forrest cannot predict with high accuracy for this dataset
##############################################Decision Tree - 10 fold Cross validation
folds = createFolds(train.df$match, k = 10)
cv = lapply(folds, function(x) { # start of function
# in the next two lines we will separate the Training set into it's 10 pieces
training_fold = train.df[-x, ] # training fold = training set minus (-) it's sub test fold
test_fold = train.df[x, ] # here we describe the test fold individually
levels(test_fold$match) <- levels(training_fold$match)
# now apply (train) the classifer on the training_fold
tr <- rpart(match ~ ., data = training_fold, cp=0.0001, minbucket = 5, maxdepth = 7)
pfit<- prune(tr, cp = tr$cptable[which.min(tr$cptable[,"xerror"]),"CP"])
# next step in the loop, we calculate the predictions and cm and we equate the accuracy
# note we are training on training_fold and testing its accuracy on the test_fold
y_pred = predict(pfit, test_fold, type = "class")
cm = table(test_fold[, 1], y_pred)
accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])
return(accuracy)
})
cv
## $Fold01
## [1] 0.5
##
## $Fold02
## [1] 0.6666667
##
## $Fold03
## [1] 0.25
##
## $Fold04
## [1] 0.25
##
## $Fold05
## [1] 0.4285714
##
## $Fold06
## [1] 0.5
##
## $Fold07
## [1] 0.25
##
## $Fold08
## [1] 0.5
##
## $Fold09
## [1] 0.5
##
## $Fold10
## [1] 0.5
which.max(cv)
## Fold02
## 2
f<- folds[[which.max(cv)]]
##############################################Decision Tree
# Re-structure the dataset since decision tree can handle NA values!
reduced <- dating[,c(38:40, 45:48, 50, 35, 43, 13)]
reduced[] <- lapply(reduced, function(x) gsub("\\,", "", x))
reduced$match <- as.factor(reduced$match)
reduced[which(reduced$mn_sat == ""), 1] <- NA
reduced[which(reduced$tuition == ""), 2] <- NA
reduced[which(reduced$income == ""), 4] <- NA
reduced[which(reduced$field == ""), 9] <- NA
reduced[which(reduced$from == ""), 10] <- NA
# Split Dataset!
#set.seed value as 1
set.seed(1)
# randomly select 60 percent of the row numbers finished_red in order to split.
train.index <- sample(row.names(finished_red), 0.6*dim(finished_red)[1])
# select the rest of row numbers of finished_red by using setdiff()
valid.index <- setdiff(row.names(finished_red), train.index)
# take the 60 percent of random row numbers and call it train.df
train.df <- finished_red[train.index, ]
# take the rest and call it valid.df
valid.df <- finished_red[valid.index, ]
valid.df$match <-as.factor(valid.df$match)
train.df$match <-as.factor(train.df$match)
#fit a classification tree model (Competitive. is the outcome variable, and all other variables are predictors).
tr <- rpart(match ~ ., data = train.df[-f,], cp=0.0001, minbucket = 5, maxdepth = 7)
rpart.plot(tr, extra = 106)

#print cp of the newly fitted model.
options(scipen = 10)
printcp(tr)
##
## Classification tree:
## rpart(formula = match ~ ., data = train.df[-f, ], cp = 0.0001,
## minbucket = 5, maxdepth = 7)
##
## Variables actually used in tree construction:
## [1] date goal income mn_sat
##
## Root node error: 157/1036 = 0.15154
##
## n= 1036
##
## CP nsplit rel error xerror xstd
## 1 0.0191083 0 1.00000 1.00000 0.073513
## 2 0.0031847 2 0.96178 0.96178 0.072340
## 3 0.0021231 4 0.95541 1.04459 0.074834
## 4 0.0001000 7 0.94904 1.04459 0.074834
# determine a nested sequence of subtrees on the rpart object "tr" to be trimmed based on the complexity parameter that has the lowest standard error.
pfit<- prune(tr, cp = tr$cptable[which.min(tr$cptable[,"xerror"]),"CP"])
# print out the lowest standard error (xerror)
tr$cptable[which.min(tr$cptable[,"xerror"]),"CP"]
## [1] 0.003184713
#Q6: Based on the above calculation, which number of splits indicates the minimum complexity parameters?
prp(tr)

t(t(names(finished_red)))
## [,1]
## [1,] "mn_sat"
## [2,] "tuition"
## [3,] "race"
## [4,] "income"
## [5,] "goal"
## [6,] "date"
## [7,] "go_out"
## [8,] "career_c"
## [9,] "match"
#which are the most important variables
t(t(tr$variable.importance))
## [,1]
## goal 8.1776751
## mn_sat 7.6034723
## date 6.7661649
## income 4.7952378
## tuition 4.4872351
## career_c 3.2041209
## go_out 0.7039521
## race 0.2807651
#set of rules
tr
## n= 1036
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 1036 157 0 (0.8484556 0.1515444)
## 2) mn_sat< 1372.5 709 84 0 (0.8815233 0.1184767)
## 4) date>=3.5 637 69 0 (0.8916797 0.1083203) *
## 5) date< 3.5 72 15 0 (0.7916667 0.2083333)
## 10) mn_sat>=1032 63 10 0 (0.8412698 0.1587302) *
## 11) mn_sat< 1032 9 4 1 (0.4444444 0.5555556) *
## 3) mn_sat>=1372.5 327 73 0 (0.7767584 0.2232416)
## 6) goal< 5.5 305 59 0 (0.8065574 0.1934426)
## 12) date< 5.5 166 24 0 (0.8554217 0.1445783) *
## 13) date>=5.5 139 35 0 (0.7482014 0.2517986)
## 26) income< 72099.5 112 22 0 (0.8035714 0.1964286) *
## 27) income>=72099.5 27 13 0 (0.5185185 0.4814815)
## 54) mn_sat< 1415 10 4 0 (0.6000000 0.4000000) *
## 55) mn_sat>=1415 17 8 1 (0.4705882 0.5294118) *
## 7) goal>=5.5 22 8 1 (0.3636364 0.6363636) *
valid.df$pred <- predict(pfit, valid.df, type = "class")
corr <- which(valid.df$pred == valid.df$match)
base_accuracy <- length(corr)/nrow(valid.df)
base_accuracy
## [1] 0.8359375