proj-1-match-from-demo-.R

library(reshape2)
library(mice)

## 
## Attaching package: 'mice'

## The following object is masked from 'package:stats':
## 
##     filter

## The following objects are masked from 'package:base':
## 
##     cbind, rbind

library(caret)

## Warning: package 'caret' was built under R version 4.2.3

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 4.2.3

## Loading required package: lattice

library(ggplot2)
library(dplyr)

## Warning: package 'dplyr' was built under R version 4.2.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(class)
# svm
library(e1071)
# lm
library(leaps)

## Warning: package 'leaps' was built under R version 4.2.3

library(forecast) #for accuracy measures

## Warning: package 'forecast' was built under R version 4.2.3

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

#rf
library(randomForest)

## Warning: package 'randomForest' was built under R version 4.2.3

## randomForest 4.7-1.1

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

# decision tree
library(rpart)
library(rpart.plot)

## Warning: package 'rpart.plot' was built under R version 4.2.3

library(randomForest)
library(xgboost)

## Warning: package 'xgboost' was built under R version 4.2.3

## 
## Attaching package: 'xgboost'

## The following object is masked from 'package:dplyr':
## 
##     slice

library(Matrix)
library(gains)

dating <- read.csv("Speed Dating Data.csv")

ggplot(dating, aes(x = as.factor(race))) +
  geom_bar() +
  theme_bw() +
  scale_x_discrete(labels=c("African American", "Caucasian","Hispanic","Pacific Islander", "Native American", "Other"))

ggplot(dating, aes(x = as.factor(career_c))) +
  geom_bar() +
  theme_bw()

reduced <- dating[,c(38:40, 45:48, 50, 13)]
reduced[which(reduced$mn_sat == ""), 1] <- NA
reduced[which(reduced$tuition == ""), 2] <- NA
reduced[which(reduced$income == ""), 4] <- NA

unique(reduced$mn_sat)

##  [1] NA         "1,070.00" "1,258.00" "1,400.00" "1,290.00" "1,460.00"
##  [7] "1,430.00" "1,215.00" "1,330.00" "1,450.00" "1,155.00" "1,140.00"
## [13] "1,360.00" "1,402.00" "1,250.00" "1,210.00" "1,220.00" "1,410.00"
## [19] "1,260.00" "1,380.00" "1,030.00" "1,309.00" "1,308.00" "1,050.00"
## [25] "1,100.00" "1,310.00" "1,490.00" "1,188.00" "1,097.00" "1,212.00"
## [31] "1,340.00" "1,034.00" "1,185.00" "1,242.00" "1,160.00" "1,099.00"
## [37] "1,214.00" "1,270.00" "1,110.00" "1,178.00" "1,060.00" "1,157.00"
## [43] "1,180.00" "1,014.00" "1,341.00" "990.00"   "1,320.00" "1,159.00"
## [49] "1,370.00" "1,105.00" "1,365.00" "1,011.00" "1,130.00" "1,206.00"
## [55] "1,331.00" "1,191.00" "914.00"   "1,200.00" "1,080.00" "1,090.00"
## [61] "1,092.00" "1,470.00" "1,149.00" "1,134.00" "1,230.00" "1,267.00"
## [67] "1,280.00" "1,227.00" "1,239.00"

unique(reduced$tuition)

##   [1] NA          "12,696.00" "25,020.00" "26,630.00" "15,309.00" "26,376.00"
##   [7] "26,908.00" "9,451.00"  "24,036.00" "26,100.00" "13,258.00" "9,168.00" 
##  [13] "26,062.00" "27,240.00" "13,413.00" "12,900.00" "26,019.00" "21,960.00"
##  [19] "17,125.00" "24,742.00" "27,395.00" "25,917.00" "34,290.00" "27,100.00"
##  [25] "27,076.00" "17,030.00" "15,162.00" "25,555.00" "14,433.00" "9,696.00" 
##  [31] "13,211.00" "11,360.00" "33,585.00" "25,552.00" "23,187.00" "21,645.00"
##  [37] "27,728.00" "11,311.00" "10,352.00" "26,377.00" "9,210.00"  "25,380.00"
##  [43] "14,589.00" "14,695.00" "17,808.00" "9,620.00"  "27,230.00" "14,388.00"
##  [49] "10,704.00" "25,335.00" "22,170.00" "10,356.00" "14,942.00" "3,730.00" 
##  [55] "26,786.00" "25,425.00" "10,450.00" "10,052.00" "11,424.00" "10,222.00"
##  [61] "15,260.00" "18,720.00" "11,132.00" "17,715.00" "23,500.00" "14,380.00"
##  [67] "24,921.00" "25,504.00" "26,871.00" "10,800.00" "25,026.00" "8,421.00" 
##  [73] "10,096.00" "25,533.00" "25,839.00" "25,890.00" "14,493.00" "27,025.00"
##  [79] "15,990.00" "9,729.00"  "26,170.00" "13,046.00" "20,100.00" "22,481.00"
##  [85] "12,350.00" "25,000.00" "24,497.00" "9,790.00"  "14,600.00" "26,720.00"
##  [91] "14,915.00" "26,775.00" "17,478.00" "8,100.00"  "26,580.00" "19,525.00"
##  [97] "11,570.00" "26,892.00" "34,300.00" "27,982.00" "24,794.00" "2,406.00" 
## [103] "15,004.00" "12,444.00" "13,872.00" "23,530.00" "21,168.00" "25,847.00"
## [109] "18,656.00" "10,332.00" "27,350.00" "9,811.00"  "9,162.00"  "16,650.00"
## [115] "20,337.00" "26,562.00"

unique(reduced$income)

##   [1] "69,487.00"  "65,929.00"  NA           "37,754.00"  "86,340.00" 
##   [6] "60,304.00"  "54,620.00"  "48,652.00"  "29,237.00"  "56,580.00" 
##  [11] "36,782.00"  "38,548.00"  "52,010.00"  "28,418.00"  "43,185.00" 
##  [16] "23,152.00"  "43,664.00"  "48,441.00"  "61,152.00"  "36,485.00" 
##  [21] "41,507.00"  "17,134.00"  "30,038.00"  "33,772.00"  "24,997.00" 
##  [26] "42,096.00"  "28,891.00"  "62,635.00"  "12,063.00"  "29,809.00" 
##  [31] "26,482.00"  "30,147.00"  "39,919.00"  "41,466.00"  "23,988.00" 
##  [36] "28,989.00"  "50,948.00"  "38,022.00"  "47,559.00"  "53,539.00" 
##  [41] "32,159.00"  "53,940.00"  "40,753.00"  "38,207.00"  "46,166.00" 
##  [46] "30,973.00"  "28,317.00"  "26,645.00"  "25,589.00"  "55,223.00" 
##  [51] "109,031.00" "40,409.00"  "21,597.00"  "76,624.00"  "35,968.00" 
##  [56] "51,725.00"  "55,419.00"  "55,550.00"  "26,682.00"  "41,547.00" 
##  [61] "23,361.00"  "74,893.00"  "52,804.00"  "53,923.00"  "27,094.00" 
##  [66] "57,213.00"  "42,390.00"  "43,636.00"  "57,887.00"  "30,768.00" 
##  [71] "66,699.00"  "45,360.00"  "55,080.00"  "17,378.00"  "40,375.00" 
##  [76] "48,929.00"  "78,193.00"  "63,351.00"  "50,745.00"  "29,279.00" 
##  [81] "38,774.00"  "58,802.00"  "41,831.00"  "52,186.00"  "97,857.00" 
##  [86] "74,624.00"  "21,590.00"  "38,832.00"  "37,248.00"  "28,240.00" 
##  [91] "53,771.00"  "56,096.00"  "31,560.00"  "52,467.00"  "80,006.00" 
##  [96] "47,572.00"  "22,439.00"  "31,383.00"  "40,749.00"  "47,997.00" 
## [101] "78,704.00"  "31,143.00"  "32,129.00"  "44,195.00"  "46,837.00" 
## [106] "97,972.00"  "35,960.00"  "65,708.00"  "49,466.00"  "53,229.00" 
## [111] "32,649.00"  "35,867.00"  "40,244.00"  "42,640.00"  "52,388.00" 
## [116] "62,875.00"  "30,855.00"  "46,800.00"  "45,695.00"  "46,792.00" 
## [121] "53,501.00"  "64,716.00"  "27,248.00"  "22,805.00"  "56,118.00" 
## [126] "30,146.00"  "39,123.00"  "46,153.00"  "45,300.00"  "42,397.00" 
## [131] "44,346.00"  "42,225.00"  "37,405.00"  "28,524.00"  "61,141.00" 
## [136] "8,607.00"   "41,476.00"  "49,841.00"  "37,240.00"  "36,594.00" 
## [141] "62,997.00"  "46,608.00"  "37,881.00"  "48,944.00"  "77,112.00" 
## [146] "18,283.00"  "31,432.00"  "73,073.00"  "26,706.00"  "50,060.00" 
## [151] "25,401.00"  "80,608.00"  "43,844.00"  "53,196.00"  "25,786.00" 
## [156] "39,394.00"  "40,695.00"  "45,788.00"  "37,315.00"  "51,663.00" 
## [161] "32,563.00"  "54,303.00"  "16,908.00"  "39,729.00"  "57,316.00" 
## [166] "30,587.00"  "57,513.00"  "31,857.00"  "23,207.00"  "25,831.00" 
## [171] "28,759.00"  "19,264.00"  "41,778.00"  "35,963.00"  "49,409.00" 
## [176] "31,516.00"  "36,223.00"  "43,367.00"  "27,503.00"  "35,187.00" 
## [181] "26,298.00"  "31,148.00"  "55,704.00"  "46,138.00"  "66,827.00" 
## [186] "42,897.00"  "31,809.00"  "75,347.00"  "47,005.00"  "52,805.00" 
## [191] "50,725.00"  "65,693.00"  "45,736.00"  "33,906.00"  "50,501.00" 
## [196] "48,785.00"  "52,318.00"  "62,844.00"  "52,586.00"  "29,236.00" 
## [201] "31,486.00"  "31,632.00"  "106,663.00" "84,043.00"  "35,224.00" 
## [206] "36,381.00"  "65,498.00"  "60,000.00"  "22,669.00"  "81,266.00" 
## [211] "29,746.00"  "47,556.00"  "42,651.00"  "27,794.00"  "41,737.00" 
## [216] "90,225.00"  "52,280.00"  "56,056.00"  "60,835.00"  "62,829.00" 
## [221] "16,767.00"  "42,967.00"  "21,488.00"  "89,977.00"  "18,619.00" 
## [226] "22,161.00"  "82,734.00"  "40,163.00"  "46,185.00"  "78,844.00" 
## [231] "29,575.00"  "34,752.00"  "22,173.00"  "37,994.00"  "35,409.00" 
## [236] "23,707.00"  "57,501.00"  "25,314.00"  "48,876.00"  "34,870.00" 
## [241] "35,848.00"  "45,017.00"  "12,416.00"  "87,789.00"  "50,572.00" 
## [246] "49,642.00"  "20,000.00"  "32,508.00"  "35,627.00"  "46,280.00" 
## [251] "41,191.00"  "71,787.00"  "72,412.00"  "36,510.00"  "32,386.00" 
## [256] "15,863.00"  "46,272.00"  "48,137.00"  "61,686.00"  "47,624.00" 
## [261] "36,673.00"  "55,138.00"

unique(reduced$goal)

## [1]  2  1  6  3  4 NA  5

unique(reduced$date)

## [1]  7  5  3  4  6  1 NA  2

unique(reduced$go_out)

## [1]  1  4  2 NA  3  7  5  6

unique(reduced$career_c)

##  [1] NA  1  6  9  2  7 10  5  3  4 14 11  8 15 12 17 13 16

# Convert to numerical values

reduced[] <- lapply(reduced, function(x) gsub("\\,", "", x))


reduced$mn_sat <- as.numeric(reduced$mn_sat)
reduced$tuition <- as.numeric(reduced$tuition)
reduced$income <- as.numeric(reduced$income)

# Start the imputing process
finished_red <- reduced[complete.cases(reduced),]

which(is.na(finished_red))

## integer(0)

finished_red[] <- lapply(finished_red, as.numeric)
finished_red$match <- as.factor(finished_red$match)

######################################## Start building a knn model


# Split Dataset!

#set.seed value as 1
set.seed(1)

# randomly select 60 percent of the row numbers finished_red in order to split.
train.index <- sample(row.names(finished_red), 0.6*dim(finished_red)[1])  

# select the rest of row numbers of finished_red by using setdiff()
valid.index <- setdiff(row.names(finished_red), train.index)  

# take the 60 percent of random row numbers and call it train.df
train.df <- finished_red[train.index, ]

# take the rest and call it valid.df
valid.df <- finished_red[valid.index, ]

# make sure our validating outcome variables as factor so they become a categorical variable.

# Normalize
train.norm.df <- train.df
valid.norm.df <- valid.df
norm.df <- finished_red

norm.values <- preProcess(train.df[,-9], method=c("center", "scale"))

train.norm.df[,-9] <- predict(norm.values, train.df[,-9])
valid.norm.df[,-9] <- predict(norm.values, valid.df[,-9])
norm.df[,-9] <- predict(norm.values, finished_red[,-9])




############################################ knn
# use knn with k = 3 which means "number of neighbours considered" = 3 instead of 2
knn.pred <- knn(train.norm.df[,-9], valid.norm.df[,-9],
                cl = train.norm.df[,9], k = 3)

summary(knn.pred)

##   0   1 
## 715  53

# Accuracy
length(which(knn.pred == valid.df$match))/nrow(valid.df)

## [1] 0.8229167

# initialize a data frame with two columns: k, and accuracy.
accuracy.df <- data.frame(k = seq(1, 10, 1), accuracy = rep(0, 10))


# Use a for loop to do what we did above for each row of accuracy.df
for(i in 1:10) {
  knn.pred <- knn(train.norm.df[,-9], valid.norm.df[,-9], 
                  cl = train.norm.df[, 9], k = i)
  accuracy.df[i, 2] <- confusionMatrix(knn.pred, valid.norm.df[,9])$overall[1] 
}

accuracy.df

##     k  accuracy
## 1   1 0.8216146
## 2   2 0.8229167
## 3   3 0.8229167
## 4   4 0.8229167
## 5   5 0.8242188
## 6   6 0.8229167
## 7   7 0.8281250
## 8   8 0.8307292
## 9   9 0.8307292
## 10 10 0.8359375

# k = 10 is the best (0.8385417)
knn.pred.new <- knn(train.norm.df[, -9], valid.norm.df[, -9],
                    cl = train.norm.df[, 9], k = 10)

summary(knn.pred.new)

##   0   1 
## 735  33

# Accuracy
length(which(knn.pred.new == valid.df$match))/nrow(valid.df)

## [1] 0.8385417

result <- valid.df

result$pred <- knn.pred.new

result$diff <- result$match == knn.pred.new


wrong <- valid.df[which(!result$diff),]
wrong <- wrong[,1:9]

melt_data <- melt(wrong, id = c("match")) 

melt_data$value <- as.numeric(melt_data$value)


ggplot(melt_data, aes(x = match, y = value)) +
  geom_boxplot() +
  facet_wrap(~variable,  scales = "free")

###################################### Ten fold cross validation in caret

train.df$match <- as.factor(train.df$match)
trControl <- trainControl(method  = "cv",
                          number  = 10)
fit <- train(match ~ .,
             method     = "knn",
             tuneGrid   = expand.grid(k = 1:10),
             trControl  = trControl,
             metric     = "Accuracy",
             data       = train.df)


fit

## k-Nearest Neighbors 
## 
## 1151 samples
##    8 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1036, 1035, 1035, 1036, 1037, 1035, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa     
##    1  0.8374909  0.15448235
##    2  0.8461869  0.20228512
##    3  0.8418767  0.16146609
##    4  0.8409694  0.14804248
##    5  0.8410148  0.13695364
##    6  0.8401527  0.08528767
##    7  0.8401377  0.07829305
##    8  0.8436011  0.08017681
##    9  0.8436388  0.06875562
##   10  0.8418919  0.05870907
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 2.

################################### logistic regression

# in creating the folds we specify the target feature (dependent variable) and # of folds
folds = createFolds(train.df$match, k = 10)

# in cv we are going to applying a created function to our 'folds'
cv = lapply(folds, function(x) { # start of function
  # in the next two lines we will separate the Training set into it's 10 pieces
  training_fold = train.df[-x, ] # training fold =  training set minus (-) it's sub test fold
  test_fold = train.df[x, ] # here we describe the test fold individually
  levels(test_fold$match) <- levels(training_fold$match)
  # now apply (train) the classifer on the training_fold
  classifier = svm(formula = match ~ .,
                   data = training_fold,
                   type = 'C-classification',
                   kernel = 'linear')
  # next step in the loop, we calculate the predictions and cm and we equate the accuracy
  # note we are training on training_fold and testing its accuracy on the test_fold
  y_pred = predict(classifier, newdata = test_fold)
  cm = table(test_fold[, 9], y_pred)
  accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])
  return(accuracy)
})

cv

## $Fold01
## [1] 0.8508772
## 
## $Fold02
## [1] 0.8521739
## 
## $Fold03
## [1] 0.8521739
## 
## $Fold04
## [1] 0.8521739
## 
## $Fold05
## [1] 0.8448276
## 
## $Fold06
## [1] 0.8448276
## 
## $Fold07
## [1] 0.8521739
## 
## $Fold08
## [1] 0.8434783
## 
## $Fold09
## [1] 0.8434783
## 
## $Fold10
## [1] 0.8521739

which.max(cv)

## Fold02 
##      2

f<- folds[[which.max(cv)]]



##########Now lets build a model!!
set.seed(1)
# since fold 1 is most accurate we shall use folds$Fold01 to create our train and test dataset!
training_fold = train.df[-f, ]
test_fold = valid.df # here we describe the test fold individually
# now apply (train) the classifer on the training_fold
classifier = svm(formula = match ~ .,
                 data = train.df,
                 type = 'C-classification',
                 kernel = 'radial')

summary(classifier)

## 
## Call:
## svm(formula = match ~ ., data = train.df, type = "C-classification", 
##     kernel = "radial")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  402
## 
##  ( 228 174 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1

# next step in the loop, we calculate the predictions and cm and we equate the accuracy
# note we are training on training_fold and testing its accuracy on the test_fold
y_pred = predict(classifier, newdata = test_fold[,-9])
cm = table(test_fold[, 9], y_pred)
accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])


accuracy

## [1] 0.8359375

################################### Random Forest

# in creating the folds we specify the target feature (dependent variable) and # of folds
folds = createFolds(train.df$match, k = 10)

# in cv we are going to applying a created function to our 'folds'
cv = lapply(folds, function(x) { # start of function
  # in the next two lines we will separate the Training set into it's 10 pieces
  training_fold = train.df[-x, ] # training fold =  training set minus (-) it's sub test fold
  test_fold = train.df[x, ] # here we describe the test fold individually
  levels(test_fold$match) <- levels(training_fold$match)
  # now apply (train) the classifer on the training_fold
  classifier = randomForest(match~., data=training_fold, proximity=TRUE)
  # next step in the loop, we calculate the predictions and cm and we equate the accuracy
  # note we are training on training_fold and testing its accuracy on the test_fold
  y_pred = predict(classifier, newdata = test_fold)
  cm = table(test_fold[, 9], y_pred)
  accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])
  return(accuracy)
})

cv

## $Fold01
## [1] 0.8695652
## 
## $Fold02
## [1] 0.8448276
## 
## $Fold03
## [1] 0.8086957
## 
## $Fold04
## [1] 0.8448276
## 
## $Fold05
## [1] 0.8521739
## 
## $Fold06
## [1] 0.8421053
## 
## $Fold07
## [1] 0.8362069
## 
## $Fold08
## [1] 0.8157895
## 
## $Fold09
## [1] 0.826087
## 
## $Fold10
## [1] 0.8347826

which.max(cv)

## Fold01 
##      1

#Random Forrest cannot predict with high accuracy for this dataset





##############################################Decision Tree - 10 fold Cross validation

folds = createFolds(train.df$match, k = 10)


cv = lapply(folds, function(x) { # start of function
  # in the next two lines we will separate the Training set into it's 10 pieces
  training_fold = train.df[-x, ] # training fold = training set minus (-) it's sub test fold
  test_fold = train.df[x, ] # here we describe the test fold individually
  levels(test_fold$match) <- levels(training_fold$match)
  # now apply (train) the classifer on the training_fold
  tr <- rpart(match ~ ., data = training_fold, cp=0.0001, minbucket = 5, maxdepth = 7)
  
  pfit<- prune(tr, cp = tr$cptable[which.min(tr$cptable[,"xerror"]),"CP"])
  # next step in the loop, we calculate the predictions and cm and we equate the accuracy
  # note we are training on training_fold and testing its accuracy on the test_fold
  
  y_pred = predict(pfit, test_fold, type = "class")
  cm = table(test_fold[, 1], y_pred)
  accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])
  return(accuracy)
})


cv

## $Fold01
## [1] 0.5
## 
## $Fold02
## [1] 0.6666667
## 
## $Fold03
## [1] 0.25
## 
## $Fold04
## [1] 0.25
## 
## $Fold05
## [1] 0.4285714
## 
## $Fold06
## [1] 0.5
## 
## $Fold07
## [1] 0.25
## 
## $Fold08
## [1] 0.5
## 
## $Fold09
## [1] 0.5
## 
## $Fold10
## [1] 0.5

which.max(cv)

## Fold02 
##      2

f<- folds[[which.max(cv)]]




##############################################Decision Tree


# Re-structure the dataset since decision tree can handle NA values!

reduced <- dating[,c(38:40, 45:48, 50, 35, 43, 13)]

reduced[] <- lapply(reduced, function(x) gsub("\\,", "", x))

reduced$match <- as.factor(reduced$match)

reduced[which(reduced$mn_sat == ""), 1] <- NA
reduced[which(reduced$tuition == ""), 2] <- NA
reduced[which(reduced$income == ""), 4] <- NA
reduced[which(reduced$field == ""), 9] <- NA
reduced[which(reduced$from == ""), 10] <- NA




# Split Dataset!

#set.seed value as 1
set.seed(1)

# randomly select 60 percent of the row numbers finished_red in order to split.
train.index <- sample(row.names(finished_red), 0.6*dim(finished_red)[1])  

# select the rest of row numbers of finished_red by using setdiff()
valid.index <- setdiff(row.names(finished_red), train.index)  

# take the 60 percent of random row numbers and call it train.df
train.df <- finished_red[train.index, ]

# take the rest and call it valid.df
valid.df <- finished_red[valid.index, ]



valid.df$match <-as.factor(valid.df$match)
train.df$match <-as.factor(train.df$match)


#fit a classification tree model (Competitive. is the outcome variable, and all other variables are predictors). 
tr <- rpart(match ~ ., data = train.df[-f,], cp=0.0001, minbucket = 5, maxdepth = 7)
rpart.plot(tr, extra = 106)

#print cp of the newly fitted model.  
options(scipen = 10)
printcp(tr)

## 
## Classification tree:
## rpart(formula = match ~ ., data = train.df[-f, ], cp = 0.0001, 
##     minbucket = 5, maxdepth = 7)
## 
## Variables actually used in tree construction:
## [1] date   goal   income mn_sat
## 
## Root node error: 157/1036 = 0.15154
## 
## n= 1036 
## 
##          CP nsplit rel error  xerror     xstd
## 1 0.0191083      0   1.00000 1.00000 0.073513
## 2 0.0031847      2   0.96178 0.96178 0.072340
## 3 0.0021231      4   0.95541 1.04459 0.074834
## 4 0.0001000      7   0.94904 1.04459 0.074834

# determine a nested sequence of subtrees on the rpart object "tr" to be trimmed based on the complexity parameter that has the lowest standard error.
pfit<- prune(tr, cp = tr$cptable[which.min(tr$cptable[,"xerror"]),"CP"])

# print out the lowest standard error (xerror)
tr$cptable[which.min(tr$cptable[,"xerror"]),"CP"]

## [1] 0.003184713

#Q6: Based on the above calculation, which number of splits indicates the minimum complexity parameters? 

prp(tr)

t(t(names(finished_red)))

##       [,1]      
##  [1,] "mn_sat"  
##  [2,] "tuition" 
##  [3,] "race"    
##  [4,] "income"  
##  [5,] "goal"    
##  [6,] "date"    
##  [7,] "go_out"  
##  [8,] "career_c"
##  [9,] "match"

#which are the most important variables
t(t(tr$variable.importance))

##               [,1]
## goal     8.1776751
## mn_sat   7.6034723
## date     6.7661649
## income   4.7952378
## tuition  4.4872351
## career_c 3.2041209
## go_out   0.7039521
## race     0.2807651

#set of rules
tr

## n= 1036 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 1036 157 0 (0.8484556 0.1515444)  
##    2) mn_sat< 1372.5 709  84 0 (0.8815233 0.1184767)  
##      4) date>=3.5 637  69 0 (0.8916797 0.1083203) *
##      5) date< 3.5 72  15 0 (0.7916667 0.2083333)  
##       10) mn_sat>=1032 63  10 0 (0.8412698 0.1587302) *
##       11) mn_sat< 1032 9   4 1 (0.4444444 0.5555556) *
##    3) mn_sat>=1372.5 327  73 0 (0.7767584 0.2232416)  
##      6) goal< 5.5 305  59 0 (0.8065574 0.1934426)  
##       12) date< 5.5 166  24 0 (0.8554217 0.1445783) *
##       13) date>=5.5 139  35 0 (0.7482014 0.2517986)  
##         26) income< 72099.5 112  22 0 (0.8035714 0.1964286) *
##         27) income>=72099.5 27  13 0 (0.5185185 0.4814815)  
##           54) mn_sat< 1415 10   4 0 (0.6000000 0.4000000) *
##           55) mn_sat>=1415 17   8 1 (0.4705882 0.5294118) *
##      7) goal>=5.5 22   8 1 (0.3636364 0.6363636) *

valid.df$pred <- predict(pfit, valid.df, type = "class")
corr <- which(valid.df$pred == valid.df$match)

base_accuracy <- length(corr)/nrow(valid.df)

base_accuracy

## [1] 0.8359375

proj-1-match-from-demo-.R

amber

2023-11-22