#Question 1
#Describe a situation or problem from your job, everyday life, current events, etc., for which a
#classification model would be appropriate. List some (up to 5) predictors that you might use.
#ANSWER
#Classifying new customer signups into distinct groups (e.g., likely to
#convert to a paying customer vs. not) based on their behavior on the website.
#Some predictors could be:
# 1) number of pages visited during the first session
# 2) total time spent on the site in the first week
# 3) referral source (organic search, paid ad, social media, etc.)
# 4) device type used to sign up (mobile vs. desktop)
# 5) number of return visits within the first week
#######################################################
#Question 2-a
#The files credit_card_data.txt (without headers) and credit_card_data-headers.txt (with headers)
#contain a dataset with 654 data points, 6 continuous and 4 binary predictor variables. It has
#anonymized credit card applications with a binary response variable (last column) indicating if the
#application was positive or negative. The dataset is the “Credit Approval Data Set” from the UCI Machine
#Learning Repository (https://archive.ics.uci.edu/ml/datasets/Credit+Approval) without the categorical
#variables and without data points that have missing values.
#import libraries
library(data.table)
library(dplyr)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#kernlab package
#https://cran.r-project.org/web/packages/kernlab/kernlab.pdf
if (!require("kernlab")) {
install.packages("kernlab", dependencies = TRUE)
library(kernlab)
}
## Loading required package: kernlab
#install R's kknn (Weighted K-nearest neighbors package)
if (!require("kknn")) {
install.packages("kknn", dependencies = TRUE)
library(kknn)
}
## Loading required package: kknn
#import data
noHeadersData <- fread('https://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data')
#inspect data
head(noHeadersData)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16
## 1: b 30.83 0.000 u g w v 1.25 t t 1 f g 00202 0 +
## 2: a 58.67 4.460 u g q h 3.04 t t 6 f g 00043 560 +
## 3: a 24.50 0.500 u g q h 1.50 t f 0 f g 00280 824 +
## 4: b 27.83 1.540 u g w v 3.75 t t 5 t g 00100 3 +
## 5: b 20.17 5.625 u g w v 1.71 t f 0 f s 00120 0 +
## 6: b 32.08 4.000 u g m v 2.50 t f 0 t g 00360 0 +
#import data that has headers
creditcardData <- fread('https://d37djvu3ytnwxt.cloudfront.net/assets/courseware/v1/e39a3df780dacd5503df6a8322d72cd2/asset-v1:GTx+ISYE6501x+2T2017+type@asset+block/credit_card_data-headers.txt')
#inspect the data
head(creditcardData)
## A1 A2 A3 A8 A9 A10 A11 A12 A14 A15 R1
## 1: 1 30.83 0.000 1.25 1 0 1 1 202 0 1
## 2: 0 58.67 4.460 3.04 1 0 6 1 43 560 1
## 3: 0 24.50 0.500 1.50 1 1 0 1 280 824 1
## 4: 1 27.83 1.540 3.75 1 0 5 0 100 3 1
## 5: 1 20.17 5.625 1.71 1 1 0 1 120 0 1
## 6: 1 32.08 4.000 2.50 1 1 0 0 360 0 1
#check data structure
glimpse(creditcardData)
## Observations: 654
## Variables: 11
## $ A1 <int> 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0...
## $ A2 <dbl> 30.83, 58.67, 24.50, 27.83, 20.17, 32.08, 33.17, 22.92, 54...
## $ A3 <dbl> 0.000, 4.460, 0.500, 1.540, 5.625, 4.000, 1.040, 11.585, 0...
## $ A8 <dbl> 1.250, 3.040, 1.500, 3.750, 1.710, 2.500, 6.500, 0.040, 3....
## $ A9 <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1...
## $ A10 <int> 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0...
## $ A11 <int> 1, 6, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 10, 3, 10, 0,...
## $ A12 <int> 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1...
## $ A14 <int> 202, 43, 280, 100, 120, 360, 164, 80, 180, 52, 128, 260, 0...
## $ A15 <int> 0, 560, 824, 3, 0, 0, 31285, 1349, 314, 1442, 0, 200, 0, 2...
## $ R1 <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
#transform data table into a data matrix
creditcardDataMatrix <- data.matrix(creditcardData)
#KMEANS SAMPLE################################
#scale the 10 predictor columns only (exclude the response R1; note that
#creditcardData[-1] on a data.table would drop the first ROW, not a column)
myClusterDataStandardized <- scale(creditcardData[, 1:10])
#transform the scaled data into a data matrix
myClusterDataStandardizedMatrix <- data.matrix(myClusterDataStandardized)
# Run kmeans on the standardized data (seed set for reproducibility)
set.seed(1)
ourGroups <- kmeans(myClusterDataStandardizedMatrix, centers = 3)
# Load in our cluster library
library(cluster)
# Visualize our clusters
clusplot(myClusterDataStandardizedMatrix, ourGroups$cluster)

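# As a quick follow-up (a sketch, assuming ourGroups and creditcardData from
# above), cross-tabulating the cluster assignments against the response R1
# shows whether the clusters line up with positive/negative applications:
table(cluster = ourGroups$cluster, R1 = creditcardData$R1)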
#########################################################
model <- ksvm(creditcardDataMatrix[, 1:10], creditcardDataMatrix[, 11],
              type = "C-svc", kernel = "vanilladot", C = 100, scaled = TRUE)
## Setting default kernel parameters
# calculate a1…am
a <- colSums(creditcardDataMatrix[model@SVindex,1:10] * model@coef[[1]])
a
## A1 A2 A3 A8 A9
## -4.660362e-04 -1.405350e-02 -8.168866e-03 1.012922e-02 5.016095e-01
## A10 A11 A12 A14 A15
## -1.403434e-03 1.291217e-03 -2.668989e-04 -2.067550e-01 5.583356e+02
# calculate a0: in kernlab's formulation the intercept is the negative of model@b
a0 <- -model@b
a0
# see what the model predicts
pred <- predict(model,creditcardDataMatrix[,1:10])
pred
## [1] 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1
## [71] 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [106] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [141] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1
## [176] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [211] 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [246] 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## [281] 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0
## [316] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [351] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [386] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [421] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [456] 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [491] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [526] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1
## [561] 1 1 1 1 0 1 1 1 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## [596] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## [631] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# see what fraction of the model’s predictions match the
#actual classification
sum(pred == creditcardDataMatrix[,11]) / nrow(creditcardDataMatrix)
## [1] 0.8639144
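# As a follow-up (a sketch using the pred vector from above), a confusion
# matrix shows where the misclassifications fall:
table(predicted = pred, actual = creditcardDataMatrix[, 11])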
##############################################################################
#Question 2-b: Using the k-nearest-neighbors classification function kknn contained in the R kknn package,
#suggest a good value of k, and show how well it classifies the data points in the full data set.
#Don't forget to scale the data (scale=TRUE in kknn).
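# One way to address the question directly, before the exploratory k-means
# runs below (a sketch, assuming the kknn formula interface
# kknn(formula, train, test, k, scale)): leave-one-out evaluation of
# candidate values of k over the full data set.
looAccuracy <- function(k) {
  correct <- 0
  for (i in 1:nrow(creditcardData)) {
    # Fit on all rows except i and predict row i; scale = TRUE standardizes
    # the predictors inside kknn
    knnModel <- kknn(R1 ~ ., creditcardData[-i, ], creditcardData[i, ],
                     k = k, scale = TRUE)
    # Round the continuous fitted value to 0/1 and compare with the truth
    correct <- correct + (round(fitted(knnModel)) == creditcardData[i, ]$R1)
  }
  correct / nrow(creditcardData)
}
# Accuracy for k = 1..20; which.max() points to a good candidate k
kknnAccuracies <- sapply(1:20, looAccuracy)
which.max(kknnAccuracies)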
#Using base R: exploratory k-means runs on the same data
# Set up a 2 x 3 plotting grid
par(mfrow = c(2, 3))
# Set seed for reproducibility
set.seed(1)
for (i in 1:6) {
  # Run kmeans() on the dataset with three clusters and one random start
  km.out <- kmeans(creditcardDataMatrix, centers = 3, nstart = 1)
  # Plot clusters, titled with the total within-cluster sum of squares
  plot(creditcardDataMatrix, col = km.out$cluster,
       main = km.out$tot.withinss,
       xlab = "", ylab = "")
}

# Initialize total within-cluster sum of squares: wss
wss <- numeric(15)
# Try 1 to 15 cluster centers
for (i in 1:15) {
  km.out <- kmeans(creditcardDataMatrix, centers = i, nstart = 20)
  # Save the total within-cluster sum of squares
  wss[i] <- km.out$tot.withinss
}
# Plot total within-cluster sum of squares vs. number of clusters (elbow plot)
plot(1:15, wss, type = "b",
     xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")
k <- 4 # the elbow in the plot above suggests roughly 4 clusters
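# Sketch only: re-run k-means with the chosen k and inspect the cluster
# sizes (assumes creditcardDataMatrix from above)
set.seed(1)
kmFinal <- kmeans(creditcardDataMatrix, centers = k, nstart = 20)
table(kmFinal$cluster)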
#####################################################################################################
#Question 3
#Using the same data set as in Question 2, use the ksvm or kknn function to find a good classifier:
#(a) using cross-validation for the k-nearest-neighbors model; and
#(b) splitting the data into training, validation, and test data sets.
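# A sketch of both parts (assuming kknn's train.kknn, which performs
# leave-one-out cross-validation over k = 1..kmax, and base R sample() for
# the split; the 60/20/20 proportions are an assumption):
set.seed(1)
# (a) cross-validate over k with train.kknn (response treated as numeric,
# as in the leave-one-out loop above)
cvModel <- train.kknn(R1 ~ ., data = creditcardData, kmax = 30, scale = TRUE)
cvModel$best.parameters
# (b) random 60/20/20 train/validation/test split
n <- nrow(creditcardData)
idx <- sample(n)
trainSet <- creditcardData[idx[1:round(0.6 * n)], ]
validSet <- creditcardData[idx[(round(0.6 * n) + 1):round(0.8 * n)], ]
testSet  <- creditcardData[idx[(round(0.8 * n) + 1):n], ]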
