#Question 1
#Describe a situation or problem from your job, everyday life, current events, etc., for which a
#classification model would be appropriate. List some (up to 5) predictors that you might use.
#ANSWER
#Classifying new customer signups into distinct groups (e.g., likely to
#convert to a paying customer vs. not) based on their behavior on the website.
#Some predictors could be:
# 1) number of pages visited during the first session
# 2) total time spent on the site in the first week
# 3) referral source (organic search, paid ad, social media, etc.)
# 4) device type used to sign up (mobile vs. desktop)
# 5) number of return visits within the first week
#######################################################
#Question 2-a
#The files credit_card_data.txt (without headers) and credit_card_data-headers.txt (with headers)
#contain a dataset with 654 data points, 6 continuous and 4 binary predictor variables. It has
#anonymized credit card applications with a binary response variable (last column) indicating if the
#application was positive or negative. The dataset is the “Credit Approval Data Set” from the UCI Machine
#Learning Repository (https://archive.ics.uci.edu/ml/datasets/Credit+Approval) without the categorical
#variables and without data points that have missing values.
#import libraries
library(data.table)
library(dplyr)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#kernlab package
#https://cran.r-project.org/web/packages/kernlab/kernlab.pdf
if (!require("kernlab")) {
install.packages("kernlab", dependencies = TRUE)
library(kernlab)
}
## Loading required package: kernlab
#install R's kknn (Weighted K-nearest neighbors package)
if (!require("kknn")) {
install.packages("kknn", dependencies = TRUE)
library(kknn)
}
## Loading required package: kknn
#import data
noHeadersData <- fread('https://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data')
#inspect data
head(noHeadersData)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16
## 1: b 30.83 0.000 u g w v 1.25 t t 1 f g 00202 0 +
## 2: a 58.67 4.460 u g q h 3.04 t t 6 f g 00043 560 +
## 3: a 24.50 0.500 u g q h 1.50 t f 0 f g 00280 824 +
## 4: b 27.83 1.540 u g w v 3.75 t t 5 t g 00100 3 +
## 5: b 20.17 5.625 u g w v 1.71 t f 0 f s 00120 0 +
## 6: b 32.08 4.000 u g m v 2.50 t f 0 t g 00360 0 +
#import data that has headers
creditcardData <- fread('https://d37djvu3ytnwxt.cloudfront.net/assets/courseware/v1/e39a3df780dacd5503df6a8322d72cd2/asset-v1:GTx+ISYE6501x+2T2017+type@asset+block/credit_card_data-headers.txt')
#inspect the data
head(creditcardData)
## A1 A2 A3 A8 A9 A10 A11 A12 A14 A15 R1
## 1: 1 30.83 0.000 1.25 1 0 1 1 202 0 1
## 2: 0 58.67 4.460 3.04 1 0 6 1 43 560 1
## 3: 0 24.50 0.500 1.50 1 1 0 1 280 824 1
## 4: 1 27.83 1.540 3.75 1 0 5 0 100 3 1
## 5: 1 20.17 5.625 1.71 1 1 0 1 120 0 1
## 6: 1 32.08 4.000 2.50 1 1 0 0 360 0 1
#check data structure
glimpse(creditcardData)
## Observations: 654
## Variables: 11
## $ A1 <int> 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0...
## $ A2 <dbl> 30.83, 58.67, 24.50, 27.83, 20.17, 32.08, 33.17, 22.92, 54...
## $ A3 <dbl> 0.000, 4.460, 0.500, 1.540, 5.625, 4.000, 1.040, 11.585, 0...
## $ A8 <dbl> 1.250, 3.040, 1.500, 3.750, 1.710, 2.500, 6.500, 0.040, 3....
## $ A9 <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1...
## $ A10 <int> 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0...
## $ A11 <int> 1, 6, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 10, 3, 10, 0,...
## $ A12 <int> 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1...
## $ A14 <int> 202, 43, 280, 100, 120, 360, 164, 80, 180, 52, 128, 260, 0...
## $ A15 <int> 0, 560, 824, 3, 0, 0, 31285, 1349, 314, 1442, 0, 200, 0, 2...
## $ R1 <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
#transform data table into a data matrix
creditcardDataMatrix <- data.matrix(creditcardData)
#KMEANS SAMPLE################################
#scale the 10 predictor columns only (exclude the response R1; note that
#creditcardData[-1] on a data.table would drop the first ROW, not a column)
myClusterDataStandardized <- scale(creditcardData[, 1:10])
#transform the scaled data into a data matrix
myClusterDataStandardizedMatrix <- data.matrix(myClusterDataStandardized)
# Run kmeans on the standardized data (seed set for reproducibility)
set.seed(1)
ourGroups <- kmeans(myClusterDataStandardizedMatrix, centers = 3)
# Load in our cluster library
library(cluster)
# Visualize our clusters
clusplot(myClusterDataStandardizedMatrix, ourGroups$cluster)

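# As a quick follow-up (a sketch, assuming ourGroups and creditcardData from
# above), cross-tabulating the cluster assignments against the response R1
# shows whether the clusters line up with positive/negative applications:
table(cluster = ourGroups$cluster, R1 = creditcardData$R1)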
#########################################################
model <- ksvm(creditcardDataMatrix[, 1:10], creditcardDataMatrix[, 11],
              type = "C-svc", kernel = "vanilladot", C = 100, scaled = TRUE)
## Setting default kernel parameters
# calculate a1…am
a <- colSums(creditcardDataMatrix[model@SVindex,1:10] * model@coef[[1]])
a
## A1 A2 A3 A8 A9
## -4.660362e-04 -1.405350e-02 -8.168866e-03 1.012922e-02 5.016095e-01
## A10 A11 A12 A14 A15
## -1.403434e-03 1.291217e-03 -2.668989e-04 -2.067550e-01 5.583356e+02
# calculate a0: in kernlab's formulation the intercept is the negative of model@b
a0 <- -model@b
a0
# see what the model predicts
pred <- predict(model,creditcardDataMatrix[,1:10])
pred
## [1] 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1
## [71] 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [106] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [141] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1
## [176] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [211] 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [246] 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## [281] 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0
## [316] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [351] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [386] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [421] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [456] 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [491] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [526] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1
## [561] 1 1 1 1 0 1 1 1 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## [596] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## [631] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# see what fraction of the model’s predictions match the
#actual classification
sum(pred == creditcardDataMatrix[,11]) / nrow(creditcardDataMatrix)
## [1] 0.8639144
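# As a follow-up (a sketch using the pred vector from above), a confusion
# matrix shows where the misclassifications fall:
table(predicted = pred, actual = creditcardDataMatrix[, 11])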
##############################################################################
#Question 2-b: Using the k-nearest-neighbors classification function kknn contained in the R kknn package,
#suggest a good value of k, and show how well it classifies the data points in the full data set.
#Don't forget to scale the data (scale=TRUE in kknn).
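# One way to address the question directly, before the exploratory k-means
# runs below (a sketch, assuming the kknn formula interface
# kknn(formula, train, test, k, scale)): leave-one-out evaluation of
# candidate values of k over the full data set.
looAccuracy <- function(k) {
  correct <- 0
  for (i in 1:nrow(creditcardData)) {
    # Fit on all rows except i and predict row i; scale = TRUE standardizes
    # the predictors inside kknn
    knnModel <- kknn(R1 ~ ., creditcardData[-i, ], creditcardData[i, ],
                     k = k, scale = TRUE)
    # Round the continuous fitted value to 0/1 and compare with the truth
    correct <- correct + (round(fitted(knnModel)) == creditcardData[i, ]$R1)
  }
  correct / nrow(creditcardData)
}
# Accuracy for k = 1..20; which.max() points to a good candidate k
kknnAccuracies <- sapply(1:20, looAccuracy)
which.max(kknnAccuracies)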
#Using base R: exploratory k-means runs on the same data
# Set up a 2 x 3 plotting grid
par(mfrow = c(2, 3))
# Set seed for reproducibility
set.seed(1)
for (i in 1:6) {
  # Run kmeans() on the dataset with three clusters and one random start
  km.out <- kmeans(creditcardDataMatrix, centers = 3, nstart = 1)
  # Plot clusters, titled with the total within-cluster sum of squares
  plot(creditcardDataMatrix, col = km.out$cluster,
       main = km.out$tot.withinss,
       xlab = "", ylab = "")
}

# Initialize total within-cluster sum of squares: wss
wss <- numeric(15)
# Try 1 to 15 cluster centers
for (i in 1:15) {
  km.out <- kmeans(creditcardDataMatrix, centers = i, nstart = 20)
  # Save the total within-cluster sum of squares
  wss[i] <- km.out$tot.withinss
}
# Plot total within-cluster sum of squares vs. number of clusters (elbow plot)
plot(1:15, wss, type = "b",
     xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")
k <- 4 # the elbow in the plot above suggests roughly 4 clusters
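# Sketch only: re-run k-means with the chosen k and inspect the cluster
# sizes (assumes creditcardDataMatrix from above)
set.seed(1)
kmFinal <- kmeans(creditcardDataMatrix, centers = k, nstart = 20)
table(kmFinal$cluster)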
#####################################################################################################
#Question 3
#Using the same data set as in Question 2, use the ksvm or kknn function to find a good classifier:
#(a) using cross-validation for the k-nearest-neighbors model; and
#(b) splitting the data into training, validation, and test data sets.
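# A sketch of both parts (assuming kknn's train.kknn, which performs
# leave-one-out cross-validation over k = 1..kmax, and base R sample() for
# the split; the 60/20/20 proportions are an assumption):
set.seed(1)
# (a) cross-validate over k with train.kknn (response treated as numeric,
# as in the leave-one-out loop above)
cvModel <- train.kknn(R1 ~ ., data = creditcardData, kmax = 30, scale = TRUE)
cvModel$best.parameters
# (b) random 60/20/20 train/validation/test split
n <- nrow(creditcardData)
idx <- sample(n)
trainSet <- creditcardData[idx[1:round(0.6 * n)], ]
validSet <- creditcardData[idx[(round(0.6 * n) + 1):round(0.8 * n)], ]
testSet  <- creditcardData[idx[(round(0.8 * n) + 1):n], ]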
