# Supervised Learning with R - SVM

# Example

## Example 
# ---
# Create an svm model that predicts the presence of heart disease in a patient.
# ---
# Dataset url = http://bit.ly/HeartdiseaseDataset
# ---
# OUR CODE GOES BELOW
# ---
#

# We first install the caret package.
# This package will be very helpful in providing us with
# direct access to various functions for training our model
# with various machine learning algorithms such
# as KNN, SVM, Decision Tree, Linear Regression etc.
# ---
# 
# Install caret only when it is missing, so that re-running the script does not
# download and reinstall the package every time.
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret")
}

# We then load our dataset as shown below.
# The 'sep' argument gives the field separator: a comma, because the data is stored
# as CSV (comma-separated values). header = FALSE means the file has no header row.
# ---
# 

# Download the heart-disease data. The file has no header row
# (header = FALSE), so R names the columns V1, V2, ...
heart <- read.csv(
  file = "http://bit.ly/HeartdiseaseDataset",
  header = FALSE,
  sep = ","
)
head(heart)

# Inspect the structure (column types and sample values) of the data frame
# ---
#
str(heart)

# Preview the first rows of the dataset
# ---
#
head(heart)

# Next we split the data into training set and testing set. 
# NB: The training set will be used for model building while the testing set for model evaluation.
# ---
# - The "y" parameter takes the outcome variable according to which the data is partitioned.
# In our case, the target variable is column V14, so the split is based on heart$V14.
# - The "p" parameter holds a decimal value in the range 0-1: the proportion of rows that goes into training.
# We are using p = 0.7, which means the data is split in a 70:30 ratio:
# 70% of the data is used for training and the remaining 30% for testing the model.
# - The "list" parameter controls whether the result is returned as a list or as a matrix.
# We pass FALSE so that a matrix of row indices is returned instead of a list.
# ---
# 
library("caret")

# Fix the random seed so that the same rows end up in each split on every run.
set.seed(123)

# The outcome is passed as a factor: createDataPartition() samples within each
# class for factor outcomes, whereas a numeric 0/1 vector is binned by
# percentiles, which does not guarantee that both splits keep the class balance.
intrain <- createDataPartition(y = factor(heart$V14), p = 0.7, list = FALSE)
training <- heart[intrain, ]
testing <- heart[-intrain, ]

# We check the dimensions of our training dataframe and testing dataframe
# ---
#
dim(training)
dim(testing)

# We then check whether the dataset contains any missing values.
# anyNA() only reports them (TRUE/FALSE); it does not remove or impute anything.
# ---
#
anyNA(heart)

# Then check the summary of our data by using the summary() function
# ---
#
summary(heart)

# From the summary we can see that the variables are on different scales
# (not standardized); train() centers and scales the predictors further below.
# V14, our target variable, holds only two values, 0 or 1, yet it is stored as
# a number. For classification it must be a categorical variable, i.e. a factor.
# We convert V14 in BOTH the training and the testing data frames, using the
# same levels, so that predictions and true labels can be compared class by class.
# ---
#
outcome_levels <- sort(unique(heart[["V14"]]))
training[["V14"]] <- factor(training[["V14"]], levels = outcome_levels)
testing[["V14"]] <- factor(testing[["V14"]], levels = outcome_levels)

# Before we train our model we need to control the computational details of training,
# i.e. how the model is resampled and validated.
# We do this with the trainControl() function, which builds a control object.
# That object is then passed to caret's train() function through its trControl argument.
# ---
# The trainControl function takes three parameters here:
# a) The "method" parameter defines the resampling method;
# in this demo we use "repeatedcv", i.e. repeated cross-validation.
# b) The "number" parameter holds the number of folds (resampling iterations).
# c) The "repeats" parameter holds the number of complete sets of folds to compute
# for our repeated cross-validation.
# We are setting number = 10 and repeats = 3.
# ---
# 
# 10-fold cross-validation, repeated 3 times.
trctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)

# Fix the random seed so that the cross-validation folds, and therefore the
# reported resampling results, are the same on every run.
set.seed(123)

# Train a linear-kernel SVM; predictors are centered and scaled first.
# NOTE(review): for method = "svmLinear" caret's default grid appears to hold C
# constant at 1, so tuneLength probably has no effect here; pass tuneGrid
# (e.g. expand.grid(C = ...)) to actually tune the cost -- confirm.
svm_Linear <- train(
  V14 ~ .,
  data = training,
  method = "svmLinear",
  trControl = trctrl,
  preProcess = c("center", "scale"),
  tuneLength = 10
)

# We can then check the result of our train() model as shown below
# ---
# 
svm_Linear

# We can use the predict() method for predicting results as shown below.
# We pass 2 arguments: our trained model and our testing data frame.
# ---
#
test_pred <- predict(svm_Linear, newdata = testing)
test_pred

# Now we check the accuracy of our model by using a confusion matrix.
# - The true labels are turned into a factor with the same levels as the
#   predictions, so both classes are always present in the matrix.
# - positive = "1" reports sensitivity/specificity for class 1; by default
#   caret treats the first level ("0") as the positive class.
#   NOTE(review): assumes V14 == 1 encodes presence of heart disease -- confirm.
# ---
#
confusionMatrix(
  data = test_pred,
  reference = factor(testing$V14, levels = levels(test_pred)),
  positive = "1"
)

# The above example was borrowed from the following reading: https://www.edureka.co/blog/support-vector-machine-in-r/#SVM%20Use%20Case

# Challenge

## Challenge 
# ---
# Using R build an SVM model to determine whether a student will default on their loan.
# ---
# Dataset url = http://bit.ly/StudentDefaultDataset
# ---
# 
# OUR CODE GOES BELOW

# Download the student-default data; with read.csv()'s default header = TRUE
# the first row of the file supplies the column names.
std <- read.csv(file = "http://bit.ly/StudentDefaultDataset")
head(std)

# Column types, dimensions and a check for missing values
str(std)

dim(std)

anyNA(std)

#
# Install superml only when it is missing, so that re-running the script does
# not reinstall it every time; then attach it for LabelEncoder.
if (!requireNamespace("superml", quietly = TRUE)) {
  install.packages("superml")
}
library(superml)

# Label-encode the target column. fit_transform() fits the encoder and returns
# the integer codes in one step, so a separate fit() call is not needed.
lbl <- LabelEncoder$new()
std$default <- lbl$fit_transform(std$default)

# Label-encode the student column with a fresh encoder.
lbl <- LabelEncoder$new()
std$student <- lbl$fit_transform(std$student)

# Confirm that both columns are now numeric and the shape is unchanged.
str(std)

dim(std)

# 
library("caret")

# Fix the random seed so that the same rows end up in each split on every run.
set.seed(123)

# The outcome is passed as a factor: createDataPartition() samples within each
# class for factor outcomes, whereas a numeric vector is binned by percentiles,
# which does not guarantee that both splits keep the share of each class.
intrain <- createDataPartition(y = factor(std$default), p = 0.7, list = FALSE)
training <- std[intrain, ]
testing <- std[-intrain, ]

# 
# Convert the target to a factor in BOTH splits, using the same levels, so that
# predictions and true labels can be compared class by class.
default_levels <- sort(unique(std[["default"]]))
training[["default"]] <- factor(training[["default"]], levels = default_levels)
testing[["default"]] <- factor(testing[["default"]], levels = default_levels)

#
# 10-fold cross-validation, repeated 3 times.
trctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)

# Fix the random seed so that the cross-validation folds, and therefore the
# reported resampling results, are the same on every run.
set.seed(123)

# Train a linear-kernel SVM; predictors are centered and scaled first.
# NOTE(review): for method = "svmLinear" caret's default grid appears to hold C
# constant at 1, so tuneLength probably has no effect here; pass tuneGrid
# (e.g. expand.grid(C = ...)) to actually tune the cost -- confirm.
svm_Linear <- train(
  default ~ .,
  data = training,
  method = "svmLinear",
  trControl = trctrl,
  preProcess = c("center", "scale"),
  tuneLength = 10
)

#
svm_Linear

# Predict the class of every row in the held-out testing set
test_pred <- predict(object = svm_Linear, newdata = testing)
test_pred

# Cross-tabulate predictions against the true labels and summarise them.
# NOTE(review): confusionMatrix() treats the first level as the positive class;
# once the label encoding of `default` is confirmed, consider setting
# `positive` explicitly so sensitivity refers to actual defaulters.
confusionMatrix(
  table(test_pred, testing[["default"]])
)