## Example
# ---
# Create an svm model that predicts the presence of heart disease in a patient.
# ---
# Dataset url = http://bit.ly/HeartdiseaseDataset
# ---
# OUR CODE GOES BELOW
# ---
#
# We first install the caret package.
# This package will be very helpful in providing us with
# direct access to various functions for training our model
# with various machine learning algorithms such
# as KNN, SVM, Decision Tree, Linear Regression etc.
# ---
#
# Install caret only when it is not already available, so that re-running
# the script does not download and reinstall the package every time.
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret")
}
# We then load our dataset as shown below.
# The 'sep' argument tells read.csv() that the fields are separated by commas
# (CSV = comma-separated values). With 'header = FALSE' the first row is read
# as data, so the columns receive the default names V1, V2, ...
# ---
#
heart <- read.csv(
  "http://bit.ly/HeartdiseaseDataset",
  sep = ",",
  header = FALSE
)
# Previewing the first rows of our dataset
# ---
#
head(heart)
# We check the structure of the dataframe through the function str()
# ---
#
str(heart)
# Before splitting, we check the data for missing values.
# anyNA() only reports whether any NA is present; it does not modify the data.
# ---
#
anyNA(heart)
# Then check the summary of our data by using the summary() function
# ---
#
summary(heart)
# From the output above, we can see that the variables are on different scales.
# The V14 variable, which is our target variable, holds only 2 values,
# either 0 or 1, so it should be a categorical variable.
# We convert it to a factor on the whole data frame BEFORE splitting, so that
# - createDataPartition() samples within each class (stratified split), and
# - the training and testing sets share the same factor levels.
# ---
#
heart[["V14"]] <- factor(heart[["V14"]])
# Next we split the data into training set and testing set.
# NB: The training set will be used for model building while the testing set
# is used for model evaluation.
# ---
# - The "y" parameter is the outcome variable used to balance the partition.
#   In our case the target variable is V14, so we pass heart$V14.
# - The "p" parameter is the proportion of rows that goes into training.
#   We use p = 0.7, i.e. a 70:30 split: 70% of the data is used for training
#   and the remaining 30% for testing the model.
# - The "list" parameter chooses between a list and a matrix result.
#   We pass FALSE to get a matrix of row indices.
# - set.seed() fixes the random number generator so the same rows are drawn
#   on every run; the value 123 itself is arbitrary.
# ---
#
library("caret")
set.seed(123)
intrain <- createDataPartition(y = heart$V14, p = 0.7, list = FALSE)
training <- heart[intrain, ]
testing <- heart[-intrain, ]
# We check the dimensions of our training dataframe and testing dataframe
# ---
#
dim(training)
dim(testing)
# Before we train our model we define how it will be resampled.
# We implement this through the trainControl() method, whose result is passed
# to the train() function provided by the caret package.
# ---
# We pass three parameters to trainControl():
# a) "method" defines the resampling method; here we use "repeatedcv",
#    i.e. repeated cross-validation.
# b) "number" is the number of folds used in each cross-validation round.
# c) "repeats" is the number of complete cross-validation rounds to compute.
# We are setting number = 10 and repeats = 3.
# ---
#
trctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
# The cross-validation folds are drawn at random, so we fix the seed to make
# the resampling results reproducible (the value 123 is arbitrary).
# NOTE(review): "svmLinear" has a single tuning parameter (the cost C);
# confirm whether tuneLength = 10 actually varies it in your caret version.
set.seed(123)
svm_Linear <- train(
  V14 ~ .,
  data = training,
  method = "svmLinear",
  trControl = trctrl,
  preProcess = c("center", "scale"),
  tuneLength = 10
)
# We can then check the result of our train() model as shown below
# ---
#
svm_Linear
# We can use the predict() method for predicting results as shown below.
# We pass 2 arguments, our trained model and our testing data frame.
# ---
#
test_pred <- predict(svm_Linear, newdata = testing)
test_pred
# Now we check the accuracy of our model by using a confusion matrix.
# The reference labels are coerced to a factor with the same levels as the
# predictions, so the comparison works even if the test column is still
# numeric or one class happens to be absent from the test set.
# ---
#
confusionMatrix(
  data = test_pred,
  reference = factor(testing$V14, levels = levels(test_pred))
)
# The above example was borrowed from the following reading: https://www.edureka.co/blog/support-vector-machine-in-r/#SVM%20Use%20Case
## Challenge
# ---
# Using R build an SVM model to determine whether a student will default on their loan.
# ---
# Dataset url = http://bit.ly/StudentDefaultDataset
# ---
#
# OUR CODE GOES BELOW
# Load the student default dataset and take a first look at it:
# first rows, structure, dimensions and whether any values are missing.
std <- read.csv("http://bit.ly/StudentDefaultDataset")
head(std)
str(std)
dim(std)
anyNA(std)
#
# Label-encode the 'default' and 'student' columns with superml's
# LabelEncoder. The package is installed only when it is missing, so
# re-running the script does not reinstall it every time.
if (!requireNamespace("superml", quietly = TRUE)) {
  install.packages("superml")
}
library(superml)
# fit_transform() learns the encoding and applies it in one step, so a
# separate fit() call beforehand is not needed.
lbl <- LabelEncoder$new()
std$default <- lbl$fit_transform(std$default)
#
# A fresh encoder is created for the second column so that its mapping is
# learned independently of the first one.
lbl <- LabelEncoder$new()
std$student <- lbl$fit_transform(std$student)
# Confirm the structure and dimensions after encoding.
str(std)
dim(std)
#
library("caret")
# Convert the target to a factor on the whole data frame BEFORE splitting, so
# that createDataPartition() samples within each class (stratified split) and
# the training and testing sets share the same factor levels.
std$default <- factor(std$default)
# Fix the seed so the same rows are drawn on every run (123 is arbitrary),
# then put 70% of the rows in the training set and the rest in the test set.
set.seed(123)
intrain <- createDataPartition(y = std$default, p = 0.7, list = FALSE)
training <- std[intrain, ]
testing <- std[-intrain, ]
#
# Resampling scheme: 10-fold cross-validation repeated 3 times.
trctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
# The cross-validation folds are drawn at random, so we fix the seed to make
# the resampling results reproducible (the value 123 is arbitrary).
# NOTE(review): "svmLinear" has a single tuning parameter (the cost C);
# confirm whether tuneLength = 10 actually varies it in your caret version.
set.seed(123)
svm_Linear <- train(
  default ~ .,
  data = training,
  method = "svmLinear",
  trControl = trctrl,
  preProcess = c("center", "scale"),
  tuneLength = 10
)
# Inspect the fitted model and its resampling results.
svm_Linear
# Predict the class of every row in the held-out test set.
test_pred <- predict(svm_Linear, newdata = testing)
test_pred
# Compare predictions with the true labels. The reference labels are coerced
# to a factor with the same levels as the predictions, so the comparison works
# even if the test column is still numeric or a class is absent from it.
confusionMatrix(
  data = test_pred,
  reference = factor(testing$default, levels = levels(test_pred))
)