Support Vector Machines


> import pandas as pd
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import seaborn as sns

The Data

We’ll use the built-in breast cancer dataset from Scikit-learn.

> from sklearn.datasets import load_breast_cancer
+ cancer = load_breast_cancer()

The data set is presented in dictionary-like form:

> cancer.keys()
dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

We can pull information and arrays out of this dictionary to set up our data frame and get an understanding of the features:

Set up DataFrame

> df_feat = pd.DataFrame(cancer['data'],
+                        columns=cancer['feature_names'])
mean radius mean texture mean perimeter mean area mean smoothness mean compactness mean concavity mean concave points mean symmetry mean fractal dimension
17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710 0.2419 0.07871
20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017 0.1812 0.05667
19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790 0.2069 0.05999
11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520 0.2597 0.09744
20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430 0.1809 0.05883
radius error texture error perimeter error area error smoothness error compactness error concavity error concave points error symmetry error fractal dimension error
1.0950 0.9053 8.589 153.40 0.006399 0.04904 0.05373 0.01587 0.03003 0.006193
0.5435 0.7339 3.398 74.08 0.005225 0.01308 0.01860 0.01340 0.01389 0.003532
0.7456 0.7869 4.585 94.03 0.006150 0.04006 0.03832 0.02058 0.02250 0.004571
0.4956 1.1560 3.445 27.23 0.009110 0.07458 0.05661 0.01867 0.05963 0.009208
0.7572 0.7813 5.438 94.44 0.011490 0.02461 0.05688 0.01885 0.01756 0.005115
worst radius worst texture worst perimeter worst area worst smoothness worst compactness worst concavity worst concave points worst symmetry worst fractal dimension
25.38 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890
24.99 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902
23.57 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758
14.91 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300
22.54 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678
> cancer['target']
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1])
> df_target = pd.DataFrame(cancer['target'],
+                          columns=['Cancer'])

Train Test Split

> from sklearn.model_selection import train_test_split
> X = df_feat
+ y = cancer['target']
+ 
+ X_train, X_test, y_train, y_test = train_test_split(
+     X, y, test_size=0.30, random_state=101)

Train the Classifier

> from sklearn.svm import SVC
> model = SVC(gamma='auto')
> model.fit(X_train,y_train)
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

Predictions and Evaluations

> predictions = model.predict(X_test)
> from sklearn.metrics import classification_report,confusion_matrix
> confu = confusion_matrix(y_test,predictions)
+ pd.DataFrame(confu,index=['Actual 0','Actual 1'],
+             columns=['Predicted 0','Predicted 1'])
          Predicted 0  Predicted 1
Actual 0            0           66
Actual 1            0          105
> print(classification_report(y_test,predictions))
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        66
           1       0.61      1.00      0.76       105

    accuracy                           0.61       171
   macro avg       0.31      0.50      0.38       171
weighted avg       0.38      0.61      0.47       171


C:\Users\pbj20\Anaconda3\lib\site-packages\sklearn\metrics\classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

Notice that we are classifying everything into a single class. This means our model needs to have its parameters adjusted (it may also help to normalize the data).

We can search for parameters using a GridSearch.

Gridsearch

Finding the right parameters (which C or gamma values to use) is a tricky task. Luckily, we can evaluate multiple combinations to see what works best. This idea of creating a ‘grid’ of parameters and trying all the possible combinations is called a Gridsearch. This method is common enough that Scikit-learn has this functionality built in with GridSearchCV. The CV stands for cross-validation.

GridSearchCV takes a dictionary that describes the parameters that should be tried and a model to train. The grid of parameters is defined as a dictionary, where the keys are the parameters and the values are the settings to be tested.

> param_grid = {'C': [0.1,1, 10, 100, 1000],
+               'gamma': [1,0.1,0.01,0.001,0.0001],
+               'kernel': ['rbf']} 
> from sklearn.model_selection import GridSearchCV

One of the great things about GridSearchCV is that it is a meta-estimator. It takes an estimator like SVC and creates a new estimator that behaves exactly the same - in this case, like a classifier. You should add refit=True and set verbose to whatever number you want. The higher the number, the more verbose the output (verbose just means the text output describing the process).

> grid = GridSearchCV(SVC(),param_grid,
+               refit=True,verbose=1, cv=3)

What fit does is a bit more involved than usual. First, it runs the same loop with cross-validation to find the best parameter combination. Once it has the best combination, it runs fit again on all data passed to fit (without cross-validation) to build a single new model using the best parameter setting.

> grid.fit(X_train,y_train)
Fitting 3 folds for each of 25 candidates, totalling 75 fits
GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:    0.6s finished

You can inspect the best parameters found by GridSearchCV in the best_params_ attribute, and the best estimator in the best_estimator_ attribute:

> grid.best_params_
{'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
> grid.best_estimator_
SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

Then you can re-run predictions on this grid object just like you would with a normal model.

> grid_predictions = grid.predict(X_test)
> confu = confusion_matrix(y_test,grid_predictions)
+ pd.DataFrame(confu,index=['Actual 0','Actual 1'],
+             columns=['Predicted 0','Predicted 1'])
          Predicted 0  Predicted 1
Actual 0           60            6
Actual 1            3          102
> print(classification_report(y_test,grid_predictions))
              precision    recall  f1-score   support

           0       0.95      0.91      0.93        66
           1       0.94      0.97      0.96       105

    accuracy                           0.95       171
   macro avg       0.95      0.94      0.94       171
weighted avg       0.95      0.95      0.95       171

Example - Iris Data


For this example, we will be using the famous Iris flower data set.

It is a multivariate data set introduced by Sir Ronald Fisher in 1936 as an example of discriminant analysis.

The data set consists of 50 samples from each of three species of Iris (Iris setosa, Iris virginica and Iris versicolor). Four features were measured from each sample: the length and the width of the sepals and petals, in centimeters.

Here’s a picture of the three different Iris types:

  • Setosa
> knitr::include_graphics("http://upload.wikimedia.org/wikipedia/commons/5/56/Kosaciec_szczecinkowaty_Iris_setosa.jpg")

  • Versicolor
> knitr::include_graphics("http://upload.wikimedia.org/wikipedia/commons/4/41/Iris_versicolor_3.jpg")

  • Virginica
> knitr::include_graphics("http://upload.wikimedia.org/wikipedia/commons/9/9f/Iris_virginica.jpg")

The iris dataset contains measurements for 150 iris flowers from three different species.

The three classes in the Iris dataset:

Iris-setosa (n=50)
Iris-versicolor (n=50)
Iris-virginica (n=50)

The four features of the Iris dataset:

sepal length in cm
sepal width in cm
petal length in cm
petal width in cm

The Data

> iris = sns.load_dataset('iris')

Exploratory Data Analysis

  • Create a pairplot of the data set.

Setosa is the most separable.

> sns.set_style('darkgrid')
+ sns.pairplot(iris,hue='species',palette='Dark2');
+ plt.show()

  • Create a KDE plot of sepal_length versus sepal_width for the Setosa species of flower.
> plt.figure(figsize=(8,6))
+ setosa = iris[iris['species']=='setosa']
+ sns.kdeplot( setosa['sepal_width'], setosa['sepal_length'],
+             cmap="plasma", shade=True, shade_lowest=False);
+ plt.show()

Train Test Split

> X = iris.drop('species',axis=1)
+ y = iris['species']
+ X_train, X_test, y_train, y_test = train_test_split(
+     X, y, test_size=0.30, random_state=42)

Train the Model

> svc_model = SVC(gamma='auto')
> svc_model.fit(X_train,y_train)
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

Model Evaluation

> predictions = svc_model.predict(X_test)
> confu = confusion_matrix(y_test,predictions)
> pd.DataFrame(confu,index=['Actual Setosa',
+ 'Actual Versicolor','Actual Virginica'],
+ columns=['Predicted Setosa','Predicted Versicolor',
+ 'Predicted Virginica'])
                   Predicted Setosa  Predicted Versicolor  Predicted Virginica
Actual Setosa                    19                     0                    0
Actual Versicolor                 0                    13                    0
Actual Virginica                  0                     0                   13
> print(classification_report(y_test,predictions))
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        19
  versicolor       1.00      1.00      1.00        13
   virginica       1.00      1.00      1.00        13

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45

Gridsearch

Although we can’t improve on the default model, we can still try GridSearchCV.

> param_grid = {'C': [0.1,1, 10, 100], 
+ 'gamma': [1,0.1,0.01,0.001]} 
> grid = GridSearchCV(SVC(),param_grid,refit=True,
+              verbose=1, cv=3, iid=True)
+ grid.fit(X_train,y_train)
Fitting 3 folds for each of 16 candidates, totalling 48 fits
GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid=True, n_jobs=None,
             param_grid={'C': [0.1, 1, 10, 100],
                         'gamma': [1, 0.1, 0.01, 0.001]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:    0.0s finished
> grid_predictions = grid.predict(X_test)
> confu = confusion_matrix(y_test,grid_predictions)
+ 
+ pd.DataFrame(confu,index=['Actual Setosa',
+     'Actual Versicolor','Actual Virginica'],
+  columns=['Predicted Setosa','Predicted Versicolor',
+             'Predicted Virginica'])
                   Predicted Setosa  Predicted Versicolor  Predicted Virginica
Actual Setosa                    19                     0                    0
Actual Versicolor                 0                    13                    0
Actual Virginica                  0                     0                   13
> print(classification_report(y_test,grid_predictions))
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        19
  versicolor       1.00      1.00      1.00        13
   virginica       1.00      1.00      1.00        13

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45