library(e1071)
# Point the session at the folder that holds the data file.
setwd("C:/Users/Maria Elena Morinigo/Desktop/MSDA/DA 6813 - Data Analytics Applications/Week 5")
# Load the comma-separated data set. read.csv() already defaults to
# header = TRUE and sep = ",", so those arguments are left implicit.
df <- read.csv("SVR_SpineD2.csv")
# Show each column's type and its first few values.
str(df)
## 'data.frame':    69 obs. of  19 variables:
##  $ Tum.cross.section.Area : num  81.7 100.4 60.5 57 63.2 ...
##  $ Tumor.Volume..cc.      : num  1765 1865 1490 1432 1606 ...
##  $ Tumor.Spread..cm.      : num  22 24.4 20.7 25.2 21 ...
##  $ Tumor.Surface.area     : num  865 969 1227 844 989 ...
##  $ Cord.Tum.Min.Dist      : num  0.6 0.7 0.825 0.539 0.849 ...
##  $ Cord.Tum.Dist          : num  3.92 5.14 4.03 6.48 5.81 ...
##  $ Cord.cross.section.area: num  0.809 1.121 1.749 0.712 1.061 ...
##  $ Cord.Volume..cc.       : num  24.5 21.6 25.5 17.3 25.3 ...
##  $ Cord.Spread..cm.       : num  19.8 21.9 16.2 21.3 18 ...
##  $ Cord.Surface.area      : num  123 133 127 106 112 ...
##  $ Tum.D2                 : num  81.2 76.4 61.5 76.9 77 69.4 72.2 79.5 76.3 76.5 ...
##  $ Tum.D20                : num  77.7 73.9 59.3 74.2 73.7 64.1 69.3 76.8 73.7 74.3 ...
##  $ Tum.D40                : num  76 72.5 58.6 73 70.5 58.2 68 70.7 70.7 73.3 ...
##  $ Tum.D60                : num  69.6 68.5 57.8 69.7 64.4 57 65.1 59.7 59.8 69.5 ...
##  $ Tum.D80                : num  56 60.9 56.6 60.1 56.8 55.2 55.8 48.2 50.6 58.9 ...
##  $ Tum.D98                : num  43.9 36.7 51.1 47.1 49.3 46.8 48.9 41.6 47.5 47.1 ...
##  $ Concurrent.Chemotherapy: int  1 1 1 1 1 0 1 1 1 1 ...
##  $ Surgery                : int  0 1 0 1 1 0 0 0 1 0 ...
##  $ D2                     : num  57.1 48.3 47.9 52.7 45.7 42.3 47.1 45.5 38.9 42.1 ...

Make factor variables

# Convert the two integer-coded treatment indicators to factors so that the
# models treat them as categorical predictors.
indicator_cols <- c("Concurrent.Chemotherapy", "Surgery")
df[indicator_cols] <- lapply(df[indicator_cols], as.factor)

Split the data set

For now, we simply compute D2 for the spine and brainstem. Split the data into 50 patients for training and 19 for testing.

# Reproducible random split: 50 rows for training, the remainder for testing.
set.seed(123)
# seq_len() is the safe way to enumerate rows (1:nrow(df) would yield c(1, 0)
# for an empty data frame); for a non-empty df the sampled indices are the
# same as before.
train <- sample(seq_len(nrow(df)), 50)
# Row-subsetting a data frame already returns a data frame, so the former
# as.data.frame() wrappers were redundant.
train.data <- df[train, ]
test.data <- df[-train, ]

Tune SVM, and run SVM with tuned cost and gamma, predict on test set, and evaluate MSE

# Grid-search gamma (0.01 to 0.1) and cost (0.1 to 1) for an SVM that
# predicts D2 from all other columns of the training set.
# The seed is reset so that any random resampling done during tuning is
# repeatable.
set.seed(123)
tuned1 <- tune.svm(
  D2 ~ .,
  data = train.data,
  gamma = seq(0.01, 0.1, by = 0.01),
  cost = seq(0.1, 1, by = 0.1),
  # The original passed `scale(TRUE)` -- an unnamed call to base::scale() --
  # which never reached svm()'s `scale` option. It is named here so the
  # intent (standardise the variables) is explicit. TRUE is also svm()'s
  # default, so the tuned parameters are unchanged.
  scale = TRUE
)

Build the SVM using the tuned gamma and cost

# Fit the final SVM for D2 on the training set, using the gamma and cost
# values stored in tuned1$best.parameters by the tuning step.
resultsSVM <- svm(formula = D2 ~ ., data = train.data, gamma = tuned1$best.parameters$gamma, cost = tuned1$best.parameters$cost)
# Print the model call, its parameters and the number of support vectors.
summary(resultsSVM)
## 
## Call:
## svm(formula = D2 ~ ., data = train.data, gamma = tuned1$best.parameters$gamma, 
##     cost = tuned1$best.parameters$cost)
## 
## 
## Parameters:
##    SVM-Type:  eps-regression 
##  SVM-Kernel:  radial 
##        cost:  0.4 
##       gamma:  0.04 
##     epsilon:  0.1 
## 
## 
## Number of Support Vectors:  47

Make predictions

# Predict D2 for the held-out test patients with the fitted SVM.
# The original call also passed type = "response"; predict() for svm objects
# has no `type` parameter, so that argument was silently ignored and has been
# dropped. `newdata` is now named for clarity.
predSVM <- predict(resultsSVM, newdata = test.data)

Compute Error

# Test-set mean squared error of the SVM predictions against observed D2.
with(test.data, mean((predSVM - D2)^2))
## [1] 29.27505

Compare to LM

# Baseline for comparison: linear regression of D2 on all other columns,
# fit on the same training rows as the SVM. Uses `<-` for assignment to match
# the rest of the script.
resultsLM <- lm(formula = D2 ~ ., data = train.data)
# Coefficient table and overall fit statistics.
summary(resultsLM)
## 
## Call:
## lm(formula = D2 ~ ., data = train.data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.3839 -1.6608  0.4359  2.0896  4.1640 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)   
## (Intercept)               48.920658  16.708663   2.928  0.00634 **
## Tum.cross.section.Area    -0.062196   0.042167  -1.475  0.15030   
## Tumor.Volume..cc.          0.006067   0.002834   2.141  0.04028 * 
## Tumor.Spread..cm.         -0.310167   0.395982  -0.783  0.43940   
## Tumor.Surface.area        -0.007883   0.003369  -2.340  0.02591 * 
## Cord.Tum.Min.Dist        -12.530541   3.983982  -3.145  0.00365 **
## Cord.Tum.Dist             -0.431544   0.541978  -0.796  0.43195   
## Cord.cross.section.area   -2.274603   3.052377  -0.745  0.46177   
## Cord.Volume..cc.          -0.136425   0.171806  -0.794  0.43319   
## Cord.Spread..cm.          -0.110274   0.360600  -0.306  0.76180   
## Cord.Surface.area          0.015219   0.026823   0.567  0.57453   
## Tum.D2                     0.425388   0.400806   1.061  0.29674   
## Tum.D20                    0.024194   0.634144   0.038  0.96981   
## Tum.D40                   -0.074288   0.503149  -0.148  0.88358   
## Tum.D60                    0.352077   0.382553   0.920  0.36451   
## Tum.D80                   -0.381967   0.206100  -1.853  0.07338 . 
## Tum.D98                   -0.088720   0.086507  -1.026  0.31303   
## Concurrent.Chemotherapy1  -3.256423   3.095977  -1.052  0.30101   
## Surgery1                   1.063002   1.217746   0.873  0.38942   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.452 on 31 degrees of freedom
## Multiple R-squared:  0.5342, Adjusted R-squared:  0.2637 
## F-statistic: 1.975 on 18 and 31 DF,  p-value: 0.04664
# Predict D2 for the held-out test patients with the linear model. Uses `<-`
# for assignment to match the rest of the script.
predLM <- predict(resultsLM, newdata = test.data)
# Test-set mean squared error of the linear-model predictions.
mean((predLM - test.data$D2)^2)
## [1] 25.25422