######################################################
# Stat 6801 - Project
#
# Sarah Rathwell 
#
# Objective - Investigate SVR vs GAM on Fat data.
######################################################
rm(list=ls())
par(mfrow=c(1,1), oma=c(0,0,0,0))
library(ggplot2)
library(mgcv)
## Loading required package: nlme
## This is mgcv 1.8-12. For overview type 'help("mgcv-package")'.
library(faraway)
library(e1071)
library(rminer)
library(GGally)
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:faraway':
## 
##     happy
library(scatterplot3d)

######################################################
cat(" Investigate SVR vs GAM on Fat data ",
    "Last modification:",date(),'\n')
##  Investigate SVR vs GAM on Fat data  Last modification: Wed Dec  7 22:48:46 2016
######################################################

# get data 

data(fat, package='faraway')
str(fat)
## 'data.frame':    252 obs. of  18 variables:
##  $ brozek : num  12.6 6.9 24.6 10.9 27.8 20.6 19 12.8 5.1 12 ...
##  $ siri   : num  12.3 6.1 25.3 10.4 28.7 20.9 19.2 12.4 4.1 11.7 ...
##  $ density: num  1.07 1.09 1.04 1.08 1.03 ...
##  $ age    : int  23 22 22 26 24 24 26 25 25 23 ...
##  $ weight : num  154 173 154 185 184 ...
##  $ height : num  67.8 72.2 66.2 72.2 71.2 ...
##  $ adipos : num  23.7 23.4 24.7 24.9 25.6 26.5 26.2 23.6 24.6 25.8 ...
##  $ free   : num  135 161 116 165 133 ...
##  $ neck   : num  36.2 38.5 34 37.4 34.4 39 36.4 37.8 38.1 42.1 ...
##  $ chest  : num  93.1 93.6 95.8 101.8 97.3 ...
##  $ abdom  : num  85.2 83 87.9 86.4 100 94.4 90.7 88.5 82.5 88.6 ...
##  $ hip    : num  94.5 98.7 99.2 101.2 101.9 ...
##  $ thigh  : num  59 58.7 59.6 60.1 63.2 66 58.4 60 62.9 63.1 ...
##  $ knee   : num  37.3 37.3 38.9 37.3 42.2 42 38.3 39.4 38.3 41.7 ...
##  $ ankle  : num  21.9 23.4 24 22.8 24 25.6 22.9 23.2 23.8 25 ...
##  $ biceps : num  32 30.5 28.8 32.4 32.2 35.7 31.9 30.5 35.9 35.6 ...
##  $ forearm: num  27.4 28.9 25.2 29.4 27.7 30.6 27.8 29 31.1 30 ...
##  $ wrist  : num  17.1 18.2 16.6 18.2 17.7 18.8 17.7 18.8 18.2 19.2 ...
head(fat)
##   brozek siri density age weight height adipos  free neck chest abdom
## 1   12.6 12.3  1.0708  23 154.25  67.75   23.7 134.9 36.2  93.1  85.2
## 2    6.9  6.1  1.0853  22 173.25  72.25   23.4 161.3 38.5  93.6  83.0
## 3   24.6 25.3  1.0414  22 154.00  66.25   24.7 116.0 34.0  95.8  87.9
## 4   10.9 10.4  1.0751  26 184.75  72.25   24.9 164.7 37.4 101.8  86.4
## 5   27.8 28.7  1.0340  24 184.25  71.25   25.6 133.1 34.4  97.3 100.0
## 6   20.6 20.9  1.0502  24 210.25  74.75   26.5 167.0 39.0 104.5  94.4
##     hip thigh knee ankle biceps forearm wrist
## 1  94.5  59.0 37.3  21.9   32.0    27.4  17.1
## 2  98.7  58.7 37.3  23.4   30.5    28.9  18.2
## 3  99.2  59.6 38.9  24.0   28.8    25.2  16.6
## 4 101.2  60.1 37.3  22.8   32.4    29.4  18.2
## 5 101.9  63.2 42.2  24.0   32.2    27.7  17.7
## 6 107.8  66.0 42.0  25.6   35.7    30.6  18.8
# clean up known errors

fat <- fat[,-c(1,3,8)]
fat$height[42] <- 69.5
fat <- fat[-182,]
head(fat)
##   siri age weight height adipos neck chest abdom   hip thigh knee ankle
## 1 12.3  23 154.25  67.75   23.7 36.2  93.1  85.2  94.5  59.0 37.3  21.9
## 2  6.1  22 173.25  72.25   23.4 38.5  93.6  83.0  98.7  58.7 37.3  23.4
## 3 25.3  22 154.00  66.25   24.7 34.0  95.8  87.9  99.2  59.6 38.9  24.0
## 4 10.4  26 184.75  72.25   24.9 37.4 101.8  86.4 101.2  60.1 37.3  22.8
## 5 28.7  24 184.25  71.25   25.6 34.4  97.3 100.0 101.9  63.2 42.2  24.0
## 6 20.9  24 210.25  74.75   26.5 39.0 104.5  94.4 107.8  66.0 42.0  25.6
##   biceps forearm wrist
## 1   32.0    27.4  17.1
## 2   30.5    28.9  18.2
## 3   28.8    25.2  16.6
## 4   32.4    29.4  18.2
## 5   32.2    27.7  17.7
## 6   35.7    30.6  18.8
str(fat)
## 'data.frame':    251 obs. of  15 variables:
##  $ siri   : num  12.3 6.1 25.3 10.4 28.7 20.9 19.2 12.4 4.1 11.7 ...
##  $ age    : int  23 22 22 26 24 24 26 25 25 23 ...
##  $ weight : num  154 173 154 185 184 ...
##  $ height : num  67.8 72.2 66.2 72.2 71.2 ...
##  $ adipos : num  23.7 23.4 24.7 24.9 25.6 26.5 26.2 23.6 24.6 25.8 ...
##  $ neck   : num  36.2 38.5 34 37.4 34.4 39 36.4 37.8 38.1 42.1 ...
##  $ chest  : num  93.1 93.6 95.8 101.8 97.3 ...
##  $ abdom  : num  85.2 83 87.9 86.4 100 94.4 90.7 88.5 82.5 88.6 ...
##  $ hip    : num  94.5 98.7 99.2 101.2 101.9 ...
##  $ thigh  : num  59 58.7 59.6 60.1 63.2 66 58.4 60 62.9 63.1 ...
##  $ knee   : num  37.3 37.3 38.9 37.3 42.2 42 38.3 39.4 38.3 41.7 ...
##  $ ankle  : num  21.9 23.4 24 22.8 24 25.6 22.9 23.2 23.8 25 ...
##  $ biceps : num  32 30.5 28.8 32.4 32.2 35.7 31.9 30.5 35.9 35.6 ...
##  $ forearm: num  27.4 28.9 25.2 29.4 27.7 30.6 27.8 29 31.1 30 ...
##  $ wrist  : num  17.1 18.2 16.6 18.2 17.7 18.8 17.7 18.8 18.2 19.2 ...
######################################################
# split into training/test -- 80/20

set.seed(12345)

train <-sample(1:251, 200, replace=FALSE)

fat.test <- fat[-c(train),]
str(fat.test)
## 'data.frame':    51 obs. of  15 variables:
##  $ siri   : num  25.3 20.9 19.2 12.4 29 16.5 7.9 5.7 35.2 32.6 ...
##  $ age    : int  22 24 26 25 34 33 34 29 46 50 ...
##  $ weight : num  154 210 181 176 196 ...
##  $ height : num  66.2 74.8 69.8 72.5 71 ...
##  $ adipos : num  24.7 26.5 26.2 23.6 27.3 27.6 20.3 22.2 48.9 31.8 ...
##  $ neck   : num  34 39 36.4 37.8 38.9 40 36.2 37.3 51.2 40.2 ...
##  $ chest  : num  95.8 104.5 105.1 99.6 101.9 ...
##  $ abdom  : num  87.9 94.4 90.7 88.5 96.4 ...
##  $ hip    : num  99.2 107.8 100.3 97.1 105.2 ...
##  $ thigh  : num  59.6 66 58.4 60 64.8 65.8 51.7 58.5 87.3 61.3 ...
##  $ knee   : num  38.9 42 38.3 39.4 40.8 40.6 34.7 38.8 49.1 41.1 ...
##  $ ankle  : num  24 25.6 22.9 23.2 23.1 24 21.4 21.5 29.6 24.7 ...
##  $ biceps : num  28.8 35.7 31.9 30.5 36.2 37.1 28.7 30.1 45 34.1 ...
##  $ forearm: num  25.2 30.6 27.8 29 30.8 30.1 27 26.4 29 31 ...
##  $ wrist  : num  16.6 18.8 17.7 18.8 17.3 18.2 16.5 17.9 21.4 18.3 ...
fat <- fat[c(train),]
str(fat)
## 'data.frame':    200 obs. of  15 variables:
##  $ siri   : num  26.6 15 11.4 12.4 22.1 34.5 18.8 13.8 13.1 17 ...
##  $ age    : int  39 53 41 54 47 45 66 50 37 65 ...
##  $ weight : num  219 154 153 153 178 ...
##  $ height : num  74.2 69.2 69.2 70.5 70 ...
##  $ adipos : num  28 22.7 22.5 24.5 25.6 39.1 25.1 25.6 23.7 20.8 ...
##  $ neck   : num  40 37.6 36.4 38.5 40.2 43.2 37.4 37.7 35.3 34.7 ...
##  $ chest  : num  108.5 93.9 91.4 99 99.7 ...
##  $ abdom  : num  104.6 88.7 80.6 91.8 95 ...
##  $ hip    : num  109.8 94.5 92.3 96.2 98.6 ...
##  $ thigh  : num  68.1 53.7 54.3 57.7 62.3 72.5 56.5 58.5 60 50.7 ...
##  $ knee   : num  42.8 36.2 36.3 38.1 38.1 39.6 39.3 36.6 38.1 33.4 ...
##  $ ankle  : num  24.1 22 21.8 23.9 23.9 26.6 22.7 23.5 22 20.1 ...
##  $ biceps : num  35.6 28.5 29.6 31.4 35.3 36.4 30.3 34.4 31.5 28.5 ...
##  $ forearm: num  29 25.7 27.3 29.9 31.1 32.7 28.7 29.2 26.6 24.8 ...
##  $ wrist  : num  19 17.1 17.9 18.9 19.8 21.4 19 18 16.7 16.5 ...
######################################################
# predict rmse for test set

rmse <- function(error)
{
  sqrt(mean(error^2))
}

######################################################
######################################################
# Linear Regression 
######################################################

fat.lm <- lm(siri~., data=fat)
summary(fat.lm)
## 
## Call:
## lm(formula = siri ~ ., data = fat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.9508  -2.8269  -0.0407   2.8213   9.5360 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -14.06339   47.06987  -0.299  0.76545    
## age           0.07981    0.03664   2.178  0.03066 *  
## weight       -0.06779    0.13023  -0.521  0.60334    
## height        0.02205    0.68108   0.032  0.97421    
## adipos        0.34640    0.97917   0.354  0.72391    
## neck         -0.16669    0.28581  -0.583  0.56047    
## chest        -0.19934    0.11922  -1.672  0.09620 .  
## abdom         0.95065    0.10054   9.455  < 2e-16 ***
## hip          -0.15544    0.16224  -0.958  0.33926    
## thigh         0.23911    0.15708   1.522  0.12966    
## knee         -0.19102    0.27853  -0.686  0.49369    
## ankle         0.11368    0.23016   0.494  0.62196    
## biceps        0.08334    0.17971   0.464  0.64340    
## forearm       0.26789    0.21651   1.237  0.21754    
## wrist        -1.89761    0.61070  -3.107  0.00219 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.174 on 185 degrees of freedom
## Multiple R-squared:  0.7677, Adjusted R-squared:  0.7501 
## F-statistic: 43.66 on 14 and 185 DF,  p-value: < 2.2e-16
lmfpred <- predict(fat.lm, fat.test)

halfnorm(cooks.distance(fat.lm), main="Cooks Distance - Full LM")

fat3 <- fat[-c(4,111),]

fat.lm2 <-lm(siri~., data=fat3)
summary(fat.lm2)
## 
## Call:
## lm(formula = siri ~ ., data = fat3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -10.039  -2.789  -0.101   2.726   9.337 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -103.68006   58.57767  -1.770   0.0784 .  
## age            0.06533    0.03662   1.784   0.0761 .  
## weight        -0.31887    0.16319  -1.954   0.0522 .  
## height         1.24781    0.82455   1.513   0.1319    
## adipos         1.90002    1.14696   1.657   0.0993 .  
## neck          -0.12149    0.28043  -0.433   0.6654    
## chest         -0.17381    0.11714  -1.484   0.1396    
## abdom          0.97340    0.09889   9.844   <2e-16 ***
## hip           -0.19855    0.15971  -1.243   0.2154    
## thigh          0.31355    0.15593   2.011   0.0458 *  
## knee          -0.07687    0.27541  -0.279   0.7805    
## ankle         -0.21148    0.28049  -0.754   0.4518    
## biceps         0.05798    0.17703   0.328   0.7437    
## forearm        0.29083    0.21232   1.370   0.1724    
## wrist         -1.55676    0.61035  -2.551   0.0116 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.09 on 183 degrees of freedom
## Multiple R-squared:  0.7777, Adjusted R-squared:  0.7607 
## F-statistic: 45.74 on 14 and 183 DF,  p-value: < 2.2e-16
halfnorm(cooks.distance(fat.lm2))

fat.lm3 <-lm(siri~age+weight+adipos+abdom+thigh+wrist, data=fat3)
summary(fat.lm3)
## 
## Call:
## lm(formula = siri ~ age + weight + adipos + abdom + thigh + wrist, 
##     data = fat3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.5440  -2.8313  -0.1751   3.0129   9.4796 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -36.09611    9.24908  -3.903 0.000132 ***
## age           0.05428    0.03408   1.593 0.112859    
## weight       -0.13765    0.03641  -3.780 0.000209 ***
## adipos        0.10388    0.23615   0.440 0.660528    
## abdom         0.91826    0.09249   9.928  < 2e-16 ***
## thigh         0.28398    0.12412   2.288 0.023238 *  
## wrist        -1.49123    0.53461  -2.789 0.005816 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.091 on 191 degrees of freedom
## Multiple R-squared:  0.7678, Adjusted R-squared:  0.7605 
## F-statistic: 105.3 on 6 and 191 DF,  p-value: < 2.2e-16
anova(fat.lm3, fat.lm2, test="F")
## Analysis of Variance Table
## 
## Model 1: siri ~ age + weight + adipos + abdom + thigh + wrist
## Model 2: siri ~ age + weight + height + adipos + neck + chest + abdom + 
##     hip + thigh + knee + ankle + biceps + forearm + wrist
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    191 3197.3                           
## 2    183 3060.6  8    136.73 1.0219  0.421
fat.lm4 <-lm(siri~weight+abdom+thigh+wrist, data=fat3)
summary(fat.lm4)
## 
## Call:
## lm(formula = siri ~ weight + abdom + thigh + wrist, data = fat3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.0488  -2.8607  -0.0674   3.1068   9.5085 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -39.55945    8.84409  -4.473 1.32e-05 ***
## weight       -0.16021    0.03354  -4.777 3.51e-06 ***
## abdom         1.01287    0.05791  17.490  < 2e-16 ***
## thigh         0.23034    0.11134   2.069   0.0399 *  
## wrist        -1.10578    0.48013  -2.303   0.0223 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.098 on 193 degrees of freedom
## Multiple R-squared:  0.7646, Adjusted R-squared:  0.7597 
## F-statistic: 156.7 on 4 and 193 DF,  p-value: < 2.2e-16
anova(fat.lm4, fat.lm2, test="F")
## Analysis of Variance Table
## 
## Model 1: siri ~ weight + abdom + thigh + wrist
## Model 2: siri ~ age + weight + height + adipos + neck + chest + abdom + 
##     hip + thigh + knee + ankle + biceps + forearm + wrist
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    193 3241.7                           
## 2    183 3060.6 10     181.1 1.0829 0.3775
fat.lm5<-lm(siri~weight+abdom+wrist, data=fat3)
anova(fat.lm5, fat.lm2, test="F")
## Analysis of Variance Table
## 
## Model 1: siri ~ weight + abdom + wrist
## Model 2: siri ~ age + weight + height + adipos + neck + chest + abdom + 
##     hip + thigh + knee + ankle + biceps + forearm + wrist
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    194 3313.5                           
## 2    183 3060.6 11    252.98 1.3752 0.1877
######################################################
# check interactions

ggpairs(fat[,c(3,8,15)], axisLabels = "none", 
        title = "Pairs Plots of Fat Data")

fat.lint <-lm(siri~weight*abdom*wrist, data=fat3)
summary(fat.lint)
## 
## Call:
## lm(formula = siri ~ weight * abdom * wrist, data = fat3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.9940 -2.8957 -0.2935  2.9756  9.9287 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)
## (Intercept)         1.185e+02  2.187e+02   0.542    0.589
## weight              1.935e-01  1.116e+00   0.173    0.863
## abdom              -1.770e+00  2.721e+00  -0.651    0.516
## wrist              -1.118e+01  1.231e+01  -0.909    0.365
## weight:abdom        3.520e-03  1.122e-02   0.314    0.754
## weight:wrist       -7.064e-03  6.057e-02  -0.117    0.907
## abdom:wrist         1.697e-01  1.511e-01   1.123    0.263
## weight:abdom:wrist -2.901e-04  6.074e-04  -0.478    0.633
## 
## Residual standard error: 4.133 on 190 degrees of freedom
## Multiple R-squared:  0.7643, Adjusted R-squared:  0.7556 
## F-statistic: 88.01 on 7 and 190 DF,  p-value: < 2.2e-16
anova(fat.lm2, fat.lint, test="F")
## Analysis of Variance Table
## 
## Model 1: siri ~ age + weight + height + adipos + neck + chest + abdom + 
##     hip + thigh + knee + ankle + biceps + forearm + wrist
## Model 2: siri ~ weight * abdom * wrist
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    183 3060.6                           
## 2    190 3245.4 -7    -184.9 1.5794  0.144
fat.lint2 <-lm(siri~weight*abdom+wrist, data=fat3)
summary(fat.lint2)
## 
## Call:
## lm(formula = siri ~ weight * abdom + wrist, data = fat3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.7773 -2.8812 -0.2731  3.0726  9.4313 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -4.874e+01  1.633e+01  -2.984  0.00321 ** 
## weight       -3.765e-03  8.718e-02  -0.043  0.96560    
## abdom         1.227e+00  1.744e-01   7.036 3.35e-11 ***
## wrist        -1.388e+00  4.725e-01  -2.937  0.00372 ** 
## weight:abdom -1.180e-03  8.767e-04  -1.347  0.17970    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.124 on 193 degrees of freedom
## Multiple R-squared:  0.7616, Adjusted R-squared:  0.7567 
## F-statistic: 154.1 on 4 and 193 DF,  p-value: < 2.2e-16
anova(fat.lm2, fat.lint2, test="F")
## Analysis of Variance Table
## 
## Model 1: siri ~ age + weight + height + adipos + neck + chest + abdom + 
##     hip + thigh + knee + ankle + biceps + forearm + wrist
## Model 2: siri ~ weight * abdom + wrist
##   Res.Df    RSS  Df Sum of Sq      F Pr(>F)
## 1    183 3060.6                            
## 2    193 3282.7 -10   -222.14 1.3283 0.2181
fat.lint3 <-lm(siri~weight*wrist+abdom, data=fat3)
summary(fat.lint3)
## 
## Call:
## lm(formula = siri ~ weight * wrist + abdom, data = fat3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.7237 -2.9877 -0.1459  3.1643  9.7639 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -63.983806  31.975156  -2.001   0.0468 *  
## weight         0.078456   0.175120   0.448   0.6546    
## wrist          0.582161   1.768954   0.329   0.7424    
## abdom          1.003974   0.058282  17.226   <2e-16 ***
## weight:wrist  -0.010533   0.009388  -1.122   0.2633    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.13 on 193 degrees of freedom
## Multiple R-squared:  0.7609, Adjusted R-squared:  0.756 
## F-statistic: 153.6 on 4 and 193 DF,  p-value: < 2.2e-16
anova(fat.lm2, fat.lint3, test="F")
## Analysis of Variance Table
## 
## Model 1: siri ~ age + weight + height + adipos + neck + chest + abdom + 
##     hip + thigh + knee + ankle + biceps + forearm + wrist
## Model 2: siri ~ weight * wrist + abdom
##   Res.Df    RSS  Df Sum of Sq      F Pr(>F)
## 1    183 3060.6                            
## 2    193 3292.1 -10   -231.51 1.3843 0.1904
#not enough change to merit the increase in complexity

######################################################
# fit reduced model

fat.finallm <-lm(siri~abdom+weight+wrist, data=fat3)
summary(fat.finallm)
## 
## Call:
## lm(formula = siri ~ abdom + weight + wrist, data = fat3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.9236 -2.9650 -0.2539  3.0181  9.7623 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -29.0572     7.3029  -3.979 9.78e-05 ***
## abdom         1.0058     0.0583  17.253  < 2e-16 ***
## weight       -0.1158     0.0260  -4.455 1.41e-05 ***
## wrist        -1.3307     0.4716  -2.822  0.00527 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.133 on 194 degrees of freedom
## Multiple R-squared:  0.7594, Adjusted R-squared:  0.7556 
## F-statistic: 204.1 on 3 and 194 DF,  p-value: < 2.2e-16
lmpred <- predict(fat.finallm, fat.test)

######################################################
######################################################
# GAM Regression
######################################################

fat.gam <- gam(siri ~ s(age)+s(weight)+s(height)+s(adipos)+s(neck)+s(chest)+s(abdom)+s(hip)+
                 s(thigh)+s(knee)+s(ankle)+s(biceps)+s(forearm)+s(wrist), data = fat)

summary(fat.gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## siri ~ s(age) + s(weight) + s(height) + s(adipos) + s(neck) + 
##     s(chest) + s(abdom) + s(hip) + s(thigh) + s(knee) + s(ankle) + 
##     s(biceps) + s(forearm) + s(wrist)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  19.1800     0.2555   75.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##              edf Ref.df      F p-value    
## s(age)     5.713  6.836  2.942 0.00796 ** 
## s(weight)  2.884  3.733  0.795 0.52242    
## s(height)  1.673  2.092  1.060 0.35214    
## s(adipos)  4.199  5.245  1.759 0.11430    
## s(neck)    1.000  1.000  0.934 0.33532    
## s(chest)   1.000  1.000  1.945 0.16510    
## s(abdom)   4.936  5.998 17.484 < 2e-16 ***
## s(hip)     6.276  7.344  2.584 0.01421 *  
## s(thigh)   1.000  1.000  2.435 0.12066    
## s(knee)    2.286  2.936  1.185 0.28933    
## s(ankle)   3.607  4.412  1.636 0.18697    
## s(biceps)  8.075  8.731  1.643 0.07883 .  
## s(forearm) 1.000  1.000  3.981 0.04776 *  
## s(wrist)   1.626  2.015  3.234 0.04055 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.813   Deviance explained = 85.5%
## GCV = 16.981  Scale est. = 13.052    n = 200
fat.gamcooks <- cooks.distance(fat.gam)
halfnorm(fat.gamcooks, main="Cooks Distance - Full GAM")

gamfpred <- predict(fat.gam, fat.test)


fat[c(6,4),]
##     siri age weight height adipos neck chest abdom   hip thigh knee ankle
## 41  34.5  45 262.75  68.75   39.1 43.2 128.3 126.2 125.6  72.5 39.6  26.6
## 221 12.4  54 153.25  70.50   24.5 38.5  99.0  91.8  96.2  57.7 38.1  23.9
##     biceps forearm wrist
## 41    36.4    32.7  21.4
## 221   31.4    29.9  18.9
fat2 <- fat[-c(6),]

fat.gam2 <- gam(siri ~ s(age)+s(weight)+s(height)+s(adipos)+s(neck)+s(chest)+s(abdom)+s(hip)+
                 s(thigh)+s(knee)+s(ankle)+s(biceps)+s(forearm)+s(wrist), data = fat2)
summary(fat.gam2)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## siri ~ s(age) + s(weight) + s(height) + s(adipos) + s(neck) + 
##     s(chest) + s(abdom) + s(hip) + s(thigh) + s(knee) + s(ankle) + 
##     s(biceps) + s(forearm) + s(wrist)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  19.1030     0.2557    74.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##              edf Ref.df      F p-value    
## s(age)     5.778  6.897  2.950 0.00737 ** 
## s(weight)  2.710  3.526  0.738 0.54619    
## s(height)  1.496  1.836  0.718 0.38180    
## s(adipos)  4.315  5.428  1.686 0.11797    
## s(neck)    1.000  1.000  0.881 0.34940    
## s(chest)   1.000  1.000  1.849 0.17583    
## s(abdom)   4.869  5.955 17.455 < 2e-16 ***
## s(hip)     5.478  6.579  2.526 0.02002 *  
## s(thigh)   1.000  1.000  2.361 0.12643    
## s(knee)    2.212  2.839  1.017 0.34260    
## s(ankle)   3.666  4.484  1.667 0.17783    
## s(biceps)  8.138  8.760  1.752 0.06132 .  
## s(forearm) 1.000  1.000  4.348 0.03868 *  
## s(wrist)   1.751  2.193  3.052 0.04078 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.811   Deviance explained = 85.3%
## GCV = 16.863  Scale est. = 13.015    n = 199
fat.gamcooks2 <- cooks.distance(fat.gam2)
halfnorm(fat.gamcooks2)

par(mfrow=c(2,4), oma=c(0,0,2,0))
plot(fat.gam2, select=2)
abline(h=1, col="blue")
plot(fat.gam2, select=3)
abline(h=0, col="blue")
plot(fat.gam2, select=4)
abline(h=-1, col="blue")
plot(fat.gam2, select=5)
abline(h=0, col="blue")
plot(fat.gam2, select=6)
abline(h=0, col="blue")
plot(fat.gam2, select=9)
abline(h=0, col="blue")
plot(fat.gam2, select=10)
abline(h=-0.3, col="blue")
mtext("Non-significant Factors - GAM", cex=1.5, outer=T)

fat.gam3 <- gam(siri ~s(age)+s(abdom)+s(hip)+s(ankle)+s(biceps)+s(forearm)+s(wrist), data = fat2)
summary(fat.gam3)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## siri ~ s(age) + s(abdom) + s(hip) + s(ankle) + s(biceps) + s(forearm) + 
##     s(wrist)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  19.1030     0.2744   69.62   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##              edf Ref.df      F p-value    
## s(age)     5.818  6.966  2.780 0.00940 ** 
## s(abdom)   4.888  6.044 28.971 < 2e-16 ***
## s(hip)     6.688  7.793  3.278 0.00186 ** 
## s(ankle)   4.615  5.586  2.467 0.03046 *  
## s(biceps)  1.000  1.000  1.547 0.21520    
## s(forearm) 1.000  1.000  1.773 0.18475    
## s(wrist)   1.276  1.501 16.122 1.1e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.782   Deviance explained =   81%
## GCV = 17.265  Scale est. = 14.984    n = 199
plot(fat.gam3)

par(mfrow=c(1,2))

plot(fat.gam3, select=5)
abline(h=0, col="blue")
plot(fat.gam3, select=6)
abline(h=0, col="blue")
mtext("Non-significant factors - Gam [iteration 2]", cex=1.5, outer=T)

fat.gam4 <- gam(siri ~s(age)+s(abdom)+s(hip)+s(ankle)+s(wrist), data = fat2)
summary(fat.gam4)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## siri ~ s(age) + s(abdom) + s(hip) + s(ankle) + s(wrist)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  19.1030     0.2789    68.5   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##            edf Ref.df      F p-value    
## s(age)   5.495  6.637  2.253 0.03621 *  
## s(abdom) 4.837  5.988 30.732 < 2e-16 ***
## s(hip)   5.854  7.007  2.960 0.00581 ** 
## s(ankle) 4.307  5.238  2.099 0.06377 .  
## s(wrist) 1.000  1.000 18.707 2.5e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.775   Deviance explained =   80%
## GCV = 17.448  Scale est. = 15.476    n = 199
par(mfrow=c(2,3))
plot(fat.gam4)

######################################################
# test interaction models based on highest correlations

ggpairs(fat[,c(2,8,9,12,15)], axisLabels = "none", 
        title = "Pairs Plots of Fat Data")


int <- gam(siri ~s(neck,abdom,ankle,wrist,hip)+s(age), data = fat2)
anova(fat.gam4, int, test="F")
## Analysis of Deviance Table
## 
## Model 1: siri ~ s(age) + s(abdom) + s(hip) + s(ankle) + s(wrist)
## Model 2: siri ~ s(neck, abdom, ankle, wrist, hip) + s(age)
##   Resid. Df Resid. Dev     Df Deviance      F Pr(>F)
## 1    176.51     2731.7                              
## 2    133.54     2105.4 42.964   626.29 0.9246 0.6065
int2 <- gam(siri ~s(abdom, hip)+s(age)+s(ankle)+s(wrist), data=fat2) #sig?
anova(fat.gam4, int2, test="F")
## Analysis of Deviance Table
## 
## Model 1: siri ~ s(age) + s(abdom) + s(hip) + s(ankle) + s(wrist)
## Model 2: siri ~ s(abdom, hip) + s(age) + s(ankle) + s(wrist)
##   Resid. Df Resid. Dev      Df Deviance F Pr(>F)
## 1    176.51     2731.7                          
## 2    176.29     2864.3 0.21854  -132.62
int3 <- gam(siri ~s(wrist, hip)+s(abdom)+s(ankle)+s(age), data = fat2) 
anova(fat.gam4, int3, test="F") 
## Analysis of Deviance Table
## 
## Model 1: siri ~ s(age) + s(abdom) + s(hip) + s(ankle) + s(wrist)
## Model 2: siri ~ s(wrist, hip) + s(abdom) + s(ankle) + s(age)
##   Resid. Df Resid. Dev     Df Deviance      F Pr(>F)
## 1    176.51     2731.7                              
## 2    175.17     2699.1 1.3392   32.559 1.5778 0.2133
summary(int2)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## siri ~ s(abdom, hip) + s(age) + s(ankle) + s(wrist)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  19.1030     0.2857   66.86   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                 edf Ref.df      F  p-value    
## s(abdom,hip) 11.408 15.520 30.382  < 2e-16 ***
## s(age)        5.334  6.458  1.986   0.0664 .  
## s(ankle)      3.970  4.829  1.175   0.2790    
## s(wrist)      1.000  1.000 17.412 4.65e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.764   Deviance explained =   79%
## GCV = 18.341  Scale est. = 16.248    n = 199
par(mfrow=c(1,1))

plot(int2, select=1)

par(mfrow=c(2,2), oma=c(0,0,2,0))
vis.gam(int2, theta=-45, color="cm")
vis.gam(int2, theta=-15, color="cm")
vis.gam(int2, theta=45, color="cm")
vis.gam(int2, theta=90, color="cm")
mtext("Plane of interaction smoother: Abdomen, Hip", outer=T, cex=1.5)

fat.gint <- gam(siri ~s(abdom, hip)+s(age)+s(wrist)+s(ankle), data=fat2)
summary(fat.gint)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## siri ~ s(abdom, hip) + s(age) + s(wrist) + s(ankle)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  19.1030     0.2857   66.86   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                 edf Ref.df      F  p-value    
## s(abdom,hip) 11.408 15.520 30.382  < 2e-16 ***
## s(age)        5.334  6.458  1.986   0.0664 .  
## s(wrist)      1.000  1.000 17.412 4.65e-05 ***
## s(ankle)      3.970  4.829  1.175   0.2790    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.764   Deviance explained =   79%
## GCV = 18.341  Scale est. = 16.248    n = 199
par(mfrow=c(2,2),oma=c(0,0,2,0))
plot(fat.gint)
mtext("Predictors in Reduced GAM Model", cex=1.5, outer=T)

######################################################
# test semi-parametric models

semi1 <- gam(siri ~abdom:hip+s(age)+s(wrist)+s(ankle), data = fat2) 
semi2 <- gam(siri ~s(abdom, hip)+age+s(wrist)+s(ankle), data = fat2) 
semi3 <- gam(siri ~s(abdom, hip)+s(age)+wrist+s(ankle), data = fat2)
semi4 <- gam(siri ~s(abdom, hip)+s(age)+s(wrist)+ankle, data = fat2)


anova(semi1,fat.gint, test="F") #sig
## Analysis of Deviance Table
## 
## Model 1: siri ~ abdom:hip + s(age) + s(wrist) + s(ankle)
## Model 2: siri ~ s(abdom, hip) + s(age) + s(wrist) + s(ankle)
##   Resid. Df Resid. Dev     Df Deviance      F    Pr(>F)    
## 1    186.05     3769.0                                     
## 2    176.29     2864.3 9.7628   904.69 5.7034 2.764e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(semi2,fat.gint, test="F") #linear?
## Analysis of Deviance Table
## 
## Model 1: siri ~ s(abdom, hip) + age + s(wrist) + s(ankle)
## Model 2: siri ~ s(abdom, hip) + s(age) + s(wrist) + s(ankle)
##   Resid. Df Resid. Dev     Df Deviance   F  Pr(>F)  
## 1    181.33     3077.4                              
## 2    176.29     2864.3 5.0447    213.1 2.6 0.02651 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(semi3,fat.gint, test="F") #linear
## Analysis of Deviance Table
## 
## Model 1: siri ~ s(abdom, hip) + s(age) + wrist + s(ankle)
## Model 2: siri ~ s(abdom, hip) + s(age) + s(wrist) + s(ankle)
##   Resid. Df Resid. Dev          Df    Deviance  F    Pr(>F)    
## 1    176.29     2864.3                                         
## 2    176.29     2864.3 -7.3447e-06 -0.00023867  2 4.131e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(semi4,fat.gint, test="F") #linear?
## Analysis of Deviance Table
## 
## Model 1: siri ~ s(abdom, hip) + s(age) + s(wrist) + ankle
## Model 2: siri ~ s(abdom, hip) + s(age) + s(wrist) + s(ankle)
##   Resid. Df Resid. Dev     Df Deviance     F  Pr(>F)  
## 1    183.84     3161.2                                
## 2    176.29     2864.3 7.5553   296.95 2.419 0.01878 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
######################################################
# plot test linear hypothesis for three non-sig variables

par(mfrow=c(1,3), oma=c(0,0,2,0))
plot(fat.gint, residuals = T, select=2, ylim=c(-10,10))
abline(-2.5,0.0518, col="blue")
plot(fat.gint, residuals = T, select=3, ylim=c(-10,10))
abline(40,-2.2, col="blue")
plot(fat.gint, residuals = T, select=4, ylim=c(-10,10))
abline(6,-.26, col="blue")
mtext('Linear Predictor Testing for Fat GAM model', outer = T, cex = 1.5)

semi4 <- gam(siri~s(abdom,hip)+age+wrist+ankle, data=fat2)
summary(semi4)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## siri ~ s(abdom, hip) + age + wrist + ankle
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 61.86751    8.39583   7.369 5.31e-12 ***
## age          0.06966    0.03117   2.235   0.0266 *  
## wrist       -2.54056    0.49602  -5.122 7.47e-07 ***
## ankle        0.01699    0.21998   0.077   0.9385    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##               edf Ref.df     F p-value    
## s(abdom,hip) 7.32  10.12 46.52  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.743   Deviance explained = 75.7%
## GCV = 18.736  Scale est. = 17.67     n = 199
######################################################
# construct and check final semi-parametric reduced GAM
fat.finalforplot <- gam(siri~s(abdom, hip)+s(age)+s(wrist), data=fat2)

par(mfrow=c(1,2), oma=c(0,0,2,0))
plot(fat.finalforplot, residuals = T, select=2, ylim=c(-10,10))
abline(-2.5,0.0518, col="blue")
plot(fat.finalforplot, residuals = T, select=3, ylim=c(-10,10))
abline(40,-2.2, col="blue")
mtext('Linear Predictor Testing for Fat GAM model', outer = T, cex = 1.5)

layout(matrix(c(1,2,1,3), 2, 2, byrow = TRUE))
plot(fat.finalforplot, residuals=T)
mtext("Predictors for GAM - Reduced", cex=1.5, outer=T)

fat.finalgam <- gam(siri~s(abdom, hip)+age+wrist, data=fat2)
summary(fat.finalgam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## siri ~ s(abdom, hip) + age + wrist
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 62.03329    8.12754   7.632 1.11e-12 ***
## age          0.06935    0.03078   2.253   0.0254 *  
## wrist       -2.52731    0.46158  -5.475 1.38e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                edf Ref.df     F p-value    
## s(abdom,hip) 7.398  10.23 46.92  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.745   Deviance explained = 75.7%
## GCV = 18.541  Scale est. = 17.572    n = 199
gampred <- predict(fat.finalgam, fat.test)


######################################################
######################################################
# Support Vector Regression 
######################################################

fat.svr = svm(siri~.,data=fat)
summary(fat.svr)
## 
## Call:
## svm(formula = siri ~ ., data = fat)
## 
## 
## Parameters:
##    SVM-Type:  eps-regression 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  0.07142857 
##     epsilon:  0.1 
## 
## 
## Number of Support Vectors:  173
svrfpred <-predict(fat.svr, fat.test)  

######################################################
# reduce covariates -- set up validation set

set.seed(67890)

train2 <-sample(1:200, 140, replace=FALSE)
trainset <- fat[c(train2),]
testset <- fat[-c(train2),]
str(trainset)
## 'data.frame':    140 obs. of  15 variables:
##  $ siri   : num  13.5 10.3 18.8 22.6 11.3 27 23.6 4 27.3 20.4 ...
##  $ age    : int  55 23 66 54 50 72 41 47 34 48 ...
##  $ weight : num  125 188 171 198 162 ...
##  $ height : num  64 77.5 69.2 72 66.5 ...
##  $ adipos : num  21.5 22.1 25.1 26.9 25.9 24.7 29.7 20.2 29.7 23.6 ...
##  $ neck   : num  33.2 38 37.4 39.9 38.7 38.5 41.9 34 39.5 37 ...
##  $ chest  : num  87.7 96.6 102.7 107.6 99.4 ...
##  $ abdom  : num  76 85.3 98.6 100 86.7 ...
##  $ hip    : num  88.6 102.5 100.2 99.6 96.2 ...
##  $ thigh  : num  50.9 59.1 56.5 57.2 62.1 56.3 67.7 50.6 63.8 59.3 ...
##  $ knee   : num  35.4 37.6 39.3 38 39.3 36.6 41.3 34.4 42 38.4 ...
##  $ ankle  : num  19.1 23.2 22.7 22 23.3 22 24.7 21.9 23.4 22.4 ...
##  $ biceps : num  29.3 31.8 30.3 35.9 30.6 29.7 37.2 26.8 34 27.9 ...
##  $ forearm: num  25.7 29.7 28.7 30.2 27.8 26.3 31.8 25.8 31.2 26.2 ...
##  $ wrist  : num  16.9 18.3 19 18.9 18.2 18 20 16.8 18.5 17 ...
str(testset)
## 'data.frame':    60 obs. of  15 variables:
##  $ siri   : num  15 11.4 22.1 4.1 18.2 25.4 11.5 19.2 10 29.9 ...
##  $ age    : int  53 41 47 25 44 43 54 24 28 65 ...
##  $ weight : num  154 153 178 191 180 ...
##  $ height : num  69.2 69.2 70 74 69.5 ...
##  $ adipos : num  22.7 22.5 25.6 24.6 26.2 26 25 27.7 24.6 30.9 ...
##  $ neck   : num  37.6 36.4 40.2 38.1 39.2 39.6 37.4 39.2 37 40.8 ...
##  $ chest  : num  93.9 91.4 99.7 100.9 101.9 ...
##  $ abdom  : num  88.7 80.6 95 82.5 93.2 ...
##  $ hip    : num  94.5 92.3 98.6 99.9 100.6 ...
##  $ thigh  : num  53.7 54.3 62.3 62.9 58.9 59.5 59.7 71.2 60.8 59.2 ...
##  $ knee   : num  36.2 36.3 38.1 38.3 39.7 36.1 40.2 43.5 38.5 38.1 ...
##  $ ankle  : num  22 21.8 23.9 23.8 23.1 22 23.4 25.2 25 24 ...
##  $ biceps : num  28.5 29.6 35.3 35.9 31.4 30.1 27.9 36.1 31.6 35.9 ...
##  $ forearm: num  25.7 27.3 31.1 31.1 28.4 27.2 27 30.3 28 30.5 ...
##  $ wrist  : num  17.1 17.9 19.8 18.2 18.8 17.7 17.8 18.7 18.6 19.1 ...
num <- ncol(trainset) - 2
trainRMSE <- array(dim=num)
trainY <- trainset$siri
testRMSE <- array(dim=num)
testY <- testset$siri
x <- array(dim=num)
varnames <- array(dim=num)

par(mfrow=c(1,1), oma=c(0,0,0,0))
for(i in 1:num){
  x[i] <- i
  
  Model=fit(siri~., trainset, model="svm")
  predtrain=predict(Model, trainset)
  trainRMSE[i]=mmetric(trainY, predtrain, "RMSE")
  predtest=predict(Model, testset)
  testRMSE[i]=mmetric(testY, predtest, "RMSE")
  
  varimp <- Importance(Model,trainset, method="sensv")
  dev.set(dev.next())
  
  L=list(runs=1,sen=t(varimp$imp),sresponses=varimp$sresponses)
  mgraph(L,graph="IMP", col="gray", Grid=10)

  z <- order(varimp$imp, decreasing=F)
  IND <- z[2]
  var_to_remove <- names(trainset[IND])
  varnames[i] <- var_to_remove
  
  ymin=min(testRMSE[1:i])
  ymax=max(testRMSE[1:i])
  ymin=ymin-((ymax-ymin)/4)
  
  plot(x[1:i],testRMSE[1:i],type="b",col="blue",main="Variable Elimination", xlab="Covariates Removed",
       ylab="RMSE", ylim=c(ymin,ymax))
  legend("bottomright", c('validation set'), lty=1, col=c("blue"))
  text(x[1:i], testRMSE[1:i], labels=varnames, cex= 0.7, pos=3)
  
  trainset[IND] = NULL
  testset[IND] = NULL
  
  cat("\ntrainRMSE", trainRMSE[i])
  cat("\ntestRMSE", testRMSE[i])
  cat("\nremoving variable", var_to_remove)

  flush.console()
}

## 
## trainRMSE 3.405217
## testRMSE 5.055386
## removing variable biceps

## 
## trainRMSE 3.453475
## testRMSE 4.850632
## removing variable forearm

## 
## trainRMSE 3.522938
## testRMSE 4.954367
## removing variable thigh

## 
## trainRMSE 3.651424
## testRMSE 5.134237
## removing variable age

## 
## trainRMSE 3.580804
## testRMSE 5.200768
## removing variable hip

## 
## trainRMSE 3.672563
## testRMSE 5.318662
## removing variable weight

## 
## trainRMSE 3.670937
## testRMSE 5.347998
## removing variable knee

## 
## trainRMSE 3.671904
## testRMSE 5.278105
## removing variable chest

## 
## trainRMSE 3.719272
## testRMSE 5.089037
## removing variable height

## 
## trainRMSE 3.685936
## testRMSE 5.032729
## removing variable adipos

## 
## trainRMSE 3.855121
## testRMSE 4.885201
## removing variable neck

## 
## trainRMSE 3.833188
## testRMSE 4.794337
## removing variable ankle

## 
## trainRMSE 4.136493
## testRMSE 4.904577
## removing variable wrist
head(trainset)
##     siri abdom
## 74  13.5  76.0
## 145 10.3  85.3
## 80  18.8  98.6
## 56  22.6 100.0
## 98  11.3  86.7
## 85  27.0  99.8
plot(x,testRMSE,type="b",col="blue",main="Variable Elimination", xlab="Covariates Removed", 
     ylab="RMSE", ylim=c(ymin,ymax), sub="Starting RMSE = 5.01")
legend(11,4.7, c('validation set'), lty=1, col=c("blue"), bty='n')
text(x, testRMSE, labels=varnames, cex= 0.7, pos=3)
abline(h=min(testRMSE), lty=2)

######################################################
# check interactions

ggpairs(fat[,c(8,15)], axisLabels = "none", 
        title = "Pairs Plots of Fat Data")

trainset <- fat[c(train2),]
trainset <- trainset[,c(1,8,15)]
testset <- fat[-c(train2),]
testset <- testset[,c(1,8,15)]

trainset$ad.wr <- trainset$abdom * trainset$wrist
head(trainset)
##     siri abdom wrist   ad.wr
## 74  13.5  76.0  16.9 1284.40
## 145 10.3  85.3  18.3 1560.99
## 80  18.8  98.6  19.0 1873.40
## 56  22.6 100.0  18.9 1890.00
## 98  11.3  86.7  18.2 1577.94
## 85  27.0  99.8  18.0 1796.40
testset$ad.wr <- testset$abdom * testset$wrist
head(testset)
##     siri abdom wrist   ad.wr
## 220 15.0  88.7  17.1 1516.77
## 191 11.4  80.6  17.9 1442.74
## 113 22.1  95.0  19.8 1881.00
## 9    4.1  82.5  18.2 1501.50
## 92  18.2  93.2  18.8 1752.16
## 105 25.4  98.6  17.7 1745.22
num <- ncol(trainset) - 2
trainRMSE <- array(dim=num)
trainY <- trainset$siri
testRMSE <- array(dim=num)
testY <- testset$siri
x <- array(dim=num)
varnames <- array(dim=num)

par(mfrow=c(1,1), oma=c(0,0,0,0))
for(i in 1:num){
  x[i] <- i
  
  Model=fit(siri~., trainset, model="svm")
  predtrain=predict(Model, trainset)
  trainRMSE[i]=mmetric(trainY, predtrain, "RMSE")
  predtest=predict(Model, testset)
  testRMSE[i]=mmetric(testY, predtest, "RMSE")
  
  varimp <- Importance(Model,trainset, method="sensv")
  dev.set(dev.next())
  
  L=list(runs=1,sen=t(varimp$imp),sresponses=varimp$sresponses)
  mgraph(L,graph="IMP", col="gray", Grid=10)
  
  z <- order(varimp$imp, decreasing=F)
  IND <- z[2]
  var_to_remove <- names(trainset[IND])
  varnames[i] <- var_to_remove
  
  ymin=min(testRMSE[1:i])
  ymax=max(testRMSE[1:i])
  ymin=ymin-((ymax-ymin)/4)
  plot(x[1:i],testRMSE[1:i],type="b",col="blue",main="Variable Elimination-Interaction", xlab="Covariates Removed",
       ylab="RMSE", ylim=c(ymin,ymax))
  legend("bottomright", c('validation set'), lty=1, col=c("blue"))
  text(x[1:i], testRMSE[1:i], labels=varnames, cex= 0.7, pos=3)
  
  trainset[IND] = NULL
  testset[IND] = NULL
  
  cat("\ntrainRMSE", trainRMSE[i])
  cat("\ntestRMSE", testRMSE[i])
  cat("\nremoving variable", var_to_remove)
  flush.console()
  
}

## 
## trainRMSE 4.115702
## testRMSE 4.898484
## removing variable ad.wr

## 
## trainRMSE 4.113007
## testRMSE 4.962976
## removing variable wrist
head(trainset)
##     siri abdom
## 74  13.5  76.0
## 145 10.3  85.3
## 80  18.8  98.6
## 56  22.6 100.0
## 98  11.3  86.7
## 85  27.0  99.8
plot(x,testRMSE,type="b",col="blue",main="Variable Elimination-Interaction", xlab="Covariates Removed", 
     ylab="RMSE", ylim=c(ymin,ymax), sub="Starting RMSE = 4.78")
legend(1.8, 4.89, c('validation set'), lty=1, col=c("blue"), bty='n')
text(x, testRMSE, labels=varnames, cex= 0.7, pos=3)
abline(h=min(testRMSE), lty=2)

# no visible benefit to including interacton term

######################################################
# tune cost and epsilon parameters

tuneResult <- tune(svm, siri ~ abdom+wrist, data = fat,
                   ranges = list(epsilon = seq(0,1,0.1), cost = 2^(1:9)))

print(tuneResult)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  epsilon cost
##      0.3    2
## 
## - best performance: 23.19606
plot(tuneResult, main="Performance of Reduced SVR")

#epsilon = 0.4, cost = 4

fat.finalsvr <- svm(siri ~ abdom+wrist, data=fat, epsilon=0.4,
                    cost=4)
summary(fat.finalsvr)
## 
## Call:
## svm(formula = siri ~ abdom + wrist, data = fat, epsilon = 0.4, 
##     cost = 4)
## 
## 
## Parameters:
##    SVM-Type:  eps-regression 
##  SVM-Kernel:  radial 
##        cost:  4 
##       gamma:  0.5 
##     epsilon:  0.4 
## 
## 
## Number of Support Vectors:  94
svrpred <- predict(fat.finalsvr, fat.test) 

######################################################
######################################################
# Compare various models
######################################################

# red lm: siri~weight+abdom+thigh+wrist
# red gam: siri~s(abdom:hip)+age+wrist
# red svr: siri~abdom+wrist

######################################################
######################################################
######################################################
# root mean square errors

RMSELMF <- rmse(fat.test$siri - lmfpred)
RMSELMF
## [1] 5.148925
RMSELM <- rmse(fat.test$siri - lmpred)
RMSELM
## [1] 5.089915
RMSEGAMF = rmse(fat.test$siri - gamfpred)
RMSEGAMF
## [1] 6.50116
RMSEGAM = rmse(fat.test$siri - gampred)
RMSEGAM
## [1] 5.179132
RMSESVF <- rmse(fat.test$siri - svrfpred)
RMSESVF
## [1] 4.983324
RMSESV <- rmse(fat.test$siri - svrpred)
RMSESV
## [1] 4.931453
Full.rmse <- cbind(RMSELMF, RMSEGAMF, RMSESVF)
Red.rmse  <- cbind(RMSELM, RMSEGAM, RMSESV)

Total.rmse <-rbind(Full.rmse, Red.rmse)
rownames(Total.rmse) <- c("Full", "Reduced")
colnames(Total.rmse) <- c("LM", "GAM", "SVR")
Total.rmse
##               LM      GAM      SVR
## Full    5.148925 6.501160 4.983324
## Reduced 5.089915 5.179132 4.931453
######################################################
######################################################
######################################################
# predictor plots

par(mfrow=c(1,1), oma=c(0,0,0,0))

ggplot(fat.test, aes(x = lmpred, y = siri)) +
  geom_point() +
  xlab("Predicted Siri") +
  ylab("Observed Siri") +
  ggtitle("Model vs Test Data: Reduced LM") +
  geom_abline(intercept=0,slope=1)

ggplot(fat.test, aes(x = gampred, y = siri)) +
  geom_point() +
  xlab("Predicted Siri") +
  ylab("Observed Siri") +
  ggtitle("Model vs Test Data: Reduced GAM") +
  geom_abline(intercept=0,slope=1)

ggplot(fat.test, aes(x = svrpred, y = siri)) +
  geom_point() +
  xlab("Predicted Siri") +
  ylab("Observed Siri") +
  ggtitle("Model vs Test Data: Reduced SVR") +
  geom_abline(intercept=0,slope=1)

x <-range(fat.test$abdom)
x <- seq(x[1], x[2], length.out=50)    
y <- range(fat.test$wrist)
y <- seq(y[1], y[2], length.out=50)

z <- outer(x,y, 
           function(abdom,wrist)
             predict(fat.finalsvr, data.frame(abdom,wrist)))
p <- persp(x,y,z, theta=30, phi=30, 
           col="lightblue", expand = 0.5,shade = 0.25,
           xlab="Abdom", ylab="Wrist", zlab="Siri.SVR", main="Predicted Siri - SVR")

obs <- trans3d(fat.test$abdom, fat.test$wrist, fat.test$siri,p)
pred <- trans3d(fat.test$abdom, fat.test$wrist,svrpred,p)
points(obs, col="red", pch=16)
segments(obs$x, obs$y, pred$x, pred$y)

######################################################
######################################################
######################################################
# Residuals plots

par(mfrow=c(1,2), oma=c(0,0,2,0))
plot(residuals(fat.finallm)~predict(fat.finallm), xlab="Predicted", ylab="Residuals", main="Residuals Plot")
abline(h=0)
qqnorm(residuals(fat.finallm))
qqline(residuals(fat.finallm))
mtext("Residuals for LM Reduced Model", cex=1.5, outer=T)

par(mfrow=c(1,2), oma=c(0,0,2,0))
plot(residuals(fat.finalgam)~predict(fat.finalgam), xlab="Predicted", ylab="Residuals", main="Residuals Plot")
abline(h=0)
qqnorm(residuals(fat.finalgam))
qqline(residuals(fat.finalgam))
mtext("Residuals for GAM Reduced Model", cex=1.5, outer=T)

par(mfrow=c(1,2), oma=c(0,0,2,0))
plot(residuals(fat.finalsvr)~predict(fat.finalsvr), xlab="Predicted", ylab="Residuals", main="Residuals Plot")
abline(h=0)
qqnorm(residuals(fat.finalsvr))
qqline(residuals(fat.finalsvr))
mtext("Residuals for SVR Reduced Model", cex=1.5, outer=T)