######################################################
# Stat 6801 - Project
#
# Sarah Rathwell
#
# Objective - Investigate SVR vs GAM on Fat data.
######################################################
rm(list=ls())
par(mfrow=c(1,1), oma=c(0,0,0,0))
library(ggplot2)
library(mgcv)
## Loading required package: nlme
## This is mgcv 1.8-12. For overview type 'help("mgcv-package")'.
library(faraway)
library(e1071)
library(rminer)
library(GGally)
##
## Attaching package: 'GGally'
## The following object is masked from 'package:faraway':
##
## happy
library(scatterplot3d)
######################################################
cat(" Investigate SVR vs GAM on Fat data ",
"Last modification:",date(),'\n')
## Investigate SVR vs GAM on Fat data Last modification: Wed Dec 7 22:48:46 2016
######################################################
# get data
data(fat, package='faraway')
str(fat)
## 'data.frame': 252 obs. of 18 variables:
## $ brozek : num 12.6 6.9 24.6 10.9 27.8 20.6 19 12.8 5.1 12 ...
## $ siri : num 12.3 6.1 25.3 10.4 28.7 20.9 19.2 12.4 4.1 11.7 ...
## $ density: num 1.07 1.09 1.04 1.08 1.03 ...
## $ age : int 23 22 22 26 24 24 26 25 25 23 ...
## $ weight : num 154 173 154 185 184 ...
## $ height : num 67.8 72.2 66.2 72.2 71.2 ...
## $ adipos : num 23.7 23.4 24.7 24.9 25.6 26.5 26.2 23.6 24.6 25.8 ...
## $ free : num 135 161 116 165 133 ...
## $ neck : num 36.2 38.5 34 37.4 34.4 39 36.4 37.8 38.1 42.1 ...
## $ chest : num 93.1 93.6 95.8 101.8 97.3 ...
## $ abdom : num 85.2 83 87.9 86.4 100 94.4 90.7 88.5 82.5 88.6 ...
## $ hip : num 94.5 98.7 99.2 101.2 101.9 ...
## $ thigh : num 59 58.7 59.6 60.1 63.2 66 58.4 60 62.9 63.1 ...
## $ knee : num 37.3 37.3 38.9 37.3 42.2 42 38.3 39.4 38.3 41.7 ...
## $ ankle : num 21.9 23.4 24 22.8 24 25.6 22.9 23.2 23.8 25 ...
## $ biceps : num 32 30.5 28.8 32.4 32.2 35.7 31.9 30.5 35.9 35.6 ...
## $ forearm: num 27.4 28.9 25.2 29.4 27.7 30.6 27.8 29 31.1 30 ...
## $ wrist : num 17.1 18.2 16.6 18.2 17.7 18.8 17.7 18.8 18.2 19.2 ...
head(fat)
## brozek siri density age weight height adipos free neck chest abdom
## 1 12.6 12.3 1.0708 23 154.25 67.75 23.7 134.9 36.2 93.1 85.2
## 2 6.9 6.1 1.0853 22 173.25 72.25 23.4 161.3 38.5 93.6 83.0
## 3 24.6 25.3 1.0414 22 154.00 66.25 24.7 116.0 34.0 95.8 87.9
## 4 10.9 10.4 1.0751 26 184.75 72.25 24.9 164.7 37.4 101.8 86.4
## 5 27.8 28.7 1.0340 24 184.25 71.25 25.6 133.1 34.4 97.3 100.0
## 6 20.6 20.9 1.0502 24 210.25 74.75 26.5 167.0 39.0 104.5 94.4
## hip thigh knee ankle biceps forearm wrist
## 1 94.5 59.0 37.3 21.9 32.0 27.4 17.1
## 2 98.7 58.7 37.3 23.4 30.5 28.9 18.2
## 3 99.2 59.6 38.9 24.0 28.8 25.2 16.6
## 4 101.2 60.1 37.3 22.8 32.4 29.4 18.2
## 5 101.9 63.2 42.2 24.0 32.2 27.7 17.7
## 6 107.8 66.0 42.0 25.6 35.7 30.6 18.8
# clean up known errors
fat <- fat[,-c(1,3,8)]
fat$height[42] <- 69.5
fat <- fat[-182,]
head(fat)
## siri age weight height adipos neck chest abdom hip thigh knee ankle
## 1 12.3 23 154.25 67.75 23.7 36.2 93.1 85.2 94.5 59.0 37.3 21.9
## 2 6.1 22 173.25 72.25 23.4 38.5 93.6 83.0 98.7 58.7 37.3 23.4
## 3 25.3 22 154.00 66.25 24.7 34.0 95.8 87.9 99.2 59.6 38.9 24.0
## 4 10.4 26 184.75 72.25 24.9 37.4 101.8 86.4 101.2 60.1 37.3 22.8
## 5 28.7 24 184.25 71.25 25.6 34.4 97.3 100.0 101.9 63.2 42.2 24.0
## 6 20.9 24 210.25 74.75 26.5 39.0 104.5 94.4 107.8 66.0 42.0 25.6
## biceps forearm wrist
## 1 32.0 27.4 17.1
## 2 30.5 28.9 18.2
## 3 28.8 25.2 16.6
## 4 32.4 29.4 18.2
## 5 32.2 27.7 17.7
## 6 35.7 30.6 18.8
str(fat)
## 'data.frame': 251 obs. of 15 variables:
## $ siri : num 12.3 6.1 25.3 10.4 28.7 20.9 19.2 12.4 4.1 11.7 ...
## $ age : int 23 22 22 26 24 24 26 25 25 23 ...
## $ weight : num 154 173 154 185 184 ...
## $ height : num 67.8 72.2 66.2 72.2 71.2 ...
## $ adipos : num 23.7 23.4 24.7 24.9 25.6 26.5 26.2 23.6 24.6 25.8 ...
## $ neck : num 36.2 38.5 34 37.4 34.4 39 36.4 37.8 38.1 42.1 ...
## $ chest : num 93.1 93.6 95.8 101.8 97.3 ...
## $ abdom : num 85.2 83 87.9 86.4 100 94.4 90.7 88.5 82.5 88.6 ...
## $ hip : num 94.5 98.7 99.2 101.2 101.9 ...
## $ thigh : num 59 58.7 59.6 60.1 63.2 66 58.4 60 62.9 63.1 ...
## $ knee : num 37.3 37.3 38.9 37.3 42.2 42 38.3 39.4 38.3 41.7 ...
## $ ankle : num 21.9 23.4 24 22.8 24 25.6 22.9 23.2 23.8 25 ...
## $ biceps : num 32 30.5 28.8 32.4 32.2 35.7 31.9 30.5 35.9 35.6 ...
## $ forearm: num 27.4 28.9 25.2 29.4 27.7 30.6 27.8 29 31.1 30 ...
## $ wrist : num 17.1 18.2 16.6 18.2 17.7 18.8 17.7 18.8 18.2 19.2 ...
######################################################
# split into training/test -- 80/20
set.seed(12345)
train <-sample(1:251, 200, replace=FALSE)
fat.test <- fat[-c(train),]
str(fat.test)
## 'data.frame': 51 obs. of 15 variables:
## $ siri : num 25.3 20.9 19.2 12.4 29 16.5 7.9 5.7 35.2 32.6 ...
## $ age : int 22 24 26 25 34 33 34 29 46 50 ...
## $ weight : num 154 210 181 176 196 ...
## $ height : num 66.2 74.8 69.8 72.5 71 ...
## $ adipos : num 24.7 26.5 26.2 23.6 27.3 27.6 20.3 22.2 48.9 31.8 ...
## $ neck : num 34 39 36.4 37.8 38.9 40 36.2 37.3 51.2 40.2 ...
## $ chest : num 95.8 104.5 105.1 99.6 101.9 ...
## $ abdom : num 87.9 94.4 90.7 88.5 96.4 ...
## $ hip : num 99.2 107.8 100.3 97.1 105.2 ...
## $ thigh : num 59.6 66 58.4 60 64.8 65.8 51.7 58.5 87.3 61.3 ...
## $ knee : num 38.9 42 38.3 39.4 40.8 40.6 34.7 38.8 49.1 41.1 ...
## $ ankle : num 24 25.6 22.9 23.2 23.1 24 21.4 21.5 29.6 24.7 ...
## $ biceps : num 28.8 35.7 31.9 30.5 36.2 37.1 28.7 30.1 45 34.1 ...
## $ forearm: num 25.2 30.6 27.8 29 30.8 30.1 27 26.4 29 31 ...
## $ wrist : num 16.6 18.8 17.7 18.8 17.3 18.2 16.5 17.9 21.4 18.3 ...
fat <- fat[c(train),]
str(fat)
## 'data.frame': 200 obs. of 15 variables:
## $ siri : num 26.6 15 11.4 12.4 22.1 34.5 18.8 13.8 13.1 17 ...
## $ age : int 39 53 41 54 47 45 66 50 37 65 ...
## $ weight : num 219 154 153 153 178 ...
## $ height : num 74.2 69.2 69.2 70.5 70 ...
## $ adipos : num 28 22.7 22.5 24.5 25.6 39.1 25.1 25.6 23.7 20.8 ...
## $ neck : num 40 37.6 36.4 38.5 40.2 43.2 37.4 37.7 35.3 34.7 ...
## $ chest : num 108.5 93.9 91.4 99 99.7 ...
## $ abdom : num 104.6 88.7 80.6 91.8 95 ...
## $ hip : num 109.8 94.5 92.3 96.2 98.6 ...
## $ thigh : num 68.1 53.7 54.3 57.7 62.3 72.5 56.5 58.5 60 50.7 ...
## $ knee : num 42.8 36.2 36.3 38.1 38.1 39.6 39.3 36.6 38.1 33.4 ...
## $ ankle : num 24.1 22 21.8 23.9 23.9 26.6 22.7 23.5 22 20.1 ...
## $ biceps : num 35.6 28.5 29.6 31.4 35.3 36.4 30.3 34.4 31.5 28.5 ...
## $ forearm: num 29 25.7 27.3 29.9 31.1 32.7 28.7 29.2 26.6 24.8 ...
## $ wrist : num 19 17.1 17.9 18.9 19.8 21.4 19 18 16.7 16.5 ...
######################################################
# predict rmse for test set
rmse <- function(error)
{
sqrt(mean(error^2))
}
######################################################
######################################################
# Linear Regression
######################################################
fat.lm <- lm(siri~., data=fat)
summary(fat.lm)
##
## Call:
## lm(formula = siri ~ ., data = fat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.9508 -2.8269 -0.0407 2.8213 9.5360
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -14.06339 47.06987 -0.299 0.76545
## age 0.07981 0.03664 2.178 0.03066 *
## weight -0.06779 0.13023 -0.521 0.60334
## height 0.02205 0.68108 0.032 0.97421
## adipos 0.34640 0.97917 0.354 0.72391
## neck -0.16669 0.28581 -0.583 0.56047
## chest -0.19934 0.11922 -1.672 0.09620 .
## abdom 0.95065 0.10054 9.455 < 2e-16 ***
## hip -0.15544 0.16224 -0.958 0.33926
## thigh 0.23911 0.15708 1.522 0.12966
## knee -0.19102 0.27853 -0.686 0.49369
## ankle 0.11368 0.23016 0.494 0.62196
## biceps 0.08334 0.17971 0.464 0.64340
## forearm 0.26789 0.21651 1.237 0.21754
## wrist -1.89761 0.61070 -3.107 0.00219 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.174 on 185 degrees of freedom
## Multiple R-squared: 0.7677, Adjusted R-squared: 0.7501
## F-statistic: 43.66 on 14 and 185 DF, p-value: < 2.2e-16
lmfpred <- predict(fat.lm, fat.test)
halfnorm(cooks.distance(fat.lm), main="Cooks Distance - Full LM")

fat3 <- fat[-c(4,111),]
fat.lm2 <-lm(siri~., data=fat3)
summary(fat.lm2)
##
## Call:
## lm(formula = siri ~ ., data = fat3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.039 -2.789 -0.101 2.726 9.337
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -103.68006 58.57767 -1.770 0.0784 .
## age 0.06533 0.03662 1.784 0.0761 .
## weight -0.31887 0.16319 -1.954 0.0522 .
## height 1.24781 0.82455 1.513 0.1319
## adipos 1.90002 1.14696 1.657 0.0993 .
## neck -0.12149 0.28043 -0.433 0.6654
## chest -0.17381 0.11714 -1.484 0.1396
## abdom 0.97340 0.09889 9.844 <2e-16 ***
## hip -0.19855 0.15971 -1.243 0.2154
## thigh 0.31355 0.15593 2.011 0.0458 *
## knee -0.07687 0.27541 -0.279 0.7805
## ankle -0.21148 0.28049 -0.754 0.4518
## biceps 0.05798 0.17703 0.328 0.7437
## forearm 0.29083 0.21232 1.370 0.1724
## wrist -1.55676 0.61035 -2.551 0.0116 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.09 on 183 degrees of freedom
## Multiple R-squared: 0.7777, Adjusted R-squared: 0.7607
## F-statistic: 45.74 on 14 and 183 DF, p-value: < 2.2e-16
halfnorm(cooks.distance(fat.lm2))

fat.lm3 <-lm(siri~age+weight+adipos+abdom+thigh+wrist, data=fat3)
summary(fat.lm3)
##
## Call:
## lm(formula = siri ~ age + weight + adipos + abdom + thigh + wrist,
## data = fat3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.5440 -2.8313 -0.1751 3.0129 9.4796
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -36.09611 9.24908 -3.903 0.000132 ***
## age 0.05428 0.03408 1.593 0.112859
## weight -0.13765 0.03641 -3.780 0.000209 ***
## adipos 0.10388 0.23615 0.440 0.660528
## abdom 0.91826 0.09249 9.928 < 2e-16 ***
## thigh 0.28398 0.12412 2.288 0.023238 *
## wrist -1.49123 0.53461 -2.789 0.005816 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.091 on 191 degrees of freedom
## Multiple R-squared: 0.7678, Adjusted R-squared: 0.7605
## F-statistic: 105.3 on 6 and 191 DF, p-value: < 2.2e-16
anova(fat.lm3, fat.lm2, test="F")
## Analysis of Variance Table
##
## Model 1: siri ~ age + weight + adipos + abdom + thigh + wrist
## Model 2: siri ~ age + weight + height + adipos + neck + chest + abdom +
## hip + thigh + knee + ankle + biceps + forearm + wrist
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 191 3197.3
## 2 183 3060.6 8 136.73 1.0219 0.421
fat.lm4 <-lm(siri~weight+abdom+thigh+wrist, data=fat3)
summary(fat.lm4)
##
## Call:
## lm(formula = siri ~ weight + abdom + thigh + wrist, data = fat3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.0488 -2.8607 -0.0674 3.1068 9.5085
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -39.55945 8.84409 -4.473 1.32e-05 ***
## weight -0.16021 0.03354 -4.777 3.51e-06 ***
## abdom 1.01287 0.05791 17.490 < 2e-16 ***
## thigh 0.23034 0.11134 2.069 0.0399 *
## wrist -1.10578 0.48013 -2.303 0.0223 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.098 on 193 degrees of freedom
## Multiple R-squared: 0.7646, Adjusted R-squared: 0.7597
## F-statistic: 156.7 on 4 and 193 DF, p-value: < 2.2e-16
anova(fat.lm4, fat.lm2, test="F")
## Analysis of Variance Table
##
## Model 1: siri ~ weight + abdom + thigh + wrist
## Model 2: siri ~ age + weight + height + adipos + neck + chest + abdom +
## hip + thigh + knee + ankle + biceps + forearm + wrist
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 193 3241.7
## 2 183 3060.6 10 181.1 1.0829 0.3775
fat.lm5<-lm(siri~weight+abdom+wrist, data=fat3)
anova(fat.lm5, fat.lm2, test="F")
## Analysis of Variance Table
##
## Model 1: siri ~ weight + abdom + wrist
## Model 2: siri ~ age + weight + height + adipos + neck + chest + abdom +
## hip + thigh + knee + ankle + biceps + forearm + wrist
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 194 3313.5
## 2 183 3060.6 11 252.98 1.3752 0.1877
######################################################
# check interactions
ggpairs(fat[,c(3,8,15)], axisLabels = "none",
title = "Pairs Plots of Fat Data")

fat.lint <-lm(siri~weight*abdom*wrist, data=fat3)
summary(fat.lint)
##
## Call:
## lm(formula = siri ~ weight * abdom * wrist, data = fat3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.9940 -2.8957 -0.2935 2.9756 9.9287
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.185e+02 2.187e+02 0.542 0.589
## weight 1.935e-01 1.116e+00 0.173 0.863
## abdom -1.770e+00 2.721e+00 -0.651 0.516
## wrist -1.118e+01 1.231e+01 -0.909 0.365
## weight:abdom 3.520e-03 1.122e-02 0.314 0.754
## weight:wrist -7.064e-03 6.057e-02 -0.117 0.907
## abdom:wrist 1.697e-01 1.511e-01 1.123 0.263
## weight:abdom:wrist -2.901e-04 6.074e-04 -0.478 0.633
##
## Residual standard error: 4.133 on 190 degrees of freedom
## Multiple R-squared: 0.7643, Adjusted R-squared: 0.7556
## F-statistic: 88.01 on 7 and 190 DF, p-value: < 2.2e-16
anova(fat.lm2, fat.lint, test="F")
## Analysis of Variance Table
##
## Model 1: siri ~ age + weight + height + adipos + neck + chest + abdom +
## hip + thigh + knee + ankle + biceps + forearm + wrist
## Model 2: siri ~ weight * abdom * wrist
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 183 3060.6
## 2 190 3245.4 -7 -184.9 1.5794 0.144
fat.lint2 <-lm(siri~weight*abdom+wrist, data=fat3)
summary(fat.lint2)
##
## Call:
## lm(formula = siri ~ weight * abdom + wrist, data = fat3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.7773 -2.8812 -0.2731 3.0726 9.4313
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.874e+01 1.633e+01 -2.984 0.00321 **
## weight -3.765e-03 8.718e-02 -0.043 0.96560
## abdom 1.227e+00 1.744e-01 7.036 3.35e-11 ***
## wrist -1.388e+00 4.725e-01 -2.937 0.00372 **
## weight:abdom -1.180e-03 8.767e-04 -1.347 0.17970
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.124 on 193 degrees of freedom
## Multiple R-squared: 0.7616, Adjusted R-squared: 0.7567
## F-statistic: 154.1 on 4 and 193 DF, p-value: < 2.2e-16
anova(fat.lm2, fat.lint2, test="F")
## Analysis of Variance Table
##
## Model 1: siri ~ age + weight + height + adipos + neck + chest + abdom +
## hip + thigh + knee + ankle + biceps + forearm + wrist
## Model 2: siri ~ weight * abdom + wrist
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 183 3060.6
## 2 193 3282.7 -10 -222.14 1.3283 0.2181
fat.lint3 <-lm(siri~weight*wrist+abdom, data=fat3)
summary(fat.lint3)
##
## Call:
## lm(formula = siri ~ weight * wrist + abdom, data = fat3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.7237 -2.9877 -0.1459 3.1643 9.7639
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -63.983806 31.975156 -2.001 0.0468 *
## weight 0.078456 0.175120 0.448 0.6546
## wrist 0.582161 1.768954 0.329 0.7424
## abdom 1.003974 0.058282 17.226 <2e-16 ***
## weight:wrist -0.010533 0.009388 -1.122 0.2633
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.13 on 193 degrees of freedom
## Multiple R-squared: 0.7609, Adjusted R-squared: 0.756
## F-statistic: 153.6 on 4 and 193 DF, p-value: < 2.2e-16
anova(fat.lm2, fat.lint3, test="F")
## Analysis of Variance Table
##
## Model 1: siri ~ age + weight + height + adipos + neck + chest + abdom +
## hip + thigh + knee + ankle + biceps + forearm + wrist
## Model 2: siri ~ weight * wrist + abdom
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 183 3060.6
## 2 193 3292.1 -10 -231.51 1.3843 0.1904
#not enough change to merit the increase in complexity
######################################################
# fit reduced model
fat.finallm <-lm(siri~abdom+weight+wrist, data=fat3)
summary(fat.finallm)
##
## Call:
## lm(formula = siri ~ abdom + weight + wrist, data = fat3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.9236 -2.9650 -0.2539 3.0181 9.7623
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -29.0572 7.3029 -3.979 9.78e-05 ***
## abdom 1.0058 0.0583 17.253 < 2e-16 ***
## weight -0.1158 0.0260 -4.455 1.41e-05 ***
## wrist -1.3307 0.4716 -2.822 0.00527 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.133 on 194 degrees of freedom
## Multiple R-squared: 0.7594, Adjusted R-squared: 0.7556
## F-statistic: 204.1 on 3 and 194 DF, p-value: < 2.2e-16
lmpred <- predict(fat.finallm, fat.test)
######################################################
######################################################
# GAM Regression
######################################################
fat.gam <- gam(siri ~ s(age)+s(weight)+s(height)+s(adipos)+s(neck)+s(chest)+s(abdom)+s(hip)+
s(thigh)+s(knee)+s(ankle)+s(biceps)+s(forearm)+s(wrist), data = fat)
summary(fat.gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## siri ~ s(age) + s(weight) + s(height) + s(adipos) + s(neck) +
## s(chest) + s(abdom) + s(hip) + s(thigh) + s(knee) + s(ankle) +
## s(biceps) + s(forearm) + s(wrist)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.1800 0.2555 75.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(age) 5.713 6.836 2.942 0.00796 **
## s(weight) 2.884 3.733 0.795 0.52242
## s(height) 1.673 2.092 1.060 0.35214
## s(adipos) 4.199 5.245 1.759 0.11430
## s(neck) 1.000 1.000 0.934 0.33532
## s(chest) 1.000 1.000 1.945 0.16510
## s(abdom) 4.936 5.998 17.484 < 2e-16 ***
## s(hip) 6.276 7.344 2.584 0.01421 *
## s(thigh) 1.000 1.000 2.435 0.12066
## s(knee) 2.286 2.936 1.185 0.28933
## s(ankle) 3.607 4.412 1.636 0.18697
## s(biceps) 8.075 8.731 1.643 0.07883 .
## s(forearm) 1.000 1.000 3.981 0.04776 *
## s(wrist) 1.626 2.015 3.234 0.04055 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.813 Deviance explained = 85.5%
## GCV = 16.981 Scale est. = 13.052 n = 200
fat.gamcooks <- cooks.distance(fat.gam)
halfnorm(fat.gamcooks, main="Cooks Distance - Full GAM")

gamfpred <- predict(fat.gam, fat.test)
fat[c(6,4),]
## siri age weight height adipos neck chest abdom hip thigh knee ankle
## 41 34.5 45 262.75 68.75 39.1 43.2 128.3 126.2 125.6 72.5 39.6 26.6
## 221 12.4 54 153.25 70.50 24.5 38.5 99.0 91.8 96.2 57.7 38.1 23.9
## biceps forearm wrist
## 41 36.4 32.7 21.4
## 221 31.4 29.9 18.9
fat2 <- fat[-c(6),]
fat.gam2 <- gam(siri ~ s(age)+s(weight)+s(height)+s(adipos)+s(neck)+s(chest)+s(abdom)+s(hip)+
s(thigh)+s(knee)+s(ankle)+s(biceps)+s(forearm)+s(wrist), data = fat2)
summary(fat.gam2)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## siri ~ s(age) + s(weight) + s(height) + s(adipos) + s(neck) +
## s(chest) + s(abdom) + s(hip) + s(thigh) + s(knee) + s(ankle) +
## s(biceps) + s(forearm) + s(wrist)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.1030 0.2557 74.7 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(age) 5.778 6.897 2.950 0.00737 **
## s(weight) 2.710 3.526 0.738 0.54619
## s(height) 1.496 1.836 0.718 0.38180
## s(adipos) 4.315 5.428 1.686 0.11797
## s(neck) 1.000 1.000 0.881 0.34940
## s(chest) 1.000 1.000 1.849 0.17583
## s(abdom) 4.869 5.955 17.455 < 2e-16 ***
## s(hip) 5.478 6.579 2.526 0.02002 *
## s(thigh) 1.000 1.000 2.361 0.12643
## s(knee) 2.212 2.839 1.017 0.34260
## s(ankle) 3.666 4.484 1.667 0.17783
## s(biceps) 8.138 8.760 1.752 0.06132 .
## s(forearm) 1.000 1.000 4.348 0.03868 *
## s(wrist) 1.751 2.193 3.052 0.04078 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.811 Deviance explained = 85.3%
## GCV = 16.863 Scale est. = 13.015 n = 199
fat.gamcooks2 <- cooks.distance(fat.gam2)
halfnorm(fat.gamcooks2)

par(mfrow=c(2,4), oma=c(0,0,2,0))
plot(fat.gam2, select=2)
abline(h=1, col="blue")
plot(fat.gam2, select=3)
abline(h=0, col="blue")
plot(fat.gam2, select=4)
abline(h=-1, col="blue")
plot(fat.gam2, select=5)
abline(h=0, col="blue")
plot(fat.gam2, select=6)
abline(h=0, col="blue")
plot(fat.gam2, select=9)
abline(h=0, col="blue")
plot(fat.gam2, select=10)
abline(h=-0.3, col="blue")
mtext("Non-significant Factors - GAM", cex=1.5, outer=T)
fat.gam3 <- gam(siri ~s(age)+s(abdom)+s(hip)+s(ankle)+s(biceps)+s(forearm)+s(wrist), data = fat2)
summary(fat.gam3)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## siri ~ s(age) + s(abdom) + s(hip) + s(ankle) + s(biceps) + s(forearm) +
## s(wrist)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.1030 0.2744 69.62 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(age) 5.818 6.966 2.780 0.00940 **
## s(abdom) 4.888 6.044 28.971 < 2e-16 ***
## s(hip) 6.688 7.793 3.278 0.00186 **
## s(ankle) 4.615 5.586 2.467 0.03046 *
## s(biceps) 1.000 1.000 1.547 0.21520
## s(forearm) 1.000 1.000 1.773 0.18475
## s(wrist) 1.276 1.501 16.122 1.1e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.782 Deviance explained = 81%
## GCV = 17.265 Scale est. = 14.984 n = 199
plot(fat.gam3)

par(mfrow=c(1,2))

plot(fat.gam3, select=5)
abline(h=0, col="blue")
plot(fat.gam3, select=6)
abline(h=0, col="blue")
mtext("Non-significant factors - Gam [iteration 2]", cex=1.5, outer=T)

fat.gam4 <- gam(siri ~s(age)+s(abdom)+s(hip)+s(ankle)+s(wrist), data = fat2)
summary(fat.gam4)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## siri ~ s(age) + s(abdom) + s(hip) + s(ankle) + s(wrist)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.1030 0.2789 68.5 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(age) 5.495 6.637 2.253 0.03621 *
## s(abdom) 4.837 5.988 30.732 < 2e-16 ***
## s(hip) 5.854 7.007 2.960 0.00581 **
## s(ankle) 4.307 5.238 2.099 0.06377 .
## s(wrist) 1.000 1.000 18.707 2.5e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.775 Deviance explained = 80%
## GCV = 17.448 Scale est. = 15.476 n = 199
par(mfrow=c(2,3))
plot(fat.gam4)
######################################################
# test interaction models based on highest correlations
ggpairs(fat[,c(2,8,9,12,15)], axisLabels = "none",
title = "Pairs Plots of Fat Data")
int <- gam(siri ~s(neck,abdom,ankle,wrist,hip)+s(age), data = fat2)
anova(fat.gam4, int, test="F")
## Analysis of Deviance Table
##
## Model 1: siri ~ s(age) + s(abdom) + s(hip) + s(ankle) + s(wrist)
## Model 2: siri ~ s(neck, abdom, ankle, wrist, hip) + s(age)
## Resid. Df Resid. Dev Df Deviance F Pr(>F)
## 1 176.51 2731.7
## 2 133.54 2105.4 42.964 626.29 0.9246 0.6065
int2 <- gam(siri ~s(abdom, hip)+s(age)+s(ankle)+s(wrist), data=fat2) #sig?
anova(fat.gam4, int2, test="F")
## Analysis of Deviance Table
##
## Model 1: siri ~ s(age) + s(abdom) + s(hip) + s(ankle) + s(wrist)
## Model 2: siri ~ s(abdom, hip) + s(age) + s(ankle) + s(wrist)
## Resid. Df Resid. Dev Df Deviance F Pr(>F)
## 1 176.51 2731.7
## 2 176.29 2864.3 0.21854 -132.62
int3 <- gam(siri ~s(wrist, hip)+s(abdom)+s(ankle)+s(age), data = fat2)
anova(fat.gam4, int3, test="F")
## Analysis of Deviance Table
##
## Model 1: siri ~ s(age) + s(abdom) + s(hip) + s(ankle) + s(wrist)
## Model 2: siri ~ s(wrist, hip) + s(abdom) + s(ankle) + s(age)
## Resid. Df Resid. Dev Df Deviance F Pr(>F)
## 1 176.51 2731.7
## 2 175.17 2699.1 1.3392 32.559 1.5778 0.2133
summary(int2)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## siri ~ s(abdom, hip) + s(age) + s(ankle) + s(wrist)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.1030 0.2857 66.86 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(abdom,hip) 11.408 15.520 30.382 < 2e-16 ***
## s(age) 5.334 6.458 1.986 0.0664 .
## s(ankle) 3.970 4.829 1.175 0.2790
## s(wrist) 1.000 1.000 17.412 4.65e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.764 Deviance explained = 79%
## GCV = 18.341 Scale est. = 16.248 n = 199
par(mfrow=c(1,1))

plot(int2, select=1)

par(mfrow=c(2,2), oma=c(0,0,2,0))
vis.gam(int2, theta=-45, color="cm")
vis.gam(int2, theta=-15, color="cm")
vis.gam(int2, theta=45, color="cm")
vis.gam(int2, theta=90, color="cm")
mtext("Plane of interaction smoother: Abdomen, Hip", outer=T, cex=1.5)

fat.gint <- gam(siri ~s(abdom, hip)+s(age)+s(wrist)+s(ankle), data=fat2)
summary(fat.gint)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## siri ~ s(abdom, hip) + s(age) + s(wrist) + s(ankle)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.1030 0.2857 66.86 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(abdom,hip) 11.408 15.520 30.382 < 2e-16 ***
## s(age) 5.334 6.458 1.986 0.0664 .
## s(wrist) 1.000 1.000 17.412 4.65e-05 ***
## s(ankle) 3.970 4.829 1.175 0.2790
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.764 Deviance explained = 79%
## GCV = 18.341 Scale est. = 16.248 n = 199
par(mfrow=c(2,2),oma=c(0,0,2,0))
plot(fat.gint)
mtext("Predictors in Reduced GAM Model", cex=1.5, outer=T)

######################################################
# test semi-parametric models
semi1 <- gam(siri ~abdom:hip+s(age)+s(wrist)+s(ankle), data = fat2)
semi2 <- gam(siri ~s(abdom, hip)+age+s(wrist)+s(ankle), data = fat2)
semi3 <- gam(siri ~s(abdom, hip)+s(age)+wrist+s(ankle), data = fat2)
semi4 <- gam(siri ~s(abdom, hip)+s(age)+s(wrist)+ankle, data = fat2)
anova(semi1,fat.gint, test="F") #sig
## Analysis of Deviance Table
##
## Model 1: siri ~ abdom:hip + s(age) + s(wrist) + s(ankle)
## Model 2: siri ~ s(abdom, hip) + s(age) + s(wrist) + s(ankle)
## Resid. Df Resid. Dev Df Deviance F Pr(>F)
## 1 186.05 3769.0
## 2 176.29 2864.3 9.7628 904.69 5.7034 2.764e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(semi2,fat.gint, test="F") #linear?
## Analysis of Deviance Table
##
## Model 1: siri ~ s(abdom, hip) + age + s(wrist) + s(ankle)
## Model 2: siri ~ s(abdom, hip) + s(age) + s(wrist) + s(ankle)
## Resid. Df Resid. Dev Df Deviance F Pr(>F)
## 1 181.33 3077.4
## 2 176.29 2864.3 5.0447 213.1 2.6 0.02651 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(semi3,fat.gint, test="F") #linear
## Analysis of Deviance Table
##
## Model 1: siri ~ s(abdom, hip) + s(age) + wrist + s(ankle)
## Model 2: siri ~ s(abdom, hip) + s(age) + s(wrist) + s(ankle)
## Resid. Df Resid. Dev Df Deviance F Pr(>F)
## 1 176.29 2864.3
## 2 176.29 2864.3 -7.3447e-06 -0.00023867 2 4.131e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(semi4,fat.gint, test="F") #linear?
## Analysis of Deviance Table
##
## Model 1: siri ~ s(abdom, hip) + s(age) + s(wrist) + ankle
## Model 2: siri ~ s(abdom, hip) + s(age) + s(wrist) + s(ankle)
## Resid. Df Resid. Dev Df Deviance F Pr(>F)
## 1 183.84 3161.2
## 2 176.29 2864.3 7.5553 296.95 2.419 0.01878 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
######################################################
# plot test linear hypothesis for three non-sig variables
par(mfrow=c(1,3), oma=c(0,0,2,0))
plot(fat.gint, residuals = T, select=2, ylim=c(-10,10))
abline(-2.5,0.0518, col="blue")
plot(fat.gint, residuals = T, select=3, ylim=c(-10,10))
abline(40,-2.2, col="blue")
plot(fat.gint, residuals = T, select=4, ylim=c(-10,10))
abline(6,-.26, col="blue")
mtext('Linear Predictor Testing for Fat GAM model', outer = T, cex = 1.5)

semi4 <- gam(siri~s(abdom,hip)+age+wrist+ankle, data=fat2)
summary(semi4)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## siri ~ s(abdom, hip) + age + wrist + ankle
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 61.86751 8.39583 7.369 5.31e-12 ***
## age 0.06966 0.03117 2.235 0.0266 *
## wrist -2.54056 0.49602 -5.122 7.47e-07 ***
## ankle 0.01699 0.21998 0.077 0.9385
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(abdom,hip) 7.32 10.12 46.52 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.743 Deviance explained = 75.7%
## GCV = 18.736 Scale est. = 17.67 n = 199
######################################################
# construct and check final semi-parametric reduced GAM
fat.finalforplot <- gam(siri~s(abdom, hip)+s(age)+s(wrist), data=fat2)
par(mfrow=c(1,2), oma=c(0,0,2,0))
plot(fat.finalforplot, residuals = T, select=2, ylim=c(-10,10))
abline(-2.5,0.0518, col="blue")
plot(fat.finalforplot, residuals = T, select=3, ylim=c(-10,10))
abline(40,-2.2, col="blue")
mtext('Linear Predictor Testing for Fat GAM model', outer = T, cex = 1.5)

layout(matrix(c(1,2,1,3), 2, 2, byrow = TRUE))
plot(fat.finalforplot, residuals=T)
mtext("Predictors for GAM - Reduced", cex=1.5, outer=T)

fat.finalgam <- gam(siri~s(abdom, hip)+age+wrist, data=fat2)
summary(fat.finalgam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## siri ~ s(abdom, hip) + age + wrist
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 62.03329 8.12754 7.632 1.11e-12 ***
## age 0.06935 0.03078 2.253 0.0254 *
## wrist -2.52731 0.46158 -5.475 1.38e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(abdom,hip) 7.398 10.23 46.92 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.745 Deviance explained = 75.7%
## GCV = 18.541 Scale est. = 17.572 n = 199
gampred <- predict(fat.finalgam, fat.test)
######################################################
######################################################
# Support Vector Regression
######################################################
fat.svr = svm(siri~.,data=fat)
summary(fat.svr)
##
## Call:
## svm(formula = siri ~ ., data = fat)
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: radial
## cost: 1
## gamma: 0.07142857
## epsilon: 0.1
##
##
## Number of Support Vectors: 173
svrfpred <-predict(fat.svr, fat.test)
######################################################
# reduce covariates -- set up validation set
set.seed(67890)
train2 <-sample(1:200, 140, replace=FALSE)
trainset <- fat[c(train2),]
testset <- fat[-c(train2),]
str(trainset)
## 'data.frame': 140 obs. of 15 variables:
## $ siri : num 13.5 10.3 18.8 22.6 11.3 27 23.6 4 27.3 20.4 ...
## $ age : int 55 23 66 54 50 72 41 47 34 48 ...
## $ weight : num 125 188 171 198 162 ...
## $ height : num 64 77.5 69.2 72 66.5 ...
## $ adipos : num 21.5 22.1 25.1 26.9 25.9 24.7 29.7 20.2 29.7 23.6 ...
## $ neck : num 33.2 38 37.4 39.9 38.7 38.5 41.9 34 39.5 37 ...
## $ chest : num 87.7 96.6 102.7 107.6 99.4 ...
## $ abdom : num 76 85.3 98.6 100 86.7 ...
## $ hip : num 88.6 102.5 100.2 99.6 96.2 ...
## $ thigh : num 50.9 59.1 56.5 57.2 62.1 56.3 67.7 50.6 63.8 59.3 ...
## $ knee : num 35.4 37.6 39.3 38 39.3 36.6 41.3 34.4 42 38.4 ...
## $ ankle : num 19.1 23.2 22.7 22 23.3 22 24.7 21.9 23.4 22.4 ...
## $ biceps : num 29.3 31.8 30.3 35.9 30.6 29.7 37.2 26.8 34 27.9 ...
## $ forearm: num 25.7 29.7 28.7 30.2 27.8 26.3 31.8 25.8 31.2 26.2 ...
## $ wrist : num 16.9 18.3 19 18.9 18.2 18 20 16.8 18.5 17 ...
str(testset)
## 'data.frame': 60 obs. of 15 variables:
## $ siri : num 15 11.4 22.1 4.1 18.2 25.4 11.5 19.2 10 29.9 ...
## $ age : int 53 41 47 25 44 43 54 24 28 65 ...
## $ weight : num 154 153 178 191 180 ...
## $ height : num 69.2 69.2 70 74 69.5 ...
## $ adipos : num 22.7 22.5 25.6 24.6 26.2 26 25 27.7 24.6 30.9 ...
## $ neck : num 37.6 36.4 40.2 38.1 39.2 39.6 37.4 39.2 37 40.8 ...
## $ chest : num 93.9 91.4 99.7 100.9 101.9 ...
## $ abdom : num 88.7 80.6 95 82.5 93.2 ...
## $ hip : num 94.5 92.3 98.6 99.9 100.6 ...
## $ thigh : num 53.7 54.3 62.3 62.9 58.9 59.5 59.7 71.2 60.8 59.2 ...
## $ knee : num 36.2 36.3 38.1 38.3 39.7 36.1 40.2 43.5 38.5 38.1 ...
## $ ankle : num 22 21.8 23.9 23.8 23.1 22 23.4 25.2 25 24 ...
## $ biceps : num 28.5 29.6 35.3 35.9 31.4 30.1 27.9 36.1 31.6 35.9 ...
## $ forearm: num 25.7 27.3 31.1 31.1 28.4 27.2 27 30.3 28 30.5 ...
## $ wrist : num 17.1 17.9 19.8 18.2 18.8 17.7 17.8 18.7 18.6 19.1 ...
num <- ncol(trainset) - 2
trainRMSE <- array(dim=num)
trainY <- trainset$siri
testRMSE <- array(dim=num)
testY <- testset$siri
x <- array(dim=num)
varnames <- array(dim=num)
par(mfrow=c(1,1), oma=c(0,0,0,0))
for(i in 1:num){
x[i] <- i
Model=fit(siri~., trainset, model="svm")
predtrain=predict(Model, trainset)
trainRMSE[i]=mmetric(trainY, predtrain, "RMSE")
predtest=predict(Model, testset)
testRMSE[i]=mmetric(testY, predtest, "RMSE")
varimp <- Importance(Model,trainset, method="sensv")
dev.set(dev.next())
L=list(runs=1,sen=t(varimp$imp),sresponses=varimp$sresponses)
mgraph(L,graph="IMP", col="gray", Grid=10)
z <- order(varimp$imp, decreasing=F)
IND <- z[2]
var_to_remove <- names(trainset[IND])
varnames[i] <- var_to_remove
ymin=min(testRMSE[1:i])
ymax=max(testRMSE[1:i])
ymin=ymin-((ymax-ymin)/4)
plot(x[1:i],testRMSE[1:i],type="b",col="blue",main="Variable Elimination", xlab="Covariates Removed",
ylab="RMSE", ylim=c(ymin,ymax))
legend("bottomright", c('validation set'), lty=1, col=c("blue"))
text(x[1:i], testRMSE[1:i], labels=varnames, cex= 0.7, pos=3)
trainset[IND] = NULL
testset[IND] = NULL
cat("\ntrainRMSE", trainRMSE[i])
cat("\ntestRMSE", testRMSE[i])
cat("\nremoving variable", var_to_remove)
flush.console()
}


##
## trainRMSE 3.405217
## testRMSE 5.055386
## removing variable biceps


##
## trainRMSE 3.453475
## testRMSE 4.850632
## removing variable forearm


##
## trainRMSE 3.522938
## testRMSE 4.954367
## removing variable thigh


##
## trainRMSE 3.651424
## testRMSE 5.134237
## removing variable age


##
## trainRMSE 3.580804
## testRMSE 5.200768
## removing variable hip


##
## trainRMSE 3.672563
## testRMSE 5.318662
## removing variable weight


##
## trainRMSE 3.670937
## testRMSE 5.347998
## removing variable knee


##
## trainRMSE 3.671904
## testRMSE 5.278105
## removing variable chest


##
## trainRMSE 3.719272
## testRMSE 5.089037
## removing variable height


##
## trainRMSE 3.685936
## testRMSE 5.032729
## removing variable adipos


##
## trainRMSE 3.855121
## testRMSE 4.885201
## removing variable neck


##
## trainRMSE 3.833188
## testRMSE 4.794337
## removing variable ankle


##
## trainRMSE 4.136493
## testRMSE 4.904577
## removing variable wrist
head(trainset)
## siri abdom
## 74 13.5 76.0
## 145 10.3 85.3
## 80 18.8 98.6
## 56 22.6 100.0
## 98 11.3 86.7
## 85 27.0 99.8
plot(x,testRMSE,type="b",col="blue",main="Variable Elimination", xlab="Covariates Removed",
ylab="RMSE", ylim=c(ymin,ymax), sub="Starting RMSE = 5.01")
legend(11,4.7, c('validation set'), lty=1, col=c("blue"), bty='n')
text(x, testRMSE, labels=varnames, cex= 0.7, pos=3)
abline(h=min(testRMSE), lty=2)

######################################################
# check interactions
ggpairs(fat[,c(8,15)], axisLabels = "none",
title = "Pairs Plots of Fat Data")

trainset <- fat[c(train2),]
trainset <- trainset[,c(1,8,15)]
testset <- fat[-c(train2),]
testset <- testset[,c(1,8,15)]
trainset$ad.wr <- trainset$abdom * trainset$wrist
head(trainset)
## siri abdom wrist ad.wr
## 74 13.5 76.0 16.9 1284.40
## 145 10.3 85.3 18.3 1560.99
## 80 18.8 98.6 19.0 1873.40
## 56 22.6 100.0 18.9 1890.00
## 98 11.3 86.7 18.2 1577.94
## 85 27.0 99.8 18.0 1796.40
testset$ad.wr <- testset$abdom * testset$wrist
head(testset)
## siri abdom wrist ad.wr
## 220 15.0 88.7 17.1 1516.77
## 191 11.4 80.6 17.9 1442.74
## 113 22.1 95.0 19.8 1881.00
## 9 4.1 82.5 18.2 1501.50
## 92 18.2 93.2 18.8 1752.16
## 105 25.4 98.6 17.7 1745.22
num <- ncol(trainset) - 2
trainRMSE <- array(dim=num)
trainY <- trainset$siri
testRMSE <- array(dim=num)
testY <- testset$siri
x <- array(dim=num)
varnames <- array(dim=num)
par(mfrow=c(1,1), oma=c(0,0,0,0))
for(i in 1:num){
x[i] <- i
Model=fit(siri~., trainset, model="svm")
predtrain=predict(Model, trainset)
trainRMSE[i]=mmetric(trainY, predtrain, "RMSE")
predtest=predict(Model, testset)
testRMSE[i]=mmetric(testY, predtest, "RMSE")
varimp <- Importance(Model,trainset, method="sensv")
dev.set(dev.next())
L=list(runs=1,sen=t(varimp$imp),sresponses=varimp$sresponses)
mgraph(L,graph="IMP", col="gray", Grid=10)
z <- order(varimp$imp, decreasing=F)
IND <- z[2]
var_to_remove <- names(trainset[IND])
varnames[i] <- var_to_remove
ymin=min(testRMSE[1:i])
ymax=max(testRMSE[1:i])
ymin=ymin-((ymax-ymin)/4)
plot(x[1:i],testRMSE[1:i],type="b",col="blue",main="Variable Elimination-Interaction", xlab="Covariates Removed",
ylab="RMSE", ylim=c(ymin,ymax))
legend("bottomright", c('validation set'), lty=1, col=c("blue"))
text(x[1:i], testRMSE[1:i], labels=varnames, cex= 0.7, pos=3)
trainset[IND] = NULL
testset[IND] = NULL
cat("\ntrainRMSE", trainRMSE[i])
cat("\ntestRMSE", testRMSE[i])
cat("\nremoving variable", var_to_remove)
flush.console()
}


##
## trainRMSE 4.115702
## testRMSE 4.898484
## removing variable ad.wr


##
## trainRMSE 4.113007
## testRMSE 4.962976
## removing variable wrist
head(trainset)
## siri abdom
## 74 13.5 76.0
## 145 10.3 85.3
## 80 18.8 98.6
## 56 22.6 100.0
## 98 11.3 86.7
## 85 27.0 99.8
plot(x,testRMSE,type="b",col="blue",main="Variable Elimination-Interaction", xlab="Covariates Removed",
ylab="RMSE", ylim=c(ymin,ymax), sub="Starting RMSE = 4.78")
legend(1.8, 4.89, c('validation set'), lty=1, col=c("blue"), bty='n')
text(x, testRMSE, labels=varnames, cex= 0.7, pos=3)
abline(h=min(testRMSE), lty=2)

# no visible benefit to including interacton term
######################################################
# tune cost and epsilon parameters
tuneResult <- tune(svm, siri ~ abdom+wrist, data = fat,
ranges = list(epsilon = seq(0,1,0.1), cost = 2^(1:9)))
print(tuneResult)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## epsilon cost
## 0.3 2
##
## - best performance: 23.19606
plot(tuneResult, main="Performance of Reduced SVR")

#epsilon = 0.4, cost = 4
fat.finalsvr <- svm(siri ~ abdom+wrist, data=fat, epsilon=0.4,
cost=4)
summary(fat.finalsvr)
##
## Call:
## svm(formula = siri ~ abdom + wrist, data = fat, epsilon = 0.4,
## cost = 4)
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: radial
## cost: 4
## gamma: 0.5
## epsilon: 0.4
##
##
## Number of Support Vectors: 94
svrpred <- predict(fat.finalsvr, fat.test)
######################################################
######################################################
# Compare various models
######################################################
# red lm: siri~weight+abdom+thigh+wrist
# red gam: siri~s(abdom:hip)+age+wrist
# red svr: siri~abdom+wrist
######################################################
######################################################
######################################################
# root mean square errors
RMSELMF <- rmse(fat.test$siri - lmfpred)
RMSELMF
## [1] 5.148925
RMSELM <- rmse(fat.test$siri - lmpred)
RMSELM
## [1] 5.089915
RMSEGAMF = rmse(fat.test$siri - gamfpred)
RMSEGAMF
## [1] 6.50116
RMSEGAM = rmse(fat.test$siri - gampred)
RMSEGAM
## [1] 5.179132
RMSESVF <- rmse(fat.test$siri - svrfpred)
RMSESVF
## [1] 4.983324
RMSESV <- rmse(fat.test$siri - svrpred)
RMSESV
## [1] 4.931453
Full.rmse <- cbind(RMSELMF, RMSEGAMF, RMSESVF)
Red.rmse <- cbind(RMSELM, RMSEGAM, RMSESV)
Total.rmse <-rbind(Full.rmse, Red.rmse)
rownames(Total.rmse) <- c("Full", "Reduced")
colnames(Total.rmse) <- c("LM", "GAM", "SVR")
Total.rmse
## LM GAM SVR
## Full 5.148925 6.501160 4.983324
## Reduced 5.089915 5.179132 4.931453
######################################################
######################################################
######################################################
# predictor plots
par(mfrow=c(1,1), oma=c(0,0,0,0))
ggplot(fat.test, aes(x = lmpred, y = siri)) +
geom_point() +
xlab("Predicted Siri") +
ylab("Observed Siri") +
ggtitle("Model vs Test Data: Reduced LM") +
geom_abline(intercept=0,slope=1)

ggplot(fat.test, aes(x = gampred, y = siri)) +
geom_point() +
xlab("Predicted Siri") +
ylab("Observed Siri") +
ggtitle("Model vs Test Data: Reduced GAM") +
geom_abline(intercept=0,slope=1)

ggplot(fat.test, aes(x = svrpred, y = siri)) +
geom_point() +
xlab("Predicted Siri") +
ylab("Observed Siri") +
ggtitle("Model vs Test Data: Reduced SVR") +
geom_abline(intercept=0,slope=1)

x <-range(fat.test$abdom)
x <- seq(x[1], x[2], length.out=50)
y <- range(fat.test$wrist)
y <- seq(y[1], y[2], length.out=50)
z <- outer(x,y,
function(abdom,wrist)
predict(fat.finalsvr, data.frame(abdom,wrist)))
p <- persp(x,y,z, theta=30, phi=30,
col="lightblue", expand = 0.5,shade = 0.25,
xlab="Abdom", ylab="Wrist", zlab="Siri.SVR", main="Predicted Siri - SVR")
obs <- trans3d(fat.test$abdom, fat.test$wrist, fat.test$siri,p)
pred <- trans3d(fat.test$abdom, fat.test$wrist,svrpred,p)
points(obs, col="red", pch=16)
segments(obs$x, obs$y, pred$x, pred$y)

######################################################
######################################################
######################################################
# Residuals plots
par(mfrow=c(1,2), oma=c(0,0,2,0))
plot(residuals(fat.finallm)~predict(fat.finallm), xlab="Predicted", ylab="Residuals", main="Residuals Plot")
abline(h=0)
qqnorm(residuals(fat.finallm))
qqline(residuals(fat.finallm))
mtext("Residuals for LM Reduced Model", cex=1.5, outer=T)

par(mfrow=c(1,2), oma=c(0,0,2,0))
plot(residuals(fat.finalgam)~predict(fat.finalgam), xlab="Predicted", ylab="Residuals", main="Residuals Plot")
abline(h=0)
qqnorm(residuals(fat.finalgam))
qqline(residuals(fat.finalgam))
mtext("Residuals for GAM Reduced Model", cex=1.5, outer=T)

par(mfrow=c(1,2), oma=c(0,0,2,0))
plot(residuals(fat.finalsvr)~predict(fat.finalsvr), xlab="Predicted", ylab="Residuals", main="Residuals Plot")
abline(h=0)
qqnorm(residuals(fat.finalsvr))
qqline(residuals(fat.finalsvr))
mtext("Residuals for SVR Reduced Model", cex=1.5, outer=T)
