## 'data.frame':    552 obs. of  4 variables:
##  $ sd     : num  6.48 11.84 11.69 10.73 11.21 ...
##  $ sharpe : num  0.33 0.48 0.47 0.48 -0.51 -0.29 0.19 1.19 0.71 0.7 ...
##  $ returns: num  7.95 13.62 13.45 12.85 -4.48 ...
##  $ risk   : Factor w/ 10 levels "1","10","2","3",..: 7 8 8 8 5 5 5 2 8 8 ...
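
Note that risk is stored as a factor whose levels sort lexicographically ("1", "10", "2", ...), not numerically. Where risk is needed as a number (for instance in the scatter plot below), convert it through character first; a minimal sketch, assuming traindata is the data frame summarised above:

# as.numeric(as.character(...)) recovers the numeric value;
# as.numeric(...) alone would return the level index ("10" -> 2)
risk_num <- as.numeric(as.character(traindata$risk))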

Decision Tree with rpart

library(rpart)       # decision trees via recursive partitioning
library(rpart.plot)  # decision tree visualization
# Label each observation as bad / average / good based on its returns
col <- ifelse(traindata$returns < 0, "bad", ifelse(traindata$returns < 4.5, "average", "good"))
subsetdata_results <- as.factor(col)

# Colour points to match the legend; coerce the risk factor to numeric for a scatter plot
ptcol <- c(bad = "red", average = "blue", good = "green")[col]
plot(as.numeric(as.character(traindata$risk)), traindata$returns, col = ptcol, xlab = "risk", ylab = "returns")

legend(x = 0, y = 30, legend = c("Bad", "Average", "Good"), col = c("red", "blue", "green"), pch = 1, bty = "n")

nr <- nrow(traindata)  # number of training observations
set.seed(1)            # make rpart's internal cross-validation reproducible

tModel <- rpart(returns ~ sharpe, data = traindata)  # save the model as tModel
rpart.plot(tModel)
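
The tree above is grown with rpart's default complexity settings. If it looks over- or under-grown, the cross-validation table that rpart computes can guide pruning; a sketch using only functions from the rpart package:

# Cross-validated error for each candidate subtree size
printcp(tModel)

# Prune back to the complexity parameter with the lowest cross-validated error
bestcp <- tModel$cptable[which.min(tModel$cptable[, "xerror"]), "CP"]
prunedModel <- prune(tModel, cp = bestcp)
rpart.plot(prunedModel)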

# Make predictions on the training data (predict takes newdata, not data)
value <- predict(tModel, newdata = traindata)

# 5.5 Assess the model performance via the root of the summed squared errors
sqrt(sum((value - traindata$returns)^2))
## [1] 110.0581
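
Strictly speaking, the value above is the root of the summed squared errors; the conventional root mean square error averages the squared residuals before taking the root, so it is smaller by a factor of sqrt(n). A minimal sketch:

# RMSE: mean the squared residuals first, then take the square root
sqrt(mean((value - traindata$returns)^2))
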
head(data.frame(value, traindata$returns, traindata$risk, value-traindata$returns),  20)
##         value traindata.returns traindata.risk value...traindata.returns
## 1   7.9451247         7.9463834              6              -0.001258648
## 2  12.0938781        13.6164251              7              -1.522547012
## 3  12.0938781        13.4467422              7              -1.352864052
## 4  12.0938781        12.8512525              7              -0.757374352
## 5  -3.5019811        -4.4790649              4               0.977083775
## 6   0.9866623         0.4670037              4               0.519658523
## 7   7.9451247         5.2725210              4               2.672603708
## 8  17.1488412        23.7793257             10              -6.630484579
## 9  12.0938781        13.7861174              7              -1.692239282
## 10 12.0938781        13.6265176              7              -1.532639502
## 11 12.0938781        13.9503928              8              -1.856514702
## 12  0.9866623         0.3357734             10               0.650888901
## 13  0.9866623         0.6491214              9               0.337540884
## 14  3.7707233         4.9189383              8              -1.148215062
## 15  0.9866623        -3.6008725              9               4.587534796
## 16  3.7707233         4.4046315              7              -0.633908256
## 17  3.7707233         4.3161354              7              -0.545412146
## 18 12.0938781        11.0406296             10               1.053248498
## 19 12.0938781        14.4442702              9              -2.350392072
## 20 12.0938781        14.0652860              9              -1.971407912
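
The repeated predictions (e.g. 12.0938781 on rows 2-4) are expected: a regression tree assigns every observation in a given leaf the mean return of that leaf, so the model can only output as many distinct values as it has leaves. A quick way to see the distinct leaf predictions:

# One distinct value per leaf of the tree
table(round(value, 4))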

Linear Model

# 1. Build the linear model: regress returns on all other columns except risk
lmModel <- lm(returns ~ . - risk, data = traindata)

# 2. Use the linear model to make predictions; newdata must supply both
#    predictors (sd and sharpe) because the model was fit with returns ~ . - risk
# predValue <- predict(lmModel, newdata = data.frame(sd = 10, sharpe = 0.3))
# predValue <- predict(lmModel, newdata = traindata)
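
A runnable version of the prediction step, with hypothetical input values (both predictor columns must be supplied):

# Predict returns for two hypothetical funds
predict(lmModel, newdata = data.frame(sd = c(8, 12), sharpe = c(0.4, 1.0)))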


# 3. Access the model parameters
coef(lmModel)
## (Intercept)          sd      sharpe 
##   6.7036960  -0.1794602  13.8031113
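
The fitted equation is returns ≈ 6.70 − 0.179·sd + 13.80·sharpe, which can be checked by hand; sd = 10 and sharpe = 0.5 below are hypothetical illustration values:

# Reconstruct a prediction directly from the coefficients
b <- coef(lmModel)
unname(b["(Intercept)"] + b["sd"] * 10 + b["sharpe"] * 0.5)  # ~11.81
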
# 4. Assess the model performance
sumModel <- summary(lmModel)
sumModel$r.squared
## [1] 0.73707
# Correlation between returns and sharpe alone
r <- cor(traindata$returns, traindata$sharpe)
r
## [1] 0.8515433
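
Squaring this correlation gives r^2 ≈ 0.725, just below the model's R-squared of 0.737; the gap is the extra variance explained by keeping sd in the model (for a simple regression on sharpe alone the two quantities would coincide):

# Variance explained by sharpe alone vs. the two-predictor model
r^2                  # ~0.725
sumModel$r.squared   # 0.73707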