This is a summary of Section 4 of the Machine Learning course by JHU on Coursera. The report covers regularized regression, combining predictors, forecasting, and unsupervised prediction.
Regularized regression can help with the bias/variance tradeoff, but it may demand a large dataset.
library(ElemStatLearn); data("prostate")
str(prostate)
## 'data.frame': 97 obs. of 10 variables:
## $ lcavol : num -0.58 -0.994 -0.511 -1.204 0.751 ...
## $ lweight: num 2.77 3.32 2.69 3.28 3.43 ...
## $ age : int 50 58 74 58 62 50 64 58 47 63 ...
## $ lbph : num -1.39 -1.39 -1.39 -1.39 -1.39 ...
## $ svi : int 0 0 0 0 0 0 0 0 0 0 ...
## $ lcp : num -1.39 -1.39 -1.39 -1.39 -1.39 ...
## $ gleason: int 6 6 7 6 6 6 6 6 6 6 ...
## $ pgg45 : int 0 0 20 0 0 0 0 0 0 0 ...
## $ lpsa : num -0.431 -0.163 -0.163 -0.163 0.372 ...
## $ train : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
###regression subset selection
covnames <- names(prostate[-(9:10)])
y <- prostate$lpsa
x <- prostate[,covnames]
form <- as.formula(paste("lpsa~", paste(covnames, collapse="+"), sep=""))
summary(lm(form, data=prostate[prostate$train,]))
##
## Call:
## lm(formula = form, data = prostate[prostate$train, ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.64870 -0.34147 -0.05424 0.44941 1.48675
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.429170 1.553588 0.276 0.78334
## lcavol 0.576543 0.107438 5.366 1.47e-06 ***
## lweight 0.614020 0.223216 2.751 0.00792 **
## age -0.019001 0.013612 -1.396 0.16806
## lbph 0.144848 0.070457 2.056 0.04431 *
## svi 0.737209 0.298555 2.469 0.01651 *
## lcp -0.206324 0.110516 -1.867 0.06697 .
## gleason -0.029503 0.201136 -0.147 0.88389
## pgg45 0.009465 0.005447 1.738 0.08755 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7123 on 58 degrees of freedom
## Multiple R-squared: 0.6944, Adjusted R-squared: 0.6522
## F-statistic: 16.47 on 8 and 58 DF, p-value: 2.042e-12
set.seed(1)
train.ind <- sample(nrow(prostate), ceiling(nrow(prostate)/2))
y.test <- prostate$lpsa[-train.ind]
x.test <- x[-train.ind,]
y <- prostate$lpsa[train.ind]
x <- x[train.ind,]
p <- length(covnames)
rss <- list()
for (i in 1:p) {
  cat(i)
  Index <- combn(p, i)   # all covariate subsets of size i
  rss[[i]] <- apply(Index, 2, function(is) {
    form <- as.formula(paste("y~", paste(covnames[is], collapse="+"), sep=""))
    isfit <- lm(form, data=x)
    yhat <- predict(isfit)
    train.rss <- sum((y - yhat)^2)       # training RSS
    yhat <- predict(isfit, newdata=x.test)
    test.rss <- sum((y.test - yhat)^2)   # test RSS
    c(train.rss, test.rss)
  })
}
## 12345678
#png("Plots/selection-plots-01.png", height=432, width=432, pointsize=12)
plot(1:p, 1:p, type="n", ylim=range(unlist(rss)), xlim=c(0,p), xlab="number of predictors", ylab="residual sum of squares", main="Prostate cancer data")
for (i in 1:p) {
  points(rep(i-0.15, ncol(rss[[i]])), rss[[i]][1, ], col="blue")
  points(rep(i+0.15, ncol(rss[[i]])), rss[[i]][2, ], col="red")
}
minrss <- sapply(rss, function(x) min(x[1,]))
lines((1:p)-0.15, minrss, col="blue", lwd=1.7)
minrss <- sapply(rss, function(x) min(x[2,]))
lines((1:p)+0.15, minrss, col="red", lwd=1.7)
legend("topright", c("Train", "Test"), col=c("blue", "red"), pch=1)
We find that as the number of predictors increases, the training-set error keeps decreasing because we are increasingly overfitting the training data. The test-set error, however, eventually starts to rise.
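Using the rss list computed above, we can also read off which subset size attains the lowest test RSS (a small sketch; output not shown):
which.min(sapply(rss, function(x) min(x[2, ])))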
small = prostate[1:5,]
lm(lpsa~., data = small)
##
## Call:
## lm(formula = lpsa ~ ., data = small)
##
## Coefficients:
## (Intercept) lcavol lweight age lbph
## 9.60615 0.13901 -0.79142 0.09516 NA
## svi lcp gleason pgg45 trainTRUE
## NA NA -2.08710 NA NA
Here we see that some coefficients were returned as NA. This is because there are more predictors than samples, so the design matrix is rank deficient and R cannot estimate all of the coefficients.
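A quick way to see the rank deficiency directly (a minimal check on the same small data frame):
dim(model.matrix(lpsa ~ ., data = small))   # 5 rows but 10 columns: more coefficients than observations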
In order to control this high variance, we can shrink the coefficients by adding a penalty term. The penalty reduces model complexity while respecting the structure of the problem.
Solve:
\[ \sum_{i=1}^N \left(y_i - \beta_0 - \sum_{j=1}^p x_{ij}\beta_j \right)^2 + \lambda \sum_{j=1}^p \beta_j^2 \]
This is the ridge regression objective: the squared penalty shrinks all coefficients toward zero but does not set any of them exactly to zero.
Lasso stands for least absolute shrinkage and selection operator.
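The lasso solves the same problem with an absolute-value penalty, which can shrink some coefficients exactly to zero and therefore also performs variable selection:
\[ \sum_{i=1}^N \left(y_i - \beta_0 - \sum_{j=1}^p x_{ij}\beta_j \right)^2 + \lambda \sum_{j=1}^p |\beta_j| \]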
In caret, the corresponding methods are: ridge, lasso, and relaxo.
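A minimal sketch of fitting the penalized models with caret on the prostate training data (assuming the elasticnet package, which backs caret's "ridge" and "lasso" methods, is installed; the column drop removes the train indicator):
library(caret)
ridgeFit <- train(lpsa ~ ., data = prostate[prostate$train, -10], method = "ridge")
lassoFit <- train(lpsa ~ ., data = prostate[prostate$train, -10], method = "lasso")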
library(ISLR); data(Wage)
library(ggplot2); library(caret)
## Loading required package: lattice
Wage <- subset(Wage,select=-c(logwage))
# Create a building data set and validation set
inBuild <- createDataPartition(y=Wage$wage,
p=0.7, list=FALSE)
validation <- Wage[-inBuild,]; buildData <- Wage[inBuild,]
inTrain <- createDataPartition(y=buildData$wage,
p=0.7, list=FALSE)
training <- buildData[inTrain,]; testing <- buildData[-inTrain,]
###build two different models to predict the wage variable
mod1 <- train(wage~., method = "glm", data = training)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## (the same warning repeats for each resample)
mod2 <- train(wage~., method = "rf", data = training, trControl = trainControl(method = "cv", number = 3))
## Loading required package: randomForest
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
### plot two predictions
pred1 <- predict(mod1, testing)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
pred2 <- predict(mod2, testing)
qplot(pred1, pred2, data = testing, color = wage)
##fit a model that combines predictors
predDF <- data.frame(pred1, pred2, wage = testing$wage)
comModFit <- train(wage~., method = "gam", data = predDF )
## Loading required package: mgcv
## Loading required package: nlme
## This is mgcv 1.8-12. For overview type 'help("mgcv-package")'.
combPred <- predict(comModFit, predDF)
###testing errors
sqrt(sum((pred1 - testing$wage)^2))
## [1] 913.4245
sqrt(sum((pred2 - testing$wage)^2))
## [1] 936.1133
sqrt(sum((combPred - testing$wage)^2))
## [1] 904.6411
###prediction on the validation set
pred1V <- predict(mod1,validation); pred2V <- predict(mod2,validation)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
predVDF <- data.frame(pred1=pred1V,pred2=pred2V)
combPredV <- predict(comModFit,predVDF)
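To judge the stacked model we would compute the same errors on the validation set (a sketch mirroring the test-set errors above; output not shown):
sqrt(sum((pred1V - validation$wage)^2))
sqrt(sum((pred2V - validation$wage)^2))
sqrt(sum((combPredV - validation$wage)^2))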
Forecasting is typically applied to time-series data. This kind of analysis is characterized by data that are dependent over time, specific pattern types such as trends, seasonal patterns, and cycles, and by the fact that subsampling into training and test sets is more complicated than for independent observations.
library(quantmod)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: TTR
## Version 0.4-0 included new data defaults. See ?getSymbols.
from.dat <- as.Date("01/01/08", format="%m/%d/%y")
to.dat <- as.Date("12/31/13", format="%m/%d/%y")
getSymbols("GOOG", src="google", from = from.dat, to = to.dat)
## As of 0.4-0, 'getSymbols' uses env=parent.frame() and
## auto.assign=TRUE by default.
##
## This behavior will be phased out in 0.5-0 when the call will
## default to use auto.assign=FALSE. getOption("getSymbols.env") and
## getOptions("getSymbols.auto.assign") are now checked for alternate defaults
##
## This message is shown once per session and may be disabled by setting
## options("getSymbols.warning4.0"=FALSE). See ?getSymbols for more details.
## [1] "GOOG"
head(GOOG)
## GOOG.Open GOOG.High GOOG.Low GOOG.Close GOOG.Volume
## 2008-01-02 346.09 348.34 338.53 342.25 NA
## 2008-01-03 342.29 343.08 337.92 342.32 NA
## 2008-01-04 339.51 340.14 327.17 328.17 NA
## 2008-01-07 326.64 330.81 318.36 324.30 NA
## 2008-01-08 326.17 329.65 315.18 315.52 NA
## 2008-01-09 314.70 326.34 310.94 326.27 NA
##summarize monthly and store as time series
GOOG <- as.data.frame(GOOG)
rownames(GOOG) = as.Date(rownames(GOOG))
#mGoog <- to.monthly(GOOG)
#googOpen <- Op(mGoog)
#ts1 <- ts(googOpen, frequency=12)   # monthly series referenced below
##decompose a time series into parts
#plot(decompose(ts1))
##Training and testing datasets on a consecutive basis
#ts1Train <- window(ts1,start=1,end=5)
#ts1Test <- window(ts1,start=5,end=(7-0.01))
######Simple moving average: ma()
#plot(ts1Train)
#lines(ma(ts1Train,order=3),col="red")
########exponential smoothing
#ets1 <- ets(ts1Train,model="MMM")
#fcast <- forecast(ets1)
#plot(fcast); lines(ts1Test,col="red")
#####get the accuracy: accuracy()
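A minimal end-to-end sketch of the steps commented out above (assuming the forecast package is installed and that GOOG is still the xts object returned by getSymbols, i.e., before the as.data.frame conversion):
library(forecast)
mGoog <- to.monthly(GOOG)                    # aggregate daily bars to monthly OHLC
googOpen <- Op(mGoog)                        # extract monthly opening prices
ts1 <- ts(googOpen, frequency = 12)          # 12 observations per year
ts1Train <- window(ts1, start = 1, end = 5)
ts1Test <- window(ts1, start = 5, end = (7 - 0.01))
plot(ts1Train)
lines(ma(ts1Train, order = 3), col = "red")  # simple moving average
ets1 <- ets(ts1Train, model = "MMM")         # exponential smoothing
fcast <- forecast(ets1)
plot(fcast); lines(ts1Test, col = "red")
accuracy(fcast, ts1Test)                     # forecast accuracy against held-out data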
data(iris); library(ggplot2)
inTrain <- createDataPartition(y=iris$Species, p=0.7, list=FALSE)
training <- iris[inTrain,]
testing <- iris[-inTrain,]
dim(training); dim(testing)
## [1] 105 5
## [1] 45 5
###create clusters with K-means clustering
kMeans1 <- kmeans(subset(training,select=-c(Species)),centers=3)
training$clusters <- as.factor(kMeans1$cluster)
qplot(Petal.Width,Petal.Length,colour=clusters,data=training)
##compare to the real labels
table(kMeans1$cluster,training$Species)
##
## setosa versicolor virginica
## 1 35 0 0
## 2 0 34 12
## 3 0 1 23
####build predictor
modFit <- train(clusters ~.,data=subset(training,select=-c(Species)),method="rpart")
## Loading required package: rpart
###apply the predictor on the test set
testClusterPred <- predict(modFit,testing)
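Finally, we can compare the predicted clusters on the test set with the true species, mirroring the table above (a sketch; output not shown):
table(testClusterPred, testing$Species)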