Summary

We will take the data from exercise 5 (ex5) of Andrew Ng's Machine Learning course and redo the exercise in R.

Note: Part 8 is unfinished and gives different results from the course exercise.

Functions

Cost function

Cost_function <- function(X, y, Theta, Lambda= 0){
  m <- dim(X)[1]
  assign("m", m, .GlobalEnv) # m is reused by the gradient descent functions below

  Hypothesis <- X %*% Theta
  assign("Hypothesis", Hypothesis, .GlobalEnv)

  # squared-error cost
  J <- (1/ (2 * m)) * t(Hypothesis - y) %*% (Hypothesis - y)

  # regularization term (the intercept Theta[1] is not regularized);
  # leave Lambda at 0 if regularization is not needed
  Reg <- (Lambda / (2 * m)) * sum(Theta[2: length(Theta)]^2)
  J <- J + Reg
  invisible(J)
}
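
A quick sanity check on a tiny hand-made example (my own toy data, not from the exercise): a perfect fit gives J = 0 without regularization, and only the non-intercept coefficient is penalized.

# toy data (an illustration, not part of the exercise)
X_toy <- matrix(c(1, 1, 0, 1), nrow = 2) # columns: intercept, feature
y_toy <- c(0, 1)
# Theta = (0, 1) fits exactly, so J = 0 with Lambda = 0
print(Cost_function(X_toy, y_toy, c(0, 1), 0))
# with Lambda = 1 and m = 2, only Theta[2] is penalized: J = 1/(2*2) * 1^2 = 0.25
print(Cost_function(X_toy, y_toy, c(0, 1), 1))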

Feature scaling and mean normalization

FeatScal_MeanNorm <- function(X){
  
  X_norm <- X
  mu <- vector(mode="numeric", length= ncol(X))
  sigma <- vector(mode="numeric", length= ncol(X))
  
  for(i in 1 : ncol(X)){
    avg <- mean(X[ ,i])
    S <- max(X[,i]) - min(X[,i]) # scale by the range of the column (max - min)
    X_norm[ ,i] <- (X[ ,i] - avg) / S
    # save the mean and the range for later de-normalization
    mu[i] <- avg
    sigma[i] <- S
  }
  feat_norm <- list("mu" = mu, "sigma" = sigma, "X_norm" = X_norm)
  
  # assign() is the last expression, so the list is also returned (invisibly)
  assign("feat_norm", feat_norm, .GlobalEnv)
}
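
Note that sigma here is the range of each column (max minus min), not the standard deviation. For comparison, a standard-deviation-based variant could use base R's scale(); this is only a sketch, and the name FeatScal_MeanNorm_sd is made up for illustration.

# sketch of a standard-deviation-based variant (the function name is made up)
FeatScal_MeanNorm_sd <- function(X){
  X_norm <- scale(X) # centers each column by its mean and divides by its standard deviation
  list("mu" = attr(X_norm, "scaled:center"),
       "sigma" = attr(X_norm, "scaled:scale"),
       "X_norm" = X_norm)
}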

Gradient descent with regularization

Gradient_descent <- function(X, y, Theta, Alpha, nb_iteration, Lambda=0){
  m <- nrow(X)
  Theta_grad <- Theta
  J <- Cost_function(X, y, Theta_grad, 0)
  df_learning <- data.frame(Iteration= integer(), J= numeric()) # data frame for the learning-rate plot
  
  for (i in 1 : nb_iteration){
    pred <- X %*% Theta_grad
    # regularized gradient; the intercept Theta_grad[1] is not regularized
    grad <- ((t(X) %*% (pred - y)) / m) + ((Lambda/m) * c(0, Theta_grad[2:length(Theta_grad)]))
    
    Theta_grad <- Theta_grad - Alpha * grad
    
    J <- Cost_function(X, y, Theta_grad, Lambda)
    df_learning[i, ] <- c(i, J)
  }
  
  print(paste("local min J", J))
  assign("df_learning", df_learning, .GlobalEnv) # save the data frame in order to do the plot
  return(Theta_grad) # note: nothing after return() is executed
} 

Normal equation with regularization

Norm_equ <- function(X, y, Lambda = 0){
  library(MASS) # for the pseudo-inverse function ginv
  
  # regularization matrix: identity with a zero in position [1, 1]
  # so that the intercept is not regularized
  n <- dim(X)[2]
  
  Mat <- diag(n)
  Mat[1, 1] <- 0
  
  if (Lambda == 0) {
    Theta_grad <- ginv(t(X) %*% X) %*% t(X) %*% y
  } else {
    Theta_grad <- ginv(t(X) %*% X + Lambda * Mat) %*% t(X) %*% y
  }
  
  assign("Theta_grad", Theta_grad, .GlobalEnv)      
  return(Theta_grad) # note: nothing after return() is executed
} 

Exercise

Part 1: Loading and Visualizing Data

X <- c(-15.9367581337854, -29.1529792172381, 36.1895486266625, 37.4921873319951,-48.0588294525701, -8.94145793804976, 15.3077928892261,  -34.7062658113225, 1.38915436863589,-44.3837598516869,7.01350208240411,22.7627489197113)

X <- cbind(1, X) 

y <- c(2.13431050672967,1.17325667875646,34.3591091805390,36.8379551637124,2.80896507447986,2.12107247666639,14.7102683065623,2.61418438643226,3.74017166569494,3.73169131054307,7.62765885203804,22.7524283022422)

x_test <- c(-33.3180039906184,-37.9121640283162,-51.2069379482732,-6.13259584822862,21.2611832738984,-40.3195294903085, -14.5415316729528,32.5597602417086,13.3934325473289,
44.2098859467817,-1.14267767548796,-12.7668606522789, 34.0545053895966,39.2235002774038,  1.97449673923894, 29.6217550987045,-23.6696297066597,-9.01180139266144,
-55.9405709067750,-35.7085975190649,9.51020532657655)

y_test <- c(3.31688953175636,5.39768952021615,0.130429837452014,6.19259820238387,17.0884871154762,0.799508046710172,
 2.82479183437835,28.6212333355455,17.0463908063278,55.3843733422079,4.07936733312805,8.27039793458303,
31.3235510195056,39.1590610329276,8.08727989355216,24.1113438935375,2.47735480275880,6.56606471930304,
6.03808880157204,4.69273955859250,10.8300460632057)

x_val <- c(-16.7465357780207,-14.5774707492419,34.5157586572993,-47.0100757432008,36.9751190463628,-40.6861100153675,-4.47201097576646,26.5336348947889,-42.7976831001796,25.3740993835277,-31.1095539773078,27.3117686352134,-3.26386201365672,-1.81827648711587,-40.7196624025162,-50.0132436454954,-17.4117715480164,3.58819369664413,7.08548026197067,46.2823690185389,14.6122890916565)

y_val <- c(4.17020200885063,4.06726280383897,31.8730675757893,10.6236561896951,31.8360212813384,4.95936972099711,     
4.45159880347035,22.2763184574898,-4.38738273915718e-05,20.5038015793080,3.85834476269733,19.3650529156076,    
4.88376280545305,11.0971588483821,7.46170826615401,1.47693464229839,2.71916387764704,10.9269006553687,     
8.34871234630111,52.7819279790438,13.3573396060747)

plot(X[,2], y)

Part 2: Regularized Linear Regression Cost

“You should now implement the cost function for regularized linear regression.”

theta = [1 ; 1];

J = linearRegCostFunction([ones(m, 1) X], y, theta, 1);
(this value should be about 303.993192)

print(Cost_function(X, y, c(1,1),1))
##          [,1]
## [1,] 303.9932

Part 3: Regularized Linear Regression Gradient

“You should now implement the gradient for regularized linear regression.”

theta = [1 ; 1];

[J, grad] = linearRegCostFunction([ones(m, 1) X], y, theta, 1);
(the gradient should be about [-15.303016; 598.250744])

Note: in order to obtain the same result, the Gradient_descent function has to be modified a bit, since its output is the J cost after one full update step. In the Machine Learning exercise, linearRegCostFunction is not used with Alpha and does not take a number of iterations; the optimization is done by a separate function, trainLinearReg, which minimizes the J cost through MATLAB functions:

% [theta] = TRAINLINEARREG (X, y, lambda) trains linear regression using
% the dataset (X, y) and regularization parameter lambda. Returns the
% trained parameters theta.

% Initialize Theta
initial_theta = zeros(size(X, 2), 1);

% Create "short hand" for the cost function to be minimized
costFunction = @(t) linearRegCostFunction(X, y, t, lambda);

% Now, costFunction is a function that takes in only one argument
options = optimset('MaxIter', 200, 'GradObj', 'on');

% Minimize using fmincg
theta = fmincg(costFunction, initial_theta, options);
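
A rough R analogue of trainLinearReg could wrap the Cost_function defined above in optim. This is only a sketch: the name train_linear_reg and the choice of L-BFGS-B in place of fmincg are my assumptions, not part of the exercise.

# sketch of a trainLinearReg-like helper in R (name and use of L-BFGS-B are assumptions)
train_linear_reg <- function(X, y, Lambda = 0, max_iter = 200){
  initial_theta <- rep(0, ncol(X))
  optim(par = initial_theta,
        fn = Cost_function, # optim passes par to the Theta argument
        X = X, y = y, Lambda = Lambda,
        method = "L-BFGS-B",
        control = list(maxit = max_iter))$par
}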

Gradient_descent2 <- function(X, y, Theta, Alpha, nb_iteration, Lambda=0){
  Theta_grad <- Theta
  J <- Cost_function(X, y, Theta_grad, 0)
  df_learning <- data.frame(Iteration= integer(), J= integer()) #dataframe for learning rate
  
  for (i in 1 : nb_iteration){
    pred <- X %*% Theta_grad
    grad <- ((t(X) %*% (pred - y)) / m) + ((Lambda/m) * c(0, Theta_grad[2:length(Theta_grad)]))
    print(grad)
    Theta_grad <- Theta_grad - Alpha * grad
    
    J <- Cost_function(X, y, Theta_grad, Lambda)
    df_learning[i, ] <- c(i, J)
  }
  
  print(paste("local min J", J))
  assign("df_learning", df_learning, .GlobalEnv) #save df in order to do the plot
  return(Theta_grad) # note: nothing after return() is executed
} 

Gradient_descent2(X,y,c(1,1), 0,1,1)
##        [,1]
##   -15.30302
## X 598.25074
## [1] "local min J 303.993192220264"
##   [,1]
##      1
## X    1

Part 4: Train Linear Regression

Instead of using trainLinearReg, we will try to optimize J with our Gradient_descent function.

result <- Gradient_descent(X,y,c(1,1),0.001,2500,0)
## [1] "local min J 22.928107157438"
print(result)
##         [,1]
##   12.0186845
## X  0.3613667
plot(df_learning$Iteration, df_learning$J, 
     main = "Learning rate", 
     xlab = "Nb iterations",
     ylab = "J cost")

# let's compare the J cost with the normal equation solution
norm <-  Norm_equ(X,y,0)
print(Cost_function(X,y,norm,0))
##          [,1]
## [1,] 22.37391
# pretty close!
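
As an extra check (not in the original exercise), R's built-in lm() should recover the same unregularized coefficients; fit_lm is a name introduced here for illustration.

# extra sanity check (not part of the exercise): ordinary least squares with lm()
# should match the unregularized normal equation solution
fit_lm <- lm(y ~ X[, 2]) # lm() adds its own intercept; X[, 2] is the raw feature
print(coef(fit_lm))      # should be close to the values in norm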


# plot the fitted line
library(ggplot2) # ggplot2 is used for the charts from here on

plot_fct <- as.data.frame(cbind(X[,2],y))
colnames(plot_fct) <- c("X","y")
ggplot(plot_fct, aes(x= X, y= y))+
  geom_point()+
  geom_abline(slope= result[2], intercept = result[1], colour= "red", size=1.5)+
  ggtitle("Linear fit")

Part 5: Learning Curve for Linear Regression

“function [error_train, error_val] = learningCurve(X, y, Xval, yval, lambda)

LEARNINGCURVE Generates the train and cross validation set errors needed to plot a learning curve The function returns the train and cross validation set errors for a learning curve. In particular, it returns two vectors of the same length - error_train and error_val. Then, error_train(i) contains the training error for i examples (and similarly for error_val(i)).

In this function, you will compute the train and test errors for dataset sizes from 1 up to m. In practice, when working with larger datasets, you might want to do this in larger intervals.

Note:

  • You should evaluate the training error on the first i training examples (i.e., X(1:i, :) and y(1:i)). For the cross-validation error, you should instead evaluate on the entire cross validation set (Xval and yval).

  • If you are using your cost function (linearRegCostFunction) to compute the training and cross validation error, you should call the function with the lambda argument set to 0. Do note that you will still need to use lambda when running the training to obtain the theta parameters.

  • The gradient descent function I used previously had Alpha and nb_iteration as parameters, which I used to find the lowest J cost manually (by trial and error). However, in order to use R's optim function, we are going to create a gradient function for just one step, without Alpha (and without checking the cost). The optim function will do the minimization (see the sketch after the one-step gradient function below).

x_val <- cbind(1, x_val) 

Gradient_descent_optim <- function(X, y, Theta, Lambda=0){
  
  m <- nrow(X)
  pred <- X %*% Theta
  
  grad <- ((t(X) %*% (pred - y)) / m) +
    ((Lambda/m) * c(0, Theta[2:length(Theta)]))
  
  return(grad) 
} 
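
For example, this one-step gradient could be supplied to optim through its gr argument. This is only a sketch (theta_hat is a throwaway name); the learning_curve call below passes only the cost function and lets optim approximate the gradient numerically.

# sketch: supplying the analytic gradient to optim (not used in learning_curve below)
theta_hat <- optim(par = rep(0, ncol(X)),
                   fn = Cost_function,
                   gr = Gradient_descent_optim, # par is matched to the Theta argument
                   X = X, y = y, Lambda = 0,
                   method = "L-BFGS-B")$par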


learning_curve <- function(X, y, Xval, yval, lambda){
  m <- dim(X)[1]

  error_train <- vector(mode="numeric", length= m-1) # -1 because we start from 2 training examples
  error_val <- vector(mode="numeric", length= m-1)
  cost_train <- 0
  cost_cross <- 0 
  
  for (i in 2:m){
    # Train on the first i training examples to get Theta
    # (the exercise uses trainLinearReg; here optim minimizes Cost_function)
    Xi <- X[1:i,]
    Yi <- y[1:i]
    ##    theta_i <- Norm_equ(Xi, Yi, lambda)
    theta_i <- optim(par=c(rep(0,ncol(Xi))),
                     fn=Cost_function,
                     X=Xi,
                     y=Yi,
                     method="L-BFGS-B",
                     Lambda=lambda)$par
    
    # for the error calculation, the lambda has to be put at 0
    cost_train <- Cost_function(Xi, Yi, theta_i, 0)
    error_train[i-1] <- cost_train

    cost_cross <- Cost_function(Xval, yval, theta_i, 0)
    error_val[i-1] <- cost_cross

  }

  df_learning_result <- as.data.frame(cbind(2:m, error_train, error_val))
  colnames(df_learning_result)[1] <- "Training_examples"
  assign("df_learning_result", df_learning_result, .GlobalEnv) #save df in order to do the plot
  print("done, check df_learning_result")
}

learning_curve(X,y,x_val,y_val,0)
## [1] "done, check df_learning_result"
library(tidyr) # for gather()
df_learning_result <- gather(df_learning_result, Category, Value, -Training_examples)

ggplot(df_learning_result, aes(x= Training_examples, y=Value, colour=Category, group=Category))+
  geom_line(size=1.5)+
  ggtitle("Learning curve for linear regression")+
  xlab("Number of training examples")+
  ylab("error (J cost)")+
  theme(legend.position = "top")+
  scale_x_continuous(breaks = seq(0,dim(X)[1],1))

“You can observe that both the train error and cross validation error are high when the number of training examples is increased. This reflects a high bias problem in the model - the linear regression model is too simple and is unable to fit our dataset well. In the next section, you will implement polynomial regression to fit a better model for this dataset”

Part 6: Feature Mapping for Polynomial Regression

The polyFeatures function: given a vector X, return a matrix X_poly whose p-th column contains the values of X raised to the p-th power.

In our case, we will work with a vector of size m * 1, so we remove the column of ones from X.

X <- X[,2]
x_val <- x_val[,2]

polyFeatures <- function(X,p){
  X_poly <- matrix(nrow = length(X), ncol = p)
  
  for (i in 1:p){
    X_poly[ ,i] <- X ^ i
  }
  # assign() is the last expression, so the matrix is also returned (invisibly)
  assign("X_poly", X_poly, .GlobalEnv)
}
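
As an aside (not part of the exercise), base R's poly() with raw = TRUE builds the same matrix of raw powers; X_poly_alt is just an illustrative name and is not used below.

# equivalent with base R: columns X^1, ..., X^8 (an aside, not used below)
X_poly_alt <- poly(X, degree = 8, raw = TRUE)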


# Map X onto polynomial features and normalize
X_poly_train <- polyFeatures(X,8)
feat_norm_X_poly_train <- FeatScal_MeanNorm(X_poly_train) # normalize; mu and sigma are kept for later
X_poly_train <- cbind(1, feat_norm_X_poly_train$X_norm)   # add the column of ones

Part 7: Learning Curve for Polynomial Regression

“Now, you will get to experiment with polynomial regression with multiple values of lambda. The code below runs polynomial regression with lambda = 0. You should try running the code with different values of lambda to see how the fit and learning curve change”

Theta_poly <- Gradient_descent(X_poly_train, y, c(rep(0,9)),1.98,5000,0)
## [1] "local min J 0.218937482794448"
plot(df_learning)

fun_poly <- function(X){ 
  y_pred <- polyFeatures(X,8)
  y_pred <- sweep(y_pred, 2, feat_norm_X_poly_train$mu) # minus mu
  y_pred <- sweep(y_pred, 2, feat_norm_X_poly_train$sigma, FUN = "/") # div by sigma
  y_pred <- cbind(1,y_pred)
  y_pred <- y_pred %*% Theta_poly
  assign("y_pred", y_pred, .GlobalEnv)
}


ggplot(as.data.frame(cbind(X,y)), aes(x= X, y=y)) + 
  geom_point(size=2, colour= "blue") +
  stat_function(fun= fun_poly, xlim=c(-65,60), colour= "red",linetype="dashed") +
  ggtitle("Polynomial regression with lambda = 0 on Training examples")

#     Map X_poly_val and normalize (using mu and sigma)
X_poly_val <- polyFeatures(x_val,8)
X_poly_val <- sweep(X_poly_val, 2, feat_norm_X_poly_train$mu) # minus mu
X_poly_val <- sweep(X_poly_val, 2, feat_norm_X_poly_train$sigma, FUN = "/")
X_poly_val <- cbind(1,X_poly_val)


# Learning curve
learning_curve(X_poly_train,y,X_poly_val,y_val,0) 
## [1] "done, check df_learning_result"
df_learning_result <- gather(df_learning_result, Category, Value, -Training_examples)

ggplot(df_learning_result, aes(x= Training_examples, y=Value, colour=Category, group=Category))+
  geom_line(size=1.5)+
  ggtitle("Learning curve for linear regression")+
  xlab("Number of training examples")+
  ylab("error (J cost)")+
  theme(legend.position = "top")+
  scale_x_continuous(breaks = seq(0,dim(X_poly_train)[1],1))

Part 8: Adjusting the Regularization Parameter for Polynomial Regression

“In this section, you will get to observe how the regularization parameter affects the bias-variance of regularized polynomial regression. You should now modify the lambda parameter in the ex5.m and try lambda = 1, 100. For each of these values, the script should generate a polynomial fit to the data and also a learning curve.”

# With lambda = 1
learning_curve(X_poly_train,y,X_poly_val,y_val,1) 
## [1] "done, check df_learning_result"
df_learning_result <- gather(df_learning_result, Category, Value, -Training_examples)

# polynomial learning curve
ggplot(df_learning_result, aes(x= Training_examples, y=Value, colour=Category, group=Category))+
  geom_line(size=1.5)+
  ggtitle("Learning curve for linear regression with lambda = 1")+
  xlab("Number of training examples")+
  ylab("error (J cost)")+
  theme(legend.position = "top")+
  scale_x_continuous(breaks = seq(0,dim(X_poly_train)[1],1))

#Theta_poly <- Gradient_descent(X_poly_train, y, c(rep(0,9)),0.001,5000,1)
#Theta_poly <- Norm_equ(X_poly_train,y,1)
Theta_poly <- optim(par=c(rep(0,ncol(X_poly_train))),
                    fn=Cost_function,
                    X=X_poly_train,
                    y=y,
                    method="L-BFGS-B",
                    Lambda=1)$par

# polynomial fit chart
ggplot(as.data.frame(cbind(X,y)), aes(x= X, y=y)) + 
  geom_point(size=2, colour= "blue") +
  stat_function(fun= fun_poly, xlim=c(-80,80), colour= "red",linetype="dashed") +
  ggtitle("Polynomial regression with lambda = 1 on Training examples")+
  scale_y_continuous(breaks= seq(-60,40,20),
                         limits = c(-60, 40))
## Warning: Removed 22 rows containing missing values (geom_path).

# With lambda = 100
#Theta_poly <- Norm_equ(X_poly_train,y,100)
Theta_poly <- optim(par=c(rep(0,ncol(X_poly_train))),
                    fn=Cost_function,
                    X=X_poly_train,
                    y=y,
                    method="L-BFGS-B",
                    Lambda=100)$par


# polynomial fit chart
ggplot(as.data.frame(cbind(X,y)), aes(x= X, y=y)) + 
  geom_point(size=2, colour= "blue") +
  stat_function(fun= fun_poly, xlim=c(-80,80), colour= "red",linetype="dashed") +
  ggtitle("Polynomial regression with lambda = 100 on Training examples")+
  scale_y_continuous(breaks= seq(-10,40,10),
                         limits = c(-10, 40))

Note: at this point, we have some differences with the course exercise, particularly when lambda is nonzero. The open question is how best to optimize the cost function with R's optim; however, even when testing Norm_equ and the gradient descent with regularization, the J cost comes out different. To be investigated.

The rest of the exercise is:

“From the previous parts of the exercise, you observed that the value of lambda can significantly affect the results of regularized polynomial regression on the training and cross validation set. In particular, a model without regularization (lambda = 0) fits the training set well, but does not generalize. Conversely, a model with too much regularization (lambda = 100) does not fit the training set and testing set well. A good choice of lambda (e.g., lambda = 1) can provide a good fit to the data.”

“You will use a cross validation set to evaluate how good each lambda value is. After selecting the best lambda value using the cross validation set, we can then evaluate the model on the test set to estimate how well the model will perform on actual unseen data.”
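
As a possible next step (a sketch only, not run in this write-up; the lambda grid below is an assumption), the validation-curve idea described above could reuse Cost_function and optim on the polynomial training and validation sets:

# sketch of the validation curve described above (the lambda grid is an assumption)
lambda_vec <- c(0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10)
error_train_lambda <- numeric(length(lambda_vec))
error_val_lambda <- numeric(length(lambda_vec))

for (k in seq_along(lambda_vec)){
  # train on the polynomial training set with the current lambda
  theta_k <- optim(par = rep(0, ncol(X_poly_train)),
                   fn = Cost_function,
                   X = X_poly_train, y = y,
                   Lambda = lambda_vec[k],
                   method = "L-BFGS-B")$par
  # errors are evaluated with Lambda = 0, as in the learning curve
  error_train_lambda[k] <- Cost_function(X_poly_train, y, theta_k, 0)
  error_val_lambda[k]   <- Cost_function(X_poly_val, y_val, theta_k, 0)
}
# the lambda with the smallest error_val_lambda would then be checked on the test set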