Summary

The first programming exercise (linear regression) from Andrew Ng's Machine Learning course on Coursera, reimplemented in R.

Linear regression with one variable

Plotting the Data

library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
library(readr)

data <- read_csv("~/R_projects/171028_exo_Ng_1/ex1data1.txt", col_names = FALSE)
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   X2 = col_double()
## )
X <- as.matrix(data[,1])
y <- as.matrix(data[,2])
m <- nrow(X) # number of training examples

plot(X,y)

Gradient Descent

# initializing some parameters

X <- cbind(1, X)   # add a column of ones to X (intercept term)
theta <- rep(0, 2) # initialize fitting parameters

iterations <- 1500
alpha <- 0.01 
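
For reference, the cost implemented below is the regularized least-squares cost from the course (set lambda to 0 for the plain version); in vectorized form:

$$
J(\theta) = \frac{1}{2m}\,(X\theta - y)^{T}(X\theta - y) + \frac{\lambda}{2m}\sum_{j=1}^{n}\theta_j^{2}
$$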


Cost_function <- function(X, y, Theta, Lambda = 0){
  m <- dim(X)[1]
  # Gradient_descent() below relies on this global m, so keep the assignment
  assign("m", m, .GlobalEnv)

  # unregularized cost
  J <- (1 / (2 * m)) * t((X %*% Theta) - y) %*% ((X %*% Theta) - y)

  # regularization term; set Lambda to 0 if it is not needed
  Reg <- (Lambda / (2 * m)) * sum(Theta[2:length(Theta)]^2)
  J <- J + Reg
  invisible(J)
}

print(Cost_function(X,y,theta,0))
##          X2
## X2 32.07273
# expected cost value (approx): 32.07

theta <-  c(-1,2)
print(Cost_function(X,y,theta,0))
##          X2
## X2 54.24246
# expected cost value (approx): 54.24
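
Each iteration of the function below performs the vectorized (batch) gradient descent update, with the intercept excluded from the regularization term:

$$
\theta := \theta - \alpha\left(\frac{1}{m}X^{T}(X\theta - y) + \frac{\lambda}{m}\,\tilde{\theta}\right),
\qquad \tilde{\theta} = (0, \theta_1, \dots, \theta_n)^{T}
$$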

Gradient_descent <- function(X, y, Theta, Alpha, nb_iteration, Lambda = 0){
  Theta_grad <- Theta
  J <- Cost_function(X, y, Theta_grad, Lambda) # also sets the global m used in the update below
  df_learning <- data.frame(Iteration = integer(), J = numeric()) # cost history (learning curve)
  
  for (i in 1:nb_iteration){
    pred <- X %*% Theta_grad
    grad <- ((t(X) %*% (pred - y)) / m) + ((Lambda / m) * c(0, Theta_grad[2:length(Theta_grad)]))
    
    Theta_grad <- Theta_grad - Alpha * grad
    
    J <- Cost_function(X, y, Theta_grad, Lambda)
    df_learning[i, ] <- c(i, J)
  }
  
  print(paste("local min J", J))
  assign("df_learning", df_learning, .GlobalEnv) # save the cost history globally so it can be plotted later
  return(Theta_grad) # note: the function exits here
} 

theta <- Gradient_descent(X, y, c(0,0), alpha, iterations, 0)
## [1] "local min J 4.48338825658772"
print(theta)
##           X2
##    -3.630291
## X1  1.166362
# expected theta values (approx): -3.6303, 1.1664


# plot the linear fit on the training data
plot_fct <- as.data.frame(cbind(X[,2], y))
colnames(plot_fct) <- c("X", "y")
ggplot(plot_fct, aes(x = X, y = y)) +
  geom_point() +
  geom_abline(slope = theta[2], intercept = theta[1], colour = "red", size = 1.5) +
  ggtitle("Linear fit on the training data") +
  xlab("Population of city (in 10,000s)") +
  ylab("Profit (in $10,000s)")

#Predict values for population sizes of 35,000 and 70,000
predict1 <- c(1, 3.5) %*% theta

# for a population of 35,000, the predicted profit is:
  
predict1 * 10000
##            X2
## [1,] 4519.768
predict2 <- c(1, 7) %*% theta

# for a population of 70,000, the predicted profit is:
predict2 * 10000
##            X2
## [1,] 45342.45
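
As a quick sanity check (not part of the original exercise), R's built-in lm() gives the exact least-squares fit on the same data; its coefficients should be close to the theta found by gradient descent, which would match them exactly only after many more iterations.

# exact least-squares fit on the training data, using the plot_fct data frame built above
fit_lm <- lm(y ~ X, data = plot_fct)
coef(fit_lm)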

Linear regression with multiple variables

Feature Normalization

FeatScal_MeanNorm <- function(X){
  
  X_norm <- X
  mu <- vector(mode = "numeric", length = ncol(X))
  sigma <- vector(mode = "numeric", length = ncol(X)) # note: sigma here holds the range (max - min), not the standard deviation
  

  for(i in 1 : ncol(X)){
    avg <- mean(X[ ,i])
    S <- max(X[,i]) - min(X[,i])
    X_norm[ ,i] <- (X[ ,i] - avg) / S
    # keep the mean and the range so that new inputs can be normalized the same way
    mu[i] <- avg
    sigma[i] <- S
  }
  feat_norm <- list("mu" = mu, "sigma" = sigma, "X_norm" = X_norm)
  
  assign("feat_norm", feat_norm, .GlobalEnv) # also kept globally for the prediction step below
  return(feat_norm)
}
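
Note that this function scales each feature by its range (max - min) and stores that range in sigma. A common alternative is z-score normalization, dividing by the standard deviation instead; a minimal sketch of that variant (not used in the rest of this post) with base R's scale():

# hypothetical alternative: z-score normalization via scale(), which centres each
# column by its mean and divides by its standard deviation
ZScore_Norm <- function(X){
  X_scaled <- scale(X)
  list(mu     = attr(X_scaled, "scaled:center"),
       sigma  = attr(X_scaled, "scaled:scale"),
       X_norm = X_scaled)
}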

Gradient Descent

data <- read_csv("~/R_projects/171028_exo_Ng_1/ex1data2.txt", col_names = FALSE)
## Parsed with column specification:
## cols(
##   X1 = col_integer(),
##   X2 = col_integer(),
##   X3 = col_integer()
## )
X <- as.matrix(data[,1:2])
y <- as.matrix(data[,3])
m <- nrow(X) # number of training examples (length(X) would count every element of the matrix)

data_feat <- FeatScal_MeanNorm(X)
X_feat <- cbind(1, data_feat$X_norm)

# initial parameters
alpha <- 0.01
num_iters <- 400
theta <- rep(0,3)

theta <- Gradient_descent(X_feat,y,theta, alpha, num_iters,0)
## [1] "local min J 5675290099.09578"
#Plot the convergence graph
plot(df_learning$Iteration, df_learning$J)

# print theta we found
print(theta)
##           X3
##    334302.06
## X1  82063.90
## X2  34705.25

Trying other values of the learning rate alpha:

“We recommend trying values of the learning rate alpha on a log-scale, at multiplicative steps of about 3 times the previous value (i.e., 0.3, 0.1, 0.03, 0.01 and so on)”
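
The blocks below try these values one at a time. A more compact alternative (a sketch, not how the results below were produced) is to loop over the candidate rates and stack the cost histories that Gradient_descent stores in df_learning:

# run 50 iterations for each candidate rate and combine the learning curves
alphas <- c(0.01, 0.03, 0.1, 0.3, 0.9)
histories <- lapply(alphas, function(a){
  Gradient_descent(X_feat, y, rep(0, 3), a, 50, 0)
  transform(df_learning, alpha = a)   # tag each history with its learning rate
})
histories <- do.call(rbind, histories)

ggplot(histories, aes(x = Iteration, y = J, colour = factor(alpha))) +
  geom_line() +
  labs(colour = "alpha", title = "Convergence for different learning rates")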

# alpha 0.03
theta <- Gradient_descent(X_feat,y,rep(0,3), 0.03, 50,0)
## [1] "local min J 9540741206.79847"
#Plot the convergence graph
plot(df_learning$Iteration, df_learning$J)

# alpha 0.1
theta <- Gradient_descent(X_feat,y,rep(0,3), 0.1, 50,0)
## [1] "local min J 5290031953.99697"
#Plot the convergence graph
plot(df_learning$Iteration, df_learning$J)

# alpha 0.3
theta <- Gradient_descent(X_feat,y,rep(0,3), 0.3, 50,0)
## [1] "local min J 3280971188.25037"
#Plot the convergence graph
plot(df_learning$Iteration, df_learning$J)

# alpha 0.9
theta <- Gradient_descent(X_feat,y,rep(0,3), 0.9, 50,0)
## [1] "local min J 2257414883.18589"
#Plot the convergence graph
plot(df_learning$Iteration, df_learning$J)

#We will use an alpha of 0.9
theta <- Gradient_descent(X_feat,y,rep(0,3), 0.9, 500,0)
## [1] "local min J 2043280163.01377"
#Predicted price of a 1650 sq-ft, 3 br house
predict_feat <- c((1650- feat_norm$mu[1]) / feat_norm$sigma[1], (3- feat_norm$mu[2]) / feat_norm$sigma[2])

cbind(1,t(predict_feat)) %*% theta
##            X3
## [1,] 293084.4

Normal Equation
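
For reference, the function below computes the closed-form normal-equation solution; when Lambda > 0 it uses the regularized variant, where L is the identity matrix with its first diagonal entry set to 0 so that the intercept is not penalized:

$$
\theta = (X^{T}X)^{-1}X^{T}y
\qquad\text{and, with regularization,}\qquad
\theta = (X^{T}X + \lambda L)^{-1}X^{T}y
$$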

# The normal equation does not require any feature scaling; however, we still
# need to add a column of ones to the X matrix for the intercept term (Theta0).
Norm_equ <- function(X, y, Lambda = 0){
  library(MASS) # for the ginv function
  
  # identity matrix with a zero in the intercept position, used when Lambda > 0
  n <- dim(X)[2]
  Mat <- diag(n)
  Mat[1, 1] <- 0
  
  if (Lambda == 0) {
    Theta_grad <- solve(t(X) %*% X) %*% t(X) %*% y # solve instead of ginv, see note below
  } else {
    Theta_grad <- ginv(t(X) %*% X + Lambda * Mat) %*% t(X) %*% y
  }
  
  assign("Theta_grad", Theta_grad, .GlobalEnv)      
  return(Theta_grad) # note: the function exits here
  
} 
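
The Lambda argument is never used in this post; purely as an illustration (results not shown), a regularized call would look like this:

# hypothetical call with regularization; the ginv branch is taken when Lambda > 0
theta_reg <- Norm_equ(cbind(1, X), y, Lambda = 1)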


# prediction
c(1,1650,3) %*% Norm_equ(cbind(1,X),y)
## Warning: package 'MASS' was built under R version 3.3.3
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
##            X3
## [1,] 293081.5

When my Norm_equ function used ginv to invert the matrix, the final result was noticeably off, around 284,000. I had to use the solve function instead to get the correct answer.

  • solve(A) Inverse of A where A is a square matrix.
  • ginv(A) Moore-Penrose Generalized Inverse of A.
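
MASS::ginv builds the Moore-Penrose pseudo-inverse from a singular value decomposition and drops singular values below a tolerance, so on the poorly scaled t(X) %*% X used here (raw features spanning several orders of magnitude) it can differ noticeably from solve. A small check, assuming MASS is still attached:

# on a small, well-conditioned matrix the two inverses agree
B <- matrix(c(2, 1, 1, 3), 2, 2)
all.equal(solve(B), ginv(B))          # should be TRUE

# how far apart are they on the matrix actually inverted above?
A <- t(cbind(1, X)) %*% cbind(1, X)   # X is the raw (unscaled) feature matrix at this point
max(abs(solve(A) - ginv(A)))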