Summary

The first programming exercise (linear regression) from Andrew Ng's Machine Learning course on Coursera, reimplemented in R.

Linear regression with one variable

Plotting the Data

library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
library(readr)

data <- read_csv("~/R_projects/171028_exo_Ng_1/ex1data1.txt", col_names = FALSE)
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   X2 = col_double()
## )
X <- as.matrix(data[,1])
y <- as.matrix(data[,2])
m <- nrow(X) # number of training examples

plot(X,y)

Gradient Descent

# initializing some parameters

X <- cbind(1, X)   # add a column of ones to X (intercept term)
theta <- rep(0, 2) # initialize fitting parameters

iterations <- 1500
alpha <- 0.01 
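
For reference, the cost implemented below is the regularized least-squares cost from the course (set lambda to 0 for the plain version); in vectorized form:

$$
J(\theta) = \frac{1}{2m}\,(X\theta - y)^{T}(X\theta - y) + \frac{\lambda}{2m}\sum_{j=1}^{n}\theta_j^{2}
$$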


Cost_function <- function(X, y, Theta, Lambda = 0){
  m <- dim(X)[1]
  # Gradient_descent() below relies on this global m, so keep the assignment
  assign("m", m, .GlobalEnv)

  # unregularized cost
  J <- (1 / (2 * m)) * t((X %*% Theta) - y) %*% ((X %*% Theta) - y)

  # regularization term; set Lambda to 0 if it is not needed
  Reg <- (Lambda / (2 * m)) * sum(Theta[2:length(Theta)]^2)
  J <- J + Reg
  invisible(J)
}

print(Cost_function(X,y,theta,0))
##          X2
## X2 32.07273
# expected cost value (approx): 32.07

theta <-  c(-1,2)
print(Cost_function(X,y,theta,0))
##          X2
## X2 54.24246
# expected cost value (approx): 54.24
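
Each iteration of the function below performs the vectorized (batch) gradient descent update, with the intercept excluded from the regularization term:

$$
\theta := \theta - \alpha\left(\frac{1}{m}X^{T}(X\theta - y) + \frac{\lambda}{m}\,\tilde{\theta}\right),
\qquad \tilde{\theta} = (0, \theta_1, \dots, \theta_n)^{T}
$$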

Gradient_descent <- function(X, y, Theta, Alpha, nb_iteration, Lambda = 0){
  Theta_grad <- Theta
  J <- Cost_function(X, y, Theta_grad, Lambda) # also sets the global m used in the update below
  df_learning <- data.frame(Iteration = integer(), J = numeric()) # cost history (learning curve)
  
  for (i in 1:nb_iteration){
    pred <- X %*% Theta_grad
    grad <- ((t(X) %*% (pred - y)) / m) + ((Lambda / m) * c(0, Theta_grad[2:length(Theta_grad)]))
    
    Theta_grad <- Theta_grad - Alpha * grad
    
    J <- Cost_function(X, y, Theta_grad, Lambda)
    df_learning[i, ] <- c(i, J)
  }
  
  print(paste("local min J", J))
  assign("df_learning", df_learning, .GlobalEnv) # save the cost history globally so it can be plotted later
  return(Theta_grad) # note: the function exits here
} 

theta <- Gradient_descent(X, y, c(0,0), alpha, iterations, 0)
## [1] "local min J 4.48338825658772"
print(theta)
##           X2
##    -3.630291
## X1  1.166362
# expected theta values (approx): -3.6303, 1.1664


# plot the linear fit on the training data
plot_fct <- as.data.frame(cbind(X[,2], y))
colnames(plot_fct) <- c("X", "y")
ggplot(plot_fct, aes(x = X, y = y)) +
  geom_point() +
  geom_abline(slope = theta[2], intercept = theta[1], colour = "red", size = 1.5) +
  ggtitle("Linear fit on the training data") +
  xlab("Population of city (in 10,000s)") +
  ylab("Profit (in $10,000s)")

#Predict values for population sizes of 35,000 and 70,000
predict1 <- c(1, 3.5) %*% theta

# for a population of 35,000, the predicted profit is:
  
predict1 * 10000
##            X2
## [1,] 4519.768
predict2 <- c(1, 7) %*% theta

# for a population of 70,000, the predicted profit is:
predict2 * 10000
##            X2
## [1,] 45342.45
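
As a quick sanity check (not part of the original exercise), R's built-in lm() gives the exact least-squares fit on the same data; its coefficients should be close to the theta found by gradient descent, which would match them exactly only after many more iterations.

# exact least-squares fit on the training data, using the plot_fct data frame built above
fit_lm <- lm(y ~ X, data = plot_fct)
coef(fit_lm)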

Linear regression with multiple variables

Feature Normalization

FeatScal_MeanNorm <- function(X){
  
  X_norm <- X
  mu <- vector(mode = "numeric", length = ncol(X))
  sigma <- vector(mode = "numeric", length = ncol(X)) # note: sigma here holds the range (max - min), not the standard deviation
  

  for(i in 1 : ncol(X)){
    avg <- mean(X[ ,i])
    S <- max(X[,i]) - min(X[,i])
    X_norm[ ,i] <- (X[ ,i] - avg) / S
    # keep the mean and the range so that new inputs can be normalized the same way
    mu[i] <- avg
    sigma[i] <- S
  }
  feat_norm <- list("mu" = mu, "sigma" = sigma, "X_norm" = X_norm)
  
  assign("feat_norm", feat_norm, .GlobalEnv) # also kept globally for the prediction step below
  return(feat_norm)
}
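
Note that this function scales each feature by its range (max - min) and stores that range in sigma. A common alternative is z-score normalization, dividing by the standard deviation instead; a minimal sketch of that variant (not used in the rest of this post) with base R's scale():

# hypothetical alternative: z-score normalization via scale(), which centres each
# column by its mean and divides by its standard deviation
ZScore_Norm <- function(X){
  X_scaled <- scale(X)
  list(mu     = attr(X_scaled, "scaled:center"),
       sigma  = attr(X_scaled, "scaled:scale"),
       X_norm = X_scaled)
}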

Gradient Descent

data <- read_csv("~/R_projects/171028_exo_Ng_1/ex1data2.txt", col_names = FALSE)
## Parsed with column specification:
## cols(
##   X1 = col_integer(),
##   X2 = col_integer(),
##   X3 = col_integer()
## )
X <- as.matrix(data[,1:2])
y <- as.matrix(data[,3])
m <- nrow(X) # number of training examples (length(X) would count every element of the matrix)

data_feat <- FeatScal_MeanNorm(X)
X_feat <- cbind(1, data_feat$X_norm)

# initial parameters
alpha <- 0.01
num_iters <- 400
theta <- rep(0,3)

theta <- Gradient_descent(X_feat,y,theta, alpha, num_iters,0)
## [1] "local min J 5675290099.09578"
#Plot the convergence graph
plot(df_learning$Iteration, df_learning$J)

# print theta we found
print(theta)
##           X3
##    334302.06
## X1  82063.90
## X2  34705.25

Trying other values of the learning rate alpha:

“We recommend trying values of the learning rate alpha on a log-scale, at multiplicative steps of about 3 times the previous value (i.e., 0.3, 0.1, 0.03, 0.01 and so on)”
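
The blocks below try these values one at a time. A more compact alternative (a sketch, not how the results below were produced) is to loop over the candidate rates and stack the cost histories that Gradient_descent stores in df_learning:

# run 50 iterations for each candidate rate and combine the learning curves
alphas <- c(0.01, 0.03, 0.1, 0.3, 0.9)
histories <- lapply(alphas, function(a){
  Gradient_descent(X_feat, y, rep(0, 3), a, 50, 0)
  transform(df_learning, alpha = a)   # tag each history with its learning rate
})
histories <- do.call(rbind, histories)

ggplot(histories, aes(x = Iteration, y = J, colour = factor(alpha))) +
  geom_line() +
  labs(colour = "alpha", title = "Convergence for different learning rates")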

# alpha 0.03
theta <- Gradient_descent(X_feat,y,rep(0,3), 0.03, 50,0)
## [1] "local min J 9540741206.79847"
#Plot the convergence graph
plot(df_learning$Iteration, df_learning$J)

# alpha 0.1
theta <- Gradient_descent(X_feat,y,rep(0,3), 0.1, 50,0)
## [1] "local min J 5290031953.99697"
#Plot the convergence graph
plot(df_learning$Iteration, df_learning$J)

# alpha 0.3
theta <- Gradient_descent(X_feat,y,rep(0,3), 0.3, 50,0)
## [1] "local min J 3280971188.25037"
#Plot the convergence graph
plot(df_learning$Iteration, df_learning$J)

# alpha 0.9
theta <- Gradient_descent(X_feat,y,rep(0,3), 0.9, 50,0)
## [1] "local min J 2257414883.18589"
#Plot the convergence graph
plot(df_learning$Iteration, df_learning$J)

#We will use an alpha of 0.9
theta <- Gradient_descent(X_feat,y,rep(0,3), 0.9, 500,0)
## [1] "local min J 2043280163.01377"
#Predicted price of a 1650 sq-ft, 3 br house
predict_feat <- c((1650- feat_norm$mu[1]) / feat_norm$sigma[1], (3- feat_norm$mu[2]) / feat_norm$sigma[2])

cbind(1,t(predict_feat)) %*% theta
##            X3
## [1,] 293084.4

Normal Equation
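
For reference, the function below computes the closed-form normal-equation solution; when Lambda > 0 it uses the regularized variant, where L is the identity matrix with its first diagonal entry set to 0 so that the intercept is not penalized:

$$
\theta = (X^{T}X)^{-1}X^{T}y
\qquad\text{and, with regularization,}\qquad
\theta = (X^{T}X + \lambda L)^{-1}X^{T}y
$$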

# The normal equation does not require any feature scaling; however, we still
# need to add a column of ones to the X matrix for the intercept term (Theta0).
Norm_equ <- function(X, y, Lambda = 0){
  library(MASS) # for the ginv function
  
  # identity matrix with a zero in the intercept position, used when Lambda > 0
  n <- dim(X)[2]
  Mat <- diag(n)
  Mat[1, 1] <- 0
  
  if (Lambda == 0) {
    Theta_grad <- solve(t(X) %*% X) %*% t(X) %*% y # solve instead of ginv, see note below
  } else {
    Theta_grad <- ginv(t(X) %*% X + Lambda * Mat) %*% t(X) %*% y
  }
  
  assign("Theta_grad", Theta_grad, .GlobalEnv)      
  return(Theta_grad) # note: the function exits here
  
} 
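
The Lambda argument is never used in this post; purely as an illustration (results not shown), a regularized call would look like this:

# hypothetical call with regularization; the ginv branch is taken when Lambda > 0
theta_reg <- Norm_equ(cbind(1, X), y, Lambda = 1)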


# prediction
c(1,1650,3) %*% Norm_equ(cbind(1,X),y)
## Warning: package 'MASS' was built under R version 3.3.3
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
##            X3
## [1,] 293081.5

When my Norm_equ function used ginv to invert the matrix, the final result was noticeably off, around 284,000. I had to use the solve function instead to get the correct answer.

  • solve(A) Inverse of A where A is a square matrix.
  • ginv(A) Moore-Penrose Generalized Inverse of A.
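
MASS::ginv builds the Moore-Penrose pseudo-inverse from a singular value decomposition and drops singular values below a tolerance, so on the poorly scaled t(X) %*% X used here (raw features spanning several orders of magnitude) it can differ noticeably from solve. A small check, assuming MASS is still attached:

# on a small, well-conditioned matrix the two inverses agree
B <- matrix(c(2, 1, 1, 3), 2, 2)
all.equal(solve(B), ginv(B))          # should be TRUE

# how far apart are they on the matrix actually inverted above?
A <- t(cbind(1, X)) %*% cbind(1, X)   # X is the raw (unscaled) feature matrix at this point
max(abs(solve(A) - ginv(A)))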