Summary

Linear regression implemented in R. The code comes from the Machine Learning course by Andrew Ng on Coursera. It was originally written in Matlab/Octave as part of the weekly exercises; this is a transposition to R.

Hypothesis function

Hypothesis based on the linear model y = Theta0 + Theta1 * X

# Predictions of the linear model for a vector X
Hypothesis_function <- function(X, Theta0, Theta1){
  Hypothesis <- Theta0 + (Theta1 * X)
  return(Hypothesis)
}
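
For example, with Theta0 = 1 and Theta1 = 2, the hypothesis for X = c(1, 2, 3) is 3, 5, 7:

Hypothesis_function(c(1, 2, 3), Theta0 = 1, Theta1 = 2)
## [1] 3 5 7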

Cost function
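
The cost is the squared error averaged over the m training examples: J(Theta0, Theta1) = (1 / (2 * m)) * sum((h(x_i) - y_i)^2)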

Cost_function <- function(X, y, Theta0, Theta1){
  J <- 0
  m <- length(X) # number of training examples
  Hypothesis <- Hypothesis_function(X, Theta0, Theta1)
  
  # Sum the squared errors over the m examples
  for(i in 1 : m){
    Jcompute <- (1 / (2 * m)) * ((Hypothesis[i] - y[i])^2)
    J <- J + Jcompute
  }
  
  return(J)
}
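
The loop mirrors the original Octave code. In R the same cost can also be computed in a vectorized way; a minimal equivalent sketch (not used by the rest of the code):

# Vectorized equivalent of Cost_function
Cost_function_vectorized <- function(X, y, Theta0, Theta1){
  m <- length(X)
  Hypothesis <- Hypothesis_function(X, Theta0, Theta1)
  return((1 / (2 * m)) * sum((Hypothesis - y)^2))
}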

Gradient descent

Repeat until convergence
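
At each step both Thetas are updated simultaneously with Theta_j := Theta_j - Alpha * dJ/dTheta_j, which for this model gives:

Temp0 = Theta0 - Alpha * (1/m) * sum(h(x_i) - y_i)
Temp1 = Theta1 - Alpha * (1/m) * sum((h(x_i) - y_i) * x_i)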

Gradient_descent <- function(X, y, Alpha, Theta0, Theta1, nb_iteration){
  m <- length(X)
  J <- Cost_function(X, y, Theta0, Theta1)
  print(paste("first J", J))
  
  df_learning <- data.frame(Iteration = integer(), J = numeric()) # dataframe for the learning rate plot
  
  for(i in 1 : nb_iteration){
    Hypothesis <- Hypothesis_function(X, Theta0, Theta1)
    
    # Simultaneous update: both Temps are computed before the Thetas change
    Temp0 <- Theta0 - Alpha * ((1/m) * sum(Hypothesis - y))
    Temp1 <- Theta1 - Alpha * ((1/m) * sum((Hypothesis - y) * X))
    
    Theta0 <- Temp0
    Theta1 <- Temp1
    
    J <- Cost_function(X, y, Theta0, Theta1)
    
    df_learning[i, ] <- c(i, J) # checking that J decreases, i.e. the gradient descent works correctly
  }
  
  print(paste("Local min J", J))
  print(paste("Theta0", Theta0))
  print(paste("Theta1", Theta1))
  assign("df_learning", df_learning, .GlobalEnv) # used for the learning rate plot below
}
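
The function above runs for a fixed nb_iteration. A minimal "repeat until convergence" sketch, assuming an arbitrary tolerance on the decrease of J (the tolerance value and the function name are not part of the original code):

Gradient_descent_convergence <- function(X, y, Alpha, Theta0, Theta1, tolerance = 1e-9){
  m <- length(X)
  J_old <- Cost_function(X, y, Theta0, Theta1)
  repeat{
    Hypothesis <- Hypothesis_function(X, Theta0, Theta1)
    Temp0 <- Theta0 - Alpha * ((1/m) * sum(Hypothesis - y))
    Temp1 <- Theta1 - Alpha * ((1/m) * sum((Hypothesis - y) * X))
    Theta0 <- Temp0
    Theta1 <- Temp1
    J_new <- Cost_function(X, y, Theta0, Theta1)
    if(J_old - J_new < tolerance) break # stop when J barely decreases anymore
    J_old <- J_new
  }
  return(c(Theta0 = Theta0, Theta1 = Theta1, J = J_new))
}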

Learning rate plot

Compare the plots for Alpha = 0.001, 0.01, 0.1 and 1, then keep the one for which J decreases rapidly.

“We recommend testing alphas at a rate of 3 times the next smallest value (i.e. 0.01, 0.03, 0.1, 0.3 and so on).”

ggplot(df_learning, aes(x = Iteration, y = J)) + geom_point(size = 2, alpha = 0.5, colour = "black")
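
To compare several alphas in one figure, one possible sketch (an assumption, not the original code: df_learning is overwritten on each run, so the curves are collected first, and X and y stand for the feature and target vectors):

library(ggplot2)

curves <- data.frame()
for(a in c(0.001, 0.003, 0.01, 0.03)){
  Gradient_descent(X, y, a, 0, 0, 1500) # writes df_learning to the global environment
  df_learning$Alpha <- factor(a)
  curves <- rbind(curves, df_learning)
}
ggplot(curves, aes(x = Iteration, y = J, colour = Alpha)) + geom_line()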

Testing

Data found at: http://people.sc.fsu.edu/~jburkardt/datasets/regression/x01.txt

“The data records the average weight of the brain and body for a number of mammal species.”

“There are 62 rows of data. The 3 data columns include: I, the index; A, the brain weight; B, the body weight.”

datasource <- read.csv("C:/Users/Marc/Desktop/data_R/brain.txt", stringsAsFactors = FALSE)


ggplot(datasource, aes(Brain.Weight, Body.Weight))+
  geom_point(size= 2, colour= "black", alpha=0.2)+
  ggtitle("Scatter plot : brain weight and body weight")

#plot with the outliers removed
ggplot(datasource, aes(Brain.Weight, Body.Weight))+
  geom_point(size= 2, colour= "black", alpha=0.2)+
  scale_y_continuous(limits = c(0, 700))+
  scale_x_continuous(limits = c(0, 700))+
  ggtitle("Brain weight and body weight with outliers removed")
## Warning: Removed 3 rows containing missing values (geom_point).

cor(datasource$Brain.Weight, datasource$Body.Weight)
## [1] 0.9341638
# function lm() result
lm(datasource$Brain.Weight ~ datasource$Body.Weight)
## 
## Call:
## lm(formula = datasource$Brain.Weight ~ datasource$Body.Weight)
## 
## Coefficients:
##            (Intercept)  datasource$Body.Weight  
##               -56.8555                  0.9029
Gradient_descent(datasource$Body.Weight, datasource$Brain.Weight, 0.00000001, 0, 0, 1500)
## [1] "first J 417481.251428686"
## [1] "Local min J 52122.3288988815"
## [1] "Theta0 -0.000510289788118951"
## [1] "Theta1 0.885633172377596"
# Learning rate
ggplot(df_learning, aes(x= Iteration , y = J))+
  geom_point(size= 2, alpha= 0.3, colour= "black")+
  ggtitle("Learning rate")

ggplot(datasource, aes(x= Body.Weight, y = Brain.Weight))+
  geom_point(alpha= 0.5, size= 2)+
  geom_smooth(method = "lm", se= FALSE, colour= "red")+
  geom_abline(slope = 0.885633172377596, intercept = -0.000510289788118951, colour= "blue")+
  ggtitle("R lm function (red) vs our Theta finding (blue)")