Linear regression implemented in R. The code comes from Andrew Ng's Machine Learning course on Coursera, where the weekly exercise was originally done in Matlab/Octave. This is a transposition into R.
The hypothesis is based on the model y = Theta0 + Theta1 * X.
Hypothesis_function <- function(X, Theta0, Theta1){
  Hypothesis <- Theta0 + (Theta1 * X)
  return(Hypothesis)
}
Cost_function <- function(X, y, Theta0, Theta1){
  J <- 0
  m <- length(X)  # number of training examples
  assign("m", m, .GlobalEnv)  # m is reused by Gradient_descent
  Hypothesis <- Hypothesis_function(X, Theta0, Theta1)
  assign("Hypothesis", Hypothesis, .GlobalEnv)  # also reused by Gradient_descent
  for(i in 1 : m){
    Jcompute <- (1 / (2 * m)) * ((Hypothesis[i] - y[i])^2)
    J <- J + Jcompute
  }
  return(J)
}
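Since R is vectorised, the loop over the training examples can be replaced by a single expression. A minimal equivalent sketch (the name Cost_function_vectorised is just illustrative; it does not set m and Hypothesis in the global environment, so Gradient_descent below still relies on the original Cost_function):
# Vectorised version of the same cost computation (illustration only)
Cost_function_vectorised <- function(X, y, Theta0, Theta1){
  m <- length(X)
  Hypothesis <- Hypothesis_function(X, Theta0, Theta1)
  J <- (1 / (2 * m)) * sum((Hypothesis - y)^2)
  return(J)
}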
Gradient descent: repeat until convergence the simultaneous updates Theta0 := Theta0 - Alpha * (1/m) * sum(h(X) - y) and Theta1 := Theta1 - Alpha * (1/m) * sum((h(X) - y) * X).
Gradient_descent <- function(X, y, Alpha, Theta0, Theta1, nb_iteration){
  Temp0 <- 0
  Temp1 <- 0
  J <- Cost_function(X, y, Theta0, Theta1)  # also sets m and Hypothesis in the global environment
  print(paste("first J", J))
  df_learning <- data.frame(Iteration = integer(), J = numeric()) # data frame to track J per iteration
  for(i in 1 : nb_iteration){
    Temp0 <- Theta0 - Alpha * ((1/m) * sum(Hypothesis - y))
    Temp1 <- Theta1 - Alpha * ((1/m) * sum((Hypothesis - y) * X))
    Theta0 <- Temp0  # simultaneous update of both parameters
    Theta1 <- Temp1
    J <- Cost_function(X, y, Theta0, Theta1)  # recomputes Hypothesis with the new parameters
    df_learning[i, ] <- c(i, J) # checking that gradient descent keeps lowering the cost
  }
  print(paste("Local min J", J))
  print(paste("Theta0", Theta0))
  print(paste("Theta1", Theta1))
  assign("df_learning", df_learning, .GlobalEnv)
}
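A quick sanity check of the function on a toy dataset (a sketch; X_toy and y_toy are made up, with y exactly linear in X, so the printed Theta0 and Theta1 should approach 2 and 3 and J should approach 0):
# Hypothetical toy data: y = 2 + 3 * X exactly
X_toy <- c(1, 2, 3, 4, 5)
y_toy <- 2 + 3 * X_toy
Gradient_descent(X_toy, y_toy, 0.05, 0, 0, 5000)
# The learning curve is left in df_learning (global), as with the real data below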
Compare the plots for Alpha = 0.001, 0.01, 0.1 and 1, then keep the value for which J decreases the fastest.
"We recommend testing alphas at a rate of 3 times the next smallest value (i.e. 0.01, 0.03, 0.1, 0.3 and so on)."
ggplot(df_learning, aes(x = Iteration, y = J)) + geom_point(size = 2, alpha = 0.5, colour = "black")
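One way to automate that comparison (a sketch; X and y stand for whatever training vectors are being fitted, and very large values of Alpha may make J blow up, which is exactly what the plot is meant to reveal):
# Run gradient descent once per candidate Alpha and stack the learning curves
alphas <- c(0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1)
df_all_alphas <- do.call(rbind, lapply(alphas, function(a){
  Gradient_descent(X, y, a, 0, 0, 100)
  cbind(df_learning, Alpha = a)  # df_learning is written to the global environment by Gradient_descent
}))
ggplot(df_all_alphas, aes(x = Iteration, y = J, colour = factor(Alpha))) +
  geom_line() +
  ggtitle("Cost J per iteration for several values of Alpha")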
Data found at: http://people.sc.fsu.edu/~jburkardt/datasets/regression/x01.txt
"The data records the average weight of the brain and body for a number of mammal species."
There are 62 rows of data; the two columns used here are Brain.Weight and Body.Weight.
datasource <- read.csv("C:/Users/Marc/Desktop/data_R/brain.txt", stringsAsFactors = FALSE)
ggplot(datasource, aes(Brain.Weight, Body.Weight))+
geom_point(size= 2, colour= "black", alpha=0.2)+
ggtitle("Scatter plot : brain weight and body weight")
# same plot with the axis limits restricted so that the outliers are excluded
ggplot(datasource, aes(Brain.Weight, Body.Weight))+
geom_point(size= 2, colour= "black", alpha=0.2)+
scale_y_continuous(limits = c(0, 700))+
scale_x_continuous(limits = c(0, 700))+
ggtitle("Brain weight and body weight with outliers removed")
## Warning: Removed 3 rows containing missing values (geom_point).
cor(datasource$Brain.Weight, datasource$Body.Weight)
## [1] 0.9341638
# function lm() result
lm(datasource$Brain.Weight ~ datasource$Body.Weight)
##
## Call:
## lm(formula = datasource$Brain.Weight ~ datasource$Body.Weight)
##
## Coefficients:
## (Intercept) datasource$Body.Weight
## -56.8555 0.9029
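The same coefficients can also be captured programmatically for the comparison later on (a small sketch; fit is just an illustrative variable name):
fit <- lm(Brain.Weight ~ Body.Weight, data = datasource)
coef(fit)  # intercept and slope, the same values as printed above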
Gradient_descent(datasource$Body.Weight, datasource$Brain.Weight, 0.00000001, 0, 0, 1500)
## [1] "first J 417481.251428686"
## [1] "Local min J 52122.3288988815"
## [1] "Theta0 -0.000510289788118951"
## [1] "Theta1 0.885633172377596"
# Learning rate
ggplot(df_learning, aes(x= Iteration , y = J))+
geom_point(size= 2, alpha= 0.3, colour= "black")+
ggtitle("Learning rate")
ggplot(datasource, aes(x= Body.Weight, y = Brain.Weight))+
geom_point(alpha= 0.5, size= 2)+
geom_smooth(method = "lm", se= FALSE, colour= "red")+
geom_abline(slope = 0.885633172377596, intercept = -0.000510289788118951, colour= "blue")+
ggtitle("R lm function (red) vs our Theta finding (blue)")