library(randomForest)
library(data.table)
library(dplyr)
library(caret)

Load the data

load("fn_data.RData")

Let’s see the Quartiles and a Historgram of the Duration.

summary(as.numeric(fn_data$duration))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    20.0   214.0   231.0   249.9   251.0  2853.0
hist(as.matrix(fn_data$duration), main="Histogram of Duration", xlab = "Duration", ylab="Frequency", col="green", breaks=100)

As we can see there are some extreme values with a duration more than 1000-2000 days. Maybe we should have excluded those values.

df<-fn_data
###create a column of years
df$years<-(year(df$start_date))
###create a column of months
df$months<-(month(df$start_date))
###set the duration to numeric
df$duration<-as.numeric(df$duration)


##For the country and id find the mean and duration and the times that they appear in the dataset and then find their 4 quartiles
## and create bins based on the quartiles
by_country<-df%>%group_by(country)%>%summarise(counts=n(), mean_duration=mean(duration))
by_fde_id<-df%>%group_by(fde_id)%>%summarise(counts_id=n(), mean_duration_id=mean(duration))
by_country_quantile <- by_country %>% mutate(quartile_counts_coountry = ntile(counts, 4), quartile_mean_country = ntile(mean_duration, 4))
by_fde_id_quantile <- by_fde_id %>% mutate(quartile_counts_id = ntile(counts_id, 4), quartile_mean_id = ntile(mean_duration_id, 4))

##Join the new tables with the initial dataset
df1<-merge(df, by_country_quantile)
df2<-merge(df1, by_fde_id_quantile)


###We keep the relevant variables and exclude the others
df3<-subset(df2, select=c("duration" ,"nopersons" , "years", "months", "quartile_counts_coountry" , "quartile_mean_country", 
                          "quartile_counts_id",  "quartile_mean_id" ))

###We convert the quartiles to factors
df3$quartile_mean_id<-as.factor(df3$quartile_mean_id)
df3$quartile_counts_id<-as.factor(df3$quartile_counts_id)
df3$quartile_counts_coountry<-as.factor(df3$quartile_counts_coountry)
df3$quartile_mean_country<-as.factor(df3$quartile_mean_country)

We run a Random Forest Regression and we plot the scatter plot of Predicted vs the Observed values

rf<-randomForest(duration~.,data=df3)
plot(df3$duration,predict(rf,df3),col="green", main="Actual vs Predicted", xlab="Duration", ylab="Predicted Duration")

We calculate the MSE

sqrt(sum((df3$duration-predict(rf,df3))^2)/nrow(df3))
## [1] 43.9148

We calculate the MAE

sum(abs(predict(rf,df3)-df3$duration))/nrow(df3)
## [1] 28.87363

We plot the residulas

plot(df3$duration-predict(rf,df3), main="Residuals", ylab="Residuals")

Finally a plot of Actual vs Predicted per row of the Data Frame

matplot(rownames(df3), cbind(df3$duration, predict(rf,df3)),type="l",col=c("red","green"),lty=c(1,1), xlab="Index", ylab = "Duration", main="Actual vs Predicted")
legend("topright", inset=.05, legend=c("Actual", "Predicted"), fill=c(2,3),col=c(2,3), horiz=TRUE)

Let’s do the Analysis on Training and Testing Dataset

###split the dataset into training and dataset
intrain<-createDataPartition(y=df3$duration,p=0.8,list=FALSE)
training<-df3[intrain,]
testing<-df3[-intrain,]


###run the random forest to training dataset
rf<-randomForest(duration~.,data=training)


###get the predicted values of the testing dataset

predicted<-predict(rf, testing)

We calculate the MSE on the Testing Dataset

sqrt(sum((testing$duration-predicted)^2)/length(predicted))
## [1] 50.87937

We calculate the MAE on the Testing Dataset

sum(abs(predicted-testing$duration))/length(predicted)
## [1] 29.96636