library(randomForest)
library(data.table)
library(dplyr)
library(caret)
Load the data
load("fn_data.RData")
Let’s see the Quartiles and a Historgram of the Duration.
summary(as.numeric(fn_data$duration))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20.0 214.0 231.0 249.9 251.0 2853.0
hist(as.matrix(fn_data$duration), main="Histogram of Duration", xlab = "Duration", ylab="Frequency", col="green", breaks=100)
As we can see there are some extreme values with a duration more than 1000-2000 days. Maybe we should have excluded those values.
df<-fn_data
###create a column of years
df$years<-(year(df$start_date))
###create a column of months
df$months<-(month(df$start_date))
###set the duration to numeric
df$duration<-as.numeric(df$duration)
##For the country and id find the mean and duration and the times that they appear in the dataset and then find their 4 quartiles
## and create bins based on the quartiles
by_country<-df%>%group_by(country)%>%summarise(counts=n(), mean_duration=mean(duration))
by_fde_id<-df%>%group_by(fde_id)%>%summarise(counts_id=n(), mean_duration_id=mean(duration))
by_country_quantile <- by_country %>% mutate(quartile_counts_coountry = ntile(counts, 4), quartile_mean_country = ntile(mean_duration, 4))
by_fde_id_quantile <- by_fde_id %>% mutate(quartile_counts_id = ntile(counts_id, 4), quartile_mean_id = ntile(mean_duration_id, 4))
##Join the new tables with the initial dataset
df1<-merge(df, by_country_quantile)
df2<-merge(df1, by_fde_id_quantile)
###We keep the relevant variables and exclude the others
df3<-subset(df2, select=c("duration" ,"nopersons" , "years", "months", "quartile_counts_coountry" , "quartile_mean_country",
"quartile_counts_id", "quartile_mean_id" ))
###We convert the quartiles to factors
df3$quartile_mean_id<-as.factor(df3$quartile_mean_id)
df3$quartile_counts_id<-as.factor(df3$quartile_counts_id)
df3$quartile_counts_coountry<-as.factor(df3$quartile_counts_coountry)
df3$quartile_mean_country<-as.factor(df3$quartile_mean_country)
We run a Random Forest Regression and we plot the scatter plot of Predicted vs the Observed values
rf<-randomForest(duration~.,data=df3)
plot(df3$duration,predict(rf,df3),col="green", main="Actual vs Predicted", xlab="Duration", ylab="Predicted Duration")
sqrt(sum((df3$duration-predict(rf,df3))^2)/nrow(df3))
## [1] 43.9148
sum(abs(predict(rf,df3)-df3$duration))/nrow(df3)
## [1] 28.87363
We plot the residulas
plot(df3$duration-predict(rf,df3), main="Residuals", ylab="Residuals")
Finally a plot of Actual vs Predicted per row of the Data Frame
matplot(rownames(df3), cbind(df3$duration, predict(rf,df3)),type="l",col=c("red","green"),lty=c(1,1), xlab="Index", ylab = "Duration", main="Actual vs Predicted")
legend("topright", inset=.05, legend=c("Actual", "Predicted"), fill=c(2,3),col=c(2,3), horiz=TRUE)
###split the dataset into training and dataset
intrain<-createDataPartition(y=df3$duration,p=0.8,list=FALSE)
training<-df3[intrain,]
testing<-df3[-intrain,]
###run the random forest to training dataset
rf<-randomForest(duration~.,data=training)
###get the predicted values of the testing dataset
predicted<-predict(rf, testing)
sqrt(sum((testing$duration-predicted)^2)/length(predicted))
## [1] 50.87937
sum(abs(predicted-testing$duration))/length(predicted)
## [1] 29.96636