Machine Learning and Kaggle

1. Getting the data

library(tidyverse)      # dplyr, forcats, tibble, readr (all used below)
library(rpart)
library(randomForest)

# Import training and testing data
# (obviously, your file paths might be different here):
train_raw <- read.csv("data/train.csv", stringsAsFactors = TRUE)
test_raw <- read.csv("data/test.csv", stringsAsFactors = TRUE)
# dim(train_raw)
# dim(test_raw)
# skim() from the skimr package gives a useful overview of the data,
# but it doesn't render in LaTeX:
# skim(train_raw)
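
Before cleaning, it helps to know where the missing values actually are. A minimal sketch, assuming the data are loaded as above:

# Count NAs per column and show only the columns that have any
na_counts <- colSums(is.na(train_raw))
sort(na_counts[na_counts > 0], decreasing = TRUE)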

2. Cleaning Data

# Functions to replace NAs with the most frequent level (factors)
# or the median (numeric columns)
replace_na_most <- function(x) {
  fct_explicit_na(x, na_level = names(which.max(table(x))))
}
replace_na_med <- function(x) {
  x[is.na(x)] <- median(x, na.rm = TRUE)
  x
}
cleanup_minimal <- function(data) {
  data %>%
    mutate_if(is.factor, replace_na_most) %>%
    mutate_if(is.numeric, replace_na_med)
}
train_minclean <- cleanup_minimal(train_raw)
test_minclean <- cleanup_minimal(test_raw)
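
A quick sanity check that the minimal cleanup really removed every NA:

# Both should be 0 after cleanup_minimal()
sum(is.na(train_minclean))
sum(is.na(test_minclean))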

3. Tree algorithm suggestion

# mod_rpart <- rpart(SalePrice ~ ., data = train_minclean)
# Try this command to make a nice tree plot
# (fancyRpartPlot comes from the rattle package):
# fancyRpartPlot(mod_rpart, caption = NULL)

# Here's some code to export the predictions in the appropriate format
# pred_rpart <- predict(mod_rpart, newdata = test_minclean)
# submission_rpart <- tibble(Id = test_raw$Id, SalePrice = pred_rpart)
# head(submission_rpart)

# Obviously, your file path might be different here:
# write_csv(submission_rpart, file = "submission_rpart.csv")
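
If you do fit the tree, rpart's built-in cross-validation can suggest a pruning point. A minimal sketch using standard rpart helpers (printcp, cptable, prune), assuming mod_rpart is fit as above:

library(rpart)
mod_rpart <- rpart(SalePrice ~ ., data = train_minclean)
printcp(mod_rpart)  # cross-validated error for each complexity parameter
best_cp <- mod_rpart$cptable[which.min(mod_rpart$cptable[, "xerror"]), "CP"]
mod_pruned <- prune(mod_rpart, cp = best_cp)  # prune at the best cp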

4. Extension attempt (I did not use this)

Re-order the dataset:

# set.seed(28)
# Let's shuffle the dataset
# g <- runif(nrow(train_minclean))  # a vector of random numbers, one per row
# train_minclean_random <- train_minclean[order(g), ]  # reorder the rows randomly
# model_rpart2 <- rpart(SalePrice ~ ., data = train_minclean_random[1:1000, ])  # fit on a random sub-sample of 1000 observations
# model_rpart2
# rpart.plot::rpart.plot(model_rpart2)  # It looks very pretty :)
# p2 <- predict(model_rpart2, newdata = train_minclean_random[1001:1459, ])
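
Had I pursued this, the natural way to score it would be RMSE on the held-out rows. A minimal sketch, assuming the commented code above was actually run:

# RMSE on the rows held out from the sub-sample fit
holdout <- train_minclean_random[1001:1459, ]
p2 <- predict(model_rpart2, newdata = holdout)
sqrt(mean((holdout$SalePrice - p2)^2))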

5. Forest Extension (I used this one)

set.seed(28)
data.imputed <- rfImpute(SalePrice ~ ., data = train_raw, iter = 5)  # impute the missing values
     |      Out-of-bag   |
Tree |      MSE  %Var(y) |
 300 | 7.886e+08    12.50 |
     |      Out-of-bag   |
Tree |      MSE  %Var(y) |
 300 | 7.546e+08    11.96 |
     |      Out-of-bag   |
Tree |      MSE  %Var(y) |
 300 | 7.526e+08    11.93 |
     |      Out-of-bag   |
Tree |      MSE  %Var(y) |
 300 | 7.733e+08    12.26 |
     |      Out-of-bag   |
Tree |      MSE  %Var(y) |
 300 | 7.731e+08    12.26 |

model_forest_1 <- randomForest(SalePrice ~ ., data = data.imputed, proximity = TRUE)
model_forest_1

Call:
 randomForest(formula = SalePrice ~ ., data = data.imputed, proximity = TRUE)
               Type of random forest: regression
                     Number of trees: 500
No. of variables tried at each split: 26

      Mean of squared residuals: 773857342
                % Var explained: 87.73
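
Since the forest is already fit, its variable importance is worth a quick look. A minimal sketch using randomForest's built-in helpers (plotting 15 variables is an arbitrary choice):

importance(model_forest_1)              # IncNodePurity for each predictor
varImpPlot(model_forest_1, n.var = 15)  # plot the 15 most important predictors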
# These two lines are a workaround for predict.randomForest(), which errors when
# factor levels in the test data don't match those seen in training. Binding one
# training row onto the test set (and then dropping it) aligns the levels.
# This thread gave me the solution:
# https://stackoverflow.com/questions/24829674/r-random-forest-error-type-of-predictors-in-new-data-do-not-
trainX <- dplyr::select(data.imputed, -SalePrice)
test_minclean <- rbind(trainX[1, ], test_minclean)  # borrow one training row to copy its factor levels
test_minclean <- test_minclean[-1, ]                # then drop it again
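
The same level alignment can also be done explicitly rather than via rbind(). A sketch of that alternative (note: any test level unseen in training would become NA and need re-imputing):

# Re-encode each factor column in the test set with the training levels
for (col in names(trainX)) {
  if (is.factor(trainX[[col]])) {
    test_minclean[[col]] <- factor(test_minclean[[col]],
                                   levels = levels(trainX[[col]]))
  }
}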

# Here's some code to export the predictions in the appropriate format
pred_rf <- predict(model_forest_1, newdata = test_minclean)
submission_rf <- tibble(Id = test_raw$Id, SalePrice = pred_rf)
write_csv(submission_rf, file = "submission_rf_2.csv")
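
A natural next step would be tuning mtry. A minimal sketch with randomForest::tuneRF (the stepFactor and improve values are common starting points, not tuned for this data):

set.seed(28)
tuned <- tuneRF(x = dplyr::select(data.imputed, -SalePrice),
                y = data.imputed$SalePrice,
                stepFactor = 1.5, improve = 0.01, ntreeTry = 300)
tuned  # OOB error for each mtry tried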

6. Clearly I need more practice