#install.packages("rpart")
library(tidyverse)
library(rpart)
library(caret)
#library(skimr)
library(randomForest)
library(neuralnet)
library(rattle)
library(nnet)
# Load the training and test sets. Both files are comma-separated, so use
# read.csv(): read.csv2() defaults to dec = ",", which clashes with sep = ","
# and would make any value written with a decimal point (e.g. 3.5) parse as a
# factor instead of a number.
train_raw <- read.csv("train.csv", stringsAsFactors = TRUE)
test_raw <- read.csv("test.csv", stringsAsFactors = TRUE)
# dim(train_raw)
# dim(test_raw)
# skim() from the skimr package gives a useful overview of the data, but its output doesn't render in LaTeX.
# skim(train_raw)
# Functions to replace NAs with most frequent level or median
# Replace the missing values of a factor with its most frequent level.
#
# x: a factor (a character vector is converted to a factor first).
# Returns a factor with the same levels in which every NA value has been
# replaced by the modal level; ties go to the level that comes first in
# levels(x). The input is returned unchanged when it has no missing values
# or when there is no level to impute with.
#
# Implemented in base R rather than with forcats::fct_explicit_na(), which is
# deprecated as of forcats 1.0.0.
replace_na_most <- function(x) {
  if (is.character(x)) {
    x <- factor(x)
  }
  stopifnot(is.factor(x))

  is_missing <- is.na(x)
  if (!any(is_missing)) {
    return(x)
  }

  level_counts <- table(x)
  # A factor without any levels has no mode to fall back on; leave it alone
  # instead of failing on a zero-length replacement.
  if (length(level_counts) == 0L) {
    return(x)
  }

  x[is_missing] <- names(level_counts)[which.max(level_counts)]
  x
}
# Fill the missing entries of a numeric vector with the median of its
# observed (non-missing) values and return the filled-in vector.
replace_na_med <- function(x) {
  observed_median <- median(x, na.rm = TRUE)
  replace(x, is.na(x), observed_median)
}
# Minimal cleanup of a data frame: impute every missing value.
#
# data: a data frame.
# Returns `data` with each factor column passed through replace_na_most()
# (NAs become the most frequent level) and each numeric column passed through
# replace_na_med() (NAs become the column median). Columns of any other type
# are left untouched.
#
# Uses across(where(...)) in place of the superseded mutate_if().
cleanup_minimal <- function(data) {
  data %>%
    mutate(across(where(is.factor), replace_na_most)) %>%
    mutate(across(where(is.numeric), replace_na_med))
}
# Impute the missing values in both data sets.
train_minclean <- train_raw %>% cleanup_minimal()
test_minclean <- test_raw %>% cleanup_minimal()

# Fit a regression tree for SalePrice using all remaining columns.
mod_rpart <- rpart(SalePrice ~ ., data = train_minclean)
# Uncomment to draw the fitted tree:
# fancyRpartPlot(mod_rpart, caption = NULL)

# Predict the test set and pair every prediction with its Id.
pred_rpart <- predict(mod_rpart, newdata = test_minclean)
submission_rpart <- tibble(
  Id = test_raw[["Id"]],
  SalePrice = pred_rpart
)
submission_rpart %>% head()
## # A tibble: 6 x 2
## Id SalePrice
## <int> <dbl>
## 1 1461 118199.
## 2 1462 151246.
## 3 1463 185210.
## 4 1464 185210.
## 5 1465 249392.
## 6 1466 185210.

# Save the submission; adjust the output path to your own setup if needed.
write_csv(submission_rpart, file = "submission_rpart.csv")

# Training a random forest
# Fix the RNG seed first: a random forest is fitted with random sampling, so
# without a seed the model and the submission file differ on every run.
set.seed(42)
mod_rf <- randomForest(SalePrice ~ ., data = train_minclean)

# Give the factor columns of the test set the training levels as well.
# rbind() on data frames expands factor levels as necessary, so binding one
# training row on top of the test set and dropping it again leaves the test
# rows as they were while adding the training levels to each factor column.
# NOTE(review): presumably needed because predict() on a randomForest rejects
# newdata whose factor levels differ from the training data -- confirm.
trainX <- dplyr::select(train_minclean, -SalePrice)
test_minclean <- rbind(trainX[1, ], test_minclean)
test_minclean <- test_minclean[-1, ]

# Get the predictions and save the submission:
pred_rf <- predict(mod_rf, newdata = test_minclean)
submission_rf <- tibble(Id = test_raw$Id, SalePrice = pred_rf)
write_csv(submission_rf, file = "submission_rf.csv")
