In this assignment, we will fit a machine learning prediction model and submit the predictions to a Kaggle competition. Using the housing prices data, we will predict the sale prices of homes based on their characteristics.
# Import training and testing data
# The files are comma-separated, so read.csv() is used: its defaults are
# sep = "," and dec = ".". read.csv2() defaults to dec = ",", which would
# keep any value written with a decimal point from being parsed as numeric.
train_raw <- read.csv("home-data-for-ml-course/train.csv",
                      stringsAsFactors = TRUE)
dim(train_raw)
## [1] 1460 81
test_raw <- read.csv("home-data-for-ml-course/test.csv",
                     stringsAsFactors = TRUE)
dim(test_raw)
## [1] 1459 80
# skim(train_raw)
Note: The train dataset has one more column than the test dataset (SalePrice, which we want to predict).
We need to replace NAs with the most frequent level (for factor columns) or the median (for numeric columns).
# Replace NAs with most frequent level or median
# Replace missing values in a factor with its most frequent level.
#
# x: a factor (other vectors are converted with factor() first).
# Returns a factor of the same length in which every NA has been replaced by
# the level with the highest count; ties go to the first such level.
# If x has no levels at all (e.g. every value is NA) there is nothing to
# impute with, so x is returned unchanged.
replace_na_most <- function(x) {
  if (!is.factor(x)) {
    x <- factor(x)
  }
  level_counts <- table(x)
  # An empty table means there is no level to pick; which.max() would
  # return integer(0) and the assignment below would fail.
  if (length(level_counts) == 0L) {
    return(x)
  }
  most_frequent <- names(level_counts)[which.max(level_counts)]
  x[is.na(x)] <- most_frequent
  x
}
# Fill the missing entries of a numeric vector with the median of its
# observed (non-NA) values and return the completed vector.
replace_na_med <- function(x) {
  missing_idx <- is.na(x)
  fill_value <- median(x, na.rm = TRUE)
  replace(x, missing_idx, fill_value)
}
# Minimal clean-up of a data frame: impute every missing value.
#
# data: a data frame.
# Returns the data frame with replace_na_most() applied to each factor
# column and replace_na_med() applied to each numeric column; other
# columns are left untouched.
cleanup_minimal <- function(data) {
  data %>%
    mutate(across(where(is.factor), replace_na_most)) %>%
    mutate(across(where(is.numeric), replace_na_med))
}
# Run the same minimal clean-up on the training and the testing data
train_minclean <- train_raw %>% cleanup_minimal()
test_minclean <- test_raw %>% cleanup_minimal()
Now, let us run the simplest tree algorithm and export the predictions.
# Fit a regression tree for SalePrice with no tuning arguments passed to
# rpart(). Id only labels the rows (it is reused to label the submission),
# so it is removed from the predictors instead of being offered as a
# split variable.
mod_rpart <- rpart(SalePrice ~ . - Id, data = train_minclean)
# Tree plot
fancyRpartPlot(mod_rpart, caption = NULL, palettes = "YlGn")
# Export the predictions
# Score the cleaned test set with the fitted tree
pred_rpart <- predict(object = mod_rpart, newdata = test_minclean)
# Pair each test Id with its predicted SalePrice
submission_rpart <- tibble(Id = test_raw[["Id"]], SalePrice = pred_rpart)
head(submission_rpart)
## # A tibble: 6 x 2
## Id SalePrice
## <int> <dbl>
## 1 1461 118199.
## 2 1462 151246.
## 3 1463 185210.
## 4 1464 185210.
## 5 1465 249392.
## 6 1466 185210.
# Save the submission table as a CSV file
write_csv(submission_rpart, file = "submission_rpart.csv")
Leaderboard position