Task 1

In this task, I run my first machine learning prediction model and submit my work to a Kaggle competition.

# load necessary packages 
library(skimr)
library(tree)
library(gridExtra)
library(randomForest)
library(caret)
library(MLmetrics)
library(dplyr)
library(forcats)
library(rpart)
library(rattle)
library(readr)
# Import training and testing data.
# read.csv() is used instead of read.csv2(): the files are comma-delimited,
# while read.csv2() expects "," as the decimal mark, so any value written with
# a decimal point (e.g. 12.5) would be read as a factor instead of a number.
# Strings are read as factors so the factor-based NA imputation applies to them.
data_dir <- "/Users/godwinnutsugah/Dropbox/AAEE-UGA/AAEC 8610/Project_Data/data"

train_raw <- read.csv(file.path(data_dir, "train.csv"), stringsAsFactors = TRUE)

test_raw <- read.csv(file.path(data_dir, "test.csv"), stringsAsFactors = TRUE)

# Functions to replace NAs with most frequent level or median
# Impute missing values in a factor with its most frequent observed level.
#
# x: a factor (a character vector is converted with factor()); any other
#   type is rejected with an error.
# Returns: a factor with the same levels as x in which every NA value has been
#   replaced by the modal level. Ties go to the first level in level order.
#   If x has no levels to choose from (e.g. every value is NA), x is returned
#   unchanged instead of raising an error.
#
# Implemented in base R because forcats::fct_explicit_na() is deprecated
# (forcats 1.0.0) and emits a deprecation warning.
replace_na_most <- function(x) {
  if (is.character(x)) {
    x <- factor(x)
  }
  stopifnot(is.factor(x))

  is_missing <- is.na(x)
  level_counts <- table(x) # NA values are not counted by default

  # Nothing to fill, or no level available to fill with
  if (!any(is_missing) || length(level_counts) == 0L) {
    return(x)
  }

  # which.max() returns the first maximum, so ties resolve in level order
  x[is_missing] <- names(level_counts)[which.max(level_counts)]
  x
}

# Impute missing values in a numeric vector with the median of the
# non-missing values.
#
# x: a numeric vector.
# Returns: x with every NA replaced by median(x, na.rm = TRUE).
replace_na_med <- function(x) {
  fill_value <- median(x, na.rm = TRUE)
  replace(x, is.na(x), fill_value)
}

# Minimal cleaning 
# Remove missing values from a data frame with the simplest possible rules.
#
# data: a data frame.
# Returns: data with NAs in factor columns filled by replace_na_most() and
#   NAs in numeric columns filled by replace_na_med(); columns of other
#   types are left untouched.
cleanup_minimal <- function(data) {
  factors_filled <- mutate_if(data, is.factor, replace_na_most)
  mutate_if(factors_filled, is.numeric, replace_na_med)
}

# Cleaned data: the minimal NA imputation is applied to each data set on its own
train_minclean <- train_raw %>% cleanup_minimal()
test_minclean <- test_raw %>% cleanup_minimal()

Running a simple tree algorithm with a plot

# Run the simplest tree algorithm: rpart with its default settings.
# Id is only a row identifier (it labels the rows of the submission file), so
# it is removed from the training data; with `SalePrice ~ .` on the full data
# frame the tree would be allowed to split on it.
mod_rpart <- rpart(
  SalePrice ~ .,
  data = train_minclean[, setdiff(names(train_minclean), "Id"), drop = FALSE]
)
# Draw the fitted tree without the default caption
fancyRpartPlot(mod_rpart, caption = NULL)

# Predict SalePrice for the cleaned test set with the fitted tree
pred_rpart <- predict(mod_rpart, newdata = test_minclean)

# Two-column submission table: the test Id and its predicted SalePrice
submission_rpart <- tibble(
  Id = test_raw$Id,
  SalePrice = pred_rpart
)
submission_rpart %>% head()
## # A tibble: 6 × 2
##      Id SalePrice
##   <int>     <dbl>
## 1  1461   118199.
## 2  1462   151246.
## 3  1463   185210.
## 4  1464   185210.
## 5  1465   249392.
## 6  1466   185210.
# Export the submission table to a .csv file
write_csv(
  submission_rpart,
  file = "/Users/godwinnutsugah/Dropbox/AAEE-UGA/AAEC 8610/Project_Data/data/submission_rpart.csv"
)

Submitting my prediction

My score and position

My score and position