A Machine Learning model to predict housing prices

Retrieve data

# Switch to the parent directory; the relative file names below are resolved
# against it.
# NOTE(review): setwd() stays in effect for the rest of the session, so every
# later relative path is also resolved against the parent directory.
setwd("..")
# Read both CSV files, converting character columns to factors.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
testing <- read.csv("test.csv", stringsAsFactors = TRUE)
training <- read.csv("train.csv", stringsAsFactors = TRUE)
dim(testing)
## [1] 1459   80
dim(training)
## [1] 1460   81

Some cleaning

# some functions provided by Dr. F
# Replace the NA entries of a factor with its most frequent level (the mode).
#
# x: a factor; a character vector is converted to a factor first.
#
# Returns a factor with the same levels as x in which every NA entry has been
# replaced by the most common observed level. Ties go to the level that comes
# first in levels(x). If x has no NA entries, or consists only of NA entries
# (so there is no observed value to impute with), it is returned unchanged.
replace_na_most <- function(x) {
  if (is.character(x)) {
    x <- factor(x)
  }
  stopifnot(is.factor(x))

  is_missing <- is.na(x)
  # Nothing to fill, or nothing to fill with.
  if (!any(is_missing) || all(is_missing)) {
    return(x)
  }

  # table() drops NA by default, so this is the mode of the observed values.
  mode_level <- names(which.max(table(x)))
  x[is_missing] <- mode_level
  x
}

# Replace the NA entries of a numeric vector with the median of its
# non-missing values.
#
# x: a numeric vector.
#
# Returns x with every NA replaced by median(x, na.rm = TRUE). If every value
# is NA the median is itself NA, so the vector keeps its NA entries.
replace_na_med <- function(x) {
  # TRUE is spelled out: T is a plain variable, and if it were reassigned to
  # FALSE the median would be NA and nothing would be imputed.
  x[is.na(x)] <- median(x, na.rm = TRUE)
  x
}

# Minimal imputation for a data frame: fill missing values column by column.
#
# data: a data frame.
#
# Factor columns are passed through replace_na_most() first, then numeric
# columns through replace_na_med(). Columns of any other type are left as is.
# Returns the transformed data frame.
cleanup_minimal <- function(data) {
  factors_filled <- mutate_if(data, is.factor, replace_na_most)
  mutate_if(factors_filled, is.numeric, replace_na_med)
}

# Impute missing values in each data set independently: every column is
# filled from the statistics of the data frame it belongs to.
trainClean <- training %>% cleanup_minimal()
testClean <- testing %>% cleanup_minimal()

Running a simple tree algorithm

# Fit a tree for SalePrice on the cleaned training data with rpart's default
# settings, then plot it.
# Id is excluded from the predictors: it appears to be only a row identifier
# (it is what the submission file is keyed on), so a split on it would be
# fitting noise rather than a property of the house.
sales <- rpart(SalePrice ~ . - Id, data = trainClean)
fancyRpartPlot(sales, caption = "Basic Regression Tree ")

Exporting predictions

# Score the cleaned test set with the fitted tree.
preds <- predict(sales, newdata = testClean)
# Pair each prediction with the Id from the test data as it was read in.
submission <- tibble(
  Id = testing[["Id"]],
  SalePrice = preds
)
head(submission)
## # A tibble: 6 x 2
##      Id SalePrice
##   <int>     <dbl>
## 1  1461   118199.
## 2  1462   151246.
## 3  1463   185210.
## 4  1464   185210.
## 5  1465   249392.
## 6  1466   185210.
# Write the submission table to a CSV file in the current working directory.
submission %>% write_csv("kaggleSubmissions.csv")

Proof of Submission