# A machine learning model to predict housing prices
# Packages this script calls into: dplyr (%>% and mutate_if), forcats (factor
# helpers), tibble (tibble), readr (write_csv), rpart (rpart) and rattle
# (fancyRpartPlot). Loaded up front so the script fails fast if one is missing.
library(dplyr)
library(forcats)
library(tibble)
library(readr)
library(rpart)
library(rattle)

# NOTE(review): setwd() ties the script to the directory it is launched from.
# It is kept because the CSV paths below, and the output file written at the
# end of the script, are all resolved relative to the parent directory.
setwd("..")

# Read the test and training sets; character columns are converted to factors.
# TRUE is spelled out because `T` is an ordinary variable that can be rebound.
testing <- read.csv("test.csv", stringsAsFactors = TRUE)
training <- read.csv("train.csv", stringsAsFactors = TRUE)

dim(testing)
## [1] 1459 80
dim(training)
## [1] 1460 81
# some functions provided by Dr. F
# Replace NA entries of a factor with its most frequent observed level (the
# mode). Ties go to the level that comes first in levels(x), because
# which.max() returns the first maximum.
#
# Written in base R instead of forcats::fct_explicit_na(), which is deprecated
# as of forcats 1.0.0; the result is the same for any factor that has at least
# one observed value.
#
# x: a factor (a character vector is converted to a factor first).
# Returns a factor with the same levels as x and its NAs filled in. x is
# returned unchanged when it has no NAs, or when every value is NA: with no
# observed values there is no mode, so no level is picked arbitrarily.
replace_na_most <- function(x) {
  if (is.character(x)) {
    x <- factor(x)
  }
  stopifnot(is.factor(x))

  is_missing <- is.na(x)
  if (!any(is_missing) || all(is_missing)) {
    return(x)
  }

  counts <- table(x) # table() leaves NAs out of the tally by default
  x[is_missing] <- names(counts)[which.max(counts)]
  x
}
# Replace NA entries of a numeric vector with the median of its observed
# values.
#
# x: a numeric vector.
# Returns x with its NAs replaced. An integer vector may come back as double,
# because median() can return a non-integer value. If every value is NA the
# median is itself NA, so the vector comes back with its NAs still in place.
replace_na_med <- function(x) {
  # Spell out TRUE: `T` is an ordinary variable and can be reassigned, which
  # would silently turn na.rm off and make the median NA.
  x[is.na(x)] <- median(x, na.rm = TRUE)
  x
}
# Minimal NA clean-up for a data frame: every factor column is passed through
# replace_na_most(), then every numeric column through replace_na_med().
# Columns of any other type are not touched.
#
# data: a data frame.
# Returns the data frame after both passes.
cleanup_minimal <- function(data) {
  factors_filled <- mutate_if(data, is.factor, replace_na_most)
  mutate_if(factors_filled, is.numeric, replace_na_med)
}
# Fill in missing values in both data sets. Each set is cleaned on its own, so
# the test set is imputed from its own modes and medians.
trainClean <- cleanup_minimal(training)
testClean <- cleanup_minimal(testing)

# Fit a regression tree for SalePrice on every column except Id. Id is only a
# row identifier (it is reused below as the submission key), so it must not be
# offered to the tree as a split variable.
sales <- rpart(SalePrice ~ . - Id, data = trainClean)
fancyRpartPlot(sales, caption = "Basic Regression Tree ")

# Predict the test set and assemble the two-column submission table.
preds <- predict(sales, newdata = testClean)
submission <- tibble(Id = testing$Id, SalePrice = preds)
head(submission)
# NOTE(review): the output below was recorded from the original run, which was
# fitted with Id still among the predictors; re-run to confirm the values.
## # A tibble: 6 x 2
## Id SalePrice
## <int> <dbl>
## 1 1461 118199.
## 2 1462 151246.
## 3 1463 185210.
## 4 1464 185210.
## 5 1465 249392.
## 6 1466 185210.
write_csv(submission, "kaggleSubmissions.csv")