In this task, we'll walk through a minimal machine learning exercise and submit our results to Kaggle.
Using the Housing Prices Competition for Kaggle Learn Users, we will try to predict home prices from their characteristics. To submit the results to Kaggle, we will export a two-column .csv file built from the test data at the end and see how well our model performs.
# Import data (downloaded from Kaggle)
setwd("~/OneDrive - University of Georgia/4th Sem PhD/Adv Econometric Applications_Filipski/Assignments/HW11b")
# read.csv assumes sep = "," and dec = "."; read.csv2 is meant for
# European-style files (sep = ";", dec = ","), so read.csv is the right call here
train_raw <- read.csv("train.csv", stringsAsFactors = TRUE)
test_raw  <- read.csv("test.csv", stringsAsFactors = TRUE)
dim(train_raw)
## [1] 1460 81
dim(test_raw)
## [1] 1459 80
sum(is.na(train_raw))
## [1] 6965
The two data sets have different numbers of columns. The 81st column in the training data is SalePrice, which is absent from the test data because those are exactly the values we want to predict with the model fitted on the training set.
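A quick way to confirm this (a small check, not part of the original script) is to compare the column names directly:
setdiff(names(train_raw), names(test_raw))   # expect "SalePrice"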
library(dplyr)
library(forcats)
# Minimal cleaning: these functions replace missing values (NAs) with the
# most frequent level (factor columns) or the median (numeric columns)
replace_na_most <- function(x) {
  fct_explicit_na(x, na_level = names(which.max(table(x))))
}
replace_na_med <- function(x) {
  x[is.na(x)] <- median(x, na.rm = TRUE)
  x
}
cleanup_minimal <- function(data) {
  data %>%
    mutate_if(is.factor, replace_na_most) %>%
    mutate_if(is.numeric, replace_na_med)
}
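To build intuition for what these helpers do, here is a toy illustration on made-up vectors (hypothetical inputs, just for demonstration):
replace_na_most(factor(c("a", "b", "b", NA)))   # the NA becomes "b", the modal level
replace_na_med(c(1, 2, NA, 10))                 # the NA becomes 2, the median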
train_minclean <- cleanup_minimal(train_raw)
test_minclean <- cleanup_minimal(test_raw)
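A cheap sanity check that the cleanup worked as intended (not in the original code):
sum(is.na(train_minclean))   # expect 0
sum(is.na(test_minclean))    # expect 0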
Let's fit a regression tree:
library(rpart)
library(rattle)
mod_rpart <- rpart(SalePrice~., data=train_minclean)
plotcp(mod_rpart)
fancyRpartPlot(mod_rpart, main="Fitted Model")
The resulting plot shows the fitted tree.
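The plotcp() output can also guide pruning. As an optional sketch (standard rpart usage, not part of the original script), we could pick the complexity parameter with the lowest cross-validated error and prune the tree to it:
best_cp <- mod_rpart$cptable[which.min(mod_rpart$cptable[, "xerror"]), "CP"]
mod_pruned <- prune(mod_rpart, cp = best_cp)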
library(tibble)
pred_rpart <- predict(mod_rpart, newdata = test_minclean)
submission_rpart <- tibble(Id=test_raw$Id, SalePrice=pred_rpart)
head(submission_rpart)
## # A tibble: 6 × 2
## Id SalePrice
## <int> <dbl>
## 1 1461 118199.
## 2 1462 151246.
## 3 1463 185210.
## 4 1464 185210.
## 5 1465 249392.
## 6 1466 185210.
# Obviously, your file path might be different here:
library(readr)
write_csv(submission_rpart, file="submission_rpart.csv")
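Before uploading, a quick structural check of the submission helps avoid a rejected entry (a sketch; Kaggle expects exactly the columns Id and SalePrice, one row per test observation):
stopifnot(nrow(submission_rpart) == nrow(test_raw),
          identical(names(submission_rpart), c("Id", "SalePrice")),
          !anyNA(submission_rpart$SalePrice))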
With this code in place, let's enter the Kaggle competition and see how well we predicted SalePrice in the test data. Here is the leaderboard screenshot and my position:
[Screenshot: Kaggle leaderboard, Position 1]
Can we make a better model than this?
X_train <- select(train_minclean, -SalePrice)
X_test  <- test_minclean
y_train <- train_minclean$SalePrice
# we do not have y_test: recall that we have to predict it and submit it to Kaggle to get a score
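Since y_test is unavailable, one way to estimate performance locally (a sketch under an assumed 80/20 split and an arbitrary seed, not part of the original workflow) is to hold out part of the training data:
set.seed(42)   # assumed seed, for reproducibility
idx  <- sample(nrow(train_minclean), 0.8 * nrow(train_minclean))
fit  <- rpart(SalePrice ~ ., data = train_minclean[idx, ])
rmse <- sqrt(mean((predict(fit, train_minclean[-idx, ]) -
                   train_minclean$SalePrice[-idx])^2))
rmse   # holdout root mean squared error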
library(glmnet)
# First attempt, kept commented out: glmnet() expects a numeric model
# matrix rather than a data frame with factors, so this direct call fails;
# predict.glmnet also takes newx (a matrix), not newdata.
# lasso <- glmnet(X_train, y_train,
#                 family = "gaussian",
#                 alpha = 1,
#                 lambda = seq(0, 100, 0.01))
# plot(lasso, "lambda", cex = 0.7, cex.axis = 0.7, cex.lab = 0.7)
# pred_lasso <- predict(lasso, newx = as.matrix(test_minclean))
# submission_lasso <- tibble(Id = test_raw$Id, SalePrice = pred_lasso)
# head(submission_lasso)
# write_csv(submission_lasso, file = "submission_lasso.csv")
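For reference, here is a working matrix-based route (a sketch: model.matrix builds the dummy-coded design matrix glmnet needs, and cv.glmnet picks lambda by cross-validation; stacking train and test first keeps their columns and factor levels aligned):
combined   <- rbind(X_train, X_test)
X_all      <- model.matrix(~ ., data = combined)[, -1]   # drop the intercept column
X_mm       <- X_all[seq_len(nrow(X_train)), ]
X_mm_test  <- X_all[-seq_len(nrow(X_train)), ]
cv_lasso   <- cv.glmnet(X_mm, y_train, family = "gaussian", alpha = 1)
pred_lasso <- predict(cv_lasso, newx = X_mm_test, s = "lambda.min")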
## LASSO, 2nd way: let caret tune lambda over a grid
library(caret)
parameters <- unique(c(seq(0.1, 2, by = 0.1), seq(2, 5, by = 0.5), seq(5, 25, by = 1)))
lasso2 <- train(SalePrice ~ ., data = train_minclean,
                method = "glmnet",
                tuneGrid = expand.grid(alpha = 1, lambda = parameters))
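# Optional inspection step (not in the original code): lasso2$bestTune is
# part of caret's train object and holds the alpha/lambda pair selected
# by resampling.
lasso2$bestTune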
pred_lasso2 <- predict(lasso2, newdata = test_minclean)
submission_lasso2 <- tibble(Id=test_raw$Id, SalePrice=pred_lasso2)
head(submission_lasso2)
## # A tibble: 6 × 2
## Id SalePrice
## <int> <dbl>
## 1 1461 176053.
## 2 1462 224713.
## 3 1463 252128.
## 4 1464 264160.
## 5 1465 272450.
## 6 1466 237320.
# Obviously, your file path might be different here:
write_csv(submission_lasso2, file="submission_lasso2.csv")
# Training a random forest
library(randomForest)
mod_rf <- randomForest(SalePrice ~ ., data = train_minclean)
# Align the test set's factor levels with the training set's, otherwise
# predict.randomForest errors on level sets it has never seen: rbind-ing one
# training row onto the test data merges the levels, and we then drop that row.
trainX <- select(train_minclean, -SalePrice)
test_minclean <- rbind(trainX[1, ], test_minclean)
test_minclean <- test_minclean[-1, ]
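# (Alternative sketch, not part of the original script: set each factor
# column's levels explicitly to the training levels. Note that test values
# unseen in training would become NA under this approach.)
# for (col in names(trainX)) {
#   if (is.factor(trainX[[col]])) {
#     test_minclean[[col]] <- factor(test_minclean[[col]],
#                                    levels = levels(trainX[[col]]))
#   }
# }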
# Get my predictions:
pred_rf <- predict(mod_rf, newdata = test_minclean)
submission_rf <- tibble(Id = test_raw$Id, SalePrice = pred_rf)
write_csv(submission_rf, file="submission_rf.csv")
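Which predictors drive the forest? randomForest ships standard importance tooling, so an optional inspection (not in the original script) is:
varImpPlot(mod_rf, n.var = 15, main = "Top 15 predictors by importance")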
# Training a gradient boosting model
library(mboost)
mod_gb <- glmboost(SalePrice ~ ., data = train_minclean,
control = boost_control(mstop = 2000),
center = FALSE)
# Get my predictions (glmboost's predict() returns a one-column matrix, so flatten it):
pred_gb <- predict(mod_gb, newdata = test_minclean)
submission_gb <- tibble(Id = test_raw$Id, SalePrice = as.numeric(pred_gb))
# Kaggle only accepts .csv submission files, so write this one as csv as well:
write_csv(submission_gb, file = "submission_gb.csv")
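Rather than fixing mstop at 2000, mboost can also choose it by cross-validation (a sketch using mboost's own cvrisk tooling; the k-fold setup is an assumption):
cvr <- cvrisk(mod_gb, folds = cv(model.weights(mod_gb), type = "kfold"))
mstop(cvr)          # cross-validated optimal number of boosting iterations
mod_gb[mstop(cvr)]  # set the model to that mstop in place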
After trying multiple models, the random forest's predictions performed best.
[Screenshot: Kaggle leaderboard, the random forest's prediction performed better]