Task 1

In this task, we'll walk through a minimal machine learning exercise and submit our results to Kaggle.

Using the Housing Prices Competition for Kaggle Learn Users, we will try to predict home prices from their characteristics. To submit the results to Kaggle, we will export a two-column .csv file of predictions on the test data at the end and see how well our model performs.

# Import Data - downloaded from kaggle
setwd("~/OneDrive - University of Georgia/4th Sem PhD/Adv Econometric Applications_Filipski/Assignments/HW11b")
train_raw <- read.csv2("train.csv", sep = ",",
                       stringsAsFactors = TRUE)
test_raw <- read.csv2("test.csv", sep = ",",
                       stringsAsFactors = TRUE)
dim(train_raw)
## [1] 1460   81
dim(test_raw)
## [1] 1459   80
sum(is.na(train_raw))
## [1] 6965

The two data sets have different numbers of columns: the 81st column of the training data is SalePrice, which is absent from the test data because that is exactly what we have to predict with the model fitted on the training set.
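
As a quick sanity check (my addition, not strictly needed), setdiff() confirms that SalePrice is indeed the only train-only column:

# Column(s) present in the training data but not in the test data
setdiff(names(train_raw), names(test_raw))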

library(dplyr)
library(forcats)
# Minimal cleaning: these functions replace missing values (NAs) with the most frequent level (factors) or the median (numeric columns)
replace_na_most <- function(x){
  fct_explicit_na(x, na_level = names(which.max(table(x))))
  }

replace_na_med <- function(x){
  x[is.na(x)] <- median(x,na.rm = TRUE)
  x
  }


cleanup_minimal <- function(data){
  nomis <- data %>%
    mutate_if(is.factor, replace_na_most) %>%
    mutate_if(is.numeric, replace_na_med)
  nomis
  }

train_minclean <- cleanup_minimal(train_raw)
test_minclean <- cleanup_minimal(test_raw)
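
A quick check (not part of the original cleanup functions) that the minimal cleaning really removed every missing value:

# Both should now report zero NAs
sum(is.na(train_minclean))
sum(is.na(test_minclean))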

Let's fit a regression tree with rpart:

library(rpart)
library(rattle)
mod_rpart <- rpart(SalePrice~., data=train_minclean)
plotcp(mod_rpart)

fancyRpartPlot(mod_rpart, main="Fitted Model")

The tree above visualizes the fitted model.
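
If we wanted to prune the tree at the complexity parameter suggested by plotcp(), a minimal sketch (not part of my submitted model) could look like this:

# Pick the cp with the lowest cross-validated error and prune the tree to it
cp_best <- mod_rpart$cptable[which.min(mod_rpart$cptable[, "xerror"]), "CP"]
mod_rpart_pruned <- prune(mod_rpart, cp = cp_best)
fancyRpartPlot(mod_rpart_pruned, main = "Pruned Model")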

library(tibble)
pred_rpart <- predict(mod_rpart, newdata = test_minclean)
submission_rpart <- tibble(Id=test_raw$Id, SalePrice=pred_rpart)
head(submission_rpart)
## # A tibble: 6 × 2
##      Id SalePrice
##   <int>     <dbl>
## 1  1461   118199.
## 2  1462   151246.
## 3  1463   185210.
## 4  1464   185210.
## 5  1465   249392.
## 6  1466   185210.
# Obviously, your file path might be different here:
library(readr)
write_csv(submission_rpart, file="submission_rpart.csv")

With this submission file, let's enter the Kaggle competition and see how well we predicted SalePrice in the test data. Here is the leaderboard screenshot and my position:

Position 1

Task 2: Pushing further

Can we make a better model than this?

X_train <- select(train_minclean, -SalePrice)
X_test <- test_minclean
y_train <- train_minclean$SalePrice
# we do not have y_test: recall that we have to predict it and submit it in kaggle to get score
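
Because we never see y_test, one way to compare candidate models locally (an extra check, not required for the Kaggle submission) is to hold out a slice of the training data and compute an RMSE on it. A rough sketch using the rpart model:

# Hold out 20% of the training data to estimate out-of-sample RMSE locally
set.seed(123)
holdout_idx <- sample(nrow(train_minclean), size = floor(0.2 * nrow(train_minclean)))
holdout   <- train_minclean[holdout_idx, ]
train_sub <- train_minclean[-holdout_idx, ]

mod_check <- rpart(SalePrice ~ ., data = train_sub)
sqrt(mean((predict(mod_check, newdata = holdout) - holdout$SalePrice)^2))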

library(glmnet)
#lasso <- glmnet(X_train, y_train,
#                family="gaussian",
#                alpha=1,
#                lambda= (seq(0,100,0.01)))
#plot(lasso, "lambda", cex = 0.7, cex.axis=0.7, cex.lab=0.7)

library(tibble)
#pred_lasso <- predict(lasso, newdata=test_minclean)
#submission_lasso <- tibble(Id=test_raw$Id, SalePrice=pred_lasso)
#head(submission_lasso)

# Obviously, your file path might be different here:
library(readr)
#write_csv(submission_lasso, file="submission_lasso.csv")
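
The commented-out attempt above fails because glmnet() expects a numeric matrix (and its predict() method takes newx=, not newdata=). A hedged sketch of how the direct glmnet route could be made to work, using model.matrix() to dummy-code the factors (my assumption here, not the approach I actually submitted):

# Stack train and test first so the factor encodings stay consistent,
# then build a numeric design matrix and split it back apart
full_mm  <- model.matrix(~ . - 1, data = rbind(X_train, X_test))
mm_train <- full_mm[seq_len(nrow(X_train)), ]
mm_test  <- full_mm[-seq_len(nrow(X_train)), ]

# Cross-validated lasso (alpha = 1) and predictions at the best lambda
cv_lasso   <- cv.glmnet(mm_train, y_train, family = "gaussian", alpha = 1)
pred_lasso <- predict(cv_lasso, newx = mm_test, s = "lambda.min")
submission_lasso <- tibble(Id = test_raw$Id, SalePrice = as.numeric(pred_lasso))
write_csv(submission_lasso, file = "submission_lasso.csv")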

## LASSO 2nd way
library(caret)
parameters <- unique(c(seq(0.1, 2, by = 0.1), seq(2, 5, 0.5), seq(5, 25, 1)))
lasso2<-train(SalePrice~., data=train_minclean,
                 method = 'glmnet', 
                 tuneGrid = expand.grid(alpha = 1, lambda = parameters)
             )
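
To see which penalty caret actually picked, we can inspect the tuning results (just a quick check):

# Best lambda chosen by cross-validation, and the RMSE profile over the grid
lasso2$bestTune
plot(lasso2)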

pred_lasso2 <- predict(lasso2, newdata = test_minclean)
submission_lasso2 <- tibble(Id=test_raw$Id, SalePrice=pred_lasso2)
head(submission_lasso2)
## # A tibble: 6 × 2
##      Id SalePrice
##   <int>     <dbl>
## 1  1461   176053.
## 2  1462   224713.
## 3  1463   252128.
## 4  1464   264160.
## 5  1465   272450.
## 6  1466   237320.
# Obviously, your file path might be different here:
write_csv(submission_lasso2, file="submission_lasso2.csv")
# Training a random forest
library(randomForest)
mod_rf <- randomForest(SalePrice ~ ., data = train_minclean)

# Align the factor levels of the test data with the training data so that
# predict() on the forest does not fail: adding one training row and then
# removing it again copies the training factor levels onto the test set.
trainX <- select(train_minclean, -SalePrice)
test_minclean <- rbind(trainX[1, ], test_minclean)
test_minclean <- test_minclean[-1, ]

# Get my predictions:
pred_rf <-predict(mod_rf, newdata = test_minclean)
submission_rf <- tibble(Id=test_raw$Id, SalePrice=pred_rf)
write_csv(submission_rf, file="submission_rf.csv")
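
To get a feel for what drives the forest's predictions, it helps to look at the out-of-bag fit and variable importance (a side check, not part of the submission itself):

# OOB error summary and the most influential predictors
print(mod_rf)
varImpPlot(mod_rf, n.var = 15, main = "Random forest: variable importance")
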
# Training a Gradient Boosting
library(mboost)
mod_gb <- glmboost(SalePrice ~ ., data = train_minclean,
                   control = boost_control(mstop = 2000),
                   center = FALSE)
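
The choice of mstop = 2000 is just a guess; mboost's built-in cross-validation can check whether a smaller stopping point would do (a sketch under that assumption; cvrisk() can take a while to run):

# Cross-validate the number of boosting iterations
cv_gb <- cvrisk(mod_gb)
mstop(cv_gb)          # cross-validated optimal number of iterations
mod_gb[mstop(cv_gb)]  # subset the model to that stopping point (modifies mod_gb in place)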

# Get my predictions:
pred_gb <-predict(mod_gb, newdata = test_minclean)
# predict() on a glmboost model returns a one-column matrix, so flatten it
submission_gb <- tibble(Id=test_raw$Id, SalePrice=as.numeric(pred_gb))

# Kaggle expects a .csv submission, so export this one like the others:
write_csv(submission_gb, file="submission_gb.csv")

After trying multiple models, the random forest's predictions performed best. Here is the list of my submissions:

The random forest's predictions performed best.