HW11

Duflo’s Plumber Economist

2023-05-01

#Loading required packages (might not end up using all of the loaded ones)
library(prettydoc) #For the theme used in this document
library(ggplot2)
library(tidyverse)
#install.packages("caret")
#install.packages("skimr")
#install.packages("rpart")
#install.packages("randomForest")
library(caret)
library(skimr)
library(rpart)
library(rpart.plot)
library(randomForest)
library(rattle)
library(neuralnet)
library(nnet)

## Loading and cleaning the data

# Setting up the directory
# NOTE(review): hard-coded absolute path. It is kept because the relative
# paths used in this script ("train.csv", "test.csv", "ss.png") are resolved
# against the working directory.
setwd("D:/UGA Coursework/Second Year/AAEC 8610/HWs/HW11")
getwd()
## [1] "D:/UGA Coursework/Second Year/AAEC 8610/HWs/HW11"
# Loading the data. The files are comma-separated, so read.csv() is used:
# read.csv2() defaults to dec = ",", which (combined with sep = ",") would
# turn any value written with a decimal point into a factor instead of a
# number. read.csv() uses sep = "," and dec = "." by default.
train_raw <- read.csv("train.csv", stringsAsFactors = TRUE)

test_raw <- read.csv("test.csv", stringsAsFactors = TRUE)

# Functions to replace NAs with most frequent level or median
# Replace the missing values of a factor with its most frequent level.
#
# Implemented in base R: forcats::fct_explicit_na() is deprecated as of
# forcats 1.0.0 and warns on every call, and the previous one-liner failed
# when the factor had no levels at all (nothing to pick as the mode).
#
# x:       a factor; a character vector is converted with factor().
# returns: x with every NA set to the most frequent level (the first level
#          wins ties). The set of levels is unchanged. x is returned as-is
#          when it contains no NA or has no level to fill with.
replace_na_most <- function(x) {
  if (is.character(x)) {
    x <- factor(x)
  }
  stopifnot(is.factor(x))

  level_counts <- table(x)
  # Nothing to fill, or no level available to fill with (all-NA column).
  if (!anyNA(x) || length(level_counts) == 0L) {
    return(x)
  }

  # which.max() returns the first maximum, so ties go to the earliest level.
  x[is.na(x)] <- names(level_counts)[which.max(level_counts)]
  x
}
# Fill the missing entries of a numeric vector with the median of its
# non-missing entries.
#
# x:       a numeric vector, possibly containing NA.
# returns: x with each NA position overwritten by median(x, na.rm = TRUE).
replace_na_med <- function(x) {
  replace(x, is.na(x), median(x, na.rm = TRUE))
}
# Minimal cleaning of a data frame: impute every missing value.
#
# Factor columns are passed through replace_na_most(); numeric columns are
# then passed through replace_na_med(). Other columns are left as they are.
#
# data:    a data frame.
# returns: the data frame with factor and numeric columns imputed.
cleanup_minimal <- function(data) {
  factors_filled <- mutate_if(data, is.factor, replace_na_most)
  mutate_if(factors_filled, is.numeric, replace_na_med)
}
# Apply the minimal cleaning to the training and the test data separately.
train_minclean <- train_raw %>% cleanup_minimal()
test_minclean <- test_raw %>% cleanup_minimal()

### Run the simplest tree algorithm there is

# Fit a tree for SalePrice on all other columns with rpart's default settings.
# NOTE(review): `.` also uses an Id column as a predictor if the training
# data has one -- confirm that this is intended.
mod_rpart <- rpart(formula = SalePrice ~ ., data = train_minclean)

# Plot the fitted tree.
fancyRpartPlot(mod_rpart, caption = NULL)

# Predict on the cleaned test data and pair each prediction with its test Id.
pred_rpart <- predict(object = mod_rpart, newdata = test_minclean)
submission_rpart <- tibble(
  Id = test_raw$Id,
  SalePrice = pred_rpart
)
head(submission_rpart)
## # A tibble: 6 × 2
##      Id SalePrice
##   <int>     <dbl>
## 1  1461   118199.
## 2  1462   151246.
## 3  1463   185210.
## 4  1464   185210.
## 5  1465   249392.
## 6  1466   185210.
# Embed the image file "ss.png" in the knitted document.
# NOTE(review): the path is relative -- presumably resolved against the
# working directory set earlier in the script; confirm the file is there.
knitr::include_graphics("ss.png")