# Load the required packages

library(tidyverse)
library(skimr)
library(rpart)
library(randomForest)
library(rattle)
library(neuralnet)
library(nnet)
library(caret)

Task 1 (really easy)

Loading the datasets

# Import the training and testing data.
# Both files are read with "," as the field separator, so read.csv() is the
# matching reader. read.csv2() additionally assumes dec = ",", which would turn
# any column containing a value written like "468.5" into a factor instead of
# a number.
train_raw <- read.csv("train.csv", stringsAsFactors = TRUE)

test_raw <- read.csv("test.csv", stringsAsFactors = TRUE)

dim(train_raw)
## [1] 1460   81
dim(test_raw)
## [1] 1459   80
# skim() (from the skimr package) prints a per-column overview of the data.
# Its output does not render in LaTeX.
skim(train_raw)
Data summary
Name train_raw
Number of rows 1460
Number of columns 81
_______________________
Column type frequency:
factor 43
numeric 38
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
MSZoning 0 1.00 FALSE 5 RL: 1151, RM: 218, FV: 65, RH: 16
Street 0 1.00 FALSE 2 Pav: 1454, Grv: 6
Alley 1369 0.06 FALSE 2 Grv: 50, Pav: 41
LotShape 0 1.00 FALSE 4 Reg: 925, IR1: 484, IR2: 41, IR3: 10
LandContour 0 1.00 FALSE 4 Lvl: 1311, Bnk: 63, HLS: 50, Low: 36
Utilities 0 1.00 FALSE 2 All: 1459, NoS: 1
LotConfig 0 1.00 FALSE 5 Ins: 1052, Cor: 263, Cul: 94, FR2: 47
LandSlope 0 1.00 FALSE 3 Gtl: 1382, Mod: 65, Sev: 13
Neighborhood 0 1.00 FALSE 25 NAm: 225, Col: 150, Old: 113, Edw: 100
Condition1 0 1.00 FALSE 9 Nor: 1260, Fee: 81, Art: 48, RRA: 26
Condition2 0 1.00 FALSE 8 Nor: 1445, Fee: 6, Art: 2, Pos: 2
BldgType 0 1.00 FALSE 5 1Fa: 1220, Twn: 114, Dup: 52, Twn: 43
HouseStyle 0 1.00 FALSE 8 1St: 726, 2St: 445, 1.5: 154, SLv: 65
RoofStyle 0 1.00 FALSE 6 Gab: 1141, Hip: 286, Fla: 13, Gam: 11
RoofMatl 0 1.00 FALSE 8 Com: 1434, Tar: 11, WdS: 6, WdS: 5
Exterior1st 0 1.00 FALSE 15 Vin: 515, HdB: 222, Met: 220, Wd : 206
Exterior2nd 0 1.00 FALSE 16 Vin: 504, Met: 214, HdB: 207, Wd : 197
MasVnrType 8 0.99 FALSE 4 Non: 864, Brk: 445, Sto: 128, Brk: 15
ExterQual 0 1.00 FALSE 4 TA: 906, Gd: 488, Ex: 52, Fa: 14
ExterCond 0 1.00 FALSE 5 TA: 1282, Gd: 146, Fa: 28, Ex: 3
Foundation 0 1.00 FALSE 6 PCo: 647, CBl: 634, Brk: 146, Sla: 24
BsmtQual 37 0.97 FALSE 4 TA: 649, Gd: 618, Ex: 121, Fa: 35
BsmtCond 37 0.97 FALSE 4 TA: 1311, Gd: 65, Fa: 45, Po: 2
BsmtExposure 38 0.97 FALSE 4 No: 953, Av: 221, Gd: 134, Mn: 114
BsmtFinType1 37 0.97 FALSE 6 Unf: 430, GLQ: 418, ALQ: 220, BLQ: 148
BsmtFinType2 38 0.97 FALSE 6 Unf: 1256, Rec: 54, LwQ: 46, BLQ: 33
Heating 0 1.00 FALSE 6 Gas: 1428, Gas: 18, Gra: 7, Wal: 4
HeatingQC 0 1.00 FALSE 5 Ex: 741, TA: 428, Gd: 241, Fa: 49
CentralAir 0 1.00 FALSE 2 Y: 1365, N: 95
Electrical 1 1.00 FALSE 5 SBr: 1334, Fus: 94, Fus: 27, Fus: 3
KitchenQual 0 1.00 FALSE 4 TA: 735, Gd: 586, Ex: 100, Fa: 39
Functional 0 1.00 FALSE 7 Typ: 1360, Min: 34, Min: 31, Mod: 15
FireplaceQu 690 0.53 FALSE 5 Gd: 380, TA: 313, Fa: 33, Ex: 24
GarageType 81 0.94 FALSE 6 Att: 870, Det: 387, Bui: 88, Bas: 19
GarageFinish 81 0.94 FALSE 3 Unf: 605, RFn: 422, Fin: 352
GarageQual 81 0.94 FALSE 5 TA: 1311, Fa: 48, Gd: 14, Ex: 3
GarageCond 81 0.94 FALSE 5 TA: 1326, Fa: 35, Gd: 9, Po: 7
PavedDrive 0 1.00 FALSE 3 Y: 1340, N: 90, P: 30
PoolQC 1453 0.00 FALSE 3 Gd: 3, Ex: 2, Fa: 2
Fence 1179 0.19 FALSE 4 MnP: 157, GdP: 59, GdW: 54, MnW: 11
MiscFeature 1406 0.04 FALSE 4 She: 49, Gar: 2, Oth: 2, Ten: 1
SaleType 0 1.00 FALSE 9 WD: 1267, New: 122, COD: 43, Con: 9
SaleCondition 0 1.00 FALSE 6 Nor: 1198, Par: 125, Abn: 101, Fam: 20

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Id 0 1.00 730.50 421.61 1 365.75 730.5 1095.25 1460 ▇▇▇▇▇
MSSubClass 0 1.00 56.90 42.30 20 20.00 50.0 70.00 190 ▇▅▂▁▁
LotFrontage 259 0.82 70.05 24.28 21 59.00 69.0 80.00 313 ▇▃▁▁▁
LotArea 0 1.00 10516.83 9981.26 1300 7553.50 9478.5 11601.50 215245 ▇▁▁▁▁
OverallQual 0 1.00 6.10 1.38 1 5.00 6.0 7.00 10 ▁▂▇▅▁
OverallCond 0 1.00 5.58 1.11 1 5.00 5.0 6.00 9 ▁▁▇▅▁
YearBuilt 0 1.00 1971.27 30.20 1872 1954.00 1973.0 2000.00 2010 ▁▂▃▆▇
YearRemodAdd 0 1.00 1984.87 20.65 1950 1967.00 1994.0 2004.00 2010 ▅▂▂▃▇
MasVnrArea 8 0.99 103.69 181.07 0 0.00 0.0 166.00 1600 ▇▁▁▁▁
BsmtFinSF1 0 1.00 443.64 456.10 0 0.00 383.5 712.25 5644 ▇▁▁▁▁
BsmtFinSF2 0 1.00 46.55 161.32 0 0.00 0.0 0.00 1474 ▇▁▁▁▁
BsmtUnfSF 0 1.00 567.24 441.87 0 223.00 477.5 808.00 2336 ▇▅▂▁▁
TotalBsmtSF 0 1.00 1057.43 438.71 0 795.75 991.5 1298.25 6110 ▇▃▁▁▁
X1stFlrSF 0 1.00 1162.63 386.59 334 882.00 1087.0 1391.25 4692 ▇▅▁▁▁
X2ndFlrSF 0 1.00 346.99 436.53 0 0.00 0.0 728.00 2065 ▇▃▂▁▁
LowQualFinSF 0 1.00 5.84 48.62 0 0.00 0.0 0.00 572 ▇▁▁▁▁
GrLivArea 0 1.00 1515.46 525.48 334 1129.50 1464.0 1776.75 5642 ▇▇▁▁▁
BsmtFullBath 0 1.00 0.43 0.52 0 0.00 0.0 1.00 3 ▇▆▁▁▁
BsmtHalfBath 0 1.00 0.06 0.24 0 0.00 0.0 0.00 2 ▇▁▁▁▁
FullBath 0 1.00 1.57 0.55 0 1.00 2.0 2.00 3 ▁▇▁▇▁
HalfBath 0 1.00 0.38 0.50 0 0.00 0.0 1.00 2 ▇▁▅▁▁
BedroomAbvGr 0 1.00 2.87 0.82 0 2.00 3.0 3.00 8 ▁▇▂▁▁
KitchenAbvGr 0 1.00 1.05 0.22 0 1.00 1.0 1.00 3 ▁▇▁▁▁
TotRmsAbvGrd 0 1.00 6.52 1.63 2 5.00 6.0 7.00 14 ▂▇▇▁▁
Fireplaces 0 1.00 0.61 0.64 0 0.00 1.0 1.00 3 ▇▇▁▁▁
GarageYrBlt 81 0.94 1978.51 24.69 1900 1961.00 1980.0 2002.00 2010 ▁▁▅▅▇
GarageCars 0 1.00 1.77 0.75 0 1.00 2.0 2.00 4 ▁▃▇▂▁
GarageArea 0 1.00 472.98 213.80 0 334.50 480.0 576.00 1418 ▂▇▃▁▁
WoodDeckSF 0 1.00 94.24 125.34 0 0.00 0.0 168.00 857 ▇▂▁▁▁
OpenPorchSF 0 1.00 46.66 66.26 0 0.00 25.0 68.00 547 ▇▁▁▁▁
EnclosedPorch 0 1.00 21.95 61.12 0 0.00 0.0 0.00 552 ▇▁▁▁▁
X3SsnPorch 0 1.00 3.41 29.32 0 0.00 0.0 0.00 508 ▇▁▁▁▁
ScreenPorch 0 1.00 15.06 55.76 0 0.00 0.0 0.00 480 ▇▁▁▁▁
PoolArea 0 1.00 2.76 40.18 0 0.00 0.0 0.00 738 ▇▁▁▁▁
MiscVal 0 1.00 43.49 496.12 0 0.00 0.0 0.00 15500 ▇▁▁▁▁
MoSold 0 1.00 6.32 2.70 1 5.00 6.0 8.00 12 ▃▆▇▃▃
YrSold 0 1.00 2007.82 1.33 2006 2007.00 2008.0 2009.00 2010 ▇▇▇▇▅
SalePrice 0 1.00 180921.20 79442.50 34900 129975.00 163000.0 214000.00 755000 ▇▅▁▁▁

Cleaning the dataset

# Functions to replace NAs with most frequent level or median
#' Replace missing values in a factor with its most frequent level.
#'
#' @param x A factor. A character vector is converted to a factor first.
#' @return `x` with every `NA` replaced by the most frequent observed level
#'   (ties go to the level that comes first in `levels(x)`). `x` is returned
#'   unchanged when it has no `NA` or no level to impute with.
replace_na_most <- function(x) {
  if (is.character(x)) {
    x <- factor(x)
  }
  stopifnot(is.factor(x))

  missing <- is.na(x)
  if (!any(missing) || nlevels(x) == 0L) {
    return(x)
  }

  # table() drops NA by default, so the mode is taken over observed values only.
  # Plain assignment is used instead of forcats::fct_explicit_na(), which is
  # deprecated as of forcats 1.0.0 and warns on every call.
  x[missing] <- names(which.max(table(x)))
  x
}
#' Fill the gaps in a numeric vector with the median of its observed values.
#'
#' @param x A numeric vector, possibly containing `NA`.
#' @return `x` with each `NA` swapped for the median computed after dropping
#'   the missing values.
replace_na_med <- function(x) {
  fill_value <- median(x, na.rm = TRUE)
  replace(x, is.na(x), fill_value)
}
#' Minimal clean-up: impute every missing value in a data frame.
#'
#' Factor columns are passed through replace_na_most() and numeric columns
#' through replace_na_med(); all other columns are left untouched.
#'
#' @param data A data frame.
#' @return `data` with the factor and numeric columns imputed.
cleanup_minimal <- function(data) {
  # across(where(...)) replaces the superseded mutate_if().
  data %>%
    mutate(across(where(is.factor), replace_na_most)) %>%
    mutate(across(where(is.numeric), replace_na_med))
}

# Each data set is imputed from its own columns.
train_minclean <- cleanup_minimal(train_raw)
test_minclean <- cleanup_minimal(test_raw)

Run the simplest tree algorithm and plot it

# Fit a regression tree for SalePrice with rpart's default settings.
# Id is only a row identifier, so it is removed from the predictors
# (`. - Id`); otherwise the tree would be allowed to split on a row number.
mod_rpart <- rpart(SalePrice ~ . - Id, data = train_minclean)
# Try this command to make a nice tree plot!
fancyRpartPlot(mod_rpart, caption = NULL)

Exporting the predictions in appropriate format

# Score the test set with the fitted tree, then pair each prediction with the
# Id taken from the raw test data.
pred_rpart <- predict(mod_rpart, newdata = test_minclean)
submission_rpart <- tibble(
  Id = test_raw$Id,
  SalePrice = pred_rpart
)
submission_rpart %>% head()
## # A tibble: 6 x 2
##      Id SalePrice
##   <int>     <dbl>
## 1  1461   118199.
## 2  1462   151246.
## 3  1463   185210.
## 4  1464   185210.
## 5  1465   249392.
## 6  1466   185210.
# The destination below is machine-specific; point it at your own folder.
write_csv(
  submission_rpart,
  file = "C:/Users/Asus/OneDrive - University of Georgia/PhD/Spring 2022/AAEC 8610/HW11/home-data-for-ml-course/submission_rpart.csv"
)

My submission of predictions using rpart in Kaggle

Task 2: Push further, as much as you like

# Training a random forest.
# The forest is built from random bootstrap samples and random feature
# subsets, so the seed is fixed to make the submitted predictions reproducible.
set.seed(2022)
# Id is a row identifier, not a feature, so it is excluded from the predictors.
mod_rf <- randomForest(SalePrice ~ . - Id, data = train_minclean)

# Train and test were read separately, so the same factor column can carry a
# different set of levels in each, and prediction then fails with an error.
# rbind() expands factor levels, so stacking one training row (which still
# carries every training level) on top of the test data and dropping it again
# leaves the test rows unchanged but gives their factors the training levels.
# NOTE(review): this only helps when the test data has no level that is absent
# from the training data -- confirm for new data sets.
trainX <- select(train_minclean, -SalePrice)
test_minclean <- rbind(trainX[1, ], test_minclean)
test_minclean <- test_minclean[-1, ]

# Get my predictions:
pred_rf <- predict(mod_rf, newdata = test_minclean)
submission_rf_1 <- tibble(Id = test_raw$Id, SalePrice = pred_rf)
write_csv(submission_rf_1, file="C:/Users/Asus/OneDrive - University of Georgia/PhD/Spring 2022/AAEC 8610/HW11/home-data-for-ml-course/submission_rf_1.csv")

My submission of predictions using randomForest in Kaggle

# Merging, then normalizing and hot-encoding.
# Train and test predictors are stacked so that both go through exactly the
# same scaling and receive the same dummy columns.
merged <- rbind(select(train_minclean, -SalePrice), test_minclean)

# Fit the pre-processor on everything except the Id column (selected by name
# rather than by position); predict() then leaves Id untouched.
preProcessor <- preProcess(select(merged, -Id))
normed <- predict(preProcessor, newdata = merged)
hotencoder <- dummyVars(" ~ .", data = normed)
encoded <- predict(hotencoder, newdata = normed)

# Splitting training and testing back out. The first n_train rows of `encoded`
# are the training rows, the remaining ones the test rows.
n_train <- nrow(train_minclean)
train_encoded <- encoded[seq_len(n_train), , drop = FALSE]
train_proc <- cbind(train_encoded, SalePrice = train_raw$SalePrice)
# Drop the Id column so it is not used as a predictor.
train_proc <- train_proc[, colnames(train_proc) != "Id", drop = FALSE]
test_proc <- encoded[n_train + seq_len(nrow(encoded) - n_train), , drop = FALSE]
# Training a neural net.
# SalePrice is divided by price_scale, which is roughly its maximum (755000 in
# the training data), so the target lies in about [0, 1]. Predictions are
# multiplied by the same constant to get back to prices.
price_scale <- 800000
# nnet() starts from random initial weights; fix the seed for reproducibility.
set.seed(2022)
mod_nn <- nnet(SalePrice / price_scale ~ ., data = train_proc, size = 15,
               linout = TRUE, MaxNWts = 10000)
## # weights:  4351
## initial  value 232.164465 
## iter  10 value 5.777563
## iter  20 value 3.309414
## iter  30 value 1.550244
## iter  40 value 1.002767
## iter  50 value 0.734708
## iter  60 value 0.598571
## iter  70 value 0.509123
## iter  80 value 0.432249
## iter  90 value 0.355826
## iter 100 value 0.287284
## final  value 0.287284 
## stopped after 100 iterations
# NOTE(review): the recorded log ends with "stopped after 100 iterations" while
# the objective is still falling, so the fit hit the iteration cap rather than
# converging -- consider passing a larger maxit.
# Now predicting back:
pred_nnet <- predict(mod_nn, newdata = test_proc) * price_scale
pp <- data.frame(pred_nnet)
submission_nnet <- tibble(Id = test_raw$Id, SalePrice = pp$pred_nnet)
write_csv(submission_nnet, file="C:/Users/Asus/OneDrive - University of Georgia/PhD/Spring 2022/AAEC 8610/HW11/home-data-for-ml-course/submission_nnet.csv")

My submission of predictions using nnet in Kaggle: