# R Markdown

######################################################################################
#######                                                                      #########
####### ******* Decision Trees for Classification and Regression *********** #########
#######                                                                      #########
######################################################################################

# Change the working directory to the folder that holds housing.csv and
# germancredit.csv; both reads below use relative file names.
# NOTE(review): a hard-coded setwd() ties the script to one machine's layout.
# It is kept so the script's side effects are unchanged -- consider an RStudio
# project or file.path() with a configurable data directory instead.
setwd("~/..")

# Load the housing data (used for the regression example).
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
housing <- read.csv("housing.csv", sep = ",", header = TRUE)

# preview housing data
# head(housing, 3)

# Load the German credit data (used for the classification example).
# stringsAsFactors = TRUE keeps the text columns (history, purpose, housing)
# as factors on R >= 4.0 as well, where the read.csv() default became FALSE.
# The factor levels are therefore fixed on the full data set, before any
# train/test split, so both subsets share the same set of levels.
german <- read.csv("germancredit.csv", sep = ",", header = TRUE,
                   stringsAsFactors = TRUE)

# preview german data
# head(german, 3)

####### select columns ###############################################################
# Restrict the German credit data to the variables used in this analysis:
# duration, amount, installment and age, plus loan history, purpose and
# housing (called "rent" in the original notes), together with the Default
# target.
analysis_cols <- c("duration", "amount", "installment", "age", "history",
                   "purpose", "housing", "Default")
german <- german[, analysis_cols]

# Inspect the structure and type of every retained column.
str(german)
## 'data.frame':    1000 obs. of  8 variables:
##  $ duration   : int  6 48 12 42 24 36 24 36 12 30 ...
##  $ amount     : int  1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
##  $ installment: int  4 2 2 2 3 2 3 2 2 4 ...
##  $ age        : int  67 22 49 45 53 35 53 35 61 28 ...
##  $ history    : Factor w/ 5 levels "A30","A31","A32",..: 5 3 5 3 4 3 3 3 3 5 ...
##  $ purpose    : Factor w/ 10 levels "A40","A41","A410",..: 5 5 8 4 1 8 4 2 5 1 ...
##  $ housing    : Factor w/ 3 levels "A151","A152",..: 2 2 2 3 3 3 2 1 2 2 ...
##  $ Default    : int  0 1 0 0 1 0 0 0 0 1 ...

# Turn the 0/1 target into a two-level factor.
german[["Default"]] <- factor(german[["Default"]], levels = c(0, 1))

####### Splitting the german and housing data into Training and Test Sets ############

# Install caTools on first use, then attach it.
# requireNamespace() only checks that the package is available; library()
# does the attaching and stops with an error if the package is still missing.
if (!requireNamespace("caTools", quietly = TRUE)) {
  install.packages("caTools", dependencies = TRUE)
}
library(caTools)

# German credit data: 90% training / 10% test, split on the Default target.
# The seed is set once here; the housing split further down continues the
# same random stream, so the statement order matters for reproducibility.
set.seed(1000)
split <- sample.split(german$Default, SplitRatio = 0.9)
train_german <- subset(german, split == TRUE)
test_german <- subset(german, split == FALSE)

# Housing data: keep only the numeric columns.
# vapply() always yields a logical vector, whereas sapply() changes its return
# type on degenerate input (e.g. a data frame with no columns).
numeric_cols <- names(housing)[vapply(housing, is.numeric, logical(1))]
housing <- housing[numeric_cols]

# Drop every row that still contains an NA.
housing <- na.omit(housing)

# Housing data: 90% training / 10% test, split on SalePrice.
# NOTE(review): sample.split() is documented as preserving label ratios;
# SalePrice is continuous, so confirm the resulting subset sizes are as
# intended (e.g. nrow(train_housing) and nrow(test_housing)).
split <- sample.split(housing$SalePrice, SplitRatio = 0.9)
train_housing <- subset(housing, split == TRUE)
test_housing <- subset(housing, split == FALSE)

####### Decision Tree for regression problem #########################################

# Install rpart on first use, then attach it.
if (!requireNamespace("rpart", quietly = TRUE)) {
  install.packages("rpart", dependencies = TRUE)
}
library(rpart)

# Fit the regression tree for SalePrice on all remaining numeric columns.
# The original call used lm(), i.e. a linear model rather than the decision
# tree this section is about (and it produced a rank-deficient-fit warning).
# NOTE(review): "SalePrice ~ ." also uses the Id column as a predictor, which
# looks like a row identifier -- confirm whether it should be dropped first.
regression.tree <- rpart(SalePrice ~ ., data = train_housing)

# Predict SalePrice for the held-out rows.
y_housing <- predict(regression.tree, newdata = test_housing)

# Evaluate the regression with error metrics. A confusion matrix only makes
# sense for class labels, not for continuous actual/predicted values.
rmse_housing <- sqrt(mean((test_housing$SalePrice - y_housing)^2))
mae_housing <- mean(abs(test_housing$SalePrice - y_housing))
# print(c(RMSE = rmse_housing, MAE = mae_housing))

####### Decision Tree for classification problem #####################################

# Grow the tree for Default from the seven predictors in the training subset.
classification.tree <- rpart(
  formula = Default ~ duration + amount + installment + age +
    history + purpose + housing,
  data = train_german
)

# Predicted class label for every row of the test subset.
y_german <- predict(classification.tree, newdata = test_german, type = "class")

# Confusion matrix: actual labels in rows, predicted labels in columns.
cm <- table(test_german$Default, y_german)
# print(cm)

####### Drawing the Training Set Decision Results ####################################
# chart 1

# Install the plotting packages on first use, then attach them.
# requireNamespace() only checks availability; library() does the attaching.
if (!requireNamespace("plotmo", quietly = TRUE)) {
  install.packages("plotmo", dependencies = TRUE)
}
if (!requireNamespace("rpart.plot", quietly = TRUE)) {
  install.packages("rpart.plot", dependencies = TRUE)
}
library(plotmo)
library(rpart.plot)


# Regression tree for the continuous SalePrice response, fitted with rpart on
# the housing training subset.
regression.tree <- rpart(SalePrice ~ ., data = train_housing)

# Draw the fitted decision tree.
prp(regression.tree, main = "Housing SalePrice Regression Tree")

# Surface plot of the fitted regression model.
plotmo(regression.tree, main = "Housing SalePrice Regression Surface Plot")
##  plotmo grid:    Id MSSubClass LotFrontage LotArea OverallQual OverallCond
##                 716         50          70    9399           6           5
##  YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF
##       1974         1995          0        378          0       502
##  TotalBsmtSF X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
##         1004      1104         0            0      1484            0
##  BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr TotRmsAbvGrd
##             0        2        0            3            1            6
##  Fireplaces GarageYrBlt GarageCars GarageArea WoodDeckSF OpenPorchSF
##           1        1981          2        484          0          28
##  EnclosedPorch X3SsnPorch ScreenPorch PoolArea MiscVal MoSold YrSold
##              0          0           0        0       0      6   2008

# chart 2

# Install the plotting packages on first use, then attach them.
# requireNamespace() only checks availability; library() does the attaching.
if (!requireNamespace("rpart.plot", quietly = TRUE)) {
  install.packages("rpart.plot", dependencies = TRUE)
}
if (!requireNamespace("rattle", quietly = TRUE)) {
  install.packages("rattle", dependencies = TRUE)
}
library(rpart.plot)
library(rattle)

# Alternative rendering, kept for reference:
# rpart.plot(classification.tree, uniform = FALSE,
#            main = "German Credit Default Classification Tree")
# text(classification.tree, use.n = TRUE, all = TRUE, cex = .8)

# Draw the classification tree (title spelling corrected: "Credit").
fancyRpartPlot(classification.tree,
               main = "German Credit Default Classification Tree")