Read data

data <- read.csv("kc_house_data.csv", sep = ",", header = TRUE)

# count missing values, then drop the rows containing them
sum(is.na(data))
## [1] 2
data <- na.omit(data)
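A quick dimension check (a sketch; output omitted here) confirms how many observations remain after dropping the incomplete rows:

dim(data)  # rows and columns after na.omit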

After removing the incomplete rows, the dataset has 21,611 observations of 21 variables, but not all of them are useful for modeling. The columns id, sqft_living, sqft_lot, view, zipcode, lat, and long are dropped below. Two derived variables are added: the house's age (age = year of sale minus year built) and a recoded yr_renovated that simply indicates whether the house was ever renovated.

data1 <- data[, -c(1, 6, 7, 10, 17, 18, 19)]
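Dropping columns by position is fragile if the column order ever changes; an equivalent sketch (assuming the standard kc_house_data column names) drops them by name instead:

# same selection as above, but robust to column reordering
drop_cols <- c("id", "sqft_living", "sqft_lot", "view", "zipcode", "lat", "long")
data1 <- data[, !(names(data) %in% drop_cols)]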

# convert the date string to a Date object
data1$date <- as.Date(data1$date, "%Y%m%dT000000")

# extract the year of sale from the date
data1$date <- as.numeric(format(data1$date, "%Y"))

# age of the house = year of sale minus year built
data1$age <- data1$date - data1$yr_built

# remove the date column; the year of sale is now captured in age
data1 <- data1[, -1]

# A few houses were sold before construction finished, so age has some
# negative values; add 2 to shift every age to a positive number
data1$age <- data1$age + 2
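To verify the shift, the age distribution can be inspected (a sketch; output omitted here):

summary(data1$age)  # the minimum should now be positive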

# Recode yr_renovated as a binary indicator (0 = never renovated, 1 = renovated)
# and convert the categorical variables to factors
data1$yr_renovated <- ifelse(data1$yr_renovated == 0, 0, 1)
data1$yr_renovated <- as.factor(data1$yr_renovated)
data1$waterfront <- as.factor(data1$waterfront)

# remove yr_built (column 10), which is now redundant given age
data1 <- data1[, -10]
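At this point data1 should contain price plus 12 predictors; a structure check (a sketch; output omitted here) confirms it:

str(data1)  # 13 columns: price and 12 predictors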

Outlier Detection and Removal

# flag prices beyond the boxplot whiskers (the 1.5 * IQR rule) as outliers
outliers <- boxplot(data1$price, plot = FALSE)$out
outliers_data <- data1[which(data1$price %in% outliers), ]
data2 <- data1[-which(data1$price %in% outliers), ]
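A quick count shows how many observations were flagged and how many remain (a sketch; output omitted here):

length(outliers)  # flagged price outliers
nrow(data2)       # observations kept for modeling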

Creating training data and test data

set.seed(1234)
# 80/20 split: sample row indices for training, keep the rest for testing
index <- sample(1:nrow(data2), 0.80 * nrow(data2))
train <- data2[index, ]
test <- data2[-index, ]
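Note that sample() truncates the fractional size 0.80 * nrow(data2) to an integer; the resulting split sizes can be verified (a sketch; output omitted here):

nrow(train)  # roughly 80% of data2
nrow(test)   # the remaining 20%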

Loading Necessary packages

library(caret)
library(rpart)
library(rpart.plot)

Using the caret package, set up repeated cross-validation so that model performance is averaged over many resamples, giving a more stable estimate

cv <- trainControl(method = "repeatedcv", number = 10, repeats = 5, allowParallel = TRUE)

# Method used is 'repeated cross-validation'
# number = 10: each repeat uses 10-fold cross-validation
# repeats = 5: the 10-fold partitioning is repeated five times with different
# random splits, so every candidate model is evaluated on 50 resamples,
# which reduces the variance of the performance estimate

Using this control object, train the model and grow the tree with caret's train() function

mytree1 <- train(price ~ ., data = train, method = "rpart", trControl = cv, tuneLength = 10)
print(mytree1)
## CART 
## 
## 16361 samples
##    12 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 14724, 14725, 14725, 14726, 14725, 14725, ... 
## Resampling results across tuning parameters:
## 
##   cp           RMSE      Rsquared   MAE     
##   0.006239760  145677.5  0.5106254  113464.6
##   0.008609071  148188.7  0.4936202  115576.4
##   0.008624337  148188.7  0.4936202  115576.4
##   0.008852668  148714.1  0.4900319  116013.1
##   0.010989269  150199.6  0.4798364  117260.1
##   0.013595740  152397.7  0.4645928  119076.8
##   0.031843290  159274.8  0.4147980  125165.2
##   0.032032086  161976.0  0.3948651  127752.0
##   0.091358050  171059.4  0.3246606  135914.8
##   0.277392252  192920.8  0.2637954  155047.5
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.00623976.
# The optimal value cp = 0.00623976 was selected for the final decision tree because it produced the lowest RMSE
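To see how RMSE changes across the cp grid, the tuning results can be plotted with caret's built-in plot method for train objects (a sketch; the figure is not shown here):

# RMSE on the y-axis against the complexity parameter on the x-axis
plot(mytree1)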

We can also build the tree directly with the rpart package, reusing the cp value selected by caret

mytree2 <- rpart(price ~ ., data = train, method = "anova", cp = 0.00623976)
print(mytree2)
## n= 16361 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 16361 7.093170e+14 476484.7  
##    2) grade< 8.5 13768 4.194966e+14 428893.3  
##      4) grade< 7.5 9023 2.079934e+14 379142.5  
##        8) grade< 6.5 1827 2.678452e+13 292395.1 *
##        9) grade>=6.5 7196 1.639700e+14 401166.8  
##         18) age< 63.5 5007 7.967812e+13 359970.1  
##           36) sqft_living15< 1837 3375 3.885363e+13 332532.9 *
##           37) sqft_living15>=1837 1632 3.302961e+13 416710.5 *
##         19) age>=63.5 2189 5.635683e+13 495398.1  
##           38) sqft_lot15>=5823.5 932 2.405151e+13 434059.3 *
##           39) sqft_lot15< 5823.5 1257 2.619876e+13 540877.6 *
##      5) grade>=7.5 4745 1.467013e+14 523498.7  
##       10) age< 54.5 3804 8.911568e+13 489082.0  
##         20) sqft_living15< 2439.5 3020 6.150121e+13 468381.0 *
##         21) sqft_living15>=2439.5 784 2.133512e+13 568823.1 *
##       11) age>=54.5 941 3.486472e+13 662628.5 *
##    3) grade>=8.5 2593 9.306143e+13 729179.7  
##      6) grade< 9.5 1874 6.558341e+13 691405.1  
##       12) age< 46.5 1657 5.352875e+13 670729.1 *
##       13) age>=46.5 217 5.937272e+12 849286.0 *
##      7) grade>=9.5 719 1.783432e+13 827635.4 *
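As an optional check, rpart stores split-improvement importance scores on the fitted object, which rank the variables driving the tree (a sketch; output omitted here):

# importance scores, sorted from most to least influential
mytree2$variable.importance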

Viewing the tree built by caret, using the rpart.plot package

# plot the underlying rpart object stored in the trained model's finalModel slot
rpart.plot(mytree1$finalModel, cex = 0.75, digits = -3, roundint = FALSE,
           fallen.leaves = FALSE, box.palette = "RdYlGn", shadow.col = "gray")

Prediction of test data using decision tree

# predict the test data using the tree built with the rpart package
predicted2 <- predict(mytree2, test)

# predict the test data using the tree built with the caret package
predicted3 <- predict(mytree1, test)

Calculating MAPE to check the prediction accuracy of the decision tree built with the rpart package

comparison1 <- data.frame(Actual_value = test$price, Predicted_value = predicted2)

# MAPE = mean(|actual - predicted| / actual)
mape_test2 <- mean(abs(comparison1$Actual_value - comparison1$Predicted_value) / comparison1$Actual_value)
mape_test2
## [1] 0.2756017
accuracy_test2 <- (1-mape_test2)*100
accuracy_test2
## [1] 72.43983
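For additional error measures on the same predictions, caret's postResample() reports RMSE, R-squared, and MAE (a sketch; output omitted here):

# RMSE, Rsquared, and MAE on the held-out test set
postResample(pred = predicted2, obs = test$price)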

Calculating MAPE for the decision tree built with the caret package. Since both trees were grown with the same cp value, their predictions, and hence their accuracy, are expected to match.

comparison2 <- data.frame(Actual_value = test$price, Predicted_value = predicted3)
mape_test3 <- mean(abs(comparison2$Actual_value - comparison2$Predicted_value) / comparison2$Actual_value)
mape_test3
## [1] 0.2756017
accuracy_test3 <- (1-mape_test3)*100
accuracy_test3
## [1] 72.43983
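To avoid repeating the calculation, the MAPE logic could be wrapped in a small helper (a sketch; mape is a hypothetical convenience function, not part of any package):

# mean absolute percentage error of predictions against actual values
mape <- function(actual, predicted) mean(abs(actual - predicted) / actual)
mape(test$price, predicted2)  # tree built with rpart
mape(test$price, predicted3)  # tree built with caret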

THANK YOU