Read data
data <- read.csv("kc_house_data.csv", sep = ",", header = TRUE)
# omit missing data
sum(is.na(data))
## [1] 2
data <- na.omit(data)
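To see where the two missing values sit before dropping them, one can tabulate them per column (a minimal sketch using base R):

# count missing values per column; only the affected columns should be non-zero
colSums(is.na(data))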
The dataset has 21,611 observations of 21 variables, but not all of them are relevant to the analysis. The code below therefore drops id, sqft_living, sqft_lot, view, zipcode, lat, and long, and adds two derived variables: the house's age (age = year of sale minus year built) and a renovation indicator derived from yr_renovated that records whether the house was ever renovated.
# drop id, sqft_living, sqft_lot, view, zipcode, lat, and long
data1 <- data[, -c(1, 6, 7, 10, 17, 18, 19)]
# converting the date as date vector
data1$date <- as.Date(data1$date, "%Y%m%dT000000")
# Getting 'Year' from date
data1$date <- as.numeric(format(data1$date, "%Y"))
# Calculating age of house by subtracting year of built from year of sale
data1$age <- data1$date - data1$yr_built
# Removing the sale-year column now that age has been derived from it
data1 <- data1[, -1]
# A few ages are negative (the recorded sale year precedes the build year);
# adding 2 shifts all ages to positive values
data1$age <- data1$age + 2
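A quick sanity check (a minimal sketch) confirms that no negative ages remain:

# the minimum age should now be positive
range(data1$age)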
# Converting categorical variables to factors
# recode yr_renovated as a binary indicator: 0 = never renovated, 1 = renovated
data1$yr_renovated <- ifelse(data1$yr_renovated == 0, 0, 1)
data1$yr_renovated <- as.factor(data1$yr_renovated)
data1$waterfront <- as.factor(data1$waterfront)
# Removing yr_built, which is now redundant with age
data1 <- data1[, -10]
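At this point data1 should hold price plus 12 predictors; a quick structure check (a minimal sketch) verifies the column count and the factor conversions:

# confirm 13 columns (price + 12 predictors) and the factor types
str(data1)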
Outlier Detection and Removal
# prices flagged as outliers by the boxplot (1.5 * IQR) rule
outliers <- boxplot(data1$price, plot = FALSE)$out
outliers_data <- data1[which(data1$price %in% outliers), ]
data2 <- data1[-which(data1$price %in% outliers), ]
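The out component corresponds to the standard 1.5 * IQR rule. As a hedged sketch, the same fences can be computed explicitly (boxplot uses hinges, so the values may differ marginally from quantile()):

# fences of the 1.5 * IQR rule, computed by hand
q <- quantile(data1$price, c(0.25, 0.75))
fence <- 1.5 * diff(q)
c(lower = unname(q[1] - fence), upper = unname(q[2] + fence))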
Creating training data and test data
set.seed(1234)
# 80/20 split into training and test sets
index <- sample(1:nrow(data2), 0.80 * nrow(data2))
train <- data2[index, ]
test <- data2[-index, ]
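A hedged alternative: caret's createDataPartition draws a stratified split, which keeps the distribution of price similar in the two sets (object names here are our own):

# stratified 80/20 split (sketch); groups the outcome into quantiles
library(caret)
idx <- createDataPartition(data2$price, p = 0.80, list = FALSE)
train_alt <- data2[idx, ]
test_alt <- data2[-idx, ]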
Loading the necessary packages
library(caret)
library(rpart)
library(rpart.plot)
Using the caret package, set up repeated cross-validation so that the resampling covers many different random fold assignments and the performance estimate is more stable:
cv <- trainControl(method = "repeatedcv", number = 10, repeats = 5, allowParallel = TRUE)
# Method used is 'repeated cross-validation'
# number = 10: 10-fold cross-validation
# repeats = 5: the 10-fold partitioning is repeated 5 times with different
# random fold assignments, giving 10 x 5 = 50 resamples in total
Using this control object, train the model and build the tree with caret's 'train' function; tuneLength = 10 tells caret to evaluate 10 candidate cp values.
mytree1 <- train(price ~ ., data = train, method = 'rpart', trControl = cv, tuneLength = 10)
print(mytree1)
## CART
##
## 16361 samples
## 12 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 14724, 14725, 14725, 14726, 14725, 14725, ...
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.006239760 145677.5 0.5106254 113464.6
## 0.008609071 148188.7 0.4936202 115576.4
## 0.008624337 148188.7 0.4936202 115576.4
## 0.008852668 148714.1 0.4900319 116013.1
## 0.010989269 150199.6 0.4798364 117260.1
## 0.013595740 152397.7 0.4645928 119076.8
## 0.031843290 159274.8 0.4147980 125165.2
## 0.032032086 161976.0 0.3948651 127752.0
## 0.091358050 171059.4 0.3246606 135914.8
## 0.277392252 192920.8 0.2637954 155047.5
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.00623976.
# It can be seen that the optimal cp value of 0.006239760 was selected,
# based on the lowest RMSE, and used to build the final decision tree
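Rather than copying this value by hand, the selected cp can be read off the fitted object (a minimal sketch using caret's bestTune slot):

# the cp value chosen during resampling
best_cp <- mytree1$bestTune$cp
best_cp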
We can also build the tree directly with the rpart package, reusing the cp value selected by caret:
mytree2 <- rpart(price ~ ., data = train, method = 'anova', cp = 0.00623976)
print(mytree2)
## n= 16361
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 16361 7.093170e+14 476484.7
## 2) grade< 8.5 13768 4.194966e+14 428893.3
## 4) grade< 7.5 9023 2.079934e+14 379142.5
## 8) grade< 6.5 1827 2.678452e+13 292395.1 *
## 9) grade>=6.5 7196 1.639700e+14 401166.8
## 18) age< 63.5 5007 7.967812e+13 359970.1
## 36) sqft_living15< 1837 3375 3.885363e+13 332532.9 *
## 37) sqft_living15>=1837 1632 3.302961e+13 416710.5 *
## 19) age>=63.5 2189 5.635683e+13 495398.1
## 38) sqft_lot15>=5823.5 932 2.405151e+13 434059.3 *
## 39) sqft_lot15< 5823.5 1257 2.619876e+13 540877.6 *
## 5) grade>=7.5 4745 1.467013e+14 523498.7
## 10) age< 54.5 3804 8.911568e+13 489082.0
## 20) sqft_living15< 2439.5 3020 6.150121e+13 468381.0 *
## 21) sqft_living15>=2439.5 784 2.133512e+13 568823.1 *
## 11) age>=54.5 941 3.486472e+13 662628.5 *
## 3) grade>=8.5 2593 9.306143e+13 729179.7
## 6) grade< 9.5 1874 6.558341e+13 691405.1
## 12) age< 46.5 1657 5.352875e+13 670729.1 *
## 13) age>=46.5 217 5.937272e+12 849286.0 *
## 7) grade>=9.5 719 1.783432e+13 827635.4 *
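The rpart.plot package loaded earlier can render this tree graphically; a minimal sketch:

# draw the fitted regression tree with default node labels
rpart.plot(mytree2)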
Predicting the test data with the decision trees
# predict the test data using the tree built with the rpart package
predicted2 <- predict(mytree2, test)
# predict the test data using the tree built with the caret package
predicted3 <- predict(mytree1, test)
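For comparison with the cross-validated RMSE reported above (about 145,678), the hold-out RMSE can be computed with caret's RMSE helper (a sketch):

# hold-out RMSE for the rpart tree on the test set
RMSE(predicted2, test$price)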
Calculating the MAPE (mean absolute percentage error) to check the prediction accuracy of the decision tree built with the rpart package
comparison1 <- data.frame(Actual_value = test$price, Predicted_value = predicted2)
mape_test2 <- mean(abs(comparison1$Actual_value - comparison1$Predicted_value) / comparison1$Actual_value)
mape_test2
## [1] 0.2756017
accuracy_test2 <- (1 - mape_test2) * 100
accuracy_test2
## [1] 72.43983
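Since the same computation is repeated below for the caret tree, a small helper keeps it in one place (a sketch; the function name is our own):

# MAPE: mean absolute percentage error
mape <- function(actual, predicted) mean(abs(actual - predicted) / actual)
mape(test$price, predicted2)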
Calculating the MAPE for the decision tree built with the caret package. Since caret refit its final model with the same cp value, the two trees are identical and the MAPE should match exactly.
comparison2 <- data.frame(Actual_value = test$price, Predicted_value = predicted3)
mape_test3 <- mean(abs(comparison2$Actual_value - comparison2$Predicted_value) / comparison2$Actual_value)
mape_test3
## [1] 0.2756017
accuracy_test3 <- (1 - mape_test3) * 100
accuracy_test3
## [1] 72.43983
THANK YOU