NOTE: Before starting this assignment, please remember to clear your environment. You can do that by running the following code chunk:
# Remove every object from the current environment, including names that begin
# with a dot. The argument is spelled out as `all.names` instead of relying on
# partial matching of `all`.
rm(list = ls(all.names = TRUE))
Get the data
Data Pre-processing
Build a model
Predictions
Communication
Make sure the dataset is located in your current working directory, or else you can change your working directory using the “setwd()” function.
# Point R at the folder that holds the CSV file for this exercise.
setwd("C:\\Users\\C5215696\\Desktop\\Data Science\\Decision Trees")
# Read the data. stringsAsFactors = TRUE keeps `gender` and `disease` as factors
# on R >= 4.0 as well (where the default became FALSE); the str() output and the
# C5.0 model further down both expect a factor outcome.
des_data <- read.csv("ilpd_data.csv", stringsAsFactors = TRUE)
Use the str(), summary(), head() and tail() functions to get the dimensions and types of attributes in the dataset
The dataset has 582 observations and 11 variables
The variable descriptions are given below:
1 - age : Age of the patient
2 - gender : Gender of the patient
3 - TB : Total Bilirubin content
4 - DB : Direct Bilirubin content
5 - alk_phos : Alkaline Phosphatase content
6 - alamine : Alanine Aminotransferase content
7 - aspartate : Aspartate Aminotransferase content
8 - TP : Total Proteins content
9 - albumin : Albumin content
10 - A.G : Ratio of Albumin to Globulin (A/G)
11 - disease : Whether the patient has liver disease or not (yes/no)
# Show the dimensions of the data frame and the type of every column.
str(des_data)
## 'data.frame': 582 obs. of 11 variables:
## $ age : int 62 62 58 72 46 26 29 17 55 57 ...
## $ gender : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 1 1 2 2 2 ...
## $ TB : num 10.9 7.3 1 3.9 1.8 0.9 0.9 0.9 0.7 0.6 ...
## $ DB : num 5.5 4.1 0.4 2 0.7 0.2 0.3 0.3 0.2 0.1 ...
## $ alk_phos : int 699 490 182 195 208 154 202 202 290 210 ...
## $ alamine : int 64 60 14 27 19 16 14 22 53 51 ...
## $ aspartate: int 100 68 20 59 14 12 11 19 58 59 ...
## $ TP : num 7.5 7 6.8 7.3 7.6 7 6.7 7.4 6.8 5.9 ...
## $ albumin : num 3.2 3.3 3.4 2.4 4.4 3.5 3.6 4.1 3.4 2.7 ...
## $ A.G : num 0.74 0.89 1 0.4 1.3 1 1.1 1.2 1 0.8 ...
## $ disease : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 1 2 2 ...
# Per-column summary statistics; the output also reports NA counts per column.
summary(des_data)
## age gender TB DB
## Min. : 4.00 Female:141 Min. : 0.400 Min. : 0.100
## 1st Qu.:33.00 Male :441 1st Qu.: 0.800 1st Qu.: 0.200
## Median :45.00 Median : 1.000 Median : 0.300
## Mean :44.71 Mean : 3.303 Mean : 1.488
## 3rd Qu.:57.75 3rd Qu.: 2.600 3rd Qu.: 1.300
## Max. :90.00 Max. :75.000 Max. :19.700
##
## alk_phos alamine aspartate TP
## Min. : 63.0 Min. : 10.00 Min. : 10.0 Min. :2.700
## 1st Qu.: 175.2 1st Qu.: 23.00 1st Qu.: 25.0 1st Qu.:5.800
## Median : 208.0 Median : 35.00 Median : 42.0 Median :6.600
## Mean : 290.8 Mean : 80.82 Mean : 110.1 Mean :6.483
## 3rd Qu.: 298.0 3rd Qu.: 60.75 3rd Qu.: 87.0 3rd Qu.:7.200
## Max. :2110.0 Max. :2000.00 Max. :4929.0 Max. :9.600
##
## albumin A.G disease
## Min. :0.900 Min. :0.3000 no :167
## 1st Qu.:2.600 1st Qu.:0.7000 yes:415
## Median :3.100 Median :0.9400
## Mean :3.142 Mean :0.9471
## 3rd Qu.:3.800 3rd Qu.:1.1000
## Max. :5.500 Max. :2.8000
## NA's :4
# Print the first 10 rows.
head(des_data,n=10)
## age gender TB DB alk_phos alamine aspartate TP albumin A.G disease
## 1 62 Male 10.9 5.5 699 64 100 7.5 3.2 0.74 yes
## 2 62 Male 7.3 4.1 490 60 68 7.0 3.3 0.89 yes
## 3 58 Male 1.0 0.4 182 14 20 6.8 3.4 1.00 yes
## 4 72 Male 3.9 2.0 195 27 59 7.3 2.4 0.40 yes
## 5 46 Male 1.8 0.7 208 19 14 7.6 4.4 1.30 yes
## 6 26 Female 0.9 0.2 154 16 12 7.0 3.5 1.00 yes
## 7 29 Female 0.9 0.3 202 14 11 6.7 3.6 1.10 yes
## 8 17 Male 0.9 0.3 202 22 19 7.4 4.1 1.20 no
## 9 55 Male 0.7 0.2 290 53 58 6.8 3.4 1.00 yes
## 10 57 Male 0.6 0.1 210 51 59 5.9 2.7 0.80 yes
# Print the last rows (tail() shows 6 by default).
tail(des_data)
## age gender TB DB alk_phos alamine aspartate TP albumin A.G
## 577 32 Male 12.7 8.4 190 28 47 5.4 2.6 0.90
## 578 60 Male 0.5 0.1 500 20 34 5.9 1.6 0.37
## 579 40 Male 0.6 0.1 98 35 31 6.0 3.2 1.10
## 580 52 Male 0.8 0.2 245 48 49 6.4 3.2 1.00
## 581 31 Male 1.3 0.5 184 29 32 6.8 3.4 1.00
## 582 38 Male 1.0 0.3 216 21 24 7.3 4.4 1.50
## disease
## 577 yes
## 578 no
## 579 yes
## 580 yes
## 581 yes
## 582 no
# Count the missing values in each column.
colSums(is.na(des_data))
## age gender TB DB alk_phos alamine aspartate
## 0 0 0 0 0 0 0
## TP albumin A.G disease
## 0 0 4 0
# Structure check before splitting; the data frame has not been modified since
# it was read, so this prints the same structure as the earlier str() call.
str(des_data)
## 'data.frame': 582 obs. of 11 variables:
## $ age : int 62 62 58 72 46 26 29 17 55 57 ...
## $ gender : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 1 1 2 2 2 ...
## $ TB : num 10.9 7.3 1 3.9 1.8 0.9 0.9 0.9 0.7 0.6 ...
## $ DB : num 5.5 4.1 0.4 2 0.7 0.2 0.3 0.3 0.2 0.1 ...
## $ alk_phos : int 699 490 182 195 208 154 202 202 290 210 ...
## $ alamine : int 64 60 14 27 19 16 14 22 53 51 ...
## $ aspartate: int 100 68 20 59 14 12 11 19 58 59 ...
## $ TP : num 7.5 7 6.8 7.3 7.6 7 6.7 7.4 6.8 5.9 ...
## $ albumin : num 3.2 3.3 3.4 2.4 4.4 3.5 3.6 4.1 3.4 2.7 ...
## $ A.G : num 0.74 0.89 1 0.4 1.3 1 1.1 1.2 1 0.8 ...
## $ disease : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 1 2 2 ...
Use stratified sampling to split the data into train/test sets (70/30)
Use the createDataPartition() function from the caret package to do stratified sampling
library(caret)
## Warning: package 'caret' was built under R version 3.3.3
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.3.3
# Fix the RNG seed so the stratified split is reproducible.
set.seed(786)
# Draw 70% of the row indices, stratified on the outcome. list = FALSE returns
# the indices as a matrix instead of a list so they can index rows directly.
# (FALSE is spelled out because `F` is an ordinary, reassignable variable.)
train_rows <- createDataPartition(des_data$disease, p = 0.7, list = FALSE)
train_data <- des_data[train_rows, ]
test_data <- des_data[-train_rows, ]
# Inspect the training split (row count and column types).
str(train_data)
## 'data.frame': 408 obs. of 11 variables:
## $ age : int 62 72 46 26 29 17 55 57 72 64 ...
## $ gender : Factor w/ 2 levels "Female","Male": 2 2 2 1 1 2 2 2 2 2 ...
## $ TB : num 10.9 3.9 1.8 0.9 0.9 0.9 0.7 0.6 2.7 0.9 ...
## $ DB : num 5.5 2 0.7 0.2 0.3 0.3 0.2 0.1 1.3 0.3 ...
## $ alk_phos : int 699 195 208 154 202 202 290 210 260 310 ...
## $ alamine : int 64 27 19 16 14 22 53 51 31 61 ...
## $ aspartate: int 100 59 14 12 11 19 58 59 56 58 ...
## $ TP : num 7.5 7.3 7.6 7 6.7 7.4 6.8 5.9 7.4 7 ...
## $ albumin : num 3.2 2.4 4.4 3.5 3.6 4.1 3.4 2.7 3 3.4 ...
## $ A.G : num 0.74 0.4 1.3 1 1.1 1.2 1 0.8 0.6 0.9 ...
## $ disease : Factor w/ 2 levels "no","yes": 2 2 2 2 2 1 2 2 2 1 ...
# Inspect the test split (row count and column types).
str(test_data)
## 'data.frame': 174 obs. of 11 variables:
## $ age : int 62 58 74 25 38 40 51 52 30 45 ...
## $ gender : Factor w/ 2 levels "Female","Male": 2 2 1 2 2 1 2 2 2 2 ...
## $ TB : num 7.3 1 1.1 0.6 1.8 0.9 2.2 0.9 1.3 2.4 ...
## $ DB : num 4.1 0.4 0.4 0.1 0.8 0.3 1 0.2 0.4 1.1 ...
## $ alk_phos : int 490 182 214 183 342 293 610 156 482 168 ...
## $ alamine : int 60 14 22 91 168 232 17 35 102 33 ...
## $ aspartate: int 68 20 30 53 441 245 28 44 80 50 ...
## $ TP : num 7 6.8 8.1 5.5 7.6 6.8 7.3 4.9 6.9 5.1 ...
## $ albumin : num 3.3 3.4 4.1 2.3 4.4 3.1 2.6 2.9 3.3 2.6 ...
## $ A.G : num 0.89 1 1 0.7 1.3 0.8 0.55 1.4 0.9 1 ...
## $ disease : Factor w/ 2 levels "no","yes": 2 2 2 1 2 2 2 2 2 2 ...
library(DMwR)
## Warning: package 'DMwR' was built under R version 3.3.3
## Loading required package: grid
# Fill in missing values with k-nearest-neighbour imputation (k = 3, scaled
# distances, distance-weighted average of the neighbours).
# The training split is imputed from its own complete cases.
train_data_imputed <- knnImputation(train_data, k = 3, scale = TRUE,
                                    meth = "weighAvg")
# The test split is imputed with neighbours searched in the training split
# (distData), so the values filled into test rows come from training cases
# rather than from other test rows.
test_data_imputed <- knnImputation(test_data, k = 3, scale = TRUE,
                                   meth = "weighAvg", distData = train_data)
library(C50)
## Warning: package 'C50' was built under R version 3.3.3
# Fit two C5.0 classifiers for `disease` using all other columns as predictors:
# a decision tree and, with rules = TRUE, a rule-based model.
# TRUE is spelled out because `T` is an ordinary, reassignable variable.
c5_tree <- C5.0(disease ~ ., data = train_data)
c5_rules <- C5.0(disease ~ ., data = train_data, rules = TRUE)
# Report the variable importance of the tree model using the "usage" metric.
C5imp(c5_tree, metric = "usage")
## Overall
## TB 100.00
## aspartate 65.93
## A.G 59.56
## alk_phos 44.12
## age 16.91
## gender 0.00
## DB 0.00
## alamine 0.00
## TP 0.00
## albumin 0.00
# Print the learned rule set together with its evaluation on the training data.
summary(c5_rules)
##
## Call:
## C5.0.formula(formula = disease ~ ., data = train_data, rules = T)
##
##
## C5.0 [Release 2.07 GPL Edition] Fri Aug 11 22:52:16 2017
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 408 cases (11 attributes) from undefined.data
##
## Rules:
##
## Rule 1: (5, lift 3.0)
## age <= 68
## TB <= 1.6
## aspartate <= 111
## A.G <= 0.52
## -> class no [0.857]
##
## Rule 2: (269/163, lift 1.4)
## TB <= 1.6
## -> class no [0.395]
##
## Rule 3: (12, lift 1.3)
## TB <= 1.6
## alk_phos <= 127
## A.G > 0.88
## -> class yes [0.929]
##
## Rule 4: (139/11, lift 1.3)
## TB > 1.6
## -> class yes [0.915]
##
## Rule 5: (368/100, lift 1.0)
## alk_phos > 146
## -> class yes [0.727]
##
## Default class: yes
##
##
## Evaluation on training data (408 cases):
##
## Rules
## ----------------
## No Errors
##
## 5 102(25.0%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 20 97 (a): class no
## 5 286 (b): class yes
##
##
## Attribute usage:
##
## 100.00% TB
## 93.14% alk_phos
## 4.17% A.G
## 1.23% age
## 1.23% aspartate
##
##
## Time: 0.0 secs
# Draw the fitted C5.0 decision tree.
plot(c5_tree)
# Predict the class of every held-out row with the tree model.
preds <- predict(c5_tree, newdata = test_data)
library(caret)
# Compare predictions with the true labels. positive = "yes" makes sensitivity,
# specificity, PPV, etc. refer to detecting liver disease; without it caret
# takes the first factor level ("no") as the positive class.
confusionMatrix(preds, test_data$disease, positive = "yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 6 10
## yes 44 114
##
## Accuracy : 0.6897
## 95% CI : (0.6152, 0.7575)
## No Information Rate : 0.7126
## P-Value [Acc > NIR] : 0.776
##
## Kappa : 0.0494
## Mcnemar's Test P-Value : 7.098e-06
##
## Sensitivity : 0.12000
## Specificity : 0.91935
## Pos Pred Value : 0.37500
## Neg Pred Value : 0.72152
## Prevalence : 0.28736
## Detection Rate : 0.03448
## Detection Prevalence : 0.09195
## Balanced Accuracy : 0.51968
##
## 'Positive' Class : no
##
NOTE: Before starting this assignment, please remember to clear your environment. You can do that by running the following code chunk:
# Remove every object from the current environment, including names that begin
# with a dot. `all.names = TRUE` replaces the partially matched `all=T`, and
# TRUE is spelled out because `T` is an ordinary, reassignable variable.
rm(list = ls(all.names = TRUE))
The goal of this activity is to predict the heating load of a residential building, if the building parameters are given
Hence, in the future architects would be able to build more energy efficient buildings as they can optimize the building parameters to reduce the heating load
Get the data
Data Pre-processing
Build a model
Predictions
Communication
# Point R at the folder that holds the CSV file for this exercise.
setwd("C:\\Users\\C5215696\\Desktop\\Data Science\\Decision Trees")
# Read the building-energy data (comma-separated, first row holds the column
# names). Uses `<-` for assignment and TRUE instead of the reassignable `T`.
building_data <- read.csv("building_energy.csv", header = TRUE, sep = ",")
Use the str(), summary(), head() and tail() functions to get the dimensions and types of attributes in the dataset
The dataset has 768 observations and 9 variables
# Show the dimensions of the data frame and the type of every column.
str(building_data)
## 'data.frame': 768 obs. of 9 variables:
## $ relative_compactness : num 0.98 0.98 0.98 0.98 0.9 0.9 0.9 0.9 0.86 0.86 ...
## $ surface_area : num 514 514 514 514 564 ...
## $ wall_area : num 294 294 294 294 318 ...
## $ roof_area : num 110 110 110 110 122 ...
## $ overall_height : num 7 7 7 7 7 7 7 7 7 7 ...
## $ orientation : int 2 3 4 5 2 3 4 5 2 3 ...
## $ glazing_area : num 0 0 0 0 0 0 0 0 0 0 ...
## $ glazing_area_distribution: int 0 0 0 0 0 0 0 0 0 0 ...
## $ heating_load : num 15.6 15.6 15.6 15.6 20.8 ...
# Print the first rows (head() shows 6 by default).
head(building_data)
## relative_compactness surface_area wall_area roof_area overall_height
## 1 0.98 514.5 294.0 110.25 7
## 2 0.98 514.5 294.0 110.25 7
## 3 0.98 514.5 294.0 110.25 7
## 4 0.98 514.5 294.0 110.25 7
## 5 0.90 563.5 318.5 122.50 7
## 6 0.90 563.5 318.5 122.50 7
## orientation glazing_area glazing_area_distribution heating_load
## 1 2 0 0 15.55
## 2 3 0 0 15.55
## 3 4 0 0 15.55
## 4 5 0 0 15.55
## 5 2 0 0 20.84
## 6 3 0 0 21.46
# Print the last rows (tail() shows 6 by default).
tail(building_data)
## relative_compactness surface_area wall_area roof_area overall_height
## 763 0.64 784.0 343.0 220.5 3.5
## 764 0.64 784.0 343.0 220.5 3.5
## 765 0.62 808.5 367.5 220.5 3.5
## 766 0.62 808.5 367.5 220.5 3.5
## 767 0.62 808.5 367.5 220.5 3.5
## 768 0.62 808.5 367.5 220.5 3.5
## orientation glazing_area glazing_area_distribution heating_load
## 763 4 0.4 5 18.16
## 764 5 0.4 5 17.88
## 765 2 0.4 5 16.54
## 766 3 0.4 5 16.44
## 767 4 0.4 5 16.48
## 768 5 0.4 5 16.64
# Total number of missing values across the whole data frame.
sum(is.na(building_data))
## [1] 0
# Number of missing values in each column.
colSums(is.na(building_data))
## relative_compactness surface_area
## 0 0
## wall_area roof_area
## 0 0
## overall_height orientation
## 0 0
## glazing_area glazing_area_distribution
## 0 0
## heating_load
## 0
# Check the column types before modelling: every column is numeric or integer.
str(building_data)
## 'data.frame': 768 obs. of 9 variables:
## $ relative_compactness : num 0.98 0.98 0.98 0.98 0.9 0.9 0.9 0.9 0.86 0.86 ...
## $ surface_area : num 514 514 514 514 564 ...
## $ wall_area : num 294 294 294 294 318 ...
## $ roof_area : num 110 110 110 110 122 ...
## $ overall_height : num 7 7 7 7 7 7 7 7 7 7 ...
## $ orientation : int 2 3 4 5 2 3 4 5 2 3 ...
## $ glazing_area : num 0 0 0 0 0 0 0 0 0 0 ...
## $ glazing_area_distribution: int 0 0 0 0 0 0 0 0 0 0 ...
## $ heating_load : num 15.6 15.6 15.6 15.6 20.8 ...
# Fix the RNG seed so the random 70/30 split, and every number computed from
# it, can be reproduced from run to run.
set.seed(786)
# Number of rows that go into the training set (70%, rounded down).
smp_size <- floor(0.70 * nrow(building_data))
# Randomly pick that many row indices without replacement.
train_index <- sample(seq_len(nrow(building_data)), size = smp_size)
train_data <- building_data[train_index, ]
# Every row that was not drawn forms the test set.
test_data <- building_data[-train_index, ]
library(rpart)
# Grow a regression tree for heating_load on the training rows, using every
# other column as a candidate predictor.
train_reg_tree <- rpart(
  formula = heating_load ~ .,
  data = train_data
)
# Print the complexity-parameter table (CP, number of splits, relative and
# cross-validated error) of the fitted tree.
printcp(train_reg_tree)
##
## Regression tree:
## rpart(formula = heating_load ~ ., data = train_data)
##
## Variables actually used in tree construction:
## [1] glazing_area overall_height relative_compactness
##
## Root node error: 53042/537 = 98.774
##
## n= 537
##
## CP nsplit rel error xerror xstd
## 1 0.793022 0 1.000000 1.001826 0.0385761
## 2 0.084397 1 0.206978 0.207709 0.0153959
## 3 0.033763 2 0.122581 0.123576 0.0097394
## 4 0.013132 3 0.088818 0.089774 0.0067811
## 5 0.012734 4 0.075686 0.082471 0.0066490
## 6 0.010728 5 0.062952 0.070402 0.0059013
## 7 0.010000 6 0.052224 0.061306 0.0047210
# Variable-importance scores stored on the fitted rpart object.
train_reg_tree$variable.importance
## relative_compactness surface_area
## 46539.814 46539.814
## overall_height roof_area
## 42063.259 42063.259
## wall_area glazing_area
## 17030.318 3894.924
## glazing_area_distribution
## 1093.120
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.3.3
# Draw the fitted regression tree.
rpart.plot(train_reg_tree)
# Predict heating load for the held-out rows.
pred_building_test <- predict(train_reg_tree, newdata = test_data)
library(DMwR)
# Regression error metrics of the predictions against the observed heating load.
regr.eval(
  trues = test_data$heating_load,
  preds = pred_building_test
)
## mae mse rmse mape
## 2.2357982 7.6465711 2.7652434 0.1186324