Neural Network and C5.0 models

This patient (heart attack diagnosis) dataset (Patient_data.xlsx) was retrieved from the Internet. The file contains 7,998 records. The following screenshot shows what the dataset contains.

Question 1.) Explore the data set, then use a neural network to model this classification problem (no partition at this step).

library(readxl)
# Read the patient workbook from its location on the author's machine.
patientFile <- "C:/Users/esbro/OneDrive/Desktop/IE 575/Week 7/Patient_Data.xlsx"
patientData <- read_excel(path = patientFile)

# Number of rows that are exact copies of an earlier row.
dupRowCount <- sum(duplicated(patientData))
dupRowCount

## [1] 4492

# Compact overview: the type and first few values of every column.
str(patientData)

## tibble [7,998 × 9] (S3: tbl_df/tbl/data.frame)
##  $ age          : num [1:7998] 54 64 63 67 76 69 67 74 69 54 ...
##  $ gender       : chr [1:7998] "Female" "Female" "Female" "Male" ...
##  $ diabetes     : chr [1:7998] "No" "No" "No" "No" ...
##  $ smoker       : chr [1:7998] "No" "No" "No" "Yes" ...
##  $ active       : chr [1:7998] "Yes" "No" "No" "No" ...
##  $ obesity      : chr [1:7998] "No" "No" "No" "No" ...
##  $ heartattack_s: chr [1:7998] "No" "Yes" "Yes" "No" ...
##  $ bp           : chr [1:7998] "Hypertension" "Normal" "Normal" "Hypotension" ...
##  $ cholesteral  : chr [1:7998] "Normal" "Normal" "Highl" "Highl" ...

# Column count (the length of a data frame is its number of columns).
length(patientData)

## [1] 9

# Row count.
nrow(patientData)

## [1] 7998

# Column count again, this time via ncol(); should match length() above.
ncol(patientData)

## [1] 9

# Rows and columns together.
dim(patientData)

## [1] 7998    9

# Variable names.
names(patientData)

## [1] "age"           "gender"        "diabetes"      "smoker"       
## [5] "active"        "obesity"       "heartattack_s" "bp"           
## [9] "cholesteral"

# First six rows of the original data.
head(patientData, n = 6L)

# Last six rows of the original data.
tail(patientData, n = 6L)

# Distinct values taken by blood pressure, cholesterol and the outcome.
uniBp <- unique(patientData[["bp"]])
uniBp

## [1] "Hypertension" "Normal"       "Hypotension"

uniChol <- unique(patientData[["cholesteral"]])
uniChol

## [1] "Normal" "Highl"

uniHkval <- unique(patientData[["heartattack_s"]])
uniHkval

## [1] "No"  "Yes"

# Number of missing values in each column.
colSums(is.na(patientData))

##           age        gender      diabetes        smoker        active 
##             0             0             0             0             0 
##       obesity heartattack_s            bp   cholesteral 
##             0             0             0             0

# Descriptive summary of the only numeric variable in the data.
summary(patientData[["age"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   45.00   55.00   61.00   61.85   68.00   89.00

# Convert every character column to a factor so the classification models
# can use them (C5.0 cannot take character variables).
# Columns are picked by type rather than by position, so the step keeps
# working if columns are added or reordered; the numeric `age` is untouched.
chVars <- names(patientData)[vapply(patientData, is.character, logical(1))]
patientData[chVars] <- lapply(patientData[chVars], factor)

# Drop exact duplicate rows.
patientData <- unique(patientData)
# Sanity check: the remaining rows plus the duplicate count taken earlier
# should equal the original row count, i.e. only duplicate rows were removed.
nrow(patientData) + dupRowCount

## [1] 7998

library(polycor) #to use hetcor function to compute correlation matrix between variables
#of mixed data types
library(corrplot) #to display correlation matrix

## corrplot 0.92 loaded

# Correlation matrix across all nine variables (mixed data types),
# computed with hetcor() and drawn with corrplot().
mixedCor <- with(
  patientData,
  hetcor(age, gender, diabetes, smoker, active, obesity,
         heartattack_s, bp, cholesteral)
)
corrplot(as.matrix(mixedCor))

# Distribution of the continuous variable age.
with(patientData, hist(age))

#getting distribution of data between variables of the dataframe
library(ggplot2)

# Box plot of age within each level of every categorical variable.
# One data-driven loop replaces eight copy-pasted ggplot() calls; the
# plotting order is unchanged. print() is needed because ggplot objects are
# not auto-printed inside a loop, and xlab() keeps the axis label equal to
# the plain column name.
boxplotVars <- c("gender", "obesity", "heartattack_s", "cholesteral",
                 "active", "smoker", "diabetes", "bp")
for (boxVar in boxplotVars) {
  print(
    ggplot(patientData, aes(x = .data[[boxVar]], y = age)) +
      geom_boxplot() +
      xlab(boxVar)
  )
}

# Prepare the data for the neural network, which only takes numeric inputs.
#
# age: standardised with scale(). scale() returns a one-column matrix with
# extra attributes, so it is flattened back to a plain numeric vector before
# being stored as a data-frame column (the values themselves are unchanged).
patientData$age <- as.numeric(scale(patientData$age))

# Factors: replace each one with its zero-based integer level code.
# factor() orders levels by sorting the values by default, so for example
# "No" becomes 0 and "Yes" becomes 1.
# NOTE(review): bp has three levels and is coded 0/1/2, which imposes an
# arbitrary ordering on a nominal variable; consider dummy (one-hot) columns.
factorVars <- c("gender", "diabetes", "smoker", "active", "obesity",
                "heartattack_s", "bp", "cholesteral")
patientData[factorVars] <- lapply(
  patientData[factorVars],
  function(f) as.numeric(f) - 1
)

library(neuralnet)
# Fit a network with one hidden layer of 5 neurons on the full,
# unpartitioned data. The seed is fixed so the fit is repeatable.
seed.val <- 4
set.seed(seed.val)
nn <- neuralnet(
  heartattack_s ~ age + gender + diabetes + smoker + active +
    obesity + bp + cholesteral,
  data = patientData,
  hidden = 5,
  linear.output = FALSE
)

# Network outputs for the same rows the model was fitted on.
predicted.nn.values <- compute(nn, patientData)
print(head(predicted.nn.values$net.result))

##           [,1]
## [1,] 0.3283700
## [2,] 0.4621419
## [3,] 0.4859337
## [4,] 0.8289363
## [5,] 0.5433085
## [6,] 0.4688767

# Turn the network outputs into hard 0/1 class labels by rounding.
# round() is vectorised, so no element-wise sapply() is needed.
predicted.nn.values$net.result <- round(as.vector(predicted.nn.values$net.result))

# Confusion matrix: rows = actual class, columns = predicted class.
# Both sides are factors with explicit 0/1 levels so the table stays 2 x 2
# (and diag() stays meaningful) even if one class is never predicted.
cm1 <- table(
  factor(patientData$heartattack_s, levels = c(0, 1)),
  factor(predicted.nn.values$net.result, levels = c(0, 1))
)
cm1

##    
##        0    1
##   0  898  547
##   1  608 1453

# accuracy = (true positives + true negatives) / all predictions
accuracyResult <- sum(diag(cm1)) / sum(cm1)
accuracyResult

## [1] 0.6705647

Response: The patient dataset has 9 variables and 7,998 rows of data. Of the 9 variables, 8 are categorical (gender, diabetes, smoker, active, obesity, heartattack_s, bp, cholesteral) and 1 is a discrete numeric variable (age). There are no missing values in the dataset, so we did not have to handle a missing-data problem. The age variable shows that the people in this dataset are between 45 and 89 years old, so no one younger than 45 (for example, young adults or adolescents) was included. This could affect the final modelling result, because we are only looking at a select portion of age groups. With regard to correlation, none of the variables appear to be strongly related to each other, which is a good assumption to have met. However, age, diabetes, smoker, and obesity appear to be positively correlated with whether a person experiences a heart attack, while active, bp, and cholesteral appear to be negatively correlated with it. Diabetes and bp also appear to be negatively correlated with each other. The distribution of ages in the dataset is close to normal, although it is somewhat right-skewed. The boxplots show little difference in the age distribution across the levels of each variable. However, somewhat more of the people with obesity fell in the upper age range of 65-70 than of the people without obesity. Also, the people who experienced a heart attack were more often between the ages of 65 and 72.

Comparing with a C5.0 model you had, is there any difference?

# Fit a CART classification tree (rpart) on the same unpartitioned data,
# using every other column as a predictor and a complexity parameter of 0.001.
library(rpart)
htAtModel_cart <- rpart(
  formula = heartattack_s ~ .,
  data = patientData,
  method = "class",
  cp = 0.001
)
# Variable importance scores from the fitted CART model.
htAtModel_cart$variable.importance

##         age    diabetes      smoker     obesity          bp      active 
##   79.769578   56.796976   42.728702   20.056296   13.669953   12.788915 
## cholesteral      gender 
##   10.396228    2.221094

# Complexity-parameter table, including the cross-validated error (xerror).
printcp(htAtModel_cart)

## 
## Classification tree:
## rpart(formula = heartattack_s ~ ., data = patientData, method = "class", 
##     cp = 0.001)
## 
## Variables actually used in tree construction:
## [1] active      age         bp          cholesteral diabetes    gender     
## [7] obesity     smoker     
## 
## Root node error: 1445/3506 = 0.41215
## 
## n= 3506 
## 
##           CP nsplit rel error  xerror     xstd
## 1  0.0366782      0   1.00000 1.00000 0.020170
## 2  0.0138408      3   0.88997 0.93356 0.019937
## 3  0.0041522      5   0.86228 0.93633 0.019948
## 4  0.0039216      7   0.85398 0.93287 0.019934
## 5  0.0038062     10   0.84221 0.93287 0.019934
## 6  0.0027682     12   0.83460 0.92388 0.019897
## 7  0.0023068     13   0.83183 0.92457 0.019900
## 8  0.0020761     17   0.82145 0.93426 0.019940
## 9  0.0013841     20   0.81522 0.94325 0.019975
## 10 0.0011534     28   0.80415 0.94325 0.019975
## 11 0.0010381     34   0.79723 0.97024 0.020073
## 12 0.0010000     38   0.79308 0.97370 0.020085

# Visualise the cross-validation results of the tree across cp values.
plotcp(htAtModel_cart)

# Class predictions of the CART model on the full patient dataset.
pred_cart <- predict(htAtModel_cart, newdata = patientData, type = "class")
# Accuracy: share of rows whose predicted class equals the actual class.
cartHits <- pred_cart == patientData$heartattack_s
mean(cartHits)

## [1] 0.6731318

Response: There was a slight difference between the non-partitioned neural network model I created in this assignment and the non-partitioned C5.0 model I created in Exercise 6. The main differences were in the structure and the variable importance of each model. It was difficult to tell whether either model was overfitted or underfitted, because no test dataset was set aside to evaluate either model’s performance. The C5.0 model had a tree structure with many splits, and therefore many nodes, all of which stemmed from the diabetes attribute. The neural network model, in contrast, used a different method to represent the outcome variable: it started with an input layer containing all of the input variables, followed by one hidden layer and, lastly, one output layer. Regarding variable importance, the C5.0 model ranked the variables as diabetes, age, smoker, bp, obesity, and active, with diabetes being the most important for predicting whether a person would experience a heart attack. The neural network model ranked them as diabetes, smoker, bp, age, obesity, and active. So, compared with the C5.0 model, the neural network treated smoker and bp as more important than age when predicting whether a person would experience a heart attack.

Question 3.)

# 80:20 train/test partition based on the outcome column.
# NOTE(review): no seed is set immediately before this call, so the split
# depends on the random-number state left by the earlier steps; running this
# chunk on its own will give a different partition.
library(caTools)
split <- sample.split(patientData$heartattack_s, SplitRatio = 0.80)
train <- patientData[split, ]
test <- patientData[!split, ]

# Baseline network fitted on the training partition only: one hidden layer
# of 5 neurons, with the seed fixed so the fit is repeatable.
seed.val <- 4
set.seed(seed.val)
nn_v2 <- neuralnet(
  heartattack_s ~ age + gender + diabetes + smoker + active +
    obesity + bp + cholesteral,
  data = train,
  hidden = 5,
  linear.output = FALSE
)

# Network outputs for the held-out test rows.
predicted.nn.values2 <- compute(nn_v2, test)
print(head(predicted.nn.values2$net.result))

##           [,1]
## [1,] 0.6008108
## [2,] 0.5003855
## [3,] 0.9468913
## [4,] 0.7028397
## [5,] 0.7914185
## [6,] 0.6425482

# Turn the network outputs into hard 0/1 class labels by rounding.
# round() is vectorised, so no element-wise sapply() is needed.
predicted.nn.values2$net.result <- round(as.vector(predicted.nn.values2$net.result))

# Confusion matrix on the test set: rows = actual, columns = predicted.
# Both sides are factors with explicit 0/1 levels so the table stays 2 x 2
# (and diag() stays meaningful) even if one class is never predicted.
cm2 <- table(
  factor(test$heartattack_s, levels = c(0, 1)),
  factor(predicted.nn.values2$net.result, levels = c(0, 1))
)
cm2

##    
##       0   1
##   0 163 126
##   1 155 257

# accuracy = (true positives + true negatives) / all predictions
accuracyResult2 <- sum(diag(cm2)) / sum(cm2)
accuracyResult2

## [1] 0.5991441

# Tuned network: six hidden layers (12, 2, 8, 8, 2, 2 neurons) and a
# threshold of 1.8 (neuralnet's default is 0.01), fitted on the training
# partition.
# NOTE(review): this reuses the name nn_v2, so the model previously stored
# under that name is overwritten from here on.
seed.val <- 4
set.seed(seed.val)
nn_v2 <- neuralnet(
  heartattack_s ~ age + gender + diabetes + smoker + active +
    obesity + bp + cholesteral,
  data = train,
  hidden = c(12, 2, 8, 8, 2, 2),
  threshold = 1.8,
  linear.output = FALSE
)

# Network outputs of the tuned model for the held-out test rows.
predicted.nn.values3 <- compute(nn_v2, test)
print(head(predicted.nn.values3$net.result))

##           [,1]
## [1,] 0.4163859
## [2,] 0.4917036
## [3,] 0.8466461
## [4,] 0.6888345
## [5,] 0.5797104
## [6,] 0.4531923

# Turn the network outputs into hard 0/1 class labels by rounding.
# round() is vectorised, so no element-wise sapply() is needed.
predicted.nn.values3$net.result <- round(as.vector(predicted.nn.values3$net.result))

# Confusion matrix on the test set: rows = actual, columns = predicted.
# Both sides are factors with explicit 0/1 levels so the table stays 2 x 2
# (and diag() stays meaningful) even if one class is never predicted.
cm3 <- table(
  factor(test$heartattack_s, levels = c(0, 1)),
  factor(predicted.nn.values3$net.result, levels = c(0, 1))
)
cm3

##    
##       0   1
##   0 207  82
##   1 174 238

# accuracy = (true positives + true negatives) / all predictions
accuracyResult3 <- sum(diag(cm3)) / sum(cm3)
accuracyResult3

## [1] 0.6348074

Response: The performance of the non-partitioned neural network model (evaluated on the same data it was trained on) and of the partitioned neural network model (evaluated on the test dataset) was broadly similar. However, the model evaluated on the test dataset produced slightly lower results: the 80:20 partitioned neural network model reached an accuracy of around 63% on the test dataset after tuning (about 60% before tuning), while the non-partitioned neural network model reached an accuracy of around 67%. One way to improve the partitioned neural network model’s performance is to tune parameters such as hidden (the number of hidden layers and the neurons in each) and threshold (the threshold for the partial derivatives of the error function).

Neural Network and C5.0 models

Eric B.

7/10/2022