#1. Introduction

In this project, I am exploring a heart disease dataset for a university hospital. The results from my analysis will be used to develop predictive models that assess an individual’s risk of heart disease.

 

Installation:

Note: Due to known loading issues, load the following package dependencies first, if needed.

library(hms)
library(ResourceSelection)
library(pROC)
library(rpart)
library(rpart.plot)
library(caret)
library(randomForest)

# Location of the heart disease CSV file.
# NOTE(review): this is an absolute, user-specific path; the script will only
# run on another machine after file_path is pointed at a local copy.
file_path <- '/Users/arthurrichardson/Documents/Documents - Arthur’s MacBook Pro/College/MAT 303/CSV/heart_disease.csv'

# Read the CSV into a data frame and print a compact overview of its columns.
heart_data <- read.csv(file = file_path)
str(object = heart_data)

#2. Data Preparation

## Rows: 303
## Columns: 14

#3. Model #1 - First Logistic Regression Model

Multiple logistic regression model for heart disease (target) using the variables age (age), resting blood pressure (trestbps), exercise-induced angina (exang), and maximum heart rate achieved (thalach).

# Fit a binomial (logistic) regression of target on age, trestbps, exang
# and thalach using the full heart_data data frame.
log_model1 <- glm(
  formula = target ~ age + trestbps + exang + thalach,
  family = "binomial",
  data = heart_data
)

# Show the coefficient table and deviance statistics of the fitted model.
summary(object = log_model1)
## 
## Call:
## glm(formula = target ~ age + trestbps + exang + thalach, family = "binomial", 
##     data = heart_data)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.021121   1.784194  -0.572   0.5671    
## age         -0.017549   0.017144  -1.024   0.3060    
## trestbps    -0.014888   0.008337  -1.786   0.0741 .  
## exang1      -1.624981   0.305774  -5.314 1.07e-07 ***
## thalach      0.031095   0.007275   4.274 1.92e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 417.64  on 302  degrees of freedom
## Residual deviance: 323.14  on 298  degrees of freedom
## AIC: 333.14
## 
## Number of Fisher Scoring iterations: 4

## Hosmer-Lemeshow test. The target variable is a factor, so it is converted
## to numeric for the test.

print(x = hoslem_test)
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  heart_data$target_numeric, fitted(log_model1)
## X-squared = 9.192, df = 8, p-value = 0.3264

Multiple Logistic Regression Model ROC Curve

Multiple Logistic Regression Prediction Model

# Predicted probabilities (type = "response") from log_model1 for every row
# of heart_data.
Predicted_Results_Multi <- predict(
  object = log_model1,
  type = "response",
  newdata = heart_data
)

Round Predicted Results to 4 Decimal Places

## [1] "Multiple Logistic Regression Prediction Model Results"
##      1      2      3      4      5      6      7      8      9     10     11 
## 0.6213 0.5247 0.7330 0.2045 0.0800 0.7706 0.7872 0.9233 0.5583 0.8103 0.5395 
##     12     13     14     15     16     17     18     19     20     21     22 
## 0.4505 0.7853 0.6279 0.2941 0.5065 0.7922 0.5044 0.0587 0.7283 0.2634 0.7734 
##     23     24     25     26     27     28     29     30     31     32     33 
## 0.6364 0.6825 0.9018 0.3953 0.6764 0.8122 0.5343 0.7191 0.6398 0.2102 0.2119 
##     34     35     36     37     38     39     40     41     42     43     44 
## 0.6933 0.0402 0.2994 0.8106 0.6599 0.8068 0.7945 0.6861 0.8445 0.3334 0.4768 
##     45     46     47     48     49     50     51     52     53     54     55 
## 0.5331 0.5876 0.1353 0.7768 0.6614 0.7853 0.1941 0.8940 0.1741 0.1866 0.2851 
##     56     57     58     59     60     61     62     63     64     65     66 
## 0.0798 0.6264 0.3056 0.8227 0.3021 0.7557 0.2571 0.0862 0.2666 0.7280 0.5361 
##     67     68     69     70     71     72     73     74     75     76     77 
## 0.1604 0.7772 0.2831 0.8762 0.3921 0.7989 0.3393 0.1319 0.7549 0.6735 0.8581 
##     78     79     80     81     82     83     84     85     86     87     88 
## 0.4056 0.8623 0.5782 0.3510 0.9306 0.3655 0.1545 0.8423 0.7244 0.5696 0.3255 
##     89     90     91     92     93     94     95     96     97     98     99 
## 0.2031 0.4291 0.6364 0.7812 0.8204 0.7252 0.8049 0.2407 0.8246 0.3582 0.6868 
##    100    101    102    103    104    105    106    107    108    109    110 
## 0.6264 0.3875 0.7869 0.7777 0.6984 0.4290 0.7212 0.6584 0.7675 0.8277 0.5378 
##    111    112    113    114    115    116    117    118    119    120    121 
## 0.9029 0.7105 0.0791 0.5557 0.7997 0.1047 0.7982 0.6538 0.8617 0.6216 0.6040 
##    122    123    124    125    126    127    128    129    130    131    132 
## 0.1780 0.7091 0.2028 0.6247 0.8034 0.7368 0.8913 0.0792 0.3680 0.8449 0.8371 
##    133    134    135    136    137    138    139    140    141    142    143 
## 0.2770 0.4043 0.5936 0.1793 0.4446 0.8926 0.1830 0.8497 0.0824 0.4622 0.7648 
##    144    145    146    147    148    149    150    151    152    153    154 
## 0.5673 0.0710 0.1597 0.5936 0.6656 0.1448 0.3032 0.8513 0.2681 0.8563 0.6647 
##    155    156    157    158    159    160    161    162    163    164    165 
## 0.2353 0.6023 0.2957 0.1558 0.6721 0.6860 0.0779 0.8845 0.8359 0.4507 0.3561 
##    166    167    168    169    170    171    172    173    174    175    176 
## 0.7616 0.0959 0.4739 0.7103 0.8679 0.2429 0.3178 0.7902 0.1411 0.7471 0.7330 
##    177    178    179    180    181    182    183    184    185    186    187 
## 0.1225 0.9214 0.1802 0.5661 0.4328 0.2794 0.6549 0.8103 0.8233 0.2392 0.8165 
##    188    189    190    191    192    193    194    195    196    197    198 
## 0.3856 0.1481 0.8060 0.7815 0.7606 0.4881 0.6405 0.5529 0.8850 0.2033 0.8775 
##    199    200    201    202    203    204    205    206    207    208    209 
## 0.7325 0.7735 0.1891 0.7032 0.0658 0.6596 0.1810 0.7476 0.8177 0.4099 0.5923 
##    210    211    212    213    214    215    216    217    218    219    220 
## 0.0887 0.2083 0.6707 0.4388 0.7168 0.5999 0.2883 0.1684 0.5509 0.1912 0.8664 
##    221    222    223    224    225    226    227    228    229    230    231 
## 0.8464 0.1380 0.3140 0.5256 0.8948 0.7439 0.7749 0.8378 0.7528 0.6439 0.4687 
##    232    233    234    235    236    237    238    239    240    241    242 
## 0.8485 0.3493 0.4302 0.8371 0.8764 0.4399 0.7554 0.3110 0.2957 0.8579 0.3258 
##    243    244    245    246    247    248    249    250    251    252    253 
## 0.0963 0.1415 0.8343 0.8283 0.7250 0.7177 0.7595 0.3014 0.8419 0.3187 0.5959 
##    254    255    256    257    258    259    260    261    262    263    264 
## 0.1704 0.1784 0.6325 0.8247 0.8310 0.7897 0.2405 0.1388 0.8964 0.2731 0.7980 
##    265    266    267    268    269    270    271    272    273    274    275 
## 0.8882 0.4621 0.8210 0.1332 0.7527 0.5882 0.7344 0.2358 0.9435 0.3513 0.9010 
##    276    277    278    279    280    281    282    283    284    285    286 
## 0.6927 0.6581 0.6056 0.1847 0.7331 0.8165 0.1140 0.6245 0.1348 0.5838 0.6496 
##    287    288    289    290    291    292    293    294    295    296    297 
## 0.0549 0.7994 0.0834 0.7155 0.8722 0.7871 0.1294 0.1914 0.1862 0.7802 0.6747 
##    298    299    300    301    302    303 
## 0.7086 0.1186 0.7703 0.1916 0.8481 0.7949

AUC

## [1] "Area Under the Curve (AUC)"
## [1] 0.8007
## Area under the curve: 0.8007

Confusion Matrix

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  89  31
##          1  49 134
##                                           
##                Accuracy : 0.736           
##                  95% CI : (0.6825, 0.7847)
##     No Information Rate : 0.5446          
##     P-Value [Acc > NIR] : 5.29e-12        
##                                           
##                   Kappa : 0.462           
##                                           
##  Mcnemar's Test P-Value : 0.05735         
##                                           
##             Sensitivity : 0.8121          
##             Specificity : 0.6449          
##          Pos Pred Value : 0.7322          
##          Neg Pred Value : 0.7417          
##              Prevalence : 0.5446          
##          Detection Rate : 0.4422          
##    Detection Prevalence : 0.6040          
##       Balanced Accuracy : 0.7285          
##                                           
##        'Positive' Class : 1               
## 

Predictive Model One

What is the probability of an individual having heart disease who is 50 years old, has a resting blood pressure of 122, has exercise-induced angina, and has a maximum heart rate of 140?

# Logistic regression with the same formula, data and family as log_model1.
log_model2 <- glm(
  formula = target ~ age + trestbps + exang + thalach,
  family = "binomial",
  data = heart_data
)

# One-row profile to predict for: age 50, resting blood pressure 122,
# exang = 1, maximum heart rate 140. exang is created with the levels of
# heart_data$exang so that it matches the factor used to fit the model.
new_data <- data.frame(
  age = 50,
  trestbps = 122,
  exang = factor(x = 1, levels = levels(heart_data[["exang"]])),
  thalach = 140
)

Predicted Probs

## Probability of having Heart Disease: 0.2716
## Odds of Having Heart Disease: 0.3728

Predictive Model Two

What is the probability of an individual having heart disease who is 50 years old, has a resting blood pressure of 130, does not have exercise-induced angina, and has a maximum heart rate of 165?

# Logistic regression with the same formula, data and family as log_model1.
log_model3 <- glm(
  formula = target ~ age + trestbps + exang + thalach,
  family = "binomial",
  data = heart_data
)

# One-row profile to predict for: age 50, resting blood pressure 130,
# exang = 0, maximum heart rate 165. exang is created with the levels of
# heart_data$exang so that it matches the factor used to fit the model.
new_data <- data.frame(
  age = 50,
  trestbps = 130,
  exang = factor(x = 0, levels = levels(heart_data[["exang"]])),
  thalach = 165
)

Predicted Probs

# Predicted probability from log_model3 for the single profile in new_data.
predicted_probability2 <- predict(
  object = log_model3,
  type = "response",
  newdata = new_data
)

# Odds corresponding to that probability: p / (1 - p).
predicted_odds2 <- predicted_probability2 / (1 - predicted_probability2)

Results

## Probability of having Heart Disease: 0.7853
## Odds of Having Heart Disease: 3.6571

#4. Model #2 - Second Logistic Regression Model

# Second logistic regression: target on age, trestbps, cp and thalach
# (cp takes the place of exang from the first model).
log_model4 <- glm(
  formula = target ~ age + trestbps + cp + thalach,
  family = "binomial",
  data = heart_data
)

Summary of the Multiple Logistic Regression Model

## 
## Call:
## glm(formula = target ~ age + trestbps + cp + thalach, family = "binomial", 
##     data = heart_data)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.265458   1.811989  -1.250 0.211204    
## age         -0.012846   0.017656  -0.728 0.466873    
## trestbps    -0.019373   0.008899  -2.177 0.029476 *  
## cp1          1.998672   0.439940   4.543 5.54e-06 ***
## cp2          2.098584   0.345310   6.077 1.22e-09 ***
## cp3          1.786071   0.542572   3.292 0.000995 ***
## thalach      0.031350   0.007559   4.147 3.36e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 417.64  on 302  degrees of freedom
## Residual deviance: 300.05  on 296  degrees of freedom
## AIC: 314.05
## 
## Number of Fisher Scoring iterations: 4

Hosmer-Lemeshow test.

‘Convert the target variable to numeric since it's a factor.’

 

## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  heart_data$target_numeric, fitted(log_model1)
## X-squared = 9.192, df = 8, p-value = 0.3264

Multiple Logistic Regression Model ROC Curve

Multiple Logistic Regression Prediction Model

# Predicted probabilities (type = "response") from log_model4 for every row
# of heart_data.
Predicted_Results_Multi1 <- predict(
  object = log_model4,
  type = "response",
  newdata = heart_data
)

Round Predicted Results to 4 Decimal Places

##      1      2      3      4      5      6      7      8      9     10     11 
## 0.7494 0.1943 0.7694 0.2188 0.0925 0.8582 0.4102 0.9548 0.6974 0.4799 0.5494 
##     12     13     14     15     16     17     18     19     20     21     22 
## 0.1470 0.4492 0.7418 0.4371 0.1812 0.8158 0.1658 0.3419 0.7765 0.3915 0.8605 
##     23     24     25     26     27     28     29     30     31     32     33 
## 0.9178 0.7208 0.9257 0.5597 0.7898 0.8794 0.7038 0.3563 0.2650 0.1996 0.2423 
##     34     35     36     37     38     39     40     41     42     43     44 
## 0.8058 0.0398 0.0807 0.8741 0.7537 0.4779 0.8635 0.3190 0.5157 0.4603 0.6150 
##     45     46     47     48     49     50     51     52     53     54     55 
## 0.6536 0.6747 0.5436 0.8256 0.7499 0.8616 0.2414 0.9300 0.6673 0.1992 0.2907 
##     56     57     58     59     60     61     62     63     64     65     66 
## 0.0804 0.7348 0.3549 0.5075 0.3440 0.8328 0.2714 0.0933 0.2907 0.8057 0.5657 
##     67     68     69     70     71     72     73     74     75     76     77 
## 0.1510 0.8584 0.7468 0.9242 0.8678 0.8667 0.8029 0.1414 0.8346 0.8033 0.9061 
##     78     79     80     81     82     83     84     85     86     87     88 
## 0.4456 0.9162 0.9261 0.8126 0.9539 0.3969 0.0330 0.9039 0.8059 0.7043 0.3414 
##     89     90     91     92     93     94     95     96     97     98     99 
## 0.2414 0.4340 0.2725 0.8701 0.8886 0.8142 0.4822 0.7402 0.8864 0.8486 0.7611 
##    100    101    102    103    104    105    106    107    108    109    110 
## 0.2776 0.4095 0.8767 0.4247 0.8020 0.1441 0.8094 0.7210 0.8126 0.8813 0.6530 
##    111    112    113    114    115    116    117    118    119    120    121 
## 0.6744 0.7970 0.3888 0.6924 0.8766 0.1135 0.8759 0.7699 0.9139 0.2554 0.6652 
##    122    123    124    125    126    127    128    129    130    131    132 
## 0.1952 0.3399 0.2291 0.7200 0.4864 0.3971 0.6423 0.0870 0.4113 0.8931 0.8904 
##    133    134    135    136    137    138    139    140    141    142    143 
## 0.2935 0.4284 0.6583 0.1793 0.1500 0.9271 0.2004 0.9089 0.0932 0.1417 0.8551 
##    144    145    146    147    148    149    150    151    152    153    154 
## 0.7116 0.0830 0.1484 0.6467 0.2990 0.0382 0.4455 0.9063 0.2760 0.9107 0.7776 
##    155    156    157    158    159    160    161    162    163    164    165 
## 0.2423 0.7145 0.3409 0.6079 0.7595 0.7909 0.0636 0.9049 0.8940 0.4650 0.8186 
##    166    167    168    169    170    171    172    173    174    175    176 
## 0.8301 0.0994 0.5327 0.8150 0.9132 0.2476 0.3204 0.8631 0.1410 0.8393 0.8113 
##    177    178    179    180    181    182    183    184    185    186    187 
## 0.5625 0.9578 0.5897 0.2454 0.1316 0.0726 0.3256 0.8671 0.8899 0.2611 0.8752 
##    188    189    190    191    192    193    194    195    196    197    198 
## 0.4288 0.1484 0.8900 0.8627 0.8389 0.1933 0.7467 0.9138 0.9347 0.2353 0.5786 
##    199    200    201    202    203    204    205    206    207    208    209 
## 0.8247 0.4330 0.2009 0.3392 0.0578 0.7702 0.1923 0.8304 0.8750 0.4471 0.7154 
##    210    211    212    213    214    215    216    217    218    219    220 
## 0.0953 0.2174 0.2955 0.4478 0.8032 0.2619 0.3312 0.1967 0.6748 0.1988 0.9110 
##    221    222    223    224    225    226    227    228    229    230    231 
## 0.8978 0.1409 0.3176 0.8670 0.6522 0.8533 0.8074 0.4876 0.4309 0.7486 0.5732 
##    232    233    234    235    236    237    238    239    240    241    242 
## 0.9048 0.0906 0.5690 0.8904 0.9149 0.1491 0.3967 0.0940 0.0822 0.9039 0.3450 
##    243    244    245    246    247    248    249    250    251    252    253 
## 0.0982 0.1563 0.9074 0.8448 0.3535 0.3645 0.8428 0.3102 0.8890 0.3322 0.6900 
##    254    255    256    257    258    259    260    261    262    263    264 
## 0.1892 0.1955 0.7133 0.5022 0.5246 0.4857 0.2719 0.1288 0.9398 0.2717 0.4782 
##    265    266    267    268    269    270    271    272    273    274    275 
## 0.6427 0.8989 0.8962 0.1539 0.4124 0.6116 0.3792 0.2616 0.9599 0.3722 0.9371 
##    276    277    278    279    280    281    282    283    284    285    286 
## 0.7878 0.7750 0.2400 0.1786 0.3923 0.8867 0.1252 0.2719 0.1270 0.6997 0.3036 
##    287    288    289    290    291    292    293    294    295    296    297 
## 0.0552 0.8911 0.0837 0.3418 0.9198 0.8645 0.1354 0.2081 0.2016 0.8569 0.3309 
##    298    299    300    301    302    303 
## 0.8053 0.1289 0.8420 0.2140 0.9001 0.8610

##AUC

## [1] "Area Under the Curve (AUC)"
## [1] TRUE
## [1] 0.8007
## Area under the curve: 0.8007

##Confusion Matrix

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 106  36
##          1  32 129
##                                           
##                Accuracy : 0.7756          
##                  95% CI : (0.7244, 0.8213)
##     No Information Rate : 0.5446          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.5486          
##                                           
##  Mcnemar's Test P-Value : 0.716           
##                                           
##             Sensitivity : 0.7818          
##             Specificity : 0.7681          
##          Pos Pred Value : 0.8012          
##          Neg Pred Value : 0.7465          
##              Prevalence : 0.5446          
##          Detection Rate : 0.4257          
##    Detection Prevalence : 0.5314          
##       Balanced Accuracy : 0.7750          
##                                           
##        'Positive' Class : 1               
## 
'What is the probability of an individual having heart disease who is 50 years old, 
has a resting blood pressure of 115, does not experience chest pain, and has maximum heart rate of 133?  '

# Logistic regression with the same formula, data and family as log_model4.
log_model5 <- glm(
  formula = target ~ age + trestbps + cp + thalach,
  family = "binomial",
  data = heart_data
)

# One-row profile to predict for: age 50, resting blood pressure 115,
# cp = 0 (the "does not experience chest pain" case in the question above),
# maximum heart rate 133. cp reuses the levels of heart_data$cp.
new_data <- data.frame(
  age = 50,
  trestbps = 115,
  cp = factor(x = 0, levels = levels(heart_data[["cp"]])),
  thalach = 133
)

## Predicted probability for this profile

predicted_probability2 <- predict(
  object = log_model5,
  type = "response",
  newdata = new_data
)

# Odds corresponding to that probability: p / (1 - p).
predicted_odds2 <- predicted_probability2 / (1 - predicted_probability2)

 

##Results

## Probability of having Heart Disease: 0.2756
## Odds of Having Heart Disease: 0.3805
'What is the probability of an individual having heart disease who 
is 50 years old, has a resting blood pressure of 125, experiences typical angina, and has maximum heart rate of 155?   '

# Logistic regression with the same formula, data and family as log_model4.
log_model6 <- glm(
  formula = target ~ age + trestbps + cp + thalach,
  family = "binomial",
  data = heart_data
)

# One-row profile to predict for: age 50, resting blood pressure 125,
# cp = 1 (the "typical angina" case in the question above), maximum heart
# rate 155. cp reuses the levels of heart_data$cp.
new_data <- data.frame(
  age = 50,
  trestbps = 125,
  cp = factor(x = 1, levels = levels(heart_data[["cp"]])),
  thalach = 155
)

## Predicted probability for this profile

predicted_probability3 <- predict(
  object = log_model6,
  type = "response",
  newdata = new_data
)

# Odds corresponding to that probability: p / (1 - p).
predicted_odds3 <- predicted_probability3 / (1 - predicted_probability3)

##Results

## Probability of having Heart Disease: 0.8218
## Odds of Having Heart Disease: 4.611

#5. Random Forest Classification Model

Partition the data set into training and testing data

Training set

## [1] "Number of rows for the training set"
## Training Data Rows: 257
## Training Data Columns: 15

Testing set

## [1] "Number of rows for the test set"
## Test Set Rows: 46
## Test Set Columns: 15
## [1] "Graph the training and testing error against the number of trees using a classification random forest model for the \npresence of heart disease (target) using variables age (age), sex (sex), \nchest pain type (cp), resting blood pressure (trestbps), cholesterol measurement (chol), \nresting electrocardiographic measurement (restecg), exercise-induced angina (exang), \nand number of major vessels (ca). Use a maximum of 150 trees. Use set.seed(6522048).\nWhat is the optimal number of trees for the random forest model? \n"
# Fix the random number generator state so the forest can be reproduced.
set.seed(seed = 6522048)

# Random forest for target with 150 trees, fitted on training.data;
# importance = TRUE asks randomForest to compute variable importance.
tree_model1 <- randomForest(
  target ~ age + trestbps + sex + cp + chol + restecg + exang + ca,
  data = training.data,
  ntree = 150,
  importance = TRUE
)
## 
## Call:
##  randomForest(formula = target ~ age + trestbps + sex + cp + chol +      restecg + exang + ca, data = training.data, ntree = 150,      importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 150
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 18.29%
## Confusion matrix:
##    0   1 class.error
## 0 95  24   0.2016807
## 1 23 115   0.1666667

## Optimal Number of Trees: 103

##Predict on the Training set

## [1] "Training  Data Confusion Matrix and Statistics:"
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 119   0
##          1   0 138
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9857, 1)
##     No Information Rate : 0.537      
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.000      
##             Specificity : 1.000      
##          Pos Pred Value : 1.000      
##          Neg Pred Value : 1.000      
##              Prevalence : 0.463      
##          Detection Rate : 0.463      
##    Detection Prevalence : 0.463      
##       Balanced Accuracy : 1.000      
##                                      
##        'Positive' Class : 0          
## 

#Predict on the Testing set

## [1] "Testing Data Confusion Matrix and Statistics:"
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 15  5
##          1  4 22
##                                           
##                Accuracy : 0.8043          
##                  95% CI : (0.6609, 0.9064)
##     No Information Rate : 0.587           
##     P-Value [Acc > NIR] : 0.00155         
##                                           
##                   Kappa : 0.5996          
##                                           
##  Mcnemar's Test P-Value : 1.00000         
##                                           
##             Sensitivity : 0.7895          
##             Specificity : 0.8148          
##          Pos Pred Value : 0.7500          
##          Neg Pred Value : 0.8462          
##              Prevalence : 0.4130          
##          Detection Rate : 0.3261          
##    Detection Prevalence : 0.4348          
##       Balanced Accuracy : 0.8021          
##                                           
##        'Positive' Class : 0               
## 

##Plot Variable Importance Plot in Random Forest

##RMSE

Loop through different numbers of trees to calculate RMSE

##Calculate RMSE for Test Set

## RMSE for Training Set: 0.0624
## RMSE for Test Set: 0.4423

Plot RMSE for Training and Testing sets

'Using the appropriate number of trees found, create a classification random 
forest model for the presence of heart disease (target) using variables age (age), 
sex (sex), chest pain type (cp), resting blood pressure (trestbps), cholesterol 
measurement (chol), resting electrocardiographic measurement (restecg), 
exercise-induced angina (exang), and number of major vessels (ca). '

# Fix the random number generator state so the forest can be reproduced.
set.seed(seed = 6522048)

# Random forest for target with the same predictors as tree_model1, grown
# with 38 trees.
# NOTE(review): the optimal number of trees reported earlier in this document
# is 103, whereas this call uses ntree = 38 - confirm which value is intended.
tree_model2 <- randomForest(
  target ~ age + trestbps + sex + cp + chol + restecg + exang + ca,
  data = training.data,
  ntree = 38,
  importance = TRUE
)

# Display the fitted forest.
print(x = tree_model2)

#Predict on the Training set

## [1] "Training  Data Confusion Matrix and Statistics:"
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 119   0
##          1   0 138
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9857, 1)
##     No Information Rate : 0.537      
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.000      
##             Specificity : 1.000      
##          Pos Pred Value : 1.000      
##          Neg Pred Value : 1.000      
##              Prevalence : 0.463      
##          Detection Rate : 0.463      
##    Detection Prevalence : 0.463      
##       Balanced Accuracy : 1.000      
##                                      
##        'Positive' Class : 0          
## 

#Predict on the Testing set

## [1] "Testing Data Confusion Matrix and Statistics:"
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 14  4
##          1  5 23
##                                           
##                Accuracy : 0.8043          
##                  95% CI : (0.6609, 0.9064)
##     No Information Rate : 0.587           
##     P-Value [Acc > NIR] : 0.00155         
##                                           
##                   Kappa : 0.5933          
##                                           
##  Mcnemar's Test P-Value : 1.00000         
##                                           
##             Sensitivity : 0.7368          
##             Specificity : 0.8519          
##          Pos Pred Value : 0.7778          
##          Neg Pred Value : 0.8214          
##              Prevalence : 0.4130          
##          Detection Rate : 0.3043          
##    Detection Prevalence : 0.3913          
##       Balanced Accuracy : 0.7943          
##                                           
##        'Positive' Class : 0               
## 

#6. Random Forest Regression Model

## Training Data Rows: 242
## Training Data Columns: 15
## [1] "Number of rows for the test set"
## Test Set Rows: 61
## Test Set Columns: 15

Graph the mean squared error against the number of trees for a random forest regression model for maximum heart rate achieved using age (age), sex (sex), chest pain type (cp), resting blood pressure (trestbps), cholesterol measurement (chol), resting electrocardiographic measurement (restecg), exercise-induced angina (exang), and number of major vessels (ca). Use a maximum of 80 trees.

# Fix the random number generator state so the forest can be reproduced.
set.seed(seed = 6522048)

# Random forest for thalach (maximum heart rate achieved) with 80 trees,
# fitted on training.data; importance = TRUE requests variable importance.
tree_model3 <- randomForest(
  thalach ~ age + trestbps + sex + cp + chol + restecg + exang + ca,
  data = training.data,
  ntree = 80,
  importance = TRUE
)
## 
## Call:
##  randomForest(formula = thalach ~ age + trestbps + sex + cp +      chol + restecg + exang + ca, data = training.data, ntree = 80,      importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 80
## No. of variables tried at each split: 2
## 
##           Mean of squared residuals: 444.0429
##                     % Var explained: 18.95
##                 Length Class  Mode     
## call              5    -none- call     
## type              1    -none- character
## predicted       242    -none- numeric  
## mse              80    -none- numeric  
## rsq              80    -none- numeric  
## oob.times       242    -none- numeric  
## importance       16    -none- numeric  
## importanceSD      8    -none- numeric  
## localImportance   0    -none- NULL     
## proximity         0    -none- NULL     
## ntree             1    -none- numeric  
## mtry              1    -none- numeric  
## forest           11    -none- list     
## coefs             0    -none- NULL     
## y               242    -none- numeric  
## test              0    -none- NULL     
## inbag             0    -none- NULL     
## terms             3    terms  call

## Optimal Number of Trees: 54

##RMSE

# Root-mean-squared error between predictions and observations.
#
# pred: predicted values; coerced with as.numeric().
# obs:  observed values; coerced with as.numeric().
# Returns one numeric value: sqrt(mean((pred - obs)^2)).
# Note: as.numeric() applied to a factor yields its integer level codes,
# not the level labels.
RMSE <- function(pred, obs) {
  errors <- as.numeric(pred) - as.numeric(obs)
  sqrt(mean(errors^2))
}
# Fix the random number generator state so the forest can be reproduced.
set.seed(seed = 6522048)

# Random forest for thalach with the same predictors as tree_model3, grown
# with 54 trees (the optimal number of trees reported above).
tree_model4 <- randomForest(
  thalach ~ age + trestbps + sex + cp + chol + restecg + exang + ca,
  data = training.data,
  ntree = 54,
  importance = TRUE
)

##Calculate RMSE for Training Set

# tree_model4 is a regression forest whose response is thalach, so its
# predictions are scored against the observed thalach values. The earlier
# version compared them with the target column and, for the training set,
# passed train_predictions instead of the train_predictions1 vector computed
# on the line above.
# NOTE(review): the RMSE figures printed after this chunk came from that
# earlier comparison and need to be regenerated.
train_predictions1 <- predict(tree_model4, newdata = training.data)
train_rmse1 <- RMSE(train_predictions1, training.data$thalach)

##Calculate RMSE for Test Set

test_predictions <- predict(tree_model4, newdata = testing.data)
test_rmse1 <- RMSE(test_predictions, testing.data$thalach)
## RMSE for Training Set: 0.1871
## RMSE for Test Set: 147.2516

Plot RMSE for Training and Testing sets