my_logistics=read.csv('C:/Users/Sarvesh/Desktop/heart_D.csv')
View(my_logistics)
str(my_logistics)
'data.frame': 303 obs. of 14 variables:
$ age : int 63 37 41 56 57 57 56 44 52 57 ...
$ sex : int 1 1 0 1 0 1 0 1 1 1 ...
$ cp : int 3 2 1 1 0 0 1 1 2 2 ...
$ trestbps: int 145 130 130 120 120 140 140 120 172 150 ...
$ chol : int 233 250 204 236 354 192 294 263 199 168 ...
$ fbs : int 1 0 0 0 0 0 0 0 1 0 ...
$ restecg : int 0 1 0 1 1 1 0 1 1 1 ...
$ thalach : int 150 187 172 178 163 148 153 173 162 174 ...
$ exang : int 0 0 0 0 1 0 0 0 0 0 ...
$ oldpeak : num 2.3 3.5 1.4 0.8 0.6 0.4 1.3 0 0.5 1.6 ...
$ slope : int 0 0 2 2 2 1 1 2 2 2 ...
$ ca : int 0 0 0 0 0 0 0 0 0 0 ...
$ thal : int 1 2 2 2 2 1 2 3 3 2 ...
$ target : int 1 1 1 1 1 1 1 1 1 1 ...
summary(my_logistics)
age sex cp trestbps chol
Min. :29.00 Min. :0.0000 Min. :0.000 Min. : 94.0 Min. :126.0
1st Qu.:47.50 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:120.0 1st Qu.:211.0
Median :55.00 Median :1.0000 Median :1.000 Median :130.0 Median :240.0
Mean :54.37 Mean :0.6832 Mean :0.967 Mean :131.6 Mean :246.3
3rd Qu.:61.00 3rd Qu.:1.0000 3rd Qu.:2.000 3rd Qu.:140.0 3rd Qu.:274.5
Max. :77.00 Max. :1.0000 Max. :3.000 Max. :200.0 Max. :564.0
fbs restecg thalach exang oldpeak
Min. :0.0000 Min. :0.0000 Min. : 71.0 Min. :0.0000 Min. :0.00
1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:133.5 1st Qu.:0.0000 1st Qu.:0.00
Median :0.0000 Median :1.0000 Median :153.0 Median :0.0000 Median :0.80
Mean :0.1485 Mean :0.5281 Mean :149.6 Mean :0.3267 Mean :1.04
3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:166.0 3rd Qu.:1.0000 3rd Qu.:1.60
Max. :1.0000 Max. :2.0000 Max. :202.0 Max. :1.0000 Max. :6.20
slope ca thal target
Min. :0.000 Min. :0.0000 Min. :0.000 Min. :0.0000
1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000 1st Qu.:0.0000
Median :1.000 Median :0.0000 Median :2.000 Median :1.0000
Mean :1.399 Mean :0.7294 Mean :2.314 Mean :0.5446
3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:1.0000
Max. :2.000 Max. :4.0000 Max. :3.000 Max. :1.0000
is.na(sum(my_logistics))
[1] FALSE
str of the data and summary of HD
my_logistics$target=factor(my_logistics$target)
summary(my_logistics)
age sex cp trestbps chol
Min. :29.00 Min. :0.0000 Min. :0.000 Min. : 94.0 Min. :126.0
1st Qu.:47.50 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:120.0 1st Qu.:211.0
Median :55.00 Median :1.0000 Median :1.000 Median :130.0 Median :240.0
Mean :54.37 Mean :0.6832 Mean :0.967 Mean :131.6 Mean :246.3
3rd Qu.:61.00 3rd Qu.:1.0000 3rd Qu.:2.000 3rd Qu.:140.0 3rd Qu.:274.5
Max. :77.00 Max. :1.0000 Max. :3.000 Max. :200.0 Max. :564.0
fbs restecg thalach exang oldpeak
Min. :0.0000 Min. :0.0000 Min. : 71.0 Min. :0.0000 Min. :0.00
1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:133.5 1st Qu.:0.0000 1st Qu.:0.00
Median :0.0000 Median :1.0000 Median :153.0 Median :0.0000 Median :0.80
Mean :0.1485 Mean :0.5281 Mean :149.6 Mean :0.3267 Mean :1.04
3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:166.0 3rd Qu.:1.0000 3rd Qu.:1.60
Max. :1.0000 Max. :2.0000 Max. :202.0 Max. :1.0000 Max. :6.20
slope ca thal target
Min. :0.000 Min. :0.0000 Min. :0.000 0:138
1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000 1:165
Median :1.000 Median :0.0000 Median :2.000
Mean :1.399 Mean :0.7294 Mean :2.314
3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000
Max. :2.000 Max. :4.0000 Max. :3.000
after changing target variable as factor
install.packages("caTools")
Error in install.packages : Updating loaded packages
library(caTools)
set.seed(3)
sample_traintest=sample.split(my_logistics,SplitRatio = 0.8)
sample_traintest
[1] TRUE FALSE TRUE TRUE TRUE FALSE TRUE TRUE TRUE FALSE TRUE TRUE TRUE
[14] TRUE
for machine learning assign split ratio of 80%
Train=subset(my_logistics,sample_traintest=="TRUE")
Test=subset(my_logistics,sample_traintest=="FALSE")
# logistics model
my_logistics_eq=glm(target~.,data = Train, family = 'binomial')
summary(my_logistics_eq)
Call:
glm(formula = target ~ ., family = "binomial", data = Train)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.6511 -0.4000 0.1654 0.5297 2.4983
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 3.456208 2.868583 1.205 0.22826
age -0.005711 0.026322 -0.217 0.82824
sex -1.536576 0.512760 -2.997 0.00273 **
cp 0.918492 0.215967 4.253 2.11e-05 ***
trestbps -0.017547 0.011541 -1.520 0.12840
chol -0.004889 0.004696 -1.041 0.29788
fbs -0.081079 0.615954 -0.132 0.89528
restecg 0.858722 0.403628 2.128 0.03338 *
thalach 0.014728 0.011160 1.320 0.18692
exang -0.807684 0.467655 -1.727 0.08415 .
oldpeak -0.394727 0.247911 -1.592 0.11134
slope 0.966706 0.413217 2.339 0.01931 *
ca -0.655067 0.227220 -2.883 0.00394 **
thal -0.976079 0.340692 -2.865 0.00417 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 328.26 on 237 degrees of freedom
Residual deviance: 165.56 on 224 degrees of freedom
AIC: 193.56
Number of Fisher Scoring iterations: 6
my_logi1=glm(target~.-age,data = Train, family = 'binomial')
summary(my_logi1)
Call:
glm(formula = target ~ . - age, family = "binomial", data = Train)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.6450 -0.4005 0.1668 0.5290 2.4929
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 3.095026 2.333156 1.327 0.18466
sex -1.518968 0.506384 -3.000 0.00270 **
cp 0.917326 0.215755 4.252 2.12e-05 ***
trestbps -0.018123 0.011249 -1.611 0.10716
chol -0.004974 0.004687 -1.061 0.28862
fbs -0.098569 0.611039 -0.161 0.87185
restecg 0.866790 0.402215 2.155 0.03116 *
thalach 0.015673 0.010284 1.524 0.12751
exang -0.807652 0.467652 -1.727 0.08416 .
oldpeak -0.391601 0.247252 -1.584 0.11324
slope 0.965987 0.413196 2.338 0.01940 *
ca -0.664545 0.222299 -2.989 0.00280 **
thal -0.979260 0.340674 -2.874 0.00405 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 328.26 on 237 degrees of freedom
Residual deviance: 165.61 on 225 degrees of freedom
AIC: 191.61
Number of Fisher Scoring iterations: 6
my_logi2=glm(target~.-trestbps,data = Train, family = 'binomial')
summary(my_logi2)
Call:
glm(formula = target ~ . - trestbps, family = "binomial", data = Train)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.5241 -0.4203 0.1735 0.5201 2.5069
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.420453 2.727696 0.887 0.37488
age -0.014788 0.025518 -0.580 0.56223
sex -1.489457 0.498635 -2.987 0.00282 **
cp 0.877419 0.211731 4.144 3.41e-05 ***
chol -0.005765 0.004571 -1.261 0.20720
fbs -0.160992 0.601717 -0.268 0.78904
restecg 0.862269 0.397031 2.172 0.02987 *
thalach 0.011820 0.010777 1.097 0.27276
exang -0.863246 0.459681 -1.878 0.06039 .
oldpeak -0.430472 0.244802 -1.758 0.07867 .
slope 0.965988 0.413423 2.337 0.01946 *
ca -0.630204 0.230097 -2.739 0.00617 **
thal -1.018700 0.335566 -3.036 0.00240 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 328.26 on 237 degrees of freedom
Residual deviance: 167.94 on 225 degrees of freedom
AIC: 193.94
Number of Fisher Scoring iterations: 6
my_logi3=glm(target~.-chol,data = Train, family = 'binomial')
summary(my_logi3)
Call:
glm(formula = target ~ . - chol, family = "binomial", data = Train)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.7065 -0.4130 0.1701 0.5317 2.4053
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.69145 2.75750 0.976 0.32904
age -0.00774 0.02598 -0.298 0.76572
sex -1.36741 0.47829 -2.859 0.00425 **
cp 0.93743 0.21539 4.352 1.35e-05 ***
trestbps -0.01914 0.01142 -1.675 0.09384 .
fbs -0.08593 0.61668 -0.139 0.88917
restecg 0.92114 0.39622 2.325 0.02008 *
thalach 0.01370 0.01104 1.241 0.21476
exang -0.79022 0.46340 -1.705 0.08815 .
oldpeak -0.43065 0.24551 -1.754 0.07941 .
slope 0.91866 0.41066 2.237 0.02529 *
ca -0.65786 0.22385 -2.939 0.00329 **
thal -0.98620 0.33900 -2.909 0.00362 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 328.26 on 237 degrees of freedom
Residual deviance: 166.66 on 225 degrees of freedom
AIC: 192.66
Number of Fisher Scoring iterations: 6
my_logi4=glm(target~.-fbs,data = Train, family = 'binomial')
summary(my_logi4)
Call:
glm(formula = target ~ . - fbs, family = "binomial", data = Train)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.6465 -0.3987 0.1659 0.5325 2.5008
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 3.484340 2.859338 1.219 0.22300
age -0.006154 0.026074 -0.236 0.81343
sex -1.543074 0.510138 -3.025 0.00249 **
cp 0.912412 0.210457 4.335 1.46e-05 ***
trestbps -0.017659 0.011500 -1.536 0.12463
chol -0.004900 0.004702 -1.042 0.29740
restecg 0.859825 0.403542 2.131 0.03311 *
thalach 0.014720 0.011151 1.320 0.18680
exang -0.806916 0.466925 -1.728 0.08396 .
oldpeak -0.394070 0.247765 -1.591 0.11172
slope 0.967165 0.413227 2.341 0.01926 *
ca -0.658903 0.224987 -2.929 0.00340 **
thal -0.969538 0.337119 -2.876 0.00403 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 328.26 on 237 degrees of freedom
Residual deviance: 165.58 on 225 degrees of freedom
AIC: 191.58
Number of Fisher Scoring iterations: 6
my_logi5=glm(target~.-restecg,data = Train, family = 'binomial')
summary(my_logi5)
Call:
glm(formula = target ~ . - restecg, family = "binomial", data = Train)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.4793 -0.4426 0.1652 0.5948 2.6543
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 4.339093 2.770970 1.566 0.11737
age -0.010437 0.025946 -0.402 0.68750
sex -1.669926 0.511259 -3.266 0.00109 **
cp 0.893046 0.214656 4.160 3.18e-05 ***
trestbps -0.017983 0.011337 -1.586 0.11268
chol -0.006491 0.004569 -1.421 0.15542
fbs -0.108954 0.605502 -0.180 0.85720
thalach 0.014163 0.011118 1.274 0.20270
exang -0.772001 0.461504 -1.673 0.09437 .
oldpeak -0.332991 0.238785 -1.395 0.16316
slope 1.069177 0.411975 2.595 0.00945 **
ca -0.637128 0.221692 -2.874 0.00405 **
thal -0.855151 0.325485 -2.627 0.00861 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 328.26 on 237 degrees of freedom
Residual deviance: 170.20 on 225 degrees of freedom
AIC: 196.2
Number of Fisher Scoring iterations: 6
my_logi6=glm(target~.-slope, data = Train, family = 'binomial')
summary(my_logi6)
Call:
glm(formula = target ~ . - slope, family = "binomial", data = Train)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.5635 -0.4210 0.1671 0.5816 2.3716
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 3.935885 2.822964 1.394 0.16325
age -0.005001 0.025961 -0.193 0.84725
sex -1.355331 0.494241 -2.742 0.00610 **
cp 0.866555 0.206859 4.189 2.8e-05 ***
trestbps -0.017458 0.011417 -1.529 0.12622
chol -0.003544 0.004487 -0.790 0.42965
fbs -0.088732 0.596812 -0.149 0.88181
restecg 0.961398 0.395907 2.428 0.01517 *
thalach 0.019900 0.010727 1.855 0.06360 .
exang -0.889169 0.459341 -1.936 0.05290 .
oldpeak -0.678518 0.216683 -3.131 0.00174 **
ca -0.604000 0.215523 -2.802 0.00507 **
thal -1.035241 0.338573 -3.058 0.00223 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 328.26 on 237 degrees of freedom
Residual deviance: 171.02 on 225 degrees of freedom
AIC: 197.02
Number of Fisher Scoring iterations: 6
## best model after checking AIC with P value
my_logistic_best=glm(target~.-slope-age-trestbps-chol-fbs-restecg,data = Train, family = 'binomial')
summary(my_logistic_best)
Call:
glm(formula = target ~ . - slope - age - trestbps - chol - fbs -
restecg, family = "binomial", data = Train)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.3152 -0.5002 0.2200 0.5686 2.3628
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.102622 1.582972 0.697 0.486083
sex -1.126999 0.424358 -2.656 0.007913 **
cp 0.778083 0.194366 4.003 6.25e-05 ***
thalach 0.017976 0.009219 1.950 0.051182 .
exang -0.907902 0.433539 -2.094 0.036245 *
oldpeak -0.694944 0.203867 -3.409 0.000652 ***
ca -0.622474 0.200029 -3.112 0.001859 **
thal -0.973965 0.312005 -3.122 0.001799 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 328.26 on 237 degrees of freedom
Residual deviance: 183.15 on 230 degrees of freedom
AIC: 199.15
Number of Fisher Scoring iterations: 5
my_prediction=predict(my_logistic_best,Test)
my_prediction
2 6 10 16 20 24
0.51307174 1.38413192 1.59977689 2.43915984 1.70747446 0.44372872
30 34 38 44 48 52
1.48228186 1.34626886 0.46402732 1.44728585 2.38811872 0.46409469
58 62 66 72 76 80
1.35325723 0.63607024 1.45340699 -0.15217394 1.85399356 0.05333345
86 90 94 100 104 108
1.50114657 0.65282289 1.26058776 0.82628989 1.54128730 0.84015739
114 118 122 128 132 136
-0.05213277 0.97969920 1.29932914 3.18026079 2.84489113 2.08478400
142 146 150 156 160 164
0.44742036 1.37634720 2.28026255 1.09258473 0.76190244 0.20381612
170 174 178 184 188 192
-3.22221672 -1.74902233 2.42407078 -0.78389601 -4.04613822 -4.89561315
198 202 206 212 216 220
-1.40011705 -3.88787170 -1.58250805 -4.46180290 -2.36726747 -2.40271813
226 230 234 240 244 248
-3.41402559 -1.19404794 -3.30586127 -1.04991442 -3.72869086 -0.83845895
254 258 262 268 272 276
-2.50360308 -2.21464483 0.28138275 -0.57453860 -0.08333627 -1.86619207
282 286 290 296 300
1.17703758 -4.19289810 -2.40265345 -5.29035014 0.92687913
compare_mypred=table(av=Test$target,machinepredicted=my_prediction>0.5)
compare_mypred
machinepredicted
av FALSE TRUE
0 26 3
1 8 28
sum(diag(compare_mypred))/sum(compare_mypred)
[1] 0.8307692