library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyr)
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(broom)
## Warning: package 'broom' was built under R version 4.2.3
# Read the study data from the CSV file into a data frame, then preview
# its first rows.
pak_data <- read.csv(file = "FIC_Full.csv")
pak_data %>% head()
##   Age Age.Group Gender Locality Marital.status Life.Style Sleep Category
## 1  45     41-50 Female    RURAL        MARRIED         NO    NO     FREE
## 2  51     51-60 Female    URBAN        MARRIED         NO    NO     FREE
## 3  55     51-60 Female    RURAL        MARRIED        YES   YES     FREE
## 4  55     51-60 Female    RURAL        MARRIED        YES   YES     FREE
## 5  56     51-60 Female    RURAL        MARRIED        YES    NO     FREE
## 6  56     51-60 Female    URBAN        MARRIED         NO    NO     FREE
##   Depression Hyperlipi Smoking Family.History F.History Diabetes HTN Allergies
## 1        YES       YES      NO             NO         0        1  NO        NO
## 2        YES       YES      NO             NO         0        0  NO        NO
## 3        YES       YES      NO             NO         0        1 YES        NO
## 4        YES       YES      NO             NO         0        1 YES        NO
## 5        YES       YES      NO             NO         0        1 YES        NO
## 6        YES       YES      NO             NO         0        1 YES        NO
##      BP Thrombolysis BGR B.Urea S.Cr S.Sodium S.Potassium S.Chloride C.P.K
## 1 100.6            0  84     28  0.9      138         3.3        107   130
## 2  90.6            0 135     17  0.7      144         4.7        104   163
## 3 100.7            0 146     37  1.0      137         4.2        103   149
## 4 160.1            0 146     37  1.0      137         4.2        103   149
## 5  90.6            0  85     78  1.2      139         4.5        112    75
## 6 140.7            0 166    104  4.0      130         5.3        100   322
##   CK.MB ESR   WBC  RBC Hemoglobin P.C.V M.C.V M.C.H M.C.H.C PLATELET_COUNT
## 1    30  11  9900 4.26       11.6  0.34  79.7  27.2    0.34         265000
## 2    30  27 15800 5.74       14.5  0.44  78.0  25.0    0.32         287000
## 3    22  19  7900 4.83       14.1  0.42  87.0  29.0    0.33         183000
## 4    22  19  7900 4.83       14.1  0.42  87.0  29.0    0.33         183000
## 5    18  13  6900 4.41       12.3  0.36  82.0  27.0    0.33         211000
## 6    52 154 13500 3.90       10.0  0.29  74.4  25.7    0.35         288000
##   NEUTROPHIL LYMPHO MONOCYTE EOSINO          Others
## 1       0.70   0.25     0.03      2              no
## 2       0.73   0.20     0.04      3              no
## 3       0.60   0.33     0.04      3  LV dysfunction
## 4       0.60   0.33     0.04      3             HTN
## 5       0.71   0.25     0.02      2              no
## 6       0.85   0.10     0.03      2 PND, ORTHOPENIA
##                              CO                Diagnosis Hypersensitivity cp
## 1                   Chest pain,      EXT. ACUTE WALL M.I               NO  4
## 2           Central Chest pain,                  A/W M.I               NO  4
## 3 Chest pain,SOB, Cold sweating AC I/W M.I (RV) RE. M.I                NO  4
## 4           CENTRAL Chest pain,                  I/W M.I               NO  4
## 5                   Chest pain,                  A/W M.I               NO  4
## 6               SOB FROM 1 DAY               ACS, NSTEMI               NO  4
##   trestbps chol fbs restecg thalach exang oldpeak slope ca thal num SK SK.React
## 1      132  341   1       2     136     1     3.0     2  0    7   2  1       NO
## 2      130  305   0       0     142     1     1.2     2  0    7   2  1       NO
## 3      180  327   0       1     117     1     3.4     2  0    3   2  1       NO
## 4      128  205   0       1     130     1     2.0     2  1    7   3  1       NO
## 5      200  288   1       2     133     1     4.0     3  2    7   3  1       NO
## 6      134  409   0       2     150     1     1.9     2  2    7   2  1       NO
##   Reaction Mortality Follow.Up
## 1        0         0        60
## 2        0         0        15
## 3        0         0         6
## 4        0         0        52
## 5        0         0        34
## 6        0         1        32
# List the column names of the raw data.
names(pak_data)
##  [1] "Age"              "Age.Group"        "Gender"           "Locality"        
##  [5] "Marital.status"   "Life.Style"       "Sleep"            "Category"        
##  [9] "Depression"       "Hyperlipi"        "Smoking"          "Family.History"  
## [13] "F.History"        "Diabetes"         "HTN"              "Allergies"       
## [17] "BP"               "Thrombolysis"     "BGR"              "B.Urea"          
## [21] "S.Cr"             "S.Sodium"         "S.Potassium"      "S.Chloride"      
## [25] "C.P.K"            "CK.MB"            "ESR"              "WBC"             
## [29] "RBC"              "Hemoglobin"       "P.C.V"            "M.C.V"           
## [33] "M.C.H"            "M.C.H.C"          "PLATELET_COUNT"   "NEUTROPHIL"      
## [37] "LYMPHO"           "MONOCYTE"         "EOSINO"           "Others"          
## [41] "CO"               "Diagnosis"        "Hypersensitivity" "cp"              
## [45] "trestbps"         "chol"             "fbs"              "restecg"         
## [49] "thalach"          "exang"            "oldpeak"          "slope"           
## [53] "ca"               "thal"             "num"              "SK"              
## [57] "SK.React"         "Reaction"         "Mortality"        "Follow.Up"
# Count the missing values in each column (returns a named list, one entry
# per column).
map(pak_data, \(column) sum(is.na(column)))
## $Age
## [1] 0
## 
## $Age.Group
## [1] 0
## 
## $Gender
## [1] 0
## 
## $Locality
## [1] 0
## 
## $Marital.status
## [1] 0
## 
## $Life.Style
## [1] 0
## 
## $Sleep
## [1] 0
## 
## $Category
## [1] 0
## 
## $Depression
## [1] 0
## 
## $Hyperlipi
## [1] 0
## 
## $Smoking
## [1] 0
## 
## $Family.History
## [1] 0
## 
## $F.History
## [1] 0
## 
## $Diabetes
## [1] 0
## 
## $HTN
## [1] 0
## 
## $Allergies
## [1] 0
## 
## $BP
## [1] 0
## 
## $Thrombolysis
## [1] 0
## 
## $BGR
## [1] 0
## 
## $B.Urea
## [1] 0
## 
## $S.Cr
## [1] 0
## 
## $S.Sodium
## [1] 0
## 
## $S.Potassium
## [1] 0
## 
## $S.Chloride
## [1] 0
## 
## $C.P.K
## [1] 0
## 
## $CK.MB
## [1] 0
## 
## $ESR
## [1] 0
## 
## $WBC
## [1] 0
## 
## $RBC
## [1] 0
## 
## $Hemoglobin
## [1] 0
## 
## $P.C.V
## [1] 0
## 
## $M.C.V
## [1] 0
## 
## $M.C.H
## [1] 0
## 
## $M.C.H.C
## [1] 0
## 
## $PLATELET_COUNT
## [1] 0
## 
## $NEUTROPHIL
## [1] 0
## 
## $LYMPHO
## [1] 0
## 
## $MONOCYTE
## [1] 0
## 
## $EOSINO
## [1] 0
## 
## $Others
## [1] 0
## 
## $CO
## [1] 0
## 
## $Diagnosis
## [1] 0
## 
## $Hypersensitivity
## [1] 0
## 
## $cp
## [1] 0
## 
## $trestbps
## [1] 0
## 
## $chol
## [1] 0
## 
## $fbs
## [1] 0
## 
## $restecg
## [1] 0
## 
## $thalach
## [1] 0
## 
## $exang
## [1] 0
## 
## $oldpeak
## [1] 0
## 
## $slope
## [1] 0
## 
## $ca
## [1] 0
## 
## $thal
## [1] 0
## 
## $num
## [1] 0
## 
## $SK
## [1] 0
## 
## $SK.React
## [1] 0
## 
## $Reaction
## [1] 0
## 
## $Mortality
## [1] 0
## 
## $Follow.Up
## [1] 0
# Count the distinct values in each column (returns a named list, one entry
# per column). `n_distinct()` already yields a single integer, so it is passed
# to `map()` directly instead of being wrapped in a redundant `sum()`.
pak_data %>% map(n_distinct)
## $Age
## [1] 31
## 
## $Age.Group
## [1] 5
## 
## $Gender
## [1] 2
## 
## $Locality
## [1] 2
## 
## $Marital.status
## [1] 2
## 
## $Life.Style
## [1] 2
## 
## $Sleep
## [1] 2
## 
## $Category
## [1] 2
## 
## $Depression
## [1] 2
## 
## $Hyperlipi
## [1] 2
## 
## $Smoking
## [1] 2
## 
## $Family.History
## [1] 2
## 
## $F.History
## [1] 2
## 
## $Diabetes
## [1] 2
## 
## $HTN
## [1] 2
## 
## $Allergies
## [1] 2
## 
## $BP
## [1] 17
## 
## $Thrombolysis
## [1] 2
## 
## $BGR
## [1] 38
## 
## $B.Urea
## [1] 28
## 
## $S.Cr
## [1] 13
## 
## $S.Sodium
## [1] 17
## 
## $S.Potassium
## [1] 15
## 
## $S.Chloride
## [1] 14
## 
## $C.P.K
## [1] 44
## 
## $CK.MB
## [1] 31
## 
## $ESR
## [1] 22
## 
## $WBC
## [1] 31
## 
## $RBC
## [1] 32
## 
## $Hemoglobin
## [1] 32
## 
## $P.C.V
## [1] 19
## 
## $M.C.V
## [1] 27
## 
## $M.C.H
## [1] 22
## 
## $M.C.H.C
## [1] 8
## 
## $PLATELET_COUNT
## [1] 36
## 
## $NEUTROPHIL
## [1] 27
## 
## $LYMPHO
## [1] 24
## 
## $MONOCYTE
## [1] 8
## 
## $EOSINO
## [1] 5
## 
## $Others
## [1] 17
## 
## $CO
## [1] 37
## 
## $Diagnosis
## [1] 37
## 
## $Hypersensitivity
## [1] 2
## 
## $cp
## [1] 4
## 
## $trestbps
## [1] 39
## 
## $chol
## [1] 97
## 
## $fbs
## [1] 2
## 
## $restecg
## [1] 3
## 
## $thalach
## [1] 71
## 
## $exang
## [1] 2
## 
## $oldpeak
## [1] 35
## 
## $slope
## [1] 3
## 
## $ca
## [1] 4
## 
## $thal
## [1] 3
## 
## $num
## [1] 4
## 
## $SK
## [1] 2
## 
## $SK.React
## [1] 7
## 
## $Reaction
## [1] 2
## 
## $Mortality
## [1] 2
## 
## $Follow.Up
## [1] 22
# Show the dimensions, column types and leading values of the raw data.
pak_data %>% glimpse()
## Rows: 368
## Columns: 60
## $ Age              <int> 45, 51, 55, 55, 56, 56, 57, 57, 58, 58, 59, 60, 60, 6…
## $ Age.Group        <chr> "41-50", "51-60", "51-60", "51-60", "51-60", "51-60",…
## $ Gender           <chr> "Female", "Female", "Female", "Female", "Female", "Fe…
## $ Locality         <chr> "RURAL", "URBAN", "RURAL", "RURAL", "RURAL", "URBAN",…
## $ Marital.status   <chr> "MARRIED", "MARRIED", "MARRIED", "MARRIED", "MARRIED"…
## $ Life.Style       <chr> "NO", "NO", "YES", "YES", "YES", "NO", "YES", "NO", "…
## $ Sleep            <chr> "NO", "NO", "YES", "YES", "NO", "NO", "YES", "NO", "N…
## $ Category         <chr> "FREE", "FREE", "FREE", "FREE", "FREE", "FREE", "PAID…
## $ Depression       <chr> "YES", "YES", "YES", "YES", "YES", "YES", "YES", "YES…
## $ Hyperlipi        <chr> "YES", "YES", "YES", "YES", "YES", "YES", "YES", "YES…
## $ Smoking          <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ Family.History   <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ F.History        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Diabetes         <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,…
## $ HTN              <chr> "NO", "NO", "YES", "YES", "YES", "YES", "YES", "NO", …
## $ Allergies        <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ BP               <dbl> 100.6, 90.6, 100.7, 160.1, 90.6, 140.7, 120.8, 100.6,…
## $ Thrombolysis     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ BGR              <int> 84, 135, 146, 146, 85, 166, 96, 84, 135, 146, 146, 85…
## $ B.Urea           <dbl> 28, 17, 37, 37, 78, 104, 42, 28, 17, 37, 37, 78, 104,…
## $ S.Cr             <dbl> 0.9, 0.7, 1.0, 1.0, 1.2, 4.0, 1.0, 0.9, 0.7, 1.0, 1.0…
## $ S.Sodium         <int> 138, 144, 137, 137, 139, 130, 146, 138, 144, 137, 137…
## $ S.Potassium      <dbl> 3.3, 4.7, 4.2, 4.2, 4.5, 5.3, 3.9, 3.3, 4.7, 4.2, 4.2…
## $ S.Chloride       <int> 107, 104, 103, 103, 112, 100, 100, 107, 104, 103, 103…
## $ C.P.K            <int> 130, 163, 149, 149, 75, 322, 146, 130, 163, 149, 149,…
## $ CK.MB            <int> 30, 30, 22, 22, 18, 52, 21, 30, 30, 22, 22, 18, 52, 2…
## $ ESR              <int> 11, 27, 19, 19, 13, 154, 25, 11, 27, 19, 19, 13, 154,…
## $ WBC              <int> 9900, 15800, 7900, 7900, 6900, 13500, 7400, 9900, 158…
## $ RBC              <dbl> 4.26, 5.74, 4.83, 4.83, 4.41, 3.90, 4.14, 4.26, 5.74,…
## $ Hemoglobin       <dbl> 11.6, 14.5, 14.1, 14.1, 12.3, 10.0, 11.7, 11.6, 14.5,…
## $ P.C.V            <dbl> 0.34, 0.44, 0.42, 0.42, 0.36, 0.29, 0.36, 0.34, 0.44,…
## $ M.C.V            <dbl> 79.7, 78.0, 87.0, 87.0, 82.0, 74.4, 87.0, 79.7, 78.0,…
## $ M.C.H            <dbl> 27.2, 25.0, 29.0, 29.0, 27.0, 25.7, 28.0, 27.2, 25.0,…
## $ M.C.H.C          <dbl> 0.34, 0.32, 0.33, 0.33, 0.33, 0.35, 0.32, 0.34, 0.32,…
## $ PLATELET_COUNT   <int> 265000, 287000, 183000, 183000, 211000, 288000, 39500…
## $ NEUTROPHIL       <dbl> 0.70, 0.73, 0.60, 0.60, 0.71, 0.85, 0.63, 0.70, 0.73,…
## $ LYMPHO           <dbl> 0.25, 0.20, 0.33, 0.33, 0.25, 0.10, 0.31, 0.25, 0.20,…
## $ MONOCYTE         <dbl> 0.03, 0.04, 0.04, 0.04, 0.02, 0.03, 0.03, 0.03, 0.04,…
## $ EOSINO           <int> 2, 3, 3, 3, 2, 2, 3, 2, 3, 3, 3, 2, 2, 3, 2, 3, 3, 3,…
## $ Others           <chr> "no", "no", "LV dysfunction", "HTN", "no", "PND, ORTH…
## $ CO               <chr> "Chest pain,", "Central Chest pain,", "Chest pain,SOB…
## $ Diagnosis        <chr> "EXT. ACUTE WALL M.I", "A/W M.I", "AC I/W M.I (RV) RE…
## $ Hypersensitivity <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ cp               <int> 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 3,…
## $ trestbps         <int> 132, 130, 180, 128, 200, 134, 140, 130, 136, 170, 174…
## $ chol             <int> 341, 305, 327, 205, 288, 409, 241, 236, 319, 225, 249…
## $ fbs              <int> 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ restecg          <int> 2, 0, 1, 1, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0,…
## $ thalach          <int> 136, 142, 117, 130, 133, 150, 123, 174, 152, 146, 143…
## $ exang            <int> 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,…
## $ oldpeak          <dbl> 3.0, 1.2, 3.4, 2.0, 4.0, 1.9, 0.2, 0.0, 0.0, 2.8, 0.0…
## $ slope            <int> 2, 2, 2, 2, 3, 2, 2, 2, 1, 2, 2, 2, 1, 1, 2, 3, 3, 2,…
## $ ca               <int> 0, 0, 0, 1, 2, 2, 0, 1, 2, 2, 0, 2, 0, 0, 0, 2, 3, 1,…
## $ thal             <int> 7, 7, 3, 7, 7, 7, 7, 3, 3, 6, 3, 7, 3, 3, 7, 3, 7, 7,…
## $ num              <int> 2, 2, 2, 3, 3, 2, 1, 1, 3, 2, 1, 3, 1, 1, 1, 3, 3, 2,…
## $ SK               <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ SK.React         <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ Reaction         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Mortality        <int> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,…
## $ Follow.Up        <int> 60, 15, 6, 52, 34, 32, 60, 3, 15, 6, 52, 34, 32, 12, …
# Preview the first six rows of the raw data.
head(pak_data, n = 6L)
##   Age Age.Group Gender Locality Marital.status Life.Style Sleep Category
## 1  45     41-50 Female    RURAL        MARRIED         NO    NO     FREE
## 2  51     51-60 Female    URBAN        MARRIED         NO    NO     FREE
## 3  55     51-60 Female    RURAL        MARRIED        YES   YES     FREE
## 4  55     51-60 Female    RURAL        MARRIED        YES   YES     FREE
## 5  56     51-60 Female    RURAL        MARRIED        YES    NO     FREE
## 6  56     51-60 Female    URBAN        MARRIED         NO    NO     FREE
##   Depression Hyperlipi Smoking Family.History F.History Diabetes HTN Allergies
## 1        YES       YES      NO             NO         0        1  NO        NO
## 2        YES       YES      NO             NO         0        0  NO        NO
## 3        YES       YES      NO             NO         0        1 YES        NO
## 4        YES       YES      NO             NO         0        1 YES        NO
## 5        YES       YES      NO             NO         0        1 YES        NO
## 6        YES       YES      NO             NO         0        1 YES        NO
##      BP Thrombolysis BGR B.Urea S.Cr S.Sodium S.Potassium S.Chloride C.P.K
## 1 100.6            0  84     28  0.9      138         3.3        107   130
## 2  90.6            0 135     17  0.7      144         4.7        104   163
## 3 100.7            0 146     37  1.0      137         4.2        103   149
## 4 160.1            0 146     37  1.0      137         4.2        103   149
## 5  90.6            0  85     78  1.2      139         4.5        112    75
## 6 140.7            0 166    104  4.0      130         5.3        100   322
##   CK.MB ESR   WBC  RBC Hemoglobin P.C.V M.C.V M.C.H M.C.H.C PLATELET_COUNT
## 1    30  11  9900 4.26       11.6  0.34  79.7  27.2    0.34         265000
## 2    30  27 15800 5.74       14.5  0.44  78.0  25.0    0.32         287000
## 3    22  19  7900 4.83       14.1  0.42  87.0  29.0    0.33         183000
## 4    22  19  7900 4.83       14.1  0.42  87.0  29.0    0.33         183000
## 5    18  13  6900 4.41       12.3  0.36  82.0  27.0    0.33         211000
## 6    52 154 13500 3.90       10.0  0.29  74.4  25.7    0.35         288000
##   NEUTROPHIL LYMPHO MONOCYTE EOSINO          Others
## 1       0.70   0.25     0.03      2              no
## 2       0.73   0.20     0.04      3              no
## 3       0.60   0.33     0.04      3  LV dysfunction
## 4       0.60   0.33     0.04      3             HTN
## 5       0.71   0.25     0.02      2              no
## 6       0.85   0.10     0.03      2 PND, ORTHOPENIA
##                              CO                Diagnosis Hypersensitivity cp
## 1                   Chest pain,      EXT. ACUTE WALL M.I               NO  4
## 2           Central Chest pain,                  A/W M.I               NO  4
## 3 Chest pain,SOB, Cold sweating AC I/W M.I (RV) RE. M.I                NO  4
## 4           CENTRAL Chest pain,                  I/W M.I               NO  4
## 5                   Chest pain,                  A/W M.I               NO  4
## 6               SOB FROM 1 DAY               ACS, NSTEMI               NO  4
##   trestbps chol fbs restecg thalach exang oldpeak slope ca thal num SK SK.React
## 1      132  341   1       2     136     1     3.0     2  0    7   2  1       NO
## 2      130  305   0       0     142     1     1.2     2  0    7   2  1       NO
## 3      180  327   0       1     117     1     3.4     2  0    3   2  1       NO
## 4      128  205   0       1     130     1     2.0     2  1    7   3  1       NO
## 5      200  288   1       2     133     1     4.0     3  2    7   3  1       NO
## 6      134  409   0       2     150     1     1.9     2  2    7   2  1       NO
##   Reaction Mortality Follow.Up
## 1        0         0        60
## 2        0         0        15
## 3        0         0         6
## 4        0         0        52
## 5        0         0        34
## 6        0         1        32
# Work on a copy (`pak_data2`) so the raw `pak_data` stays untouched; nothing
# is lower-cased at this step. Then preview the copy.
pak_data2 <- pak_data
pak_data2 %>% head()
##   Age Age.Group Gender Locality Marital.status Life.Style Sleep Category
## 1  45     41-50 Female    RURAL        MARRIED         NO    NO     FREE
## 2  51     51-60 Female    URBAN        MARRIED         NO    NO     FREE
## 3  55     51-60 Female    RURAL        MARRIED        YES   YES     FREE
## 4  55     51-60 Female    RURAL        MARRIED        YES   YES     FREE
## 5  56     51-60 Female    RURAL        MARRIED        YES    NO     FREE
## 6  56     51-60 Female    URBAN        MARRIED         NO    NO     FREE
##   Depression Hyperlipi Smoking Family.History F.History Diabetes HTN Allergies
## 1        YES       YES      NO             NO         0        1  NO        NO
## 2        YES       YES      NO             NO         0        0  NO        NO
## 3        YES       YES      NO             NO         0        1 YES        NO
## 4        YES       YES      NO             NO         0        1 YES        NO
## 5        YES       YES      NO             NO         0        1 YES        NO
## 6        YES       YES      NO             NO         0        1 YES        NO
##      BP Thrombolysis BGR B.Urea S.Cr S.Sodium S.Potassium S.Chloride C.P.K
## 1 100.6            0  84     28  0.9      138         3.3        107   130
## 2  90.6            0 135     17  0.7      144         4.7        104   163
## 3 100.7            0 146     37  1.0      137         4.2        103   149
## 4 160.1            0 146     37  1.0      137         4.2        103   149
## 5  90.6            0  85     78  1.2      139         4.5        112    75
## 6 140.7            0 166    104  4.0      130         5.3        100   322
##   CK.MB ESR   WBC  RBC Hemoglobin P.C.V M.C.V M.C.H M.C.H.C PLATELET_COUNT
## 1    30  11  9900 4.26       11.6  0.34  79.7  27.2    0.34         265000
## 2    30  27 15800 5.74       14.5  0.44  78.0  25.0    0.32         287000
## 3    22  19  7900 4.83       14.1  0.42  87.0  29.0    0.33         183000
## 4    22  19  7900 4.83       14.1  0.42  87.0  29.0    0.33         183000
## 5    18  13  6900 4.41       12.3  0.36  82.0  27.0    0.33         211000
## 6    52 154 13500 3.90       10.0  0.29  74.4  25.7    0.35         288000
##   NEUTROPHIL LYMPHO MONOCYTE EOSINO          Others
## 1       0.70   0.25     0.03      2              no
## 2       0.73   0.20     0.04      3              no
## 3       0.60   0.33     0.04      3  LV dysfunction
## 4       0.60   0.33     0.04      3             HTN
## 5       0.71   0.25     0.02      2              no
## 6       0.85   0.10     0.03      2 PND, ORTHOPENIA
##                              CO                Diagnosis Hypersensitivity cp
## 1                   Chest pain,      EXT. ACUTE WALL M.I               NO  4
## 2           Central Chest pain,                  A/W M.I               NO  4
## 3 Chest pain,SOB, Cold sweating AC I/W M.I (RV) RE. M.I                NO  4
## 4           CENTRAL Chest pain,                  I/W M.I               NO  4
## 5                   Chest pain,                  A/W M.I               NO  4
## 6               SOB FROM 1 DAY               ACS, NSTEMI               NO  4
##   trestbps chol fbs restecg thalach exang oldpeak slope ca thal num SK SK.React
## 1      132  341   1       2     136     1     3.0     2  0    7   2  1       NO
## 2      130  305   0       0     142     1     1.2     2  0    7   2  1       NO
## 3      180  327   0       1     117     1     3.4     2  0    3   2  1       NO
## 4      128  205   0       1     130     1     2.0     2  1    7   3  1       NO
## 5      200  288   1       2     133     1     4.0     3  2    7   3  1       NO
## 6      134  409   0       2     150     1     1.9     2  2    7   2  1       NO
##   Reaction Mortality Follow.Up
## 1        0         0        60
## 2        0         0        15
## 3        0         0         6
## 4        0         0        52
## 5        0         0        34
## 6        0         1        32
# Show the structure of the working copy (identical to the raw data here).
pak_data2 %>% glimpse()
## Rows: 368
## Columns: 60
## $ Age              <int> 45, 51, 55, 55, 56, 56, 57, 57, 58, 58, 59, 60, 60, 6…
## $ Age.Group        <chr> "41-50", "51-60", "51-60", "51-60", "51-60", "51-60",…
## $ Gender           <chr> "Female", "Female", "Female", "Female", "Female", "Fe…
## $ Locality         <chr> "RURAL", "URBAN", "RURAL", "RURAL", "RURAL", "URBAN",…
## $ Marital.status   <chr> "MARRIED", "MARRIED", "MARRIED", "MARRIED", "MARRIED"…
## $ Life.Style       <chr> "NO", "NO", "YES", "YES", "YES", "NO", "YES", "NO", "…
## $ Sleep            <chr> "NO", "NO", "YES", "YES", "NO", "NO", "YES", "NO", "N…
## $ Category         <chr> "FREE", "FREE", "FREE", "FREE", "FREE", "FREE", "PAID…
## $ Depression       <chr> "YES", "YES", "YES", "YES", "YES", "YES", "YES", "YES…
## $ Hyperlipi        <chr> "YES", "YES", "YES", "YES", "YES", "YES", "YES", "YES…
## $ Smoking          <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ Family.History   <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ F.History        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Diabetes         <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,…
## $ HTN              <chr> "NO", "NO", "YES", "YES", "YES", "YES", "YES", "NO", …
## $ Allergies        <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ BP               <dbl> 100.6, 90.6, 100.7, 160.1, 90.6, 140.7, 120.8, 100.6,…
## $ Thrombolysis     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ BGR              <int> 84, 135, 146, 146, 85, 166, 96, 84, 135, 146, 146, 85…
## $ B.Urea           <dbl> 28, 17, 37, 37, 78, 104, 42, 28, 17, 37, 37, 78, 104,…
## $ S.Cr             <dbl> 0.9, 0.7, 1.0, 1.0, 1.2, 4.0, 1.0, 0.9, 0.7, 1.0, 1.0…
## $ S.Sodium         <int> 138, 144, 137, 137, 139, 130, 146, 138, 144, 137, 137…
## $ S.Potassium      <dbl> 3.3, 4.7, 4.2, 4.2, 4.5, 5.3, 3.9, 3.3, 4.7, 4.2, 4.2…
## $ S.Chloride       <int> 107, 104, 103, 103, 112, 100, 100, 107, 104, 103, 103…
## $ C.P.K            <int> 130, 163, 149, 149, 75, 322, 146, 130, 163, 149, 149,…
## $ CK.MB            <int> 30, 30, 22, 22, 18, 52, 21, 30, 30, 22, 22, 18, 52, 2…
## $ ESR              <int> 11, 27, 19, 19, 13, 154, 25, 11, 27, 19, 19, 13, 154,…
## $ WBC              <int> 9900, 15800, 7900, 7900, 6900, 13500, 7400, 9900, 158…
## $ RBC              <dbl> 4.26, 5.74, 4.83, 4.83, 4.41, 3.90, 4.14, 4.26, 5.74,…
## $ Hemoglobin       <dbl> 11.6, 14.5, 14.1, 14.1, 12.3, 10.0, 11.7, 11.6, 14.5,…
## $ P.C.V            <dbl> 0.34, 0.44, 0.42, 0.42, 0.36, 0.29, 0.36, 0.34, 0.44,…
## $ M.C.V            <dbl> 79.7, 78.0, 87.0, 87.0, 82.0, 74.4, 87.0, 79.7, 78.0,…
## $ M.C.H            <dbl> 27.2, 25.0, 29.0, 29.0, 27.0, 25.7, 28.0, 27.2, 25.0,…
## $ M.C.H.C          <dbl> 0.34, 0.32, 0.33, 0.33, 0.33, 0.35, 0.32, 0.34, 0.32,…
## $ PLATELET_COUNT   <int> 265000, 287000, 183000, 183000, 211000, 288000, 39500…
## $ NEUTROPHIL       <dbl> 0.70, 0.73, 0.60, 0.60, 0.71, 0.85, 0.63, 0.70, 0.73,…
## $ LYMPHO           <dbl> 0.25, 0.20, 0.33, 0.33, 0.25, 0.10, 0.31, 0.25, 0.20,…
## $ MONOCYTE         <dbl> 0.03, 0.04, 0.04, 0.04, 0.02, 0.03, 0.03, 0.03, 0.04,…
## $ EOSINO           <int> 2, 3, 3, 3, 2, 2, 3, 2, 3, 3, 3, 2, 2, 3, 2, 3, 3, 3,…
## $ Others           <chr> "no", "no", "LV dysfunction", "HTN", "no", "PND, ORTH…
## $ CO               <chr> "Chest pain,", "Central Chest pain,", "Chest pain,SOB…
## $ Diagnosis        <chr> "EXT. ACUTE WALL M.I", "A/W M.I", "AC I/W M.I (RV) RE…
## $ Hypersensitivity <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ cp               <int> 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 3,…
## $ trestbps         <int> 132, 130, 180, 128, 200, 134, 140, 130, 136, 170, 174…
## $ chol             <int> 341, 305, 327, 205, 288, 409, 241, 236, 319, 225, 249…
## $ fbs              <int> 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ restecg          <int> 2, 0, 1, 1, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0,…
## $ thalach          <int> 136, 142, 117, 130, 133, 150, 123, 174, 152, 146, 143…
## $ exang            <int> 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,…
## $ oldpeak          <dbl> 3.0, 1.2, 3.4, 2.0, 4.0, 1.9, 0.2, 0.0, 0.0, 2.8, 0.0…
## $ slope            <int> 2, 2, 2, 2, 3, 2, 2, 2, 1, 2, 2, 2, 1, 1, 2, 3, 3, 2,…
## $ ca               <int> 0, 0, 0, 1, 2, 2, 0, 1, 2, 2, 0, 2, 0, 0, 0, 2, 3, 1,…
## $ thal             <int> 7, 7, 3, 7, 7, 7, 7, 3, 3, 6, 3, 7, 3, 3, 7, 3, 7, 7,…
## $ num              <int> 2, 2, 2, 3, 3, 2, 1, 1, 3, 2, 1, 3, 1, 1, 1, 3, 3, 2,…
## $ SK               <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ SK.React         <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ Reaction         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Mortality        <int> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,…
## $ Follow.Up        <int> 60, 15, 6, 52, 34, 32, 60, 3, 15, 6, 52, 34, 32, 12, …
# Summarise the platelet counts. The column names are still in their original
# case at this point (they are only lower-cased further down), so the column
# has to be referenced as `PLATELET_COUNT`. The previous
# `pak_data2$platelet_count` did not exist yet, so `$` silently returned NULL
# and `summary()` reported an empty object (Length 0, Class NULL).
summary(pak_data2$PLATELET_COUNT)
## (output not shown: re-run to regenerate the numeric summary)
# Convert every column name to lower case, then preview the renamed data.
pak_data2 <- setNames(pak_data2, tolower(names(pak_data2)))
pak_data2 %>% head()
##   age age.group gender locality marital.status life.style sleep category
## 1  45     41-50 Female    RURAL        MARRIED         NO    NO     FREE
## 2  51     51-60 Female    URBAN        MARRIED         NO    NO     FREE
## 3  55     51-60 Female    RURAL        MARRIED        YES   YES     FREE
## 4  55     51-60 Female    RURAL        MARRIED        YES   YES     FREE
## 5  56     51-60 Female    RURAL        MARRIED        YES    NO     FREE
## 6  56     51-60 Female    URBAN        MARRIED         NO    NO     FREE
##   depression hyperlipi smoking family.history f.history diabetes htn allergies
## 1        YES       YES      NO             NO         0        1  NO        NO
## 2        YES       YES      NO             NO         0        0  NO        NO
## 3        YES       YES      NO             NO         0        1 YES        NO
## 4        YES       YES      NO             NO         0        1 YES        NO
## 5        YES       YES      NO             NO         0        1 YES        NO
## 6        YES       YES      NO             NO         0        1 YES        NO
##      bp thrombolysis bgr b.urea s.cr s.sodium s.potassium s.chloride c.p.k
## 1 100.6            0  84     28  0.9      138         3.3        107   130
## 2  90.6            0 135     17  0.7      144         4.7        104   163
## 3 100.7            0 146     37  1.0      137         4.2        103   149
## 4 160.1            0 146     37  1.0      137         4.2        103   149
## 5  90.6            0  85     78  1.2      139         4.5        112    75
## 6 140.7            0 166    104  4.0      130         5.3        100   322
##   ck.mb esr   wbc  rbc hemoglobin p.c.v m.c.v m.c.h m.c.h.c platelet_count
## 1    30  11  9900 4.26       11.6  0.34  79.7  27.2    0.34         265000
## 2    30  27 15800 5.74       14.5  0.44  78.0  25.0    0.32         287000
## 3    22  19  7900 4.83       14.1  0.42  87.0  29.0    0.33         183000
## 4    22  19  7900 4.83       14.1  0.42  87.0  29.0    0.33         183000
## 5    18  13  6900 4.41       12.3  0.36  82.0  27.0    0.33         211000
## 6    52 154 13500 3.90       10.0  0.29  74.4  25.7    0.35         288000
##   neutrophil lympho monocyte eosino          others
## 1       0.70   0.25     0.03      2              no
## 2       0.73   0.20     0.04      3              no
## 3       0.60   0.33     0.04      3  LV dysfunction
## 4       0.60   0.33     0.04      3             HTN
## 5       0.71   0.25     0.02      2              no
## 6       0.85   0.10     0.03      2 PND, ORTHOPENIA
##                              co                diagnosis hypersensitivity cp
## 1                   Chest pain,      EXT. ACUTE WALL M.I               NO  4
## 2           Central Chest pain,                  A/W M.I               NO  4
## 3 Chest pain,SOB, Cold sweating AC I/W M.I (RV) RE. M.I                NO  4
## 4           CENTRAL Chest pain,                  I/W M.I               NO  4
## 5                   Chest pain,                  A/W M.I               NO  4
## 6               SOB FROM 1 DAY               ACS, NSTEMI               NO  4
##   trestbps chol fbs restecg thalach exang oldpeak slope ca thal num sk sk.react
## 1      132  341   1       2     136     1     3.0     2  0    7   2  1       NO
## 2      130  305   0       0     142     1     1.2     2  0    7   2  1       NO
## 3      180  327   0       1     117     1     3.4     2  0    3   2  1       NO
## 4      128  205   0       1     130     1     2.0     2  1    7   3  1       NO
## 5      200  288   1       2     133     1     4.0     3  2    7   3  1       NO
## 6      134  409   0       2     150     1     1.9     2  2    7   2  1       NO
##   reaction mortality follow.up
## 1        0         0        60
## 2        0         0        15
## 3        0         0         6
## 4        0         0        52
## 5        0         0        34
## 6        0         1        32
# Standardise the column names with janitor::clean_names(), then review the
# resulting columns and their types.
pak_data2 <- pak_data2 %>% clean_names()
pak_data2 %>% glimpse()
## Rows: 368
## Columns: 60
## $ age              <int> 45, 51, 55, 55, 56, 56, 57, 57, 58, 58, 59, 60, 60, 6…
## $ age_group        <chr> "41-50", "51-60", "51-60", "51-60", "51-60", "51-60",…
## $ gender           <chr> "Female", "Female", "Female", "Female", "Female", "Fe…
## $ locality         <chr> "RURAL", "URBAN", "RURAL", "RURAL", "RURAL", "URBAN",…
## $ marital_status   <chr> "MARRIED", "MARRIED", "MARRIED", "MARRIED", "MARRIED"…
## $ life_style       <chr> "NO", "NO", "YES", "YES", "YES", "NO", "YES", "NO", "…
## $ sleep            <chr> "NO", "NO", "YES", "YES", "NO", "NO", "YES", "NO", "N…
## $ category         <chr> "FREE", "FREE", "FREE", "FREE", "FREE", "FREE", "PAID…
## $ depression       <chr> "YES", "YES", "YES", "YES", "YES", "YES", "YES", "YES…
## $ hyperlipi        <chr> "YES", "YES", "YES", "YES", "YES", "YES", "YES", "YES…
## $ smoking          <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ family_history   <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ f_history        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ diabetes         <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,…
## $ htn              <chr> "NO", "NO", "YES", "YES", "YES", "YES", "YES", "NO", …
## $ allergies        <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ bp               <dbl> 100.6, 90.6, 100.7, 160.1, 90.6, 140.7, 120.8, 100.6,…
## $ thrombolysis     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ bgr              <int> 84, 135, 146, 146, 85, 166, 96, 84, 135, 146, 146, 85…
## $ b_urea           <dbl> 28, 17, 37, 37, 78, 104, 42, 28, 17, 37, 37, 78, 104,…
## $ s_cr             <dbl> 0.9, 0.7, 1.0, 1.0, 1.2, 4.0, 1.0, 0.9, 0.7, 1.0, 1.0…
## $ s_sodium         <int> 138, 144, 137, 137, 139, 130, 146, 138, 144, 137, 137…
## $ s_potassium      <dbl> 3.3, 4.7, 4.2, 4.2, 4.5, 5.3, 3.9, 3.3, 4.7, 4.2, 4.2…
## $ s_chloride       <int> 107, 104, 103, 103, 112, 100, 100, 107, 104, 103, 103…
## $ c_p_k            <int> 130, 163, 149, 149, 75, 322, 146, 130, 163, 149, 149,…
## $ ck_mb            <int> 30, 30, 22, 22, 18, 52, 21, 30, 30, 22, 22, 18, 52, 2…
## $ esr              <int> 11, 27, 19, 19, 13, 154, 25, 11, 27, 19, 19, 13, 154,…
## $ wbc              <int> 9900, 15800, 7900, 7900, 6900, 13500, 7400, 9900, 158…
## $ rbc              <dbl> 4.26, 5.74, 4.83, 4.83, 4.41, 3.90, 4.14, 4.26, 5.74,…
## $ hemoglobin       <dbl> 11.6, 14.5, 14.1, 14.1, 12.3, 10.0, 11.7, 11.6, 14.5,…
## $ p_c_v            <dbl> 0.34, 0.44, 0.42, 0.42, 0.36, 0.29, 0.36, 0.34, 0.44,…
## $ m_c_v            <dbl> 79.7, 78.0, 87.0, 87.0, 82.0, 74.4, 87.0, 79.7, 78.0,…
## $ m_c_h            <dbl> 27.2, 25.0, 29.0, 29.0, 27.0, 25.7, 28.0, 27.2, 25.0,…
## $ m_c_h_c          <dbl> 0.34, 0.32, 0.33, 0.33, 0.33, 0.35, 0.32, 0.34, 0.32,…
## $ platelet_count   <int> 265000, 287000, 183000, 183000, 211000, 288000, 39500…
## $ neutrophil       <dbl> 0.70, 0.73, 0.60, 0.60, 0.71, 0.85, 0.63, 0.70, 0.73,…
## $ lympho           <dbl> 0.25, 0.20, 0.33, 0.33, 0.25, 0.10, 0.31, 0.25, 0.20,…
## $ monocyte         <dbl> 0.03, 0.04, 0.04, 0.04, 0.02, 0.03, 0.03, 0.03, 0.04,…
## $ eosino           <int> 2, 3, 3, 3, 2, 2, 3, 2, 3, 3, 3, 2, 2, 3, 2, 3, 3, 3,…
## $ others           <chr> "no", "no", "LV dysfunction", "HTN", "no", "PND, ORTH…
## $ co               <chr> "Chest pain,", "Central Chest pain,", "Chest pain,SOB…
## $ diagnosis        <chr> "EXT. ACUTE WALL M.I", "A/W M.I", "AC I/W M.I (RV) RE…
## $ hypersensitivity <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ cp               <int> 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 3,…
## $ trestbps         <int> 132, 130, 180, 128, 200, 134, 140, 130, 136, 170, 174…
## $ chol             <int> 341, 305, 327, 205, 288, 409, 241, 236, 319, 225, 249…
## $ fbs              <int> 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ restecg          <int> 2, 0, 1, 1, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0,…
## $ thalach          <int> 136, 142, 117, 130, 133, 150, 123, 174, 152, 146, 143…
## $ exang            <int> 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,…
## $ oldpeak          <dbl> 3.0, 1.2, 3.4, 2.0, 4.0, 1.9, 0.2, 0.0, 0.0, 2.8, 0.0…
## $ slope            <int> 2, 2, 2, 2, 3, 2, 2, 2, 1, 2, 2, 2, 1, 1, 2, 3, 3, 2,…
## $ ca               <int> 0, 0, 0, 1, 2, 2, 0, 1, 2, 2, 0, 2, 0, 0, 0, 2, 3, 1,…
## $ thal             <int> 7, 7, 3, 7, 7, 7, 7, 3, 3, 6, 3, 7, 3, 3, 7, 3, 7, 7,…
## $ num              <int> 2, 2, 2, 3, 3, 2, 1, 1, 3, 2, 1, 3, 1, 1, 1, 3, 3, 2,…
## $ sk               <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ sk_react         <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ reaction         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ mortality        <int> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,…
## $ follow_up        <int> 60, 15, 6, 52, 34, 32, 60, 3, 15, 6, 52, 34, 32, 12, …
# Recode gender as a numeric indicator with dplyr::mutate():
# "Female" -> 0, every other non-missing label -> 1 (NA stays NA).
# The comparison works directly on the character column, so no intermediate
# factor() conversion is needed before the recode.
pak_data2 <- pak_data2 %>%
  mutate(gender = ifelse(gender == "Female", 0, 1))
# Frequency check of the recoded indicator.
table(pak_data2$gender)
## 
##   0   1 
##  83 285
# Review column types after the gender recode.
pak_data2 %>% glimpse()
## Rows: 368
## Columns: 60
## $ age              <int> 45, 51, 55, 55, 56, 56, 57, 57, 58, 58, 59, 60, 60, 6…
## $ age_group        <chr> "41-50", "51-60", "51-60", "51-60", "51-60", "51-60",…
## $ gender           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ locality         <chr> "RURAL", "URBAN", "RURAL", "RURAL", "RURAL", "URBAN",…
## $ marital_status   <chr> "MARRIED", "MARRIED", "MARRIED", "MARRIED", "MARRIED"…
## $ life_style       <chr> "NO", "NO", "YES", "YES", "YES", "NO", "YES", "NO", "…
## $ sleep            <chr> "NO", "NO", "YES", "YES", "NO", "NO", "YES", "NO", "N…
## $ category         <chr> "FREE", "FREE", "FREE", "FREE", "FREE", "FREE", "PAID…
## $ depression       <chr> "YES", "YES", "YES", "YES", "YES", "YES", "YES", "YES…
## $ hyperlipi        <chr> "YES", "YES", "YES", "YES", "YES", "YES", "YES", "YES…
## $ smoking          <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ family_history   <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ f_history        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ diabetes         <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,…
## $ htn              <chr> "NO", "NO", "YES", "YES", "YES", "YES", "YES", "NO", …
## $ allergies        <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ bp               <dbl> 100.6, 90.6, 100.7, 160.1, 90.6, 140.7, 120.8, 100.6,…
## $ thrombolysis     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ bgr              <int> 84, 135, 146, 146, 85, 166, 96, 84, 135, 146, 146, 85…
## $ b_urea           <dbl> 28, 17, 37, 37, 78, 104, 42, 28, 17, 37, 37, 78, 104,…
## $ s_cr             <dbl> 0.9, 0.7, 1.0, 1.0, 1.2, 4.0, 1.0, 0.9, 0.7, 1.0, 1.0…
## $ s_sodium         <int> 138, 144, 137, 137, 139, 130, 146, 138, 144, 137, 137…
## $ s_potassium      <dbl> 3.3, 4.7, 4.2, 4.2, 4.5, 5.3, 3.9, 3.3, 4.7, 4.2, 4.2…
## $ s_chloride       <int> 107, 104, 103, 103, 112, 100, 100, 107, 104, 103, 103…
## $ c_p_k            <int> 130, 163, 149, 149, 75, 322, 146, 130, 163, 149, 149,…
## $ ck_mb            <int> 30, 30, 22, 22, 18, 52, 21, 30, 30, 22, 22, 18, 52, 2…
## $ esr              <int> 11, 27, 19, 19, 13, 154, 25, 11, 27, 19, 19, 13, 154,…
## $ wbc              <int> 9900, 15800, 7900, 7900, 6900, 13500, 7400, 9900, 158…
## $ rbc              <dbl> 4.26, 5.74, 4.83, 4.83, 4.41, 3.90, 4.14, 4.26, 5.74,…
## $ hemoglobin       <dbl> 11.6, 14.5, 14.1, 14.1, 12.3, 10.0, 11.7, 11.6, 14.5,…
## $ p_c_v            <dbl> 0.34, 0.44, 0.42, 0.42, 0.36, 0.29, 0.36, 0.34, 0.44,…
## $ m_c_v            <dbl> 79.7, 78.0, 87.0, 87.0, 82.0, 74.4, 87.0, 79.7, 78.0,…
## $ m_c_h            <dbl> 27.2, 25.0, 29.0, 29.0, 27.0, 25.7, 28.0, 27.2, 25.0,…
## $ m_c_h_c          <dbl> 0.34, 0.32, 0.33, 0.33, 0.33, 0.35, 0.32, 0.34, 0.32,…
## $ platelet_count   <int> 265000, 287000, 183000, 183000, 211000, 288000, 39500…
## $ neutrophil       <dbl> 0.70, 0.73, 0.60, 0.60, 0.71, 0.85, 0.63, 0.70, 0.73,…
## $ lympho           <dbl> 0.25, 0.20, 0.33, 0.33, 0.25, 0.10, 0.31, 0.25, 0.20,…
## $ monocyte         <dbl> 0.03, 0.04, 0.04, 0.04, 0.02, 0.03, 0.03, 0.03, 0.04,…
## $ eosino           <int> 2, 3, 3, 3, 2, 2, 3, 2, 3, 3, 3, 2, 2, 3, 2, 3, 3, 3,…
## $ others           <chr> "no", "no", "LV dysfunction", "HTN", "no", "PND, ORTH…
## $ co               <chr> "Chest pain,", "Central Chest pain,", "Chest pain,SOB…
## $ diagnosis        <chr> "EXT. ACUTE WALL M.I", "A/W M.I", "AC I/W M.I (RV) RE…
## $ hypersensitivity <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ cp               <int> 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 3,…
## $ trestbps         <int> 132, 130, 180, 128, 200, 134, 140, 130, 136, 170, 174…
## $ chol             <int> 341, 305, 327, 205, 288, 409, 241, 236, 319, 225, 249…
## $ fbs              <int> 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ restecg          <int> 2, 0, 1, 1, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0,…
## $ thalach          <int> 136, 142, 117, 130, 133, 150, 123, 174, 152, 146, 143…
## $ exang            <int> 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,…
## $ oldpeak          <dbl> 3.0, 1.2, 3.4, 2.0, 4.0, 1.9, 0.2, 0.0, 0.0, 2.8, 0.0…
## $ slope            <int> 2, 2, 2, 2, 3, 2, 2, 2, 1, 2, 2, 2, 1, 1, 2, 3, 3, 2,…
## $ ca               <int> 0, 0, 0, 1, 2, 2, 0, 1, 2, 2, 0, 2, 0, 0, 0, 2, 3, 1,…
## $ thal             <int> 7, 7, 3, 7, 7, 7, 7, 3, 3, 6, 3, 7, 3, 3, 7, 3, 7, 7,…
## $ num              <int> 2, 2, 2, 3, 3, 2, 1, 1, 3, 2, 1, 3, 1, 1, 1, 3, 3, 2,…
## $ sk               <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ sk_react         <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ reaction         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ mortality        <int> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,…
## $ follow_up        <int> 60, 15, 6, 52, 34, 32, 60, 3, 15, 6, 52, 34, 32, 12, …
# locality: count the raw labels before recoding to 0/1
table(pak_data2[["locality"]])
## 
## RURAL URBAN 
##   134   234
# Recode locality as a numeric indicator: "RURAL" -> 0, "URBAN" -> 1.
# Both labels are mapped explicitly so that an unexpected value becomes NA
# instead of being silently counted as 1. No factor() conversion is needed
# because the comparison works on the character column directly.
pak_data2 <- pak_data2 %>%
  mutate(
    locality = case_when(
      locality == "RURAL" ~ 0,
      locality == "URBAN" ~ 1
    )
  )
# useNA = "ifany" makes any unmapped (NA) rows visible in the check.
table(pak_data2$locality, useNA = "ifany")
## 
##   0   1 
## 134 234
# marital_status: count the raw labels before recoding to 0/1
table(pak_data2[["marital_status"]])
## 
## MARRIED  SINGLE 
##     365       3
# Recode marital_status as a numeric indicator: "MARRIED" -> 0, "SINGLE" -> 1.
# Both labels are mapped explicitly so that an unexpected value becomes NA
# instead of being silently counted as 1.
pak_data2 <- pak_data2 %>%
  mutate(
    marital_status = case_when(
      marital_status == "MARRIED" ~ 0,
      marital_status == "SINGLE" ~ 1
    )
  )
# useNA = "ifany" makes any unmapped (NA) rows visible in the check.
table(pak_data2$marital_status, useNA = "ifany")
## 
##   0   1 
## 365   3
# life_style: count the raw labels before recoding to 0/1
table(pak_data2[["life_style"]])
## 
##  NO YES 
## 151 217
# Recode life_style as a numeric indicator: "NO" -> 0, "YES" -> 1.
# Both labels are mapped explicitly so that an unexpected value becomes NA
# instead of being silently counted as 1.
pak_data2 <- pak_data2 %>%
  mutate(
    life_style = case_when(
      life_style == "NO" ~ 0,
      life_style == "YES" ~ 1
    )
  )
# useNA = "ifany" makes any unmapped (NA) rows visible in the check.
table(pak_data2$life_style, useNA = "ifany")
## 
##   0   1 
## 151 217
# sleep: count the raw labels before recoding to 0/1
table(pak_data2[["sleep"]])
## 
##  NO YES 
## 224 144
# Recode sleep as a numeric indicator: "NO" -> 0, "YES" -> 1.
# Both labels are mapped explicitly so that an unexpected value becomes NA
# instead of being silently counted as 1.
pak_data2 <- pak_data2 %>%
  mutate(
    sleep = case_when(
      sleep == "NO" ~ 0,
      sleep == "YES" ~ 1
    )
  )
# useNA = "ifany" makes any unmapped (NA) rows visible in the check.
table(pak_data2$sleep, useNA = "ifany")
## 
##   0   1 
## 224 144
# category: recode to a numeric indicator ("FREE" -> 0, any other label -> 1),
# then check the resulting counts.
pak_data2 <- mutate(pak_data2, category = ifelse(category == "FREE", 0, 1))
table(pak_data2[["category"]])
## 
##   0   1 
## 331  37
# depression: recode to a numeric indicator ("NO" -> 0, any other label -> 1),
# then check the resulting counts.
pak_data2 <- mutate(pak_data2, depression = ifelse(depression == "NO", 0, 1))
table(pak_data2[["depression"]])
## 
##   0   1 
##  17 351
# hyperlipi (hyperlipidemia): recode to a numeric indicator
# ("NO" -> 0, any other label -> 1), then check the resulting counts.
pak_data2 <- mutate(pak_data2, hyperlipi = ifelse(hyperlipi == "NO", 0, 1))
table(pak_data2[["hyperlipi"]])
## 
##   0   1 
##  27 341
# family_history: recode to a numeric indicator
# ("NO" -> 0, any other label -> 1), then check the resulting counts.
pak_data2 <- mutate(
  pak_data2,
  family_history = ifelse(family_history == "NO", 0, 1)
)
table(pak_data2[["family_history"]])
## 
##   0   1 
## 296  72
# htn: reload the raw labels from the HTN column of pak_data, then recode to a
# numeric indicator ("NO" -> 0, any other label -> 1) and check the counts.
# NOTE(review): copying from pak_data assumes its rows still line up one-to-one
# with pak_data2 -- confirm, or recode pak_data2$htn directly.
pak_data2[["htn"]] <- pak_data[["HTN"]]
pak_data2 <- mutate(pak_data2, htn = ifelse(htn == "NO", 0, 1))
table(pak_data2[["htn"]])
## 
##   0   1 
## 167 201
# allergies: recode to a numeric indicator ("NO" -> 0, any other label -> 1),
# then check the resulting counts.
pak_data2 <- mutate(pak_data2, allergies = ifelse(allergies == "NO", 0, 1))
table(pak_data2[["allergies"]])
## 
##   0   1 
## 357  11
# hypersensitivity: recode to a numeric indicator
# ("NO" -> 0, any other label -> 1), then check the resulting counts.
pak_data2 <- mutate(
  pak_data2,
  hypersensitivity = ifelse(hypersensitivity == "NO", 0, 1)
)
table(pak_data2[["hypersensitivity"]])
## 
##   0   1 
## 356  12
# sk_react: recode to a numeric indicator ("NO" -> 0, any other label -> 1),
# then check the resulting counts.
pak_data2 <- mutate(pak_data2, sk_react = ifelse(sk_react == "NO", 0, 1))
table(pak_data2[["sk_react"]])
## 
##   0   1 
##  93 275
# smoking: recode to a numeric indicator ("NO" -> 0, any other label -> 1),
# then check the resulting counts.
pak_data2 <- mutate(pak_data2, smoking = ifelse(smoking == "NO", 0, 1))
table(pak_data2[["smoking"]])
## 
##   0   1 
## 173 195
# Review column types once all of the 0/1 recodes have been applied.
pak_data2 %>% glimpse()
## Rows: 368
## Columns: 60
## $ age              <int> 45, 51, 55, 55, 56, 56, 57, 57, 58, 58, 59, 60, 60, 6…
## $ age_group        <chr> "41-50", "51-60", "51-60", "51-60", "51-60", "51-60",…
## $ gender           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ locality         <dbl> 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,…
## $ marital_status   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ life_style       <dbl> 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1,…
## $ sleep            <dbl> 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1,…
## $ category         <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,…
## $ depression       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ hyperlipi        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ smoking          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ family_history   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ f_history        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ diabetes         <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,…
## $ htn              <dbl> 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,…
## $ allergies        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ bp               <dbl> 100.6, 90.6, 100.7, 160.1, 90.6, 140.7, 120.8, 100.6,…
## $ thrombolysis     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ bgr              <int> 84, 135, 146, 146, 85, 166, 96, 84, 135, 146, 146, 85…
## $ b_urea           <dbl> 28, 17, 37, 37, 78, 104, 42, 28, 17, 37, 37, 78, 104,…
## $ s_cr             <dbl> 0.9, 0.7, 1.0, 1.0, 1.2, 4.0, 1.0, 0.9, 0.7, 1.0, 1.0…
## $ s_sodium         <int> 138, 144, 137, 137, 139, 130, 146, 138, 144, 137, 137…
## $ s_potassium      <dbl> 3.3, 4.7, 4.2, 4.2, 4.5, 5.3, 3.9, 3.3, 4.7, 4.2, 4.2…
## $ s_chloride       <int> 107, 104, 103, 103, 112, 100, 100, 107, 104, 103, 103…
## $ c_p_k            <int> 130, 163, 149, 149, 75, 322, 146, 130, 163, 149, 149,…
## $ ck_mb            <int> 30, 30, 22, 22, 18, 52, 21, 30, 30, 22, 22, 18, 52, 2…
## $ esr              <int> 11, 27, 19, 19, 13, 154, 25, 11, 27, 19, 19, 13, 154,…
## $ wbc              <int> 9900, 15800, 7900, 7900, 6900, 13500, 7400, 9900, 158…
## $ rbc              <dbl> 4.26, 5.74, 4.83, 4.83, 4.41, 3.90, 4.14, 4.26, 5.74,…
## $ hemoglobin       <dbl> 11.6, 14.5, 14.1, 14.1, 12.3, 10.0, 11.7, 11.6, 14.5,…
## $ p_c_v            <dbl> 0.34, 0.44, 0.42, 0.42, 0.36, 0.29, 0.36, 0.34, 0.44,…
## $ m_c_v            <dbl> 79.7, 78.0, 87.0, 87.0, 82.0, 74.4, 87.0, 79.7, 78.0,…
## $ m_c_h            <dbl> 27.2, 25.0, 29.0, 29.0, 27.0, 25.7, 28.0, 27.2, 25.0,…
## $ m_c_h_c          <dbl> 0.34, 0.32, 0.33, 0.33, 0.33, 0.35, 0.32, 0.34, 0.32,…
## $ platelet_count   <int> 265000, 287000, 183000, 183000, 211000, 288000, 39500…
## $ neutrophil       <dbl> 0.70, 0.73, 0.60, 0.60, 0.71, 0.85, 0.63, 0.70, 0.73,…
## $ lympho           <dbl> 0.25, 0.20, 0.33, 0.33, 0.25, 0.10, 0.31, 0.25, 0.20,…
## $ monocyte         <dbl> 0.03, 0.04, 0.04, 0.04, 0.02, 0.03, 0.03, 0.03, 0.04,…
## $ eosino           <int> 2, 3, 3, 3, 2, 2, 3, 2, 3, 3, 3, 2, 2, 3, 2, 3, 3, 3,…
## $ others           <chr> "no", "no", "LV dysfunction", "HTN", "no", "PND, ORTH…
## $ co               <chr> "Chest pain,", "Central Chest pain,", "Chest pain,SOB…
## $ diagnosis        <chr> "EXT. ACUTE WALL M.I", "A/W M.I", "AC I/W M.I (RV) RE…
## $ hypersensitivity <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ cp               <int> 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 3,…
## $ trestbps         <int> 132, 130, 180, 128, 200, 134, 140, 130, 136, 170, 174…
## $ chol             <int> 341, 305, 327, 205, 288, 409, 241, 236, 319, 225, 249…
## $ fbs              <int> 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ restecg          <int> 2, 0, 1, 1, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0,…
## $ thalach          <int> 136, 142, 117, 130, 133, 150, 123, 174, 152, 146, 143…
## $ exang            <int> 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,…
## $ oldpeak          <dbl> 3.0, 1.2, 3.4, 2.0, 4.0, 1.9, 0.2, 0.0, 0.0, 2.8, 0.0…
## $ slope            <int> 2, 2, 2, 2, 3, 2, 2, 2, 1, 2, 2, 2, 1, 1, 2, 3, 3, 2,…
## $ ca               <int> 0, 0, 0, 1, 2, 2, 0, 1, 2, 2, 0, 2, 0, 0, 0, 2, 3, 1,…
## $ thal             <int> 7, 7, 3, 7, 7, 7, 7, 3, 3, 6, 3, 7, 3, 3, 7, 3, 7, 7,…
## $ num              <int> 2, 2, 2, 3, 3, 2, 1, 1, 3, 2, 1, 3, 1, 1, 1, 3, 3, 2,…
## $ sk               <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ sk_react         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ reaction         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ mortality        <int> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,…
## $ follow_up        <int> 60, 15, 6, 52, 34, 32, 60, 3, 15, 6, 52, 34, 32, 12, …
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
# Descriptive statistics for every column via psych::describe().
# NOTE(review): the printed table also has rows for the text columns
# (age_group, others, co, diagnosis); those figures presumably summarise
# category codes rather than measurements -- interpret with care.
psych::describe(pak_data2)
##                  vars   n      mean       sd    median   trimmed      mad
## age                 1 368     54.29     8.72     55.00     55.21     7.41
## age_group           2 368      3.88     0.95      4.00      3.99     0.74
## gender              3 368      0.77     0.42      1.00      0.84     0.00
## locality            4 368      0.64     0.48      1.00      0.67     0.00
## marital_status      5 368      0.01     0.09      0.00      0.00     0.00
## life_style          6 368      0.59     0.49      1.00      0.61     0.00
## sleep               7 368      0.39     0.49      0.00      0.36     0.00
## category            8 368      0.10     0.30      0.00      0.00     0.00
## depression          9 368      0.95     0.21      1.00      1.00     0.00
## hyperlipi          10 368      0.93     0.26      1.00      1.00     0.00
## smoking            11 368      0.53     0.50      1.00      0.54     0.00
## family_history     12 368      0.20     0.40      0.00      0.12     0.00
## f_history          13 368      0.20     0.40      0.00      0.12     0.00
## diabetes           14 368      0.46     0.50      0.00      0.45     0.00
## htn                15 368      0.55     0.50      1.00      0.56     0.00
## allergies          16 368      0.03     0.17      0.00      0.00     0.00
## bp                 17 368    121.21    24.54    120.80    119.52    29.80
## thrombolysis       18 368      0.03     0.18      0.00      0.00     0.00
## bgr                19 368    219.99   139.34    164.00    197.84   100.82
## b_urea             20 368     51.68    62.58     36.00     38.33    10.38
## s_cr               21 368      1.72     3.61      0.90      0.97     0.15
## s_sodium           22 368    138.02     4.08    138.00    138.06     4.45
## s_potassium        23 368      4.21     0.39      4.20      4.17     0.44
## s_chloride         24 368    103.82     4.80    104.00    103.96     5.93
## c_p_k              25 368    553.89   957.61    188.00    295.55   171.98
## ck_mb              26 368     62.49    89.79     36.00     39.17    22.24
## esr                27 368     26.57    32.58     16.00     18.21    10.38
## wbc                28 368  11181.28  3517.88  10650.00  10953.31  4225.41
## rbc                29 368      5.09     0.77      5.20      5.09     0.97
## hemoglobin         30 368     13.91     2.16     14.20     14.01     2.22
## p_c_v              31 368      0.42     0.06      0.42      0.42     0.07
## m_c_v              32 368     81.57     6.81     82.10     81.99     6.08
## m_c_h              33 368     27.27     3.08     27.90     27.34     2.08
## m_c_h_c            34 368      0.33     0.02      0.33      0.33     0.01
## platelet_count     35 368 248660.33 76707.56 237000.00 242216.22 74130.00
## neutrophil         36 368      2.93    13.40      0.72      0.70     0.13
## lympho             37 368      0.25     0.12      0.21      0.24     0.09
## monocyte           38 368      0.03     0.02      0.03      0.03     0.01
## eosino             39 368      2.26     0.88      2.00      2.21     1.48
## others             40 368     10.63     3.86     13.00     11.07     0.00
## co                 41 368     17.21    10.65     15.00     16.79    10.38
## diagnosis          42 368     17.88    10.57     17.00     17.58    13.34
## hypersensitivity   43 368      0.03     0.18      0.00      0.00     0.00
## cp                 44 368      3.67     0.77      4.00      3.89     0.00
## trestbps           45 368    132.74    18.19    130.00    131.04    14.83
## chol               46 368    248.94    50.13    249.00    247.62    54.86
## fbs                47 368      0.14     0.35      0.00      0.05     0.00
## restecg            48 368      1.07     0.99      2.00      1.09     0.00
## thalach            49 368    140.92    22.99    144.00    141.24    25.20
## exang              50 368      0.56     0.50      1.00      0.58     0.00
## oldpeak            51 368      1.54     1.39      1.20      1.38     1.48
## slope              52 368      1.84     0.56      2.00      1.81     0.00
## ca                 53 368      1.00     1.04      1.00      0.88     1.48
## thal               54 368      5.86     1.74      7.00      6.07     0.00
## num                55 368      2.04     1.03      2.00      1.94     1.48
## sk                 56 368      0.98     0.13      1.00      1.00     0.00
## sk_react           57 368      0.75     0.44      1.00      0.81     0.00
## reaction           58 368      0.75     0.44      1.00      0.81     0.00
## mortality          59 368      0.22     0.41      0.00      0.15     0.00
## follow_up          60 368     28.65    15.81     32.00     28.22    25.20
##                       min       max     range  skew kurtosis      se
## age                 24.00     77.00     53.00 -1.23     2.47    0.45
## age_group            1.00      5.00      4.00 -1.03     1.09    0.05
## gender               0.00      1.00      1.00 -1.31    -0.29    0.02
## locality             0.00      1.00      1.00 -0.56    -1.69    0.03
## marital_status       0.00      1.00      1.00 10.90   117.02    0.00
## life_style           0.00      1.00      1.00 -0.36    -1.87    0.03
## sleep                0.00      1.00      1.00  0.44    -1.81    0.03
## category             0.00      1.00      1.00  2.65     5.01    0.02
## depression           0.00      1.00      1.00 -4.31    16.59    0.01
## hyperlipi            0.00      1.00      1.00 -3.26     8.65    0.01
## smoking              0.00      1.00      1.00 -0.12    -1.99    0.03
## family_history       0.00      1.00      1.00  1.53     0.34    0.02
## f_history            0.00      1.00      1.00  1.53     0.34    0.02
## diabetes             0.00      1.00      1.00  0.15    -1.98    0.03
## htn                  0.00      1.00      1.00 -0.18    -1.97    0.03
## allergies            0.00      1.00      1.00  5.50    28.31    0.01
## bp                  80.50    190.11    109.61  0.59     0.00    1.28
## thrombolysis         0.00      1.00      1.00  5.24    25.54    0.01
## bgr                 60.00    563.00    503.00  1.19     0.48    7.26
## b_urea               2.30    394.00    391.70  4.54    21.55    3.26
## s_cr                 0.60     22.90     22.30  5.50    29.25    0.19
## s_sodium           129.00    146.00     17.00 -0.02    -0.40    0.21
## s_potassium          3.30      5.30      2.00  0.82     0.85    0.02
## s_chloride          90.00    112.00     22.00 -0.45     0.20    0.25
## c_p_k               52.00   4289.00   4237.00  2.86     7.18   49.92
## ck_mb               14.00    505.00    491.00  3.31    11.06    4.68
## esr                  5.00    154.00    149.00  2.94     8.07    1.70
## wbc               5800.00  19590.00  13790.00  0.40    -0.79  183.38
## rbc                  3.46      6.98      3.52 -0.02    -0.78    0.04
## hemoglobin           9.10     18.00      8.90 -0.39    -0.72    0.11
## p_c_v                0.29      0.54      0.25 -0.36    -0.74    0.00
## m_c_v               60.00     96.00     36.00 -0.82     1.90    0.36
## m_c_h               18.00     33.00     15.00 -0.58     1.39    0.16
## m_c_h_c              0.22      0.39      0.17 -2.19    10.09    0.00
## platelet_count   20000.00 459000.00 439000.00  0.46     1.49 3998.66
## neutrophil           0.36     83.00     82.64  5.79    31.63    0.70
## lympho               0.05      0.54      0.49  0.70    -0.21    0.01
## monocyte             0.01      0.08      0.07  0.88     1.05    0.00
## eosino               1.00      5.00      4.00  0.74     1.01    0.05
## others               1.00     17.00     16.00 -0.92    -0.37    0.20
## co                   1.00     37.00     36.00  0.31    -1.21    0.55
## diagnosis            1.00     37.00     36.00  0.20    -1.28    0.55
## hypersensitivity     0.00      1.00      1.00  5.24    25.54    0.01
## cp                   1.00      4.00      3.00 -2.44     4.95    0.04
## trestbps           100.00    200.00    100.00  0.90     0.93    0.95
## chol               131.00    409.00    278.00  0.39     0.27    2.61
## fbs                  0.00      1.00      1.00  2.05     2.21    0.02
## restecg              0.00      2.00      2.00 -0.15    -1.98    0.05
## thalach             71.00    195.00    124.00 -0.15    -0.54    1.20
## exang                0.00      1.00      1.00 -0.25    -1.94    0.03
## oldpeak              0.00      6.20      6.20  1.01     0.93    0.07
## slope                1.00      3.00      2.00 -0.02    -0.12    0.03
## ca                   0.00      3.00      3.00  0.65    -0.81    0.05
## thal                 3.00      7.00      4.00 -0.99    -0.96    0.09
## num                  1.00      4.00      3.00  0.39    -1.23    0.05
## sk                   0.00      1.00      1.00 -7.61    56.03    0.01
## sk_react             0.00      1.00      1.00 -1.13    -0.72    0.02
## reaction             0.00      1.00      1.00 -1.13    -0.72    0.02
## mortality            0.00      1.00      1.00  1.36    -0.14    0.02
## follow_up            1.00     60.00     59.00  0.20    -0.81    0.82
# Preview the first entries of the free-text `others` column.
head(pak_data2[["others"]])
## [1] "no"              "no"              "LV dysfunction"  "HTN"            
## [5] "no"              "PND, ORTHOPENIA"
# Preview the first entries of the free-text `co` column.
head(pak_data2[["co"]])
## [1] "Chest pain,"                   "Central Chest pain,"          
## [3] "Chest pain,SOB, Cold sweating" "CENTRAL Chest pain,"          
## [5] "Chest pain,"                   "SOB FROM 1 DAY "
# Preview the first entries of the free-text `diagnosis` column.
head(pak_data2[["diagnosis"]])
## [1] "EXT. ACUTE WALL M.I"      "A/W M.I"                 
## [3] "AC I/W M.I (RV) RE. M.I " "I/W M.I"                 
## [5] "A/W M.I"                  "ACS, NSTEMI"
# Number of distinct values in `cp`.
length(unique(pak_data2[["cp"]]))
## [1] 4
# Number of distinct values in `co`.
length(unique(pak_data2[["co"]]))
## [1] 37
# Number of distinct values in `diagnosis`.
length(unique(pak_data2[["diagnosis"]]))
## [1] 37
# summary() of the character `diagnosis` column (reports length/class/mode).
summary(pak_data2[["diagnosis"]])
##    Length     Class      Mode 
##       368 character character
# Review the columns and their types once more.
pak_data2 %>% glimpse()
## Rows: 368
## Columns: 60
## $ age              <int> 45, 51, 55, 55, 56, 56, 57, 57, 58, 58, 59, 60, 60, 6…
## $ age_group        <chr> "41-50", "51-60", "51-60", "51-60", "51-60", "51-60",…
## $ gender           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ locality         <dbl> 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,…
## $ marital_status   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ life_style       <dbl> 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1,…
## $ sleep            <dbl> 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1,…
## $ category         <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,…
## $ depression       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ hyperlipi        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ smoking          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ family_history   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ f_history        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ diabetes         <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,…
## $ htn              <dbl> 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,…
## $ allergies        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ bp               <dbl> 100.6, 90.6, 100.7, 160.1, 90.6, 140.7, 120.8, 100.6,…
## $ thrombolysis     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ bgr              <int> 84, 135, 146, 146, 85, 166, 96, 84, 135, 146, 146, 85…
## $ b_urea           <dbl> 28, 17, 37, 37, 78, 104, 42, 28, 17, 37, 37, 78, 104,…
## $ s_cr             <dbl> 0.9, 0.7, 1.0, 1.0, 1.2, 4.0, 1.0, 0.9, 0.7, 1.0, 1.0…
## $ s_sodium         <int> 138, 144, 137, 137, 139, 130, 146, 138, 144, 137, 137…
## $ s_potassium      <dbl> 3.3, 4.7, 4.2, 4.2, 4.5, 5.3, 3.9, 3.3, 4.7, 4.2, 4.2…
## $ s_chloride       <int> 107, 104, 103, 103, 112, 100, 100, 107, 104, 103, 103…
## $ c_p_k            <int> 130, 163, 149, 149, 75, 322, 146, 130, 163, 149, 149,…
## $ ck_mb            <int> 30, 30, 22, 22, 18, 52, 21, 30, 30, 22, 22, 18, 52, 2…
## $ esr              <int> 11, 27, 19, 19, 13, 154, 25, 11, 27, 19, 19, 13, 154,…
## $ wbc              <int> 9900, 15800, 7900, 7900, 6900, 13500, 7400, 9900, 158…
## $ rbc              <dbl> 4.26, 5.74, 4.83, 4.83, 4.41, 3.90, 4.14, 4.26, 5.74,…
## $ hemoglobin       <dbl> 11.6, 14.5, 14.1, 14.1, 12.3, 10.0, 11.7, 11.6, 14.5,…
## $ p_c_v            <dbl> 0.34, 0.44, 0.42, 0.42, 0.36, 0.29, 0.36, 0.34, 0.44,…
## $ m_c_v            <dbl> 79.7, 78.0, 87.0, 87.0, 82.0, 74.4, 87.0, 79.7, 78.0,…
## $ m_c_h            <dbl> 27.2, 25.0, 29.0, 29.0, 27.0, 25.7, 28.0, 27.2, 25.0,…
## $ m_c_h_c          <dbl> 0.34, 0.32, 0.33, 0.33, 0.33, 0.35, 0.32, 0.34, 0.32,…
## $ platelet_count   <int> 265000, 287000, 183000, 183000, 211000, 288000, 39500…
## $ neutrophil       <dbl> 0.70, 0.73, 0.60, 0.60, 0.71, 0.85, 0.63, 0.70, 0.73,…
## $ lympho           <dbl> 0.25, 0.20, 0.33, 0.33, 0.25, 0.10, 0.31, 0.25, 0.20,…
## $ monocyte         <dbl> 0.03, 0.04, 0.04, 0.04, 0.02, 0.03, 0.03, 0.03, 0.04,…
## $ eosino           <int> 2, 3, 3, 3, 2, 2, 3, 2, 3, 3, 3, 2, 2, 3, 2, 3, 3, 3,…
## $ others           <chr> "no", "no", "LV dysfunction", "HTN", "no", "PND, ORTH…
## $ co               <chr> "Chest pain,", "Central Chest pain,", "Chest pain,SOB…
## $ diagnosis        <chr> "EXT. ACUTE WALL M.I", "A/W M.I", "AC I/W M.I (RV) RE…
## $ hypersensitivity <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ cp               <int> 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 3,…
## $ trestbps         <int> 132, 130, 180, 128, 200, 134, 140, 130, 136, 170, 174…
## $ chol             <int> 341, 305, 327, 205, 288, 409, 241, 236, 319, 225, 249…
## $ fbs              <int> 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ restecg          <int> 2, 0, 1, 1, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0,…
## $ thalach          <int> 136, 142, 117, 130, 133, 150, 123, 174, 152, 146, 143…
## $ exang            <int> 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,…
## $ oldpeak          <dbl> 3.0, 1.2, 3.4, 2.0, 4.0, 1.9, 0.2, 0.0, 0.0, 2.8, 0.0…
## $ slope            <int> 2, 2, 2, 2, 3, 2, 2, 2, 1, 2, 2, 2, 1, 1, 2, 3, 3, 2,…
## $ ca               <int> 0, 0, 0, 1, 2, 2, 0, 1, 2, 2, 0, 2, 0, 0, 0, 2, 3, 1,…
## $ thal             <int> 7, 7, 3, 7, 7, 7, 7, 3, 3, 6, 3, 7, 3, 3, 7, 3, 7, 7,…
## $ num              <int> 2, 2, 2, 3, 3, 2, 1, 1, 3, 2, 1, 3, 1, 1, 1, 3, 3, 2,…
## $ sk               <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ sk_react         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ reaction         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ mortality        <int> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,…
## $ follow_up        <int> 60, 15, 6, 52, 34, 32, 60, 3, 15, 6, 52, 34, 32, 12, …
# Frequency of each `cp` code.
table(pak_data2[["cp"]])
## 
##   1   2   3   4 
##  18  15  37 298
# Five-number summary and mean of the `cp` codes.
summary(pak_data2[["cp"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   4.000   4.000   3.671   4.000   4.000
# List all column names of the cleaned data frame.
names(pak_data2)
##  [1] "age"              "age_group"        "gender"           "locality"        
##  [5] "marital_status"   "life_style"       "sleep"            "category"        
##  [9] "depression"       "hyperlipi"        "smoking"          "family_history"  
## [13] "f_history"        "diabetes"         "htn"              "allergies"       
## [17] "bp"               "thrombolysis"     "bgr"              "b_urea"          
## [21] "s_cr"             "s_sodium"         "s_potassium"      "s_chloride"      
## [25] "c_p_k"            "ck_mb"            "esr"              "wbc"             
## [29] "rbc"              "hemoglobin"       "p_c_v"            "m_c_v"           
## [33] "m_c_h"            "m_c_h_c"          "platelet_count"   "neutrophil"      
## [37] "lympho"           "monocyte"         "eosino"           "others"          
## [41] "co"               "diagnosis"        "hypersensitivity" "cp"              
## [45] "trestbps"         "chol"             "fbs"              "restecg"         
## [49] "thalach"          "exang"            "oldpeak"          "slope"           
## [53] "ca"               "thal"             "num"              "sk"              
## [57] "sk_react"         "reaction"         "mortality"        "follow_up"

In Reference to Diabetes

# Chi-squared tests of independence between diabetes and each binary factor
# (gender, smoking, locality, depression). tidy() flattens every test result
# into a one-row tibble holding the statistic, p-value, df and method.
db_gender <- tidy(with(pak_data2, chisq.test(gender, diabetes)))
print(db_gender)
## # A tibble: 1 × 4
##   statistic  p.value parameter method                                           
##       <dbl>    <dbl>     <int> <chr>                                            
## 1      86.4 1.46e-20         1 Pearson's Chi-squared test with Yates' continuit…
db_smoking <- tidy(with(pak_data2, chisq.test(smoking, diabetes)))
print(db_smoking)
## # A tibble: 1 × 4
##   statistic     p.value parameter method                                        
##       <dbl>       <dbl>     <int> <chr>                                         
## 1      26.5 0.000000261         1 Pearson's Chi-squared test with Yates' contin…
db_locality <- tidy(with(pak_data2, chisq.test(locality, diabetes)))
print(db_locality)
## # A tibble: 1 × 4
##   statistic p.value parameter method                                            
##       <dbl>   <dbl>     <int> <chr>                                             
## 1      3.49  0.0617         1 Pearson's Chi-squared test with Yates' continuity…
db_depression <- tidy(with(pak_data2, chisq.test(depression, diabetes)))
print(db_depression)
## # A tibble: 1 × 4
##   statistic p.value parameter method                                            
##       <dbl>   <dbl>     <int> <chr>                                             
## 1     0.104   0.747         1 Pearson's Chi-squared test with Yates' continuity…
library(Matrix)
## Warning: package 'Matrix' was built under R version 4.2.3
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
# Build one labelled p-value string per chi-squared test, echo it, then stack
# the four strings into a single character matrix for side-by-side reading.
g <- paste("Gender: ", db_gender$p.value)
print(g)
## [1] "Gender:  1.4568693974862e-20"
s <- paste("Smoking: ", db_smoking$p.value)
print(s)
## [1] "Smoking:  2.60884464598861e-07"
l <- paste("Locality: ", db_locality$p.value)
print(l)
## [1] "Locality:  0.0617224321212565"
d <- paste("Depression: ", db_depression$p.value)
print(d)
## [1] "Depression:  0.747334758966391"
rbind(g, s, l, d)
##   [,1]                            
## g "Gender:  1.4568693974862e-20"  
## s "Smoking:  2.60884464598861e-07"
## l "Locality:  0.0617224321212565" 
## d "Depression:  0.747334758966391"
#perform t test for continuous variables
# Two-sample t-test of age between the two diabetes groups (0 vs 1). With
# t.test()'s defaults this is the Welch (unequal-variance) form, as the printed
# result confirms. The full `pak_data2$...` expressions are kept in the formula
# because the printed "data:" line echoes them verbatim.
db_age <- t.test(pak_data2$age ~ pak_data2$diabetes)
print(db_age)
## 
##  Welch Two Sample t-test
## 
## data:  pak_data2$age by pak_data2$diabetes
## t = -4.6313, df = 362.3, p-value = 5.073e-06
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -5.748912 -2.321914
## sample estimates:
## mean in group 0 mean in group 1 
##        52.42929        56.46471
# Welch two-sample t-test of bp between diabetes groups 0 and 1; the htest
# object is stored so its p-value can be collected later.
db_bp <- t.test(pak_data2$bp ~ pak_data2$diabetes)
print(db_bp)
## 
##  Welch Two Sample t-test
## 
## data:  pak_data2$bp by pak_data2$diabetes
## t = -0.26399, df = 364.3, p-value = 0.7919
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -5.693527  4.345803
## sample estimates:
## mean in group 0 mean in group 1 
##        120.9020        121.5759
# Welch two-sample t-test of platelet_count between diabetes groups 0 and 1.
# The confidence interval is on the raw platelet_count scale, hence the large
# magnitudes in the printed result.
db_platelet_count <- t.test(pak_data2$platelet_count ~ pak_data2$diabetes)
print(db_platelet_count)
## 
##  Welch Two Sample t-test
## 
## data:  pak_data2$platelet_count by pak_data2$diabetes
## t = -3.826, df = 316.32, p-value = 0.000157
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -46465.84 -14905.64
## sample estimates:
## mean in group 0 mean in group 1 
##        234484.8        265170.6
# Welch two-sample t-test of trestbps between diabetes groups 0 and 1.
db_trestbps <- t.test(pak_data2$trestbps ~ pak_data2$diabetes)
print(db_trestbps)
## 
##  Welch Two Sample t-test
## 
## data:  pak_data2$trestbps by pak_data2$diabetes
## t = -1.7871, df = 340.78, p-value = 0.0748
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -7.1856547  0.3441812
## sample estimates:
## mean in group 0 mean in group 1 
##        131.1616        134.5824
# Welch two-sample t-test of chol between diabetes groups 0 and 1.
db_choles <- t.test(pak_data2$chol ~ pak_data2$diabetes)
  print(db_choles)
## 
##  Welch Two Sample t-test
## 
## data:  pak_data2$chol by pak_data2$diabetes
## t = -1.6576, df = 315.17, p-value = 0.0984
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -19.323166   1.652102
## sample estimates:
## mean in group 0 mean in group 1 
##        244.8586        253.6941
# Welch two-sample t-test of thalach between diabetes groups 0 and 1.
# Note this tests the thalach column, not the separate thal column.
db_thalach <- t.test(pak_data2$thalach ~ pak_data2$diabetes)
  print(db_thalach)
## 
##  Welch Two Sample t-test
## 
## data:  pak_data2$thalach by pak_data2$diabetes
## t = 0.3394, df = 365.55, p-value = 0.7345
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -3.881886  5.501375
## sample estimates:
## mean in group 0 mean in group 1 
##        141.2980        140.4882
# Welch two-sample t-test of follow_up between diabetes groups 0 and 1.
db_follow_up <- t.test(pak_data2$follow_up ~ pak_data2$diabetes)
  print(db_follow_up)
## 
##  Welch Two Sample t-test
## 
## data:  pak_data2$follow_up by pak_data2$diabetes
## t = 0.91337, df = 333.37, p-value = 0.3617
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -1.764185  4.822533
## sample estimates:
## mean in group 0 mean in group 1 
##        29.35859        27.82941
# Collect the t-test p-values as labelled strings. print() returns its argument
# invisibly, so each string is echoed and stored in one step.
# Labels name the variable that was actually tested: "Cholesterol" (spelling
# fixed) and "Thalach" (the test used the thalach column; thal is a different
# column in pak_data2).
a <- print(paste("Age: ", db_age$p.value))
## [1] "Age:  5.07266425232931e-06"
bp <- print(paste("BP: ", db_bp$p.value))
## [1] "BP:  0.79193587394789"
pc <- print(paste("Platelet_Count: ", db_platelet_count$p.value))
## [1] "Platelet_Count:  0.000156959257819211"
trest <- print(paste("Resting: ", db_trestbps$p.value))
## [1] "Resting:  0.0748043082835425"
chol <- print(paste("Cholesterol: ", db_choles$p.value))
## [1] "Cholesterol:  0.0983974805094631"
thal <- print(paste("Thalach: ", db_thalach$p.value))
## [1] "Thalach:  0.734502881239856"
follow <- print(paste("Follow-Up: ", db_follow_up$p.value))
## [1] "Follow-Up:  0.361709498356924"
# Stack the strings into a one-column character matrix for side-by-side reading
rbind(a, bp, pc, trest, chol, thal, follow)
##        [,1]                                   
## a      "Age:  5.07266425232931e-06"           
## bp     "BP:  0.79193587394789"                
## pc     "Platelet_Count:  0.000156959257819211"
## trest  "Resting:  0.0748043082835425"         
## chol   "Cholesterol:  0.0983974805094631"     
## thal   "Thalach:  0.734502881239856"          
## follow "Follow-Up:  0.361709498356924"

#Perform graphical association analysis for continuous

# Human-readable diabetes status used on the x axis of the plots.
# diabetes is coded 0/1 and 1 is treated as the diabetic outcome everywhere
# else in the analysis (t-test groups, glm response, pred_diabetes), so 0 must
# map to "Non_Diabetic". The previous mapping had the two labels swapped.
pak_data2 <- pak_data2 %>%
  mutate(diabetes_labelled = ifelse(diabetes == 0, "Non_Diabetic", "Diabetic"))

# One box plot per continuous measure, split by diabetes status

# age
pak_data2 %>%
  ggplot(aes(x = diabetes_labelled, y = age)) +
  geom_boxplot()

# bp
pak_data2 %>%
  ggplot(aes(x = diabetes_labelled, y = bp)) +
  geom_boxplot()

# platelet_count
pak_data2 %>%
  ggplot(aes(x = diabetes_labelled, y = platelet_count)) +
  geom_boxplot()

# trestbps
pak_data2 %>%
  ggplot(aes(x = diabetes_labelled, y = trestbps)) +
  geom_boxplot()

# chol
pak_data2 %>%
  ggplot(aes(x = diabetes_labelled, y = chol)) +
  geom_boxplot()

# thalach
pak_data2 %>%
  ggplot(aes(x = diabetes_labelled, y = thalach)) +
  geom_boxplot()

# follow_up
pak_data2 %>%
  ggplot(aes(x = diabetes_labelled, y = follow_up)) +
  geom_boxplot()

#Perform graphical association analysis for binary data

# Stacked bar charts showing, within each diabetes group, the share of each
# level of a binary factor. position = "fill" rescales every bar to 1, so the
# y axis is formatted as a percentage to match the "%" axis titles.
# Each plot carries its own axis title and legend title (previously all four
# were labelled "Gender %" and the legend showed the raw as.factor() call).
# NOTE(review): the legend labels assume level 0 is the first label and level 1
# the second (e.g. 0 = Female, 1 = Male); confirm against the recoding step.

# gender
ggplot(data = pak_data2, aes(x = diabetes_labelled, fill = as.factor(gender))) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::label_percent()) +
  ylab("Gender %") +
  scale_fill_discrete(name = "Gender", labels = c("Female", "Male"))

# smoking
ggplot(data = pak_data2, aes(x = diabetes_labelled, fill = as.factor(smoking))) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::label_percent()) +
  ylab("Smoking %") +
  scale_fill_discrete(name = "Smoking", labels = c("Non-Smoker", "Smoker"))

# locality
ggplot(data = pak_data2, aes(x = diabetes_labelled, fill = as.factor(locality))) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::label_percent()) +
  ylab("Locality %") +
  scale_fill_discrete(name = "Locality", labels = c("Rural", "Urban"))

# depression
ggplot(data = pak_data2, aes(x = diabetes_labelled, fill = as.factor(depression))) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::label_percent()) +
  ylab("Depression %") +
  scale_fill_discrete(name = "Depression", labels = c("Not Depressed", "Depressed"))

library(tidyverse)
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(DT)
## Warning: package 'DT' was built under R version 4.2.3
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.2.3
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.2.3
library(lubridate)
library(scales)
## 
## Attaching package: 'scales'
## The following objects are masked from 'package:psych':
## 
##     alpha, rescale
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
library(janitor)
library(RColorBrewer)

Shortlist the significant variables

# Plots for the four variables that tested significant, stored as objects so
# they can be arranged on one page.

# age
dbage <- ggplot(data = pak_data2, aes(x = diabetes_labelled, y = age)) +
  geom_boxplot()

# platelet_count
dbpc <- ggplot(data = pak_data2, aes(x = diabetes_labelled, y = platelet_count)) +
  geom_boxplot()

# gender: position = "fill" makes each bar a within-group share, which is what
# the "%" axis title promises (a plain geom_bar() would plot raw counts)
dbgender <- ggplot(data = pak_data2, aes(x = diabetes_labelled, fill = as.factor(gender))) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::label_percent()) +
  ylab("Gender %") +
  scale_fill_discrete(name = "Gender", labels = c("Female", "Male"))

# smoking: same layout, with its own axis title instead of the copied "Gender %"
dbsmoking <- ggplot(data = pak_data2, aes(x = diabetes_labelled, fill = as.factor(smoking))) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::label_percent()) +
  ylab("Smoking %") +
  scale_fill_discrete(name = "Smoking", labels = c("Non-Smoker", "Smoker"))

library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.2.3
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Draw the four shortlisted plots together on one page
gridExtra::grid.arrange(dbage, dbpc, dbgender, dbsmoking)

# List the columns again; diabetes_labelled is now the last one
names(pak_data2)
##  [1] "age"               "age_group"         "gender"           
##  [4] "locality"          "marital_status"    "life_style"       
##  [7] "sleep"             "category"          "depression"       
## [10] "hyperlipi"         "smoking"           "family_history"   
## [13] "f_history"         "diabetes"          "htn"              
## [16] "allergies"         "bp"                "thrombolysis"     
## [19] "bgr"               "b_urea"            "s_cr"             
## [22] "s_sodium"          "s_potassium"       "s_chloride"       
## [25] "c_p_k"             "ck_mb"             "esr"              
## [28] "wbc"               "rbc"               "hemoglobin"       
## [31] "p_c_v"             "m_c_v"             "m_c_h"            
## [34] "m_c_h_c"           "platelet_count"    "neutrophil"       
## [37] "lympho"            "monocyte"          "eosino"           
## [40] "others"            "co"                "diagnosis"        
## [43] "hypersensitivity"  "cp"                "trestbps"         
## [46] "chol"              "fbs"               "restecg"          
## [49] "thalach"           "exang"             "oldpeak"          
## [52] "slope"             "ca"                "thal"             
## [55] "num"               "sk"                "sk_react"         
## [58] "reaction"          "mortality"         "follow_up"        
## [61] "diabetes_labelled"

#Put variables into one model The plots and statistical tests both confirmed that these 4 variables are highly significantly associated with the outcome (diabetes). With a binary outcome variable and more than two predictor variables, a logistic regression model is used. For example, our objective is to know how likely a person of a given age, platelet count, gender and smoking status is to have diabetes. The glm() command is designed to fit generalized linear models (regressions) on binary outcome data, count data, probability data, proportion data, and many other data types. In our case, the outcome is binary and follows a binomial distribution.

# Fit a logistic regression (binomial family) of diabetes on the four
# shortlisted predictors; gender and smoking enter the formula as factors.
db_model <- glm(
  diabetes ~ age + platelet_count + as.factor(gender) + as.factor(smoking),
  family = "binomial",
  data = pak_data2
)

# Show the coefficient table, deviances and AIC
summary(db_model)
## 
## Call:
## glm(formula = diabetes ~ age + platelet_count + as.factor(gender) + 
##     as.factor(smoking), family = "binomial", data = pak_data2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3046  -0.8909  -0.6978   1.0215   1.7525  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          2.170e-01  1.087e+00   0.200  0.84173    
## age                  1.347e-02  1.501e-02   0.898  0.36942    
## platelet_count       5.333e-06  1.722e-06   3.097  0.00195 ** 
## as.factor(gender)1  -2.834e+00  4.757e-01  -5.957 2.57e-09 ***
## as.factor(smoking)1 -1.800e-01  2.864e-01  -0.628  0.52978    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 508.02  on 367  degrees of freedom
## Residual deviance: 398.51  on 363  degrees of freedom
## AIC: 408.51
## 
## Number of Fisher Scoring iterations: 5

#Extract the useful information from the model It’s common practice in medical research to report Odds Ratio (OR) to quantify how strongly the presence or absence of property A is associated with the presence or absence of the outcome. When the OR is greater than 1, we say A is positively associated with outcome B (increases the Odds of having B). Otherwise, we say A is negatively associated with B (decreases the Odds of having B).

The raw glm coefficient table (the ‘estimate’ column in the printed output) in R represents the log(Odds Ratios) of the outcome. Therefore, we need to convert the values to the original OR scale and calculate the corresponding 95% Confidence Interval (CI) of the estimated Odds Ratios when reporting results from a logistic regression.

#load broom package
library("broom")

# Convert the coefficient table into a tibble with one row per model term
clean_dbmodel <- db_model %>% tidy()
clean_dbmodel
## # A tibble: 5 × 5
##   term                   estimate  std.error statistic       p.value
##   <chr>                     <dbl>      <dbl>     <dbl>         <dbl>
## 1 (Intercept)          0.217      1.09           0.200 0.842        
## 2 age                  0.0135     0.0150         0.898 0.369        
## 3 platelet_count       0.00000533 0.00000172     3.10  0.00195      
## 4 as.factor(gender)1  -2.83       0.476         -5.96  0.00000000257
## 5 as.factor(smoking)1 -0.180      0.286         -0.628 0.530
# Move from the log-odds scale to odds ratios: exponentiate the estimate for
# the OR, and exponentiate estimate -/+ 1.96 standard errors for the bounds of
# the 95% confidence interval.
clean_dbmodel <- clean_dbmodel %>%
  mutate(
    OR = exp(estimate),
    lower_CI = exp(estimate - 1.96 * std.error),
    upper_CI = exp(estimate + 1.96 * std.error)
  )

# Coefficient table with the OR and CI columns appended
clean_dbmodel
## # A tibble: 5 × 8
##   term             estimate std.error statistic p.value     OR lower_CI upper_CI
##   <chr>               <dbl>     <dbl>     <dbl>   <dbl>  <dbl>    <dbl>    <dbl>
## 1 (Intercept)       2.17e-1   1.09e+0     0.200 8.42e-1 1.24     0.148    10.5  
## 2 age               1.35e-2   1.50e-2     0.898 3.69e-1 1.01     0.984     1.04 
## 3 platelet_count    5.33e-6   1.72e-6     3.10  1.95e-3 1.00     1.00      1.00 
## 4 as.factor(gende… -2.83e+0   4.76e-1    -5.96  2.57e-9 0.0588   0.0231    0.149
## 5 as.factor(smoki… -1.80e-1   2.86e-1    -0.628 5.30e-1 0.835    0.476     1.46

#Predicted probabilities from our model

So far, we have built a logistic regression model and examined the model coefficients/ORs. We may wonder how we can use this model to predict a person’s likelihood of having diabetes given his/her age, platelet count, gender, and smoking status. Furthermore, we’d like to translate the predicted probability into a decision rule for clinical use by defining a cutoff value on the probability scale. In practice, when an individual comes in for a health check-up, the doctor would like to know the predicted probability of diabetes. To obtain it, we create a data frame called newdata, in which we include the desired values for our prediction.

# Fitted probability of diabetes for every row of the analysis data
pred_prob <- predict(db_model, newdata = pak_data2, type = "response")

# Decision rule: probabilities below the 0.5 cutoff are classed 0, the rest 1;
# the predicted class is stored alongside the observed data
pak_data2[["pred_diabetes"]] <- ifelse(pred_prob < 0.5, 0, 1)

# A single hypothetical case, described by the four model predictors
newdata <- data.frame(
  age = 25,
  platelet_count = 300000,
  gender = 1,
  smoking = 0
)

# Predicted probability of diabetes for that case
p_new <- predict(db_model, newdata = newdata, type = "response")
p_new
##         1 
## 0.3362089

#Check the model performance metrics Are the predictions accurate? How well does the model fit our data? We are going to use some common metrics to evaluate the model performance. The most straightforward one is accuracy, which is the proportion of the total number of predictions that were correct. Conversely, the classification error rate is 1 - accuracy. However, accuracy can be misleading when the response is rare (i.e., an imbalanced response). Another popular metric, the Area Under the ROC Curve (AUC), has the advantage that it is independent of changes in the proportion of responders. AUC ranges from 0 to 1; the closer it gets to 1, the better the model performance. Lastly, a confusion matrix is an N x N matrix, where N is the number of levels of the outcome. For the problem at hand, we have N = 2, and hence we get a 2 x 2 matrix. It cross-tabulates the predicted outcome levels against the true outcome levels.

#load the metrics package
library(Metrics)
## Warning: package 'Metrics' was built under R version 4.2.3
#calculate auc, accuracy, classification error
# AUC measures how well the model ranks cases, so it is scored on the predicted
# probabilities (pred_prob). Scoring it on the thresholded 0/1 classes reduces
# it to the balanced accuracy of the 0.5 cutoff.
# Accuracy and classification error describe the 0.5-cutoff decision itself,
# so they keep using pred_diabetes.
# NOTE(review): re-knit to refresh the printed AUC value, which was produced
# from the thresholded classes.
auc <- Metrics::auc(pak_data2$diabetes, pred_prob)
accuracy <- Metrics::accuracy(pak_data2$diabetes, pak_data2$pred_diabetes)
classification_error <- Metrics::ce(pak_data2$diabetes, pak_data2$pred_diabetes)

# Echo each metric as a labelled string
paste("AUC= ", auc) %>% print()
## [1] "AUC=  0.75668449197861"
paste("Accuracy= ", accuracy) %>% print()
## [1] "Accuracy=  0.771739130434783"
paste("Classification Error= ", classification_error) %>% print()
## [1] "Classification Error=  0.228260869565217"
# Confusion matrix: rows are the observed status, columns the predicted class
with(pak_data2, table(diabetes, pred_diabetes, dnn = c("True Status", "Predicted Status")))
##            Predicted Status
## True Status   0   1
##           0 189   9
##           1  75  95