library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyr)
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(broom)
## Warning: package 'broom' was built under R version 4.2.3
# Read the raw records from the CSV file and preview the first six rows
pak_data <- read.csv(file = "FIC_Full.csv")
head(pak_data, n = 6L)
## Age Age.Group Gender Locality Marital.status Life.Style Sleep Category
## 1 45 41-50 Female RURAL MARRIED NO NO FREE
## 2 51 51-60 Female URBAN MARRIED NO NO FREE
## 3 55 51-60 Female RURAL MARRIED YES YES FREE
## 4 55 51-60 Female RURAL MARRIED YES YES FREE
## 5 56 51-60 Female RURAL MARRIED YES NO FREE
## 6 56 51-60 Female URBAN MARRIED NO NO FREE
## Depression Hyperlipi Smoking Family.History F.History Diabetes HTN Allergies
## 1 YES YES NO NO 0 1 NO NO
## 2 YES YES NO NO 0 0 NO NO
## 3 YES YES NO NO 0 1 YES NO
## 4 YES YES NO NO 0 1 YES NO
## 5 YES YES NO NO 0 1 YES NO
## 6 YES YES NO NO 0 1 YES NO
## BP Thrombolysis BGR B.Urea S.Cr S.Sodium S.Potassium S.Chloride C.P.K
## 1 100.6 0 84 28 0.9 138 3.3 107 130
## 2 90.6 0 135 17 0.7 144 4.7 104 163
## 3 100.7 0 146 37 1.0 137 4.2 103 149
## 4 160.1 0 146 37 1.0 137 4.2 103 149
## 5 90.6 0 85 78 1.2 139 4.5 112 75
## 6 140.7 0 166 104 4.0 130 5.3 100 322
## CK.MB ESR WBC RBC Hemoglobin P.C.V M.C.V M.C.H M.C.H.C PLATELET_COUNT
## 1 30 11 9900 4.26 11.6 0.34 79.7 27.2 0.34 265000
## 2 30 27 15800 5.74 14.5 0.44 78.0 25.0 0.32 287000
## 3 22 19 7900 4.83 14.1 0.42 87.0 29.0 0.33 183000
## 4 22 19 7900 4.83 14.1 0.42 87.0 29.0 0.33 183000
## 5 18 13 6900 4.41 12.3 0.36 82.0 27.0 0.33 211000
## 6 52 154 13500 3.90 10.0 0.29 74.4 25.7 0.35 288000
## NEUTROPHIL LYMPHO MONOCYTE EOSINO Others
## 1 0.70 0.25 0.03 2 no
## 2 0.73 0.20 0.04 3 no
## 3 0.60 0.33 0.04 3 LV dysfunction
## 4 0.60 0.33 0.04 3 HTN
## 5 0.71 0.25 0.02 2 no
## 6 0.85 0.10 0.03 2 PND, ORTHOPENIA
## CO Diagnosis Hypersensitivity cp
## 1 Chest pain, EXT. ACUTE WALL M.I NO 4
## 2 Central Chest pain, A/W M.I NO 4
## 3 Chest pain,SOB, Cold sweating AC I/W M.I (RV) RE. M.I NO 4
## 4 CENTRAL Chest pain, I/W M.I NO 4
## 5 Chest pain, A/W M.I NO 4
## 6 SOB FROM 1 DAY ACS, NSTEMI NO 4
## trestbps chol fbs restecg thalach exang oldpeak slope ca thal num SK SK.React
## 1 132 341 1 2 136 1 3.0 2 0 7 2 1 NO
## 2 130 305 0 0 142 1 1.2 2 0 7 2 1 NO
## 3 180 327 0 1 117 1 3.4 2 0 3 2 1 NO
## 4 128 205 0 1 130 1 2.0 2 1 7 3 1 NO
## 5 200 288 1 2 133 1 4.0 3 2 7 3 1 NO
## 6 134 409 0 2 150 1 1.9 2 2 7 2 1 NO
## Reaction Mortality Follow.Up
## 1 0 0 60
## 2 0 0 15
## 3 0 0 6
## 4 0 0 52
## 5 0 0 34
## 6 0 1 32
# List the column names of the imported data
names(pak_data)
## [1] "Age" "Age.Group" "Gender" "Locality"
## [5] "Marital.status" "Life.Style" "Sleep" "Category"
## [9] "Depression" "Hyperlipi" "Smoking" "Family.History"
## [13] "F.History" "Diabetes" "HTN" "Allergies"
## [17] "BP" "Thrombolysis" "BGR" "B.Urea"
## [21] "S.Cr" "S.Sodium" "S.Potassium" "S.Chloride"
## [25] "C.P.K" "CK.MB" "ESR" "WBC"
## [29] "RBC" "Hemoglobin" "P.C.V" "M.C.V"
## [33] "M.C.H" "M.C.H.C" "PLATELET_COUNT" "NEUTROPHIL"
## [37] "LYMPHO" "MONOCYTE" "EOSINO" "Others"
## [41] "CO" "Diagnosis" "Hypersensitivity" "cp"
## [45] "trestbps" "chol" "fbs" "restecg"
## [49] "thalach" "exang" "oldpeak" "slope"
## [53] "ca" "thal" "num" "SK"
## [57] "SK.React" "Reaction" "Mortality" "Follow.Up"
# Count the missing values in every column (returns a named list)
map(pak_data, function(column) sum(is.na(column)))
## $Age
## [1] 0
##
## $Age.Group
## [1] 0
##
## $Gender
## [1] 0
##
## $Locality
## [1] 0
##
## $Marital.status
## [1] 0
##
## $Life.Style
## [1] 0
##
## $Sleep
## [1] 0
##
## $Category
## [1] 0
##
## $Depression
## [1] 0
##
## $Hyperlipi
## [1] 0
##
## $Smoking
## [1] 0
##
## $Family.History
## [1] 0
##
## $F.History
## [1] 0
##
## $Diabetes
## [1] 0
##
## $HTN
## [1] 0
##
## $Allergies
## [1] 0
##
## $BP
## [1] 0
##
## $Thrombolysis
## [1] 0
##
## $BGR
## [1] 0
##
## $B.Urea
## [1] 0
##
## $S.Cr
## [1] 0
##
## $S.Sodium
## [1] 0
##
## $S.Potassium
## [1] 0
##
## $S.Chloride
## [1] 0
##
## $C.P.K
## [1] 0
##
## $CK.MB
## [1] 0
##
## $ESR
## [1] 0
##
## $WBC
## [1] 0
##
## $RBC
## [1] 0
##
## $Hemoglobin
## [1] 0
##
## $P.C.V
## [1] 0
##
## $M.C.V
## [1] 0
##
## $M.C.H
## [1] 0
##
## $M.C.H.C
## [1] 0
##
## $PLATELET_COUNT
## [1] 0
##
## $NEUTROPHIL
## [1] 0
##
## $LYMPHO
## [1] 0
##
## $MONOCYTE
## [1] 0
##
## $EOSINO
## [1] 0
##
## $Others
## [1] 0
##
## $CO
## [1] 0
##
## $Diagnosis
## [1] 0
##
## $Hypersensitivity
## [1] 0
##
## $cp
## [1] 0
##
## $trestbps
## [1] 0
##
## $chol
## [1] 0
##
## $fbs
## [1] 0
##
## $restecg
## [1] 0
##
## $thalach
## [1] 0
##
## $exang
## [1] 0
##
## $oldpeak
## [1] 0
##
## $slope
## [1] 0
##
## $ca
## [1] 0
##
## $thal
## [1] 0
##
## $num
## [1] 0
##
## $SK
## [1] 0
##
## $SK.React
## [1] 0
##
## $Reaction
## [1] 0
##
## $Mortality
## [1] 0
##
## $Follow.Up
## [1] 0
# Count the distinct values in every column (returns a named list).
# n_distinct() already yields a single count per column, so wrapping it in
# sum() added nothing and has been dropped.
pak_data %>% map(n_distinct)
## $Age
## [1] 31
##
## $Age.Group
## [1] 5
##
## $Gender
## [1] 2
##
## $Locality
## [1] 2
##
## $Marital.status
## [1] 2
##
## $Life.Style
## [1] 2
##
## $Sleep
## [1] 2
##
## $Category
## [1] 2
##
## $Depression
## [1] 2
##
## $Hyperlipi
## [1] 2
##
## $Smoking
## [1] 2
##
## $Family.History
## [1] 2
##
## $F.History
## [1] 2
##
## $Diabetes
## [1] 2
##
## $HTN
## [1] 2
##
## $Allergies
## [1] 2
##
## $BP
## [1] 17
##
## $Thrombolysis
## [1] 2
##
## $BGR
## [1] 38
##
## $B.Urea
## [1] 28
##
## $S.Cr
## [1] 13
##
## $S.Sodium
## [1] 17
##
## $S.Potassium
## [1] 15
##
## $S.Chloride
## [1] 14
##
## $C.P.K
## [1] 44
##
## $CK.MB
## [1] 31
##
## $ESR
## [1] 22
##
## $WBC
## [1] 31
##
## $RBC
## [1] 32
##
## $Hemoglobin
## [1] 32
##
## $P.C.V
## [1] 19
##
## $M.C.V
## [1] 27
##
## $M.C.H
## [1] 22
##
## $M.C.H.C
## [1] 8
##
## $PLATELET_COUNT
## [1] 36
##
## $NEUTROPHIL
## [1] 27
##
## $LYMPHO
## [1] 24
##
## $MONOCYTE
## [1] 8
##
## $EOSINO
## [1] 5
##
## $Others
## [1] 17
##
## $CO
## [1] 37
##
## $Diagnosis
## [1] 37
##
## $Hypersensitivity
## [1] 2
##
## $cp
## [1] 4
##
## $trestbps
## [1] 39
##
## $chol
## [1] 97
##
## $fbs
## [1] 2
##
## $restecg
## [1] 3
##
## $thalach
## [1] 71
##
## $exang
## [1] 2
##
## $oldpeak
## [1] 35
##
## $slope
## [1] 3
##
## $ca
## [1] 4
##
## $thal
## [1] 3
##
## $num
## [1] 4
##
## $SK
## [1] 2
##
## $SK.React
## [1] 7
##
## $Reaction
## [1] 2
##
## $Mortality
## [1] 2
##
## $Follow.Up
## [1] 22
# Compact overview of the raw data: dimensions, column types, first values
pak_data %>% glimpse()
## Rows: 368
## Columns: 60
## $ Age <int> 45, 51, 55, 55, 56, 56, 57, 57, 58, 58, 59, 60, 60, 6…
## $ Age.Group <chr> "41-50", "51-60", "51-60", "51-60", "51-60", "51-60",…
## $ Gender <chr> "Female", "Female", "Female", "Female", "Female", "Fe…
## $ Locality <chr> "RURAL", "URBAN", "RURAL", "RURAL", "RURAL", "URBAN",…
## $ Marital.status <chr> "MARRIED", "MARRIED", "MARRIED", "MARRIED", "MARRIED"…
## $ Life.Style <chr> "NO", "NO", "YES", "YES", "YES", "NO", "YES", "NO", "…
## $ Sleep <chr> "NO", "NO", "YES", "YES", "NO", "NO", "YES", "NO", "N…
## $ Category <chr> "FREE", "FREE", "FREE", "FREE", "FREE", "FREE", "PAID…
## $ Depression <chr> "YES", "YES", "YES", "YES", "YES", "YES", "YES", "YES…
## $ Hyperlipi <chr> "YES", "YES", "YES", "YES", "YES", "YES", "YES", "YES…
## $ Smoking <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ Family.History <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ F.History <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Diabetes <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,…
## $ HTN <chr> "NO", "NO", "YES", "YES", "YES", "YES", "YES", "NO", …
## $ Allergies <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ BP <dbl> 100.6, 90.6, 100.7, 160.1, 90.6, 140.7, 120.8, 100.6,…
## $ Thrombolysis <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ BGR <int> 84, 135, 146, 146, 85, 166, 96, 84, 135, 146, 146, 85…
## $ B.Urea <dbl> 28, 17, 37, 37, 78, 104, 42, 28, 17, 37, 37, 78, 104,…
## $ S.Cr <dbl> 0.9, 0.7, 1.0, 1.0, 1.2, 4.0, 1.0, 0.9, 0.7, 1.0, 1.0…
## $ S.Sodium <int> 138, 144, 137, 137, 139, 130, 146, 138, 144, 137, 137…
## $ S.Potassium <dbl> 3.3, 4.7, 4.2, 4.2, 4.5, 5.3, 3.9, 3.3, 4.7, 4.2, 4.2…
## $ S.Chloride <int> 107, 104, 103, 103, 112, 100, 100, 107, 104, 103, 103…
## $ C.P.K <int> 130, 163, 149, 149, 75, 322, 146, 130, 163, 149, 149,…
## $ CK.MB <int> 30, 30, 22, 22, 18, 52, 21, 30, 30, 22, 22, 18, 52, 2…
## $ ESR <int> 11, 27, 19, 19, 13, 154, 25, 11, 27, 19, 19, 13, 154,…
## $ WBC <int> 9900, 15800, 7900, 7900, 6900, 13500, 7400, 9900, 158…
## $ RBC <dbl> 4.26, 5.74, 4.83, 4.83, 4.41, 3.90, 4.14, 4.26, 5.74,…
## $ Hemoglobin <dbl> 11.6, 14.5, 14.1, 14.1, 12.3, 10.0, 11.7, 11.6, 14.5,…
## $ P.C.V <dbl> 0.34, 0.44, 0.42, 0.42, 0.36, 0.29, 0.36, 0.34, 0.44,…
## $ M.C.V <dbl> 79.7, 78.0, 87.0, 87.0, 82.0, 74.4, 87.0, 79.7, 78.0,…
## $ M.C.H <dbl> 27.2, 25.0, 29.0, 29.0, 27.0, 25.7, 28.0, 27.2, 25.0,…
## $ M.C.H.C <dbl> 0.34, 0.32, 0.33, 0.33, 0.33, 0.35, 0.32, 0.34, 0.32,…
## $ PLATELET_COUNT <int> 265000, 287000, 183000, 183000, 211000, 288000, 39500…
## $ NEUTROPHIL <dbl> 0.70, 0.73, 0.60, 0.60, 0.71, 0.85, 0.63, 0.70, 0.73,…
## $ LYMPHO <dbl> 0.25, 0.20, 0.33, 0.33, 0.25, 0.10, 0.31, 0.25, 0.20,…
## $ MONOCYTE <dbl> 0.03, 0.04, 0.04, 0.04, 0.02, 0.03, 0.03, 0.03, 0.04,…
## $ EOSINO <int> 2, 3, 3, 3, 2, 2, 3, 2, 3, 3, 3, 2, 2, 3, 2, 3, 3, 3,…
## $ Others <chr> "no", "no", "LV dysfunction", "HTN", "no", "PND, ORTH…
## $ CO <chr> "Chest pain,", "Central Chest pain,", "Chest pain,SOB…
## $ Diagnosis <chr> "EXT. ACUTE WALL M.I", "A/W M.I", "AC I/W M.I (RV) RE…
## $ Hypersensitivity <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ cp <int> 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 3,…
## $ trestbps <int> 132, 130, 180, 128, 200, 134, 140, 130, 136, 170, 174…
## $ chol <int> 341, 305, 327, 205, 288, 409, 241, 236, 319, 225, 249…
## $ fbs <int> 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ restecg <int> 2, 0, 1, 1, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0,…
## $ thalach <int> 136, 142, 117, 130, 133, 150, 123, 174, 152, 146, 143…
## $ exang <int> 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,…
## $ oldpeak <dbl> 3.0, 1.2, 3.4, 2.0, 4.0, 1.9, 0.2, 0.0, 0.0, 2.8, 0.0…
## $ slope <int> 2, 2, 2, 2, 3, 2, 2, 2, 1, 2, 2, 2, 1, 1, 2, 3, 3, 2,…
## $ ca <int> 0, 0, 0, 1, 2, 2, 0, 1, 2, 2, 0, 2, 0, 0, 0, 2, 3, 1,…
## $ thal <int> 7, 7, 3, 7, 7, 7, 7, 3, 3, 6, 3, 7, 3, 3, 7, 3, 7, 7,…
## $ num <int> 2, 2, 2, 3, 3, 2, 1, 1, 3, 2, 1, 3, 1, 1, 1, 3, 3, 2,…
## $ SK <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ SK.React <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ Reaction <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Mortality <int> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,…
## $ Follow.Up <int> 60, 15, 6, 52, 34, 32, 60, 3, 15, 6, 52, 34, 32, 12, …
# Preview the first six rows of the raw data again
pak_data %>% head(n = 6L)
## Age Age.Group Gender Locality Marital.status Life.Style Sleep Category
## 1 45 41-50 Female RURAL MARRIED NO NO FREE
## 2 51 51-60 Female URBAN MARRIED NO NO FREE
## 3 55 51-60 Female RURAL MARRIED YES YES FREE
## 4 55 51-60 Female RURAL MARRIED YES YES FREE
## 5 56 51-60 Female RURAL MARRIED YES NO FREE
## 6 56 51-60 Female URBAN MARRIED NO NO FREE
## Depression Hyperlipi Smoking Family.History F.History Diabetes HTN Allergies
## 1 YES YES NO NO 0 1 NO NO
## 2 YES YES NO NO 0 0 NO NO
## 3 YES YES NO NO 0 1 YES NO
## 4 YES YES NO NO 0 1 YES NO
## 5 YES YES NO NO 0 1 YES NO
## 6 YES YES NO NO 0 1 YES NO
## BP Thrombolysis BGR B.Urea S.Cr S.Sodium S.Potassium S.Chloride C.P.K
## 1 100.6 0 84 28 0.9 138 3.3 107 130
## 2 90.6 0 135 17 0.7 144 4.7 104 163
## 3 100.7 0 146 37 1.0 137 4.2 103 149
## 4 160.1 0 146 37 1.0 137 4.2 103 149
## 5 90.6 0 85 78 1.2 139 4.5 112 75
## 6 140.7 0 166 104 4.0 130 5.3 100 322
## CK.MB ESR WBC RBC Hemoglobin P.C.V M.C.V M.C.H M.C.H.C PLATELET_COUNT
## 1 30 11 9900 4.26 11.6 0.34 79.7 27.2 0.34 265000
## 2 30 27 15800 5.74 14.5 0.44 78.0 25.0 0.32 287000
## 3 22 19 7900 4.83 14.1 0.42 87.0 29.0 0.33 183000
## 4 22 19 7900 4.83 14.1 0.42 87.0 29.0 0.33 183000
## 5 18 13 6900 4.41 12.3 0.36 82.0 27.0 0.33 211000
## 6 52 154 13500 3.90 10.0 0.29 74.4 25.7 0.35 288000
## NEUTROPHIL LYMPHO MONOCYTE EOSINO Others
## 1 0.70 0.25 0.03 2 no
## 2 0.73 0.20 0.04 3 no
## 3 0.60 0.33 0.04 3 LV dysfunction
## 4 0.60 0.33 0.04 3 HTN
## 5 0.71 0.25 0.02 2 no
## 6 0.85 0.10 0.03 2 PND, ORTHOPENIA
## CO Diagnosis Hypersensitivity cp
## 1 Chest pain, EXT. ACUTE WALL M.I NO 4
## 2 Central Chest pain, A/W M.I NO 4
## 3 Chest pain,SOB, Cold sweating AC I/W M.I (RV) RE. M.I NO 4
## 4 CENTRAL Chest pain, I/W M.I NO 4
## 5 Chest pain, A/W M.I NO 4
## 6 SOB FROM 1 DAY ACS, NSTEMI NO 4
## trestbps chol fbs restecg thalach exang oldpeak slope ca thal num SK SK.React
## 1 132 341 1 2 136 1 3.0 2 0 7 2 1 NO
## 2 130 305 0 0 142 1 1.2 2 0 7 2 1 NO
## 3 180 327 0 1 117 1 3.4 2 0 3 2 1 NO
## 4 128 205 0 1 130 1 2.0 2 1 7 3 1 NO
## 5 200 288 1 2 133 1 4.0 3 2 7 3 1 NO
## 6 134 409 0 2 150 1 1.9 2 2 7 2 1 NO
## Reaction Mortality Follow.Up
## 1 0 0 60
## 2 0 0 15
## 3 0 0 6
## 4 0 0 52
## 5 0 0 34
## 6 0 1 32
# Make a working copy; all renaming and recoding below is applied to
# pak_data2 while pak_data keeps the values as read from the file
pak_data2 <- pak_data
pak_data2 %>% head(n = 6L)
## Age Age.Group Gender Locality Marital.status Life.Style Sleep Category
## 1 45 41-50 Female RURAL MARRIED NO NO FREE
## 2 51 51-60 Female URBAN MARRIED NO NO FREE
## 3 55 51-60 Female RURAL MARRIED YES YES FREE
## 4 55 51-60 Female RURAL MARRIED YES YES FREE
## 5 56 51-60 Female RURAL MARRIED YES NO FREE
## 6 56 51-60 Female URBAN MARRIED NO NO FREE
## Depression Hyperlipi Smoking Family.History F.History Diabetes HTN Allergies
## 1 YES YES NO NO 0 1 NO NO
## 2 YES YES NO NO 0 0 NO NO
## 3 YES YES NO NO 0 1 YES NO
## 4 YES YES NO NO 0 1 YES NO
## 5 YES YES NO NO 0 1 YES NO
## 6 YES YES NO NO 0 1 YES NO
## BP Thrombolysis BGR B.Urea S.Cr S.Sodium S.Potassium S.Chloride C.P.K
## 1 100.6 0 84 28 0.9 138 3.3 107 130
## 2 90.6 0 135 17 0.7 144 4.7 104 163
## 3 100.7 0 146 37 1.0 137 4.2 103 149
## 4 160.1 0 146 37 1.0 137 4.2 103 149
## 5 90.6 0 85 78 1.2 139 4.5 112 75
## 6 140.7 0 166 104 4.0 130 5.3 100 322
## CK.MB ESR WBC RBC Hemoglobin P.C.V M.C.V M.C.H M.C.H.C PLATELET_COUNT
## 1 30 11 9900 4.26 11.6 0.34 79.7 27.2 0.34 265000
## 2 30 27 15800 5.74 14.5 0.44 78.0 25.0 0.32 287000
## 3 22 19 7900 4.83 14.1 0.42 87.0 29.0 0.33 183000
## 4 22 19 7900 4.83 14.1 0.42 87.0 29.0 0.33 183000
## 5 18 13 6900 4.41 12.3 0.36 82.0 27.0 0.33 211000
## 6 52 154 13500 3.90 10.0 0.29 74.4 25.7 0.35 288000
## NEUTROPHIL LYMPHO MONOCYTE EOSINO Others
## 1 0.70 0.25 0.03 2 no
## 2 0.73 0.20 0.04 3 no
## 3 0.60 0.33 0.04 3 LV dysfunction
## 4 0.60 0.33 0.04 3 HTN
## 5 0.71 0.25 0.02 2 no
## 6 0.85 0.10 0.03 2 PND, ORTHOPENIA
## CO Diagnosis Hypersensitivity cp
## 1 Chest pain, EXT. ACUTE WALL M.I NO 4
## 2 Central Chest pain, A/W M.I NO 4
## 3 Chest pain,SOB, Cold sweating AC I/W M.I (RV) RE. M.I NO 4
## 4 CENTRAL Chest pain, I/W M.I NO 4
## 5 Chest pain, A/W M.I NO 4
## 6 SOB FROM 1 DAY ACS, NSTEMI NO 4
## trestbps chol fbs restecg thalach exang oldpeak slope ca thal num SK SK.React
## 1 132 341 1 2 136 1 3.0 2 0 7 2 1 NO
## 2 130 305 0 0 142 1 1.2 2 0 7 2 1 NO
## 3 180 327 0 1 117 1 3.4 2 0 3 2 1 NO
## 4 128 205 0 1 130 1 2.0 2 1 7 3 1 NO
## 5 200 288 1 2 133 1 4.0 3 2 7 3 1 NO
## 6 134 409 0 2 150 1 1.9 2 2 7 2 1 NO
## Reaction Mortality Follow.Up
## 1 0 0 60
## 2 0 0 15
## 3 0 0 6
## 4 0 0 52
## 5 0 0 34
## 6 0 1 32
# Compact overview of the working copy before any changes
pak_data2 %>% glimpse()
## Rows: 368
## Columns: 60
## $ Age <int> 45, 51, 55, 55, 56, 56, 57, 57, 58, 58, 59, 60, 60, 6…
## $ Age.Group <chr> "41-50", "51-60", "51-60", "51-60", "51-60", "51-60",…
## $ Gender <chr> "Female", "Female", "Female", "Female", "Female", "Fe…
## $ Locality <chr> "RURAL", "URBAN", "RURAL", "RURAL", "RURAL", "URBAN",…
## $ Marital.status <chr> "MARRIED", "MARRIED", "MARRIED", "MARRIED", "MARRIED"…
## $ Life.Style <chr> "NO", "NO", "YES", "YES", "YES", "NO", "YES", "NO", "…
## $ Sleep <chr> "NO", "NO", "YES", "YES", "NO", "NO", "YES", "NO", "N…
## $ Category <chr> "FREE", "FREE", "FREE", "FREE", "FREE", "FREE", "PAID…
## $ Depression <chr> "YES", "YES", "YES", "YES", "YES", "YES", "YES", "YES…
## $ Hyperlipi <chr> "YES", "YES", "YES", "YES", "YES", "YES", "YES", "YES…
## $ Smoking <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ Family.History <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ F.History <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Diabetes <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,…
## $ HTN <chr> "NO", "NO", "YES", "YES", "YES", "YES", "YES", "NO", …
## $ Allergies <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ BP <dbl> 100.6, 90.6, 100.7, 160.1, 90.6, 140.7, 120.8, 100.6,…
## $ Thrombolysis <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ BGR <int> 84, 135, 146, 146, 85, 166, 96, 84, 135, 146, 146, 85…
## $ B.Urea <dbl> 28, 17, 37, 37, 78, 104, 42, 28, 17, 37, 37, 78, 104,…
## $ S.Cr <dbl> 0.9, 0.7, 1.0, 1.0, 1.2, 4.0, 1.0, 0.9, 0.7, 1.0, 1.0…
## $ S.Sodium <int> 138, 144, 137, 137, 139, 130, 146, 138, 144, 137, 137…
## $ S.Potassium <dbl> 3.3, 4.7, 4.2, 4.2, 4.5, 5.3, 3.9, 3.3, 4.7, 4.2, 4.2…
## $ S.Chloride <int> 107, 104, 103, 103, 112, 100, 100, 107, 104, 103, 103…
## $ C.P.K <int> 130, 163, 149, 149, 75, 322, 146, 130, 163, 149, 149,…
## $ CK.MB <int> 30, 30, 22, 22, 18, 52, 21, 30, 30, 22, 22, 18, 52, 2…
## $ ESR <int> 11, 27, 19, 19, 13, 154, 25, 11, 27, 19, 19, 13, 154,…
## $ WBC <int> 9900, 15800, 7900, 7900, 6900, 13500, 7400, 9900, 158…
## $ RBC <dbl> 4.26, 5.74, 4.83, 4.83, 4.41, 3.90, 4.14, 4.26, 5.74,…
## $ Hemoglobin <dbl> 11.6, 14.5, 14.1, 14.1, 12.3, 10.0, 11.7, 11.6, 14.5,…
## $ P.C.V <dbl> 0.34, 0.44, 0.42, 0.42, 0.36, 0.29, 0.36, 0.34, 0.44,…
## $ M.C.V <dbl> 79.7, 78.0, 87.0, 87.0, 82.0, 74.4, 87.0, 79.7, 78.0,…
## $ M.C.H <dbl> 27.2, 25.0, 29.0, 29.0, 27.0, 25.7, 28.0, 27.2, 25.0,…
## $ M.C.H.C <dbl> 0.34, 0.32, 0.33, 0.33, 0.33, 0.35, 0.32, 0.34, 0.32,…
## $ PLATELET_COUNT <int> 265000, 287000, 183000, 183000, 211000, 288000, 39500…
## $ NEUTROPHIL <dbl> 0.70, 0.73, 0.60, 0.60, 0.71, 0.85, 0.63, 0.70, 0.73,…
## $ LYMPHO <dbl> 0.25, 0.20, 0.33, 0.33, 0.25, 0.10, 0.31, 0.25, 0.20,…
## $ MONOCYTE <dbl> 0.03, 0.04, 0.04, 0.04, 0.02, 0.03, 0.03, 0.03, 0.04,…
## $ EOSINO <int> 2, 3, 3, 3, 2, 2, 3, 2, 3, 3, 3, 2, 2, 3, 2, 3, 3, 3,…
## $ Others <chr> "no", "no", "LV dysfunction", "HTN", "no", "PND, ORTH…
## $ CO <chr> "Chest pain,", "Central Chest pain,", "Chest pain,SOB…
## $ Diagnosis <chr> "EXT. ACUTE WALL M.I", "A/W M.I", "AC I/W M.I (RV) RE…
## $ Hypersensitivity <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ cp <int> 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 3,…
## $ trestbps <int> 132, 130, 180, 128, 200, 134, 140, 130, 136, 170, 174…
## $ chol <int> 341, 305, 327, 205, 288, 409, 241, 236, 319, 225, 249…
## $ fbs <int> 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ restecg <int> 2, 0, 1, 1, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0,…
## $ thalach <int> 136, 142, 117, 130, 133, 150, 123, 174, 152, 146, 143…
## $ exang <int> 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,…
## $ oldpeak <dbl> 3.0, 1.2, 3.4, 2.0, 4.0, 1.9, 0.2, 0.0, 0.0, 2.8, 0.0…
## $ slope <int> 2, 2, 2, 2, 3, 2, 2, 2, 1, 2, 2, 2, 1, 1, 2, 3, 3, 2,…
## $ ca <int> 0, 0, 0, 1, 2, 2, 0, 1, 2, 2, 0, 2, 0, 0, 0, 2, 3, 1,…
## $ thal <int> 7, 7, 3, 7, 7, 7, 7, 3, 3, 6, 3, 7, 3, 3, 7, 3, 7, 7,…
## $ num <int> 2, 2, 2, 3, 3, 2, 1, 1, 3, 2, 1, 3, 1, 1, 1, 3, 3, 2,…
## $ SK <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ SK.React <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ Reaction <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Mortality <int> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,…
## $ Follow.Up <int> 60, 15, 6, 52, 34, 32, 60, 3, 15, 6, 52, 34, 32, 12, …
# Summarise the platelet counts. The column names are still in their original
# case at this point (they are lower-cased only in the next step), so the
# column has to be referenced as PLATELET_COUNT; `$platelet_count` does not
# exist yet and silently evaluates to NULL, which produced an empty summary.
summary(pak_data2$PLATELET_COUNT)
# Convert every column name of the working copy to lower case
colnames(pak_data2) <- tolower(colnames(pak_data2))
head(pak_data2, n = 6L)
## age age.group gender locality marital.status life.style sleep category
## 1 45 41-50 Female RURAL MARRIED NO NO FREE
## 2 51 51-60 Female URBAN MARRIED NO NO FREE
## 3 55 51-60 Female RURAL MARRIED YES YES FREE
## 4 55 51-60 Female RURAL MARRIED YES YES FREE
## 5 56 51-60 Female RURAL MARRIED YES NO FREE
## 6 56 51-60 Female URBAN MARRIED NO NO FREE
## depression hyperlipi smoking family.history f.history diabetes htn allergies
## 1 YES YES NO NO 0 1 NO NO
## 2 YES YES NO NO 0 0 NO NO
## 3 YES YES NO NO 0 1 YES NO
## 4 YES YES NO NO 0 1 YES NO
## 5 YES YES NO NO 0 1 YES NO
## 6 YES YES NO NO 0 1 YES NO
## bp thrombolysis bgr b.urea s.cr s.sodium s.potassium s.chloride c.p.k
## 1 100.6 0 84 28 0.9 138 3.3 107 130
## 2 90.6 0 135 17 0.7 144 4.7 104 163
## 3 100.7 0 146 37 1.0 137 4.2 103 149
## 4 160.1 0 146 37 1.0 137 4.2 103 149
## 5 90.6 0 85 78 1.2 139 4.5 112 75
## 6 140.7 0 166 104 4.0 130 5.3 100 322
## ck.mb esr wbc rbc hemoglobin p.c.v m.c.v m.c.h m.c.h.c platelet_count
## 1 30 11 9900 4.26 11.6 0.34 79.7 27.2 0.34 265000
## 2 30 27 15800 5.74 14.5 0.44 78.0 25.0 0.32 287000
## 3 22 19 7900 4.83 14.1 0.42 87.0 29.0 0.33 183000
## 4 22 19 7900 4.83 14.1 0.42 87.0 29.0 0.33 183000
## 5 18 13 6900 4.41 12.3 0.36 82.0 27.0 0.33 211000
## 6 52 154 13500 3.90 10.0 0.29 74.4 25.7 0.35 288000
## neutrophil lympho monocyte eosino others
## 1 0.70 0.25 0.03 2 no
## 2 0.73 0.20 0.04 3 no
## 3 0.60 0.33 0.04 3 LV dysfunction
## 4 0.60 0.33 0.04 3 HTN
## 5 0.71 0.25 0.02 2 no
## 6 0.85 0.10 0.03 2 PND, ORTHOPENIA
## co diagnosis hypersensitivity cp
## 1 Chest pain, EXT. ACUTE WALL M.I NO 4
## 2 Central Chest pain, A/W M.I NO 4
## 3 Chest pain,SOB, Cold sweating AC I/W M.I (RV) RE. M.I NO 4
## 4 CENTRAL Chest pain, I/W M.I NO 4
## 5 Chest pain, A/W M.I NO 4
## 6 SOB FROM 1 DAY ACS, NSTEMI NO 4
## trestbps chol fbs restecg thalach exang oldpeak slope ca thal num sk sk.react
## 1 132 341 1 2 136 1 3.0 2 0 7 2 1 NO
## 2 130 305 0 0 142 1 1.2 2 0 7 2 1 NO
## 3 180 327 0 1 117 1 3.4 2 0 3 2 1 NO
## 4 128 205 0 1 130 1 2.0 2 1 7 3 1 NO
## 5 200 288 1 2 133 1 4.0 3 2 7 3 1 NO
## 6 134 409 0 2 150 1 1.9 2 2 7 2 1 NO
## reaction mortality follow.up
## 1 0 0 60
## 2 0 0 15
## 3 0 0 6
## 4 0 0 52
## 5 0 0 34
## 6 0 1 32
# Standardise the column names with janitor::clean_names(); for this data the
# dots in the names become underscores (e.g. age.group -> age_group)
pak_data2 <- pak_data2 %>% clean_names()
pak_data2 %>% glimpse()
## Rows: 368
## Columns: 60
## $ age <int> 45, 51, 55, 55, 56, 56, 57, 57, 58, 58, 59, 60, 60, 6…
## $ age_group <chr> "41-50", "51-60", "51-60", "51-60", "51-60", "51-60",…
## $ gender <chr> "Female", "Female", "Female", "Female", "Female", "Fe…
## $ locality <chr> "RURAL", "URBAN", "RURAL", "RURAL", "RURAL", "URBAN",…
## $ marital_status <chr> "MARRIED", "MARRIED", "MARRIED", "MARRIED", "MARRIED"…
## $ life_style <chr> "NO", "NO", "YES", "YES", "YES", "NO", "YES", "NO", "…
## $ sleep <chr> "NO", "NO", "YES", "YES", "NO", "NO", "YES", "NO", "N…
## $ category <chr> "FREE", "FREE", "FREE", "FREE", "FREE", "FREE", "PAID…
## $ depression <chr> "YES", "YES", "YES", "YES", "YES", "YES", "YES", "YES…
## $ hyperlipi <chr> "YES", "YES", "YES", "YES", "YES", "YES", "YES", "YES…
## $ smoking <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ family_history <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ f_history <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ diabetes <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,…
## $ htn <chr> "NO", "NO", "YES", "YES", "YES", "YES", "YES", "NO", …
## $ allergies <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ bp <dbl> 100.6, 90.6, 100.7, 160.1, 90.6, 140.7, 120.8, 100.6,…
## $ thrombolysis <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ bgr <int> 84, 135, 146, 146, 85, 166, 96, 84, 135, 146, 146, 85…
## $ b_urea <dbl> 28, 17, 37, 37, 78, 104, 42, 28, 17, 37, 37, 78, 104,…
## $ s_cr <dbl> 0.9, 0.7, 1.0, 1.0, 1.2, 4.0, 1.0, 0.9, 0.7, 1.0, 1.0…
## $ s_sodium <int> 138, 144, 137, 137, 139, 130, 146, 138, 144, 137, 137…
## $ s_potassium <dbl> 3.3, 4.7, 4.2, 4.2, 4.5, 5.3, 3.9, 3.3, 4.7, 4.2, 4.2…
## $ s_chloride <int> 107, 104, 103, 103, 112, 100, 100, 107, 104, 103, 103…
## $ c_p_k <int> 130, 163, 149, 149, 75, 322, 146, 130, 163, 149, 149,…
## $ ck_mb <int> 30, 30, 22, 22, 18, 52, 21, 30, 30, 22, 22, 18, 52, 2…
## $ esr <int> 11, 27, 19, 19, 13, 154, 25, 11, 27, 19, 19, 13, 154,…
## $ wbc <int> 9900, 15800, 7900, 7900, 6900, 13500, 7400, 9900, 158…
## $ rbc <dbl> 4.26, 5.74, 4.83, 4.83, 4.41, 3.90, 4.14, 4.26, 5.74,…
## $ hemoglobin <dbl> 11.6, 14.5, 14.1, 14.1, 12.3, 10.0, 11.7, 11.6, 14.5,…
## $ p_c_v <dbl> 0.34, 0.44, 0.42, 0.42, 0.36, 0.29, 0.36, 0.34, 0.44,…
## $ m_c_v <dbl> 79.7, 78.0, 87.0, 87.0, 82.0, 74.4, 87.0, 79.7, 78.0,…
## $ m_c_h <dbl> 27.2, 25.0, 29.0, 29.0, 27.0, 25.7, 28.0, 27.2, 25.0,…
## $ m_c_h_c <dbl> 0.34, 0.32, 0.33, 0.33, 0.33, 0.35, 0.32, 0.34, 0.32,…
## $ platelet_count <int> 265000, 287000, 183000, 183000, 211000, 288000, 39500…
## $ neutrophil <dbl> 0.70, 0.73, 0.60, 0.60, 0.71, 0.85, 0.63, 0.70, 0.73,…
## $ lympho <dbl> 0.25, 0.20, 0.33, 0.33, 0.25, 0.10, 0.31, 0.25, 0.20,…
## $ monocyte <dbl> 0.03, 0.04, 0.04, 0.04, 0.02, 0.03, 0.03, 0.03, 0.04,…
## $ eosino <int> 2, 3, 3, 3, 2, 2, 3, 2, 3, 3, 3, 2, 2, 3, 2, 3, 3, 3,…
## $ others <chr> "no", "no", "LV dysfunction", "HTN", "no", "PND, ORTH…
## $ co <chr> "Chest pain,", "Central Chest pain,", "Chest pain,SOB…
## $ diagnosis <chr> "EXT. ACUTE WALL M.I", "A/W M.I", "AC I/W M.I (RV) RE…
## $ hypersensitivity <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ cp <int> 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 3,…
## $ trestbps <int> 132, 130, 180, 128, 200, 134, 140, 130, 136, 170, 174…
## $ chol <int> 341, 305, 327, 205, 288, 409, 241, 236, 319, 225, 249…
## $ fbs <int> 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ restecg <int> 2, 0, 1, 1, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0,…
## $ thalach <int> 136, 142, 117, 130, 133, 150, 123, 174, 152, 146, 143…
## $ exang <int> 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,…
## $ oldpeak <dbl> 3.0, 1.2, 3.4, 2.0, 4.0, 1.9, 0.2, 0.0, 0.0, 2.8, 0.0…
## $ slope <int> 2, 2, 2, 2, 3, 2, 2, 2, 1, 2, 2, 2, 1, 1, 2, 3, 3, 2,…
## $ ca <int> 0, 0, 0, 1, 2, 2, 0, 1, 2, 2, 0, 2, 0, 0, 0, 2, 3, 1,…
## $ thal <int> 7, 7, 3, 7, 7, 7, 7, 3, 3, 6, 3, 7, 3, 3, 7, 3, 7, 7,…
## $ num <int> 2, 2, 2, 3, 3, 2, 1, 1, 3, 2, 1, 3, 1, 1, 1, 3, 3, 2,…
## $ sk <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ sk_react <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ reaction <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ mortality <int> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,…
## $ follow_up <int> 60, 15, 6, 52, 34, 32, 60, 3, 15, 6, 52, 34, 32, 12, …
# Recode gender to a numeric indicator: 0 = "Female", 1 = any other value.
# The character column is compared directly; converting it to a factor first
# had no effect because ifelse() replaces the column with numbers anyway.
pak_data2 <- mutate(
  pak_data2,
  gender = ifelse(gender == "Female", 0, 1)
)
table(pak_data2$gender)
##
## 0 1
## 83 285
# Check the column types after recoding gender (gender is now numeric)
pak_data2 %>% glimpse()
## Rows: 368
## Columns: 60
## $ age <int> 45, 51, 55, 55, 56, 56, 57, 57, 58, 58, 59, 60, 60, 6…
## $ age_group <chr> "41-50", "51-60", "51-60", "51-60", "51-60", "51-60",…
## $ gender <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ locality <chr> "RURAL", "URBAN", "RURAL", "RURAL", "RURAL", "URBAN",…
## $ marital_status <chr> "MARRIED", "MARRIED", "MARRIED", "MARRIED", "MARRIED"…
## $ life_style <chr> "NO", "NO", "YES", "YES", "YES", "NO", "YES", "NO", "…
## $ sleep <chr> "NO", "NO", "YES", "YES", "NO", "NO", "YES", "NO", "N…
## $ category <chr> "FREE", "FREE", "FREE", "FREE", "FREE", "FREE", "PAID…
## $ depression <chr> "YES", "YES", "YES", "YES", "YES", "YES", "YES", "YES…
## $ hyperlipi <chr> "YES", "YES", "YES", "YES", "YES", "YES", "YES", "YES…
## $ smoking <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ family_history <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ f_history <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ diabetes <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,…
## $ htn <chr> "NO", "NO", "YES", "YES", "YES", "YES", "YES", "NO", …
## $ allergies <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ bp <dbl> 100.6, 90.6, 100.7, 160.1, 90.6, 140.7, 120.8, 100.6,…
## $ thrombolysis <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ bgr <int> 84, 135, 146, 146, 85, 166, 96, 84, 135, 146, 146, 85…
## $ b_urea <dbl> 28, 17, 37, 37, 78, 104, 42, 28, 17, 37, 37, 78, 104,…
## $ s_cr <dbl> 0.9, 0.7, 1.0, 1.0, 1.2, 4.0, 1.0, 0.9, 0.7, 1.0, 1.0…
## $ s_sodium <int> 138, 144, 137, 137, 139, 130, 146, 138, 144, 137, 137…
## $ s_potassium <dbl> 3.3, 4.7, 4.2, 4.2, 4.5, 5.3, 3.9, 3.3, 4.7, 4.2, 4.2…
## $ s_chloride <int> 107, 104, 103, 103, 112, 100, 100, 107, 104, 103, 103…
## $ c_p_k <int> 130, 163, 149, 149, 75, 322, 146, 130, 163, 149, 149,…
## $ ck_mb <int> 30, 30, 22, 22, 18, 52, 21, 30, 30, 22, 22, 18, 52, 2…
## $ esr <int> 11, 27, 19, 19, 13, 154, 25, 11, 27, 19, 19, 13, 154,…
## $ wbc <int> 9900, 15800, 7900, 7900, 6900, 13500, 7400, 9900, 158…
## $ rbc <dbl> 4.26, 5.74, 4.83, 4.83, 4.41, 3.90, 4.14, 4.26, 5.74,…
## $ hemoglobin <dbl> 11.6, 14.5, 14.1, 14.1, 12.3, 10.0, 11.7, 11.6, 14.5,…
## $ p_c_v <dbl> 0.34, 0.44, 0.42, 0.42, 0.36, 0.29, 0.36, 0.34, 0.44,…
## $ m_c_v <dbl> 79.7, 78.0, 87.0, 87.0, 82.0, 74.4, 87.0, 79.7, 78.0,…
## $ m_c_h <dbl> 27.2, 25.0, 29.0, 29.0, 27.0, 25.7, 28.0, 27.2, 25.0,…
## $ m_c_h_c <dbl> 0.34, 0.32, 0.33, 0.33, 0.33, 0.35, 0.32, 0.34, 0.32,…
## $ platelet_count <int> 265000, 287000, 183000, 183000, 211000, 288000, 39500…
## $ neutrophil <dbl> 0.70, 0.73, 0.60, 0.60, 0.71, 0.85, 0.63, 0.70, 0.73,…
## $ lympho <dbl> 0.25, 0.20, 0.33, 0.33, 0.25, 0.10, 0.31, 0.25, 0.20,…
## $ monocyte <dbl> 0.03, 0.04, 0.04, 0.04, 0.02, 0.03, 0.03, 0.03, 0.04,…
## $ eosino <int> 2, 3, 3, 3, 2, 2, 3, 2, 3, 3, 3, 2, 2, 3, 2, 3, 3, 3,…
## $ others <chr> "no", "no", "LV dysfunction", "HTN", "no", "PND, ORTH…
## $ co <chr> "Chest pain,", "Central Chest pain,", "Chest pain,SOB…
## $ diagnosis <chr> "EXT. ACUTE WALL M.I", "A/W M.I", "AC I/W M.I (RV) RE…
## $ hypersensitivity <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ cp <int> 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 3,…
## $ trestbps <int> 132, 130, 180, 128, 200, 134, 140, 130, 136, 170, 174…
## $ chol <int> 341, 305, 327, 205, 288, 409, 241, 236, 319, 225, 249…
## $ fbs <int> 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ restecg <int> 2, 0, 1, 1, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0,…
## $ thalach <int> 136, 142, 117, 130, 133, 150, 123, 174, 152, 146, 143…
## $ exang <int> 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,…
## $ oldpeak <dbl> 3.0, 1.2, 3.4, 2.0, 4.0, 1.9, 0.2, 0.0, 0.0, 2.8, 0.0…
## $ slope <int> 2, 2, 2, 2, 3, 2, 2, 2, 1, 2, 2, 2, 1, 1, 2, 3, 3, 2,…
## $ ca <int> 0, 0, 0, 1, 2, 2, 0, 1, 2, 2, 0, 2, 0, 0, 0, 2, 3, 1,…
## $ thal <int> 7, 7, 3, 7, 7, 7, 7, 3, 3, 6, 3, 7, 3, 3, 7, 3, 7, 7,…
## $ num <int> 2, 2, 2, 3, 3, 2, 1, 1, 3, 2, 1, 3, 1, 1, 1, 3, 3, 2,…
## $ sk <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ sk_react <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
## $ reaction <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ mortality <int> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,…
## $ follow_up <int> 60, 15, 6, 52, 34, 32, 60, 3, 15, 6, 52, 34, 32, 12, …
# Recode locality to a 0/1 indicator: RURAL -> 0, every other value (URBAN) -> 1
table(pak_data2[["locality"]])
##
## RURAL URBAN
## 134 234
pak_data2 <- pak_data2 %>%
  mutate(locality = as.numeric(factor(locality) != "RURAL"))
table(pak_data2[["locality"]])
##
## 0 1
## 134 234
# Recode marital_status to a 0/1 indicator: MARRIED -> 0, anything else -> 1
table(pak_data2[["marital_status"]])
##
## MARRIED SINGLE
## 365 3
pak_data2 <- pak_data2 %>%
  mutate(marital_status = as.numeric(marital_status != "MARRIED"))
table(pak_data2[["marital_status"]])
##
## 0 1
## 365 3
# Recode life_style to a 0/1 indicator: NO -> 0, anything else (YES) -> 1
table(pak_data2[["life_style"]])
##
## NO YES
## 151 217
pak_data2 <- pak_data2 %>%
  mutate(life_style = as.numeric(life_style != "NO"))
table(pak_data2[["life_style"]])
##
## 0 1
## 151 217
# Recode sleep to a 0/1 indicator: NO -> 0, anything else (YES) -> 1
table(pak_data2[["sleep"]])
##
## NO YES
## 224 144
pak_data2 <- pak_data2 %>%
  mutate(sleep = as.numeric(sleep != "NO"))
table(pak_data2[["sleep"]])
##
## 0 1
## 224 144
# Recode category to 0/1: FREE -> 0, anything else -> 1
pak_data2 <- pak_data2 %>%
  mutate(category = as.numeric(category != "FREE"))
table(pak_data2[["category"]])
##
## 0 1
## 331 37
# Recode depression to 0/1: NO -> 0, anything else -> 1
pak_data2 <- pak_data2 %>%
  mutate(depression = as.numeric(depression != "NO"))
table(pak_data2[["depression"]])
##
## 0 1
## 17 351
# Recode hyperlipi (hyperlipidemia flag) to 0/1: NO -> 0, anything else -> 1
pak_data2 <- pak_data2 %>%
  mutate(hyperlipi = as.numeric(hyperlipi != "NO"))
table(pak_data2[["hyperlipi"]])
##
## 0 1
## 27 341
# Recode family_history to 0/1: NO -> 0, anything else -> 1
pak_data2 <- pak_data2 %>%
  mutate(family_history = as.numeric(family_history != "NO"))
table(pak_data2[["family_history"]])
##
## 0 1
## 296 72
# Recode htn to 0/1 from the raw pak_data$HTN column: NO -> 0, anything else -> 1
# NOTE(review): assumes pak_data and pak_data2 still have the same rows in the
# same order - confirm against the earlier cleaning steps
pak_data2 <- pak_data2 %>%
  mutate(htn = as.numeric(pak_data$HTN != "NO"))
table(pak_data2[["htn"]])
##
## 0 1
## 167 201
# Recode allergies to 0/1: NO -> 0, anything else -> 1
pak_data2 <- pak_data2 %>%
  mutate(allergies = as.numeric(allergies != "NO"))
table(pak_data2[["allergies"]])
##
## 0 1
## 357 11
# Recode hypersensitivity to 0/1: NO -> 0, anything else -> 1
pak_data2 <- pak_data2 %>%
  mutate(hypersensitivity = as.numeric(hypersensitivity != "NO"))
table(pak_data2[["hypersensitivity"]])
##
## 0 1
## 356 12
# Recode sk_react to 0/1: NO -> 0, anything else -> 1
pak_data2 <- pak_data2 %>%
  mutate(sk_react = as.numeric(sk_react != "NO"))
table(pak_data2[["sk_react"]])
##
## 0 1
## 93 275
# Recode smoking to 0/1: NO -> 0, anything else -> 1
pak_data2 <- pak_data2 %>%
  mutate(smoking = as.numeric(smoking != "NO"))
table(pak_data2[["smoking"]])
##
## 0 1
## 173 195
# Inspect column types after all recodes
pak_data2 %>% glimpse()
## Rows: 368
## Columns: 60
## $ age <int> 45, 51, 55, 55, 56, 56, 57, 57, 58, 58, 59, 60, 60, 6…
## $ age_group <chr> "41-50", "51-60", "51-60", "51-60", "51-60", "51-60",…
## $ gender <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ locality <dbl> 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,…
## $ marital_status <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ life_style <dbl> 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1,…
## $ sleep <dbl> 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1,…
## $ category <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,…
## $ depression <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ hyperlipi <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ smoking <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ family_history <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ f_history <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ diabetes <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,…
## $ htn <dbl> 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,…
## $ allergies <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ bp <dbl> 100.6, 90.6, 100.7, 160.1, 90.6, 140.7, 120.8, 100.6,…
## $ thrombolysis <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ bgr <int> 84, 135, 146, 146, 85, 166, 96, 84, 135, 146, 146, 85…
## $ b_urea <dbl> 28, 17, 37, 37, 78, 104, 42, 28, 17, 37, 37, 78, 104,…
## $ s_cr <dbl> 0.9, 0.7, 1.0, 1.0, 1.2, 4.0, 1.0, 0.9, 0.7, 1.0, 1.0…
## $ s_sodium <int> 138, 144, 137, 137, 139, 130, 146, 138, 144, 137, 137…
## $ s_potassium <dbl> 3.3, 4.7, 4.2, 4.2, 4.5, 5.3, 3.9, 3.3, 4.7, 4.2, 4.2…
## $ s_chloride <int> 107, 104, 103, 103, 112, 100, 100, 107, 104, 103, 103…
## $ c_p_k <int> 130, 163, 149, 149, 75, 322, 146, 130, 163, 149, 149,…
## $ ck_mb <int> 30, 30, 22, 22, 18, 52, 21, 30, 30, 22, 22, 18, 52, 2…
## $ esr <int> 11, 27, 19, 19, 13, 154, 25, 11, 27, 19, 19, 13, 154,…
## $ wbc <int> 9900, 15800, 7900, 7900, 6900, 13500, 7400, 9900, 158…
## $ rbc <dbl> 4.26, 5.74, 4.83, 4.83, 4.41, 3.90, 4.14, 4.26, 5.74,…
## $ hemoglobin <dbl> 11.6, 14.5, 14.1, 14.1, 12.3, 10.0, 11.7, 11.6, 14.5,…
## $ p_c_v <dbl> 0.34, 0.44, 0.42, 0.42, 0.36, 0.29, 0.36, 0.34, 0.44,…
## $ m_c_v <dbl> 79.7, 78.0, 87.0, 87.0, 82.0, 74.4, 87.0, 79.7, 78.0,…
## $ m_c_h <dbl> 27.2, 25.0, 29.0, 29.0, 27.0, 25.7, 28.0, 27.2, 25.0,…
## $ m_c_h_c <dbl> 0.34, 0.32, 0.33, 0.33, 0.33, 0.35, 0.32, 0.34, 0.32,…
## $ platelet_count <int> 265000, 287000, 183000, 183000, 211000, 288000, 39500…
## $ neutrophil <dbl> 0.70, 0.73, 0.60, 0.60, 0.71, 0.85, 0.63, 0.70, 0.73,…
## $ lympho <dbl> 0.25, 0.20, 0.33, 0.33, 0.25, 0.10, 0.31, 0.25, 0.20,…
## $ monocyte <dbl> 0.03, 0.04, 0.04, 0.04, 0.02, 0.03, 0.03, 0.03, 0.04,…
## $ eosino <int> 2, 3, 3, 3, 2, 2, 3, 2, 3, 3, 3, 2, 2, 3, 2, 3, 3, 3,…
## $ others <chr> "no", "no", "LV dysfunction", "HTN", "no", "PND, ORTH…
## $ co <chr> "Chest pain,", "Central Chest pain,", "Chest pain,SOB…
## $ diagnosis <chr> "EXT. ACUTE WALL M.I", "A/W M.I", "AC I/W M.I (RV) RE…
## $ hypersensitivity <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ cp <int> 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 3,…
## $ trestbps <int> 132, 130, 180, 128, 200, 134, 140, 130, 136, 170, 174…
## $ chol <int> 341, 305, 327, 205, 288, 409, 241, 236, 319, 225, 249…
## $ fbs <int> 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ restecg <int> 2, 0, 1, 1, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0,…
## $ thalach <int> 136, 142, 117, 130, 133, 150, 123, 174, 152, 146, 143…
## $ exang <int> 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,…
## $ oldpeak <dbl> 3.0, 1.2, 3.4, 2.0, 4.0, 1.9, 0.2, 0.0, 0.0, 2.8, 0.0…
## $ slope <int> 2, 2, 2, 2, 3, 2, 2, 2, 1, 2, 2, 2, 1, 1, 2, 3, 3, 2,…
## $ ca <int> 0, 0, 0, 1, 2, 2, 0, 1, 2, 2, 0, 2, 0, 0, 0, 2, 3, 1,…
## $ thal <int> 7, 7, 3, 7, 7, 7, 7, 3, 3, 6, 3, 7, 3, 3, 7, 3, 7, 7,…
## $ num <int> 2, 2, 2, 3, 3, 2, 1, 1, 3, 2, 1, 3, 1, 1, 1, 3, 3, 2,…
## $ sk <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ sk_react <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ reaction <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ mortality <int> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,…
## $ follow_up <int> 60, 15, 6, 52, 34, 32, 60, 3, 15, 6, 52, 34, 32, 12, …
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# Per-column summary statistics (psych::describe). The rows for the character
# columns (age_group, others, co, diagnosis) are computed on integer codes,
# so their means/SDs are not meaningful measurements.
describe(x = pak_data2)
## vars n mean sd median trimmed mad
## age 1 368 54.29 8.72 55.00 55.21 7.41
## age_group 2 368 3.88 0.95 4.00 3.99 0.74
## gender 3 368 0.77 0.42 1.00 0.84 0.00
## locality 4 368 0.64 0.48 1.00 0.67 0.00
## marital_status 5 368 0.01 0.09 0.00 0.00 0.00
## life_style 6 368 0.59 0.49 1.00 0.61 0.00
## sleep 7 368 0.39 0.49 0.00 0.36 0.00
## category 8 368 0.10 0.30 0.00 0.00 0.00
## depression 9 368 0.95 0.21 1.00 1.00 0.00
## hyperlipi 10 368 0.93 0.26 1.00 1.00 0.00
## smoking 11 368 0.53 0.50 1.00 0.54 0.00
## family_history 12 368 0.20 0.40 0.00 0.12 0.00
## f_history 13 368 0.20 0.40 0.00 0.12 0.00
## diabetes 14 368 0.46 0.50 0.00 0.45 0.00
## htn 15 368 0.55 0.50 1.00 0.56 0.00
## allergies 16 368 0.03 0.17 0.00 0.00 0.00
## bp 17 368 121.21 24.54 120.80 119.52 29.80
## thrombolysis 18 368 0.03 0.18 0.00 0.00 0.00
## bgr 19 368 219.99 139.34 164.00 197.84 100.82
## b_urea 20 368 51.68 62.58 36.00 38.33 10.38
## s_cr 21 368 1.72 3.61 0.90 0.97 0.15
## s_sodium 22 368 138.02 4.08 138.00 138.06 4.45
## s_potassium 23 368 4.21 0.39 4.20 4.17 0.44
## s_chloride 24 368 103.82 4.80 104.00 103.96 5.93
## c_p_k 25 368 553.89 957.61 188.00 295.55 171.98
## ck_mb 26 368 62.49 89.79 36.00 39.17 22.24
## esr 27 368 26.57 32.58 16.00 18.21 10.38
## wbc 28 368 11181.28 3517.88 10650.00 10953.31 4225.41
## rbc 29 368 5.09 0.77 5.20 5.09 0.97
## hemoglobin 30 368 13.91 2.16 14.20 14.01 2.22
## p_c_v 31 368 0.42 0.06 0.42 0.42 0.07
## m_c_v 32 368 81.57 6.81 82.10 81.99 6.08
## m_c_h 33 368 27.27 3.08 27.90 27.34 2.08
## m_c_h_c 34 368 0.33 0.02 0.33 0.33 0.01
## platelet_count 35 368 248660.33 76707.56 237000.00 242216.22 74130.00
## neutrophil 36 368 2.93 13.40 0.72 0.70 0.13
## lympho 37 368 0.25 0.12 0.21 0.24 0.09
## monocyte 38 368 0.03 0.02 0.03 0.03 0.01
## eosino 39 368 2.26 0.88 2.00 2.21 1.48
## others 40 368 10.63 3.86 13.00 11.07 0.00
## co 41 368 17.21 10.65 15.00 16.79 10.38
## diagnosis 42 368 17.88 10.57 17.00 17.58 13.34
## hypersensitivity 43 368 0.03 0.18 0.00 0.00 0.00
## cp 44 368 3.67 0.77 4.00 3.89 0.00
## trestbps 45 368 132.74 18.19 130.00 131.04 14.83
## chol 46 368 248.94 50.13 249.00 247.62 54.86
## fbs 47 368 0.14 0.35 0.00 0.05 0.00
## restecg 48 368 1.07 0.99 2.00 1.09 0.00
## thalach 49 368 140.92 22.99 144.00 141.24 25.20
## exang 50 368 0.56 0.50 1.00 0.58 0.00
## oldpeak 51 368 1.54 1.39 1.20 1.38 1.48
## slope 52 368 1.84 0.56 2.00 1.81 0.00
## ca 53 368 1.00 1.04 1.00 0.88 1.48
## thal 54 368 5.86 1.74 7.00 6.07 0.00
## num 55 368 2.04 1.03 2.00 1.94 1.48
## sk 56 368 0.98 0.13 1.00 1.00 0.00
## sk_react 57 368 0.75 0.44 1.00 0.81 0.00
## reaction 58 368 0.75 0.44 1.00 0.81 0.00
## mortality 59 368 0.22 0.41 0.00 0.15 0.00
## follow_up 60 368 28.65 15.81 32.00 28.22 25.20
## min max range skew kurtosis se
## age 24.00 77.00 53.00 -1.23 2.47 0.45
## age_group 1.00 5.00 4.00 -1.03 1.09 0.05
## gender 0.00 1.00 1.00 -1.31 -0.29 0.02
## locality 0.00 1.00 1.00 -0.56 -1.69 0.03
## marital_status 0.00 1.00 1.00 10.90 117.02 0.00
## life_style 0.00 1.00 1.00 -0.36 -1.87 0.03
## sleep 0.00 1.00 1.00 0.44 -1.81 0.03
## category 0.00 1.00 1.00 2.65 5.01 0.02
## depression 0.00 1.00 1.00 -4.31 16.59 0.01
## hyperlipi 0.00 1.00 1.00 -3.26 8.65 0.01
## smoking 0.00 1.00 1.00 -0.12 -1.99 0.03
## family_history 0.00 1.00 1.00 1.53 0.34 0.02
## f_history 0.00 1.00 1.00 1.53 0.34 0.02
## diabetes 0.00 1.00 1.00 0.15 -1.98 0.03
## htn 0.00 1.00 1.00 -0.18 -1.97 0.03
## allergies 0.00 1.00 1.00 5.50 28.31 0.01
## bp 80.50 190.11 109.61 0.59 0.00 1.28
## thrombolysis 0.00 1.00 1.00 5.24 25.54 0.01
## bgr 60.00 563.00 503.00 1.19 0.48 7.26
## b_urea 2.30 394.00 391.70 4.54 21.55 3.26
## s_cr 0.60 22.90 22.30 5.50 29.25 0.19
## s_sodium 129.00 146.00 17.00 -0.02 -0.40 0.21
## s_potassium 3.30 5.30 2.00 0.82 0.85 0.02
## s_chloride 90.00 112.00 22.00 -0.45 0.20 0.25
## c_p_k 52.00 4289.00 4237.00 2.86 7.18 49.92
## ck_mb 14.00 505.00 491.00 3.31 11.06 4.68
## esr 5.00 154.00 149.00 2.94 8.07 1.70
## wbc 5800.00 19590.00 13790.00 0.40 -0.79 183.38
## rbc 3.46 6.98 3.52 -0.02 -0.78 0.04
## hemoglobin 9.10 18.00 8.90 -0.39 -0.72 0.11
## p_c_v 0.29 0.54 0.25 -0.36 -0.74 0.00
## m_c_v 60.00 96.00 36.00 -0.82 1.90 0.36
## m_c_h 18.00 33.00 15.00 -0.58 1.39 0.16
## m_c_h_c 0.22 0.39 0.17 -2.19 10.09 0.00
## platelet_count 20000.00 459000.00 439000.00 0.46 1.49 3998.66
## neutrophil 0.36 83.00 82.64 5.79 31.63 0.70
## lympho 0.05 0.54 0.49 0.70 -0.21 0.01
## monocyte 0.01 0.08 0.07 0.88 1.05 0.00
## eosino 1.00 5.00 4.00 0.74 1.01 0.05
## others 1.00 17.00 16.00 -0.92 -0.37 0.20
## co 1.00 37.00 36.00 0.31 -1.21 0.55
## diagnosis 1.00 37.00 36.00 0.20 -1.28 0.55
## hypersensitivity 0.00 1.00 1.00 5.24 25.54 0.01
## cp 1.00 4.00 3.00 -2.44 4.95 0.04
## trestbps 100.00 200.00 100.00 0.90 0.93 0.95
## chol 131.00 409.00 278.00 0.39 0.27 2.61
## fbs 0.00 1.00 1.00 2.05 2.21 0.02
## restecg 0.00 2.00 2.00 -0.15 -1.98 0.05
## thalach 71.00 195.00 124.00 -0.15 -0.54 1.20
## exang 0.00 1.00 1.00 -0.25 -1.94 0.03
## oldpeak 0.00 6.20 6.20 1.01 0.93 0.07
## slope 1.00 3.00 2.00 -0.02 -0.12 0.03
## ca 0.00 3.00 3.00 0.65 -0.81 0.05
## thal 3.00 7.00 4.00 -0.99 -0.96 0.09
## num 1.00 4.00 3.00 0.39 -1.23 0.05
## sk 0.00 1.00 1.00 -7.61 56.03 0.01
## sk_react 0.00 1.00 1.00 -1.13 -0.72 0.02
## reaction 0.00 1.00 1.00 -1.13 -0.72 0.02
## mortality 0.00 1.00 1.00 1.36 -0.14 0.02
## follow_up 1.00 60.00 59.00 0.20 -0.81 0.82
# Peek at the free-text columns and count their distinct values
head(pak_data2[["others"]])
## [1] "no" "no" "LV dysfunction" "HTN"
## [5] "no" "PND, ORTHOPENIA"
head(pak_data2[["co"]])
## [1] "Chest pain," "Central Chest pain,"
## [3] "Chest pain,SOB, Cold sweating" "CENTRAL Chest pain,"
## [5] "Chest pain," "SOB FROM 1 DAY "
head(pak_data2[["diagnosis"]])
## [1] "EXT. ACUTE WALL M.I" "A/W M.I"
## [3] "AC I/W M.I (RV) RE. M.I " "I/W M.I"
## [5] "A/W M.I" "ACS, NSTEMI"
n_distinct(pak_data2[["cp"]])
## [1] 4
n_distinct(pak_data2[["co"]])
## [1] 37
n_distinct(pak_data2[["diagnosis"]])
## [1] 37
summary(pak_data2[["diagnosis"]])
## Length Class Mode
## 368 character character
pak_data2 %>% glimpse()
## Rows: 368
## Columns: 60
## $ age <int> 45, 51, 55, 55, 56, 56, 57, 57, 58, 58, 59, 60, 60, 6…
## $ age_group <chr> "41-50", "51-60", "51-60", "51-60", "51-60", "51-60",…
## $ gender <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ locality <dbl> 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,…
## $ marital_status <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ life_style <dbl> 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1,…
## $ sleep <dbl> 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1,…
## $ category <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,…
## $ depression <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ hyperlipi <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ smoking <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ family_history <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ f_history <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ diabetes <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,…
## $ htn <dbl> 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,…
## $ allergies <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ bp <dbl> 100.6, 90.6, 100.7, 160.1, 90.6, 140.7, 120.8, 100.6,…
## $ thrombolysis <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ bgr <int> 84, 135, 146, 146, 85, 166, 96, 84, 135, 146, 146, 85…
## $ b_urea <dbl> 28, 17, 37, 37, 78, 104, 42, 28, 17, 37, 37, 78, 104,…
## $ s_cr <dbl> 0.9, 0.7, 1.0, 1.0, 1.2, 4.0, 1.0, 0.9, 0.7, 1.0, 1.0…
## $ s_sodium <int> 138, 144, 137, 137, 139, 130, 146, 138, 144, 137, 137…
## $ s_potassium <dbl> 3.3, 4.7, 4.2, 4.2, 4.5, 5.3, 3.9, 3.3, 4.7, 4.2, 4.2…
## $ s_chloride <int> 107, 104, 103, 103, 112, 100, 100, 107, 104, 103, 103…
## $ c_p_k <int> 130, 163, 149, 149, 75, 322, 146, 130, 163, 149, 149,…
## $ ck_mb <int> 30, 30, 22, 22, 18, 52, 21, 30, 30, 22, 22, 18, 52, 2…
## $ esr <int> 11, 27, 19, 19, 13, 154, 25, 11, 27, 19, 19, 13, 154,…
## $ wbc <int> 9900, 15800, 7900, 7900, 6900, 13500, 7400, 9900, 158…
## $ rbc <dbl> 4.26, 5.74, 4.83, 4.83, 4.41, 3.90, 4.14, 4.26, 5.74,…
## $ hemoglobin <dbl> 11.6, 14.5, 14.1, 14.1, 12.3, 10.0, 11.7, 11.6, 14.5,…
## $ p_c_v <dbl> 0.34, 0.44, 0.42, 0.42, 0.36, 0.29, 0.36, 0.34, 0.44,…
## $ m_c_v <dbl> 79.7, 78.0, 87.0, 87.0, 82.0, 74.4, 87.0, 79.7, 78.0,…
## $ m_c_h <dbl> 27.2, 25.0, 29.0, 29.0, 27.0, 25.7, 28.0, 27.2, 25.0,…
## $ m_c_h_c <dbl> 0.34, 0.32, 0.33, 0.33, 0.33, 0.35, 0.32, 0.34, 0.32,…
## $ platelet_count <int> 265000, 287000, 183000, 183000, 211000, 288000, 39500…
## $ neutrophil <dbl> 0.70, 0.73, 0.60, 0.60, 0.71, 0.85, 0.63, 0.70, 0.73,…
## $ lympho <dbl> 0.25, 0.20, 0.33, 0.33, 0.25, 0.10, 0.31, 0.25, 0.20,…
## $ monocyte <dbl> 0.03, 0.04, 0.04, 0.04, 0.02, 0.03, 0.03, 0.03, 0.04,…
## $ eosino <int> 2, 3, 3, 3, 2, 2, 3, 2, 3, 3, 3, 2, 2, 3, 2, 3, 3, 3,…
## $ others <chr> "no", "no", "LV dysfunction", "HTN", "no", "PND, ORTH…
## $ co <chr> "Chest pain,", "Central Chest pain,", "Chest pain,SOB…
## $ diagnosis <chr> "EXT. ACUTE WALL M.I", "A/W M.I", "AC I/W M.I (RV) RE…
## $ hypersensitivity <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ cp <int> 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 3,…
## $ trestbps <int> 132, 130, 180, 128, 200, 134, 140, 130, 136, 170, 174…
## $ chol <int> 341, 305, 327, 205, 288, 409, 241, 236, 319, 225, 249…
## $ fbs <int> 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ restecg <int> 2, 0, 1, 1, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0,…
## $ thalach <int> 136, 142, 117, 130, 133, 150, 123, 174, 152, 146, 143…
## $ exang <int> 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,…
## $ oldpeak <dbl> 3.0, 1.2, 3.4, 2.0, 4.0, 1.9, 0.2, 0.0, 0.0, 2.8, 0.0…
## $ slope <int> 2, 2, 2, 2, 3, 2, 2, 2, 1, 2, 2, 2, 1, 1, 2, 3, 3, 2,…
## $ ca <int> 0, 0, 0, 1, 2, 2, 0, 1, 2, 2, 0, 2, 0, 0, 0, 2, 3, 1,…
## $ thal <int> 7, 7, 3, 7, 7, 7, 7, 3, 3, 6, 3, 7, 3, 3, 7, 3, 7, 7,…
## $ num <int> 2, 2, 2, 3, 3, 2, 1, 1, 3, 2, 1, 3, 1, 1, 1, 3, 3, 2,…
## $ sk <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ sk_react <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ reaction <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ mortality <int> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,…
## $ follow_up <int> 60, 15, 6, 52, 34, 32, 60, 3, 15, 6, 52, 34, 32, 12, …
# Frequency table and five-number summary of cp, then list all column names
table(pak_data2[["cp"]])
##
## 1 2 3 4
## 18 15 37 298
summary(pak_data2[["cp"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 4.000 4.000 3.671 4.000 4.000
names(pak_data2)
## [1] "age" "age_group" "gender" "locality"
## [5] "marital_status" "life_style" "sleep" "category"
## [9] "depression" "hyperlipi" "smoking" "family_history"
## [13] "f_history" "diabetes" "htn" "allergies"
## [17] "bp" "thrombolysis" "bgr" "b_urea"
## [21] "s_cr" "s_sodium" "s_potassium" "s_chloride"
## [25] "c_p_k" "ck_mb" "esr" "wbc"
## [29] "rbc" "hemoglobin" "p_c_v" "m_c_v"
## [33] "m_c_h" "m_c_h_c" "platelet_count" "neutrophil"
## [37] "lympho" "monocyte" "eosino" "others"
## [41] "co" "diagnosis" "hypersensitivity" "cp"
## [45] "trestbps" "chol" "fbs" "restecg"
## [49] "thalach" "exang" "oldpeak" "slope"
## [53] "ca" "thal" "num" "sk"
## [57] "sk_react" "reaction" "mortality" "follow_up"
# Chi-squared tests of association between each binary predictor and diabetes;
# tidy() flattens every test result into a one-row tibble
# gender
db_gender <- chisq.test(pak_data2$gender, pak_data2$diabetes) %>% tidy()
print(db_gender)
## # A tibble: 1 × 4
## statistic p.value parameter method
## <dbl> <dbl> <int> <chr>
## 1 86.4 1.46e-20 1 Pearson's Chi-squared test with Yates' continuit…
# smoking
db_smoking <- chisq.test(pak_data2$smoking, pak_data2$diabetes) %>% tidy()
print(db_smoking)
## # A tibble: 1 × 4
## statistic p.value parameter method
## <dbl> <dbl> <int> <chr>
## 1 26.5 0.000000261 1 Pearson's Chi-squared test with Yates' contin…
# locality
db_locality <- chisq.test(pak_data2$locality, pak_data2$diabetes) %>% tidy()
print(db_locality)
## # A tibble: 1 × 4
## statistic p.value parameter method
## <dbl> <dbl> <int> <chr>
## 1 3.49 0.0617 1 Pearson's Chi-squared test with Yates' continuity…
# depression
db_depression <- chisq.test(pak_data2$depression, pak_data2$diabetes) %>% tidy()
print(db_depression)
## # A tibble: 1 × 4
## statistic p.value parameter method
## <dbl> <dbl> <int> <chr>
## 1 0.104 0.747 1 Pearson's Chi-squared test with Yates' continuity…
library(Matrix)
## Warning: package 'Matrix' was built under R version 4.2.3
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
# Build one labelled string per chi-squared p-value, echo it, then stack the
# four strings into a one-column character matrix
g <- paste("Gender: ", db_gender$p.value)
print(g)
## [1] "Gender: 1.4568693974862e-20"
s <- paste("Smoking: ", db_smoking$p.value)
print(s)
## [1] "Smoking: 2.60884464598861e-07"
l <- paste("Locality: ", db_locality$p.value)
print(l)
## [1] "Locality: 0.0617224321212565"
d <- paste("Depression: ", db_depression$p.value)
print(d)
## [1] "Depression: 0.747334758966391"
rbind(g, s, l, d)
## [,1]
## g "Gender: 1.4568693974862e-20"
## s "Smoking: 2.60884464598861e-07"
## l "Locality: 0.0617224321212565"
## d "Depression: 0.747334758966391"
# Two-sample t-tests for the continuous variables: compare the mean of each
# variable between the two diabetes groups (0 vs 1). t.test() defaults to the
# Welch (unequal-variance) form, as the printed output confirms.
# age
db_age <- t.test(pak_data2$age ~ pak_data2$diabetes)
print(db_age)
##
## Welch Two Sample t-test
##
## data: pak_data2$age by pak_data2$diabetes
## t = -4.6313, df = 362.3, p-value = 5.073e-06
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -5.748912 -2.321914
## sample estimates:
## mean in group 0 mean in group 1
## 52.42929 56.46471
# bp: Welch t-test of mean bp between diabetes groups 0 and 1
db_bp <- t.test(pak_data2$bp ~ pak_data2$diabetes)
print(db_bp)
##
## Welch Two Sample t-test
##
## data: pak_data2$bp by pak_data2$diabetes
## t = -0.26399, df = 364.3, p-value = 0.7919
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -5.693527 4.345803
## sample estimates:
## mean in group 0 mean in group 1
## 120.9020 121.5759
# platelet_count: Welch t-test of mean platelet_count between diabetes groups 0 and 1
db_platelet_count <- t.test(pak_data2$platelet_count ~ pak_data2$diabetes)
print(db_platelet_count)
##
## Welch Two Sample t-test
##
## data: pak_data2$platelet_count by pak_data2$diabetes
## t = -3.826, df = 316.32, p-value = 0.000157
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -46465.84 -14905.64
## sample estimates:
## mean in group 0 mean in group 1
## 234484.8 265170.6
# trestbps: Welch t-test of mean trestbps between diabetes groups 0 and 1
db_trestbps <- t.test(pak_data2$trestbps ~ pak_data2$diabetes)
print(db_trestbps)
##
## Welch Two Sample t-test
##
## data: pak_data2$trestbps by pak_data2$diabetes
## t = -1.7871, df = 340.78, p-value = 0.0748
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -7.1856547 0.3441812
## sample estimates:
## mean in group 0 mean in group 1
## 131.1616 134.5824
# chol: Welch t-test of mean chol between diabetes groups 0 and 1
db_choles <- t.test(pak_data2$chol ~ pak_data2$diabetes)
print(db_choles)
##
## Welch Two Sample t-test
##
## data: pak_data2$chol by pak_data2$diabetes
## t = -1.6576, df = 315.17, p-value = 0.0984
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -19.323166 1.652102
## sample estimates:
## mean in group 0 mean in group 1
## 244.8586 253.6941
# thalach: Welch t-test of mean thalach between diabetes groups 0 and 1
db_thalach <- t.test(pak_data2$thalach ~ pak_data2$diabetes)
print(db_thalach)
##
## Welch Two Sample t-test
##
## data: pak_data2$thalach by pak_data2$diabetes
## t = 0.3394, df = 365.55, p-value = 0.7345
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -3.881886 5.501375
## sample estimates:
## mean in group 0 mean in group 1
## 141.2980 140.4882
# follow_up: Welch t-test of mean follow_up between diabetes groups 0 and 1
db_follow_up <- t.test(pak_data2$follow_up ~ pak_data2$diabetes)
print(db_follow_up)
##
## Welch Two Sample t-test
##
## data: pak_data2$follow_up by pak_data2$diabetes
## t = 0.91337, df = 333.37, p-value = 0.3617
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -1.764185 4.822533
## sample estimates:
## mean in group 0 mean in group 1
## 29.35859 27.82941
# Build one labelled string per t-test p-value, echo it, then stack the strings
# into a one-column character matrix. Labels name the variable actually tested:
# the thalach test is labelled "Thalach" (the dataset also has a separate
# `thal` column, so "Thal" was misleading) and "Cholesterol" is spelled out.
a <- print(paste("Age: ", db_age$p.value))
## [1] "Age: 5.07266425232931e-06"
bp <- print(paste("BP: ", db_bp$p.value))
## [1] "BP: 0.79193587394789"
pc <- print(paste("Platelet_Count: ", db_platelet_count$p.value))
## [1] "Platelet_Count: 0.000156959257819211"
trest <- print(paste("Resting: ", db_trestbps$p.value))
## [1] "Resting: 0.0748043082835425"
chol <- print(paste("Cholesterol: ", db_choles$p.value))
## [1] "Cholesterol: 0.0983974805094631"
thal <- print(paste("Thalach: ", db_thalach$p.value))
## [1] "Thalach: 0.734502881239856"
follow <- print(paste("Follow-Up: ", db_follow_up$p.value))
## [1] "Follow-Up: 0.361709498356924"
rbind(a, bp, pc, trest, chol, thal, follow)
## [,1]
## a "Age: 5.07266425232931e-06"
## bp "BP: 0.79193587394789"
## pc "Platelet_Count: 0.000156959257819211"
## trest "Resting: 0.0748043082835425"
## chol "Cholesterol: 0.0983974805094631"
## thal "Thalach: 0.734502881239856"
## follow "Follow-Up: 0.361709498356924"
# Graphical association analysis for the continuous variables.
# Add a readable group label for plotting. diabetes is the 0/1 outcome the
# logistic model predicts, so 1 is the diabetic group and 0 the non-diabetic
# group (the labels were previously swapped).
pak_data2 <- pak_data2 %>%
  mutate(diabetes_labelled = ifelse(diabetes == 1, "Diabetic", "Non_Diabetic"))
# One boxplot per continuous variable, split by diabetes group
# age
pak_data2 %>%
  ggplot(aes(x = diabetes_labelled, y = age)) +
  geom_boxplot()
# bp
pak_data2 %>%
  ggplot(aes(x = diabetes_labelled, y = bp)) +
  geom_boxplot()
# platelet_count
pak_data2 %>%
  ggplot(aes(x = diabetes_labelled, y = platelet_count)) +
  geom_boxplot()
# trestbps
pak_data2 %>%
  ggplot(aes(x = diabetes_labelled, y = trestbps)) +
  geom_boxplot()
# chol
pak_data2 %>%
  ggplot(aes(x = diabetes_labelled, y = chol)) +
  geom_boxplot()
# thalach
pak_data2 %>%
  ggplot(aes(x = diabetes_labelled, y = thalach)) +
  geom_boxplot()
# follow_up
pak_data2 %>%
  ggplot(aes(x = diabetes_labelled, y = follow_up)) +
  geom_boxplot()
# Graphical association analysis for the binary predictors: stacked bars
# rescaled to proportions (position = "fill") within each diabetes group.
# Each plot carries its own y-axis label and legend title (previously every
# plot was labelled "Gender %").
# gender
ggplot(data = pak_data2, aes(x = diabetes_labelled, fill = as.factor(gender))) +
  geom_bar(position = "fill") +
  ylab("Gender %") +
  scale_fill_discrete(name = "Gender", labels = c("Female", "Male"))
# smoking
ggplot(data = pak_data2, aes(x = diabetes_labelled, fill = as.factor(smoking))) +
  geom_bar(position = "fill") +
  ylab("Smoking %") +
  scale_fill_discrete(name = "Smoking", labels = c("Non-Smoker", "Smoker"))
# locality
ggplot(data = pak_data2, aes(x = diabetes_labelled, fill = as.factor(locality))) +
  geom_bar(position = "fill") +
  ylab("Locality %") +
  scale_fill_discrete(name = "Locality", labels = c("Rural", "Urban"))
# depression
ggplot(data = pak_data2, aes(x = diabetes_labelled, fill = as.factor(depression))) +
  geom_bar(position = "fill") +
  ylab("Depression %") +
  scale_fill_discrete(name = "Depression", labels = c("Not Depressed", "Depressed"))
library(tidyverse)
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(DT)
## Warning: package 'DT' was built under R version 4.2.3
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.2.3
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.2.3
library(lubridate)
library(scales)
##
## Attaching package: 'scales'
## The following objects are masked from 'package:psych':
##
## alpha, rescale
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(janitor)
library(RColorBrewer)
# Build the four plots that are later arranged in one grid
# age by diabetes group
dbage <- ggplot(data = pak_data2, aes(x = diabetes_labelled, y = age)) +
  geom_boxplot()
# platelet_count by diabetes group
dbpc <- ggplot(data = pak_data2, aes(x = diabetes_labelled, y = platelet_count)) +
  geom_boxplot()
# gender: geom_bar() without position = "fill" draws stacked counts, so the
# y-axis is labelled "Count" (it was "Gender %", applied twice)
dbgender <- ggplot(data = pak_data2, aes(x = diabetes_labelled, fill = as.factor(gender))) +
  geom_bar() +
  ylab("Count") +
  scale_fill_discrete(name = "Gender", labels = c("Female", "Male"))
# smoking: stacked counts as well (axis was mislabelled "Gender %")
dbsmoking <- ggplot(data = pak_data2, aes(x = diabetes_labelled, fill = as.factor(smoking))) +
  geom_bar() +
  ylab("Count") +
  scale_fill_discrete(name = "Smoking", labels = c("Non-Smoker", "Smoker"))
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.2.3
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Draw the four plots together in a single grid, then list the column names
grid.arrange(grobs = list(dbage, dbpc, dbgender, dbsmoking))
names(pak_data2)
## [1] "age" "age_group" "gender"
## [4] "locality" "marital_status" "life_style"
## [7] "sleep" "category" "depression"
## [10] "hyperlipi" "smoking" "family_history"
## [13] "f_history" "diabetes" "htn"
## [16] "allergies" "bp" "thrombolysis"
## [19] "bgr" "b_urea" "s_cr"
## [22] "s_sodium" "s_potassium" "s_chloride"
## [25] "c_p_k" "ck_mb" "esr"
## [28] "wbc" "rbc" "hemoglobin"
## [31] "p_c_v" "m_c_v" "m_c_h"
## [34] "m_c_h_c" "platelet_count" "neutrophil"
## [37] "lympho" "monocyte" "eosino"
## [40] "others" "co" "diagnosis"
## [43] "hypersensitivity" "cp" "trestbps"
## [46] "chol" "fbs" "restecg"
## [49] "thalach" "exang" "oldpeak"
## [52] "slope" "ca" "thal"
## [55] "num" "sk" "sk_react"
## [58] "reaction" "mortality" "follow_up"
## [61] "diabetes_labelled"
#Put variables into one model. The plots and statistical tests both confirmed that these 4 variables are highly significantly associated with the outcome (Diabetes). With a binary outcome variable and more than 2 predictor variables, a Logistic Regression model is used. For example, our objective is to know whether age, platelet count, gender and smoking predict who is likely to have diabetes. The glm() command is designed to perform generalized linear models (regressions) on binary outcome data, count data, probability data, proportion data, and many other data types. In our case, the outcome is binary, following a binomial distribution.
# Fit a logistic regression (binomial family) of diabetes on age, platelet
# count, gender and smoking; gender and smoking enter as factors
db_model <- glm(
  formula = diabetes ~ age + platelet_count + as.factor(gender) + as.factor(smoking),
  family = "binomial",
  data = pak_data2
)
# Print the coefficient table, deviances and AIC
summary(db_model)
##
## Call:
## glm(formula = diabetes ~ age + platelet_count + as.factor(gender) +
## as.factor(smoking), family = "binomial", data = pak_data2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3046 -0.8909 -0.6978 1.0215 1.7525
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.170e-01 1.087e+00 0.200 0.84173
## age 1.347e-02 1.501e-02 0.898 0.36942
## platelet_count 5.333e-06 1.722e-06 3.097 0.00195 **
## as.factor(gender)1 -2.834e+00 4.757e-01 -5.957 2.57e-09 ***
## as.factor(smoking)1 -1.800e-01 2.864e-01 -0.628 0.52978
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 508.02 on 367 degrees of freedom
## Residual deviance: 398.51 on 363 degrees of freedom
## AIC: 408.51
##
## Number of Fisher Scoring iterations: 5
#Extract the useful information from the model. It’s common practice in medical research to report the Odds Ratio (OR) to quantify how strongly the presence or absence of property A is associated with the presence or absence of the outcome B. When the OR is greater than 1, we say A is positively associated with outcome B (increases the Odds of having B). When the OR is less than 1, we say A is negatively associated with B (decreases the Odds of having B); an OR of 1 means no association.
The raw glm coefficient table (the ‘estimate’ column in the printed output) in R represents the log(Odds Ratios) of the outcome. Therefore, we need to convert the values to the original OR scale and calculate the corresponding 95% Confidence Interval (CI) of the estimated Odds Ratios when reporting results from a logistic regression.
# broom supplies tidy(), which turns the fitted model's coefficient table
# into a tibble with one row per term
library(broom)
clean_dbmodel <- db_model %>% tidy()
# Show the tidied coefficient table
clean_dbmodel
## # A tibble: 5 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.217 1.09 0.200 0.842
## 2 age 0.0135 0.0150 0.898 0.369
## 3 platelet_count 0.00000533 0.00000172 3.10 0.00195
## 4 as.factor(gender)1 -2.83 0.476 -5.96 0.00000000257
## 5 as.factor(smoking)1 -0.180 0.286 -0.628 0.530
# Convert the log-odds estimates to odds ratios and add a 95% confidence
# interval, computed on the log scale as estimate +/- 1.96 * std.error and
# then exponentiated
clean_dbmodel <- clean_dbmodel %>%
  mutate(
    OR = exp(estimate),
    lower_CI = exp(estimate - 1.96 * std.error),
    upper_CI = exp(estimate + 1.96 * std.error)
  )
# Show the extended coefficient table
clean_dbmodel
## # A tibble: 5 × 8
## term estimate std.error statistic p.value OR lower_CI upper_CI
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 2.17e-1 1.09e+0 0.200 8.42e-1 1.24 0.148 10.5
## 2 age 1.35e-2 1.50e-2 0.898 3.69e-1 1.01 0.984 1.04
## 3 platelet_count 5.33e-6 1.72e-6 3.10 1.95e-3 1.00 1.00 1.00
## 4 as.factor(gende… -2.83e+0 4.76e-1 -5.96 2.57e-9 0.0588 0.0231 0.149
## 5 as.factor(smoki… -1.80e-1 2.86e-1 -0.628 5.30e-1 0.835 0.476 1.46
#Predicted probabilities from our model
So far, we have built a logistic regression model and examined the model coefficients/ORs. We may wonder how we can use this model to predict a person’s likelihood of having diabetes given his/her age, platelet count, gender, and smoking status. Furthermore, we’d like to translate the predicted probability into a decision rule for clinical use by defining a cutoff value on the probability scale. In practice, when an individual comes in for a health check-up, the doctor would like to know that individual’s predicted probability of diabetes. To obtain it, we create a data frame called newdata, in which we include the desired values for our prediction.
# Predicted probability of the outcome for every row of pak_data2
# (type = "response" returns values on the probability scale).
pred_prob <- predict(db_model, newdata = pak_data2, type = "response")
# Decision rule: classify as 1 when the predicted probability reaches the
# 0.5 cutoff, otherwise 0, and store the decision in the main data frame.
pak_data2$pred_diabetes <- ifelse(test = pred_prob >= 0.5, yes = 1, no = 0)
# Describe one new case using the same predictor names the model was fit on.
newdata <- data.frame(
  age = 25,
  platelet_count = 300000,
  gender = 1,
  smoking = 0
)
# Predicted probability for the new case, printed to the console.
p_new <- predict(db_model, newdata = newdata, type = "response")
p_new
## 1
## 0.3362089
#Check the model performance metrics Are the predictions accurate? How well does the model fit our data? We are going to use some common metrics to evaluate the model performance. The most straightforward one is Accuracy, which is the proportion of the total number of predictions that were correct. Equivalently, we can calculate the classification error rate as 1 - accuracy. However, accuracy can be misleading when the response is rare (i.e., imbalanced response). Another popular metric, Area Under the ROC curve (AUC), has the advantage that it’s independent of the change in the proportion of responders. AUC ranges from 0 to 1. The closer it gets to 1, the better the model performance. Lastly, a confusion matrix is an N x N matrix, where N is the number of levels of the outcome. For the problem at hand, we have N = 2, and hence we get a 2 x 2 matrix. It cross-tabulates the predicted outcome levels against the true outcome levels.
# Load the Metrics package, which provides auc(), accuracy() and ce().
library(Metrics)
## Warning: package 'Metrics' was built under R version 4.2.3
# AUC is a ranking metric, so it is computed from the predicted probabilities
# (pred_prob, from predict(..., type = "response")) rather than from the 0/1
# labels produced by the 0.5 cutoff. Scoring the thresholded labels collapses
# the ROC curve to a single operating point and yields
# (sensitivity + specificity) / 2 instead of the area under the curve.
# The calls are namespace-qualified because the results are stored under the
# same names as the Metrics functions.
auc <- Metrics::auc(pak_data2$diabetes, pred_prob)
# Accuracy and classification error compare the true status with the
# thresholded decision, so they keep using pred_diabetes.
accuracy <- Metrics::accuracy(pak_data2$diabetes, pak_data2$pred_diabetes)
classification_error <- Metrics::ce(pak_data2$diabetes, pak_data2$pred_diabetes)
# Print the metrics.
# NOTE(review): the AUC value in the rendered output below came from the
# earlier label-based calculation; re-knit the document to refresh it.
print(paste("AUC= ", auc))
## [1] "AUC= 0.75668449197861"
print(paste("Accuracy= ", accuracy))
## [1] "Accuracy= 0.771739130434783"
print(paste("Classification Error= ", classification_error))
## [1] "Classification Error= 0.228260869565217"
# Confusion matrix: rows are the true status, columns the predicted status.
table(pak_data2$diabetes, pak_data2$pred_diabetes, dnn = c("True Status", "Predicted Status"))
## Predicted Status
## True Status 0 1
## 0 189 9
## 1 75 95