library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
library(table1)
## Warning: package 'table1' was built under R version 4.5.2
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.3.0
## ✔ purrr 1.0.4 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(gtsummary)
library(boot)
## Warning: package 'boot' was built under R version 4.5.2
library(lessR)
## Warning: package 'lessR' was built under R version 4.5.2
##
## lessR 4.5 feedback: gerbing@pdx.edu
## --------------------------------------------------------------
## > d <- Read("") Read data file, many formats available, e.g., Excel
## d is the default data frame, data= in analysis routines optional
##
## Many examples of reading, writing, and manipulating data, graphics,
## testing means and proportions, regression, factor analysis,
## customization, forecasting, and aggregation to pivot tables.
## Enter: browseVignettes("lessR")
##
## View lessR updates, now including modern time series forecasting
## and many, new Plotly interactive visualizations output. Most
## visualization functions are now reorganized to three functions:
## Chart(): type="bar", "pie", "radar", "bubble", "treemap", "icicle"
## X(): type="histogram", "density", "vbs" and more
## XY(): type="scatter" for a scatterplot, or "contour", "smooth"
## Most previous function calls still work, such as:
## BarChart(), Histogram, and Plot().
## Enter: news(package="lessR"), or ?Chart, ?X, or ?XY
## There is also Flows() for Sankey flow diagrams, see ?Flows
##
## Interactive data analysis for constructing visualizations.
## Enter: interact()
##
##
## Attaching package: 'lessR'
##
## The following objects are masked from 'package:dplyr':
##
## order_by, recode, rename
##
## The following object is masked from 'package:table1':
##
## label
# Load the stroke data set and convert categorical columns to factors:
# every character column, plus the coded columns hypertension, stroke,
# heart_disease and id (which read.csv imports as non-character values).
# `TRUE` is spelled out because `T` is an ordinary, reassignable variable,
# and across() replaces the superseded mutate_if().
th1 <- read.csv("Stroke Data.csv", header = TRUE) %>%
  mutate(
    across(where(is.character), factor),
    across(c(hypertension, stroke, heart_disease, id), as.factor)
  )
# 4.1 How many variables and observations does the data set have?
# dim() returns the number of rows (observations) and columns (variables).
dim(th1)
## [1] 5110 12
## The th1 data set has 5110 observations and 12 variables
# 4.2 List the first 10 observations of the data.
head(th1, n = 10)
## id gender age hypertension heart_disease ever_married work_type
## 1 9046 Male 67 0 1 Yes Private
## 2 51676 Female 61 0 0 Yes Self-employed
## 3 31112 Male 80 0 1 Yes Private
## 4 60182 Female 49 0 0 Yes Private
## 5 1665 Female 79 1 0 Yes Self-employed
## 6 56669 Male 81 0 0 Yes Private
## 7 53882 Male 74 1 1 Yes Private
## 8 10434 Female 69 0 0 No Private
## 9 27419 Female 59 0 0 Yes Private
## 10 60491 Female 78 0 0 Yes Private
## Residence_type avg_glucose_level bmi smoking_status stroke
## 1 Urban 228.69 36.6 formerly smoked 1
## 2 Rural 202.21 NA never smoked 1
## 3 Rural 105.92 32.5 never smoked 1
## 4 Urban 171.23 34.4 smokes 1
## 5 Rural 174.12 24.0 never smoked 1
## 6 Urban 186.21 29.0 formerly smoked 1
## 7 Rural 70.09 27.4 never smoked 1
## 8 Urban 94.39 22.8 never smoked 1
## 9 Rural 76.15 NA Unknown 1
## 10 Urban 58.57 24.2 Unknown 1
# 4.3 List the last 6 observations of the data (6 is also tail()'s default).
tail(th1, n = 6)
## id gender age hypertension heart_disease ever_married work_type
## 5105 14180 Female 13 0 0 No children
## 5106 18234 Female 80 1 0 Yes Private
## 5107 44873 Female 81 0 0 Yes Self-employed
## 5108 19723 Female 35 0 0 Yes Self-employed
## 5109 37544 Male 51 0 0 Yes Private
## 5110 44679 Female 44 0 0 Yes Govt_job
## Residence_type avg_glucose_level bmi smoking_status stroke
## 5105 Rural 103.08 18.6 Unknown 0
## 5106 Urban 83.75 NA never smoked 0
## 5107 Urban 125.20 40.0 never smoked 0
## 5108 Rural 82.99 30.6 never smoked 0
## 5109 Rural 166.29 25.6 formerly smoked 0
## 5110 Urban 85.28 26.2 Unknown 0
# 4.4 Summarise the data with summary(): level counts for factor columns,
# and quartiles, mean and NA count for numeric columns.
summary(th1)
## id gender age hypertension heart_disease
## 67 : 1 Female:2994 Min. : 0.08 0:4612 0:4834
## 77 : 1 Male :2115 1st Qu.:25.00 1: 498 1: 276
## 84 : 1 Other : 1 Median :45.00
## 91 : 1 Mean :43.23
## 99 : 1 3rd Qu.:61.00
## 121 : 1 Max. :82.00
## (Other):5104
## ever_married work_type Residence_type avg_glucose_level
## No :1757 children : 687 Rural:2514 Min. : 55.12
## Yes:3353 Govt_job : 657 Urban:2596 1st Qu.: 77.25
## Never_worked : 22 Median : 91.89
## Private :2925 Mean :106.15
## Self-employed: 819 3rd Qu.:114.09
## Max. :271.74
##
## bmi smoking_status stroke
## Min. :10.30 formerly smoked: 885 0:4861
## 1st Qu.:23.50 never smoked :1892 1: 249
## Median :28.10 smokes : 789
## Mean :28.89 Unknown :1544
## 3rd Qu.:33.10
## Max. :97.60
## NA's :201
# Show every column on its own line with its type and leading values.
glimpse(th1)
## Rows: 5,110
## Columns: 12
## $ id <fct> 9046, 51676, 31112, 60182, 1665, 56669, 53882, 10434…
## $ gender <fct> Male, Female, Male, Female, Female, Male, Male, Fema…
## $ age <dbl> 67, 61, 80, 49, 79, 81, 74, 69, 59, 78, 81, 61, 54, …
## $ hypertension <fct> 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1…
## $ heart_disease <fct> 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0…
## $ ever_married <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, Yes, Yes, Yes…
## $ work_type <fct> Private, Self-employed, Private, Private, Self-emplo…
## $ Residence_type <fct> Urban, Rural, Rural, Urban, Rural, Urban, Rural, Urb…
## $ avg_glucose_level <dbl> 228.69, 202.21, 105.92, 171.23, 174.12, 186.21, 70.0…
## $ bmi <dbl> 36.6, NA, 32.5, 34.4, 24.0, 29.0, 27.4, 22.8, NA, 24…
## $ smoking_status <fct> formerly smoked, never smoked, never smoked, smokes,…
## $ stroke <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
# Add `sex`, a factor that recodes gender as "0" (Female), "1" (Male) or
# "2" (Other); any gender value not listed here becomes NA.
th2 <- mutate(
  th1,
  sex = factor(case_when(
    gender == "Female" ~ "0",
    gender == "Male" ~ "1",
    gender == "Other" ~ "2"
  ))
)
# Add `bmi_cut`, a character label for the BMI category. case_when() checks
# the conditions in order and keeps the first match, so each step only needs
# its upper bound. Rows with a missing bmi match no condition and stay NA.
th3 <- th2 %>%
  mutate(
    bmi_cut = case_when(
      bmi < 18.5 ~ "underweight",
      bmi < 25 ~ "normal",
      bmi < 30 ~ "overweight",
      bmi >= 30 ~ "obese"
    )
  )
# Descriptive table of the demographic and clinical variables, with one
# column per stroke level plus an overall column.
table1(
  ~ age + gender + hypertension + heart_disease + ever_married +
    smoking_status + Residence_type + bmi + sex + work_type | stroke,
  data = th3
)
| | 0 (N=4861) | 1 (N=249) | Overall (N=5110) |
|---|---|---|---|
| age | |||
| Mean (SD) | 42.0 (22.3) | 67.7 (12.7) | 43.2 (22.6) |
| Median [Min, Max] | 43.0 [0.0800, 82.0] | 71.0 [1.32, 82.0] | 45.0 [0.0800, 82.0] |
| gender | |||
| Female | 2853 (58.7%) | 141 (56.6%) | 2994 (58.6%) |
| Male | 2007 (41.3%) | 108 (43.4%) | 2115 (41.4%) |
| Other | 1 (0.0%) | 0 (0%) | 1 (0.0%) |
| hypertension | |||
| 0 | 4429 (91.1%) | 183 (73.5%) | 4612 (90.3%) |
| 1 | 432 (8.9%) | 66 (26.5%) | 498 (9.7%) |
| heart_disease | |||
| 0 | 4632 (95.3%) | 202 (81.1%) | 4834 (94.6%) |
| 1 | 229 (4.7%) | 47 (18.9%) | 276 (5.4%) |
| ever_married | |||
| No | 1728 (35.5%) | 29 (11.6%) | 1757 (34.4%) |
| Yes | 3133 (64.5%) | 220 (88.4%) | 3353 (65.6%) |
| smoking_status | |||
| formerly smoked | 815 (16.8%) | 70 (28.1%) | 885 (17.3%) |
| never smoked | 1802 (37.1%) | 90 (36.1%) | 1892 (37.0%) |
| smokes | 747 (15.4%) | 42 (16.9%) | 789 (15.4%) |
| Unknown | 1497 (30.8%) | 47 (18.9%) | 1544 (30.2%) |
| Residence_type | |||
| Rural | 2400 (49.4%) | 114 (45.8%) | 2514 (49.2%) |
| Urban | 2461 (50.6%) | 135 (54.2%) | 2596 (50.8%) |
| bmi | |||
| Mean (SD) | 28.8 (7.91) | 30.5 (6.33) | 28.9 (7.85) |
| Median [Min, Max] | 28.0 [10.3, 97.6] | 29.7 [16.9, 56.6] | 28.1 [10.3, 97.6] |
| Missing | 161 (3.3%) | 40 (16.1%) | 201 (3.9%) |
| sex | |||
| 0 | 2853 (58.7%) | 141 (56.6%) | 2994 (58.6%) |
| 1 | 2007 (41.3%) | 108 (43.4%) | 2115 (41.4%) |
| 2 | 1 (0.0%) | 0 (0%) | 1 (0.0%) |
| work_type | |||
| children | 685 (14.1%) | 2 (0.8%) | 687 (13.4%) |
| Govt_job | 624 (12.8%) | 33 (13.3%) | 657 (12.9%) |
| Never_worked | 22 (0.5%) | 0 (0%) | 22 (0.4%) |
| Private | 2776 (57.1%) | 149 (59.8%) | 2925 (57.2%) |
| Self-employed | 754 (15.5%) | 65 (26.1%) | 819 (16.0%) |
# Stratified table restricted to the risk factors, including the BMI
# category; the result is stored in `a` and then printed.
a <- table1(
  ~ hypertension + heart_disease + smoking_status + bmi + bmi_cut | stroke,
  data = th3
)
a
| | 0 (N=4861) | 1 (N=249) | Overall (N=5110) |
|---|---|---|---|
| hypertension | |||
| 0 | 4429 (91.1%) | 183 (73.5%) | 4612 (90.3%) |
| 1 | 432 (8.9%) | 66 (26.5%) | 498 (9.7%) |
| heart_disease | |||
| 0 | 4632 (95.3%) | 202 (81.1%) | 4834 (94.6%) |
| 1 | 229 (4.7%) | 47 (18.9%) | 276 (5.4%) |
| smoking_status | |||
| formerly smoked | 815 (16.8%) | 70 (28.1%) | 885 (17.3%) |
| never smoked | 1802 (37.1%) | 90 (36.1%) | 1892 (37.0%) |
| smokes | 747 (15.4%) | 42 (16.9%) | 789 (15.4%) |
| Unknown | 1497 (30.8%) | 47 (18.9%) | 1544 (30.2%) |
| bmi | |||
| Mean (SD) | 28.8 (7.91) | 30.5 (6.33) | 28.9 (7.85) |
| Median [Min, Max] | 28.0 [10.3, 97.6] | 29.7 [16.9, 56.6] | 28.1 [10.3, 97.6] |
| Missing | 161 (3.3%) | 40 (16.1%) | 201 (3.9%) |
| bmi_cut | |||
| normal | 1208 (24.9%) | 35 (14.1%) | 1243 (24.3%) |
| obese | 1822 (37.5%) | 98 (39.4%) | 1920 (37.6%) |
| overweight | 1334 (27.4%) | 75 (30.1%) | 1409 (27.6%) |
| underweight | 336 (6.9%) | 1 (0.4%) | 337 (6.6%) |
| Missing | 161 (3.3%) | 40 (16.1%) | 201 (3.9%) |