DAY 1

Let’s BEGIN

# packages (cài sẵn bên ngoài)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(lessR)

## 
## lessR 4.4.3                         feedback: gerbing@pdx.edu 
## --------------------------------------------------------------
## > d <- Read("")  Read data file, many formats available, e.g., Excel
##   d is default data frame, data= in analysis routines optional
## 
## Many examples of reading, writing, and manipulating data, 
## graphics, testing means and proportions, regression, factor analysis,
## customization, forecasting, and aggregation from pivot tables
##   Enter: browseVignettes("lessR")
## 
## View lessR updates, now including time series forecasting
##   Enter: news(package="lessR")
## 
## Interactive data analysis
##   Enter: interact()
## 
## 
## Attaching package: 'lessR'
## 
## The following objects are masked from 'package:dplyr':
## 
##     order_by, recode, rename
## 
## The following object is masked from 'package:base':
## 
##     sort_by

library(table1)

## 
## Attaching package: 'table1'
## 
## The following object is masked from 'package:lessR':
## 
##     label
## 
## The following objects are masked from 'package:base':
## 
##     units, units<-

library(compareGroups)
library(gridExtra)

## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine

library(simpleboot)

## Simple Bootstrap Routines (1.1-8)

library(boot)
library(relaimpo)

## Loading required package: MASS
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## Loading required package: survey
## Loading required package: grid
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loading required package: survival
## 
## Attaching package: 'survival'
## 
## The following object is masked from 'package:boot':
## 
##     aml
## 
## 
## Attaching package: 'survey'
## 
## The following object is masked from 'package:graphics':
## 
##     dotchart
## 
## Loading required package: mitools
## This is the global version of package relaimpo.
## 
## If you are a non-US user, a version with the interesting additional metric pmvd is available
## 
## from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.

library(caret)

## Loading required package: lattice
## 
## Attaching package: 'lattice'
## 
## The following object is masked from 'package:boot':
## 
##     melanoma
## 
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:survival':
## 
##     cluster
## 
## The following object is masked from 'package:purrr':
## 
##     lift

library(BMA)

## Loading required package: leaps
## Loading required package: robustbase
## 
## Attaching package: 'robustbase'
## 
## The following object is masked from 'package:survival':
## 
##     heart
## 
## The following object is masked from 'package:boot':
## 
##     salinity
## 
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.7-7)

library(car)

## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:boot':
## 
##     logit
## 
## The following objects are masked from 'package:lessR':
## 
##     bc, recode, sp
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some

library(epiDisplay)

## Loading required package: foreign
## Loading required package: nnet
## 
## Attaching package: 'epiDisplay'
## 
## The following object is masked from 'package:lattice':
## 
##     dotplot
## 
## The following object is masked from 'package:ggplot2':
## 
##     alpha

Đọc dữ liệu trong R

link = "C:\\Users\\ADMIN\\Downloads\\Khóa tập huấn ứng dụng AI vào phân tích dữ liệu\\birthwt.csv"

bw = read.csv(link)

# Có bao nhiêu biến số (variable) và quan sát (observation)

dim(bw)

## [1] 189  11

# Liệt kê 6 quan sát đầu tiên của dữ liệu.

head(bw)

##   id low age lwt race smoke ptl ht ui ftv  bwt
## 1 85   0  19 182    2     0   0  0  1   0 2523
## 2 86   0  33 155    3     0   0  0  0   3 2551
## 3 87   0  20 105    1     1   0  0  0   1 2557
## 4 88   0  21 108    1     1   0  0  1   2 2594
## 5 89   0  18 107    1     1   0  0  1   0 2600
## 6 91   0  21 124    3     0   0  0  0   0 2622

tail(bw)

##     id low age lwt race smoke ptl ht ui ftv  bwt
## 184 78   1  14 101    3     1   1  0  0   0 2466
## 185 79   1  28  95    1     1   0  0  0   2 2466
## 186 81   1  14 100    3     0   0  0  0   2 2495
## 187 82   1  23  94    3     1   0  0  0   0 2495
## 188 83   1  17 142    2     0   0  1  0   0 2495
## 189 84   1  21 130    1     1   0  1  0   3 2495

# Tạo biến số mới mwt là cân nặng của mẹ tính bằng kg

bw$mwt = bw$lwt*0.45 

# Tạo biến số mới ethnicity là biến factor với điều kiện sau: Nếu race = 1 thì ethnicity = “White”, Nếu race = 2 thì ethnicity = “Black”, Nếu race = 3 thì ethnicity = “Other”

bw$ethnicity [bw$race==1] = "White"
bw$ethnicity [bw$race==2] = "Black"
bw$ethnicity [bw$race==3] = "Others"

# Thay đổi thứ tự theo cách mình thích

bw$ethnicity = factor(bw$ethnicity, levels=c("White", "Black", "Others"))

# Tạo 1 tập dữ liệu bw1 chỉ gồm 3 biến số id, low và bwt. Dữ liệu này có bao nhiêu biến số và quan sát?
  
bw1 = bw[, c("id", "low", "bwt")] # chon tat ca cac dong nhung chi 3 bien so

bw1 = bw[, c(1, 2, 8:11)] # chon tat ca cac dong nhung cot 1, 2, 8-11

# Tạo 1 tập dữ liệu bw2 chỉ gồm những thai phụ có cân nặng thấp (low = 1). Dữ liệu này có bao nhiêu biến số và quan sát?
  
bw2 = subset(bw, low==1)

Phân tích

# Mô tả đặc điểm tuổi của mẹ (age), cân nặng của mẹ (lwt) và cân nặng của con (bwt)

table1(~age + mwt + bwt, data=bw)

	Overall (N=189)
age
Mean (SD)	23.2 (5.30)
Median [Min, Max]	23.0 [14.0, 45.0]
mwt
Mean (SD)	58.4 (13.8)
Median [Min, Max]	54.5 [36.0, 113]
bwt
Mean (SD)	2940 (729)
Median [Min, Max]	2980 [709, 4990]

table1(~age + mwt + bwt | low, data=bw)

	0 (N=130)	1 (N=59)	Overall (N=189)
age
Mean (SD)	23.7 (5.58)	22.3 (4.51)	23.2 (5.30)
Median [Min, Max]	23.0 [14.0, 45.0]	22.0 [14.0, 34.0]	23.0 [14.0, 45.0]
mwt
Mean (SD)	60.0 (14.3)	55.0 (12.0)	58.4 (13.8)
Median [Min, Max]	55.6 [38.3, 113]	54.0 [36.0, 90.0]	54.5 [36.0, 113]
bwt
Mean (SD)	3330 (478)	2100 (391)	2940 (729)
Median [Min, Max]	3270 [2520, 4990]	2210 [709, 2500]	2980 [709, 4990]

# Mô tả đặc điểm tuổi của mẹ (age), cân nặng của mẹ (lwt), tình trạng hút thuốc trong thai kỳ (smoke) , chủng tộc (race), và cân nặng của con (bwt) theo tình trạng trẻ thiếu cân (low)

bw$smoking = ifelse(bw$smoke==1, "Smoking", "No Smoking")
table1(~age + mwt + bwt + ethnicity + smoking | low, data=bw)

	0 (N=130)	1 (N=59)	Overall (N=189)
age
Mean (SD)	23.7 (5.58)	22.3 (4.51)	23.2 (5.30)
Median [Min, Max]	23.0 [14.0, 45.0]	22.0 [14.0, 34.0]	23.0 [14.0, 45.0]
mwt
Mean (SD)	60.0 (14.3)	55.0 (12.0)	58.4 (13.8)
Median [Min, Max]	55.6 [38.3, 113]	54.0 [36.0, 90.0]	54.5 [36.0, 113]
bwt
Mean (SD)	3330 (478)	2100 (391)	2940 (729)
Median [Min, Max]	3270 [2520, 4990]	2210 [709, 2500]	2980 [709, 4990]
ethnicity
White	73 (56.2%)	23 (39.0%)	96 (50.8%)
Black	15 (11.5%)	11 (18.6%)	26 (13.8%)
Others	42 (32.3%)	25 (42.4%)	67 (35.4%)
smoking
No Smoking	86 (66.2%)	29 (49.2%)	115 (60.8%)
Smoking	44 (33.8%)	30 (50.8%)	74 (39.2%)

Vẽ biểu đồ dùng lessR

# Phân bố bwt (cân nặng của trẻ em)

Histogram(bwt, xlab="Cân nặng của trẻ sơ sanh", ylab="Số bà mẹ", data=bw, fill = "navy")

## >>> Suggestions 
## bin_width: set the width of each bin 
## bin_start: set the start of the first bin 
## bin_end: set the end of the last bin 
## Histogram(bwt, density=TRUE)  # smoothed curve + histogram 
## Plot(bwt)  # Violin/Box/Scatterplot (VBS) plot 
## 
## --- bwt --- 
##  
##       n   miss       mean         sd        min        mdn        max 
##      189      0    2944.59     729.21     709.00    2977.00    4990.00 
##  
## 
##   
## --- Outliers ---     from the box plot: 1 
##  
## Small        Large 
## -----        ----- 
##  709.0            
## 
## 
## Bin Width: 500 
## Number of Bins: 9 
##  
##          Bin  Midpnt  Count    Prop  Cumul.c  Cumul.p 
## ----------------------------------------------------- 
##   500 > 1000     750      1    0.01        1     0.01 
##  1000 > 1500    1250      4    0.02        5     0.03 
##  1500 > 2000    1750     14    0.07       19     0.10 
##  2000 > 2500    2250     40    0.21       59     0.31 
##  2500 > 3000    2750     38    0.20       97     0.51 
##  3000 > 3500    3250     45    0.24      142     0.75 
##  3500 > 4000    3750     38    0.20      180     0.95 
##  4000 > 4500    4250      7    0.04      187     0.99 
##  4500 > 5000    4750      2    0.01      189     1.00 
##

# Phân bố biến số ethnicity

BarChart(ethnicity, ylab="Số bà mẹ", data=bw)

## >>> Suggestions
## BarChart(ethnicity, horiz=TRUE)  # horizontal bar chart
## BarChart(ethnicity, fill="reds")  # red bars of varying lightness
## PieChart(ethnicity)  # doughnut (ring) chart
## Plot(ethnicity)  # bubble plot
## Plot(ethnicity, stat="count")  # lollipop plot 
## 
## --- ethnicity --- 
## 
## Missing Values: 0 
## 
##                White  Black  Others     Total 
## Frequencies:      96     26      67       189 
## Proportions:   0.508  0.138   0.354     1.000 
## 
## Chi-squared test of null hypothesis of equal probabilities 
##   Chisq = 39.270, df = 2, p-value = 0.000

# Cân nặng của mẹ và trẻ 

Plot(mwt, bwt, xlab="Cân nặng của mẹ", ylab="Cân nặng của con", fit="lm", data=bw)

## 
## 
## >>> Suggestions  or  enter: style(suggest=FALSE)
## Plot(mwt, bwt, enhance=TRUE)  # many options
## Plot(mwt, bwt, color="red")  # exterior edge color of points
## Plot(mwt, bwt, MD_cut=6)  # Mahalanobis distance from center > 6 is an outlier 
## 
## 
## >>> Pearson's product-moment correlation 
##  
## Number of paired values with neither missing, n = 189 
## Sample Correlation of mwt and bwt: r = 0.186 
##   
## Hypothesis Test of 0 Correlation:  t = 2.585,  df = 187,  p-value = 0.011 
## 95% Confidence Interval for Correlation:  0.044 to 0.320 
##   
## 
##   Line: b0 = 2369.624    b1 = 9.842    Linear Model MSE = 516,155.173   Rsq = 0.034
##

Khóa tập huấn ứng dụng AI vào phân tích dữ liệu với thầy Nguyễn Văn Tuấn và thầy Trần Sơn Trạch

Đào Hoàng Bảo

2025-06-16

DAY 1

Let’s BEGIN

Đọc dữ liệu trong R

Phân tích

Vẽ biểu đồ dùng lessR