Ngày 1: Giới thiệu ngôn ngữ R

Việc 1. Cài đặt R và RStudio

https://www.rstudio.com/products/rstudio/download

Việc 2. Cài đặt các gói phân tích (packages)

#install.packages(c("tidyverse", "dplyr", "lessR", "table1", "compareGroups", "ggplot2", "gridExtra", "ggthemes", "GGally", "ggfortify", "DescTools", "simpleboot", "boot", "relaimpo", "carData", "rms", "caret", "BMA", "glmnet", "epiDisplay"), dependencies = TRUE)

Việc 3. Đọc dữ liệu vào R

# Read the CSV file directly from disk.
#   To look up the full path interactively, use file.choose():
#   t <- file.choose()
#   t
bw <- read.csv(
  "C:\\Thach\\VN trips\\2025_2May\\Phuong Nam Institute\\Datasets\\birthwt.csv"
)

# Đọc dữ liệu bằng lệnh "Import Dataset"

[ChatGPT]

Upload tập dữ liệu birthwt.csv vào ChatGPT.

PROMPT: “Bạn hãy giúp soạn lệnh R để đọc tập dữ liệu birthwt.csv vào R và gọi dữ liệu là bw2”

ChatGPT:

Dưới đây là lệnh R để đọc tập dữ liệu birthwt.csv và lưu vào đối tượng có tên là bw2:

# Read birthwt.csv into bw2. The path must be wrapped in straight ASCII
# quotes: typographic (curly) quotes are not string delimiters in R.
bw2 <- read.csv("path/to/birthwt.csv")

Việc 4. Thông tin về dữ liệu bw

4.1 Có bao nhiêu biến số và quan sát

# Dimensions of bw: number of rows first, then number of columns.
dim(bw)

## [1] 189  11

4.2 Liệt kê 6 quan sát đầu tiên

# Show the first six rows of bw.
head(bw, n = 6L)

##   id low age lwt race smoke ptl ht ui ftv  bwt
## 1 85   0  19 182    2     0   0  0  1   0 2523
## 2 86   0  33 155    3     0   0  0  0   3 2551
## 3 87   0  20 105    1     1   0  0  0   1 2557
## 4 88   0  21 108    1     1   0  0  1   2 2594
## 5 89   0  18 107    1     1   0  0  1   0 2600
## 6 91   0  21 124    3     0   0  0  0   0 2622

4.3 Liệt kê 6 quan sát cuối cùng

# Show the last six rows of bw.
tail(bw, n = 6L)

##     id low age lwt race smoke ptl ht ui ftv  bwt
## 184 78   1  14 101    3     1   1  0  0   0 2466
## 185 79   1  28  95    1     1   0  0  0   2 2466
## 186 81   1  14 100    3     0   0  0  0   2 2495
## 187 82   1  23  94    3     1   0  0  0   0 2495
## 188 83   1  17 142    2     0   0  1  0   0 2495
## 189 84   1  21 130    1     1   0  1  0   3 2495

[ChatGPT]

PROMPT: “Soạn lệnh R để liệt kê 6 quan sát cuối cùng của dữ liệu này”

# Show the last six rows of bw2.
tail(bw2, n = 6L)

Việc 5. Biên tập dữ liệu:

5.1 Tạo biến số mới mwt

# Add column mwt: lwt scaled by 0.453592, the pound-to-kilogram factor
# (presumably lwt is recorded in pounds -- confirm against the data dictionary).
bw[["mwt"]] <- bw[["lwt"]] * 0.453592
head(bw, n = 6L)

##   id low age lwt race smoke ptl ht ui ftv  bwt      mwt
## 1 85   0  19 182    2     0   0  0  1   0 2523 82.55374
## 2 86   0  33 155    3     0   0  0  0   3 2551 70.30676
## 3 87   0  20 105    1     1   0  0  0   1 2557 47.62716
## 4 88   0  21 108    1     1   0  0  1   2 2594 48.98794
## 5 89   0  18 107    1     1   0  0  1   0 2600 48.53434
## 6 91   0  21 124    3     0   0  0  0   0 2622 56.24541

5.2 Tạo biến số mới ethnicity

# Add column ethnicity by mapping the race codes to text labels in a single
# vectorised lookup: 1 -> "White", 2 -> "Black", 3 -> "Other".
# match() yields NA for any other code, so those rows get NA.
bw$ethnicity <- c("White", "Black", "Other")[match(bw$race, c(1, 2, 3))]
head(bw, n = 6L)

##   id low age lwt race smoke ptl ht ui ftv  bwt      mwt ethnicity
## 1 85   0  19 182    2     0   0  0  1   0 2523 82.55374     Black
## 2 86   0  33 155    3     0   0  0  0   3 2551 70.30676     Other
## 3 87   0  20 105    1     1   0  0  0   1 2557 47.62716     White
## 4 88   0  21 108    1     1   0  0  1   2 2594 48.98794     White
## 5 89   0  18 107    1     1   0  0  1   0 2600 48.53434     White
## 6 91   0  21 124    3     0   0  0  0   0 2622 56.24541     Other

[ChatGPT]

# Recode race (1, 2, 3) as a labelled factor stored in bw2$ethnicity.
# The labels use straight ASCII quotes; typographic quotes do not parse in R.
bw2$ethnicity <- factor(
  bw2$race,
  levels = c(1, 2, 3),
  labels = c("White", "Black", "Other")
)

5.3 Tạo tập dữ liệu bw1

# Build bw1 from the id, low and bwt columns of bw, then report its size.
bw1 <- bw[c("id", "low", "bwt")]
dim(bw1)

## [1] 189   3

# Show the first six rows of bw1.
head(bw1, n = 6L)

##   id low  bwt
## 1 85   0 2523
## 2 86   0 2551
## 3 87   0 2557
## 4 88   0 2594
## 5 89   0 2600
## 6 91   0 2622

5.4 Tạo tập dữ liệu bw3

# Build bw3 from the rows of bw where low equals 1; which() drops rows whose
# comparison is NA. Then report its size.
bw3 <- bw[which(bw$low == 1), , drop = FALSE]
dim(bw3)

## [1] 59 13

# Show the first six rows of bw3.
head(bw3, n = 6L)

##     id low age lwt race smoke ptl ht ui ftv  bwt      mwt ethnicity
## 131  4   1  28 120    3     1   1  0  1   0  709 54.43104     Other
## 132 10   1  29 130    1     0   0  0  1   2 1021 58.96696     White
## 133 11   1  34 187    2     1   0  1  0   0 1135 84.82170     Black
## 134 13   1  25 105    3     0   1  1  0   0 1330 47.62716     Other
## 135 15   1  25  85    3     0   0  0  1   0 1474 38.55532     Other
## 136 16   1  27 150    3     0   0  0  0   0 1588 68.03880     Other

5.5 Tạo tập dữ liệu bw4

# Build bw4 from the rows of bw where both low and smoke equal 1; which()
# drops rows whose condition is NA. Then report its size.
bw4 <- bw[which(bw$low == 1 & bw$smoke == 1), , drop = FALSE]
dim(bw4)

## [1] 30 13

# Show the first six rows of bw4.
head(bw4, n = 6L)

##     id low age lwt race smoke ptl ht ui ftv  bwt      mwt ethnicity
## 131  4   1  28 120    3     1   1  0  1   0  709 54.43104     Other
## 133 11   1  34 187    2     1   0  1  0   0 1135 84.82170     Black
## 140 20   1  21 165    1     1   0  1  0   1 1790 74.84268     White
## 141 22   1  32 105    1     1   0  0  0   0 1818 47.62716     White
## 142 23   1  19  91    1     1   2  0  1   0 1885 41.27687     White
## 145 26   1  25  92    1     1   0  0  0   0 1928 41.73046     White

5.6 [ChatGPT]

# Build bw5 from the rows of bw2 where both low and smoke equal 1; which()
# drops rows whose condition is NA.
bw5 <- bw2[which(bw2$low == 1 & bw2$smoke == 1), , drop = FALSE]

# Number of rows and columns of bw5.
dim(bw5)

hoặc

# Number of rows of bw5.
nrow(bw5)

# Number of columns of bw5.
ncol(bw5)

Việc 6. Sử dụng gói lessR

6.1 Vẽ biểu đồ phân bố histogram cân nặng của con

library(lessR)

## Warning: package 'lessR' was built under R version 4.3.3

## 
## lessR 4.3.9                         feedback: gerbing@pdx.edu 
## --------------------------------------------------------------
## > d <- Read("")   Read text, Excel, SPSS, SAS, or R data file
##   d is default data frame, data= in analysis routines optional
## 
## Many examples of reading, writing, and manipulating data, 
## graphics, testing means and proportions, regression, factor analysis,
## customization, and descriptive statistics from pivot tables
##   Enter: browseVignettes("lessR")
## 
## View lessR updates, now including time series forecasting
##   Enter: news(package="lessR")
## 
## Interactive data analysis
##   Enter: interact()

# Histogram of the bwt column of bw, with blue bars and custom axis titles.
Histogram(
  bwt,
  data = bw,
  fill = "blue",
  xlab = "Birthweight (g)",
  ylab = "Frequency"
)

## >>> Suggestions 
## bin_width: set the width of each bin 
## bin_start: set the start of the first bin 
## bin_end: set the end of the last bin 
## Histogram(bwt, density=TRUE)  # smoothed curve + histogram 
## Plot(bwt)  # Violin/Box/Scatterplot (VBS) plot 
## 
## --- bwt --- 
##  
##       n   miss       mean         sd        min        mdn        max 
##      189      0    2944.59     729.21     709.00    2977.00    4990.00 
## 
##   
## --- Outliers ---     from the box plot: 1 
##  
## Small        Large 
## -----        ----- 
##  709.0            
## 
## 
## Bin Width: 500 
## Number of Bins: 9 
##  
##          Bin  Midpnt  Count    Prop  Cumul.c  Cumul.p 
## ----------------------------------------------------- 
##   500 > 1000     750      1    0.01        1     0.01 
##  1000 > 1500    1250      4    0.02        5     0.03 
##  1500 > 2000    1750     14    0.07       19     0.10 
##  2000 > 2500    2250     40    0.21       59     0.31 
##  2500 > 3000    2750     38    0.20       97     0.51 
##  3000 > 3500    3250     45    0.24      142     0.75 
##  3500 > 4000    3750     38    0.20      180     0.95 
##  4000 > 4500    4250      7    0.04      187     0.99 
##  4500 > 5000    4750      2    0.01      189     1.00

[ChatGPT]

library(lessR)

# Histogram of bw2$bwt with an axis label and a title.
# Straight ASCII quotes replace the typographic quotes, which R cannot parse.
# NOTE(review): the lessR call that ran in this document passes the bare
# column name plus data = ...; confirm that lessR also accepts bw2$bwt.
Histogram(bw2$bwt, xlab = "Birth Weight", main = "Histogram of Birth Weight")

Nếu bạn muốn vẽ đẹp hơn một chút với màu sắc:

# Same histogram with light-blue bars and black bar borders.
# Straight ASCII quotes replace the typographic quotes, which R cannot parse.
Histogram(
  bw2$bwt,
  fill = "lightblue",
  color = "black",
  xlab = "Birth Weight",
  main = "Distribution of Birth Weight"
)

6.2 Vẽ biểu đồ thanh chủng tộc

# Bar chart of the ethnicity column of bw.
BarChart(ethnicity, data = bw)

## >>> Suggestions
## BarChart(ethnicity, horiz=TRUE)  # horizontal bar chart
## BarChart(ethnicity, fill="reds")  # red bars of varying lightness
## PieChart(ethnicity)  # doughnut (ring) chart
## Plot(ethnicity)  # bubble plot
## Plot(ethnicity, stat="count")  # lollipop plot 
## 
## --- ethnicity --- 
## 
## Missing Values: 0 
## 
##                Black  Other  White     Total 
## Frequencies:      26     67     96       189 
## Proportions:   0.138  0.354  0.508     1.000 
## 
## Chi-squared test of null hypothesis of equal probabilities 
##   Chisq = 39.270, df = 2, p-value = 0.000

[ChatGPT]

# Bar chart of bw2$ethnicity with a title and light-green bars.
# Straight ASCII quotes replace the typographic quotes, which R cannot parse.
BarChart(
  bw2$ethnicity,
  main = "Distribution of Ethnicity",
  fill = "lightgreen"
)

# Same chart, additionally requesting percentage labels.
# NOTE(review): confirm that "percent" is an accepted setting for `values`
# in the installed lessR version.
BarChart(
  bw2$ethnicity,
  main = "Distribution of Ethnicity",
  fill = "lightgreen",
  values = "percent"
)

Xoay nhãn trục hoành nếu cần:

# Vertical bar chart of bw2$ethnicity with a request to rotate labels.
# NOTE(review): `rotate.values` could not be confirmed as a BarChart argument;
# check the lessR documentation (axis-label rotation may be `rotate_x`).
BarChart(bw2$ethnicity, horiz = FALSE, rotate.values = TRUE)

6.3 Vẽ biểu đồ tương quan giữa cân nặng mẹ và cân nặng con

# Scatterplot of bwt against lwt from bw, with a fitted linear ("lm") line.
Plot(lwt, bwt, data = bw, fit = "lm")

## 
## >>> Suggestions  or  enter: style(suggest=FALSE)
## Plot(lwt, bwt, enhance=TRUE)  # many options
## Plot(lwt, bwt, color="red")  # exterior edge color of points
## Plot(lwt, bwt, out_cut=.10)  # label top 10% from center as outliers 
## 
## 
## >>> Pearson's product-moment correlation 
##  
## Number of paired values with neither missing, n = 189 
## Sample Correlation of lwt and bwt: r = 0.186 
##   
## Hypothesis Test of 0 Correlation:  t = 2.585,  df = 187,  p-value = 0.011 
## 95% Confidence Interval for Correlation:  0.044 to 0.320 
##   
## 
##  Line: b0 = 2369.62   b1 = 4.43    Fit: MSE = 516,155   Rsq = 0.034
##

[ChatGPT]

# Scatterplot of bwt against lwt from bw2 with axis labels and a title.
# Uses Plot(), the lessR scatterplot function that runs successfully elsewhere
# in this document, and straight ASCII quotes so that the call parses.
Plot(
  lwt, bwt,
  data = bw2,
  xlab = "Mother’s Weight (lwt)",
  ylab = "Birth Weight (bwt)",
  main = "Scatterplot of Mother’s Weight vs. Birth Weight"
)

Thêm đường hồi quy:

# Scatterplot of bwt against lwt from bw2 with a fitted linear line.
# Plot(..., fit = "lm") is the form that runs successfully elsewhere in this
# document; it replaces Scatter(..., fit.line = TRUE).
Plot(lwt, bwt, data = bw2, fit = "lm")

Thay đổi màu và kích thước điểm:

# Scatterplot with a fitted linear line, blue points and a point size of 2.
# Plot(..., fit = "lm") replaces Scatter(..., fit.line = TRUE), and straight
# ASCII quotes replace the typographic quotes, which R cannot parse.
Plot(lwt, bwt, data = bw2, fit = "lm", color = "blue", size = 2)

6.4 Vẽ biểu đồ tương quan giữa cân nặng mẹ và cân nặng con theo chủng tộc

# Scatterplot of bwt against lwt from bw, grouped by race, with a fitted
# linear ("lm") line.
Plot(lwt, bwt, by = race, data = bw, fit = "lm")

## 
## >>> Suggestions  or  enter: style(suggest=FALSE)
## Plot(lwt, bwt, enhance=TRUE)  # many options
## Plot(lwt, bwt, color="red")  # exterior edge color of points
## Plot(lwt, bwt, out_cut=.10)  # label top 10% from center as outliers 
## 
## race: 1   Line: b0 = 2442.4    b1 = 5.0    Fit: MSE = 514,066   Rsq = 0.040
##  
## race: 2   Line: b0 = 2363.2    b1 = 2.4    Fit: MSE = 415,264   Rsq = 0.023
##  
## race: 3   Line: b0 = 2070.8    b1 = 6.1    Fit: MSE = 505,570   Rsq = 0.045
##

[ChatGPT]

# Scatterplot of bwt against lwt from bw2, grouped by race, with axis labels
# and a title. Uses Plot(), the lessR scatterplot function that runs
# successfully elsewhere in this document, and straight ASCII quotes so that
# the call parses.
Plot(
  lwt, bwt,
  by = race,
  data = bw2,
  xlab = "Mother’s Weight (lwt)",
  ylab = "Birth Weight (bwt)",
  main = "Birth Weight vs. Mother’s Weight by Ethnicity"
)

Bạn có thể thêm đường hồi quy riêng cho từng nhóm:

# Scatterplot of bwt against lwt from bw2, grouped by ethnicity, with a fitted
# linear line. Plot(..., fit = "lm") is the form that runs successfully
# elsewhere in this document; it replaces Scatter(..., fit.line = TRUE).
Plot(lwt, bwt, by = ethnicity, data = bw2, fit = "lm")

Phân tích dữ liệu bằng ngôn ngữ R - Viện Phương Nam (10-15/5/2025)

Thach Tran

2025-04-20

Ngày 1: Giới thiệu ngôn ngữ R

Việc 1. Cài đặt R và RStudio

Việc 2. Cài đặt các gói phân tích (packages)

Việc 3. Đọc dữ liệu vào R

[ChatGPT]

Việc 4. Thông tin về dữ liệu bw

4.1 Có bao nhiêu biến số và quan sát

4.2 Liệt kê 6 quan sát đầu tiên

4.3 Liệt kê 6 quan sát cuối cùng

[ChatGPT]

Việc 5. Biên tập dữ liệu:

5.1 Tạo biến số mới mwt

5.2 Tạo biến số mới ethnicity

[ChatGPT]

5.3 Tạo tập dữ liệu bw1

5.4 Tạo tập dữ liệu bw3

5.5 Tạo tập dữ liệu bw4

5.6 [ChatGPT]

Việc 6. Sử dụng gói lessR

6.1 Vẽ biểu đồ phân bố histogram cân nặng của con

[ChatGPT]

6.2 Vẽ biểu đồ thanh chủng tộc

[ChatGPT]

6.3 Vẽ biểu đồ tương quan giữa cân nặng mẹ và cân nặng con

[ChatGPT]

6.4 Vẽ biểu đồ tương quan giữa cân nặng mẹ và cân nặng con theo chủng tộc

[ChatGPT]

Việc 7. Ghi lại tất cả các hàm/lệnh trên và chia sẻ lên tài khoản rpubs