#Ngày 1: Giới thiệu R #dữ liệu “birthwt.csv”

# Load the birth-weight data set into `bw`.
# The original script called file.choose() on its own line and discarded the
# result, which forces a file dialog on every run without affecting the path
# that is actually read. The dialog is now only a fallback: it opens when the
# hard-coded path does not exist and the session is interactive.
bw_path <- "D:\\HOC TAP\\TAP HUAN UNG DUNG AI TRONG PT DU LIEU SU DUNG R\\thuc hanh\\birthwt.csv"
if (!file.exists(bw_path) && interactive()) {
  bw_path <- file.choose()
}
bw <- read.csv(bw_path)

# Preview the first rows of the imported data.
head(bw)

##   id low age lwt race smoke ptl ht ui ftv  bwt
## 1 85   0  19 182    2     0   0  0  1   0 2523
## 2 86   0  33 155    3     0   0  0  0   3 2551
## 3 87   0  20 105    1     1   0  0  0   1 2557
## 4 88   0  21 108    1     1   0  0  1   2 2594
## 5 89   0  18 107    1     1   0  0  1   0 2600
## 6 91   0  21 124    3     0   0  0  0   0 2622

# Number of rows (observations) and columns (variables) in bw.
dim(bw)

## [1] 189  11

#5.1 Tạo biến số mới mwt là cân nặng của mẹ tính bằng kg #lwt có đơn vị là pound # Mục tiêu: hoán chuyển sang kg + lưu lại trong một biến mới mwt

# Mother's weight in kilograms: lwt is recorded in pounds, and one pound is
# 0.453592 kg.
bw[["mwt"]] <- bw[["lwt"]] * 0.453592

# Confirm that the new mwt column was appended.
head(bw)

##   id low age lwt race smoke ptl ht ui ftv  bwt      mwt
## 1 85   0  19 182    2     0   0  0  1   0 2523 82.55374
## 2 86   0  33 155    3     0   0  0  0   3 2551 70.30676
## 3 87   0  20 105    1     1   0  0  0   1 2557 47.62716
## 4 88   0  21 108    1     1   0  0  1   2 2594 48.98794
## 5 89   0  18 107    1     1   0  0  1   0 2600 48.53434
## 6 91   0  21 124    3     0   0  0  0   0 2622 56.24541

#5.2 Tạo biến số mới ethnicity là biến factor với điều kiện sau:Nếu race = 1 thì ethnicity = “White” #Nếu race = 2 thì ethnicity = “Black” #Nếu race = 3 thì ethnicity = “Other”

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.4.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Recode the numeric race codes 1, 2, 3 into a labelled factor column.
bw[["ethnicity"]] <- factor(
  bw[["race"]],
  levels = 1:3,
  labels = c("White", "Black", "Other")
)
head(bw)

##   id low age lwt race smoke ptl ht ui ftv  bwt      mwt ethnicity
## 1 85   0  19 182    2     0   0  0  1   0 2523 82.55374     Black
## 2 86   0  33 155    3     0   0  0  0   3 2551 70.30676     Other
## 3 87   0  20 105    1     1   0  0  0   1 2557 47.62716     White
## 4 88   0  21 108    1     1   0  0  1   2 2594 48.98794     White
## 5 89   0  18 107    1     1   0  0  1   0 2600 48.53434     White
## 6 91   0  21 124    3     0   0  0  0   0 2622 56.24541     Other

#5.3 Tạo 1 tập dữ liệu bw1 chỉ gồm 3 biến số id, low và bwt. Dữ liệu này có bao nhiêu biến số và quan sát?

# Keep only the id, low and bwt columns.
bw1 <- bw[c("id", "low", "bwt")]
head(bw1)

##   id low  bwt
## 1 85   0 2523
## 2 86   0 2551
## 3 87   0 2557
## 4 88   0 2594
## 5 89   0 2600
## 6 91   0 2622

#5.4 Tạo 1 tập dữ liệu bw2 chỉ gồm những thai phụ sinh con nhẹ cân (low = 1). Dữ liệu này có bao nhiêu biến số và quan sát?

# Rows where low equals 1. which() drops rows whose comparison is NA, which is
# also what subset() does.
bw2 <- bw[which(bw$low == 1), ]
head(bw2)

##     id low age lwt race smoke ptl ht ui ftv  bwt      mwt ethnicity
## 131  4   1  28 120    3     1   1  0  1   0  709 54.43104     Other
## 132 10   1  29 130    1     0   0  0  1   2 1021 58.96696     White
## 133 11   1  34 187    2     1   0  1  0   0 1135 84.82170     Black
## 134 13   1  25 105    3     0   1  1  0   0 1330 47.62716     Other
## 135 15   1  25  85    3     0   0  0  1   0 1474 38.55532     Other
## 136 16   1  27 150    3     0   0  0  0   0 1588 68.03880     Other

# Number of observations and variables in the low == 1 subset.
dim(bw2)

## [1] 59 13

#5.5 Tạo 1 tập dữ liệu bw3 chỉ gồm những thai phụ sinh con nhẹ cân (low = 1) và có hút thuốc trong lúc mang thai (smoke = 1). Dữ liệu này có bao nhiêu biến số và quan sát?

# Rows where both low and smoke equal 1 (both are still numeric 0/1 codes at
# this point in the script).
bw3 <- bw[which(bw$low == 1 & bw$smoke == 1), ]
head(bw3)

##     id low age lwt race smoke ptl ht ui ftv  bwt      mwt ethnicity
## 131  4   1  28 120    3     1   1  0  1   0  709 54.43104     Other
## 133 11   1  34 187    2     1   0  1  0   0 1135 84.82170     Black
## 140 20   1  21 165    1     1   0  1  0   1 1790 74.84268     White
## 141 22   1  32 105    1     1   0  0  0   0 1818 47.62716     White
## 142 23   1  19  91    1     1   2  0  1   0 1885 41.27687     White
## 145 26   1  25  92    1     1   0  0  0   0 1928 41.73046     White

# Number of observations and variables in the low == 1 and smoke == 1 subset.
dim(bw3)

## [1] 30 13

#————————–Việc 6. Phân tích mô tả: ————————————— #6.1 Mô tả đặc điểm tuổi của mẹ (age), cân nặng của mẹ (lwt) và cân nặng của con (bwt)

library(table1)

## Warning: package 'table1' was built under R version 4.4.3

## 
## Attaching package: 'table1'

## The following objects are masked from 'package:base':
## 
##     units, units<-

# Descriptive summary of age, lwt and bwt for the whole sample.
table1(~ age + lwt + bwt, data = bw)

## 	Overall (N=189)
## age
## Mean (SD)	23.2 (5.30)
## Median [Min, Max]	23.0 [14.0, 45.0]
## lwt
## Mean (SD)	130 (30.6)
## Median [Min, Max]	121 [80.0, 250]
## bwt
## Mean (SD)	2940 (729)
## Median [Min, Max]	2980 [709, 4990]

#6.2 Mô tả đặc điểm tuổi của mẹ (age), cân nặng của mẹ (lwt), tình trạng hút thuốc trong thai kỳ (smoke), chủng tộc (race), và cân nặng của con (bwt) theo tình trạng trẻ thiếu cân (low)

# First attempt at stratifying by low. low is still numeric here, so table1
# warns that the term after '|' should be a factor and heads the columns 0
# and 1.
table1(~ age + lwt + bwt | low, data = bw)

## Warning in table1.formula(~age + lwt + bwt | low, data = bw): Terms to the
## right of '|' in formula 'x' define table columns and are expected to be factors
## with meaningful labels.

## 	0 (N=130)	1 (N=59)	Overall (N=189)
## age
## Mean (SD)	23.7 (5.58)	22.3 (4.51)	23.2 (5.30)
## Median [Min, Max]	23.0 [14.0, 45.0]	22.0 [14.0, 34.0]	23.0 [14.0, 45.0]
## lwt
## Mean (SD)	133 (31.7)	122 (26.6)	130 (30.6)
## Median [Min, Max]	124 [85.0, 250]	120 [80.0, 200]	121 [80.0, 250]
## bwt
## Mean (SD)	3330 (478)	2100 (391)	2940 (729)
## Median [Min, Max]	3270 [2520, 4990]	2210 [709, 2500]	2980 [709, 4990]

# Chuyển thành biến factor

# Replace the numeric codes in low, smoke and race with labelled factors so
# that table1 prints readable headings.
bw[["low"]] <- factor(
  bw[["low"]],
  levels = 0:1,
  labels = c("Normal weight", "Low birth weight")
)
bw[["smoke"]] <- factor(
  bw[["smoke"]],
  levels = 0:1,
  labels = c("Non-smoker", "Smoker")
)
bw[["race"]] <- factor(
  bw[["race"]],
  levels = 1:3,
  labels = c("White", "Black", "Other")
)
head(bw)

##   id           low age lwt  race      smoke ptl ht ui ftv  bwt      mwt
## 1 85 Normal weight  19 182 Black Non-smoker   0  0  1   0 2523 82.55374
## 2 86 Normal weight  33 155 Other Non-smoker   0  0  0   3 2551 70.30676
## 3 87 Normal weight  20 105 White     Smoker   0  0  0   1 2557 47.62716
## 4 88 Normal weight  21 108 White     Smoker   0  0  1   2 2594 48.98794
## 5 89 Normal weight  18 107 White     Smoker   0  0  1   0 2600 48.53434
## 6 91 Normal weight  21 124 Other Non-smoker   0  0  0   0 2622 56.24541
##   ethnicity
## 1     Black
## 2     Other
## 3     White
## 4     White
## 5     White
## 6     Other

# Summary of age, lwt, smoke, race and bwt, stratified by the low factor.
table1(~ age + lwt + smoke + race + bwt | low, data = bw)

## 	Normal weight (N=130)	Low birth weight (N=59)	Overall (N=189)
## age
## Mean (SD)	23.7 (5.58)	22.3 (4.51)	23.2 (5.30)
## Median [Min, Max]	23.0 [14.0, 45.0]	22.0 [14.0, 34.0]	23.0 [14.0, 45.0]
## lwt
## Mean (SD)	133 (31.7)	122 (26.6)	130 (30.6)
## Median [Min, Max]	124 [85.0, 250]	120 [80.0, 200]	121 [80.0, 250]
## smoke
## Non-smoker	86 (66.2%)	29 (49.2%)	115 (60.8%)
## Smoker	44 (33.8%)	30 (50.8%)	74 (39.2%)
## race
## White	73 (56.2%)	23 (39.0%)	96 (50.8%)
## Black	15 (11.5%)	11 (18.6%)	26 (13.8%)
## Other	42 (32.3%)	25 (42.4%)	67 (35.4%)
## bwt
## Mean (SD)	3330 (478)	2100 (391)	2940 (729)
## Median [Min, Max]	3270 [2520, 4990]	2210 [709, 2500]	2980 [709, 4990]

#smoking

# Numeric 0/1 indicator for smoking during pregnancy.
# bw$smoke is a factor labelled "Non-smoker"/"Smoker" at this point, so the
# comparison has to use the "Smoker" label. The previous comparison against
# "Yes" never matched and produced 0 for every row, smokers included.
bw$smoking <- ifelse(bw$smoke == "Smoker", 1, 0)
head(bw)

##   id           low age lwt  race      smoke ptl ht ui ftv  bwt      mwt
## 1 85 Normal weight  19 182 Black Non-smoker   0  0  1   0 2523 82.55374
## 2 86 Normal weight  33 155 Other Non-smoker   0  0  0   3 2551 70.30676
## 3 87 Normal weight  20 105 White     Smoker   0  0  0   1 2557 47.62716
## 4 88 Normal weight  21 108 White     Smoker   0  0  1   2 2594 48.98794
## 5 89 Normal weight  18 107 White     Smoker   0  0  1   0 2600 48.53434
## 6 91 Normal weight  21 124 Other Non-smoker   0  0  0   0 2622 56.24541
##   ethnicity smoking
## 1     Black       0
## 2     Other       0
## 3     White       1
## 4     White       1
## 5     White       1
## 6     Other       0

# Biểu đồ phân bố

library(lessR)

## Warning: package 'lessR' was built under R version 4.4.3

## 
## lessR 4.4.3                         feedback: gerbing@pdx.edu 
## --------------------------------------------------------------
## > d <- Read("")  Read data file, many formats available, e.g., Excel
##   d is default data frame, data= in analysis routines optional
## 
## Many examples of reading, writing, and manipulating data, 
## graphics, testing means and proportions, regression, factor analysis,
## customization, forecasting, and aggregation from pivot tables
##   Enter: browseVignettes("lessR")
## 
## View lessR updates, now including time series forecasting
##   Enter: news(package="lessR")
## 
## Interactive data analysis
##   Enter: interact()

## 
## Attaching package: 'lessR'

## The following object is masked from 'package:table1':
## 
##     label

## The following objects are masked from 'package:dplyr':
## 
##     order_by, recode, rename

## The following object is masked from 'package:base':
## 
##     sort_by

# Histogram of bwt; the call also prints summary statistics and a bin table.
Histogram(bwt, data=bw)

## >>> Suggestions 
## bin_width: set the width of each bin 
## bin_start: set the start of the first bin 
## bin_end: set the end of the last bin 
## Histogram(bwt, density=TRUE)  # smoothed curve + histogram 
## Plot(bwt)  # Violin/Box/Scatterplot (VBS) plot 
## 
## --- bwt --- 
##  
##       n   miss       mean         sd        min        mdn        max 
##      189      0    2944.59     729.21     709.00    2977.00    4990.00 
##  
## 
##   
## --- Outliers ---     from the box plot: 1 
##  
## Small        Large 
## -----        ----- 
##  709.0            
## 
## 
## Bin Width: 500 
## Number of Bins: 9 
##  
##          Bin  Midpnt  Count    Prop  Cumul.c  Cumul.p 
## ----------------------------------------------------- 
##   500 > 1000     750      1    0.01        1     0.01 
##  1000 > 1500    1250      4    0.02        5     0.03 
##  1500 > 2000    1750     14    0.07       19     0.10 
##  2000 > 2500    2250     40    0.21       59     0.31 
##  2500 > 3000    2750     38    0.20       97     0.51 
##  3000 > 3500    3250     45    0.24      142     0.75 
##  3500 > 4000    3750     38    0.20      180     0.95 
##  4000 > 4500    4250      7    0.04      187     0.99 
##  4500 > 5000    4750      2    0.01      189     1.00 
##

# Same histogram of bwt with blue bars and explicit axis titles.
Histogram(
  bwt,
  data = bw,
  fill = "blue",
  xlab = "Birthweight (g)",
  ylab = "Frequency"
)

## >>> Suggestions 
## bin_width: set the width of each bin 
## bin_start: set the start of the first bin 
## bin_end: set the end of the last bin 
## Histogram(bwt, density=TRUE)  # smoothed curve + histogram 
## Plot(bwt)  # Violin/Box/Scatterplot (VBS) plot 
## 
## --- bwt --- 
##  
##       n   miss       mean         sd        min        mdn        max 
##      189      0    2944.59     729.21     709.00    2977.00    4990.00 
##  
## 
##   
## --- Outliers ---     from the box plot: 1 
##  
## Small        Large 
## -----        ----- 
##  709.0            
## 
## 
## Bin Width: 500 
## Number of Bins: 9 
##  
##          Bin  Midpnt  Count    Prop  Cumul.c  Cumul.p 
## ----------------------------------------------------- 
##   500 > 1000     750      1    0.01        1     0.01 
##  1000 > 1500    1250      4    0.02        5     0.03 
##  1500 > 2000    1750     14    0.07       19     0.10 
##  2000 > 2500    2250     40    0.21       59     0.31 
##  2500 > 3000    2750     38    0.20       97     0.51 
##  3000 > 3500    3250     45    0.24      142     0.75 
##  3500 > 4000    3750     38    0.20      180     0.95 
##  4000 > 4500    4250      7    0.04      187     0.99 
##  4500 > 5000    4750      2    0.01      189     1.00 
##

# Bar chart of race; the call also prints frequencies, proportions and a
# chi-squared test of equal probabilities.
BarChart(race, data=bw)

## >>> Suggestions
## BarChart(race, horiz=TRUE)  # horizontal bar chart
## BarChart(race, fill="reds")  # red bars of varying lightness
## PieChart(race)  # doughnut (ring) chart
## Plot(race)  # bubble plot
## Plot(race, stat="count")  # lollipop plot 
## 
## --- race --- 
## 
## Missing Values: 0 
## 
##                White  Black  Other     Total 
## Frequencies:      96     26     67       189 
## Proportions:   0.508  0.138  0.354     1.000 
## 
## Chi-squared test of null hypothesis of equal probabilities 
##   Chisq = 39.270, df = 2, p-value = 0.000

#Biểu đồ tương quan: Plot(x, y)

# Scatterplot of bwt against lwt; the call also prints the Pearson
# correlation and its test.
Plot(lwt, bwt, data = bw)

## 
## >>> Suggestions  or  enter: style(suggest=FALSE)
## Plot(lwt, bwt, enhance=TRUE)  # many options
## Plot(lwt, bwt, fill="skyblue")  # interior fill color of points
## Plot(lwt, bwt, fit="lm", fit_se=c(.90,.99))  # fit line, stnd errors
## Plot(lwt, bwt, MD_cut=6)  # Mahalanobis distance from center > 6 is an outlier 
## 
## 
## >>> Pearson's product-moment correlation 
##  
## Number of paired values with neither missing, n = 189 
## Sample Correlation of lwt and bwt: r = 0.186 
##   
## Hypothesis Test of 0 Correlation:  t = 2.585,  df = 187,  p-value = 0.011 
## 95% Confidence Interval for Correlation:  0.044 to 0.320 
##

# Same scatterplot with a fitted linear regression line (fit = "lm").
Plot(lwt, bwt, data = bw, fit = "lm")

## 
## 
## >>> Suggestions  or  enter: style(suggest=FALSE)
## Plot(lwt, bwt, enhance=TRUE)  # many options
## Plot(lwt, bwt, fill="skyblue")  # interior fill color of points
## Plot(lwt, bwt, out_cut=.10)  # label top 10% from center as outliers 
## 
## 
## >>> Pearson's product-moment correlation 
##  
## Number of paired values with neither missing, n = 189 
## Sample Correlation of lwt and bwt: r = 0.186 
##   
## Hypothesis Test of 0 Correlation:  t = 2.585,  df = 187,  p-value = 0.011 
## 95% Confidence Interval for Correlation:  0.044 to 0.320 
##   
## 
##   Line: b0 = 2369.624    b1 = 4.429    Linear Model MSE = 516,155.173   Rsq = 0.034
##

# Rebuild ethnicity from race.
# bw$race has already been converted to a factor labelled
# "White"/"Black"/"Other", so the former comparisons against the numeric codes
# 1, 2 and 3 never matched and the three assignments changed nothing. Deriving
# the column from the labels gives the intended result in one step.
bw$ethnicity <- factor(as.character(bw$race),
                       levels = c("White", "Black", "Other"))

# Scatterplot of bwt against lwt with a separate linear fit per ethnicity
# group.
Plot(lwt, bwt, by= ethnicity, fit="lm", data=bw)

## 
## 
## >>> Suggestions  or  enter: style(suggest=FALSE)
## Plot(lwt, bwt, enhance=TRUE)  # many options
## Plot(lwt, bwt, fill="skyblue")  # interior fill color of points
## Plot(lwt, bwt, out_cut=.10)  # label top 10% from center as outliers 
## 
## ethnicity: White  Line: b0 = 2442.418    b1 = 5.000    Linear Model MSE = 514,065.615   Rsq = 0.040
##  
## ethnicity: Black  Line: b0 = 2363.222    b1 = 2.428    Linear Model MSE = 415,263.548   Rsq = 0.023
##  
## ethnicity: Other  Line: b0 = 2070.778    b1 = 6.120    Linear Model MSE = 505,570.324   Rsq = 0.045
##

#———————–using table1 thống kê mô tả————————————————- # phân tích mô tả theo low

# bw$low was already converted to a labelled factor earlier in the script.
# Calling factor() again with levels = c(0, 1) on that factor matches none of
# the existing labels and would turn every value into NA. Convert only when
# the column still holds the raw 0/1 codes.
if (!is.factor(bw$low)) {
  bw$low <- factor(bw$low, levels = c(0, 1),
                   labels = c("Normal weight", "Low birth weight"))
}

# Ngày 1: Giới thiệu R
#
# pthao
#
# 2025-06-16
#
# Chuyển thành biến factor
#
# Biểu đồ phân bố