Gói lệnh liên quan

library(lessR)
## Warning: package 'lessR' was built under R version 4.4.3
## 
## lessR 4.4.3                         feedback: gerbing@pdx.edu 
## --------------------------------------------------------------
## > d <- Read("")  Read data file, many formats available, e.g., Excel
##   d is default data frame, data= in analysis routines optional
## 
## Many examples of reading, writing, and manipulating data, 
## graphics, testing means and proportions, regression, factor analysis,
## customization, forecasting, and aggregation from pivot tables
##   Enter: browseVignettes("lessR")
## 
## View lessR updates, now including time series forecasting
##   Enter: news(package="lessR")
## 
## Interactive data analysis
##   Enter: interact()
## 
## Attaching package: 'lessR'
## The following object is masked from 'package:base':
## 
##     sort_by
library(table1)
## 
## Attaching package: 'table1'
## The following object is masked from 'package:lessR':
## 
##     label
## The following objects are masked from 'package:base':
## 
##     units, units<-
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:lessR':
## 
##     order_by, recode, rename
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Đọc dữ liệu vào R

# Load the birth-weight data set; the first row of the CSV holds the column names.
# NOTE(review): the path is absolute and machine-specific, so this line only
# runs on a computer where that exact file exists.
bw <- read.csv(
  file = "C:/Users/lehoa/Desktop/thuế/học như một NCS/R với thầy Tuấn/birthwt.csv",
  header = TRUE
)

### 4.1: Number of observations (rows) and variables (columns)
c(nrow(bw), ncol(bw))
## [1] 189  11
### 4.2: First 6 observations
head(bw, n = 6)
##   id low age lwt race smoke ptl ht ui ftv  bwt
## 1 85   0  19 182    2     0   0  0  1   0 2523
## 2 86   0  33 155    3     0   0  0  0   3 2551
## 3 87   0  20 105    1     1   0  0  0   1 2557
## 4 88   0  21 108    1     1   0  0  1   2 2594
## 5 89   0  18 107    1     1   0  0  1   0 2600
## 6 91   0  21 124    3     0   0  0  0   0 2622
## Biên tập dữ liệu

### 5.1: Mother's weight in kilograms
# lwt is scaled by 0.453592 (the lb -> kg factor; presumably lwt is recorded
# in pounds -- confirm against the data dictionary) and rounded to 2 decimals.
bw[["mwt"]] <- round(bw[["lwt"]] * 0.453592, digits = 2)
head(bw, n = 6)
##   id low age lwt race smoke ptl ht ui ftv  bwt   mwt
## 1 85   0  19 182    2     0   0  0  1   0 2523 82.55
## 2 86   0  33 155    3     0   0  0  0   3 2551 70.31
## 3 87   0  20 105    1     1   0  0  0   1 2557 47.63
## 4 88   0  21 108    1     1   0  0  1   2 2594 48.99
## 5 89   0  18 107    1     1   0  0  1   0 2600 48.53
## 6 91   0  21 124    3     0   0  0  0   0 2622 56.25
### 5.2: Derive the ethnicity factor from race
# Race codes 1, 2, 3 are mapped, in that order, to "White", "Black", "Other".
bw[["ethnicity"]] <- factor(
  bw[["race"]],
  levels = 1:3,
  labels = c("White", "Black", "Other")
)

### 5.3: Data set bw1 (columns id, low and bwt only)
bw1 <- bw[c("id", "low", "bwt")]
dim(bw1)
## [1] 189   3
head(bw1, n = 20)
##     id low  bwt
## 1   85   0 2523
## 2   86   0 2551
## 3   87   0 2557
## 4   88   0 2594
## 5   89   0 2600
## 6   91   0 2622
## 7   92   0 2637
## 8   93   0 2637
## 9   94   0 2663
## 10  95   0 2665
## 11  96   0 2722
## 12  97   0 2733
## 13  98   0 2751
## 14  99   0 2750
## 15 100   0 2769
## 16 101   0 2769
## 17 102   0 2778
## 18 103   0 2782
## 19 104   0 2807
## 20 105   0 2821
### 5.4: bw2: only mothers whose infant was low birth weight (low == 1)
bw2 <- bw |> filter(low == 1)
dim(bw2)
## [1] 59 13
head(bw2, n = 20)
##    id low age lwt race smoke ptl ht ui ftv  bwt   mwt ethnicity
## 1   4   1  28 120    3     1   1  0  1   0  709 54.43     Other
## 2  10   1  29 130    1     0   0  0  1   2 1021 58.97     White
## 3  11   1  34 187    2     1   0  1  0   0 1135 84.82     Black
## 4  13   1  25 105    3     0   1  1  0   0 1330 47.63     Other
## 5  15   1  25  85    3     0   0  0  1   0 1474 38.56     Other
## 6  16   1  27 150    3     0   0  0  0   0 1588 68.04     Other
## 7  17   1  23  97    3     0   0  0  1   1 1588 44.00     Other
## 8  18   1  24 128    2     0   1  0  0   1 1701 58.06     Black
## 9  19   1  24 132    3     0   0  1  0   0 1729 59.87     Other
## 10 20   1  21 165    1     1   0  1  0   1 1790 74.84     White
## 11 22   1  32 105    1     1   0  0  0   0 1818 47.63     White
## 12 23   1  19  91    1     1   2  0  1   0 1885 41.28     White
## 13 24   1  25 115    3     0   0  0  0   0 1893 52.16     Other
## 14 25   1  16 130    3     0   0  0  0   1 1899 58.97     Other
## 15 26   1  25  92    1     1   0  0  0   0 1928 41.73     White
## 16 27   1  20 150    1     1   0  0  0   2 1928 68.04     White
## 17 28   1  21 200    2     0   0  0  1   2 1928 90.72     Black
## 18 29   1  24 155    1     1   1  0  0   0 1936 70.31     White
## 19 30   1  21 103    3     0   0  0  0   0 1970 46.72     Other
## 20 31   1  20 125    3     0   0  0  1   0 2055 56.70     Other
### 5.5: bw3: low birth weight AND mother smoked (low == 1 and smoke == 1)
# Comma-separated conditions in filter() are combined with a logical AND.
bw3 <- bw |> filter(low == 1, smoke == 1)
dim(bw3)
## [1] 30 13
head(bw3, n = 20)
##    id low age lwt race smoke ptl ht ui ftv  bwt   mwt ethnicity
## 1   4   1  28 120    3     1   1  0  1   0  709 54.43     Other
## 2  11   1  34 187    2     1   0  1  0   0 1135 84.82     Black
## 3  20   1  21 165    1     1   0  1  0   1 1790 74.84     White
## 4  22   1  32 105    1     1   0  0  0   0 1818 47.63     White
## 5  23   1  19  91    1     1   2  0  1   0 1885 41.28     White
## 6  26   1  25  92    1     1   0  0  0   0 1928 41.73     White
## 7  27   1  20 150    1     1   0  0  0   2 1928 68.04     White
## 8  29   1  24 155    1     1   1  0  0   0 1936 70.31     White
## 9  34   1  19 112    1     1   0  0  1   0 2084 50.80     White
## 10 35   1  26 117    1     1   1  0  0   0 2084 53.07     White
## 11 37   1  17 130    3     1   1  0  1   0 2125 58.97     Other
## 12 40   1  20 120    2     1   0  0  0   3 2126 54.43     Black
## 13 42   1  22 130    1     1   1  0  1   1 2187 58.97     White
## 14 44   1  20  80    3     1   0  0  1   0 2211 36.29     Other
## 15 45   1  17 110    1     1   0  0  0   0 2225 49.90     White
## 16 50   1  18 110    2     1   1  0  0   0 2296 49.90     Black
## 17 51   1  20 121    1     1   1  0  1   0 2296 54.88     White
## 18 56   1  31 102    1     1   1  0  0   1 2353 46.27     White
## 19 59   1  23 187    2     1   0  0  0   1 2367 84.82     Black
## 20 60   1  20 122    2     1   0  0  0   0 2381 55.34     Black
## 6.1: Descriptive summary of age, lwt and bwt for the whole sample
table1(x = ~ age + lwt + bwt, data = bw)
Overall
(N=189)
age
Mean (SD) 23.2 (5.30)
Median [Min, Max] 23.0 [14.0, 45.0]
lwt
Mean (SD) 130 (30.6)
Median [Min, Max] 121 [80.0, 250]
bwt
Mean (SD) 2940 (729)
Median [Min, Max] 2980 [709, 4990]
## 6.2: Descriptive summary stratified by "low"
# Text version of smoke: "yes" where smoke == 1, otherwise "no".
bw[["smoking"]] <- ifelse(bw[["smoke"]] == 1, "yes", "no")


# One column per value of low (converted to a factor inside the formula).
table1(
  x = ~ age + lwt + smoking + ethnicity + bwt | factor(low),
  data = bw
)
0
(N=130)
1
(N=59)
Overall
(N=189)
age
Mean (SD) 23.7 (5.58) 22.3 (4.51) 23.2 (5.30)
Median [Min, Max] 23.0 [14.0, 45.0] 22.0 [14.0, 34.0] 23.0 [14.0, 45.0]
lwt
Mean (SD) 133 (31.7) 122 (26.6) 130 (30.6)
Median [Min, Max] 124 [85.0, 250] 120 [80.0, 200] 121 [80.0, 250]
smoking
no 86 (66.2%) 29 (49.2%) 115 (60.8%)
yes 44 (33.8%) 30 (50.8%) 74 (39.2%)
ethnicity
White 73 (56.2%) 23 (39.0%) 96 (50.8%)
Black 15 (11.5%) 11 (18.6%) 26 (13.8%)
Other 42 (32.3%) 25 (42.4%) 67 (35.4%)
bwt
Mean (SD) 3330 (478) 2100 (391) 2940 (729)
Median [Min, Max] 3270 [2520, 4990] 2210 [709, 2500] 2980 [709, 4990]
## 7.1 Scatter plot of mother's weight against infant birth weight
# x axis: mwt (mother's weight, kg); y axis: bwt (birth weight, g).
plot(
  x = bw[["mwt"]],
  y = bw[["bwt"]],
  pch = 16,
  col = "blue",
  xlab = "Cân nặng mẹ (kg)",
  ylab = "Cân nặng trẻ lúc sinh (g)",
  main = "Mối liên hệ giữa cân nặng mẹ và cân nặng trẻ"
)

## 7.2 Histogram of infant birth weight (bwt)

# hist() draws raw counts per bin by default (freq = TRUE for equal-width
# bins), so the y axis is labelled as a frequency rather than a percentage.
hist(bw$bwt,
     main = "Phân bố cân nặng trẻ lúc sinh",
     xlab = "Cân nặng trẻ (g)",
     ylab = "Tần số (số trẻ)",
     col = "lightgreen", border = "white")

## 7.3 Histogram with lessR::Histogram(), overlaid with a normal curve

# lessR is already attached by the library(lessR) call at the top of the
# script, so it is not loaded a second time here.
# density = TRUE overlays a density curve on the histogram (lessR's own
# console hint suggests exactly this argument); type = "normal" asks for the
# fitted normal curve instead of the default general smoothed curve.
Histogram(bwt, data = bw, fill = "skyblue", density = TRUE, type = "normal")

## >>> Suggestions 
## bin_width: set the width of each bin 
## bin_start: set the start of the first bin 
## bin_end: set the end of the last bin 
## Histogram(bwt, density=TRUE)  # smoothed curve + histogram 
## Plot(bwt)  # Violin/Box/Scatterplot (VBS) plot 
## 
## --- bwt --- 
##  
##       n   miss       mean         sd        min        mdn        max 
##      189      0    2944.59     729.21     709.00    2977.00    4990.00 
##  
## 
##   
## --- Outliers ---     from the box plot: 1 
##  
## Small        Large 
## -----        ----- 
##  709.0            
## 
## 
## Bin Width: 500 
## Number of Bins: 9 
##  
##          Bin  Midpnt  Count    Prop  Cumul.c  Cumul.p 
## ----------------------------------------------------- 
##   500 > 1000     750      1    0.01        1     0.01 
##  1000 > 1500    1250      4    0.02        5     0.03 
##  1500 > 2000    1750     14    0.07       19     0.10 
##  2000 > 2500    2250     40    0.21       59     0.31 
##  2500 > 3000    2750     38    0.20       97     0.51 
##  3000 > 3500    3250     45    0.24      142     0.75 
##  3500 > 4000    3750     38    0.20      180     0.95 
##  4000 > 4500    4250      7    0.04      187     0.99 
##  4500 > 5000    4750      2    0.01      189     1.00 
## 
## 7.4 Bar charts for the categorical variables smoke and ethnicity

### Bar chart: smoking status
# Bars follow the sorted values of smoke, so the two labels and colours are
# applied in that order.
bw[["smoke"]] |>
  table() |>
  barplot(
    names.arg = c("Không hút", "Có hút"),
    col = c("gray70", "tomato"),
    main = "Tình trạng hút thuốc trong thai kỳ"
  )

### Bar chart: ethnicity
# One bar per level of the ethnicity factor, in level order.
bw[["ethnicity"]] |>
  table() |>
  barplot(
    col = c("orange", "lightblue", "gray"),
    main = "Phân bố sắc tộc"
  )

## 7.5 Side-by-side box plots: infant birth weight by smoking status

# One box per value of smoke; the group labels and fill colours are applied
# in the order the groups are drawn.
boxplot(
  formula = bwt ~ smoke,
  data = bw,
  names = c("Không hút", "Có hút"),
  col = c("green", "red"),
  xlab = "Hút thuốc",
  ylab = "Cân nặng trẻ (g)",
  main = "So sánh cân nặng trẻ theo tình trạng hút thuốc"
)