#1. viec 1 #2. viec 2 #3. viec 3: doc du lieu birthwt.csv vao R va goi du lieu la “bw”

# Read birthwt.csv into the data frame `bw` with base R; the first row of the
# file holds the column names.
# "NA" is listed next to the single-space marker because supplying na.strings
# replaces read.csv's default, and " " alone would stop literal NA entries
# from being read as missing.
bw <- read.csv(
  "/Users/toansama/Documents/NCKH/PHAN TICH DU LIEU CO BAN - SIS CAN THO - 2-6:1:2025/DỮ LIỆU ĐÍNH KÈM BÀI TẬP/birthwt.csv",
  header = TRUE,
  na.strings = c("NA", " ")
)
library(lessR)
## 
## lessR 4.3.9                         feedback: gerbing@pdx.edu 
## --------------------------------------------------------------
## > d <- Read("")   Read text, Excel, SPSS, SAS, or R data file
##   d is default data frame, data= in analysis routines optional
## 
## Many examples of reading, writing, and manipulating data, 
## graphics, testing means and proportions, regression, factor analysis,
## customization, and descriptive statistics from pivot tables
##   Enter: browseVignettes("lessR")
## 
## View lessR updates, now including time series forecasting
##   Enter: news(package="lessR")
## 
## Interactive data analysis
##   Enter: interact()
## 
## Attaching package: 'lessR'
## The following object is masked from 'package:base':
## 
##     sort_by
# Load the same CSV again through lessR's Read(), which assigns a fresh `bw`
# and prints a per-variable overview (type, counts, missing, unique values).
bw <- Read(
  "/Users/toansama/Documents/NCKH/PHAN TICH DU LIEU CO BAN - SIS CAN THO - 2-6:1:2025/DỮ LIỆU ĐÍNH KÈM BÀI TẬP/birthwt.csv"
)
## 
## >>> Suggestions
## Recommended binary format for data files: feather
##   Create with Write(d, "your_file", format="feather")
## To read a csv or Excel file of variable labelsvar_labels=TRUE
##   Each row of the file:  Variable Name, Variable Label
## Read into a data frame named l  (the letter el)
## 
## More details about your data, Enter:  details()  for d, or  details(name)
## 
## Data Types
## ------------------------------------------------------------
## integer: Numeric data values, integers only
## ------------------------------------------------------------
## 
##     Variable                  Missing  Unique 
##         Name     Type  Values  Values  Values   First and last values
## ------------------------------------------------------------------------------------------
##  1        id   integer    189       0     189   85  86  87 ... 82  83  84
##  2       low   integer    189       0       2   0  0  0 ... 1  1  1
##  3       age   integer    189       0      24   19  33  20 ... 23  17  21
##  4       lwt   integer    189       0      75   182  155  105 ... 94  142  130
##  5      race   integer    189       0       3   2  3  1 ... 3  2  1
##  6     smoke   integer    189       0       2   0  0  1 ... 1  0  1
##  7       ptl   integer    189       0       4   0  0  0 ... 0  0  0
##  8        ht   integer    189       0       2   0  0  0 ... 0  1  1
##  9        ui   integer    189       0       2   1  0  0 ... 0  0  0
## 10       ftv   integer    189       0       6   0  3  1 ... 0  0  3
## 11       bwt   integer    189       0     131   2523  2551  2557 ... 2495  2495  2495
## ------------------------------------------------------------------------------------------

#4. viec 4: thong tin ve du lieu bw ##4.1. variable and observation

# Size of bw: number of observations (rows) followed by number of variables
# (columns).
dim(bw)
## [1] 189  11

##4.2. 6 quan sat dau tien

# Show the first six observations of bw.
head(bw, n = 6)
##   id low age lwt race smoke ptl ht ui ftv  bwt
## 1 85   0  19 182    2     0   0  0  1   0 2523
## 2 86   0  33 155    3     0   0  0  0   3 2551
## 3 87   0  20 105    1     1   0  0  0   1 2557
## 4 88   0  21 108    1     1   0  0  1   2 2594
## 5 89   0  18 107    1     1   0  0  1   0 2600
## 6 91   0  21 124    3     0   0  0  0   0 2622

##4.3. 6 quan sat cuoi

# Show the last six observations of bw.
tail(bw, n = 6)
##     id low age lwt race smoke ptl ht ui ftv  bwt
## 184 78   1  14 101    3     1   1  0  0   0 2466
## 185 79   1  28  95    1     1   0  0  0   2 2466
## 186 81   1  14 100    3     0   0  0  0   2 2495
## 187 82   1  23  94    3     1   0  0  0   0 2495
## 188 83   1  17 142    2     0   0  1  0   0 2495
## 189 84   1  21 130    1     1   0  1  0   3 2495
# Same call without the row count: the printed rows match the tail(bw,6)
# output above.
tail(bw)
##     id low age lwt race smoke ptl ht ui ftv  bwt
## 184 78   1  14 101    3     1   1  0  0   0 2466
## 185 79   1  28  95    1     1   0  0  0   2 2466
## 186 81   1  14 100    3     0   0  0  0   2 2495
## 187 82   1  23  94    3     1   0  0  0   0 2495
## 188 83   1  17 142    2     0   0  1  0   0 2495
## 189 84   1  21 130    1     1   0  1  0   3 2495
# Frequency of the low-birth-weight indicator (low == 1 marks a low-weight
# birth).
table(bw$low)
## 
##   0   1 
## 130  59
# Overall proportion of low-weight births, computed from the data rather than
# from counts copied by hand out of the table above.
mean(bw$low == 1)
## [1] 0.3121693
# Cross-tabulation: rows are low (0/1), columns are smoke (0/1).
table(bw$low, bw$smoke)
##    
##      0  1
##   0 86 44
##   1 29 30
# Proportion of low-weight births among non-smokers (smoke == 0).
mean(bw$low[bw$smoke == 0] == 1)
## [1] 0.2521739
# Proportion of low-weight births among smokers (smoke == 1).
mean(bw$low[bw$smoke == 1] == 1)
## [1] 0.4054054

#5. viec 5: bien tap du lieu ##5.1. tao bien mwt: can nang cua me theo kg

# Mother's weight in kilograms: lwt scaled by 0.454.
# NOTE(review): presumably lwt is recorded in pounds — confirm against the
# data dictionary.
bw[["mwt"]] <- 0.454 * bw[["lwt"]]
head(bw)
##   id low age lwt race smoke ptl ht ui ftv  bwt    mwt
## 1 85   0  19 182    2     0   0  0  1   0 2523 82.628
## 2 86   0  33 155    3     0   0  0  0   3 2551 70.370
## 3 87   0  20 105    1     1   0  0  0   1 2557 47.670
## 4 88   0  21 108    1     1   0  0  1   2 2594 49.032
## 5 89   0  18 107    1     1   0  0  1   0 2600 48.578
## 6 91   0  21 124    3     0   0  0  0   0 2622 56.296

##5.2. tao bien ethnicity la bien factor voi dk

# Build `ethnicity` as a factor (not a plain character column) from the
# integer race codes: 1 = white, 2 = black, 3 = other.
# Levels are listed alphabetically so tables and charts show the groups in
# black / other / white order. Any race value outside 1-3 becomes NA.
bw$ethnicity <- factor(
  bw$race,
  levels = c(2, 3, 1),
  labels = c("black", "other", "white")
)
head(bw)
##   id low age lwt race smoke ptl ht ui ftv  bwt    mwt ethnicity
## 1 85   0  19 182    2     0   0  0  1   0 2523 82.628     black
## 2 86   0  33 155    3     0   0  0  0   3 2551 70.370     other
## 3 87   0  20 105    1     1   0  0  0   1 2557 47.670     white
## 4 88   0  21 108    1     1   0  0  1   2 2594 49.032     white
## 5 89   0  18 107    1     1   0  0  1   0 2600 48.578     white
## 6 91   0  21 124    3     0   0  0  0   0 2622 56.296     other

##5.3. tao bien smoking la bien numeric voi dk

# Text label for smoking status: "yes" when smoke is 1, "no" otherwise.
# NOTE(review): the section heading calls this variable numeric, but the
# result is a character column — confirm which one the exercise intends.
bw$smoking <- ifelse(bw$smoke != 1, "no", "yes")
head(bw)
##   id low age lwt race smoke ptl ht ui ftv  bwt    mwt ethnicity smoking
## 1 85   0  19 182    2     0   0  0  1   0 2523 82.628     black      no
## 2 86   0  33 155    3     0   0  0  0   3 2551 70.370     other      no
## 3 87   0  20 105    1     1   0  0  0   1 2557 47.670     white     yes
## 4 88   0  21 108    1     1   0  0  1   2 2594 49.032     white     yes
## 5 89   0  18 107    1     1   0  0  1   0 2600 48.578     white     yes
## 6 91   0  21 124    3     0   0  0  0   0 2622 56.296     other      no

##5.4. tao bw1 gom 3 bien id, low, bwt. co bao nhieu bien va quan sat

# bw1 keeps all observations but only three columns: the identifier, the
# low-birth-weight flag and the birth weight.
bw1 <- bw[c("id", "low", "bwt")]
head(bw1)
##   id low  bwt
## 1 85   0 2523
## 2 86   0 2551
## 3 87   0 2557
## 4 88   0 2594
## 5 89   0 2600
## 6 91   0 2622
# Rows and columns of the reduced data frame.
dim(bw1)
## [1] 189   3

##5.5 tao bw2 chi gom thai phu co can nang thap. co bao nhieu bien va quan sat

# bw2 holds only the observations flagged low == 1; which() drops rows where
# the comparison is NA, as subset() does.
bw2 <- bw[which(bw$low == 1), ]
head(bw2)
##     id low age lwt race smoke ptl ht ui ftv  bwt    mwt ethnicity smoking
## 131  4   1  28 120    3     1   1  0  1   0  709 54.480     other     yes
## 132 10   1  29 130    1     0   0  0  1   2 1021 59.020     white      no
## 133 11   1  34 187    2     1   0  1  0   0 1135 84.898     black     yes
## 134 13   1  25 105    3     0   1  1  0   0 1330 47.670     other      no
## 135 15   1  25  85    3     0   0  0  1   0 1474 38.590     other      no
## 136 16   1  27 150    3     0   0  0  0   0 1588 68.100     other      no
# Rows and columns of the low-birth-weight subset.
dim(bw2)
## [1] 59 14

##5.6. tao bw3 gom thai phu co can nang thap va co hut thuoc. co bao nhieu bien va quan sat

# bw3 holds the observations that are both flagged low == 1 and have
# smoke == 1; rows where the combined condition is NA are dropped.
bw3 <- bw[which(bw$low == 1 & bw$smoke == 1), ]
head(bw3)
##     id low age lwt race smoke ptl ht ui ftv  bwt    mwt ethnicity smoking
## 131  4   1  28 120    3     1   1  0  1   0  709 54.480     other     yes
## 133 11   1  34 187    2     1   0  1  0   0 1135 84.898     black     yes
## 140 20   1  21 165    1     1   0  1  0   1 1790 74.910     white     yes
## 141 22   1  32 105    1     1   0  0  0   0 1818 47.670     white     yes
## 142 23   1  19  91    1     1   2  0  1   0 1885 41.314     white     yes
## 145 26   1  25  92    1     1   0  0  0   0 1928 41.768     white     yes
# Rows and columns of the low-birth-weight smokers subset.
dim(bw3)
## [1] 30 14

#6. phan tich mo ta

##6.1. mo ta age, lwt, bwt

library(table1)
## 
## Attaching package: 'table1'
## The following object is masked from 'package:lessR':
## 
##     label
## The following objects are masked from 'package:base':
## 
##     units, units<-
# Descriptive table for age, lwt and bwt over the whole sample: mean (SD) and
# median [min, max] for each variable.
table1(~age+lwt+bwt,data=bw)
Overall
(N=189)
age
Mean (SD) 23.2 (5.30)
Median [Min, Max] 23.0 [14.0, 45.0]
lwt
Mean (SD) 130 (30.6)
Median [Min, Max] 121 [80.0, 250]
bwt
Mean (SD) 2940 (729)
Median [Min, Max] 2980 [709, 4990]

##6.2. mo ta age, lwt, bwt theo low

# Same descriptive table, split into one column per value of low (0 and 1)
# plus an Overall column.
table1(~age+lwt+bwt|low,data=bw)
0
(N=130)
1
(N=59)
Overall
(N=189)
age
Mean (SD) 23.7 (5.58) 22.3 (4.51) 23.2 (5.30)
Median [Min, Max] 23.0 [14.0, 45.0] 22.0 [14.0, 34.0] 23.0 [14.0, 45.0]
lwt
Mean (SD) 133 (31.7) 122 (26.6) 130 (30.6)
Median [Min, Max] 124 [85.0, 250] 120 [80.0, 200] 121 [80.0, 250]
bwt
Mean (SD) 3330 (478) 2100 (391) 2940 (729)
Median [Min, Max] 3270 [2520, 4990] 2210 [709, 2500] 2980 [709, 4990]

##6.3. mo ta low, smoke, race. ket qua the nao

# low, smoke and race are stored as integer codes, so table1 reports a mean
# (SD) and median for each of them, which is not meaningful for categories.
# Wrapping the variables in factor() gives counts and percentages instead.
table1(~low+smoke+race,data=bw)
Overall
(N=189)
low
Mean (SD) 0.312 (0.465)
Median [Min, Max] 0 [0, 1.00]
smoke
Mean (SD) 0.392 (0.489)
Median [Min, Max] 0 [0, 1.00]
race
Mean (SD) 1.85 (0.918)
Median [Min, Max] 1.00 [1.00, 3.00]

##sua lai: dung factor() de mo ta race, smoke, low nhu bien phan loai

# Treat the integer codes as categories: factor() makes table1 report counts
# and percentages for race and smoke, split by the low indicator.
table1(~factor(race)+factor(smoke)|factor(low),data=bw)
0
(N=130)
1
(N=59)
Overall
(N=189)
factor(race)
1 73 (56.2%) 23 (39.0%) 96 (50.8%)
2 15 (11.5%) 11 (18.6%) 26 (13.8%)
3 42 (32.3%) 25 (42.4%) 67 (35.4%)
factor(smoke)
0 86 (66.2%) 29 (49.2%) 115 (60.8%)
1 44 (33.8%) 30 (50.8%) 74 (39.2%)

#7. su dung lessR

##7.1. histogram cua bwt

library(lessR)
# Histogram of birth weight with lessR; the call also prints summary
# statistics, box-plot outliers and the bin frequency table.
Histogram(bwt,data=bw)

## >>> Suggestions 
## bin_width: set the width of each bin 
## bin_start: set the start of the first bin 
## bin_end: set the end of the last bin 
## Histogram(bwt, density=TRUE)  # smoothed curve + histogram 
## Plot(bwt)  # Violin/Box/Scatterplot (VBS) plot 
## 
## --- bwt --- 
##  
##       n   miss       mean         sd        min        mdn        max 
##      189      0    2944.59     729.21     709.00    2977.00    4990.00 
## 
##   
## --- Outliers ---     from the box plot: 1 
##  
## Small        Large 
## -----        ----- 
##  709.0            
## 
## 
## Bin Width: 500 
## Number of Bins: 9 
##  
##          Bin  Midpnt  Count    Prop  Cumul.c  Cumul.p 
## ----------------------------------------------------- 
##   500 > 1000     750      1    0.01        1     0.01 
##  1000 > 1500    1250      4    0.02        5     0.03 
##  1500 > 2000    1750     14    0.07       19     0.10 
##  2000 > 2500    2250     40    0.21       59     0.31 
##  2500 > 3000    2750     38    0.20       97     0.51 
##  3000 > 3500    3250     45    0.24      142     0.75 
##  3500 > 4000    3750     38    0.20      180     0.95 
##  4000 > 4500    4250      7    0.04      187     0.99 
##  4500 > 5000    4750      2    0.01      189     1.00
# Same histogram with blue bars and explicit axis labels.
Histogram(bwt,fill = "blue",xlab = "birthweight(g)",ylab = "frequency",data = bw)

## >>> Suggestions 
## bin_width: set the width of each bin 
## bin_start: set the start of the first bin 
## bin_end: set the end of the last bin 
## Histogram(bwt, density=TRUE)  # smoothed curve + histogram 
## Plot(bwt)  # Violin/Box/Scatterplot (VBS) plot 
## 
## --- bwt --- 
##  
##       n   miss       mean         sd        min        mdn        max 
##      189      0    2944.59     729.21     709.00    2977.00    4990.00 
## 
##   
## --- Outliers ---     from the box plot: 1 
##  
## Small        Large 
## -----        ----- 
##  709.0            
## 
## 
## Bin Width: 500 
## Number of Bins: 9 
##  
##          Bin  Midpnt  Count    Prop  Cumul.c  Cumul.p 
## ----------------------------------------------------- 
##   500 > 1000     750      1    0.01        1     0.01 
##  1000 > 1500    1250      4    0.02        5     0.03 
##  1500 > 2000    1750     14    0.07       19     0.10 
##  2000 > 2500    2250     40    0.21       59     0.31 
##  2500 > 3000    2750     38    0.20       97     0.51 
##  3000 > 3500    3250     45    0.24      142     0.75 
##  3500 > 4000    3750     38    0.20      180     0.95 
##  4000 > 4500    4250      7    0.04      187     0.99 
##  4500 > 5000    4750      2    0.01      189     1.00

##7.2. barchart cua ethnicity

# Bar chart of ethnicity; the call also prints frequencies, proportions and a
# chi-squared test of equal probabilities across the groups.
BarChart(ethnicity,data=bw)

## >>> Suggestions
## BarChart(ethnicity, horiz=TRUE)  # horizontal bar chart
## BarChart(ethnicity, fill="reds")  # red bars of varying lightness
## PieChart(ethnicity)  # doughnut (ring) chart
## Plot(ethnicity)  # bubble plot
## Plot(ethnicity, stat="count")  # lollipop plot 
## 
## --- ethnicity --- 
## 
## Missing Values: 0 
## 
##                black  other  white     Total 
## Frequencies:      26     67     96       189 
## Proportions:   0.138  0.354  0.508     1.000 
## 
## Chi-squared test of null hypothesis of equal probabilities 
##   Chisq = 39.270, df = 2, p-value = 0.000

##7.3. tuong quan giua lwt va bwt

# Scatterplot of lwt and bwt; the call also prints the Pearson correlation
# with its hypothesis test and 95% confidence interval.
Plot(lwt,bwt,data=bw)

## >>> Suggestions  or  enter: style(suggest=FALSE)
## Plot(lwt, bwt, enhance=TRUE)  # many options
## Plot(lwt, bwt, color="red")  # exterior edge color of points
## Plot(lwt, bwt, fit="lm", fit_se=c(.90,.99))  # fit line, stnd errors
## Plot(lwt, bwt, MD_cut=6)  # Mahalanobis distance from center > 6 is an outlier 
## 
## 
## >>> Pearson's product-moment correlation 
##  
## Number of paired values with neither missing, n = 189 
## Sample Correlation of lwt and bwt: r = 0.186 
##   
## Hypothesis Test of 0 Correlation:  t = 2.585,  df = 187,  p-value = 0.011 
## 95% Confidence Interval for Correlation:  0.044 to 0.320 
## 

##7.4. tuong quan giua lwt va bwt theo ethnicity

# Same scatterplot with a fitted line (fit="lm"); the call additionally
# prints the line's intercept, slope, MSE and R-squared.
Plot(lwt,bwt,fit="lm",data=bw)

## 
## >>> Suggestions  or  enter: style(suggest=FALSE)
## Plot(lwt, bwt, enhance=TRUE)  # many options
## Plot(lwt, bwt, fill="skyblue")  # interior fill color of points
## Plot(lwt, bwt, MD_cut=6)  # Mahalanobis distance from center > 6 is an outlier 
## 
## 
## >>> Pearson's product-moment correlation 
##  
## Number of paired values with neither missing, n = 189 
## Sample Correlation of lwt and bwt: r = 0.186 
##   
## Hypothesis Test of 0 Correlation:  t = 2.585,  df = 187,  p-value = 0.011 
## 95% Confidence Interval for Correlation:  0.044 to 0.320 
##   
## 
##  Line: b0 = 2369.62   b1 = 4.43    Fit: MSE = 516,155   Rsq = 0.034
## 
# Scatterplot grouped by ethnicity: by=ethnicity yields one fitted line per
# group, each reported with its own intercept, slope, MSE and R-squared.
Plot(lwt,bwt,by=ethnicity,fit="lm",data=bw)

## 
## >>> Suggestions  or  enter: style(suggest=FALSE)
## Plot(lwt, bwt, enhance=TRUE)  # many options
## Plot(lwt, bwt, color="red")  # exterior edge color of points
## Plot(lwt, bwt, MD_cut=6)  # Mahalanobis distance from center > 6 is an outlier 
## 
## ethnicity: black   Line: b0 = 2363.2    b1 = 2.4    Fit: MSE = 415,264   Rsq = 0.023
##  
## ethnicity: other   Line: b0 = 2070.8    b1 = 6.1    Fit: MSE = 505,570   Rsq = 0.045
##  
## ethnicity: white   Line: b0 = 2442.4    b1 = 5.0    Fit: MSE = 514,066   Rsq = 0.040
##