# 1. Task 1
# 2. Task 2
# 3. Task 3: read birthwt.csv into R and name the data "bw".
# The first line of the file is used as column names, and a field holding a
# single space is read as missing (NA).
bw <- read.csv(
  "/Users/toansama/Documents/NCKH/PHAN TICH DU LIEU CO BAN - SIS CAN THO - 2-6:1:2025/DỮ LIỆU ĐÍNH KÈM BÀI TẬP/birthwt.csv",
  header = TRUE, # spelled out: `T` is an ordinary variable that can be reassigned
  na.strings = " "
)
library(lessR)
##
## lessR 4.3.9 feedback: gerbing@pdx.edu
## --------------------------------------------------------------
## > d <- Read("") Read text, Excel, SPSS, SAS, or R data file
## d is default data frame, data= in analysis routines optional
##
## Many examples of reading, writing, and manipulating data,
## graphics, testing means and proportions, regression, factor analysis,
## customization, and descriptive statistics from pivot tables
## Enter: browseVignettes("lessR")
##
## View lessR updates, now including time series forecasting
## Enter: news(package="lessR")
##
## Interactive data analysis
## Enter: interact()
##
## Attaching package: 'lessR'
## The following object is masked from 'package:base':
##
## sort_by
# Read the same CSV file with lessR's Read(), which also prints a summary of
# every variable; the result is stored in `bw`, replacing any earlier value.
bw <- Read(
  "/Users/toansama/Documents/NCKH/PHAN TICH DU LIEU CO BAN - SIS CAN THO - 2-6:1:2025/DỮ LIỆU ĐÍNH KÈM BÀI TẬP/birthwt.csv"
)
##
## >>> Suggestions
## Recommended binary format for data files: feather
## Create with Write(d, "your_file", format="feather")
## To read a csv or Excel file of variable labelsvar_labels=TRUE
## Each row of the file: Variable Name, Variable Label
## Read into a data frame named l (the letter el)
##
## More details about your data, Enter: details() for d, or details(name)
##
## Data Types
## ------------------------------------------------------------
## integer: Numeric data values, integers only
## ------------------------------------------------------------
##
## Variable Missing Unique
## Name Type Values Values Values First and last values
## ------------------------------------------------------------------------------------------
## 1 id integer 189 0 189 85 86 87 ... 82 83 84
## 2 low integer 189 0 2 0 0 0 ... 1 1 1
## 3 age integer 189 0 24 19 33 20 ... 23 17 21
## 4 lwt integer 189 0 75 182 155 105 ... 94 142 130
## 5 race integer 189 0 3 2 3 1 ... 3 2 1
## 6 smoke integer 189 0 2 0 0 1 ... 1 0 1
## 7 ptl integer 189 0 4 0 0 0 ... 0 0 0
## 8 ht integer 189 0 2 0 0 0 ... 0 1 1
## 9 ui integer 189 0 2 1 0 0 ... 0 0 0
## 10 ftv integer 189 0 6 0 3 1 ... 0 0 3
## 11 bwt integer 189 0 131 2523 2551 2557 ... 2495 2495 2495
## ------------------------------------------------------------------------------------------
# 4. Task 4: information about the data frame bw
## 4.1. Number of observations (rows) and variables (columns)
c(nrow(bw), ncol(bw))
## [1] 189 11
## 4.2. The first 6 observations
head(bw, n = 6L)
## id low age lwt race smoke ptl ht ui ftv bwt
## 1 85 0 19 182 2 0 0 0 1 0 2523
## 2 86 0 33 155 3 0 0 0 0 3 2551
## 3 87 0 20 105 1 1 0 0 0 1 2557
## 4 88 0 21 108 1 1 0 0 1 2 2594
## 5 89 0 18 107 1 1 0 0 1 0 2600
## 6 91 0 21 124 3 0 0 0 0 0 2622
## 4.3. The last 6 observations
tail(bw, n = 6L)
## id low age lwt race smoke ptl ht ui ftv bwt
## 184 78 1 14 101 3 1 1 0 0 0 2466
## 185 79 1 28 95 1 1 0 0 0 2 2466
## 186 81 1 14 100 3 0 0 0 0 2 2495
## 187 82 1 23 94 3 1 0 0 0 0 2495
## 188 83 1 17 142 2 0 0 1 0 0 2495
## 189 84 1 21 130 1 1 0 1 0 3 2495
# Same listing again, this time relying on tail()'s default of 6 rows.
tail(x = bw)
## id low age lwt race smoke ptl ht ui ftv bwt
## 184 78 1 14 101 3 1 1 0 0 0 2466
## 185 79 1 28 95 1 1 0 0 0 2 2466
## 186 81 1 14 100 3 0 0 0 0 2 2495
## 187 82 1 23 94 3 1 0 0 0 0 2495
## 188 83 1 17 142 2 0 0 1 0 0 2495
## 189 84 1 21 130 1 1 0 1 0 3 2495
# Frequency of each value of low (0 / 1).
table(bw[["low"]])
##
## 0 1
## 130 59
# Proportion of observations with low == 1, computed from the data instead of
# from counts copied by hand out of the printed frequency table.
mean(bw$low == 1)
## [1] 0.3121693
# Cross-tabulation: rows are low (0 / 1), columns are smoke (0 / 1).
table(bw[["low"]], bw[["smoke"]])
##
## 0 1
## 0 86 44
## 1 29 30
# Proportion with low == 1 among observations with smoke == 0, computed from
# the data instead of from hand-copied cell counts.
mean(bw$low[bw$smoke == 0] == 1)
## [1] 0.2521739
# Proportion with low == 1 among observations with smoke == 1, computed from
# the data instead of from hand-copied cell counts.
mean(bw$low[bw$smoke == 1] == 1)
## [1] 0.4054054
# 5. Task 5: data editing
## 5.1. Create the variable mwt: the mother's weight in kg
# mwt is lwt multiplied by the conversion factor 0.454.
bw[["mwt"]] <- 0.454 * bw[["lwt"]]
head(bw)
## id low age lwt race smoke ptl ht ui ftv bwt mwt
## 1 85 0 19 182 2 0 0 0 1 0 2523 82.628
## 2 86 0 33 155 3 0 0 0 0 3 2551 70.370
## 3 87 0 20 105 1 1 0 0 0 1 2557 47.670
## 4 88 0 21 108 1 1 0 0 1 2 2594 49.032
## 5 89 0 18 107 1 1 0 0 1 0 2600 48.578
## 6 91 0 21 124 3 0 0 0 0 0 2622 56.296
## 5.2. Create the variable ethnicity as a factor, by condition on race
# race codes 1 / 2 / 3 are labelled "white" / "black" / "other"; any other
# code becomes NA. The result is a real factor, as the task asks, rather than
# a character column. Levels are given in alphabetical label order
# (black, other, white) so tables and plots keep the category order that a
# character column would produce.
bw$ethnicity <- factor(
  bw$race,
  levels = c(2, 3, 1),
  labels = c("black", "other", "white")
)
head(bw)
## id low age lwt race smoke ptl ht ui ftv bwt mwt ethnicity
## 1 85 0 19 182 2 0 0 0 1 0 2523 82.628 black
## 2 86 0 33 155 3 0 0 0 0 3 2551 70.370 other
## 3 87 0 20 105 1 1 0 0 0 1 2557 47.670 white
## 4 88 0 21 108 1 1 0 0 1 2 2594 49.032 white
## 5 89 0 18 107 1 1 0 0 1 0 2600 48.578 white
## 6 91 0 21 124 3 0 0 0 0 0 2622 56.296 other
## 5.3. Create the variable smoking, by condition on smoke
# NOTE(review): the original heading calls this a "numeric" variable, but the
# values assigned are the character labels "yes" (smoke == 1) and "no".
bw[["smoking"]] <- ifelse(bw[["smoke"]] == 1, "yes", "no")
head(bw)
## id low age lwt race smoke ptl ht ui ftv bwt mwt ethnicity smoking
## 1 85 0 19 182 2 0 0 0 1 0 2523 82.628 black no
## 2 86 0 33 155 3 0 0 0 0 3 2551 70.370 other no
## 3 87 0 20 105 1 1 0 0 0 1 2557 47.670 white yes
## 4 88 0 21 108 1 1 0 0 1 2 2594 49.032 white yes
## 5 89 0 18 107 1 1 0 0 1 0 2600 48.578 white yes
## 6 91 0 21 124 3 0 0 0 0 0 2622 56.296 other no
## 5.4. Create bw1 containing the 3 variables id, low, bwt.
## How many variables and observations does it have?
bw1 <- bw[c("id", "low", "bwt")]
head(bw1)
## id low bwt
## 1 85 0 2523
## 2 86 0 2551
## 3 87 0 2557
## 4 88 0 2594
## 5 89 0 2600
## 6 91 0 2622
# Number of observations and variables in bw1.
c(nrow(bw1), ncol(bw1))
## [1] 189 3
## 5.5. Create bw2 containing only the low-weight group (low == 1).
## How many variables and observations does it have?
# which() drops rows where the condition is NA, matching subset().
bw2 <- bw[which(bw$low == 1), ]
head(bw2)
## id low age lwt race smoke ptl ht ui ftv bwt mwt ethnicity smoking
## 131 4 1 28 120 3 1 1 0 1 0 709 54.480 other yes
## 132 10 1 29 130 1 0 0 0 1 2 1021 59.020 white no
## 133 11 1 34 187 2 1 0 1 0 0 1135 84.898 black yes
## 134 13 1 25 105 3 0 1 1 0 0 1330 47.670 other no
## 135 15 1 25 85 3 0 0 0 1 0 1474 38.590 other no
## 136 16 1 27 150 3 0 0 0 0 0 1588 68.100 other no
# Number of observations and variables in bw2.
c(nrow(bw2), ncol(bw2))
## [1] 59 14
# bw3: rows of bw with both low == 1 and smoke == 1.
# which() drops rows where the condition is NA, matching subset().
bw3 <- bw[which(bw$low == 1 & bw$smoke == 1), ]
head(bw3)
## id low age lwt race smoke ptl ht ui ftv bwt mwt ethnicity smoking
## 131 4 1 28 120 3 1 1 0 1 0 709 54.480 other yes
## 133 11 1 34 187 2 1 0 1 0 0 1135 84.898 black yes
## 140 20 1 21 165 1 1 0 1 0 1 1790 74.910 white yes
## 141 22 1 32 105 1 1 0 0 0 0 1818 47.670 white yes
## 142 23 1 19 91 1 1 2 0 1 0 1885 41.314 white yes
## 145 26 1 25 92 1 1 0 0 0 0 1928 41.768 white yes
# Number of observations and variables in bw3.
c(nrow(bw3), ncol(bw3))
## [1] 30 14
library(table1)
##
## Attaching package: 'table1'
## The following object is masked from 'package:lessR':
##
## label
## The following objects are masked from 'package:base':
##
## units, units<-
# Descriptive table of age, lwt and bwt for the whole sample.
table1(~ age + lwt + bwt, data = bw)
| Overall (N=189) |
|
|---|---|
| age | |
| Mean (SD) | 23.2 (5.30) |
| Median [Min, Max] | 23.0 [14.0, 45.0] |
| lwt | |
| Mean (SD) | 130 (30.6) |
| Median [Min, Max] | 121 [80.0, 250] |
| bwt | |
| Mean (SD) | 2940 (729) |
| Median [Min, Max] | 2980 [709, 4990] |
# The same descriptive table, with one column per value of low plus an
# overall column.
table1(~ age + lwt + bwt | low, data = bw)
| 0 (N=130) |
1 (N=59) |
Overall (N=189) |
|
|---|---|---|---|
| age | |||
| Mean (SD) | 23.7 (5.58) | 22.3 (4.51) | 23.2 (5.30) |
| Median [Min, Max] | 23.0 [14.0, 45.0] | 22.0 [14.0, 34.0] | 23.0 [14.0, 45.0] |
| lwt | |||
| Mean (SD) | 133 (31.7) | 122 (26.6) | 130 (30.6) |
| Median [Min, Max] | 124 [85.0, 250] | 120 [80.0, 200] | 121 [80.0, 250] |
| bwt | |||
| Mean (SD) | 3330 (478) | 2100 (391) | 2940 (729) |
| Median [Min, Max] | 3270 [2520, 4990] | 2210 [709, 2500] | 2980 [709, 4990] |
# Descriptive table of low, smoke and race.
# NOTE(review): these columns are integer codes, so they are summarised as
# numbers (mean, SD, median); wrap them in factor() to get counts and
# percentages per category instead.
table1(~ low + smoke + race, data = bw)
| Overall (N=189) |
|
|---|---|
| low | |
| Mean (SD) | 0.312 (0.465) |
| Median [Min, Max] | 0 [0, 1.00] |
| smoke | |
| Mean (SD) | 0.392 (0.489) |
| Median [Min, Max] | 0 [0, 1.00] |
| race | |
| Mean (SD) | 1.85 (0.918) |
| Median [Min, Max] | 1.00 [1.00, 3.00] |
# Counts and percentages of race and smoke, treated as categorical via
# factor(), with one column per value of low plus an overall column.
table1(~ factor(race) + factor(smoke) | factor(low), data = bw)
| 0 (N=130) |
1 (N=59) |
Overall (N=189) |
|
|---|---|---|---|
| factor(race) | |||
| 1 | 73 (56.2%) | 23 (39.0%) | 96 (50.8%) |
| 2 | 15 (11.5%) | 11 (18.6%) | 26 (13.8%) |
| 3 | 42 (32.3%) | 25 (42.4%) | 67 (35.4%) |
| factor(smoke) | |||
| 0 | 86 (66.2%) | 29 (49.2%) | 115 (60.8%) |
| 1 | 44 (33.8%) | 30 (50.8%) | 74 (39.2%) |
library(lessR)
# Histogram of bwt with lessR's default settings; also prints summary
# statistics and the bin table.
Histogram(bwt, data = bw)
## >>> Suggestions
## bin_width: set the width of each bin
## bin_start: set the start of the first bin
## bin_end: set the end of the last bin
## Histogram(bwt, density=TRUE) # smoothed curve + histogram
## Plot(bwt) # Violin/Box/Scatterplot (VBS) plot
##
## --- bwt ---
##
## n miss mean sd min mdn max
## 189 0 2944.59 729.21 709.00 2977.00 4990.00
##
##
## --- Outliers --- from the box plot: 1
##
## Small Large
## ----- -----
## 709.0
##
##
## Bin Width: 500
## Number of Bins: 9
##
## Bin Midpnt Count Prop Cumul.c Cumul.p
## -----------------------------------------------------
## 500 > 1000 750 1 0.01 1 0.01
## 1000 > 1500 1250 4 0.02 5 0.03
## 1500 > 2000 1750 14 0.07 19 0.10
## 2000 > 2500 2250 40 0.21 59 0.31
## 2500 > 3000 2750 38 0.20 97 0.51
## 3000 > 3500 3250 45 0.24 142 0.75
## 3500 > 4000 3750 38 0.20 180 0.95
## 4000 > 4500 4250 7 0.04 187 0.99
## 4500 > 5000 4750 2 0.01 189 1.00
# Histogram of bwt again, with blue bars and explicit axis labels.
Histogram(
  bwt,
  fill = "blue",
  xlab = "birthweight(g)",
  ylab = "frequency",
  data = bw
)
## >>> Suggestions
## bin_width: set the width of each bin
## bin_start: set the start of the first bin
## bin_end: set the end of the last bin
## Histogram(bwt, density=TRUE) # smoothed curve + histogram
## Plot(bwt) # Violin/Box/Scatterplot (VBS) plot
##
## --- bwt ---
##
## n miss mean sd min mdn max
## 189 0 2944.59 729.21 709.00 2977.00 4990.00
##
##
## --- Outliers --- from the box plot: 1
##
## Small Large
## ----- -----
## 709.0
##
##
## Bin Width: 500
## Number of Bins: 9
##
## Bin Midpnt Count Prop Cumul.c Cumul.p
## -----------------------------------------------------
## 500 > 1000 750 1 0.01 1 0.01
## 1000 > 1500 1250 4 0.02 5 0.03
## 1500 > 2000 1750 14 0.07 19 0.10
## 2000 > 2500 2250 40 0.21 59 0.31
## 2500 > 3000 2750 38 0.20 97 0.51
## 3000 > 3500 3250 45 0.24 142 0.75
## 3500 > 4000 3750 38 0.20 180 0.95
## 4000 > 4500 4250 7 0.04 187 0.99
## 4500 > 5000 4750 2 0.01 189 1.00
# Bar chart of ethnicity; also prints frequencies, proportions and a
# chi-squared test of equal probabilities.
BarChart(ethnicity, data = bw)
## >>> Suggestions
## BarChart(ethnicity, horiz=TRUE) # horizontal bar chart
## BarChart(ethnicity, fill="reds") # red bars of varying lightness
## PieChart(ethnicity) # doughnut (ring) chart
## Plot(ethnicity) # bubble plot
## Plot(ethnicity, stat="count") # lollipop plot
##
## --- ethnicity ---
##
## Missing Values: 0
##
## black other white Total
## Frequencies: 26 67 96 189
## Proportions: 0.138 0.354 0.508 1.000
##
## Chi-squared test of null hypothesis of equal probabilities
## Chisq = 39.270, df = 2, p-value = 0.000
# Scatterplot of bwt against lwt; also prints the Pearson correlation.
Plot(lwt, bwt, data = bw)
## >>> Suggestions or enter: style(suggest=FALSE)
## Plot(lwt, bwt, enhance=TRUE) # many options
## Plot(lwt, bwt, color="red") # exterior edge color of points
## Plot(lwt, bwt, fit="lm", fit_se=c(.90,.99)) # fit line, stnd errors
## Plot(lwt, bwt, MD_cut=6) # Mahalanobis distance from center > 6 is an outlier
##
##
## >>> Pearson's product-moment correlation
##
## Number of paired values with neither missing, n = 189
## Sample Correlation of lwt and bwt: r = 0.186
##
## Hypothesis Test of 0 Correlation: t = 2.585, df = 187, p-value = 0.011
## 95% Confidence Interval for Correlation: 0.044 to 0.320
##
# The same scatterplot with a fitted linear regression line (fit = "lm").
Plot(lwt, bwt, fit = "lm", data = bw)
##
## >>> Suggestions or enter: style(suggest=FALSE)
## Plot(lwt, bwt, enhance=TRUE) # many options
## Plot(lwt, bwt, fill="skyblue") # interior fill color of points
## Plot(lwt, bwt, MD_cut=6) # Mahalanobis distance from center > 6 is an outlier
##
##
## >>> Pearson's product-moment correlation
##
## Number of paired values with neither missing, n = 189
## Sample Correlation of lwt and bwt: r = 0.186
##
## Hypothesis Test of 0 Correlation: t = 2.585, df = 187, p-value = 0.011
## 95% Confidence Interval for Correlation: 0.044 to 0.320
##
##
## Line: b0 = 2369.62 b1 = 4.43 Fit: MSE = 516,155 Rsq = 0.034
##
# Scatterplot of bwt against lwt, grouped by ethnicity, with a separate
# fitted linear regression line for each group.
Plot(lwt, bwt, by = ethnicity, fit = "lm", data = bw)
##
## >>> Suggestions or enter: style(suggest=FALSE)
## Plot(lwt, bwt, enhance=TRUE) # many options
## Plot(lwt, bwt, color="red") # exterior edge color of points
## Plot(lwt, bwt, MD_cut=6) # Mahalanobis distance from center > 6 is an outlier
##
## ethnicity: black Line: b0 = 2363.2 b1 = 2.4 Fit: MSE = 415,264 Rsq = 0.023
##
## ethnicity: other Line: b0 = 2070.8 b1 = 6.1 Fit: MSE = 505,570 Rsq = 0.045
##
## ethnicity: white Line: b0 = 2442.4 b1 = 5.0 Fit: MSE = 514,066 Rsq = 0.040
##