# packages (cài sẵn bên ngoài)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lessR)
##
## lessR 4.4.3 feedback: gerbing@pdx.edu
## --------------------------------------------------------------
## > d <- Read("") Read data file, many formats available, e.g., Excel
## d is default data frame, data= in analysis routines optional
##
## Many examples of reading, writing, and manipulating data,
## graphics, testing means and proportions, regression, factor analysis,
## customization, forecasting, and aggregation from pivot tables
## Enter: browseVignettes("lessR")
##
## View lessR updates, now including time series forecasting
## Enter: news(package="lessR")
##
## Interactive data analysis
## Enter: interact()
##
##
## Attaching package: 'lessR'
##
## The following objects are masked from 'package:dplyr':
##
## order_by, recode, rename
##
## The following object is masked from 'package:base':
##
## sort_by
library(table1)
##
## Attaching package: 'table1'
##
## The following object is masked from 'package:lessR':
##
## label
##
## The following objects are masked from 'package:base':
##
## units, units<-
library(compareGroups)
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(simpleboot)
## Simple Bootstrap Routines (1.1-8)
library(boot)
library(relaimpo)
## Loading required package: MASS
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
##
## Loading required package: survey
## Loading required package: grid
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loading required package: survival
##
## Attaching package: 'survival'
##
## The following object is masked from 'package:boot':
##
## aml
##
##
## Attaching package: 'survey'
##
## The following object is masked from 'package:graphics':
##
## dotchart
##
## Loading required package: mitools
## This is the global version of package relaimpo.
##
## If you are a non-US user, a version with the interesting additional metric pmvd is available
##
## from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.
library(caret)
## Loading required package: lattice
##
## Attaching package: 'lattice'
##
## The following object is masked from 'package:boot':
##
## melanoma
##
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:survival':
##
## cluster
##
## The following object is masked from 'package:purrr':
##
## lift
library(BMA)
## Loading required package: leaps
## Loading required package: robustbase
##
## Attaching package: 'robustbase'
##
## The following object is masked from 'package:survival':
##
## heart
##
## The following object is masked from 'package:boot':
##
## salinity
##
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.7-7)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:boot':
##
## logit
##
## The following objects are masked from 'package:lessR':
##
## bc, recode, sp
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(epiDisplay)
## Loading required package: foreign
## Loading required package: nnet
##
## Attaching package: 'epiDisplay'
##
## The following object is masked from 'package:lattice':
##
## dotplot
##
## The following object is masked from 'package:ggplot2':
##
## alpha
link = "C:\\Users\\ADMIN\\Downloads\\Khóa tập huấn ứng dụng AI vào phân tích dữ liệu\\birthwt.csv"
bw = read.csv(link)
# Có bao nhiêu biến số (variable) và quan sát (observation)
dim(bw)
## [1] 189 11
# Liệt kê 6 quan sát đầu tiên của dữ liệu.
head(bw)
## id low age lwt race smoke ptl ht ui ftv bwt
## 1 85 0 19 182 2 0 0 0 1 0 2523
## 2 86 0 33 155 3 0 0 0 0 3 2551
## 3 87 0 20 105 1 1 0 0 0 1 2557
## 4 88 0 21 108 1 1 0 0 1 2 2594
## 5 89 0 18 107 1 1 0 0 1 0 2600
## 6 91 0 21 124 3 0 0 0 0 0 2622
tail(bw)
## id low age lwt race smoke ptl ht ui ftv bwt
## 184 78 1 14 101 3 1 1 0 0 0 2466
## 185 79 1 28 95 1 1 0 0 0 2 2466
## 186 81 1 14 100 3 0 0 0 0 2 2495
## 187 82 1 23 94 3 1 0 0 0 0 2495
## 188 83 1 17 142 2 0 0 1 0 0 2495
## 189 84 1 21 130 1 1 0 1 0 3 2495
# Tạo biến số mới mwt là cân nặng của mẹ tính bằng kg
bw$mwt = bw$lwt*0.45
# Tạo biến số mới ethnicity là biến factor với điều kiện sau: Nếu race = 1 thì ethnicity = “White”, Nếu race = 2 thì ethnicity = “Black”, Nếu race = 3 thì ethnicity = “Other”
bw$ethnicity [bw$race==1] = "White"
bw$ethnicity [bw$race==2] = "Black"
bw$ethnicity [bw$race==3] = "Others"
# Thay đổi thứ tự theo cách mình thích
bw$ethnicity = factor(bw$ethnicity, levels=c("White", "Black", "Others"))
# Tạo 1 tập dữ liệu bw1 chỉ gồm 3 biến số id, low và bwt. Dữ liệu này có bao nhiêu biến số và quan sát?
bw1 = bw[, c("id", "low", "bwt")] # chon tat ca cac dong nhung chi 3 bien so
bw1 = bw[, c(1, 2, 8:11)] # chon tat ca cac dong nhung cot 1, 2, 8-11
# Tạo 1 tập dữ liệu bw2 chỉ gồm những thai phụ có cân nặng thấp (low = 1). Dữ liệu này có bao nhiêu biến số và quan sát?
bw2 = subset(bw, low==1)
# Mô tả đặc điểm tuổi của mẹ (age), cân nặng của mẹ (lwt) và cân nặng của con (bwt)
table1(~age + mwt + bwt, data=bw)
| Overall (N=189) |
|
|---|---|
| age | |
| Mean (SD) | 23.2 (5.30) |
| Median [Min, Max] | 23.0 [14.0, 45.0] |
| mwt | |
| Mean (SD) | 58.4 (13.8) |
| Median [Min, Max] | 54.5 [36.0, 113] |
| bwt | |
| Mean (SD) | 2940 (729) |
| Median [Min, Max] | 2980 [709, 4990] |
table1(~age + mwt + bwt | low, data=bw)
| 0 (N=130) |
1 (N=59) |
Overall (N=189) |
|
|---|---|---|---|
| age | |||
| Mean (SD) | 23.7 (5.58) | 22.3 (4.51) | 23.2 (5.30) |
| Median [Min, Max] | 23.0 [14.0, 45.0] | 22.0 [14.0, 34.0] | 23.0 [14.0, 45.0] |
| mwt | |||
| Mean (SD) | 60.0 (14.3) | 55.0 (12.0) | 58.4 (13.8) |
| Median [Min, Max] | 55.6 [38.3, 113] | 54.0 [36.0, 90.0] | 54.5 [36.0, 113] |
| bwt | |||
| Mean (SD) | 3330 (478) | 2100 (391) | 2940 (729) |
| Median [Min, Max] | 3270 [2520, 4990] | 2210 [709, 2500] | 2980 [709, 4990] |
# Mô tả đặc điểm tuổi của mẹ (age), cân nặng của mẹ (lwt), tình trạng hút thuốc trong thai kỳ (smoke) , chủng tộc (race), và cân nặng của con (bwt) theo tình trạng trẻ thiếu cân (low)
bw$smoking = ifelse(bw$smoke==1, "Smoking", "No Smoking")
table1(~age + mwt + bwt + ethnicity + smoking | low, data=bw)
| 0 (N=130) |
1 (N=59) |
Overall (N=189) |
|
|---|---|---|---|
| age | |||
| Mean (SD) | 23.7 (5.58) | 22.3 (4.51) | 23.2 (5.30) |
| Median [Min, Max] | 23.0 [14.0, 45.0] | 22.0 [14.0, 34.0] | 23.0 [14.0, 45.0] |
| mwt | |||
| Mean (SD) | 60.0 (14.3) | 55.0 (12.0) | 58.4 (13.8) |
| Median [Min, Max] | 55.6 [38.3, 113] | 54.0 [36.0, 90.0] | 54.5 [36.0, 113] |
| bwt | |||
| Mean (SD) | 3330 (478) | 2100 (391) | 2940 (729) |
| Median [Min, Max] | 3270 [2520, 4990] | 2210 [709, 2500] | 2980 [709, 4990] |
| ethnicity | |||
| White | 73 (56.2%) | 23 (39.0%) | 96 (50.8%) |
| Black | 15 (11.5%) | 11 (18.6%) | 26 (13.8%) |
| Others | 42 (32.3%) | 25 (42.4%) | 67 (35.4%) |
| smoking | |||
| No Smoking | 86 (66.2%) | 29 (49.2%) | 115 (60.8%) |
| Smoking | 44 (33.8%) | 30 (50.8%) | 74 (39.2%) |
# Phân bố bwt (cân nặng của trẻ em)
Histogram(bwt, xlab="Cân nặng của trẻ sơ sanh", ylab="Số bà mẹ", data=bw, fill = "navy")
## >>> Suggestions
## bin_width: set the width of each bin
## bin_start: set the start of the first bin
## bin_end: set the end of the last bin
## Histogram(bwt, density=TRUE) # smoothed curve + histogram
## Plot(bwt) # Violin/Box/Scatterplot (VBS) plot
##
## --- bwt ---
##
## n miss mean sd min mdn max
## 189 0 2944.59 729.21 709.00 2977.00 4990.00
##
##
##
## --- Outliers --- from the box plot: 1
##
## Small Large
## ----- -----
## 709.0
##
##
## Bin Width: 500
## Number of Bins: 9
##
## Bin Midpnt Count Prop Cumul.c Cumul.p
## -----------------------------------------------------
## 500 > 1000 750 1 0.01 1 0.01
## 1000 > 1500 1250 4 0.02 5 0.03
## 1500 > 2000 1750 14 0.07 19 0.10
## 2000 > 2500 2250 40 0.21 59 0.31
## 2500 > 3000 2750 38 0.20 97 0.51
## 3000 > 3500 3250 45 0.24 142 0.75
## 3500 > 4000 3750 38 0.20 180 0.95
## 4000 > 4500 4250 7 0.04 187 0.99
## 4500 > 5000 4750 2 0.01 189 1.00
##
# Phân bố biến số ethnicity
BarChart(ethnicity, ylab="Số bà mẹ", data=bw)
## >>> Suggestions
## BarChart(ethnicity, horiz=TRUE) # horizontal bar chart
## BarChart(ethnicity, fill="reds") # red bars of varying lightness
## PieChart(ethnicity) # doughnut (ring) chart
## Plot(ethnicity) # bubble plot
## Plot(ethnicity, stat="count") # lollipop plot
##
## --- ethnicity ---
##
## Missing Values: 0
##
## White Black Others Total
## Frequencies: 96 26 67 189
## Proportions: 0.508 0.138 0.354 1.000
##
## Chi-squared test of null hypothesis of equal probabilities
## Chisq = 39.270, df = 2, p-value = 0.000
# Cân nặng của mẹ và trẻ
Plot(mwt, bwt, xlab="Cân nặng của mẹ", ylab="Cân nặng của con", fit="lm", data=bw)
##
##
## >>> Suggestions or enter: style(suggest=FALSE)
## Plot(mwt, bwt, enhance=TRUE) # many options
## Plot(mwt, bwt, color="red") # exterior edge color of points
## Plot(mwt, bwt, MD_cut=6) # Mahalanobis distance from center > 6 is an outlier
##
##
## >>> Pearson's product-moment correlation
##
## Number of paired values with neither missing, n = 189
## Sample Correlation of mwt and bwt: r = 0.186
##
## Hypothesis Test of 0 Correlation: t = 2.585, df = 187, p-value = 0.011
## 95% Confidence Interval for Correlation: 0.044 to 0.320
##
##
## Line: b0 = 2369.624 b1 = 9.842 Linear Model MSE = 516,155.173 Rsq = 0.034
##