# Package installation (uncomment and run once if the packages are missing):
# install.packages(
#   c("tidyverse", "dplyr", "lessR", "table1", "compareGroups", "ggplot2",
#     "gridExtra", "ggthemes", "GGally", "ggfortify", "DescTools",
#     "simpleboot", "boot", "relaimpo", "carData", "rms", "caret", "BMA",
#     "glmnet", "epiDisplay"),
#   dependencies = TRUE
# )
# Read the file directly.
# The path can be looked up interactively with file.choose():
# t <- file.choose()
# t
bw <- read.csv(file = "C:\\Thach\\VN trips\\2025_2May\\Phuong Nam Institute\\Datasets\\birthwt.csv")
# Read the data with the "Import Dataset" command
# Upload tập dữ liệu birthwt.csv vào ChatGPT.
# PROMPT: “Bạn hãy giúp soạn lệnh R để đọc tập dữ liệu birthwt.csv vào R và gọi dữ liệu là bw2”
# ChatGPT:
# Dưới đây là lệnh R để đọc tập dữ liệu birthwt.csv và lưu vào đối tượng có tên là bw2:
# Suggested command: read birthwt.csv into a data frame named bw2.
# "path/to/birthwt.csv" is a placeholder; substitute the real file location.
# The string uses plain ASCII double quotes: typographic quotes are not
# valid R string delimiters.
bw2 <- read.csv("path/to/birthwt.csv")
# Size of bw, reported as number of rows followed by number of columns.
dim(bw)
## [1] 189 11
# First six rows of bw.
head(x = bw, n = 6)
## id low age lwt race smoke ptl ht ui ftv bwt
## 1 85 0 19 182 2 0 0 0 1 0 2523
## 2 86 0 33 155 3 0 0 0 0 3 2551
## 3 87 0 20 105 1 1 0 0 0 1 2557
## 4 88 0 21 108 1 1 0 0 1 2 2594
## 5 89 0 18 107 1 1 0 0 1 0 2600
## 6 91 0 21 124 3 0 0 0 0 0 2622
# Last six rows of bw.
tail(x = bw, n = 6)
## id low age lwt race smoke ptl ht ui ftv bwt
## 184 78 1 14 101 3 1 1 0 0 0 2466
## 185 79 1 28 95 1 1 0 0 0 2 2466
## 186 81 1 14 100 3 0 0 0 0 2 2495
## 187 82 1 23 94 3 1 0 0 0 0 2495
## 188 83 1 17 142 2 0 0 1 0 0 2495
## 189 84 1 21 130 1 1 0 1 0 3 2495
# PROMPT: “Soạn lệnh R để liệt kê 6 quan sát cuối cùng của dữ liệu này”
# Show the last 6 rows of bw2.
tail(bw2, n = 6)
# Add mwt as lwt scaled by 0.453592 (the kilograms-per-pound factor).
# NOTE(review): assumes lwt is recorded in pounds - confirm against the
# dataset documentation.
bw[["mwt"]] <- 0.453592 * bw[["lwt"]]
head(bw)
## id low age lwt race smoke ptl ht ui ftv bwt mwt
## 1 85 0 19 182 2 0 0 0 1 0 2523 82.55374
## 2 86 0 33 155 3 0 0 0 0 3 2551 70.30676
## 3 87 0 20 105 1 1 0 0 0 1 2557 47.62716
## 4 88 0 21 108 1 1 0 0 1 2 2594 48.98794
## 5 89 0 18 107 1 1 0 0 1 0 2600 48.53434
## 6 91 0 21 124 3 0 0 0 0 0 2622 56.24541
# Add a character column ethnicity from the numeric race code:
# 1 -> "White", 2 -> "Black", 3 -> "Other"; any other value stays NA.
bw[["ethnicity"]] <- c("White", "Black", "Other")[match(bw[["race"]], c(1, 2, 3))]
head(bw)
## id low age lwt race smoke ptl ht ui ftv bwt mwt ethnicity
## 1 85 0 19 182 2 0 0 0 1 0 2523 82.55374 Black
## 2 86 0 33 155 3 0 0 0 0 3 2551 70.30676 Other
## 3 87 0 20 105 1 1 0 0 0 1 2557 47.62716 White
## 4 88 0 21 108 1 1 0 0 1 2 2594 48.98794 White
## 5 89 0 18 107 1 1 0 0 1 0 2600 48.53434 White
## 6 91 0 21 124 3 0 0 0 0 0 2622 56.24541 Other
# Alternative recode on bw2: turn the numeric race code into a factor column
# named ethnicity, labelling 1/2/3 as White/Black/Other.
# The `$` accessors and the ASCII quotes are restored here; the previous text
# had them garbled into `\(` / `\)` and typographic quotes, which R cannot parse.
bw2$ethnicity <- factor(bw2$race, levels = c(1, 2, 3), labels = c("White", "Black", "Other"))
# bw1: only the columns id, low and bwt of bw.
bw1 <- bw[c("id", "low", "bwt")]
dim(bw1)
## [1] 189 3
head(bw1)
## id low bwt
## 1 85 0 2523
## 2 86 0 2551
## 3 87 0 2557
## 4 88 0 2594
## 5 89 0 2600
## 6 91 0 2622
# bw3: the rows of bw where low equals 1 (all columns kept).
bw3 <- subset(x = bw, subset = low == 1)
dim(bw3)
## [1] 59 13
head(bw3)
## id low age lwt race smoke ptl ht ui ftv bwt mwt ethnicity
## 131 4 1 28 120 3 1 1 0 1 0 709 54.43104 Other
## 132 10 1 29 130 1 0 0 0 1 2 1021 58.96696 White
## 133 11 1 34 187 2 1 0 1 0 0 1135 84.82170 Black
## 134 13 1 25 105 3 0 1 1 0 0 1330 47.62716 Other
## 135 15 1 25 85 3 0 0 0 1 0 1474 38.55532 Other
## 136 16 1 27 150 3 0 0 0 0 0 1588 68.03880 Other
# bw4: the rows of bw where both low and smoke equal 1.
bw4 <- subset(x = bw, subset = smoke == 1 & low == 1)
dim(bw4)
## [1] 30 13
head(bw4)
## id low age lwt race smoke ptl ht ui ftv bwt mwt ethnicity
## 131 4 1 28 120 3 1 1 0 1 0 709 54.43104 Other
## 133 11 1 34 187 2 1 0 1 0 0 1135 84.82170 Black
## 140 20 1 21 165 1 1 0 1 0 1 1790 74.84268 White
## 141 22 1 32 105 1 1 0 0 0 0 1818 47.62716 White
## 142 23 1 19 91 1 1 2 0 1 0 1885 41.27687 White
## 145 26 1 25 92 1 1 0 0 0 0 1928 41.73046 White
# bw5: the rows of bw2 where both low and smoke equal 1.
bw5 <- subset(bw2, low == 1 & smoke == 1)
# Report the size of bw5 as (rows, columns) in one call ...
dim(bw5)
# ... or as two separate calls.
# (The connective between the two options used to be a bare word, which R
# would evaluate as an undefined object; it is now a comment.)
nrow(bw5)
ncol(bw5)
# Attach lessR, which provides the Histogram() call used below.
library("lessR")
## Warning: package 'lessR' was built under R version 4.3.3
##
## lessR 4.3.9 feedback: gerbing@pdx.edu
## --------------------------------------------------------------
## > d <- Read("") Read text, Excel, SPSS, SAS, or R data file
## d is default data frame, data= in analysis routines optional
##
## Many examples of reading, writing, and manipulating data,
## graphics, testing means and proportions, regression, factor analysis,
## customization, and descriptive statistics from pivot tables
## Enter: browseVignettes("lessR")
##
## View lessR updates, now including time series forecasting
## Enter: news(package="lessR")
##
## Interactive data analysis
## Enter: interact()
# Histogram of bwt from bw with blue bars and explicit axis labels; the call
# also prints the summary statistics and bin table recorded below.
Histogram(
  bwt,
  data = bw,
  fill = "blue",
  xlab = "Birthweight (g)",
  ylab = "Frequency"
)
## >>> Suggestions
## bin_width: set the width of each bin
## bin_start: set the start of the first bin
## bin_end: set the end of the last bin
## Histogram(bwt, density=TRUE) # smoothed curve + histogram
## Plot(bwt) # Violin/Box/Scatterplot (VBS) plot
##
## --- bwt ---
##
## n miss mean sd min mdn max
## 189 0 2944.59 729.21 709.00 2977.00 4990.00
##
##
## --- Outliers --- from the box plot: 1
##
## Small Large
## ----- -----
## 709.0
##
##
## Bin Width: 500
## Number of Bins: 9
##
## Bin Midpnt Count Prop Cumul.c Cumul.p
## -----------------------------------------------------
## 500 > 1000 750 1 0.01 1 0.01
## 1000 > 1500 1250 4 0.02 5 0.03
## 1500 > 2000 1750 14 0.07 19 0.10
## 2000 > 2500 2250 40 0.21 59 0.31
## 2500 > 3000 2750 38 0.20 97 0.51
## 3000 > 3500 3250 45 0.24 142 0.75
## 3500 > 4000 3750 38 0.20 180 0.95
## 4000 > 4500 4250 7 0.04 187 0.99
## 4500 > 5000 4750 2 0.01 189 1.00
# Suggested lessR histogram of bw2$bwt with an axis label and a title.
# String arguments use ASCII quotes; the typographic quotes in the pasted
# answer are a syntax error in R.
# NOTE(review): the argument names are kept exactly as in the pasted answer
# and have not been checked against the installed lessR version - confirm.
library(lessR)
Histogram(bw2$bwt, xlab = "Birth Weight", main = "Histogram of Birth Weight")
# If you want a slightly nicer plot with colors:
Histogram(bw2$bwt, fill = "lightblue", color = "black", xlab = "Birth Weight", main = "Distribution of Birth Weight")
# Bar chart of the ethnicity column of bw; the call also prints the frequency
# table and chi-squared test recorded below.
BarChart(x = ethnicity, data = bw)
## >>> Suggestions
## BarChart(ethnicity, horiz=TRUE) # horizontal bar chart
## BarChart(ethnicity, fill="reds") # red bars of varying lightness
## PieChart(ethnicity) # doughnut (ring) chart
## Plot(ethnicity) # bubble plot
## Plot(ethnicity, stat="count") # lollipop plot
##
## --- ethnicity ---
##
## Missing Values: 0
##
## Black Other White Total
## Frequencies: 26 67 96 189
## Proportions: 0.138 0.354 0.508 1.000
##
## Chi-squared test of null hypothesis of equal probabilities
## Chisq = 39.270, df = 2, p-value = 0.000
# Suggested lessR bar charts of bw2$ethnicity: a titled chart, the same chart
# with values = "percent", and a variant that rotates the axis values.
# String arguments use ASCII quotes; the typographic quotes in the pasted
# answer are a syntax error in R.
# NOTE(review): argument names/values such as values = "percent" and
# rotate.values are kept exactly as in the pasted answer and have not been
# checked against the installed lessR version - confirm.
BarChart(bw2$ethnicity, main = "Distribution of Ethnicity", fill = "lightgreen")
BarChart(bw2$ethnicity, main = "Distribution of Ethnicity", fill = "lightgreen", values = "percent")
# Rotate the horizontal-axis labels if needed:
BarChart(bw2$ethnicity, horiz = FALSE, rotate.values = TRUE)
# Scatterplot of bwt against lwt from bw with a fitted "lm" line; the call
# also prints the correlation analysis and line coefficients recorded below.
Plot(
  lwt, bwt,
  data = bw,
  fit = "lm"
)
##
## >>> Suggestions or enter: style(suggest=FALSE)
## Plot(lwt, bwt, enhance=TRUE) # many options
## Plot(lwt, bwt, color="red") # exterior edge color of points
## Plot(lwt, bwt, out_cut=.10) # label top 10% from center as outliers
##
##
## >>> Pearson's product-moment correlation
##
## Number of paired values with neither missing, n = 189
## Sample Correlation of lwt and bwt: r = 0.186
##
## Hypothesis Test of 0 Correlation: t = 2.585, df = 187, p-value = 0.011
## 95% Confidence Interval for Correlation: 0.044 to 0.320
##
##
## Line: b0 = 2369.62 b1 = 4.43 Fit: MSE = 516,155 Rsq = 0.034
##
# Suggested scatterplots of bwt against lwt from bw2: labelled axes and title,
# then with a fitted line, then with custom point color and size.
# String delimiters are ASCII quotes; the typographic quotes in the pasted
# answer are a syntax error in R. Text inside the strings is unchanged.
# NOTE(review): Scatter() and fit.line are kept exactly as in the pasted
# answer and have not been checked against the installed lessR version (the
# calls elsewhere in this script use Plot(..., fit = "lm")) - confirm.
Scatter(lwt, bwt, data = bw2, xlab = "Mother’s Weight (lwt)", ylab = "Birth Weight (bwt)", main = "Scatterplot of Mother’s Weight vs. Birth Weight")
# Add a regression line:
Scatter(lwt, bwt, data = bw2, fit.line = TRUE)
# Change the color and size of the points:
Scatter(lwt, bwt, data = bw2, fit.line = TRUE, color = "blue", size = 2)
# Scatterplot of bwt against lwt from bw, grouped by race, with a separate
# "lm" line per group; the per-group coefficients are recorded below.
Plot(
  lwt, bwt,
  data = bw,
  by = race,
  fit = "lm"
)
##
## >>> Suggestions or enter: style(suggest=FALSE)
## Plot(lwt, bwt, enhance=TRUE) # many options
## Plot(lwt, bwt, color="red") # exterior edge color of points
## Plot(lwt, bwt, out_cut=.10) # label top 10% from center as outliers
##
## race: 1 Line: b0 = 2442.4 b1 = 5.0 Fit: MSE = 514,066 Rsq = 0.040
##
## race: 2 Line: b0 = 2363.2 b1 = 2.4 Fit: MSE = 415,264 Rsq = 0.023
##
## race: 3 Line: b0 = 2070.8 b1 = 6.1 Fit: MSE = 505,570 Rsq = 0.045
##
# Suggested grouped scatterplots of bwt against lwt from bw2: first grouped by
# race with labelled axes and title, then grouped by ethnicity with a fitted
# line per group.
# String delimiters are ASCII quotes; the typographic quotes in the pasted
# answer are a syntax error in R. Text inside the strings is unchanged.
# NOTE(review): Scatter() and fit.line are kept exactly as in the pasted
# answer and have not been checked against the installed lessR version - confirm.
Scatter(lwt, bwt, by = race, data = bw2, xlab = "Mother’s Weight (lwt)", ylab = "Birth Weight (bwt)", main = "Birth Weight vs. Mother’s Weight by Ethnicity")
# You can add a separate regression line for each group:
Scatter(lwt, bwt, by = ethnicity, data = bw2, fit.line = TRUE)