R Markdown

1: Tiếp tục với dữ liệu “Arrest dataset.csv”. Vẽ biểu đồ phân bố đơn giản biến week với hàm hist:

# Load the arrest data set into a data frame and list its column names.
# NOTE(review): this absolute Windows path only resolves on the author's
# machine -- consider a project-relative path before sharing the document.
arr <- read.csv("C:/Users/Momo/Desktop/R - Learning/Dataset 4-2022/Arrest dataset.csv")
colnames(arr)
##  [1] "id"       "age"      "finance"  "week"     "arrest"   "race"    
##  [7] "work.exp" "married"  "parole"   "prior"    "educ"     "employ1"
# Preview the first six rows of the data frame.
head(arr, n = 6L)
##   id age finance week arrest  race work.exp     married parole prior educ
## 1  1  27      no   20      1 black       no not married    yes     3    3
## 2  2  18      no   17      1 black       no not married    yes     8    4
## 3  3  19      no   25      1 other      yes not married    yes    13    3
## 4  4  23     yes   52      0 black      yes     married    yes     1    5
## 5  5  19      no   52      0 other      yes not married    yes     3    3
## 6  6  24      no   52      0 black      yes not married     no     2    4
##   employ1
## 1      no
## 2      no
## 3      no
## 4      no
## 5      no
## 6      no
# Preview the last six rows of the data frame.
tail(arr, n = 6L)
##      id age finance week arrest  race work.exp     married parole prior educ
## 427 427  22     yes   12      1 black      yes     married    yes     2    4
## 428 428  31     yes   52      0 other      yes not married    yes     3    3
## 429 429  20      no   52      0 black       no not married    yes     1    4
## 430 430  20     yes   52      0 black      yes     married    yes     1    3
## 431 431  29      no   52      0 black      yes not married    yes     3    4
## 432 432  24     yes   52      0 black      yes not married    yes     1    4
##     employ1
## 427      no
## 428      no
## 429      no
## 430      no
## 431     yes
## 432     yes
# Per-column summary: quantiles/mean for numeric columns, length/class/mode
# for character columns.
summary(object = arr)
##        id             age         finance               week      
##  Min.   :  1.0   Min.   :17.0   Length:432         Min.   : 1.00  
##  1st Qu.:108.8   1st Qu.:20.0   Class :character   1st Qu.:50.00  
##  Median :216.5   Median :23.0   Mode  :character   Median :52.00  
##  Mean   :216.5   Mean   :24.6                      Mean   :45.85  
##  3rd Qu.:324.2   3rd Qu.:27.0                      3rd Qu.:52.00  
##  Max.   :432.0   Max.   :44.0                      Max.   :52.00  
##      arrest           race             work.exp           married         
##  Min.   :0.0000   Length:432         Length:432         Length:432        
##  1st Qu.:0.0000   Class :character   Class :character   Class :character  
##  Median :0.0000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :0.2639                                                           
##  3rd Qu.:1.0000                                                           
##  Max.   :1.0000                                                           
##     parole              prior             educ         employ1         
##  Length:432         Min.   : 0.000   Min.   :2.000   Length:432        
##  Class :character   1st Qu.: 1.000   1st Qu.:3.000   Class :character  
##  Mode  :character   Median : 2.000   Median :3.000   Mode  :character  
##                     Mean   : 2.984   Mean   :3.477                     
##                     3rd Qu.: 4.000   3rd Qu.:4.000                     
##                     Max.   :18.000   Max.   :6.000
# Basic histogram of the week variable using base graphics defaults.
hist(x = arr$week)

# Histogram of week with blue bars, white bar borders, and custom title and
# axis labels.
hist(
  x = arr$week,
  col = "blue",
  border = "white",
  main = "Distribution of time to arrest (week)",
  xlab = "Week",
  ylab = "Number of Participants"
)

library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
# Descriptive summary table of every column except id, with no stratification.
table1(
  ~ age + finance + week + arrest + race + work.exp + married + parole +
    prior + educ + employ1,
  data = arr
)
Overall
(N=432)
age
Mean (SD) 24.6 (6.11)
Median [Min, Max] 23.0 [17.0, 44.0]
finance
no 216 (50.0%)
yes 216 (50.0%)
week
Mean (SD) 45.9 (12.7)
Median [Min, Max] 52.0 [1.00, 52.0]
arrest
Mean (SD) 0.264 (0.441)
Median [Min, Max] 0 [0, 1.00]
race
black 379 (87.7%)
other 53 (12.3%)
work.exp
no 185 (42.8%)
yes 247 (57.2%)
married
married 53 (12.3%)
not married 379 (87.7%)
parole
no 165 (38.2%)
yes 267 (61.8%)
prior
Mean (SD) 2.98 (2.90)
Median [Min, Max] 2.00 [0, 18.0]
educ
Mean (SD) 3.48 (0.834)
Median [Min, Max] 3.00 [2.00, 6.00]
employ1
no 372 (86.1%)
yes 60 (13.9%)
# Descriptive summary table of the listed variables only (work.exp and
# employ1 are left out), with no stratification.
table1(
  ~ age + finance + week + arrest + race + married + parole + prior + educ,
  data = arr
)
Overall
(N=432)
age
Mean (SD) 24.6 (6.11)
Median [Min, Max] 23.0 [17.0, 44.0]
finance
no 216 (50.0%)
yes 216 (50.0%)
week
Mean (SD) 45.9 (12.7)
Median [Min, Max] 52.0 [1.00, 52.0]
arrest
Mean (SD) 0.264 (0.441)
Median [Min, Max] 0 [0, 1.00]
race
black 379 (87.7%)
other 53 (12.3%)
married
married 53 (12.3%)
not married 379 (87.7%)
parole
no 165 (38.2%)
yes 267 (61.8%)
prior
Mean (SD) 2.98 (2.90)
Median [Min, Max] 2.00 [0, 18.0]
educ
Mean (SD) 3.48 (0.834)
Median [Min, Max] 3.00 [2.00, 6.00]

2: Vẽ biểu đồ phân bố biến week với hàm ggplot trong package ggplot2:

library(ggplot2)
# Default ggplot2 histogram of week; no bin settings are supplied, so
# stat_bin() falls back to its default number of bins.
ggplot(data = arr, mapping = aes(x = week)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of week with blue bars, white outlines, a title, and axis labels.
ggplot(data = arr, mapping = aes(x = week)) +
  geom_histogram(fill = "blue", col = "white") +
  labs(
    title = "Distribution of Time to Arrest",
    x = "Time to arrest (Week)",
    y = "Number of Participants"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

3: Vẽ biểu đồ phân bố biến age và đường probability density (%).

# Histogram of age drawn on the density scale, with a density curve overlaid
# in red.
# after_stat(density) replaces the ..density.. notation, which is deprecated
# as of ggplot2 3.4.0 and triggers a warning there.
ggplot(arr, aes(x = age)) +
  geom_histogram(aes(y = after_stat(density)), fill = "blue", col = "white") +
  geom_density(col = "red")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

4: Vẽ biểu đồ thanh (bar chart) biến educ:

Trục hoành và trục tung có nghĩa là gì? Các bạn hãy thêm label cho trục hoành và trục tung (dùng labs()).

# Bar chart of educ with default styling and default axis labels.
ggplot(data = arr, mapping = aes(x = educ)) +
  geom_bar()

# Bar chart of educ with blue bars, a title, and explicit axis labels.
ggplot(data = arr, mapping = aes(x = educ)) +
  geom_bar(fill = "blue") +
  labs(
    title = "Distribution of Education",
    x = "Education",
    y = "Number of Participants"
  )

5: Vẽ biểu đồ thanh (bar chart) của 2 biến educ và arrest:

Tại sao hai biểu đồ khác nhau?

# Bar chart of educ with bars filled by arrest; arrest is wrapped in factor()
# so the fill is mapped from a factor rather than from the raw numeric column.
ggplot(data = arr, mapping = aes(x = educ, fill = factor(arrest))) +
  geom_bar()

6: Bạn hãy ghi lại tất cả những hàm/lệnh trên trong RMarkdown và share trên mạng rpubs.com/tài khoản của bạn.