1. Importing data set

# Read the arrest data set into a data frame.
# NOTE(review): this is an absolute, machine-specific path; adjust it (or use a
# path relative to the project directory) before running on another computer.
arr <- read.csv("C:/Users/thien/OneDrive/Desktop/R thuc hanh/R learning/SUMS - R class data/Arrest dataset.csv")

# Show the structure of the data frame: one line per column with its type and
# the first few values.
str(arr)
## 'data.frame':    432 obs. of  12 variables:
##  $ id      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ age     : int  27 18 19 23 19 24 25 21 22 20 ...
##  $ finance : chr  "no" "no" "no" "yes" ...
##  $ week    : int  20 17 25 52 52 52 23 52 52 52 ...
##  $ arrest  : int  1 1 1 0 0 0 1 0 0 0 ...
##  $ race    : chr  "black" "black" "other" "black" ...
##  $ work.exp: chr  "no" "no" "yes" "yes" ...
##  $ married : chr  "not married" "not married" "not married" "married" ...
##  $ parole  : chr  "yes" "yes" "yes" "yes" ...
##  $ prior   : int  3 8 13 1 3 2 0 4 6 0 ...
##  $ educ    : int  3 4 3 5 3 4 4 3 3 5 ...
##  $ employ1 : chr  "no" "no" "no" "no" ...
# Dimensions of the data frame: number of rows, then number of columns.
dim(arr)
## [1] 432  12

2. Checking for a normal distribution

2.1 Based on a simulated variable x

# Based on visual inspection.

# Simulate 1000 draws from a normal distribution (mean 10, sd 3), rounded to
# two decimals. The seed is fixed first so that the sample, and therefore
# every plot below, is identical on each run.
set.seed(123)
x <- round(rnorm(1000, mean = 10, sd = 3), 2)

# Histogram: should look roughly bell-shaped for normal data.
hist(x)

# Kernel density estimate: a smoothed view of the same distribution.
plot(density(x))

# Normal Q-Q plot with a reference line (col = 2 draws it in red); points
# lying close to the line indicate approximate normality.
qqnorm(x)
qqline(x, col = 2)

# Normality can also be assessed with a statistical test, or within a regression analysis.

2.2 Based on the variable age in the data set arr

# Visual inspection: histogram of age.
hist(arr$age)

# Visual inspection: normal Q-Q plot of age with a reference line
# (col = 2 draws it in red).
qqnorm(arr$age)
qqline(arr$age, col = 2)

# Formal test: Shapiro-Wilk test of normality on age. The recorded output
# below reports p-value < 2.2e-16, i.e. the null hypothesis that age is
# normally distributed is rejected.
shapiro.test(arr$age)
## 
##  Shapiro-Wilk normality test
## 
## data:  arr$age
## W = 0.84992, p-value < 2.2e-16
# Load the table1 package, which provides table1() for descriptive tables.
# NOTE(review): library() calls are conventionally placed at the top of the
# script so that all dependencies are visible in one place.
library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
# Descriptive summary of the selected variables for the whole sample.
# NOTE(review): arrest is stored as a 0/1 integer (see the str(arr) output), so
# it is summarised here as a numeric variable (mean/SD, median); wrapping it in
# factor() would report counts and percentages instead. Confirm which is
# intended.
table1(~ age + finance + week + arrest + married, data = arr)
Overall
(N=432)
age
Mean (SD) 24.6 (6.11)
Median [Min, Max] 23.0 [17.0, 44.0]
finance
no 216 (50.0%)
yes 216 (50.0%)
week
Mean (SD) 45.9 (12.7)
Median [Min, Max] 52.0 [1.00, 52.0]
arrest
Mean (SD) 0.264 (0.441)
Median [Min, Max] 0 [0, 1.00]
married
married 53 (12.3%)
not married 379 (87.7%)
# Same descriptive summary, stratified by race: one column per race level
# plus an Overall column.
# NOTE(review): as above, arrest is a 0/1 integer and is summarised as a
# numeric variable; consider factor(arrest) if counts are wanted.
table1(~ age + finance + week + arrest + married | race, data = arr)
black
(N=379)
other
(N=53)
Overall
(N=432)
age
Mean (SD) 24.6 (6.06) 24.6 (6.53) 24.6 (6.11)
Median [Min, Max] 23.0 [17.0, 44.0] 22.0 [17.0, 42.0] 23.0 [17.0, 44.0]
finance
no 185 (48.8%) 31 (58.5%) 216 (50.0%)
yes 194 (51.2%) 22 (41.5%) 216 (50.0%)
week
Mean (SD) 45.6 (13.0) 48.0 (9.73) 45.9 (12.7)
Median [Min, Max] 52.0 [1.00, 52.0] 52.0 [7.00, 52.0] 52.0 [1.00, 52.0]
arrest
Mean (SD) 0.269 (0.444) 0.226 (0.423) 0.264 (0.441)
Median [Min, Max] 0 [0, 1.00] 0 [0, 1.00] 0 [0, 1.00]
married
married 44 (11.6%) 9 (17.0%) 53 (12.3%)
not married 335 (88.4%) 44 (83.0%) 379 (87.7%)