# Load the arrest dataset from its CSV file into a data frame.
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs where this exact file exists.
arr <- read.csv(file = "/Users/gantsetsegganbaatar/Desktop/R class 2023/Arrest-dataset.csv")
dim(arr) # dimensions of the data frame: number of rows, then number of columns
## [1] 432 12
library(psych)
# Full descriptive statistics (including median, min, max and range) for arr
describe(x = arr)
## vars n mean sd median trimmed mad min max range skew
## id 1 432 216.50 124.85 216.5 216.50 160.12 1 432 431 0.00
## age 2 432 24.60 6.11 23.0 23.58 4.45 17 44 27 1.38
## finance* 3 432 1.50 0.50 1.5 1.50 0.74 1 2 1 0.00
## week 4 432 45.85 12.66 52.0 49.15 0.00 1 52 51 -1.98
## arrest 5 432 0.26 0.44 0.0 0.21 0.00 0 1 1 1.07
## race* 6 432 1.12 0.33 1.0 1.03 0.00 1 2 1 2.29
## work.exp* 7 432 1.57 0.50 2.0 1.59 0.00 1 2 1 -0.29
## married* 8 432 1.88 0.33 2.0 1.97 0.00 1 2 1 -2.29
## parole* 9 432 1.62 0.49 2.0 1.65 0.00 1 2 1 -0.48
## prior 10 432 2.98 2.90 2.0 2.46 1.48 0 18 18 2.07
## educ 11 432 3.48 0.83 3.0 3.38 0.00 2 6 4 0.91
## employ1* 12 432 1.14 0.35 1.0 1.05 0.00 1 2 1 2.08
## kurtosis se
## id -1.21 6.01
## age 1.32 0.29
## finance* -2.00 0.02
## week 2.62 0.61
## arrest -0.86 0.02
## race* 3.26 0.02
## work.exp* -1.92 0.02
## married* 3.26 0.02
## parole* -1.77 0.02
## prior 5.21 0.14
## educ 0.79 0.04
## employ1* 2.34 0.02
# Display only the most important information: ranges = FALSE leaves out the
# median, trimmed, mad, min, max and range columns.
# FALSE is spelled out because F is an ordinary variable that can be reassigned.
describe(arr, ranges = FALSE)
## vars n mean sd skew kurtosis se
## id 1 432 216.50 124.85 0.00 -1.21 6.01
## age 2 432 24.60 6.11 1.38 1.32 0.29
## finance* 3 432 1.50 0.50 0.00 -2.00 0.02
## week 4 432 45.85 12.66 -1.98 2.62 0.61
## arrest 5 432 0.26 0.44 1.07 -0.86 0.02
## race* 6 432 1.12 0.33 2.29 3.26 0.02
## work.exp* 7 432 1.57 0.50 -0.29 -1.92 0.02
## married* 8 432 1.88 0.33 -2.29 3.26 0.02
## parole* 9 432 1.62 0.49 -0.48 -1.77 0.02
## prior 10 432 2.98 2.90 2.07 5.21 0.14
## educ 11 432 3.48 0.83 0.91 0.79 0.04
## employ1* 12 432 1.14 0.35 2.08 2.34 0.02
# Describe the variables separately for each level of a categorical variable
# (here: finance). The argument is spelled `ranges` in full instead of relying
# on partial matching of `range`, and FALSE replaces the reassignable F.
# As in describe(), ranges = FALSE drops the median/min/max/range columns.
describeBy(arr, arr$finance, ranges = FALSE)
##
## Descriptive statistics by group
## group: no
## vars n mean sd skew kurtosis se
## id 1 216 212.22 128.46 0.00 -1.27 8.74
## age 2 216 24.22 5.73 1.39 1.54 0.39
## finance* 3 216 1.00 0.00 NaN NaN 0.00
## week 4 216 44.83 13.52 -1.75 1.77 0.92
## arrest 5 216 0.31 0.46 0.84 -1.30 0.03
## race* 6 216 1.14 0.35 2.02 2.09 0.02
## work.exp* 7 216 1.57 0.50 -0.28 -1.93 0.03
## married* 8 216 1.87 0.34 -2.13 2.55 0.02
## parole* 9 216 1.62 0.49 -0.51 -1.75 0.03
## prior 10 216 2.99 2.92 2.31 7.05 0.20
## educ 11 216 3.44 0.84 0.76 0.48 0.06
## employ1* 12 216 1.16 0.37 1.87 1.50 0.02
## ------------------------------------------------------------
## group: yes
## vars n mean sd skew kurtosis se
## id 1 216 220.78 121.28 0.01 -1.16 8.25
## age 2 216 24.97 6.47 1.34 0.99 0.44
## finance* 3 216 1.00 0.00 NaN NaN 0.00
## week 4 216 46.88 11.69 -2.24 3.70 0.80
## arrest 5 216 0.22 0.42 1.33 -0.24 0.03
## race* 6 216 1.10 0.30 2.61 4.86 0.02
## work.exp* 7 216 1.57 0.50 -0.30 -1.92 0.03
## married* 8 216 1.89 0.31 -2.46 4.06 0.02
## parole* 9 216 1.61 0.49 -0.45 -1.80 0.03
## prior 10 216 2.98 2.88 1.79 3.18 0.20
## educ 11 216 3.52 0.82 1.08 1.04 0.06
## employ1* 12 216 1.12 0.33 2.32 3.39 0.02
# Describe categorical variables
library(gmodels)
# NOTE(review): attach() places arr's columns on the search path so that later
# calls can use bare names such as finance, race and age. Attached columns can
# silently mask, or be masked by, other objects; arr$col, with(arr, ...) or a
# data = argument are safer alternatives.
attach(arr) # or use arr$ before finance
# One-way frequency table of finance: counts and proportions of the total.
# with() looks the column up in arr directly, so the call does not depend on
# arr having been attach()ed.
with(arr, CrossTable(finance, digits = 3))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 432
##
##
## | no | yes |
## |-----------|-----------|
## | 216 | 216 |
## | 0.500 | 0.500 |
## |-----------|-----------|
##
##
##
##
# Two-way table of finance (rows) by race (columns) with row, column and
# table proportions. with() resolves both columns inside arr, so the call does
# not depend on attach(); the printed row/column labels stay "finance"/"race".
with(arr, CrossTable(finance, race, digits = 3))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 432
##
##
## | race
## finance | black | other | Row Total |
## -------------|-----------|-----------|-----------|
## no | 185 | 31 | 216 |
## | 0.107 | 0.764 | |
## | 0.856 | 0.144 | 0.500 |
## | 0.488 | 0.585 | |
## | 0.428 | 0.072 | |
## -------------|-----------|-----------|-----------|
## yes | 194 | 22 | 216 |
## | 0.107 | 0.764 | |
## | 0.898 | 0.102 | 0.500 |
## | 0.512 | 0.415 | |
## | 0.449 | 0.051 | |
## -------------|-----------|-----------|-----------|
## Column Total | 379 | 53 | 432 |
## | 0.877 | 0.123 | |
## -------------|-----------|-----------|-----------|
##
##
# Same finance-by-race table, additionally reporting the chi-squared test and
# Fisher's exact test. If the number of participants is very low, use
# Fisher's test.
# TRUE is spelled out (T is a reassignable variable) and with() resolves the
# columns inside arr so the call does not depend on attach().
with(arr, CrossTable(finance, race, digits = 2, chisq = TRUE, fisher = TRUE))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 432
##
##
## | race
## finance | black | other | Row Total |
## -------------|-----------|-----------|-----------|
## no | 185 | 31 | 216 |
## | 0.11 | 0.76 | |
## | 0.86 | 0.14 | 0.50 |
## | 0.49 | 0.58 | |
## | 0.43 | 0.07 | |
## -------------|-----------|-----------|-----------|
## yes | 194 | 22 | 216 |
## | 0.11 | 0.76 | |
## | 0.90 | 0.10 | 0.50 |
## | 0.51 | 0.42 | |
## | 0.45 | 0.05 | |
## -------------|-----------|-----------|-----------|
## Column Total | 379 | 53 | 432 |
## | 0.88 | 0.12 | |
## -------------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 1.742022 d.f. = 1 p = 0.1868828
##
## Pearson's Chi-squared test with Yates' continuity correction
## ------------------------------------------------------------
## Chi^2 = 1.376413 d.f. = 1 p = 0.2407132
##
##
## Fisher's Exact Test for Count Data
## ------------------------------------------------------------
## Sample estimate odds ratio: 0.6773696
##
## Alternative hypothesis: true odds ratio is not equal to 1
## p = 0.2405139
## 95% confidence interval: 0.3594579 1.258022
##
## Alternative hypothesis: true odds ratio is less than 1
## p = 0.1202569
## 95% confidence interval: 0 1.147456
##
## Alternative hypothesis: true odds ratio is greater than 1
## p = 0.9290199
## 95% confidence interval: 0.3960554 Inf
##
##
##
# Using a simulated dataset below: 1000 draws from a normal distribution with
# mean 10 and sd 3, rounded to two decimals.
# The RNG seed is fixed so the sample is identical on every run; the seed
# value itself is arbitrary. Outputs recorded further down in this file
# presumably came from an earlier unseeded run and will not match exactly.
set.seed(2023)
x <- round(rnorm(1000, mean = 10, sd = 3), 2)
# Sample mean and standard deviation of the simulated values
mean(x)
## [1] 9.90123
sd(x)
## [1] 3.088074
# Visual checks of the distribution: histogram, density estimate, and a
# normal Q-Q plot with its reference line
hist(x)
plot(density(x))
qqnorm(x)
qqline(x, col=2) # col sets the line colour; 2 is the second colour of the current palette (a red in the default palette)
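# 3.1 Check the distribution with statistical tests
# NOTE(review): ks.test() and shapiro.test() belong to the base 'stats'
# package, which is loaded by default; 'stats19' (loaded below) is a
# road-crash data package and is not needed for these tests.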
library(stats19)
## Data provided under OGL v3.0. Cite the source and link to:
## www.nationalarchives.gov.uk/doc/open-government-licence/version/3/
# One-sample Kolmogorov-Smirnov test of x against a normal distribution with
# mean 10 and sd 3.
# NOTE(review): the ties warning recorded below is likely caused by x having
# been rounded to two decimals.
ks.test(x, "pnorm", mean = 10, sd = 3)
## Warning in ks.test.default(x, "pnorm", mean = 10, sd = 3): ties should not be
## present for the Kolmogorov-Smirnov test
##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: x
## D = 0.03026, p-value = 0.3191
## alternative hypothesis: two-sided
# Shapiro-Wilk normality test on the simulated values
shapiro.test(x)
##
## Shapiro-Wilk normality test
##
## data: x
## W = 0.99866, p-value = 0.6649
library(nortest) # Lilliefors (Kolmogorov-Smirnov) normality test
# Lilliefors normality test on the simulated values (from the nortest package)
lillie.test(x)
##
## Lilliefors (Kolmogorov-Smirnov) normality test
##
## data: x
## D = 0.01369, p-value = 0.921
# Shapiro-Wilk normality test on the age column of the arrest dataset
shapiro.test(arr$age)
##
## Shapiro-Wilk normality test
##
## data: arr$age
## W = 0.84992, p-value < 2.2e-16
# Important package is table1
library(table1) # create summary table of selected var as shown below
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
# Summary table of the selected variables for the whole sample.
# NOTE(review): in the recorded output arrest is summarised as a continuous
# variable (mean/SD); if it is a 0/1 indicator, factor(arrest) may be more
# appropriate - confirm.
table1(
  ~ age + finance + week + arrest + married,
  data = arr
)
##                     | Overall (N=432)
## --------------------|-------------------
## age                 |
##   Mean (SD)         | 24.6 (6.11)
##   Median [Min, Max] | 23.0 [17.0, 44.0]
## finance             |
##   no                | 216 (50.0%)
##   yes               | 216 (50.0%)
## week                |
##   Mean (SD)         | 45.9 (12.7)
##   Median [Min, Max] | 52.0 [1.00, 52.0]
## arrest              |
##   Mean (SD)         | 0.264 (0.441)
##   Median [Min, Max] | 0 [0, 1.00]
## married             |
##   married           | 53 (12.3%)
##   not married       | 379 (87.7%)
# Same summary table, stratified by race (the term after `|` in the formula)
table1(
  ~ age + finance + week + arrest + married | race,
  data = arr
)
##                     | black (N=379)     | other (N=53)      | Overall (N=432)
## --------------------|-------------------|-------------------|-------------------
## age                 |                   |                   |
##   Mean (SD)         | 24.6 (6.06)       | 24.6 (6.53)       | 24.6 (6.11)
##   Median [Min, Max] | 23.0 [17.0, 44.0] | 22.0 [17.0, 42.0] | 23.0 [17.0, 44.0]
## finance             |                   |                   |
##   no                | 185 (48.8%)       | 31 (58.5%)        | 216 (50.0%)
##   yes               | 194 (51.2%)       | 22 (41.5%)        | 216 (50.0%)
## week                |                   |                   |
##   Mean (SD)         | 45.6 (13.0)       | 48.0 (9.73)       | 45.9 (12.7)
##   Median [Min, Max] | 52.0 [1.00, 52.0] | 52.0 [7.00, 52.0] | 52.0 [1.00, 52.0]
## arrest              |                   |                   |
##   Mean (SD)         | 0.269 (0.444)     | 0.226 (0.423)     | 0.264 (0.441)
##   Median [Min, Max] | 0 [0, 1.00]       | 0 [0, 1.00]       | 0 [0, 1.00]
## married             |                   |                   |
##   married           | 44 (11.6%)        | 9 (17.0%)         | 53 (12.3%)
##   not married       | 335 (88.4%)       | 44 (83.0%)        | 379 (87.7%)
# To check the difference in a continuous variable between groups:
# Wilcoxon rank-sum test of age by race.
# data = arr makes the formula resolve inside the data frame, so the call does
# not depend on arr having been attach()ed.
wilcox.test(age ~ race, data = arr)
##
## Wilcoxon rank sum test with continuity correction
##
## data: age by race
## W = 10402, p-value = 0.6735
## alternative hypothesis: true location shift is not equal to 0
# Two-sample t-test of age by race (with these defaults R runs the Welch
# version, as the recorded output shows).
# data = arr makes the formula resolve inside the data frame, so the call does
# not depend on arr having been attach()ed.
t.test(age ~ race, data = arr)
##
## Welch Two Sample t-test
##
## data: age by race
## t = -0.053158, df = 65.152, p-value = 0.9578
## alternative hypothesis: true difference in means between group black and group other is not equal to 0
## 95 percent confidence interval:
## -1.946953 1.845992
## sample estimates:
## mean in group black mean in group other
## 24.59103 24.64151