1 Importing data

# Load the arrest dataset from a local CSV file into a data frame.
# NOTE(review): the path is absolute and machine-specific; adjust it when
# running this script on another computer.
arr <- read.csv(
  file = "/Users/gantsetsegganbaatar/Desktop/R class 2023/Arrest-dataset.csv"
)

dim(arr) # dimensions: number of rows, then number of columns
## [1] 432  12

2 Describing data

library(psych) # provides describe() and describeBy()
# Descriptive statistics (n, mean, sd, median, ...) for every column of arr
describe(x = arr)
##           vars   n   mean     sd median trimmed    mad min max range  skew
## id           1 432 216.50 124.85  216.5  216.50 160.12   1 432   431  0.00
## age          2 432  24.60   6.11   23.0   23.58   4.45  17  44    27  1.38
## finance*     3 432   1.50   0.50    1.5    1.50   0.74   1   2     1  0.00
## week         4 432  45.85  12.66   52.0   49.15   0.00   1  52    51 -1.98
## arrest       5 432   0.26   0.44    0.0    0.21   0.00   0   1     1  1.07
## race*        6 432   1.12   0.33    1.0    1.03   0.00   1   2     1  2.29
## work.exp*    7 432   1.57   0.50    2.0    1.59   0.00   1   2     1 -0.29
## married*     8 432   1.88   0.33    2.0    1.97   0.00   1   2     1 -2.29
## parole*      9 432   1.62   0.49    2.0    1.65   0.00   1   2     1 -0.48
## prior       10 432   2.98   2.90    2.0    2.46   1.48   0  18    18  2.07
## educ        11 432   3.48   0.83    3.0    3.38   0.00   2   6     4  0.91
## employ1*    12 432   1.14   0.35    1.0    1.05   0.00   1   2     1  2.08
##           kurtosis   se
## id           -1.21 6.01
## age           1.32 0.29
## finance*     -2.00 0.02
## week          2.62 0.61
## arrest       -0.86 0.02
## race*         3.26 0.02
## work.exp*    -1.92 0.02
## married*      3.26 0.02
## parole*      -1.77 0.02
## prior         5.21 0.14
## educ          0.79 0.04
## employ1*      2.34 0.02
# ranges = FALSE omits the range-related statistics and displays only the
# most important information (n, mean, sd, skew, kurtosis, se).
describe(arr, ranges = FALSE)
##           vars   n   mean     sd  skew kurtosis   se
## id           1 432 216.50 124.85  0.00    -1.21 6.01
## age          2 432  24.60   6.11  1.38     1.32 0.29
## finance*     3 432   1.50   0.50  0.00    -2.00 0.02
## week         4 432  45.85  12.66 -1.98     2.62 0.61
## arrest       5 432   0.26   0.44  1.07    -0.86 0.02
## race*        6 432   1.12   0.33  2.29     3.26 0.02
## work.exp*    7 432   1.57   0.50 -0.29    -1.92 0.02
## married*     8 432   1.88   0.33 -2.29     3.26 0.02
## parole*      9 432   1.62   0.49 -0.48    -1.77 0.02
## prior       10 432   2.98   2.90  2.07     5.21 0.14
## educ        11 432   3.48   0.83  0.91     0.79 0.04
## employ1*    12 432   1.14   0.35  2.08     2.34 0.02
# Describe the variables separately for each level of a categorical variable
# (here: finance). The argument is spelled out in full as ranges = FALSE
# rather than relying on partial matching of `range` and on the
# reassignable shorthand F.
describeBy(arr, group = arr$finance, ranges = FALSE)
## 
##  Descriptive statistics by group 
## group: no
##           vars   n   mean     sd  skew kurtosis   se
## id           1 216 212.22 128.46  0.00    -1.27 8.74
## age          2 216  24.22   5.73  1.39     1.54 0.39
## finance*     3 216   1.00   0.00   NaN      NaN 0.00
## week         4 216  44.83  13.52 -1.75     1.77 0.92
## arrest       5 216   0.31   0.46  0.84    -1.30 0.03
## race*        6 216   1.14   0.35  2.02     2.09 0.02
## work.exp*    7 216   1.57   0.50 -0.28    -1.93 0.03
## married*     8 216   1.87   0.34 -2.13     2.55 0.02
## parole*      9 216   1.62   0.49 -0.51    -1.75 0.03
## prior       10 216   2.99   2.92  2.31     7.05 0.20
## educ        11 216   3.44   0.84  0.76     0.48 0.06
## employ1*    12 216   1.16   0.37  1.87     1.50 0.02
## ------------------------------------------------------------ 
## group: yes
##           vars   n   mean     sd  skew kurtosis   se
## id           1 216 220.78 121.28  0.01    -1.16 8.25
## age          2 216  24.97   6.47  1.34     0.99 0.44
## finance*     3 216   1.00   0.00   NaN      NaN 0.00
## week         4 216  46.88  11.69 -2.24     3.70 0.80
## arrest       5 216   0.22   0.42  1.33    -0.24 0.03
## race*        6 216   1.10   0.30  2.61     4.86 0.02
## work.exp*    7 216   1.57   0.50 -0.30    -1.92 0.03
## married*     8 216   1.89   0.31 -2.46     4.06 0.02
## parole*      9 216   1.61   0.49 -0.45    -1.80 0.03
## prior       10 216   2.98   2.88  1.79     3.18 0.20
## educ        11 216   3.52   0.82  1.08     1.04 0.06
## employ1*    12 216   1.12   0.33  2.32     3.39 0.02
# Describe categorical variables
library(gmodels)
# attach() puts the columns of arr on the search path so that they can be
# referred to by bare name (finance, race, age, ...). The alternatives are to
# write arr$finance or to wrap a call in with(arr, ...).
attach(arr) # or use arr$ before finance
# One-way frequency table of finance: counts and proportion of the table total
CrossTable(finance, digits = 3)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  432 
## 
##  
##           |        no |       yes | 
##           |-----------|-----------|
##           |       216 |       216 | 
##           |     0.500 |     0.500 | 
##           |-----------|-----------|
## 
## 
## 
## 
# Two-way table of finance by race (counts, chi-square contributions, and
# row / column / table proportions). with(arr, ...) looks the columns up in
# arr directly, so the call does not depend on arr being attached, while the
# table is still labelled "finance" and "race".
with(arr, CrossTable(finance, race, digits = 3))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  432 
## 
##  
##              | race 
##      finance |     black |     other | Row Total | 
## -------------|-----------|-----------|-----------|
##           no |       185 |        31 |       216 | 
##              |     0.107 |     0.764 |           | 
##              |     0.856 |     0.144 |     0.500 | 
##              |     0.488 |     0.585 |           | 
##              |     0.428 |     0.072 |           | 
## -------------|-----------|-----------|-----------|
##          yes |       194 |        22 |       216 | 
##              |     0.107 |     0.764 |           | 
##              |     0.898 |     0.102 |     0.500 | 
##              |     0.512 |     0.415 |           | 
##              |     0.449 |     0.051 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |       379 |        53 |       432 | 
##              |     0.877 |     0.123 |           | 
## -------------|-----------|-----------|-----------|
## 
## 
# Same two-way table with 2-digit proportions, plus Pearson's chi-squared
# test and Fisher's exact test. If the number of participants is very low,
# use Fisher's test. with(arr, ...) looks the columns up in arr directly.
with(arr, CrossTable(finance, race, digits = 2, chisq = TRUE, fisher = TRUE))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  432 
## 
##  
##              | race 
##      finance |     black |     other | Row Total | 
## -------------|-----------|-----------|-----------|
##           no |       185 |        31 |       216 | 
##              |      0.11 |      0.76 |           | 
##              |      0.86 |      0.14 |      0.50 | 
##              |      0.49 |      0.58 |           | 
##              |      0.43 |      0.07 |           | 
## -------------|-----------|-----------|-----------|
##          yes |       194 |        22 |       216 | 
##              |      0.11 |      0.76 |           | 
##              |      0.90 |      0.10 |      0.50 | 
##              |      0.51 |      0.42 |           | 
##              |      0.45 |      0.05 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |       379 |        53 |       432 | 
##              |      0.88 |      0.12 |           | 
## -------------|-----------|-----------|-----------|
## 
##  
## Statistics for All Table Factors
## 
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 =  1.742022     d.f. =  1     p =  0.1868828 
## 
## Pearson's Chi-squared test with Yates' continuity correction 
## ------------------------------------------------------------
## Chi^2 =  1.376413     d.f. =  1     p =  0.2407132 
## 
##  
## Fisher's Exact Test for Count Data
## ------------------------------------------------------------
## Sample estimate odds ratio:  0.6773696 
## 
## Alternative hypothesis: true odds ratio is not equal to 1
## p =  0.2405139 
## 95% confidence interval:  0.3594579 1.258022 
## 
## Alternative hypothesis: true odds ratio is less than 1
## p =  0.1202569 
## 95% confidence interval:  0 1.147456 
## 
## Alternative hypothesis: true odds ratio is greater than 1
## p =  0.9290199 
## 95% confidence interval:  0.3960554 Inf 
## 
## 
## 

3.1 Check normal distribution based on visual inspection

# Simulated dataset used below: 1000 draws from a normal distribution with
# mean 10 and sd 3, rounded to 2 decimal places

x <- round(rnorm(n = 1000, mean = 10, sd = 3), digits = 2)
mean(x)
## [1] 9.90123
sd(x)
## [1] 3.088074
hist(x) # histogram

plot(density(x)) # kernel density estimate

qqnorm(x) # normal Q-Q plot
qqline(x, col = 2) # reference line; col sets the colour, 2 is red

# 3.2 Check distribution by statistical test

# NOTE(review): ks.test() and shapiro.test() live in the base 'stats' package,
# which is attached by default. stats19 appears unrelated to these tests (see
# its startup message below) and can probably be dropped -- confirm that
# nothing else in the script uses it.
library(stats19)
## Data provided under OGL v3.0. Cite the source and link to:
## www.nationalarchives.gov.uk/doc/open-government-licence/version/3/
# One-sample Kolmogorov-Smirnov test of x against a normal distribution with
# mean 10 and sd 3. The ties warning below presumably arises because x was
# rounded to 2 decimals, which produces repeated values.
ks.test(x, "pnorm", mean = 10, sd = 3)
## Warning in ks.test.default(x, "pnorm", mean = 10, sd = 3): ties should not be
## present for the Kolmogorov-Smirnov test
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  x
## D = 0.03026, p-value = 0.3191
## alternative hypothesis: two-sided
# Shapiro-Wilk normality test on the simulated data
shapiro.test(x)
## 
##  Shapiro-Wilk normality test
## 
## data:  x
## W = 0.99866, p-value = 0.6649
library(nortest) # Lilliefors (Kolmogorov-Smirnov) normality test
# Lilliefors test on the simulated data
lillie.test(x)
## 
##  Lilliefors (Kolmogorov-Smirnov) normality test
## 
## data:  x
## D = 0.01369, p-value = 0.921
# Shapiro-Wilk normality test on a real variable: age from the arrest data
shapiro.test(arr$age)
## 
##  Shapiro-Wilk normality test
## 
## data:  arr$age
## W = 0.84992, p-value < 2.2e-16

4 Creating summary table

# The table1 package is used to build the summary tables

library(table1) # create summary table of selected variables as shown below
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
# NOTE(review): arrest is coded 0/1 (min 0, max 1), so it is summarised below
# as a continuous variable (mean/SD); wrap it in factor() if counts and
# percentages are wanted instead.
table1(~ age + finance + week + arrest + married, data = arr)
Overall
(N=432)
age
Mean (SD) 24.6 (6.11)
Median [Min, Max] 23.0 [17.0, 44.0]
finance
no 216 (50.0%)
yes 216 (50.0%)
week
Mean (SD) 45.9 (12.7)
Median [Min, Max] 52.0 [1.00, 52.0]
arrest
Mean (SD) 0.264 (0.441)
Median [Min, Max] 0 [0, 1.00]
married
married 53 (12.3%)
not married 379 (87.7%)
# Same summary table, stratified by race (one column per group plus Overall)
table1(~ age + finance + week + arrest + married | race, data = arr)
black
(N=379)
other
(N=53)
Overall
(N=432)
age
Mean (SD) 24.6 (6.06) 24.6 (6.53) 24.6 (6.11)
Median [Min, Max] 23.0 [17.0, 44.0] 22.0 [17.0, 42.0] 23.0 [17.0, 44.0]
finance
no 185 (48.8%) 31 (58.5%) 216 (50.0%)
yes 194 (51.2%) 22 (41.5%) 216 (50.0%)
week
Mean (SD) 45.6 (13.0) 48.0 (9.73) 45.9 (12.7)
Median [Min, Max] 52.0 [1.00, 52.0] 52.0 [7.00, 52.0] 52.0 [1.00, 52.0]
arrest
Mean (SD) 0.269 (0.444) 0.226 (0.423) 0.264 (0.441)
Median [Min, Max] 0 [0, 1.00] 0 [0, 1.00] 0 [0, 1.00]
married
married 44 (11.6%) 9 (17.0%) 53 (12.3%)
not married 335 (88.4%) 44 (83.0%) 379 (87.7%)
# Check the difference of a continuous variable between groups.
# Wilcoxon rank-sum test of age by race; data = arr supplies the columns
# explicitly, so the formula does not depend on arr being attached.
wilcox.test(age ~ race, data = arr)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  age by race
## W = 10402, p-value = 0.6735
## alternative hypothesis: true location shift is not equal to 0
# Two-sample t-test of age by race; data = arr supplies the columns
# explicitly, so the formula does not depend on arr being attached.
t.test(age ~ race, data = arr)
## 
##  Welch Two Sample t-test
## 
## data:  age by race
## t = -0.053158, df = 65.152, p-value = 0.9578
## alternative hypothesis: true difference in means between group black and group other is not equal to 0
## 95 percent confidence interval:
##  -1.946953  1.845992
## sample estimates:
## mean in group black mean in group other 
##            24.59103            24.64151