#th=file.choose()
arr = read.csv("C:/Users/thien/OneDrive/Desktop/R thuc hanh/R learning/SUMS - R class data/Arrest dataset.csv")
dim(arr)
## [1] 432 12
# f1 = fix(arr)
library(psych)
# Describe numerical variables
describe(arr)
## vars n mean sd median trimmed mad min max range skew
## id 1 432 216.50 124.85 216.5 216.50 160.12 1 432 431 0.00
## age 2 432 24.60 6.11 23.0 23.58 4.45 17 44 27 1.38
## finance* 3 432 1.50 0.50 1.5 1.50 0.74 1 2 1 0.00
## week 4 432 45.85 12.66 52.0 49.15 0.00 1 52 51 -1.98
## arrest 5 432 0.26 0.44 0.0 0.21 0.00 0 1 1 1.07
## race* 6 432 1.12 0.33 1.0 1.03 0.00 1 2 1 2.29
## work.exp* 7 432 1.57 0.50 2.0 1.59 0.00 1 2 1 -0.29
## married* 8 432 1.88 0.33 2.0 1.97 0.00 1 2 1 -2.29
## parole* 9 432 1.62 0.49 2.0 1.65 0.00 1 2 1 -0.48
## prior 10 432 2.98 2.90 2.0 2.46 1.48 0 18 18 2.07
## educ 11 432 3.48 0.83 3.0 3.38 0.00 2 6 4 0.91
## employ1* 12 432 1.14 0.35 1.0 1.05 0.00 1 2 1 2.08
## kurtosis se
## id -1.21 6.01
## age 1.32 0.29
## finance* -2.00 0.02
## week 2.62 0.61
## arrest -0.86 0.02
## race* 3.26 0.02
## work.exp* -1.92 0.02
## married* 3.26 0.02
## parole* -1.77 0.02
## prior 5.21 0.14
## educ 0.79 0.04
## employ1* 2.34 0.02
describe(arr, ranges = F)
## vars n mean sd skew kurtosis se
## id 1 432 216.50 124.85 0.00 -1.21 6.01
## age 2 432 24.60 6.11 1.38 1.32 0.29
## finance* 3 432 1.50 0.50 0.00 -2.00 0.02
## week 4 432 45.85 12.66 -1.98 2.62 0.61
## arrest 5 432 0.26 0.44 1.07 -0.86 0.02
## race* 6 432 1.12 0.33 2.29 3.26 0.02
## work.exp* 7 432 1.57 0.50 -0.29 -1.92 0.02
## married* 8 432 1.88 0.33 -2.29 3.26 0.02
## parole* 9 432 1.62 0.49 -0.48 -1.77 0.02
## prior 10 432 2.98 2.90 2.07 5.21 0.14
## educ 11 432 3.48 0.83 0.91 0.79 0.04
## employ1* 12 432 1.14 0.35 2.08 2.34 0.02
# Describe numerical variables by categorical variable
describeBy(arr, arr$finance, range = F)
##
## Descriptive statistics by group
## group: no
## vars n mean sd skew kurtosis se
## id 1 216 212.22 128.46 0.00 -1.27 8.74
## age 2 216 24.22 5.73 1.39 1.54 0.39
## finance* 3 216 1.00 0.00 NaN NaN 0.00
## week 4 216 44.83 13.52 -1.75 1.77 0.92
## arrest 5 216 0.31 0.46 0.84 -1.30 0.03
## race* 6 216 1.14 0.35 2.02 2.09 0.02
## work.exp* 7 216 1.57 0.50 -0.28 -1.93 0.03
## married* 8 216 1.87 0.34 -2.13 2.55 0.02
## parole* 9 216 1.62 0.49 -0.51 -1.75 0.03
## prior 10 216 2.99 2.92 2.31 7.05 0.20
## educ 11 216 3.44 0.84 0.76 0.48 0.06
## employ1* 12 216 1.16 0.37 1.87 1.50 0.02
## ------------------------------------------------------------
## group: yes
## vars n mean sd skew kurtosis se
## id 1 216 220.78 121.28 0.01 -1.16 8.25
## age 2 216 24.97 6.47 1.34 0.99 0.44
## finance* 3 216 1.00 0.00 NaN NaN 0.00
## week 4 216 46.88 11.69 -2.24 3.70 0.80
## arrest 5 216 0.22 0.42 1.33 -0.24 0.03
## race* 6 216 1.10 0.30 2.61 4.86 0.02
## work.exp* 7 216 1.57 0.50 -0.30 -1.92 0.03
## married* 8 216 1.89 0.31 -2.46 4.06 0.02
## parole* 9 216 1.61 0.49 -0.45 -1.80 0.03
## prior 10 216 2.98 2.88 1.79 3.18 0.20
## educ 11 216 3.52 0.82 1.08 1.04 0.06
## employ1* 12 216 1.12 0.33 2.32 3.39 0.02
# Describe categorical variables
library(gmodels)
attach(arr)
CrossTable(finance, digits = 3)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 432
##
##
## | no | yes |
## |-----------|-----------|
## | 216 | 216 |
## | 0.500 | 0.500 |
## |-----------|-----------|
##
##
##
##
CrossTable(finance, race, digits = 3)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 432
##
##
## | race
## finance | black | other | Row Total |
## -------------|-----------|-----------|-----------|
## no | 185 | 31 | 216 |
## | 0.107 | 0.764 | |
## | 0.856 | 0.144 | 0.500 |
## | 0.488 | 0.585 | |
## | 0.428 | 0.072 | |
## -------------|-----------|-----------|-----------|
## yes | 194 | 22 | 216 |
## | 0.107 | 0.764 | |
## | 0.898 | 0.102 | 0.500 |
## | 0.512 | 0.415 | |
## | 0.449 | 0.051 | |
## -------------|-----------|-----------|-----------|
## Column Total | 379 | 53 | 432 |
## | 0.877 | 0.123 | |
## -------------|-----------|-----------|-----------|
##
##
CrossTable(finance, race, digits = 2, chisq = T, fisher = T)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 432
##
##
## | race
## finance | black | other | Row Total |
## -------------|-----------|-----------|-----------|
## no | 185 | 31 | 216 |
## | 0.11 | 0.76 | |
## | 0.86 | 0.14 | 0.50 |
## | 0.49 | 0.58 | |
## | 0.43 | 0.07 | |
## -------------|-----------|-----------|-----------|
## yes | 194 | 22 | 216 |
## | 0.11 | 0.76 | |
## | 0.90 | 0.10 | 0.50 |
## | 0.51 | 0.42 | |
## | 0.45 | 0.05 | |
## -------------|-----------|-----------|-----------|
## Column Total | 379 | 53 | 432 |
## | 0.88 | 0.12 | |
## -------------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 1.742022 d.f. = 1 p = 0.1868828
##
## Pearson's Chi-squared test with Yates' continuity correction
## ------------------------------------------------------------
## Chi^2 = 1.376413 d.f. = 1 p = 0.2407132
##
##
## Fisher's Exact Test for Count Data
## ------------------------------------------------------------
## Sample estimate odds ratio: 0.6773696
##
## Alternative hypothesis: true odds ratio is not equal to 1
## p = 0.2405139
## 95% confidence interval: 0.3594579 1.258022
##
## Alternative hypothesis: true odds ratio is less than 1
## p = 0.1202569
## 95% confidence interval: 0 1.147456
##
## Alternative hypothesis: true odds ratio is greater than 1
## p = 0.9290199
## 95% confidence interval: 0.3960554 Inf
##
##
##
# Using simulation
x = round(rnorm(1000, mean=10, sd=3), 2)
mean(x)
## [1] 10.00196
sd(x)
## [1] 2.967631
##
hist(x)
plot(density(x))
qqnorm(x)
qqline(x, col=2)
##
library(stats)
ks.test(x, "pnorm", mean = 10, sd = 3)
## Warning in ks.test(x, "pnorm", mean = 10, sd = 3): ties should not be present
## for the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: x
## D = 0.020267, p-value = 0.806
## alternative hypothesis: two-sided
shapiro.test(x)
##
## Shapiro-Wilk normality test
##
## data: x
## W = 0.99859, p-value = 0.6115
fmh <- read.csv("C:/Users/thien/OneDrive/Desktop/R thuc hanh/R learning/SUMS - R class data/Framingham dataset.csv")
attach(fmh)
## The following objects are masked from arr:
##
## age, educ, id
str(fmh)
## 'data.frame': 11627 obs. of 39 variables:
## $ id : int 2448 2448 6238 6238 6238 9428 9428 10552 10552 11252 ...
## $ sex : int 1 1 2 2 2 1 1 2 2 2 ...
## $ tot.chol : int 195 209 250 260 237 245 283 225 232 285 ...
## $ age : int 39 52 46 52 58 48 54 61 67 46 ...
## $ sysbp : num 106 121 121 105 108 ...
## $ diasbp : num 70 66 81 69.5 66 80 89 95 109 84 ...
## $ smoker : int 0 0 0 0 0 1 1 1 1 1 ...
## $ cigs.day : int 0 0 0 0 0 20 30 30 20 23 ...
## $ bmi : num 27 NA 28.7 29.4 28.5 ...
## $ diabetes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ bpmed : int 0 0 0 0 0 0 0 0 0 0 ...
## $ heart.rate : int 80 69 95 80 80 75 75 65 60 85 ...
## $ glucose : int 77 92 76 86 71 70 87 103 89 85 ...
## $ educ : int 4 4 2 2 2 1 1 3 3 3 ...
## $ prev.chd : int 0 0 0 0 0 0 0 0 0 0 ...
## $ prev.ap : int 0 0 0 0 0 0 0 0 0 0 ...
## $ prev.mi : int 0 0 0 0 0 0 0 0 0 0 ...
## $ prev.stroke : int 0 0 0 0 0 0 0 0 0 0 ...
## $ prev.hyp : int 0 0 0 0 0 0 0 1 1 0 ...
## $ time : int 0 4628 0 2156 4344 0 2199 0 1977 0 ...
## $ period : int 1 3 1 2 3 1 2 1 2 1 ...
## $ hdlc : int NA 31 NA NA 54 NA NA NA NA NA ...
## $ ldlc : int NA 178 NA NA 141 NA NA NA NA NA ...
## $ death : int 0 0 0 0 0 0 0 1 1 0 ...
## $ angina : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hosp.mi : int 1 1 0 0 0 0 0 0 0 0 ...
## $ mi.fchd : int 1 1 0 0 0 0 0 0 0 0 ...
## $ any.chd : int 1 1 0 0 0 0 0 0 0 0 ...
## $ stroke : int 0 0 0 0 0 0 0 1 1 0 ...
## $ cvd : int 1 1 0 0 0 0 0 1 1 0 ...
## $ hypertension: int 0 0 0 0 0 0 0 1 1 1 ...
## $ time.ap : int 8766 8766 8766 8766 8766 8766 8766 2956 2956 8766 ...
## $ time.mi : int 6438 6438 8766 8766 8766 8766 8766 2956 2956 8766 ...
## $ time.mi.1 : int 6438 6438 8766 8766 8766 8766 8766 2956 2956 8766 ...
## $ time.chd : int 6438 6438 8766 8766 8766 8766 8766 2956 2956 8766 ...
## $ time.stroke : int 8766 8766 8766 8766 8766 8766 8766 2089 2089 8766 ...
## $ time.cvd : int 6438 6438 8766 8766 8766 8766 8766 2089 2089 8766 ...
## $ time.dth : int 8766 8766 8766 8766 8766 8766 8766 2956 2956 8766 ...
## $ time.hyp : int 8766 8766 8766 8766 8766 8766 8766 0 0 4285 ...
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
t.test(bmi~sex)
##
## Welch Two Sample t-test
##
## data: bmi by sex
## t = 7.7706, df = 11572, p-value = 8.471e-15
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
## 0.4300158 0.7201504
## sample estimates:
## mean in group 1 mean in group 2
## 26.20382 25.62873
table1(~ bmi|sex, fmh)
## Warning in table1.formula(~bmi | sex, fmh): Terms to the right of '|' in formula
## 'x' define table columns and are expected to be factors with meaningful labels.
| 1 (N=5022) |
2 (N=6605) |
Overall (N=11627) |
|
|---|---|---|---|
| bmi | |||
| Mean (SD) | 26.2 (3.43) | 25.6 (4.53) | 25.9 (4.10) |
| Median [Min, Max] | 26.1 [14.4, 45.4] | 24.8 [14.5, 56.8] | 25.5 [14.4, 56.8] |
| Missing | 18 (0.4%) | 34 (0.5%) | 52 (0.4%) |