1. How to import the dataset

#th=file.choose()
arr = read.csv("C:/Users/thien/OneDrive/Desktop/R thuc hanh/R learning/SUMS - R class data/Arrest dataset.csv")
dim(arr)
## [1] 432  12
# f1 = fix(arr)

2. Describe variables

library(psych)
# Describe numerical variables
describe(arr)
##           vars   n   mean     sd median trimmed    mad min max range  skew
## id           1 432 216.50 124.85  216.5  216.50 160.12   1 432   431  0.00
## age          2 432  24.60   6.11   23.0   23.58   4.45  17  44    27  1.38
## finance*     3 432   1.50   0.50    1.5    1.50   0.74   1   2     1  0.00
## week         4 432  45.85  12.66   52.0   49.15   0.00   1  52    51 -1.98
## arrest       5 432   0.26   0.44    0.0    0.21   0.00   0   1     1  1.07
## race*        6 432   1.12   0.33    1.0    1.03   0.00   1   2     1  2.29
## work.exp*    7 432   1.57   0.50    2.0    1.59   0.00   1   2     1 -0.29
## married*     8 432   1.88   0.33    2.0    1.97   0.00   1   2     1 -2.29
## parole*      9 432   1.62   0.49    2.0    1.65   0.00   1   2     1 -0.48
## prior       10 432   2.98   2.90    2.0    2.46   1.48   0  18    18  2.07
## educ        11 432   3.48   0.83    3.0    3.38   0.00   2   6     4  0.91
## employ1*    12 432   1.14   0.35    1.0    1.05   0.00   1   2     1  2.08
##           kurtosis   se
## id           -1.21 6.01
## age           1.32 0.29
## finance*     -2.00 0.02
## week          2.62 0.61
## arrest       -0.86 0.02
## race*         3.26 0.02
## work.exp*    -1.92 0.02
## married*      3.26 0.02
## parole*      -1.77 0.02
## prior         5.21 0.14
## educ          0.79 0.04
## employ1*      2.34 0.02
describe(arr, ranges = F)
##           vars   n   mean     sd  skew kurtosis   se
## id           1 432 216.50 124.85  0.00    -1.21 6.01
## age          2 432  24.60   6.11  1.38     1.32 0.29
## finance*     3 432   1.50   0.50  0.00    -2.00 0.02
## week         4 432  45.85  12.66 -1.98     2.62 0.61
## arrest       5 432   0.26   0.44  1.07    -0.86 0.02
## race*        6 432   1.12   0.33  2.29     3.26 0.02
## work.exp*    7 432   1.57   0.50 -0.29    -1.92 0.02
## married*     8 432   1.88   0.33 -2.29     3.26 0.02
## parole*      9 432   1.62   0.49 -0.48    -1.77 0.02
## prior       10 432   2.98   2.90  2.07     5.21 0.14
## educ        11 432   3.48   0.83  0.91     0.79 0.04
## employ1*    12 432   1.14   0.35  2.08     2.34 0.02
# Describe numerical variables by categorical variable
describeBy(arr, arr$finance, range = F)
## 
##  Descriptive statistics by group 
## group: no
##           vars   n   mean     sd  skew kurtosis   se
## id           1 216 212.22 128.46  0.00    -1.27 8.74
## age          2 216  24.22   5.73  1.39     1.54 0.39
## finance*     3 216   1.00   0.00   NaN      NaN 0.00
## week         4 216  44.83  13.52 -1.75     1.77 0.92
## arrest       5 216   0.31   0.46  0.84    -1.30 0.03
## race*        6 216   1.14   0.35  2.02     2.09 0.02
## work.exp*    7 216   1.57   0.50 -0.28    -1.93 0.03
## married*     8 216   1.87   0.34 -2.13     2.55 0.02
## parole*      9 216   1.62   0.49 -0.51    -1.75 0.03
## prior       10 216   2.99   2.92  2.31     7.05 0.20
## educ        11 216   3.44   0.84  0.76     0.48 0.06
## employ1*    12 216   1.16   0.37  1.87     1.50 0.02
## ------------------------------------------------------------ 
## group: yes
##           vars   n   mean     sd  skew kurtosis   se
## id           1 216 220.78 121.28  0.01    -1.16 8.25
## age          2 216  24.97   6.47  1.34     0.99 0.44
## finance*     3 216   1.00   0.00   NaN      NaN 0.00
## week         4 216  46.88  11.69 -2.24     3.70 0.80
## arrest       5 216   0.22   0.42  1.33    -0.24 0.03
## race*        6 216   1.10   0.30  2.61     4.86 0.02
## work.exp*    7 216   1.57   0.50 -0.30    -1.92 0.03
## married*     8 216   1.89   0.31 -2.46     4.06 0.02
## parole*      9 216   1.61   0.49 -0.45    -1.80 0.03
## prior       10 216   2.98   2.88  1.79     3.18 0.20
## educ        11 216   3.52   0.82  1.08     1.04 0.06
## employ1*    12 216   1.12   0.33  2.32     3.39 0.02
# Describe categorical variables
library(gmodels)
attach(arr)
CrossTable(finance, digits = 3)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  432 
## 
##  
##           |        no |       yes | 
##           |-----------|-----------|
##           |       216 |       216 | 
##           |     0.500 |     0.500 | 
##           |-----------|-----------|
## 
## 
## 
## 
CrossTable(finance, race, digits = 3)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  432 
## 
##  
##              | race 
##      finance |     black |     other | Row Total | 
## -------------|-----------|-----------|-----------|
##           no |       185 |        31 |       216 | 
##              |     0.107 |     0.764 |           | 
##              |     0.856 |     0.144 |     0.500 | 
##              |     0.488 |     0.585 |           | 
##              |     0.428 |     0.072 |           | 
## -------------|-----------|-----------|-----------|
##          yes |       194 |        22 |       216 | 
##              |     0.107 |     0.764 |           | 
##              |     0.898 |     0.102 |     0.500 | 
##              |     0.512 |     0.415 |           | 
##              |     0.449 |     0.051 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |       379 |        53 |       432 | 
##              |     0.877 |     0.123 |           | 
## -------------|-----------|-----------|-----------|
## 
## 
CrossTable(finance, race, digits = 2,  chisq = T, fisher = T)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  432 
## 
##  
##              | race 
##      finance |     black |     other | Row Total | 
## -------------|-----------|-----------|-----------|
##           no |       185 |        31 |       216 | 
##              |      0.11 |      0.76 |           | 
##              |      0.86 |      0.14 |      0.50 | 
##              |      0.49 |      0.58 |           | 
##              |      0.43 |      0.07 |           | 
## -------------|-----------|-----------|-----------|
##          yes |       194 |        22 |       216 | 
##              |      0.11 |      0.76 |           | 
##              |      0.90 |      0.10 |      0.50 | 
##              |      0.51 |      0.42 |           | 
##              |      0.45 |      0.05 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |       379 |        53 |       432 | 
##              |      0.88 |      0.12 |           | 
## -------------|-----------|-----------|-----------|
## 
##  
## Statistics for All Table Factors
## 
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 =  1.742022     d.f. =  1     p =  0.1868828 
## 
## Pearson's Chi-squared test with Yates' continuity correction 
## ------------------------------------------------------------
## Chi^2 =  1.376413     d.f. =  1     p =  0.2407132 
## 
##  
## Fisher's Exact Test for Count Data
## ------------------------------------------------------------
## Sample estimate odds ratio:  0.6773696 
## 
## Alternative hypothesis: true odds ratio is not equal to 1
## p =  0.2405139 
## 95% confidence interval:  0.3594579 1.258022 
## 
## Alternative hypothesis: true odds ratio is less than 1
## p =  0.1202569 
## 95% confidence interval:  0 1.147456 
## 
## Alternative hypothesis: true odds ratio is greater than 1
## p =  0.9290199 
## 95% confidence interval:  0.3960554 Inf 
## 
## 
## 

3. Check normal distribution.

Based on visual inspection, statistical test, or regression.

# Using simulation

x = round(rnorm(1000, mean=10, sd=3), 2)
mean(x)
## [1] 10.00196
sd(x)
## [1] 2.967631
##
hist(x)

plot(density(x))

qqnorm(x)
qqline(x, col=2)

##
library(stats)
ks.test(x, "pnorm", mean = 10, sd = 3)
## Warning in ks.test(x, "pnorm", mean = 10, sd = 3): ties should not be present
## for the Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  x
## D = 0.020267, p-value = 0.806
## alternative hypothesis: two-sided
shapiro.test(x)
## 
##  Shapiro-Wilk normality test
## 
## data:  x
## W = 0.99859, p-value = 0.6115

4. Compare means (parametric test)

fmh <- read.csv("C:/Users/thien/OneDrive/Desktop/R thuc hanh/R learning/SUMS - R class data/Framingham dataset.csv")
attach(fmh)
## The following objects are masked from arr:
## 
##     age, educ, id
str(fmh)
## 'data.frame':    11627 obs. of  39 variables:
##  $ id          : int  2448 2448 6238 6238 6238 9428 9428 10552 10552 11252 ...
##  $ sex         : int  1 1 2 2 2 1 1 2 2 2 ...
##  $ tot.chol    : int  195 209 250 260 237 245 283 225 232 285 ...
##  $ age         : int  39 52 46 52 58 48 54 61 67 46 ...
##  $ sysbp       : num  106 121 121 105 108 ...
##  $ diasbp      : num  70 66 81 69.5 66 80 89 95 109 84 ...
##  $ smoker      : int  0 0 0 0 0 1 1 1 1 1 ...
##  $ cigs.day    : int  0 0 0 0 0 20 30 30 20 23 ...
##  $ bmi         : num  27 NA 28.7 29.4 28.5 ...
##  $ diabetes    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ bpmed       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ heart.rate  : int  80 69 95 80 80 75 75 65 60 85 ...
##  $ glucose     : int  77 92 76 86 71 70 87 103 89 85 ...
##  $ educ        : int  4 4 2 2 2 1 1 3 3 3 ...
##  $ prev.chd    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ prev.ap     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ prev.mi     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ prev.stroke : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ prev.hyp    : int  0 0 0 0 0 0 0 1 1 0 ...
##  $ time        : int  0 4628 0 2156 4344 0 2199 0 1977 0 ...
##  $ period      : int  1 3 1 2 3 1 2 1 2 1 ...
##  $ hdlc        : int  NA 31 NA NA 54 NA NA NA NA NA ...
##  $ ldlc        : int  NA 178 NA NA 141 NA NA NA NA NA ...
##  $ death       : int  0 0 0 0 0 0 0 1 1 0 ...
##  $ angina      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hosp.mi     : int  1 1 0 0 0 0 0 0 0 0 ...
##  $ mi.fchd     : int  1 1 0 0 0 0 0 0 0 0 ...
##  $ any.chd     : int  1 1 0 0 0 0 0 0 0 0 ...
##  $ stroke      : int  0 0 0 0 0 0 0 1 1 0 ...
##  $ cvd         : int  1 1 0 0 0 0 0 1 1 0 ...
##  $ hypertension: int  0 0 0 0 0 0 0 1 1 1 ...
##  $ time.ap     : int  8766 8766 8766 8766 8766 8766 8766 2956 2956 8766 ...
##  $ time.mi     : int  6438 6438 8766 8766 8766 8766 8766 2956 2956 8766 ...
##  $ time.mi.1   : int  6438 6438 8766 8766 8766 8766 8766 2956 2956 8766 ...
##  $ time.chd    : int  6438 6438 8766 8766 8766 8766 8766 2956 2956 8766 ...
##  $ time.stroke : int  8766 8766 8766 8766 8766 8766 8766 2089 2089 8766 ...
##  $ time.cvd    : int  6438 6438 8766 8766 8766 8766 8766 2089 2089 8766 ...
##  $ time.dth    : int  8766 8766 8766 8766 8766 8766 8766 2956 2956 8766 ...
##  $ time.hyp    : int  8766 8766 8766 8766 8766 8766 8766 0 0 4285 ...
library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
t.test(bmi~sex)
## 
##  Welch Two Sample t-test
## 
## data:  bmi by sex
## t = 7.7706, df = 11572, p-value = 8.471e-15
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
##  0.4300158 0.7201504
## sample estimates:
## mean in group 1 mean in group 2 
##        26.20382        25.62873
table1(~ bmi|sex, fmh)
## Warning in table1.formula(~bmi | sex, fmh): Terms to the right of '|' in formula
## 'x' define table columns and are expected to be factors with meaningful labels.
1
(N=5022)
2
(N=6605)
Overall
(N=11627)
bmi
Mean (SD) 26.2 (3.43) 25.6 (4.53) 25.9 (4.10)
Median [Min, Max] 26.1 [14.4, 45.4] 24.8 [14.5, 56.8] 25.5 [14.4, 56.8]
Missing 18 (0.4%) 34 (0.5%) 52 (0.4%)

5. Assignment

How to export data