1. How to import the dataset

#th=file.choose()
arr = read.csv("C:/Users/thien/OneDrive/Desktop/R thuc hanh/R learning/SUMS - R class data/Arrest dataset.csv")
dim(arr)

## [1] 432  12

# f1 = fix(arr)

2. Describe variables

library(psych)
# Describe numerical variables
describe(arr)

##           vars   n   mean     sd median trimmed    mad min max range  skew
## id           1 432 216.50 124.85  216.5  216.50 160.12   1 432   431  0.00
## age          2 432  24.60   6.11   23.0   23.58   4.45  17  44    27  1.38
## finance*     3 432   1.50   0.50    1.5    1.50   0.74   1   2     1  0.00
## week         4 432  45.85  12.66   52.0   49.15   0.00   1  52    51 -1.98
## arrest       5 432   0.26   0.44    0.0    0.21   0.00   0   1     1  1.07
## race*        6 432   1.12   0.33    1.0    1.03   0.00   1   2     1  2.29
## work.exp*    7 432   1.57   0.50    2.0    1.59   0.00   1   2     1 -0.29
## married*     8 432   1.88   0.33    2.0    1.97   0.00   1   2     1 -2.29
## parole*      9 432   1.62   0.49    2.0    1.65   0.00   1   2     1 -0.48
## prior       10 432   2.98   2.90    2.0    2.46   1.48   0  18    18  2.07
## educ        11 432   3.48   0.83    3.0    3.38   0.00   2   6     4  0.91
## employ1*    12 432   1.14   0.35    1.0    1.05   0.00   1   2     1  2.08
##           kurtosis   se
## id           -1.21 6.01
## age           1.32 0.29
## finance*     -2.00 0.02
## week          2.62 0.61
## arrest       -0.86 0.02
## race*         3.26 0.02
## work.exp*    -1.92 0.02
## married*      3.26 0.02
## parole*      -1.77 0.02
## prior         5.21 0.14
## educ          0.79 0.04
## employ1*      2.34 0.02

describe(arr, ranges = F)

##           vars   n   mean     sd  skew kurtosis   se
## id           1 432 216.50 124.85  0.00    -1.21 6.01
## age          2 432  24.60   6.11  1.38     1.32 0.29
## finance*     3 432   1.50   0.50  0.00    -2.00 0.02
## week         4 432  45.85  12.66 -1.98     2.62 0.61
## arrest       5 432   0.26   0.44  1.07    -0.86 0.02
## race*        6 432   1.12   0.33  2.29     3.26 0.02
## work.exp*    7 432   1.57   0.50 -0.29    -1.92 0.02
## married*     8 432   1.88   0.33 -2.29     3.26 0.02
## parole*      9 432   1.62   0.49 -0.48    -1.77 0.02
## prior       10 432   2.98   2.90  2.07     5.21 0.14
## educ        11 432   3.48   0.83  0.91     0.79 0.04
## employ1*    12 432   1.14   0.35  2.08     2.34 0.02

# Describe numerical variables by categorical variable
describeBy(arr, arr$finance, range = F)

## 
##  Descriptive statistics by group 
## group: no
##           vars   n   mean     sd  skew kurtosis   se
## id           1 216 212.22 128.46  0.00    -1.27 8.74
## age          2 216  24.22   5.73  1.39     1.54 0.39
## finance*     3 216   1.00   0.00   NaN      NaN 0.00
## week         4 216  44.83  13.52 -1.75     1.77 0.92
## arrest       5 216   0.31   0.46  0.84    -1.30 0.03
## race*        6 216   1.14   0.35  2.02     2.09 0.02
## work.exp*    7 216   1.57   0.50 -0.28    -1.93 0.03
## married*     8 216   1.87   0.34 -2.13     2.55 0.02
## parole*      9 216   1.62   0.49 -0.51    -1.75 0.03
## prior       10 216   2.99   2.92  2.31     7.05 0.20
## educ        11 216   3.44   0.84  0.76     0.48 0.06
## employ1*    12 216   1.16   0.37  1.87     1.50 0.02
## ------------------------------------------------------------ 
## group: yes
##           vars   n   mean     sd  skew kurtosis   se
## id           1 216 220.78 121.28  0.01    -1.16 8.25
## age          2 216  24.97   6.47  1.34     0.99 0.44
## finance*     3 216   1.00   0.00   NaN      NaN 0.00
## week         4 216  46.88  11.69 -2.24     3.70 0.80
## arrest       5 216   0.22   0.42  1.33    -0.24 0.03
## race*        6 216   1.10   0.30  2.61     4.86 0.02
## work.exp*    7 216   1.57   0.50 -0.30    -1.92 0.03
## married*     8 216   1.89   0.31 -2.46     4.06 0.02
## parole*      9 216   1.61   0.49 -0.45    -1.80 0.03
## prior       10 216   2.98   2.88  1.79     3.18 0.20
## educ        11 216   3.52   0.82  1.08     1.04 0.06
## employ1*    12 216   1.12   0.33  2.32     3.39 0.02

# Describe categorical variables
library(gmodels)
attach(arr)
CrossTable(finance, digits = 3)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  432 
## 
##  
##           |        no |       yes | 
##           |-----------|-----------|
##           |       216 |       216 | 
##           |     0.500 |     0.500 | 
##           |-----------|-----------|
## 
## 
## 
##

CrossTable(finance, race, digits = 3)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  432 
## 
##  
##              | race 
##      finance |     black |     other | Row Total | 
## -------------|-----------|-----------|-----------|
##           no |       185 |        31 |       216 | 
##              |     0.107 |     0.764 |           | 
##              |     0.856 |     0.144 |     0.500 | 
##              |     0.488 |     0.585 |           | 
##              |     0.428 |     0.072 |           | 
## -------------|-----------|-----------|-----------|
##          yes |       194 |        22 |       216 | 
##              |     0.107 |     0.764 |           | 
##              |     0.898 |     0.102 |     0.500 | 
##              |     0.512 |     0.415 |           | 
##              |     0.449 |     0.051 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |       379 |        53 |       432 | 
##              |     0.877 |     0.123 |           | 
## -------------|-----------|-----------|-----------|
## 
##

CrossTable(finance, race, digits = 2,  chisq = T, fisher = T)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  432 
## 
##  
##              | race 
##      finance |     black |     other | Row Total | 
## -------------|-----------|-----------|-----------|
##           no |       185 |        31 |       216 | 
##              |      0.11 |      0.76 |           | 
##              |      0.86 |      0.14 |      0.50 | 
##              |      0.49 |      0.58 |           | 
##              |      0.43 |      0.07 |           | 
## -------------|-----------|-----------|-----------|
##          yes |       194 |        22 |       216 | 
##              |      0.11 |      0.76 |           | 
##              |      0.90 |      0.10 |      0.50 | 
##              |      0.51 |      0.42 |           | 
##              |      0.45 |      0.05 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |       379 |        53 |       432 | 
##              |      0.88 |      0.12 |           | 
## -------------|-----------|-----------|-----------|
## 
##  
## Statistics for All Table Factors
## 
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 =  1.742022     d.f. =  1     p =  0.1868828 
## 
## Pearson's Chi-squared test with Yates' continuity correction 
## ------------------------------------------------------------
## Chi^2 =  1.376413     d.f. =  1     p =  0.2407132 
## 
##  
## Fisher's Exact Test for Count Data
## ------------------------------------------------------------
## Sample estimate odds ratio:  0.6773696 
## 
## Alternative hypothesis: true odds ratio is not equal to 1
## p =  0.2405139 
## 95% confidence interval:  0.3594579 1.258022 
## 
## Alternative hypothesis: true odds ratio is less than 1
## p =  0.1202569 
## 95% confidence interval:  0 1.147456 
## 
## Alternative hypothesis: true odds ratio is greater than 1
## p =  0.9290199 
## 95% confidence interval:  0.3960554 Inf 
## 
## 
##

3. Check normal distribution.

Based on visual inspection, statistical test, or regression.

# Using simulation

x = round(rnorm(1000, mean=10, sd=3), 2)
mean(x)

## [1] 10.00196

sd(x)

## [1] 2.967631

##
hist(x)

plot(density(x))

qqnorm(x)
qqline(x, col=2)

##
library(stats)
ks.test(x, "pnorm", mean = 10, sd = 3)

## Warning in ks.test(x, "pnorm", mean = 10, sd = 3): ties should not be present
## for the Kolmogorov-Smirnov test

## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  x
## D = 0.020267, p-value = 0.806
## alternative hypothesis: two-sided

shapiro.test(x)

## 
##  Shapiro-Wilk normality test
## 
## data:  x
## W = 0.99859, p-value = 0.6115

4. Compare means (parametric test)

fmh <- read.csv("C:/Users/thien/OneDrive/Desktop/R thuc hanh/R learning/SUMS - R class data/Framingham dataset.csv")
attach(fmh)

## The following objects are masked from arr:
## 
##     age, educ, id

str(fmh)

## 'data.frame':    11627 obs. of  39 variables:
##  $ id          : int  2448 2448 6238 6238 6238 9428 9428 10552 10552 11252 ...
##  $ sex         : int  1 1 2 2 2 1 1 2 2 2 ...
##  $ tot.chol    : int  195 209 250 260 237 245 283 225 232 285 ...
##  $ age         : int  39 52 46 52 58 48 54 61 67 46 ...
##  $ sysbp       : num  106 121 121 105 108 ...
##  $ diasbp      : num  70 66 81 69.5 66 80 89 95 109 84 ...
##  $ smoker      : int  0 0 0 0 0 1 1 1 1 1 ...
##  $ cigs.day    : int  0 0 0 0 0 20 30 30 20 23 ...
##  $ bmi         : num  27 NA 28.7 29.4 28.5 ...
##  $ diabetes    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ bpmed       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ heart.rate  : int  80 69 95 80 80 75 75 65 60 85 ...
##  $ glucose     : int  77 92 76 86 71 70 87 103 89 85 ...
##  $ educ        : int  4 4 2 2 2 1 1 3 3 3 ...
##  $ prev.chd    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ prev.ap     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ prev.mi     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ prev.stroke : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ prev.hyp    : int  0 0 0 0 0 0 0 1 1 0 ...
##  $ time        : int  0 4628 0 2156 4344 0 2199 0 1977 0 ...
##  $ period      : int  1 3 1 2 3 1 2 1 2 1 ...
##  $ hdlc        : int  NA 31 NA NA 54 NA NA NA NA NA ...
##  $ ldlc        : int  NA 178 NA NA 141 NA NA NA NA NA ...
##  $ death       : int  0 0 0 0 0 0 0 1 1 0 ...
##  $ angina      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hosp.mi     : int  1 1 0 0 0 0 0 0 0 0 ...
##  $ mi.fchd     : int  1 1 0 0 0 0 0 0 0 0 ...
##  $ any.chd     : int  1 1 0 0 0 0 0 0 0 0 ...
##  $ stroke      : int  0 0 0 0 0 0 0 1 1 0 ...
##  $ cvd         : int  1 1 0 0 0 0 0 1 1 0 ...
##  $ hypertension: int  0 0 0 0 0 0 0 1 1 1 ...
##  $ time.ap     : int  8766 8766 8766 8766 8766 8766 8766 2956 2956 8766 ...
##  $ time.mi     : int  6438 6438 8766 8766 8766 8766 8766 2956 2956 8766 ...
##  $ time.mi.1   : int  6438 6438 8766 8766 8766 8766 8766 2956 2956 8766 ...
##  $ time.chd    : int  6438 6438 8766 8766 8766 8766 8766 2956 2956 8766 ...
##  $ time.stroke : int  8766 8766 8766 8766 8766 8766 8766 2089 2089 8766 ...
##  $ time.cvd    : int  6438 6438 8766 8766 8766 8766 8766 2089 2089 8766 ...
##  $ time.dth    : int  8766 8766 8766 8766 8766 8766 8766 2956 2956 8766 ...
##  $ time.hyp    : int  8766 8766 8766 8766 8766 8766 8766 0 0 4285 ...

library(table1)

## 
## Attaching package: 'table1'

## The following objects are masked from 'package:base':
## 
##     units, units<-

t.test(bmi~sex)

## 
##  Welch Two Sample t-test
## 
## data:  bmi by sex
## t = 7.7706, df = 11572, p-value = 8.471e-15
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
##  0.4300158 0.7201504
## sample estimates:
## mean in group 1 mean in group 2 
##        26.20382        25.62873

table1(~ bmi|sex, fmh)

## Warning in table1.formula(~bmi | sex, fmh): Terms to the right of '|' in formula
## 'x' define table columns and are expected to be factors with meaningful labels.

	1 (N=5022)	2 (N=6605)	Overall (N=11627)
bmi
Mean (SD)	26.2 (3.43)	25.6 (4.53)	25.9 (4.10)
Median [Min, Max]	26.1 [14.4, 45.4]	24.8 [14.5, 56.8]	25.5 [14.4, 56.8]
Missing	18 (0.4%)	34 (0.5%)	52 (0.4%)

SUMS 02 - Import, describe variable, normal distribution, parametric test

Thien

2023-02-08

1. How to import the dataset

2. Describe variables

3. Check normal distribution.

Based on visual inspection, statistical test, or regression.

4. Compare means (parametric test)

5. Assignment

How to export data