Descriptive Statistics - Learning R

#1. Summary Statistics

library(ACSWR)
data(yb)
summary(yb)

##  Preparation_1   Preparation_2  
##  Min.   : 7.00   Min.   : 5.00  
##  1st Qu.: 8.75   1st Qu.: 6.75  
##  Median :13.50   Median :10.50  
##  Mean   :15.00   Mean   :11.00  
##  3rd Qu.:18.50   3rd Qu.:14.75  
##  Max.   :31.00   Max.   :18.00

# Or for 1st and 3rd quantiles
quantile(yb$Preparation_1,0.25)

##  25% 
## 8.75

quantile(yb$Preparation_2, 0.75)

##   75% 
## 14.75

# The percentiles

quantile(yb$Preparation_1, seq(0, 1, 0.1))

##   0%  10%  20%  30%  40%  50%  60%  70%  80%  90% 100% 
##  7.0  7.7  8.4  9.1  9.8 13.5 17.2 17.9 19.2 23.3 31.0

# Others
median(yb$Preparation_1)

## [1] 13.5

IQR(yb$Preparation_1)

## [1] 9.75

skewcoeff(yb$Preparation_1)

## [1] 0.8548652

kurtcoeff(yb$Preparation_1)

## [1] 2.727591

lapply(yb,summary)

## $Preparation_1
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    7.00    8.75   13.50   15.00   18.50   31.00 
## 
## $Preparation_2
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5.00    6.75   10.50   11.00   14.75   18.00

fivenum(yb$Preparation_1)

## [1]  7.0  8.5 13.5 19.0 31.0

# calculate standard diviation and variance

sd(yb$Preparation_1)

## [1] 8.176622

sd(yb$Preparation_2)

## [1] 4.956958

var(yb$Preparation_1)

## [1] 66.85714

var(yb$Preparation_2)

## [1] 24.57143

# list the ranges of variables
range(yb$Preparation_1)

## [1]  7 31

range(yb$Preparation_2)

## [1]  5 18

#2. Correlation analysis

# Pearson correlation (parametric correlation test)(r), which measures a linear dependence between two variables (x and y).
# Kendall and Spearman, which are rank-based correlation coefficients

data1 <- mtcars
head(data1, 10)

##                    mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## Duster 360        14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## Merc 240D         24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## Merc 230          22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Merc 280          19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4

# Scatter plot
library(ggpubr)

## Loading required package: ggplot2

ggscatter(data1, x = "mpg", y = "wt", 
          add = "reg.line", conf.int = TRUE, 
          cor.coef = TRUE, cor.method = "pearson",
          xlab = "Miles/(US) gallon", ylab = "Weight (1000 lbs)")

## `geom_smooth()` using formula = 'y ~ x'

# Shapiro-Wilk normality test for mpg, wt
shapiro.test(data1$mpg)

## 
##  Shapiro-Wilk normality test
## 
## data:  data1$mpg
## W = 0.94756, p-value = 0.1229

shapiro.test(data1$wt) # ? Null hypothesis

## 
##  Shapiro-Wilk normality test
## 
## data:  data1$wt
## W = 0.94326, p-value = 0.09265

# Q-Q plots (quantile-quantile plots)
par(mfrow=c(1,2))
ggqqplot(data1$mpg, ylab = "MPG") # --> Q-Q plot showing normality of variables

## Warning: The following aesthetics were dropped during statistical transformation: sample
## i This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## i Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation: sample
## i This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## i Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

ggqqplot(data1$mpg, ylab = "MPG")

## Warning: The following aesthetics were dropped during statistical transformation: sample
## i This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## i Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation: sample
## i This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## i Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

# cor(x, y, method = c("pearson", "kendall", "spearman"))
corr_coef <- cor.test(data1$wt, data1$mpg, 
                method = "pearson")

corr_coef <- cor.test(data1$wt, data1$mpg, 
                method = "kendall")

## Warning in cor.test.default(data1$wt, data1$mpg, method = "kendall"): Cannot
## compute exact p-value with ties

corr_coef

## 
##  Kendall's rank correlation tau
## 
## data:  data1$wt and data1$mpg
## z = -5.7981, p-value = 6.706e-09
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
##        tau 
## -0.7278321

# How to interpret the correlation coefficient 

# Correlation matrix
library(Hmisc)

## Loading required package: lattice

## Loading required package: survival

## Loading required package: Formula

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:base':
## 
##     format.pval, units

corr_matrix <- rcorr(as.matrix(data1))
corr_matrix

##        mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
## mpg   1.00 -0.85 -0.85 -0.78  0.68 -0.87  0.42  0.66  0.60  0.48 -0.55
## cyl  -0.85  1.00  0.90  0.83 -0.70  0.78 -0.59 -0.81 -0.52 -0.49  0.53
## disp -0.85  0.90  1.00  0.79 -0.71  0.89 -0.43 -0.71 -0.59 -0.56  0.39
## hp   -0.78  0.83  0.79  1.00 -0.45  0.66 -0.71 -0.72 -0.24 -0.13  0.75
## drat  0.68 -0.70 -0.71 -0.45  1.00 -0.71  0.09  0.44  0.71  0.70 -0.09
## wt   -0.87  0.78  0.89  0.66 -0.71  1.00 -0.17 -0.55 -0.69 -0.58  0.43
## qsec  0.42 -0.59 -0.43 -0.71  0.09 -0.17  1.00  0.74 -0.23 -0.21 -0.66
## vs    0.66 -0.81 -0.71 -0.72  0.44 -0.55  0.74  1.00  0.17  0.21 -0.57
## am    0.60 -0.52 -0.59 -0.24  0.71 -0.69 -0.23  0.17  1.00  0.79  0.06
## gear  0.48 -0.49 -0.56 -0.13  0.70 -0.58 -0.21  0.21  0.79  1.00  0.27
## carb -0.55  0.53  0.39  0.75 -0.09  0.43 -0.66 -0.57  0.06  0.27  1.00
## 
## n= 32 
## 
## 
## P
##      mpg    cyl    disp   hp     drat   wt     qsec   vs     am     gear  
## mpg         0.0000 0.0000 0.0000 0.0000 0.0000 0.0171 0.0000 0.0003 0.0054
## cyl  0.0000        0.0000 0.0000 0.0000 0.0000 0.0004 0.0000 0.0022 0.0042
## disp 0.0000 0.0000        0.0000 0.0000 0.0000 0.0131 0.0000 0.0004 0.0010
## hp   0.0000 0.0000 0.0000        0.0100 0.0000 0.0000 0.0000 0.1798 0.4930
## drat 0.0000 0.0000 0.0000 0.0100        0.0000 0.6196 0.0117 0.0000 0.0000
## wt   0.0000 0.0000 0.0000 0.0000 0.0000        0.3389 0.0010 0.0000 0.0005
## qsec 0.0171 0.0004 0.0131 0.0000 0.6196 0.3389        0.0000 0.2057 0.2425
## vs   0.0000 0.0000 0.0000 0.0000 0.0117 0.0010 0.0000        0.3570 0.2579
## am   0.0003 0.0022 0.0004 0.1798 0.0000 0.0000 0.2057 0.3570        0.0000
## gear 0.0054 0.0042 0.0010 0.4930 0.0000 0.0005 0.2425 0.2579 0.0000       
## carb 0.0011 0.0019 0.0253 0.0000 0.6212 0.0146 0.0000 0.0007 0.7545 0.1290
##      carb  
## mpg  0.0011
## cyl  0.0019
## disp 0.0253
## hp   0.0000
## drat 0.6212
## wt   0.0146
## qsec 0.0000
## vs   0.0007
## am   0.7545
## gear 0.1290
## carb

library(corrplot)

## corrplot 0.92 loaded

corrplot(cor(data.matrix(data1)))

library(PerformanceAnalytics)

## Loading required package: xts

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## 
## Attaching package: 'PerformanceAnalytics'

## The following object is masked from 'package:graphics':
## 
##     legend

data1 <- mtcars[, c(1,3,4,5,6,7)]
chart.Correlation(data1, histogram=TRUE, pch=19)

library(GGally)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

ggpairs(data1)

Descriptive Statistics - Learning R - 01

Thien

2023-02-02