#1. Summary Statistics
library(ACSWR)
data(yb)
summary(yb)
## Preparation_1 Preparation_2
## Min. : 7.00 Min. : 5.00
## 1st Qu.: 8.75 1st Qu.: 6.75
## Median :13.50 Median :10.50
## Mean :15.00 Mean :11.00
## 3rd Qu.:18.50 3rd Qu.:14.75
## Max. :31.00 Max. :18.00
# Or for 1st and 3rd quantiles
quantile(yb$Preparation_1,0.25)
## 25%
## 8.75
quantile(yb$Preparation_2, 0.75)
## 75%
## 14.75
# The percentiles
quantile(yb$Preparation_1, seq(0, 1, 0.1))
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
## 7.0 7.7 8.4 9.1 9.8 13.5 17.2 17.9 19.2 23.3 31.0
# Others
median(yb$Preparation_1)
## [1] 13.5
IQR(yb$Preparation_1)
## [1] 9.75
skewcoeff(yb$Preparation_1)
## [1] 0.8548652
kurtcoeff(yb$Preparation_1)
## [1] 2.727591
lapply(yb,summary)
## $Preparation_1
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.00 8.75 13.50 15.00 18.50 31.00
##
## $Preparation_2
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.00 6.75 10.50 11.00 14.75 18.00
fivenum(yb$Preparation_1)
## [1] 7.0 8.5 13.5 19.0 31.0
# calculate standard diviation and variance
sd(yb$Preparation_1)
## [1] 8.176622
sd(yb$Preparation_2)
## [1] 4.956958
var(yb$Preparation_1)
## [1] 66.85714
var(yb$Preparation_2)
## [1] 24.57143
# list the ranges of variables
range(yb$Preparation_1)
## [1] 7 31
range(yb$Preparation_2)
## [1] 5 18
#2. Correlation analysis
# Pearson correlation (parametric correlation test)(r), which measures a linear dependence between two variables (x and y).
# Kendall and Spearman, which are rank-based correlation coefficients
data1 <- mtcars
head(data1, 10)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
# Scatter plot
library(ggpubr)
## Loading required package: ggplot2
ggscatter(data1, x = "mpg", y = "wt",
add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.method = "pearson",
xlab = "Miles/(US) gallon", ylab = "Weight (1000 lbs)")
## `geom_smooth()` using formula = 'y ~ x'
# Shapiro-Wilk normality test for mpg, wt
shapiro.test(data1$mpg)
##
## Shapiro-Wilk normality test
##
## data: data1$mpg
## W = 0.94756, p-value = 0.1229
shapiro.test(data1$wt) # ? Null hypothesis
##
## Shapiro-Wilk normality test
##
## data: data1$wt
## W = 0.94326, p-value = 0.09265
# Q-Q plots (quantile-quantile plots)
par(mfrow=c(1,2))
ggqqplot(data1$mpg, ylab = "MPG") # --> Q-Q plot showing normality of variables
## Warning: The following aesthetics were dropped during statistical transformation: sample
## i This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## i Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: sample
## i This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## i Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
ggqqplot(data1$mpg, ylab = "MPG")
## Warning: The following aesthetics were dropped during statistical transformation: sample
## i This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## i Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: sample
## i This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## i Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
# cor(x, y, method = c("pearson", "kendall", "spearman"))
corr_coef <- cor.test(data1$wt, data1$mpg,
method = "pearson")
corr_coef <- cor.test(data1$wt, data1$mpg,
method = "kendall")
## Warning in cor.test.default(data1$wt, data1$mpg, method = "kendall"): Cannot
## compute exact p-value with ties
corr_coef
##
## Kendall's rank correlation tau
##
## data: data1$wt and data1$mpg
## z = -5.7981, p-value = 6.706e-09
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau
## -0.7278321
# How to interpret the correlation coefficient
# Correlation matrix
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
corr_matrix <- rcorr(as.matrix(data1))
corr_matrix
## mpg cyl disp hp drat wt qsec vs am gear carb
## mpg 1.00 -0.85 -0.85 -0.78 0.68 -0.87 0.42 0.66 0.60 0.48 -0.55
## cyl -0.85 1.00 0.90 0.83 -0.70 0.78 -0.59 -0.81 -0.52 -0.49 0.53
## disp -0.85 0.90 1.00 0.79 -0.71 0.89 -0.43 -0.71 -0.59 -0.56 0.39
## hp -0.78 0.83 0.79 1.00 -0.45 0.66 -0.71 -0.72 -0.24 -0.13 0.75
## drat 0.68 -0.70 -0.71 -0.45 1.00 -0.71 0.09 0.44 0.71 0.70 -0.09
## wt -0.87 0.78 0.89 0.66 -0.71 1.00 -0.17 -0.55 -0.69 -0.58 0.43
## qsec 0.42 -0.59 -0.43 -0.71 0.09 -0.17 1.00 0.74 -0.23 -0.21 -0.66
## vs 0.66 -0.81 -0.71 -0.72 0.44 -0.55 0.74 1.00 0.17 0.21 -0.57
## am 0.60 -0.52 -0.59 -0.24 0.71 -0.69 -0.23 0.17 1.00 0.79 0.06
## gear 0.48 -0.49 -0.56 -0.13 0.70 -0.58 -0.21 0.21 0.79 1.00 0.27
## carb -0.55 0.53 0.39 0.75 -0.09 0.43 -0.66 -0.57 0.06 0.27 1.00
##
## n= 32
##
##
## P
## mpg cyl disp hp drat wt qsec vs am gear
## mpg 0.0000 0.0000 0.0000 0.0000 0.0000 0.0171 0.0000 0.0003 0.0054
## cyl 0.0000 0.0000 0.0000 0.0000 0.0000 0.0004 0.0000 0.0022 0.0042
## disp 0.0000 0.0000 0.0000 0.0000 0.0000 0.0131 0.0000 0.0004 0.0010
## hp 0.0000 0.0000 0.0000 0.0100 0.0000 0.0000 0.0000 0.1798 0.4930
## drat 0.0000 0.0000 0.0000 0.0100 0.0000 0.6196 0.0117 0.0000 0.0000
## wt 0.0000 0.0000 0.0000 0.0000 0.0000 0.3389 0.0010 0.0000 0.0005
## qsec 0.0171 0.0004 0.0131 0.0000 0.6196 0.3389 0.0000 0.2057 0.2425
## vs 0.0000 0.0000 0.0000 0.0000 0.0117 0.0010 0.0000 0.3570 0.2579
## am 0.0003 0.0022 0.0004 0.1798 0.0000 0.0000 0.2057 0.3570 0.0000
## gear 0.0054 0.0042 0.0010 0.4930 0.0000 0.0005 0.2425 0.2579 0.0000
## carb 0.0011 0.0019 0.0253 0.0000 0.6212 0.0146 0.0000 0.0007 0.7545 0.1290
## carb
## mpg 0.0011
## cyl 0.0019
## disp 0.0253
## hp 0.0000
## drat 0.6212
## wt 0.0146
## qsec 0.0000
## vs 0.0007
## am 0.7545
## gear 0.1290
## carb
library(corrplot)
## corrplot 0.92 loaded
corrplot(cor(data.matrix(data1)))
library(PerformanceAnalytics)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
data1 <- mtcars[, c(1,3,4,5,6,7)]
chart.Correlation(data1, histogram=TRUE, pch=19)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
ggpairs(data1)