Install all R packages required for the correlation tests and visualization, then check that all packages load successfully.
library(ggpubr)
library(Hmisc)
library(corrplot)
library(ggplot2)
Correlation estimates the association between two or more variables. In the following example, we would like to know whether there is any relationship between weight and height in women.
# Load the weight/height measurements and show a per-column summary.
df <- read.csv(file = "weight.csv", header = TRUE)
summary(df)
## weight height
## Min. : 3.00 Min. :40.00
## 1st Qu.: 4.25 1st Qu.:50.00
## Median : 7.50 Median :64.00
## Mean : 7.30 Mean :59.36
## 3rd Qu.: 9.75 3rd Qu.:68.25
## Max. :12.00 Max. :73.00
###Shapiro test We apply the Shapiro-Wilk test to check the normality of our data. Null hypothesis: the data are normally distributed. Alternative hypothesis: the data are not normally distributed. If the p-value of the Shapiro test is lower than 0.05, the null hypothesis is rejected and the data are considered not normal. If the p-value of the Shapiro test is higher than 0.05, there is not enough evidence to reject the null hypothesis, so we fail to reject it and treat the data as approximately normal.
# Shapiro-Wilk normality test on the weight column; the stored result is
# printed explicitly.
test_p <- shapiro.test(df$weight)
print(test_p)
##
## Shapiro-Wilk normality test
##
## data: df$weight
## W = 0.91096, p-value = 0.1627
# Shapiro-Wilk normality test on the height column; the stored result is
# printed explicitly.
test_p1 <- shapiro.test(df$height)
print(test_p1)
##
## Shapiro-Wilk normality test
##
## data: df$height
## W = 0.90987, p-value = 0.1569
# Scatter plot of weight against height with a fitted regression line, its
# confidence band, and the Pearson correlation annotated on the plot.
ggscatter(
  df,
  x = "height",
  y = "weight",
  add = "reg.line",
  conf.int = TRUE,
  add.params = list(color = "red", fill = "lightgray")
) +
  stat_cor(method = "pearson")
### Correlation and Regression In the following example we will examine
the variables that contribute to people's happiness using correlation and
regression tests.
# Load the happiness data set and show a per-column summary.
mydata <- read.csv(file = "happy1.csv", header = TRUE)
summary(mydata)
## Hap.score GDP le Fr
## Min. :2.910 Min. :0.0700 Min. :0.1600 Min. :0.0400
## 1st Qu.:4.570 1st Qu.:0.6525 1st Qu.:0.5150 1st Qu.:0.2375
## Median :5.615 Median :1.0150 Median :0.6050 Median :0.4200
## Mean :5.532 Mean :0.9780 Mean :0.5955 Mean :0.3850
## 3rd Qu.:6.545 3rd Qu.:1.3950 3rd Qu.:0.8000 3rd Qu.:0.5275
## Max. :7.530 Max. :1.7000 Max. :0.9500 Max. :0.6100
## Trust Generosity corruption
## Min. :0.0200 Min. :0.0500 Min. :0.0200
## 1st Qu.:0.0575 1st Qu.:0.1950 1st Qu.:0.0575
## Median :0.1250 Median :0.2850 Median :0.1250
## Mean :0.1770 Mean :0.2650 Mean :0.1770
## 3rd Qu.:0.2975 3rd Qu.:0.3425 3rd Qu.:0.2975
## Max. :0.4700 Max. :0.4800 Max. :0.4700
Shapiro test for checking normality
# Shapiro-Wilk normality test on the happiness score; the stored result is
# printed explicitly.
sh1 <- shapiro.test(mydata$Hap.score)
print(sh1)
##
## Shapiro-Wilk normality test
##
## data: mydata$Hap.score
## W = 0.96793, p-value = 0.7107
# Pairwise Pearson correlation matrix over all columns of mydata, drawn with
# the coefficients shown as numbers in a two-colour scheme.
v <- cor(mydata, method = "pearson")
corrplot(
  v,
  method = "number",
  col = c("darkblue", "orange")
)
Showing the correlations as squares
# Same correlation matrix, drawn as squares in the two-colour scheme.
corrplot(
  v,
  method = "square",
  col = c("darkblue", "orange")
)
Diverging colors for plotting
# Embed the image file illustrating the diverging colour palettes.
knitr::include_graphics(path = "diverging5.svg")
Sequential colors for
plotting
# Embed the image file illustrating the sequential colour palettes.
knitr::include_graphics(path = "seq1.svg")
###Plotting-Sequential color
##Sequential color
# Lower triangle of the correlation matrix as circles, coloured with the
# "Reds" palette from COL1().
corrplot(
  v,
  method = "circle",
  type = "lower",
  col = COL1("Reds")
)
###Plotting-Diverging color
# Upper triangle of the correlation matrix as circles, coloured with the
# "RdBu" palette from COL2().
corrplot(
  v,
  method = "circle",
  type = "upper",
  col = COL2("RdBu")
)
###Correlation with P-value in another way
library("Hmisc")
# Keep the first seven columns of mydata and summarise them.
new_data <- mydata[, 1:7]
summary(new_data)
## Hap.score GDP le Fr
## Min. :2.910 Min. :0.0700 Min. :0.1600 Min. :0.0400
## 1st Qu.:4.570 1st Qu.:0.6525 1st Qu.:0.5150 1st Qu.:0.2375
## Median :5.615 Median :1.0150 Median :0.6050 Median :0.4200
## Mean :5.532 Mean :0.9780 Mean :0.5955 Mean :0.3850
## 3rd Qu.:6.545 3rd Qu.:1.3950 3rd Qu.:0.8000 3rd Qu.:0.5275
## Max. :7.530 Max. :1.7000 Max. :0.9500 Max. :0.6100
## Trust Generosity corruption
## Min. :0.0200 Min. :0.0500 Min. :0.0200
## 1st Qu.:0.0575 1st Qu.:0.1950 1st Qu.:0.0575
## Median :0.1250 Median :0.2850 Median :0.1250
## Mean :0.1770 Mean :0.2650 Mean :0.1770
## 3rd Qu.:0.2975 3rd Qu.:0.3425 3rd Qu.:0.2975
## Max. :0.4700 Max. :0.4800 Max. :0.4700
###Correlation test
# Pearson correlations with p-values via Hmisc::rcorr(), which expects a
# matrix; printing shows the coefficient matrix, n, and the P matrix.
result <- rcorr(
  as.matrix(mydata),
  type = "pearson"
)
print(result)
## Hap.score GDP le Fr Trust Generosity corruption
## Hap.score 1.00 0.86 0.85 0.83 0.70 0.05 -0.74
## GDP 0.86 1.00 0.95 0.69 0.64 -0.04 -0.60
## le 0.85 0.95 1.00 0.71 0.56 -0.18 -0.72
## Fr 0.83 0.69 0.71 1.00 0.65 -0.06 -0.58
## Trust 0.70 0.64 0.56 0.65 1.00 0.31 -0.20
## Generosity 0.05 -0.04 -0.18 -0.06 0.31 1.00 0.07
## corruption -0.74 -0.60 -0.72 -0.58 -0.20 0.07 1.00
##
## n= 20
##
##
## P
## Hap.score GDP le Fr Trust Generosity corruption
## Hap.score 0.0000 0.0000 0.0000 0.0006 0.8376 0.0002
## GDP 0.0000 0.0000 0.0007 0.0025 0.8677 0.0053
## le 0.0000 0.0000 0.0005 0.0108 0.4461 0.0004
## Fr 0.0000 0.0007 0.0005 0.0020 0.8066 0.0071
## Trust 0.0006 0.0025 0.0108 0.0020 0.1890 0.4099
## Generosity 0.8376 0.8677 0.4461 0.8066 0.1890 0.7838
## corruption 0.0002 0.0053 0.0004 0.0071 0.4099 0.7838
###get correlation and P-value
# Extract the correlation-coefficient matrix from the rcorr result.
rr <- result[["r"]]
print(rr)
## Hap.score GDP le Fr Trust Generosity
## Hap.score 1.00000000 0.85560449 0.8503420 0.8312359 0.6986812 0.04894782
## GDP 0.85560449 1.00000000 0.9540119 0.6910128 0.6373026 -0.03979261
## le 0.85034198 0.95401192 1.0000000 0.7074102 0.5568864 -0.18060365
## Fr 0.83123589 0.69101280 0.7074102 1.0000000 0.6491447 -0.05846570
## Trust 0.69868116 0.63730258 0.5568864 0.6491447 1.0000000 0.30635055
## Generosity 0.04894782 -0.03979261 -0.1806036 -0.0584657 0.3063505 1.00000000
## corruption -0.74219994 -0.59814899 -0.7175692 -0.5821310 -0.1950426 0.06551655
## corruption
## Hap.score -0.74219994
## GDP -0.59814899
## le -0.71756924
## Fr -0.58213102
## Trust -0.19504265
## Generosity 0.06551655
## corruption 1.00000000
# Extract the p-value matrix from the rcorr result (its diagonal is NA).
pb <- result[["P"]]
print(pb)
## Hap.score GDP le Fr Trust
## Hap.score NA 1.516032e-06 2.050515e-06 5.616976e-06 0.0006105033
## GDP 1.516032e-06 NA 7.391843e-11 7.416727e-04 0.0025078536
## le 2.050515e-06 7.391843e-11 NA 4.856519e-04 0.0107566190
## Fr 5.616976e-06 7.416727e-04 4.856519e-04 NA 0.0019553515
## Trust 6.105033e-04 2.507854e-03 1.075662e-02 1.955351e-03 NA
## Generosity 8.376280e-01 8.677126e-01 4.460740e-01 8.065805e-01 0.1889543496
## corruption 1.790063e-04 5.339296e-03 3.683063e-04 7.082950e-03 0.4099102573
## Generosity corruption
## Hap.score 0.8376280 0.0001790063
## GDP 0.8677126 0.0053392965
## le 0.4460740 0.0003683063
## Fr 0.8065805 0.0070829504
## Trust 0.1889543 0.4099102573
## Generosity NA 0.7837561181
## corruption 0.7837561 NA
# Lower-triangle correlation plots that blank out coefficients whose p-value
# exceeds 0.05. The rcorr result stores its p-value matrix in the element
# named `P` (upper case). `$` matching is case-sensitive, so `result$p` is
# NULL and no significance filtering would be applied with it.
corrplot(result$r, type = "lower", order = "original", method = "number",
         p.mat = result$P, sig.level = 0.05, insig = "blank",
         col = COL2("PiYG"))
# Same filtering, shown as circles with the variables reordered by
# hierarchical clustering and black axis labels.
corrplot(result$r, type = "lower", order = "hclust", method = "circle",
         p.mat = result$P, sig.level = 0.05, insig = "blank",
         col = COL2("PuOr"), tl.col = "black", tl.cex = 1)
### Regression test
# Multiple linear regression of the happiness score on GDP, le, Fr, Trust and
# corruption, followed by the model summary.
fit <- lm(
  formula = Hap.score ~ GDP + le + Fr + Trust + corruption,
  data = mydata
)
summary(fit)
##
## Call:
## lm(formula = Hap.score ~ GDP + le + Fr + Trust + corruption,
## data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.64179 -0.22753 -0.08158 0.29377 0.72012
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.7308 0.6741 7.018 6.08e-06 ***
## GDP 1.5835 0.8176 1.937 0.07323 .
## le -1.9289 1.8369 -1.050 0.31146
## Fr 1.7240 0.9534 1.808 0.09208 .
## Trust 2.6944 1.0467 2.574 0.02206 *
## corruption -4.1808 1.1338 -3.687 0.00244 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.43 on 14 degrees of freedom
## Multiple R-squared: 0.9259, Adjusted R-squared: 0.8995
## F-statistic: 35 on 5 and 14 DF, p-value: 1.969e-07