Correlation and Regression

Install all R packages required for the correlation tests and visualization, then check that all of them load successfully.

library(ggpubr)
library(Hmisc)
library(corrplot)
library(ggplot2)

Correlation estimates the association between two or more variables. In the following example, we would like to know whether there is any relationship between weight and height in women.

Import csv file

# Load the weight/height data set (first row holds the column names)
# and show the per-column summary statistics.
df <- read.csv(file = "weight.csv", header = TRUE)
summary(object = df)

##      weight          height     
##  Min.   : 3.00   Min.   :40.00  
##  1st Qu.: 4.25   1st Qu.:50.00  
##  Median : 7.50   Median :64.00  
##  Mean   : 7.30   Mean   :59.36  
##  3rd Qu.: 9.75   3rd Qu.:68.25  
##  Max.   :12.00   Max.   :73.00

### Shapiro test

We apply the Shapiro-Wilk test to check the normality of our data. Null hypothesis: the data are normally distributed. Alternative hypothesis: the data are not normally distributed. If the p-value of the Shapiro test is lower than 0.05, the null hypothesis is rejected and we conclude that the data are not normal. If the p-value is higher than 0.05, there is not enough evidence to reject the null hypothesis, so we fail to reject it and treat the data as approximately normal.

# Shapiro-Wilk normality test on the weight column; the result is kept
# in test_p and printed.
test_p <- shapiro.test(x = df$weight)
print(test_p)

## 
##  Shapiro-Wilk normality test
## 
## data:  df$weight
## W = 0.91096, p-value = 0.1627

# Shapiro-Wilk normality test on the height column; the result is kept
# in test_p1 and printed.
test_p1 <- shapiro.test(x = df$height)
print(test_p1)

## 
##  Shapiro-Wilk normality test
## 
## data:  df$height
## W = 0.90987, p-value = 0.1569

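Draw a scatter plot of weight against height, add the regression line with its confidence band, and annotate the plot with the Pearson correlation.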

# Scatter plot of weight (y) against height (x) with a fitted regression
# line and its confidence interval; stat_cor() adds the Pearson
# correlation label on top.
ggscatter(
  data = df,
  x = "height",
  y = "weight",
  add = "reg.line",
  conf.int = TRUE,
  add.params = list(color = "red", fill = "lightgray")
) +
  stat_cor(method = "pearson")

### Correlation and Regression

In the following example, we examine which variables contribute to people's happiness, using correlation and regression tests.

# Load the happiness data set (first row holds the column names)
# and show the per-column summary statistics.
mydata <- read.csv(file = "happy1.csv", header = TRUE)
summary(object = mydata)

##    Hap.score          GDP               le               Fr        
##  Min.   :2.910   Min.   :0.0700   Min.   :0.1600   Min.   :0.0400  
##  1st Qu.:4.570   1st Qu.:0.6525   1st Qu.:0.5150   1st Qu.:0.2375  
##  Median :5.615   Median :1.0150   Median :0.6050   Median :0.4200  
##  Mean   :5.532   Mean   :0.9780   Mean   :0.5955   Mean   :0.3850  
##  3rd Qu.:6.545   3rd Qu.:1.3950   3rd Qu.:0.8000   3rd Qu.:0.5275  
##  Max.   :7.530   Max.   :1.7000   Max.   :0.9500   Max.   :0.6100  
##      Trust          Generosity       corruption    
##  Min.   :0.0200   Min.   :0.0500   Min.   :0.0200  
##  1st Qu.:0.0575   1st Qu.:0.1950   1st Qu.:0.0575  
##  Median :0.1250   Median :0.2850   Median :0.1250  
##  Mean   :0.1770   Mean   :0.2650   Mean   :0.1770  
##  3rd Qu.:0.2975   3rd Qu.:0.3425   3rd Qu.:0.2975  
##  Max.   :0.4700   Max.   :0.4800   Max.   :0.4700

Shapiro test for checking normality

# Shapiro-Wilk normality test on the happiness score; the result is kept
# in sh1 and printed.
sh1 <- shapiro.test(x = mydata$Hap.score)
print(sh1)

## 
##  Shapiro-Wilk normality test
## 
## data:  mydata$Hap.score
## W = 0.96793, p-value = 0.7107

Simple correlation

# Pairwise Pearson correlation matrix over every column of mydata.
v <- cor(x = mydata, method = "pearson")

Plotting

# Show the correlation matrix as numbers, using a two-colour palette.
corrplot(
  corr = v,
  method = "number",
  col = c("darkblue", "orange")
)

The same plot, drawn with squares instead of numbers

# Show the correlation matrix as squares, using a two-colour palette.
corrplot(
  corr = v,
  method = "square",
  col = c("darkblue", "orange")
)

Diverging colors for plotting

# Embed the image file that illustrates the diverging colour palettes.
knitr::include_graphics(path = "diverging5.svg")

Sequential colors for plotting

# Embed the image file that illustrates the sequential colour palettes.
knitr::include_graphics(path = "seq1.svg")

### Plotting - Sequential color

# Sequential colour scale: circles in the lower triangle only,
# coloured with the "Reds" palette from COL1().
corrplot(
  corr = v,
  method = "circle",
  type = "lower",
  col = COL1("Reds")
)

### Plotting - Diverging color

# Diverging colour scale: circles in the upper triangle only,
# coloured with the "RdBu" palette from COL2().
corrplot(
  corr = v,
  method = "circle",
  type = "upper",
  col = COL2("RdBu")
)

### Correlation with p-values, computed another way

# Hmisc provides rcorr(), which is used for the correlation test below.
library(Hmisc)

# Keep the first seven columns of mydata and summarise them.
new_data <- mydata[, 1:7]
summary(object = new_data)

##    Hap.score          GDP               le               Fr        
##  Min.   :2.910   Min.   :0.0700   Min.   :0.1600   Min.   :0.0400  
##  1st Qu.:4.570   1st Qu.:0.6525   1st Qu.:0.5150   1st Qu.:0.2375  
##  Median :5.615   Median :1.0150   Median :0.6050   Median :0.4200  
##  Mean   :5.532   Mean   :0.9780   Mean   :0.5955   Mean   :0.3850  
##  3rd Qu.:6.545   3rd Qu.:1.3950   3rd Qu.:0.8000   3rd Qu.:0.5275  
##  Max.   :7.530   Max.   :1.7000   Max.   :0.9500   Max.   :0.6100  
##      Trust          Generosity       corruption    
##  Min.   :0.0200   Min.   :0.0500   Min.   :0.0200  
##  1st Qu.:0.0575   1st Qu.:0.1950   1st Qu.:0.0575  
##  Median :0.1250   Median :0.2850   Median :0.1250  
##  Mean   :0.1770   Mean   :0.2650   Mean   :0.1770  
##  3rd Qu.:0.2975   3rd Qu.:0.3425   3rd Qu.:0.2975  
##  Max.   :0.4700   Max.   :0.4800   Max.   :0.4700

### Correlation test

# Pearson correlations for all column pairs; rcorr() needs a matrix, so
# the data frame is converted first. Printing shows the coefficients,
# the number of observations and the p-values.
result <- rcorr(x = as.matrix(mydata), type = "pearson")
print(result)

##            Hap.score   GDP    le    Fr Trust Generosity corruption
## Hap.score       1.00  0.86  0.85  0.83  0.70       0.05      -0.74
## GDP             0.86  1.00  0.95  0.69  0.64      -0.04      -0.60
## le              0.85  0.95  1.00  0.71  0.56      -0.18      -0.72
## Fr              0.83  0.69  0.71  1.00  0.65      -0.06      -0.58
## Trust           0.70  0.64  0.56  0.65  1.00       0.31      -0.20
## Generosity      0.05 -0.04 -0.18 -0.06  0.31       1.00       0.07
## corruption     -0.74 -0.60 -0.72 -0.58 -0.20       0.07       1.00
## 
## n= 20 
## 
## 
## P
##            Hap.score GDP    le     Fr     Trust  Generosity corruption
## Hap.score            0.0000 0.0000 0.0000 0.0006 0.8376     0.0002    
## GDP        0.0000           0.0000 0.0007 0.0025 0.8677     0.0053    
## le         0.0000    0.0000        0.0005 0.0108 0.4461     0.0004    
## Fr         0.0000    0.0007 0.0005        0.0020 0.8066     0.0071    
## Trust      0.0006    0.0025 0.0108 0.0020        0.1890     0.4099    
## Generosity 0.8376    0.8677 0.4461 0.8066 0.1890            0.7838    
## corruption 0.0002    0.0053 0.0004 0.0071 0.4099 0.7838

### Get the correlation coefficients and p-values

# Pull the matrix of correlation coefficients out of the rcorr result.
rr <- result[["r"]]
print(rr)

##              Hap.score         GDP         le         Fr      Trust  Generosity
## Hap.score   1.00000000  0.85560449  0.8503420  0.8312359  0.6986812  0.04894782
## GDP         0.85560449  1.00000000  0.9540119  0.6910128  0.6373026 -0.03979261
## le          0.85034198  0.95401192  1.0000000  0.7074102  0.5568864 -0.18060365
## Fr          0.83123589  0.69101280  0.7074102  1.0000000  0.6491447 -0.05846570
## Trust       0.69868116  0.63730258  0.5568864  0.6491447  1.0000000  0.30635055
## Generosity  0.04894782 -0.03979261 -0.1806036 -0.0584657  0.3063505  1.00000000
## corruption -0.74219994 -0.59814899 -0.7175692 -0.5821310 -0.1950426  0.06551655
##             corruption
## Hap.score  -0.74219994
## GDP        -0.59814899
## le         -0.71756924
## Fr         -0.58213102
## Trust      -0.19504265
## Generosity  0.06551655
## corruption  1.00000000

# Pull the matrix of p-values out of the rcorr result
# (the diagonal is NA, as the printed output shows).
pb <- result[["P"]]
print(pb)

##               Hap.score          GDP           le           Fr        Trust
## Hap.score            NA 1.516032e-06 2.050515e-06 5.616976e-06 0.0006105033
## GDP        1.516032e-06           NA 7.391843e-11 7.416727e-04 0.0025078536
## le         2.050515e-06 7.391843e-11           NA 4.856519e-04 0.0107566190
## Fr         5.616976e-06 7.416727e-04 4.856519e-04           NA 0.0019553515
## Trust      6.105033e-04 2.507854e-03 1.075662e-02 1.955351e-03           NA
## Generosity 8.376280e-01 8.677126e-01 4.460740e-01 8.065805e-01 0.1889543496
## corruption 1.790063e-04 5.339296e-03 3.683063e-04 7.082950e-03 0.4099102573
##            Generosity   corruption
## Hap.score   0.8376280 0.0001790063
## GDP         0.8677126 0.0053392965
## le          0.4460740 0.0003683063
## Fr          0.8065805 0.0070829504
## Trust       0.1889543 0.4099102573
## Generosity         NA 0.7837561181
## corruption  0.7837561           NA

Plot the correlations together with the p-value results; correlations that are not significant at the 0.05 level are left blank.

# Lower-triangle correlation plot (numbers) in the original variable
# order, with cells whose p-value exceeds 0.05 left blank.
# The p-value matrix is stored by rcorr() under the upper-case name "P".
# `result$p` (lower case) is NULL because `$` matching is case-sensitive,
# so passing it as p.mat silently disabled the blanking.
corrplot(
  result$r,
  type = "lower",
  order = "original",
  method = "number",
  p.mat = result$P,
  sig.level = 0.05,
  insig = "blank",
  col = COL2("PiYG")
)

# Lower-triangle correlation plot (circles) with variables reordered by
# hierarchical clustering, with cells whose p-value exceeds 0.05 left
# blank.
# The p-value matrix is stored by rcorr() under the upper-case name "P".
# `result$p` (lower case) is NULL because `$` matching is case-sensitive,
# so passing it as p.mat silently disabled the blanking.
corrplot(
  result$r,
  type = "lower",
  order = "hclust",
  method = "circle",
  p.mat = result$P,
  sig.level = 0.05,
  insig = "blank",
  col = COL2("PuOr"),
  tl.col = "black",
  tl.cex = 1
)

### Regression test

# Multiple linear regression of the happiness score on GDP, le, Fr,
# Trust and corruption (Generosity is not part of the model), followed
# by the coefficient table and fit statistics.
fit <- lm(
  Hap.score ~ GDP + le + Fr + Trust + corruption,
  data = mydata
)
summary(fit)

## 
## Call:
## lm(formula = Hap.score ~ GDP + le + Fr + Trust + corruption, 
##     data = mydata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.64179 -0.22753 -0.08158  0.29377  0.72012 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   4.7308     0.6741   7.018 6.08e-06 ***
## GDP           1.5835     0.8176   1.937  0.07323 .  
## le           -1.9289     1.8369  -1.050  0.31146    
## Fr            1.7240     0.9534   1.808  0.09208 .  
## Trust         2.6944     1.0467   2.574  0.02206 *  
## corruption   -4.1808     1.1338  -3.687  0.00244 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.43 on 14 degrees of freedom
## Multiple R-squared:  0.9259, Adjusted R-squared:  0.8995 
## F-statistic:    35 on 5 and 14 DF,  p-value: 1.969e-07