#Exercise 1 - Bivariate Correlation
EXAMPLE 1 In this study, we want to know the relationship between height and weight. We randomly select five participants and measure their height and weight.

Participant   1    2    3    4    5
Height      175  170  180  178  168
Weight       60   70   75   80   69
#Creating the dataset
Height <- c(175, 170, 180, 178, 168)
Weight <- c(60, 70, 75, 80, 69)
data <- data.frame(Height, Weight)
data
## Height Weight
## 1 175 60
## 2 170 70
## 3 180 75
## 4 178 80
## 5 168 69
library(moments)
plot(density(data$Height), main = "Density Plot")
plot(density(data$Weight), main = "Density Plot")
qqnorm(data$Height)
qqnorm(data$Weight)
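# Optional (not part of the original exercise): qqline() adds a reference line,
# which makes departures from normality easier to judge on the Q-Q plots
qqnorm(data$Height); qqline(data$Height)
qqnorm(data$Weight); qqline(data$Weight)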
#agostino.test(data$Height) #skipped: the D'Agostino skewness test needs a larger sample than n = 5
#agostino.test(data$Weight)
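# For reference, the D'Agostino test runs once the sample is large enough; a quick
# illustration on simulated data (the values are made up for demonstration only):
set.seed(1)
demo <- rnorm(30, mean = 175, sd = 5) # 30 simulated heights
agostino.test(demo)                   # should find no significant skew in normal data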
shapiro.test(data$Height)
##
## Shapiro-Wilk normality test
##
## data: data$Height
## W = 0.93645, p-value = 0.6409
shapiro.test(data$Weight)
##
## Shapiro-Wilk normality test
##
## data: data$Weight
## W = 0.97529, p-value = 0.908
anscombe.test(data$Height)
##
## Anscombe-Glynn kurtosis test
##
## data: data$Height
## kurt = 1.4246, z = -1.1746, p-value = 0.2402
## alternative hypothesis: kurtosis is not equal to 3
anscombe.test(data$Weight)
##
## Anscombe-Glynn kurtosis test
##
## data: data$Weight
## kurt = 2.12439, z = 0.29398, p-value = 0.7688
## alternative hypothesis: kurtosis is not equal to 3
library(car)
## Warning: package 'car' was built under R version 3.6.2
## Loading required package: carData
scatterplot(data$Height, data$Weight)
## Warning in smoother(.x, .y, col = col[1], log.x = logged("x"), log.y =
## logged("y"), : could not fit smooth
#Residual plot (lm - linear model)
height.lm <- lm(Height ~ Weight, data = data)
height.lm
##
## Call:
## lm(formula = Height ~ Weight, data = data)
##
## Coefficients:
## (Intercept) Weight
## 153.4811 0.2926
height.res <- resid(height.lm) #OR height.res <- residuals(height.lm)
height.res
## 1 2 3 4 5
## 3.960503 -3.965889 4.570916 1.107720 -5.673250
plot(data$Height, height.res, ylab = "Residual", xlab = "Height", main = "Independence of Observations")
abline(0, 0)
#bartlett.test(data$Height, data$Weight)
#Checking variance among groups
#tapply(data$Height, data$Weight, var)
#After checking that all the assumptions are met, we can now test the correlation
#To check whether there are any relationships among variables
cor(data$Height, data$Weight, use = "complete.obs", method = "pearson") #"complete.obs" drops cases with missing values before computing r
## [1] 0.426687
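# Sanity check (not in the original exercise): Pearson's r is the covariance scaled
# by the two standard deviations, r = cov(x, y) / (sd(x) * sd(y))
cov(data$Height, data$Weight) / (sd(data$Height) * sd(data$Weight))
## [1] 0.426687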
# Results suggest Pearson's r = 0.43 (a moderate positive correlation)
# Pearson's product-moment correlation - significance is assessed with a t test
cor.test(data$Height, data$Weight, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: data$Height and data$Weight
## t = 0.81716, df = 3, p-value = 0.4737
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.7306240 0.9509622
## sample estimates:
## cor
## 0.426687
cor.test(data$Height, data$Weight, method = "pearson", conf.level = 0.99)
##
## Pearson's product-moment correlation
##
## data: data$Height and data$Weight
## t = 0.81716, df = 3, p-value = 0.4737
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
## -0.8776734 0.9791785
## sample estimates:
## cor
## 0.426687
cor.test(data$Height, data$Weight, method = "pearson", conf.level = 0.95)
##
## Pearson's product-moment correlation
##
## data: data$Height and data$Weight
## t = 0.81716, df = 3, p-value = 0.4737
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.7306240 0.9509622
## sample estimates:
## cor
## 0.426687
# Results - the p-value of 0.4737 indicates that the correlation is not statistically significant
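# How cor.test() gets its t statistic (standard formula, stated here as background):
# t = r * sqrt(n - 2) / sqrt(1 - r^2) with df = n - 2
r <- 0.426687; n <- 5
t_stat <- r * sqrt(n - 2) / sqrt(1 - r^2)
t_stat                           # reproduces t = 0.81716
2 * pt(-abs(t_stat), df = n - 2) # two-sided p-value, ~0.4737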
EXAMPLE 2 What if we have more than two variables? We add a third variable, HeartRate.
#Creating the dataset
Height <- c(175, 170, 180, 178, 168)
Weight <- c(60, 70, 75, 80, 69)
HeartRate <- c(60, 70, 75, 73, 71)
data2 <- data.frame(Height, Weight, HeartRate)
data2
## Height Weight HeartRate
## 1 175 60 60
## 2 170 70 70
## 3 180 75 75
## 4 178 80 73
## 5 168 69 71
library(moments)
plot(density(data2$HeartRate), main = "Density Plot")
#TO check whether there are any relationships among variables
cor(data2)
## Height Weight HeartRate
## Height 1.0000000 0.4266870 0.2204325
## Weight 0.4266870 1.0000000 0.8932405
## HeartRate 0.2204325 0.8932405 1.0000000
# Results suggest Pearson's r = 0.43 for Height and Weight (moderate correlation) and r = 0.89 for HeartRate and Weight (very strong correlation)
#From the results, we can now choose which variable pairs are correlated and plot them on scatterplots
#library(car)
scatterplot(data2$Height, data2$HeartRate) #weak relationship (r = 0.22)
## Warning in smoother(.x, .y, col = col[1], log.x = logged("x"), log.y =
## logged("y"), : could not fit smooth
scatterplot(data2$HeartRate, data2$Weight) #strong relationship (r = 0.89)
## Warning in smoother(.x, .y, col = col[1], log.x = logged("x"), log.y =
## logged("y"), : could not fit smooth
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.6.2
## Loading required package: lattice
## Loading required package: survival
## Warning: package 'survival' was built under R version 3.6.2
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.6.2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
#test the whole correlation matrix at once rather than running multiple pairwise correlation tests
data2.matrix <- as.matrix(data2[, c("Height", "Weight", "HeartRate")])
rcorr(data2.matrix)
## Height Weight HeartRate
## Height 1.00 0.43 0.22
## Weight 0.43 1.00 0.89
## HeartRate 0.22 0.89 1.00
##
## n= 5
##
##
## P
## Height Weight HeartRate
## Height 0.4737 0.7216
## Weight 0.4737 0.0412
## HeartRate 0.7216 0.0412
# Results show that the correlation is significant for the Weight-HeartRate pair, with Pearson's r = 0.89 and p-value = 0.0412
#R2 (R squared) - although no conclusion can be drawn about causality or direction, R squared measures the proportion of variability in one variable that is shared with another
cor(data2)^2
## Height Weight HeartRate
## Height 1.0000000 0.1820618 0.0485905
## Weight 0.1820618 1.0000000 0.7978786
## HeartRate 0.0485905 0.7978786 1.0000000
cor(data2)^2*100 #Percent
## Height Weight HeartRate
## Height 100.00000 18.20618 4.85905
## Weight 18.20618 100.00000 79.78786
## HeartRate 4.85905 79.78786 100.00000
# Results show that Height and Weight share about 18% of their variability, while Weight and HeartRate share about 80%
#Exercise 2 - Non-Parametric Correlation - used when the data are ordinal rather than interval, or when the data fail a normality test (e.g., heavy skew, potential outliers)
#Spearman's correlation coefficient and Kendall's Tau (τ) are the most popular choices
# 1. Spearman’s correlation coefficient
cor(data2$Height, data2$Weight, method = "spearman")
## [1] 0.6
#rho: 0.6
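# Note that Spearman's rho is simply Pearson's r computed on the ranks; a quick check:
cor(rank(data2$Height), rank(data2$Weight)) # also gives 0.6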
cor.test(data2$Height, data2$Weight, method = "spearman")
##
## Spearman's rank correlation rho
##
## data: data2$Height and data2$Weight
## S = 8, p-value = 0.35
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.6
#p-value = 0.35 - the correlation is not significant
cor.test(data2$Height, data2$Weight, alternative = "less", method = "spearman")
##
## Spearman's rank correlation rho
##
## data: data2$Height and data2$Weight
## S = 8, p-value = 0.8833
## alternative hypothesis: true rho is less than 0
## sample estimates:
## rho
## 0.6
# 2. Kendall's Tau (τ): preferred for small datasets with a large number of tied ranks.
cor(data2$Height, data2$Weight, method = "kendall")
## [1] 0.4
#tau: 0.4
cor.test(data2$Height, data2$Weight, method = "kendall")
##
## Kendall's rank correlation tau
##
## data: data2$Height and data2$Weight
## T = 7, p-value = 0.4833
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau
## 0.4
#p-value = 0.48 - the correlation is not significant
cor.test(data2$Height, data2$Weight, alternative = "less", method = "kendall")
##
## Kendall's rank correlation tau
##
## data: data2$Height and data2$Weight
## T = 7, p-value = 0.8833
## alternative hypothesis: true tau is less than 0
## sample estimates:
## tau
## 0.4
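# For intuition (a hand computation, not part of the original exercise): Kendall's
# tau counts concordant minus discordant pairs, tau = (C - D) / (n * (n - 1) / 2)
x <- data2$Height; y <- data2$Weight
ij <- combn(length(x), 2)                      # index pairs for all 10 pairs of observations
s <- sign(x[ij[1, ]] - x[ij[2, ]]) * sign(y[ij[1, ]] - y[ij[2, ]])
sum(s) / choose(length(x), 2)                  # reproduces tau = 0.4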
#Exercise 3 - Partial Correlation
#Partial correlation examines the relationship between two variables while the effect of a third variable is held constant. For example, we can compute the partial correlation between weight and height while controlling for the effect of heart rate.
library(ggm)
## Warning: package 'ggm' was built under R version 3.6.3
##
## Attaching package: 'ggm'
## The following object is masked from 'package:Hmisc':
##
## rcorr
#Partial Correlation
pc <- pcor(c("Height", "Weight", "HeartRate"), var(data2)) #controlling for HeartRate. syntax pc <- pcor(c(“Variable 1", “Variable 2", “Variable Controlled"), var(Data))
pc # r = 0.52
## [1] 0.5240067
pc^2 # R2 = 0.27
## [1] 0.2745831
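# Cross-check (standard formula, not from the lecture): the partial correlation
# follows from the three pairwise correlations as
# r_xy.z = (r_xy - r_xz * r_yz) / sqrt((1 - r_xz^2) * (1 - r_yz^2))
r_hw <- cor(data2$Height, data2$Weight)
r_hr <- cor(data2$Height, data2$HeartRate)
r_wr <- cor(data2$Weight, data2$HeartRate)
(r_hw - r_hr * r_wr) / sqrt((1 - r_hr^2) * (1 - r_wr^2)) # matches pc above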
# Testing the significance of partial correlation
pcor.test(pc, 1, 5) #pcor.test(the partial correlation, number of control variables, sample size); our sample size is 5, not 10
## $tval
## [1] 0.8700776
##
## $df
## [1] 2
##
## $pvalue
## [1] 0.4759934
# p-value = 0.48 - the partial correlation is not significant
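# Under the hood (standard formula, stated as an assumption about pcor.test):
# t = r * sqrt((n - 2 - q) / (1 - r^2)) with df = n - 2 - q, where q is the
# number of control variables
n <- 5; q <- 1
pc * sqrt((n - 2 - q) / (1 - pc^2)) # reproduces the t value above, df = 2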
pc2 <- pcor(c("Weight", "HeartRate", "Height"), var(data2)) #controlling for Height
pc2 # r = 0.90
## [1] 0.9059479
pc2^2 # R2 = 0.82
## [1] 0.8207416
# Testing the significance of partial correlation
pcor.test(pc2, 1, 5) #again one control variable and sample size n = 5
## $tval
## [1] 3.026066
##
## $df
## [1] 2
##
## $pvalue
## [1] 0.094052
# p-value = 0.094 - not significant at the .05 level despite the large partial correlation; with n = 5 the test has little power
#Exercise 4 - Semi-Partial Correlation
#Semi-partial correlation is useful when trying to explain the variance in one particular variable from a set of predictor variables. Here we control for the effect of a third variable that affects only one of the two variables.
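# For contrast with pcor(), which removes the control from both variables, a
# semi-partial correlation removes it from only one. A minimal sketch of the idea
# using base R residuals (an illustration, not the lecture's method), reusing
# data2 from above:
weight_given_hr <- resid(lm(Weight ~ HeartRate, data = data2)) # Weight with HeartRate partialled out
cor(data2$Height, weight_given_hr) # semi-partial r of Height with Weight (control removed from Weight only)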
setwd("E:\\mikhilesh\\HU Sem VI ANLY 510 and 506\\ANLY 510 Kao Principals and Applications\\Lecture and other materials")
library(readxl)
## Warning: package 'readxl' was built under R version 3.6.3
data5 <- read_excel("Lecture 5 simplerelationships.xlsx")
names(data5)
## [1] "Income" "PI" "MS" "Age"
str(data5)
## Classes 'tbl_df', 'tbl' and 'data.frame': 99 obs. of 4 variables:
## $ Income: num 81119 100291 74035 19289 103307 ...
## $ PI : num 0 0 -0.01 0 -0.01 0.01 -0.02 -0.02 -0.01 0.01 ...
## $ MS : num 8 9 8 10 3 5 9 8 10 7 ...
## $ Age : num 34.1 35 33.5 31 35 ...
summary(data5)
## Income PI MS Age
## Min. : 12470 Min. :-0.030000 Min. : 0.000 Min. :30.62
## 1st Qu.: 50551 1st Qu.: 0.000000 1st Qu.: 3.000 1st Qu.:32.63
## Median : 95875 Median : 0.010000 Median : 5.000 Median :34.99
## Mean : 91077 Mean : 0.001616 Mean : 5.212 Mean :34.59
## 3rd Qu.:124876 3rd Qu.: 0.010000 3rd Qu.: 8.000 3rd Qu.:36.38
## Max. :162697 Max. : 0.020000 Max. :10.000 Max. :38.30
#Before we get started, we always want to take a look at the data.
plot(density(data5$Income))
agostino.test(data5$Income)
##
## D'Agostino skewness test
##
## data: data5$Income
## skew = -0.21484, z = -0.92092, p-value = 0.3571
## alternative hypothesis: data have a skewness
shapiro.test(data5$Income)
##
## Shapiro-Wilk normality test
##
## data: data5$Income
## W = 0.95278, p-value = 0.001357
scatterplot(data5$Income, data5$PI)
scatterplot(data5$Income, data5$Age)
scatterplot(data5$MS, data5$PI)
#Test the correlation
cor(data5$Income, data5$PI, method = "pearson")
## [1] 0.3990131
# r = 0.40 - a medium-to-strong correlation
#Significance Test
cor.test(data5$Income, data5$PI, method = "pearson", conf.level = 0.95)
##
## Pearson's product-moment correlation
##
## data: data5$Income and data5$PI
## t = 4.2858, df = 97, p-value = 4.296e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2188392 0.5528750
## sample estimates:
## cor
## 0.3990131
cor.test(data5$Income, data5$PI, method = "pearson", conf.level = 0.99)
##
## Pearson's product-moment correlation
##
## data: data5$Income and data5$PI
## t = 4.2858, df = 97, p-value = 4.296e-05
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
## 0.1582391 0.5949987
## sample estimates:
## cor
## 0.3990131
# p-value < 0.05 - significant correlation
#to see how much variance we can explain
cor(data5)^2
## Income PI MS Age
## Income 1.000000000 1.592114e-01 3.691716e-03 0.990749110
## PI 0.159211415 1.000000e+00 9.569379e-05 0.235570800
## MS 0.003691716 9.569379e-05 1.000000e+00 0.003238598
## Age 0.990749110 2.355708e-01 3.238598e-03 1.000000000
cor(data5)^2*100
## Income PI MS Age
## Income 100.0000000 1.592114e+01 3.691716e-01 99.0749110
## PI 15.9211415 1.000000e+02 9.569379e-03 23.5570800
## MS 0.3691716 9.569379e-03 1.000000e+02 0.3238598
## Age 99.0749110 2.355708e+01 3.238598e-01 100.0000000
# A significant positive relationship (r = .40, t(97) = 4.29, p < .001) between income and the percentage saved: the more people earn, the greater the percentage of money they appear to save (and vice versa; directionality is not known from this)
#We also want to control for potential third-variable effects to make sure we are measuring the relationship we are aiming for
#Data Matrix
#data5 <- data[, c("Income", "PI", "MS", "Age")]
cor(data5)
## Income PI MS Age
## Income 1.00000000 0.39901305 -0.06075949 0.99536381
## PI 0.39901305 1.00000000 0.00978232 0.48535636
## MS -0.06075949 0.00978232 1.00000000 -0.05690868
## Age 0.99536381 0.48535636 -0.05690868 1.00000000
#Partial Correlation - here we study Income and MS while controlling for the Age factor
pc <- pcor(c("Income", "MS", "Age"), var(data5))
pc # r = -0.0428495 (negative relation)
## [1] -0.0428495
pc^2 # R2 = 0.00183608
## [1] 0.00183608
# Testing the significance of partial correlation
pcor.test(pc, 1, 99) #pcor.test(the partial correlation, number of control variables, sample size); here n = 99
## $tval
## [1] -0.4202236
##
## $df
## [1] 96
##
## $pvalue
## [1] 0.6752612
# p-value = 0.6752612 - the partial correlation is not significant
There is a significant relationship between income and percent income saved, r = .40, p < .001. There is a significant relationship between income and age, r = .99, p < .001. The relationship between income and savings motivation while controlling for age is not significant, r = -.043, p = .68.