#Exercise 1 - Bivariate Correlation
EXAMPLE 1 In this study, we want to know the relationship between height and weight. We randomly select five participants and measure their height and weight.

Participant   1    2    3    4    5
Height      175  170  180  178  168
Weight       60   70   75   80   69
#Creating the dataset
Height <- c(175, 170, 180, 178, 168)
Weight <- c(60, 70, 75, 80, 69)
data <- data.frame(Height, Weight)
data
## Height Weight
## 1 175 60
## 2 170 70
## 3 180 75
## 4 178 80
## 5 168 69
library(moments)
plot(density(data$Height), main = "Density Plot")
plot(density(data$Weight), main = "Density Plot")
qqnorm(data$Height)
qqnorm(data$Weight)
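# Optional (not part of the original exercise): qqline() adds a reference line,
# which makes departures from normality easier to judge on the Q-Q plots
qqnorm(data$Height); qqline(data$Height)
qqnorm(data$Weight); qqline(data$Weight)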
#agostino.test(data$Height) #skipped: the D'Agostino skewness test needs a larger sample than n = 5
#agostino.test(data$Weight)
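# For reference, the D'Agostino test runs once the sample is large enough; a quick
# illustration on simulated data (the values are made up for demonstration only):
set.seed(1)
demo <- rnorm(30, mean = 175, sd = 5) # 30 simulated heights
agostino.test(demo)                   # should find no significant skew in normal data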
shapiro.test(data$Height)
##
## Shapiro-Wilk normality test
##
## data: data$Height
## W = 0.93645, p-value = 0.6409
shapiro.test(data$Weight)
##
## Shapiro-Wilk normality test
##
## data: data$Weight
## W = 0.97529, p-value = 0.908
anscombe.test(data$Height)
##
## Anscombe-Glynn kurtosis test
##
## data: data$Height
## kurt = 1.4246, z = -1.1746, p-value = 0.2402
## alternative hypothesis: kurtosis is not equal to 3
anscombe.test(data$Weight)
##
## Anscombe-Glynn kurtosis test
##
## data: data$Weight
## kurt = 2.12439, z = 0.29398, p-value = 0.7688
## alternative hypothesis: kurtosis is not equal to 3
library(car)
## Warning: package 'car' was built under R version 3.6.2
## Loading required package: carData
scatterplot(data$Height, data$Weight)
## Warning in smoother(.x, .y, col = col[1], log.x = logged("x"), log.y =
## logged("y"), : could not fit smooth
#Residual plot (lm - linear model)
height.lm <- lm(Height ~ Weight, data = data)
height.lm
##
## Call:
## lm(formula = Height ~ Weight, data = data)
##
## Coefficients:
## (Intercept) Weight
## 153.4811 0.2926
height.res <- resid(height.lm) #OR height.res <- residuals(height.lm)
height.res
## 1 2 3 4 5
## 3.960503 -3.965889 4.570916 1.107720 -5.673250
plot(data$Height, height.res, ylab = "Residual", xlab = "Height", main = "Independence of Observations")
abline(0, 0)
#bartlett.test(data$Height, data$Weight)
#Checking variance among groups
#tapply(data$Height, data$Weight, var)
#After checking that all the assumptions are met, we can now test the correlation
#To check whether there are any relationships among variables
cor(data$Height, data$Weight, use = "complete.obs", method = "pearson") #"complete.obs" drops cases with missing values before computing r
## [1] 0.426687
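# Sanity check (not in the original exercise): Pearson's r is the covariance scaled
# by the two standard deviations, r = cov(x, y) / (sd(x) * sd(y))
cov(data$Height, data$Weight) / (sd(data$Height) * sd(data$Weight))
## [1] 0.426687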
# Results suggest Pearson's r = 0.43 (a moderate positive correlation)
# Pearson's product-moment correlation - significance is assessed with a t test
cor.test(data$Height, data$Weight, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: data$Height and data$Weight
## t = 0.81716, df = 3, p-value = 0.4737
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.7306240 0.9509622
## sample estimates:
## cor
## 0.426687
cor.test(data$Height, data$Weight, method = "pearson", conf.level = 0.99)
##
## Pearson's product-moment correlation
##
## data: data$Height and data$Weight
## t = 0.81716, df = 3, p-value = 0.4737
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
## -0.8776734 0.9791785
## sample estimates:
## cor
## 0.426687
cor.test(data$Height, data$Weight, method = "pearson", conf.level = 0.95)
##
## Pearson's product-moment correlation
##
## data: data$Height and data$Weight
## t = 0.81716, df = 3, p-value = 0.4737
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.7306240 0.9509622
## sample estimates:
## cor
## 0.426687
# Results - the p-value of 0.4737 indicates that the correlation is not statistically significant
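# How cor.test() gets its t statistic (standard formula, stated here as background):
# t = r * sqrt(n - 2) / sqrt(1 - r^2) with df = n - 2
r <- 0.426687; n <- 5
t_stat <- r * sqrt(n - 2) / sqrt(1 - r^2)
t_stat                           # reproduces t = 0.81716
2 * pt(-abs(t_stat), df = n - 2) # two-sided p-value, ~0.4737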
EXAMPLE 2 What if we have more than two variables? We add a third variable, HeartRate.
#Creating the dataset
Height <- c(175, 170, 180, 178, 168)
Weight <- c(60, 70, 75, 80, 69)
HeartRate <- c(60, 70, 75, 73, 71)
data2 <- data.frame(Height, Weight, HeartRate)
data2
## Height Weight HeartRate
## 1 175 60 60
## 2 170 70 70
## 3 180 75 75
## 4 178 80 73
## 5 168 69 71
library(moments)
plot(density(data2$HeartRate), main = "Density Plot")
#TO check whether there are any relationships among variables
cor(data2)
## Height Weight HeartRate
## Height 1.0000000 0.4266870 0.2204325
## Weight 0.4266870 1.0000000 0.8932405
## HeartRate 0.2204325 0.8932405 1.0000000
# Results suggest Pearson's r = 0.43 for Height and Weight (moderate correlation) and r = 0.89 for HeartRate and Weight (very strong correlation)
#From the results, we can now choose which variable pairs are correlated and plot them on scatterplots
#library(car)
scatterplot(data2$Height, data2$HeartRate) #weak relationship (r = 0.22)
## Warning in smoother(.x, .y, col = col[1], log.x = logged("x"), log.y =
## logged("y"), : could not fit smooth
scatterplot(data2$HeartRate, data2$Weight) #strong relationship (r = 0.89)
## Warning in smoother(.x, .y, col = col[1], log.x = logged("x"), log.y =
## logged("y"), : could not fit smooth
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.6.2
## Loading required package: lattice
## Loading required package: survival
## Warning: package 'survival' was built under R version 3.6.2
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.6.2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
#test the whole correlation matrix at once rather than running multiple pairwise correlation tests
data2.matrix <- as.matrix(data2[, c("Height", "Weight", "HeartRate")])
rcorr(data2.matrix)
## Height Weight HeartRate
## Height 1.00 0.43 0.22
## Weight 0.43 1.00 0.89
## HeartRate 0.22 0.89 1.00
##
## n= 5
##
##
## P
## Height Weight HeartRate
## Height 0.4737 0.7216
## Weight 0.4737 0.0412
## HeartRate 0.7216 0.0412
# Results show that the correlation is significant for the Weight-HeartRate pair, with Pearson's r = 0.89 and p-value = 0.0412
#R2 (R squared) - although no conclusion can be drawn about causality or direction, R squared measures the proportion of variability in one variable that is shared with another
cor(data2)^2
## Height Weight HeartRate
## Height 1.0000000 0.1820618 0.0485905
## Weight 0.1820618 1.0000000 0.7978786
## HeartRate 0.0485905 0.7978786 1.0000000
cor(data2)^2*100 #Percent
## Height Weight HeartRate
## Height 100.00000 18.20618 4.85905
## Weight 18.20618 100.00000 79.78786
## HeartRate 4.85905 79.78786 100.00000
# Results show that Height and Weight share about 18% of their variability, while Weight and HeartRate share about 80%
#Exercise 2 - Non-Parametric Correlation - used when the data are ordinal rather than interval, or when the data fail a normality test (e.g., heavy skew, potential outliers)
#Spearman's correlation coefficient and Kendall's Tau (τ) are the most popular choices
# 1. Spearman’s correlation coefficient
cor(data2$Height, data2$Weight, method = "spearman")
## [1] 0.6
#rho: 0.6
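# Note that Spearman's rho is simply Pearson's r computed on the ranks; a quick check:
cor(rank(data2$Height), rank(data2$Weight)) # also gives 0.6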
cor.test(data2$Height, data2$Weight, method = "spearman")
##
## Spearman's rank correlation rho
##
## data: data2$Height and data2$Weight
## S = 8, p-value = 0.35
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.6
#p-value = 0.35 - the correlation is not significant
cor.test(data2$Height, data2$Weight, alternative = "less", method = "spearman")
##
## Spearman's rank correlation rho
##
## data: data2$Height and data2$Weight
## S = 8, p-value = 0.8833
## alternative hypothesis: true rho is less than 0
## sample estimates:
## rho
## 0.6
# 2. Kendall's Tau (τ): preferred for small datasets with a large number of tied ranks.
cor(data2$Height, data2$Weight, method = "kendall")
## [1] 0.4
#tau: 0.4
cor.test(data2$Height, data2$Weight, method = "kendall")
##
## Kendall's rank correlation tau
##
## data: data2$Height and data2$Weight
## T = 7, p-value = 0.4833
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau
## 0.4
#p-value = 0.48 - the correlation is not significant
cor.test(data2$Height, data2$Weight, alternative = "less", method = "kendall")
##
## Kendall's rank correlation tau
##
## data: data2$Height and data2$Weight
## T = 7, p-value = 0.8833
## alternative hypothesis: true tau is less than 0
## sample estimates:
## tau
## 0.4
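# For intuition (a hand computation, not part of the original exercise): Kendall's
# tau counts concordant minus discordant pairs, tau = (C - D) / (n * (n - 1) / 2)
x <- data2$Height; y <- data2$Weight
ij <- combn(length(x), 2)                      # index pairs for all 10 pairs of observations
s <- sign(x[ij[1, ]] - x[ij[2, ]]) * sign(y[ij[1, ]] - y[ij[2, ]])
sum(s) / choose(length(x), 2)                  # reproduces tau = 0.4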
#Exercise 3 - Partial Correlation
#Partial correlation examines the relationship between two variables while the effect of a third variable is held constant. For example, we can compute the partial correlation between weight and height while controlling for the effect of heart rate.
library(ggm)
## Warning: package 'ggm' was built under R version 3.6.3
##
## Attaching package: 'ggm'
## The following object is masked from 'package:Hmisc':
##
## rcorr
#Partial Correlation
pc <- pcor(c("Height", "Weight", "HeartRate"), var(data2)) #controlling for HeartRate. syntax pc <- pcor(c(“Variable 1", “Variable 2", “Variable Controlled"), var(Data))
pc # r = 0.52
## [1] 0.5240067
pc^2 # R2 = 0.27
## [1] 0.2745831
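# Cross-check (standard formula, not from the lecture): the partial correlation
# follows from the three pairwise correlations as
# r_xy.z = (r_xy - r_xz * r_yz) / sqrt((1 - r_xz^2) * (1 - r_yz^2))
r_hw <- cor(data2$Height, data2$Weight)
r_hr <- cor(data2$Height, data2$HeartRate)
r_wr <- cor(data2$Weight, data2$HeartRate)
(r_hw - r_hr * r_wr) / sqrt((1 - r_hr^2) * (1 - r_wr^2)) # matches pc above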
# Testing the significance of partial correlation
pcor.test(pc, 1, 5) #pcor.test(the partial correlation, number of control variables, sample size); our sample size is 5, not 10
## $tval
## [1] 0.8700776
##
## $df
## [1] 2
##
## $pvalue
## [1] 0.4759934
# p-value = 0.48 - the partial correlation is not significant
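# Under the hood (standard formula, stated as an assumption about pcor.test):
# t = r * sqrt((n - 2 - q) / (1 - r^2)) with df = n - 2 - q, where q is the
# number of control variables
n <- 5; q <- 1
pc * sqrt((n - 2 - q) / (1 - pc^2)) # reproduces the t value above, df = 2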
pc2 <- pcor(c("Weight", "HeartRate", "Height"), var(data2)) #controlling for Height
pc2 # r = 0.90
## [1] 0.9059479
pc2^2 # R2 = 0.82
## [1] 0.8207416
# Testing the significance of partial correlation
pcor.test(pc2, 1, 5) #again one control variable and sample size n = 5
## $tval
## [1] 3.026066
##
## $df
## [1] 2
##
## $pvalue
## [1] 0.094052
# p-value = 0.094 - not significant at the .05 level despite the large partial correlation; with n = 5 the test has little power
#Exercise 4 - Semi-Partial Correlation
#Semi-partial correlation is useful when trying to explain the variance in one particular variable from a set of predictor variables. Here we control for the effect of a third variable that affects only one of the two variables.
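# For contrast with pcor(), which removes the control from both variables, a
# semi-partial correlation removes it from only one. A minimal sketch of the idea
# using base R residuals (an illustration, not the lecture's method), reusing
# data2 from above:
weight_given_hr <- resid(lm(Weight ~ HeartRate, data = data2)) # Weight with HeartRate partialled out
cor(data2$Height, weight_given_hr) # semi-partial r of Height with Weight (control removed from Weight only)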
setwd("E:\\mikhilesh\\HU Sem VI ANLY 510 and 506\\ANLY 510 Kao Principals and Applications\\Lecture and other materials")
library(readxl)
## Warning: package 'readxl' was built under R version 3.6.3
data5 <- read_excel("Lecture 5 simplerelationships.xlsx")
names(data5)
## [1] "Income" "PI" "MS" "Age"
str(data5)
## Classes 'tbl_df', 'tbl' and 'data.frame': 99 obs. of 4 variables:
## $ Income: num 81119 100291 74035 19289 103307 ...
## $ PI : num 0 0 -0.01 0 -0.01 0.01 -0.02 -0.02 -0.01 0.01 ...
## $ MS : num 8 9 8 10 3 5 9 8 10 7 ...
## $ Age : num 34.1 35 33.5 31 35 ...
summary(data5)
## Income PI MS Age
## Min. : 12470 Min. :-0.030000 Min. : 0.000 Min. :30.62
## 1st Qu.: 50551 1st Qu.: 0.000000 1st Qu.: 3.000 1st Qu.:32.63
## Median : 95875 Median : 0.010000 Median : 5.000 Median :34.99
## Mean : 91077 Mean : 0.001616 Mean : 5.212 Mean :34.59
## 3rd Qu.:124876 3rd Qu.: 0.010000 3rd Qu.: 8.000 3rd Qu.:36.38
## Max. :162697 Max. : 0.020000 Max. :10.000 Max. :38.30
#Before we get started, we always want to take a look at the data.
plot(density(data5$Income))
agostino.test(data5$Income)
##
## D'Agostino skewness test
##
## data: data5$Income
## skew = -0.21484, z = -0.92092, p-value = 0.3571
## alternative hypothesis: data have a skewness
shapiro.test(data5$Income)
##
## Shapiro-Wilk normality test
##
## data: data5$Income
## W = 0.95278, p-value = 0.001357
scatterplot(data5$Income, data5$PI)
scatterplot(data5$Income, data5$Age)
scatterplot(data5$MS, data5$PI)
#Test the correlation
cor(data5$Income, data5$PI, method = "pearson")
## [1] 0.3990131
# r = 0.40 - a medium-to-strong correlation
#Significance Test
cor.test(data5$Income, data5$PI, method = "pearson", conf.level = 0.95)
##
## Pearson's product-moment correlation
##
## data: data5$Income and data5$PI
## t = 4.2858, df = 97, p-value = 4.296e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2188392 0.5528750
## sample estimates:
## cor
## 0.3990131
cor.test(data5$Income, data5$PI, method = "pearson", conf.level = 0.99)
##
## Pearson's product-moment correlation
##
## data: data5$Income and data5$PI
## t = 4.2858, df = 97, p-value = 4.296e-05
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
## 0.1582391 0.5949987
## sample estimates:
## cor
## 0.3990131
# p-value < 0.05 - significant correlation
#to see how much variance we can explain
cor(data5)^2
## Income PI MS Age
## Income 1.000000000 1.592114e-01 3.691716e-03 0.990749110
## PI 0.159211415 1.000000e+00 9.569379e-05 0.235570800
## MS 0.003691716 9.569379e-05 1.000000e+00 0.003238598
## Age 0.990749110 2.355708e-01 3.238598e-03 1.000000000
cor(data5)^2*100
## Income PI MS Age
## Income 100.0000000 1.592114e+01 3.691716e-01 99.0749110
## PI 15.9211415 1.000000e+02 9.569379e-03 23.5570800
## MS 0.3691716 9.569379e-03 1.000000e+02 0.3238598
## Age 99.0749110 2.355708e+01 3.238598e-01 100.0000000
# A significant positive relationship (r = .40, t(97) = 4.29, p < .001) between income and the percentage saved: the more people earn, the greater the percentage of money they appear to save (and vice versa; directionality is not known from this)
#We also want to control for potential third-variable effects to make sure we are measuring the relationship we are aiming for
#Data Matrix
#data5 <- data[, c("Income", "PI", "MS", "Age")]
cor(data5)
## Income PI MS Age
## Income 1.00000000 0.39901305 -0.06075949 0.99536381
## PI 0.39901305 1.00000000 0.00978232 0.48535636
## MS -0.06075949 0.00978232 1.00000000 -0.05690868
## Age 0.99536381 0.48535636 -0.05690868 1.00000000
#Partial Correlation - here we study Income and MS while controlling for the Age factor
pc <- pcor(c("Income", "MS", "Age"), var(data5))
pc # r = -0.0428495 (negative relation)
## [1] -0.0428495
pc^2 # R2 = 0.00183608
## [1] 0.00183608
# Testing the significance of partial correlation
pcor.test(pc, 1, 99) #pcor.test(the partial correlation, number of control variables, sample size); here n = 99
## $tval
## [1] -0.4202236
##
## $df
## [1] 96
##
## $pvalue
## [1] 0.6752612
# p-value = 0.6752612 - the partial correlation is not significant
There is a significant relationship between income and percent income saved, r = .40, p < .001. There is a significant relationship between income and age, r = .99, p < .001. The relationship between income and savings motivation while controlling for age is not significant, r = -.043, p = .68.