08_-_correlation__regression-_Zadra

require(tidyverse) #if you don't have this, use install.packages("tidyverse")

## Loading required package: tidyverse

## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr

## Conflicts with tidy packages ----------------------------------------------

## filter(): dplyr, stats
## lag():    dplyr, stats

theme_set(theme_gray(base_size = 18)) #Set our default font size for all the plots we produce with ggplot()

load("C:/Users/Jon/Documents/Classes/quant methods I (6500 5500)/data/08 - correlation & regression/08 - correlation & regression - Zadra Example (without tidyr).Rdata") #adjust the path to the location of your .Rdata file

# First look at our data ----------------------------------------------------------


dat <- data.frame("adversity" = x, "wellbeing" = y) #Put our vectors for adversity and wellbeing into a data frame
dat #take a look at the data frame.

##    adversity wellbeing
## 1  10.257202  116.9637
## 2   9.580445  116.1984
## 3   9.047772  116.1271
## 4  10.512393  115.3765
## 5   8.255732  117.4686
## 6   9.502488  116.0299
## 7  10.765586  114.7700
## 8  10.501393  115.2308
## 9  10.331911  116.6814
## 10  9.908924  115.2496
## 11  9.343318  116.4805
## 12  8.839152  116.1287
## 13 11.655932  115.1735
## 14  9.250789  115.0956
## 15 10.621921  115.8074
## 16  8.528195  116.1973
## 17 10.798195  115.6320
## 18 10.279463  116.6767
## 19 10.657874  114.4302
## 20 10.593726  117.0619
## 21  9.422073  117.4563
## 22 10.476063  115.5271
## 23  8.812366  116.3504
## 24 10.717688  115.6888
## 25 10.134697  116.3310

#Take a look at our univariate data
ggplot(dat, aes(x=adversity)) + geom_density() + geom_rug() #geom_rug() gives us the ticks along the axes

ggplot(dat, aes(x=wellbeing)) + geom_density() + geom_rug()

#Bivariate scatter plot
ggplot(dat, aes(x=adversity, y=wellbeing)) + geom_point()

# Calculate Z-scores --------------------------------------------------------

#In R, the scale() function converts a vector to z-scores.
scale(dat$adversity) #notice that it also attaches attributes for the original mean (9.951) and sd (0.847).  These won't get in your way for other operations on the vector, but they are available if we need them for something else.

##              [,1]
##  [1,]  0.36032861
##  [2,] -0.43817511
##  [3,] -1.06667316
##  [4,]  0.66142803
##  [5,] -2.00119838
##  [6,] -0.53015626
##  [7,]  0.96016882
##  [8,]  0.64844877
##  [9,]  0.44847768
## [10,] -0.05060306
## [11,] -0.71795993
## [12,] -1.31282373
## [13,]  2.01068509
## [14,] -0.82713400
## [15,]  0.79065873
## [16,] -1.67972083
## [17,]  0.99864486
## [18,]  0.38659369
## [19,]  0.83308038
## [20,]  0.75739245
## [21,] -0.62503657
## [22,]  0.61856160
## [23,] -1.34442782
## [24,]  0.90365445
## [25,]  0.21578567
## attr(,"scaled:center")
## [1] 9.951812
## attr(,"scaled:scale")
## [1] 0.847532

dat$adversity_z <- scale(dat$adversity)
dat$wellbeing_z <- scale(dat$wellbeing)

dat #take a look at the data frame now we've added the z-score columns

##    adversity wellbeing adversity_z wellbeing_z
## 1  10.257202  116.9637  0.36032861  1.19851864
## 2   9.580445  116.1984 -0.43817511  0.24141896
## 3   9.047772  116.1271 -1.06667316  0.15226753
## 4  10.512393  115.3765  0.66142803 -0.78647147
## 5   8.255732  117.4686 -2.00119838  1.82993398
## 6   9.502488  116.0299 -0.53015626  0.03073164
## 7  10.765586  114.7700  0.96016882 -1.54485479
## 8  10.501393  115.2308  0.64844877 -0.96868078
## 9  10.331911  116.6814  0.44847768  0.84546610
## 10  9.908924  115.2496 -0.05060306 -0.94511073
## 11  9.343318  116.4805 -0.71795993  0.59421739
## 12  8.839152  116.1287 -1.31282373  0.15423778
## 13 11.655932  115.1735  2.01068509 -1.04031241
## 14  9.250789  115.0956 -0.82713400 -1.13770964
## 15 10.621921  115.8074  0.79065873 -0.24757879
## 16  8.528195  116.1973 -1.67972083  0.24005915
## 17 10.798195  115.6320  0.99864486 -0.46683056
## 18 10.279463  116.6767  0.38659369  0.83967370
## 19 10.657874  114.4302  0.83308038 -1.96982495
## 20 10.593726  117.0619  0.75739245  1.32137942
## 21  9.422073  117.4563 -0.62503657  1.81461441
## 22 10.476063  115.5271  0.61856160 -0.59809086
## 23  8.812366  116.3504 -1.34442782  0.43151651
## 24 10.717688  115.6888  0.90365445 -0.39583928
## 25 10.134697  116.3310  0.21578567  0.40726904

ggplot(dat, aes(x=adversity_z, y=wellbeing_z)) + geom_point() #Plot a scatter of our z-scores

#If we didn't know anything about our x variable (adversity), our best guess for y (wellbeing) would be the mean.  Lets show that graphically and draw lines to indicate how far off we would be from our observed balues (y - yhat)

ggplot(dat, aes(x = adversity_z, y = wellbeing_z)) + geom_point() +
  geom_hline(yintercept = 0, lty = 2, color = "purple") +
  geom_point(aes(x = adversity_z, y = mean(wellbeing_z)), color="darkorange") +
  geom_segment(aes(x = adversity_z, xend = adversity_z, y = wellbeing_z, yend = mean(wellbeing_z)), color="orange", size = 1)

#We can calculate the average squared length of the line (or difference between y and yhat, aka unexplained variance) as follows:  
sum((dat$wellbeing_z - mean(dat$wellbeing_z))^2) / (nrow(dat)-1) #note that this is our formula for variance! (The n-1 in the denominator is an adjustment for variance in a sample as opposed to the population).

## [1] 1

#Notice that this is the same as using R to calculate the variance in our scores:
var(dat$wellbeing_z)

##      [,1]
## [1,]    1

#Our goal is to get predicted values for wellbeing that reduce the unexplained variance, ie shorten these lines.

# Manually get our correlation --------------------------------------------

#A correlation is basically saying, "on average, how much does y change when x changes by 1". The formula to get our correlation coefficient is to take the sum of squares of the differences between each pair of z values, and divide that by 2*(n - 1).  Lets do that step by step:

dat$zdif <- dat$adversity_z - dat$wellbeing_z #Here is a new column that is the difference between each pair of z values

dat$zdifsq <-  dat$zdif^2 #And now the squared values

sum(dat$zdif) #Remember that the sum of z values should be zero.  Notice here that it is a non-zero, but extremely low value.  This due to rounding errors (which can be overcome in R, but there is no need to in most cases).

## [1] -8.917866e-14

sum(dat$zdifsq) #And here is our sum of squares.  We will use this value to get our correlation coefficient below:

## [1] 70.46815

1 - sum(dat$zdifsq) / (2 * (nrow(dat) - 1)) #this is the formula for Pearson product moment correlation.

## [1] -0.4680864

cor(dat$adversity, dat$wellbeing) #R has a function that does all the above for us.  We'll use this from now on.

## [1] -0.4680864

# Predict our zyhat scores ------------------------------------------------

#Lets plot a line with a slope equal to our r value over our scatter plot
ggplot(dat, aes(x = adversity_z, y = wellbeing_z)) + 
  geom_point() + 
  geom_abline(slope= -.468, color="blue")

  #we can break ggplot geoms over multiple lines using "+" at the end of the line.  Make sure there isn't one at the end of the last line though

#What does that r value mean?  In z-units, it means that we can predict our Zyhat value will change by -0.468 units for each 1 unit change of Zx
    #Zyhat = rxy * zx
dat$wellbeing_zhat  <-  -.468 * dat$adversity_z #Create a column of our predicted zy values.

#Take a look at subject 17, who has almost a score of 1 for adversity_z
dat[17, ] #notice the wellbeing_zhat score: almost -0.468

##    adversity wellbeing adversity_z wellbeing_z     zdif   zdifsq
## 17   10.7982   115.632   0.9986449  -0.4668306 1.465475 2.147618
##    wellbeing_zhat
## 17     -0.4673658

#What if we add these zyhat scores to our previous plot?
ggplot(dat, aes(x = adversity_z, y = wellbeing_z)) + 
  geom_point() + 
  geom_abline(slope= -.468, color="blue") +
  geom_point(aes(x = adversity_z, y = wellbeing_zhat), color="purple", size = 2)

  #Our predicted zy points all fall on the correlation slope!

#How far off are our predicted y values from out observed y values? IE how much unexplained variance are we left with? (aka residuals or errors).  
dat$wellbeing_zhat_resid <- dat$wellbeing_z - dat$wellbeing_zhat #We just calculate the difference between our predicted zy value and our measured y value.

dat #Take another look at our data frame

##    adversity wellbeing adversity_z wellbeing_z       zdif      zdifsq
## 1  10.257202  116.9637  0.36032861  1.19851864 -0.8381900  0.70256253
## 2   9.580445  116.1984 -0.43817511  0.24141896 -0.6795941  0.46184810
## 3   9.047772  116.1271 -1.06667316  0.15226753 -1.2189407  1.48581641
## 4  10.512393  115.3765  0.66142803 -0.78647147  1.4478995  2.09641297
## 5   8.255732  117.4686 -2.00119838  1.82993398 -3.8311324 14.67757515
## 6   9.502488  116.0299 -0.53015626  0.03073164 -0.5608879  0.31459524
## 7  10.765586  114.7700  0.96016882 -1.54485479  2.5050236  6.27514326
## 8  10.501393  115.2308  0.64844877 -0.96868078  1.6171296  2.61510799
## 9  10.331911  116.6814  0.44847768  0.84546610 -0.3969884  0.15759980
## 10  9.908924  115.2496 -0.05060306 -0.94511073  0.8945077  0.80014396
## 11  9.343318  116.4805 -0.71795993  0.59421739 -1.3121773  1.72180931
## 12  8.839152  116.1287 -1.31282373  0.15423778 -1.4670615  2.15226947
## 13 11.655932  115.1735  2.01068509 -1.04031241  3.0509975  9.30858573
## 14  9.250789  115.0956 -0.82713400 -1.13770964  0.3105756  0.09645723
## 15 10.621921  115.8074  0.79065873 -0.24757879  1.0382375  1.07793714
## 16  8.528195  116.1973 -1.67972083  0.24005915 -1.9197800  3.68555515
## 17 10.798195  115.6320  0.99864486 -0.46683056  1.4654754  2.14761819
## 18 10.279463  116.6767  0.38659369  0.83967370 -0.4530800  0.20528150
## 19 10.657874  114.4302  0.83308038 -1.96982495  2.8029053  7.85627826
## 20 10.593726  117.0619  0.75739245  1.32137942 -0.5639870  0.31808130
## 21  9.422073  117.4563 -0.62503657  1.81461441 -2.4396510  5.95189691
## 22 10.476063  115.5271  0.61856160 -0.59809086  1.2166525  1.48024321
## 23  8.812366  116.3504 -1.34442782  0.43151651 -1.7759443  3.15397826
## 24 10.717688  115.6888  0.90365445 -0.39583928  1.2994937  1.68868394
## 25 10.134697  116.3310  0.21578567  0.40726904 -0.1914834  0.03666588
##    wellbeing_zhat wellbeing_zhat_resid
## 1     -0.16863379         1.3671524328
## 2      0.20506595         0.0363530051
## 3      0.49920304        -0.3469355079
## 4     -0.30954832        -0.4769231523
## 5      0.93656084         0.8933731336
## 6      0.24811313        -0.2173814861
## 7     -0.44935901        -1.0954957776
## 8     -0.30347403        -0.6652067520
## 9     -0.20988756         1.0553536538
## 10     0.02368223        -0.9687929605
## 11     0.33600525         0.2582121405
## 12     0.61440151        -0.4601637240
## 13    -0.94100062        -0.0993117908
## 14     0.38709871        -1.5248083541
## 15    -0.37002828         0.1224494948
## 16     0.78610935        -0.5460501981
## 17    -0.46736579         0.0005352385
## 18    -0.18092585         1.0205995541
## 19    -0.38988162        -1.5799433324
## 20    -0.35445967         1.6758390885
## 21     0.29251712         1.5220972936
## 22    -0.28948683        -0.3086040276
## 23     0.62919222        -0.1976757125
## 24    -0.42291028         0.0270710077
## 25    -0.10098770         0.5082567330

#Lets visualize the size of the residuals
last_plot() + geom_segment(aes(x = adversity_z + .025, xend = adversity_z + .025, y = wellbeing_z, yend = wellbeing_zhat), color="green", size = 1.2) #note here that I have shifted the x values by .025 so that they don't overlap our earlier lines

  #Note: When using last_plot(), it is referencing the last plot R created, not necessarily what is above this in your script editor.  IE if you run the script out of order or create another ggplot on your own, it will try to use that most recent one!

#Compare that the the size of the residuals (or variance) we would have had if we didn't know anything about adversity, and were making our guess based on the mean of wellbeing:
last_plot() + 
  geom_hline(yintercept = 0, lty = 2, color = "purple") + #Here is a line showing our mean of wellbeing
  geom_segment(aes(x = adversity_z, xend = adversity_z, y = wellbeing_z, yend = 0), color="orange", size = 1.2)

#In most cases, the green lines are shorter than the orange lines. Quantifying how much shorter overall is our R^2 value:

1 - sum(dat$wellbeing_zhat_resid^2) / sum(dat$wellbeing_z^2) #variance accounted for.  How much have we shortened the lines?

## [1] 0.2191049

#This is equal to our correlation coefficient squared:
cor(dat$adversity, dat$wellbeing)^2

## [1] 0.2191049

sum(dat$wellbeing_zhat_resid^2) / sum(dat$wellbeing_z^2) #Variance remaining (error / residuals). How much line is left?

## [1] 0.7808951

# Here is our full plot all in one place if you want to play --------

#Plot our predicted wellbeing_z values
ggplot(dat, aes(x = adversity_z, y = wellbeing_z)) + geom_point() +
  geom_point(aes(x = adversity_z, y = wellbeing_zhat), color="purple", size = 2) + #add our predicted wellbeing_z values
  geom_abline(slope= -.468, color="blue") + #add a line with a slope equal to r
  geom_segment(aes(x = adversity_z + .025, xend = adversity_z + .025, y = wellbeing_z, yend = wellbeing_zhat), color="green", size = 1) +
  geom_hline(yintercept = 0, lty = 2, color = "purple") +
  geom_segment(aes(x = adversity_z, xend = adversity_z, y = wellbeing_z, yend = 0), color="orange", size = 1) +

  geom_abline(slope = .468, color = "red", lty=3) + #an example of what if we had a bad prediction
  geom_segment(aes(x = adversity_z - .02, xend = adversity_z - .02, y = wellbeing_z, yend = wellbeing_zhat*(-1)), color="red")

08_-_correlation__regression-_Zadra_Example.R

Zadra

Mon Sep 26 15:03:27 2016