require(tidyverse) #if you don't have this, use install.packages("tidyverse")
## Loading required package: tidyverse
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
theme_set(theme_gray(base_size = 18)) #Set our default font size for all the plots we produce with ggplot()
load("C:/Users/Jon/Documents/Classes/quant methods I (6500 5500)/data/08 - correlation & regression/08 - correlation & regression - Zadra Example (without tidyr).Rdata") #adjust the path to the location of your .Rdata file
# First look at our data ----------------------------------------------------------
dat <- data.frame("adversity" = x, "wellbeing" = y) #Put our vectors for adversity and wellbeing into a data frame
dat #take a look at the data frame.
## adversity wellbeing
## 1 10.257202 116.9637
## 2 9.580445 116.1984
## 3 9.047772 116.1271
## 4 10.512393 115.3765
## 5 8.255732 117.4686
## 6 9.502488 116.0299
## 7 10.765586 114.7700
## 8 10.501393 115.2308
## 9 10.331911 116.6814
## 10 9.908924 115.2496
## 11 9.343318 116.4805
## 12 8.839152 116.1287
## 13 11.655932 115.1735
## 14 9.250789 115.0956
## 15 10.621921 115.8074
## 16 8.528195 116.1973
## 17 10.798195 115.6320
## 18 10.279463 116.6767
## 19 10.657874 114.4302
## 20 10.593726 117.0619
## 21 9.422073 117.4563
## 22 10.476063 115.5271
## 23 8.812366 116.3504
## 24 10.717688 115.6888
## 25 10.134697 116.3310
#Take a look at our univariate data
ggplot(dat, aes(x=adversity)) + geom_density() + geom_rug() #geom_rug() gives us the ticks along the axes

ggplot(dat, aes(x=wellbeing)) + geom_density() + geom_rug()

#Bivariate scatter plot
ggplot(dat, aes(x=adversity, y=wellbeing)) + geom_point()

# Calculate Z-scores --------------------------------------------------------
#In R, the scale() function converts a vector to z-scores.
scale(dat$adversity) #notice that it also attaches attributes for the original mean (9.951) and sd (0.847). These won't get in your way for other operations on the vector, but they are available if we need them for something else.
## [,1]
## [1,] 0.36032861
## [2,] -0.43817511
## [3,] -1.06667316
## [4,] 0.66142803
## [5,] -2.00119838
## [6,] -0.53015626
## [7,] 0.96016882
## [8,] 0.64844877
## [9,] 0.44847768
## [10,] -0.05060306
## [11,] -0.71795993
## [12,] -1.31282373
## [13,] 2.01068509
## [14,] -0.82713400
## [15,] 0.79065873
## [16,] -1.67972083
## [17,] 0.99864486
## [18,] 0.38659369
## [19,] 0.83308038
## [20,] 0.75739245
## [21,] -0.62503657
## [22,] 0.61856160
## [23,] -1.34442782
## [24,] 0.90365445
## [25,] 0.21578567
## attr(,"scaled:center")
## [1] 9.951812
## attr(,"scaled:scale")
## [1] 0.847532
dat$adversity_z <- scale(dat$adversity)
dat$wellbeing_z <- scale(dat$wellbeing)
dat #take a look at the data frame now we've added the z-score columns
## adversity wellbeing adversity_z wellbeing_z
## 1 10.257202 116.9637 0.36032861 1.19851864
## 2 9.580445 116.1984 -0.43817511 0.24141896
## 3 9.047772 116.1271 -1.06667316 0.15226753
## 4 10.512393 115.3765 0.66142803 -0.78647147
## 5 8.255732 117.4686 -2.00119838 1.82993398
## 6 9.502488 116.0299 -0.53015626 0.03073164
## 7 10.765586 114.7700 0.96016882 -1.54485479
## 8 10.501393 115.2308 0.64844877 -0.96868078
## 9 10.331911 116.6814 0.44847768 0.84546610
## 10 9.908924 115.2496 -0.05060306 -0.94511073
## 11 9.343318 116.4805 -0.71795993 0.59421739
## 12 8.839152 116.1287 -1.31282373 0.15423778
## 13 11.655932 115.1735 2.01068509 -1.04031241
## 14 9.250789 115.0956 -0.82713400 -1.13770964
## 15 10.621921 115.8074 0.79065873 -0.24757879
## 16 8.528195 116.1973 -1.67972083 0.24005915
## 17 10.798195 115.6320 0.99864486 -0.46683056
## 18 10.279463 116.6767 0.38659369 0.83967370
## 19 10.657874 114.4302 0.83308038 -1.96982495
## 20 10.593726 117.0619 0.75739245 1.32137942
## 21 9.422073 117.4563 -0.62503657 1.81461441
## 22 10.476063 115.5271 0.61856160 -0.59809086
## 23 8.812366 116.3504 -1.34442782 0.43151651
## 24 10.717688 115.6888 0.90365445 -0.39583928
## 25 10.134697 116.3310 0.21578567 0.40726904
ggplot(dat, aes(x=adversity_z, y=wellbeing_z)) + geom_point() #Plot a scatter of our z-scores

#If we didn't know anything about our x variable (adversity), our best guess for y (wellbeing) would be the mean. Lets show that graphically and draw lines to indicate how far off we would be from our observed balues (y - yhat)
ggplot(dat, aes(x = adversity_z, y = wellbeing_z)) + geom_point() +
geom_hline(yintercept = 0, lty = 2, color = "purple") +
geom_point(aes(x = adversity_z, y = mean(wellbeing_z)), color="darkorange") +
geom_segment(aes(x = adversity_z, xend = adversity_z, y = wellbeing_z, yend = mean(wellbeing_z)), color="orange", size = 1)

#We can calculate the average squared length of the line (or difference between y and yhat, aka unexplained variance) as follows:
sum((dat$wellbeing_z - mean(dat$wellbeing_z))^2) / (nrow(dat)-1) #note that this is our formula for variance! (The n-1 in the denominator is an adjustment for variance in a sample as opposed to the population).
## [1] 1
#Notice that this is the same as using R to calculate the variance in our scores:
var(dat$wellbeing_z)
## [,1]
## [1,] 1
#Our goal is to get predicted values for wellbeing that reduce the unexplained variance, ie shorten these lines.
# Manually get our correlation --------------------------------------------
#A correlation is basically saying, "on average, how much does y change when x changes by 1". The formula to get our correlation coefficient is to take the sum of squares of the differences between each pair of z values, and divide that by 2*(n - 1). Lets do that step by step:
dat$zdif <- dat$adversity_z - dat$wellbeing_z #Here is a new column that is the difference between each pair of z values
dat$zdifsq <- dat$zdif^2 #And now the squared values
sum(dat$zdif) #Remember that the sum of z values should be zero. Notice here that it is a non-zero, but extremely low value. This due to rounding errors (which can be overcome in R, but there is no need to in most cases).
## [1] -8.917866e-14
sum(dat$zdifsq) #And here is our sum of squares. We will use this value to get our correlation coefficient below:
## [1] 70.46815
1 - sum(dat$zdifsq) / (2 * (nrow(dat) - 1)) #this is the formula for Pearson product moment correlation.
## [1] -0.4680864
cor(dat$adversity, dat$wellbeing) #R has a function that does all the above for us. We'll use this from now on.
## [1] -0.4680864
# Predict our zyhat scores ------------------------------------------------
#Lets plot a line with a slope equal to our r value over our scatter plot
ggplot(dat, aes(x = adversity_z, y = wellbeing_z)) +
geom_point() +
geom_abline(slope= -.468, color="blue")

#we can break ggplot geoms over multiple lines using "+" at the end of the line. Make sure there isn't one at the end of the last line though
#What does that r value mean? In z-units, it means that we can predict our Zyhat value will change by -0.468 units for each 1 unit change of Zx
#Zyhat = rxy * zx
dat$wellbeing_zhat <- -.468 * dat$adversity_z #Create a column of our predicted zy values.
#Take a look at subject 17, who has almost a score of 1 for adversity_z
dat[17, ] #notice the wellbeing_zhat score: almost -0.468
## adversity wellbeing adversity_z wellbeing_z zdif zdifsq
## 17 10.7982 115.632 0.9986449 -0.4668306 1.465475 2.147618
## wellbeing_zhat
## 17 -0.4673658
#What if we add these zyhat scores to our previous plot?
ggplot(dat, aes(x = adversity_z, y = wellbeing_z)) +
geom_point() +
geom_abline(slope= -.468, color="blue") +
geom_point(aes(x = adversity_z, y = wellbeing_zhat), color="purple", size = 2)

#Our predicted zy points all fall on the correlation slope!
#How far off are our predicted y values from out observed y values? IE how much unexplained variance are we left with? (aka residuals or errors).
dat$wellbeing_zhat_resid <- dat$wellbeing_z - dat$wellbeing_zhat #We just calculate the difference between our predicted zy value and our measured y value.
dat #Take another look at our data frame
## adversity wellbeing adversity_z wellbeing_z zdif zdifsq
## 1 10.257202 116.9637 0.36032861 1.19851864 -0.8381900 0.70256253
## 2 9.580445 116.1984 -0.43817511 0.24141896 -0.6795941 0.46184810
## 3 9.047772 116.1271 -1.06667316 0.15226753 -1.2189407 1.48581641
## 4 10.512393 115.3765 0.66142803 -0.78647147 1.4478995 2.09641297
## 5 8.255732 117.4686 -2.00119838 1.82993398 -3.8311324 14.67757515
## 6 9.502488 116.0299 -0.53015626 0.03073164 -0.5608879 0.31459524
## 7 10.765586 114.7700 0.96016882 -1.54485479 2.5050236 6.27514326
## 8 10.501393 115.2308 0.64844877 -0.96868078 1.6171296 2.61510799
## 9 10.331911 116.6814 0.44847768 0.84546610 -0.3969884 0.15759980
## 10 9.908924 115.2496 -0.05060306 -0.94511073 0.8945077 0.80014396
## 11 9.343318 116.4805 -0.71795993 0.59421739 -1.3121773 1.72180931
## 12 8.839152 116.1287 -1.31282373 0.15423778 -1.4670615 2.15226947
## 13 11.655932 115.1735 2.01068509 -1.04031241 3.0509975 9.30858573
## 14 9.250789 115.0956 -0.82713400 -1.13770964 0.3105756 0.09645723
## 15 10.621921 115.8074 0.79065873 -0.24757879 1.0382375 1.07793714
## 16 8.528195 116.1973 -1.67972083 0.24005915 -1.9197800 3.68555515
## 17 10.798195 115.6320 0.99864486 -0.46683056 1.4654754 2.14761819
## 18 10.279463 116.6767 0.38659369 0.83967370 -0.4530800 0.20528150
## 19 10.657874 114.4302 0.83308038 -1.96982495 2.8029053 7.85627826
## 20 10.593726 117.0619 0.75739245 1.32137942 -0.5639870 0.31808130
## 21 9.422073 117.4563 -0.62503657 1.81461441 -2.4396510 5.95189691
## 22 10.476063 115.5271 0.61856160 -0.59809086 1.2166525 1.48024321
## 23 8.812366 116.3504 -1.34442782 0.43151651 -1.7759443 3.15397826
## 24 10.717688 115.6888 0.90365445 -0.39583928 1.2994937 1.68868394
## 25 10.134697 116.3310 0.21578567 0.40726904 -0.1914834 0.03666588
## wellbeing_zhat wellbeing_zhat_resid
## 1 -0.16863379 1.3671524328
## 2 0.20506595 0.0363530051
## 3 0.49920304 -0.3469355079
## 4 -0.30954832 -0.4769231523
## 5 0.93656084 0.8933731336
## 6 0.24811313 -0.2173814861
## 7 -0.44935901 -1.0954957776
## 8 -0.30347403 -0.6652067520
## 9 -0.20988756 1.0553536538
## 10 0.02368223 -0.9687929605
## 11 0.33600525 0.2582121405
## 12 0.61440151 -0.4601637240
## 13 -0.94100062 -0.0993117908
## 14 0.38709871 -1.5248083541
## 15 -0.37002828 0.1224494948
## 16 0.78610935 -0.5460501981
## 17 -0.46736579 0.0005352385
## 18 -0.18092585 1.0205995541
## 19 -0.38988162 -1.5799433324
## 20 -0.35445967 1.6758390885
## 21 0.29251712 1.5220972936
## 22 -0.28948683 -0.3086040276
## 23 0.62919222 -0.1976757125
## 24 -0.42291028 0.0270710077
## 25 -0.10098770 0.5082567330
#Lets visualize the size of the residuals
last_plot() + geom_segment(aes(x = adversity_z + .025, xend = adversity_z + .025, y = wellbeing_z, yend = wellbeing_zhat), color="green", size = 1.2) #note here that I have shifted the x values by .025 so that they don't overlap our earlier lines

#Note: When using last_plot(), it is referencing the last plot R created, not necessarily what is above this in your script editor. IE if you run the script out of order or create another ggplot on your own, it will try to use that most recent one!
#Compare that the the size of the residuals (or variance) we would have had if we didn't know anything about adversity, and were making our guess based on the mean of wellbeing:
last_plot() +
geom_hline(yintercept = 0, lty = 2, color = "purple") + #Here is a line showing our mean of wellbeing
geom_segment(aes(x = adversity_z, xend = adversity_z, y = wellbeing_z, yend = 0), color="orange", size = 1.2)

#In most cases, the green lines are shorter than the orange lines. Quantifying how much shorter overall is our R^2 value:
1 - sum(dat$wellbeing_zhat_resid^2) / sum(dat$wellbeing_z^2) #variance accounted for. How much have we shortened the lines?
## [1] 0.2191049
#This is equal to our correlation coefficient squared:
cor(dat$adversity, dat$wellbeing)^2
## [1] 0.2191049
sum(dat$wellbeing_zhat_resid^2) / sum(dat$wellbeing_z^2) #Variance remaining (error / residuals). How much line is left?
## [1] 0.7808951
# Here is our full plot all in one place if you want to play --------
#Plot our predicted wellbeing_z values
ggplot(dat, aes(x = adversity_z, y = wellbeing_z)) + geom_point() +
geom_point(aes(x = adversity_z, y = wellbeing_zhat), color="purple", size = 2) + #add our predicted wellbeing_z values
geom_abline(slope= -.468, color="blue") + #add a line with a slope equal to r
geom_segment(aes(x = adversity_z + .025, xend = adversity_z + .025, y = wellbeing_z, yend = wellbeing_zhat), color="green", size = 1) +
geom_hline(yintercept = 0, lty = 2, color = "purple") +
geom_segment(aes(x = adversity_z, xend = adversity_z, y = wellbeing_z, yend = 0), color="orange", size = 1) +
geom_abline(slope = .468, color = "red", lty=3) + #an example of what if we had a bad prediction
geom_segment(aes(x = adversity_z - .02, xend = adversity_z - .02, y = wellbeing_z, yend = wellbeing_zhat*(-1)), color="red")
