#the course url has a very detailed explanation of
#correlation analysis
#https://onlinecourses.science.psu.edu/stat501/node/284
# Download the IQ / brain-size dataset from the course site into a data frame.
# header = TRUE: the first line of the file supplies the column names.
iqSize <- read.table(
  file = "https://onlinecourses.science.psu.edu/stat501/sites/onlinecourses.science.psu.edu.stat501/files/data/iqsize.txt",
  header = TRUE
)
# Peek at the first three rows to confirm the import worked.
# Note: this is an American dataset, so the participants' weight is in pounds, not kilos!
head(x = iqSize, n = 3)
## PIQ Brain Height Weight
## 1 124 81.69 64.5 118
## 2 150 103.84 73.3 143
## 3 128 96.54 68.8 172
# Show the compact structure of the data frame: its dimensions plus the type
# and first few values of each column.
str(object = iqSize)
## 'data.frame': 38 obs. of 4 variables:
## $ PIQ : int 124 150 128 134 110 131 98 84 147 124 ...
## $ Brain : num 81.7 103.8 96.5 95.2 92.9 ...
## $ Height: num 64.5 73.3 68.8 65 69 64.5 66 66.3 68.8 64.5 ...
## $ Weight: int 118 143 172 147 146 138 175 134 172 118 ...
# Print descriptive statistics for every column of the dataset
# (minimum, quartiles, median, mean and maximum).
summary(object = iqSize)
## PIQ Brain Height Weight
## Min. : 72.00 Min. : 79.06 Min. :62.00 Min. :106.0
## 1st Qu.: 89.25 1st Qu.: 85.48 1st Qu.:66.00 1st Qu.:135.2
## Median :115.00 Median : 90.54 Median :68.00 Median :146.5
## Mean :111.34 Mean : 90.68 Mean :68.42 Mean :151.1
## 3rd Qu.:128.00 3rd Qu.: 94.95 3rd Qu.:70.38 3rd Qu.:172.0
## Max. :150.00 Max. :107.95 Max. :77.00 Max. :192.0
# Draw every pairwise scatterplot of the data frame in a single figure so the
# relationships among all variables can be inspected at once.
# Height and weight, for example, suggest a positive correlation
# (panel in the 4th column, 3rd row from the top).
plot(x = iqSize)
# Compute the pairwise correlation matrix of all columns
# (Pearson, which is the default method of cor()).
corr <- cor(x = iqSize, method = "pearson")
# Print the matrix to inspect the coefficients.
corr
## PIQ Brain Height Weight
## PIQ 1.000000000 0.3778155 -0.09315559 0.002512154
## Brain 0.377815463 1.0000000 0.58836684 0.513486971
## Height -0.093155590 0.5883668 1.00000000 0.699614004
## Weight 0.002512154 0.5134870 0.69961400 1.000000000
#install ggcorrplot if needed
#if(!require(devtools)) install.packages("devtools")
#devtools::install_github("kassambara/ggcorrplot")
#load visualization libraries
library(ggplot2)

library(ggcorrplot)
# Plot the correlation matrix with the default settings.
ggcorrplot(corr = corr)

# Lower triangle only, with the coefficients printed in each cell and the
# variables reordered using hierarchical clustering.
ggcorrplot(
  corr = corr,
  type = "lower",
  lab = TRUE,
  hc.order = TRUE
)

# The same labelled lower-triangle view, drawn with circles.
ggcorrplot(
  corr = corr,
  method = "circle",
  type = "lower",
  lab = TRUE
)

#You can see a moderate to strong correlation between height and weight.
#Now you have your correlation matrix with the corresponding correlation coefficients for easy visualization
# To continue the example from the Stat 501 page
# (https://onlinecourses.science.psu.edu/stat501/node/284) and obtain the
# regression equation, residuals and R-squared, fit the model with lm().
# This mirrors the analysis shown there with Minitab Statistical Software.
fit <- lm(formula = PIQ ~ Brain + Height + Weight, data = iqSize)
# Report the residual summary, coefficient table, R-squared and overall F-test.
summary(object = fit)
##
## Call:
## lm(formula = PIQ ~ Brain + Height + Weight, data = iqSize)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32.74 -12.09 -3.84 14.17 51.69
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.114e+02 6.297e+01 1.768 0.085979 .
## Brain 2.060e+00 5.634e-01 3.657 0.000856 ***
## Height -2.732e+00 1.229e+00 -2.222 0.033034 *
## Weight 5.599e-04 1.971e-01 0.003 0.997750
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.79 on 34 degrees of freedom
## Multiple R-squared: 0.2949, Adjusted R-squared: 0.2327
## F-statistic: 4.741 on 3 and 34 DF, p-value: 0.007215
# Variance-covariance matrix of the estimated regression coefficients.
vcov(object = fit)
## (Intercept) Brain Height Weight
## (Intercept) 3965.358990 -8.11822511 -58.9794977 5.40551851
## Brain -8.118225 0.31747232 -0.2588663 -0.01957544
## Height -58.979498 -0.25886634 1.5114948 -0.13879702
## Weight 5.405519 -0.01957544 -0.1387970 0.03883506