Correlation and Simple Linear Regression Analysis

Load necessary packages:

library(pander)
library(ggpubr)

## Loading required package: ggplot2

library(outliers)

$~$

CORRELATION EXAMPLE: Create the data vectors and the data frame manually in RStudio:

# Build the paired sample as a data frame first, then expose each column
# under its own top-level name as well.
cor.data <- data.frame(
  experience = c(10, 12, 8, 15, 6, 11, 14, 16, 15, 12),
  salaries = c(98, 95, 97, 110, 88, 102, 120, 128, 105, 104)
)
experience <- cor.data$experience
salaries <- cor.data$salaries

$~$

Check for outliers:

# Draw the boxplot of experience (outlier points, if any, in red and enlarged)
# and keep the statistics list that boxplot() returns.
plot1 <- boxplot(cor.data[["experience"]], cex = 1.5, outcol = "red")

# Show the returned statistics; `out` lists any points flagged as outliers.
print(plot1)

## $stats
##      [,1]
## [1,]    6
## [2,]   10
## [3,]   12
## [4,]   15
## [5,]   16
## 
## $n
## [1] 10
## 
## $conf
##           [,1]
## [1,]  9.501801
## [2,] 14.498199
## 
## $out
## numeric(0)
## 
## $group
## numeric(0)
## 
## $names
## [1] ""

# Draw the boxplot of salaries (outlier points, if any, in red and enlarged)
# and keep the statistics list that boxplot() returns.
plot2 <- boxplot(cor.data[["salaries"]], cex = 1.5, outcol = "red")

# Show the returned statistics; `out` lists any points flagged as outliers.
print(plot2)

## $stats
##      [,1]
## [1,]   88
## [2,]   97
## [3,]  103
## [4,]  110
## [5,]  128
## 
## $n
## [1] 10
## 
## $conf
##           [,1]
## [1,]  96.50468
## [2,] 109.49532
## 
## $out
## numeric(0)
## 
## $group
## numeric(0)
## 
## $names
## [1] ""

$~$

Test for normality of data:

# Shapiro-Wilk normality test on the experience column. The argument expression
# is echoed verbatim in the "data:" line of the printed result.
shapiro.test(cor.data$experience)

## 
##  Shapiro-Wilk normality test
## 
## data:  cor.data$experience
## W = 0.94701, p-value = 0.6333

# Shapiro-Wilk normality test on the salaries column. The argument expression
# is echoed verbatim in the "data:" line of the printed result.
shapiro.test(cor.data$salaries)

## 
##  Shapiro-Wilk normality test
## 
## data:  cor.data$salaries
## W = 0.94407, p-value = 0.5991

$~$

Perform Correlation Analysis:

# Pearson correlation test between experience and salaries; the result object
# is stored so it can be formatted on the next line.
pearson.cor <- cor.test(cor.data$experience, cor.data$salaries, method = "pearson")
# Render the stored test result as a table with pander.
pander(pearson.cor)

Pearson’s product-moment correlation: `cor.data$experience` and `cor.data$salaries`
Test statistic	df	P value	Alternative hypothesis	cor
4.086	8	0.003504 * *	two.sided	0.8222

$~$

SIMPLE LINEAR REGRESSION ANALYSIS EXAMPLE: Create the data vectors and the data frame manually in RStudio:

# Build the regression sample as a data frame first, then expose each column
# under its own top-level name (the base scatterplot uses them directly).
reg.data <- data.frame(
  noise = c(28, 33, 21, 35, 29, 26, 22, 30, 34, 27, 31, 34, 29),
  hypertension = c(73, 68, 69, 88, 80, 74, 74, 69, 89, 68, 76, 87, 73)
)
noise <- reg.data$noise
hypertension <- reg.data$hypertension
# Display the assembled data frame.
print(reg.data)

##    noise hypertension
## 1     28           73
## 2     33           68
## 3     21           69
## 4     35           88
## 5     29           80
## 6     26           74
## 7     22           74
## 8     30           69
## 9     34           89
## 10    27           68
## 11    31           76
## 12    34           87
## 13    29           73

$~$

Check for outliers:

# Draw the boxplot of noise (outlier points, if any, in red and enlarged)
# and keep the statistics list that boxplot() returns.
plot1 <- boxplot(reg.data[["noise"]], cex = 1.5, outcol = "red")

# Show the returned statistics; `out` lists any points flagged as outliers.
print(plot1)

## $stats
##      [,1]
## [1,]   21
## [2,]   27
## [3,]   29
## [4,]   33
## [5,]   35
## 
## $n
## [1] 13
## 
## $conf
##          [,1]
## [1,] 26.37072
## [2,] 31.62928
## 
## $out
## numeric(0)
## 
## $group
## numeric(0)
## 
## $names
## [1] ""

# Draw the boxplot of hypertension (outlier points, if any, in red and
# enlarged) and keep the statistics list that boxplot() returns.
plot2 <- boxplot(reg.data[["hypertension"]], cex = 1.5, outcol = "red")

# Show the returned statistics; `out` lists any points flagged as outliers.
print(plot2)

## $stats
##      [,1]
## [1,]   68
## [2,]   69
## [3,]   74
## [4,]   80
## [5,]   89
## 
## $n
## [1] 13
## 
## $conf
##          [,1]
## [1,] 69.17966
## [2,] 78.82034
## 
## $out
## numeric(0)
## 
## $group
## numeric(0)
## 
## $names
## [1] ""

$~$

Create a scatterplot of the data:

# Base-graphics scatterplot of hypertension against noise with explicit
# title and axis labels.
plot(
  x = noise,
  y = hypertension,
  main = "Scatterplot of Data",
  xlab = "Noise Level",
  ylab = "Hypertension"
)

# Scatterplot of hypertension against noise with a fitted regression line.
# ggscatter() has no `method` argument (it was ignored with a warning); the
# correlation annotation is requested with `cor.coef` and its type is chosen
# with `cor.method`.
ggscatter(
  reg.data,
  x = "noise", y = "hypertension",
  xlab = "Noise Level", ylab = "Hypertension",
  add = "reg.line",
  cor.coef = TRUE, cor.method = "pearson"
)

## `geom_smooth()` using formula 'y ~ x'

$~$

Perform the Simple Linear Regression analysis:

# Fit hypertension as a linear function of noise. The formula uses bare column
# names that are looked up in `data`, so the slope is labelled `noise` and
# predict() can be supplied with new data for that column.
reg.model <- lm(hypertension ~ noise, data = reg.data)
# Print the coefficient table, residual summary, and overall fit statistics.
summary(reg.model)

## 
## Call:
## lm(formula = hypertension ~ noise, data = reg.data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -12.082  -2.837   1.347   5.592   7.857 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  45.0612    12.1720   3.702  0.00349 **
## noise         1.0612     0.4132   2.569  0.02612 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.316 on 11 degrees of freedom
## Multiple R-squared:  0.3749, Adjusted R-squared:  0.3181 
## F-statistic: 6.597 on 1 and 11 DF,  p-value: 0.02612