if echo = false, then no code will show in output

Packages

Loading required packages:

library(ggplot2)
library(dplyr)
#install.packages("zoo")
library(ggstatsplot)
library(gridExtra)
library(car)
library(lmtest)

Setting global theme

theme_set(theme_bw())

Data

data <- data.frame(
  x = c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100),
  y = c(15, 30, 45, 60, 75, 85, 95, 105, 110, 120),
  z = c(30, NA, 40, 45, 30, 50, 30, 40, 60, 70),
  m = -c(30, NA, 40, 45, 30, 50, 30, 40, 60, 70),
  cat = rep(c("A", "B"), each = 5)
)


ggplot(data, aes( x= x, y= y))+
  geom_point()

Correlation function

cor(data$x, data$y)       # method = "pearson" is default
## [1] 0.9903356
cor(data$x, data$z, use = "complete.obs")    # for missing value (use = "complete.obs")
## [1] 0.6767234
cor(data$x, data$m, use = "complete.obs")   # z and m are opposite (-) negative
## [1] -0.6767234
cor(data %>% select(where(is.numeric)), use = "complete.obs") %>%
  round(3)
##        x      y      z      m
## x  1.000  0.989  0.677 -0.677
## y  0.989  1.000  0.610 -0.610
## z  0.677  0.610  1.000 -1.000
## m -0.677 -0.610 -1.000  1.000
# in data there is a categorical vector. we have to transfer is numeric value because in pearson correlation all value need to numeric

data %>%
  select(where(is.numeric)) %>%
  cor( use = "complete.obs") %>%       # same as before
  round(3)
##        x      y      z      m
## x  1.000  0.989  0.677 -0.677
## y  0.989  1.000  0.610 -0.610
## z  0.677  0.610  1.000 -1.000
## m -0.677 -0.610 -1.000  1.000
data %>%
  select(where(is.numeric)) %>%
  cor( use = "complete.obs", method = "spearman") %>%       # spearman method
  round(3)
##        x      y      z      m
## x  1.000  1.000  0.604 -0.604
## y  1.000  1.000  0.604 -0.604
## z  0.604  0.604  1.000 -1.000
## m -0.604 -0.604 -1.000  1.000
data %>%
  select(where(is.numeric)) %>%
  cor( use = "complete.obs", method = "kendall") %>%       # kendall method
  round(3)
##       x     y     z     m
## x  1.00  1.00  0.53 -0.53
## y  1.00  1.00  0.53 -0.53
## z  0.53  0.53  1.00 -1.00
## m -0.53 -0.53 -1.00  1.00

ggstatplot

ggcorrmat(
  data = data,
  colors = c("blue", "white", "green"),
  title = "Correlogram for simulated data", 
  matrix.type = "lower", 
  type = "parametric", pch = ""
)

Scatterplot

#cor(data, method = "pearson")
#plot(data$x, data$y, main = "Scatterplot X vs Y",
     #xlab = "X variable", ylab = "Y variable", pch =19, color = "blue")

Example of correlation

set.seed(42)

X <- seq(-10, 10, length.out = 30)
X
##  [1] -10.0000000  -9.3103448  -8.6206897  -7.9310345  -7.2413793  -6.5517241
##  [7]  -5.8620690  -5.1724138  -4.4827586  -3.7931034  -3.1034483  -2.4137931
## [13]  -1.7241379  -1.0344828  -0.3448276   0.3448276   1.0344828   1.7241379
## [19]   2.4137931   3.1034483   3.7931034   4.4827586   5.1724138   5.8620690
## [25]   6.5517241   7.2413793   7.9310345   8.6206897   9.3103448  10.0000000
# Create different relationship
Linear <- 2 * X + rnorm(30, mean = 0, sd = 1)
Sine <- sin(X) + rnorm(30, mean = 0, sd = .2)
Exponential <- exp(X/ 10) + rnorm(30, mean = 0, sd = .2)
Quadratic <- X^2 + rnorm(30, mean = 0, sd = 5)
Logarithmic <- log(abs(X) + 1) + rnorm(30, mean = 0, sd = .2)

df <- data.frame( X, Sine, Exponential, Quadratic, Logarithmic, Linear)
head(df)
##            X        Sine Exponential Quadratic Logarithmic    Linear
## 1 -10.000000  0.63511114   0.2944325 106.96058    2.099170 -18.62904
## 2  -9.310345  0.02678392   0.4311919  84.30165    2.039061 -19.18539
## 3  -8.620690 -0.51317773   0.5386522  77.56803    2.288856 -16.87825
## 4  -7.931034 -1.11881817   0.7323859  69.85686    1.990204 -15.22921
## 5  -7.241379 -0.71716343   0.3392839  46.88363    2.108803 -14.07849
## 6  -6.551724 -0.60872466   0.7798610  38.62113    1.936124 -13.20957
# Scatter plot for sine
p1 <- ggplot(df, aes(x= X, y= Sine)) +
  geom_point(color = "blue") +
  geom_smooth(method = "loess", color = "red") + 
  ggtitle("X vs Sine (Non-Linear)")

# Scatter plot for Exponential
p2 <- ggplot(df, aes(x= X, y= Exponential)) +
  geom_point(color = "green") +
  geom_smooth(method = "loess", color = "red") + 
  ggtitle("X vs Exponential (Non-Linear)")

# Scatter plot for Quadratic
p3 <- ggplot(df, aes(x= X, y= Quadratic)) +
  geom_point(color = "purple") +
  geom_smooth(method = "loess", color = "red") + 
  ggtitle("X vs Quadratic (Non-Linear)")


# Scatter plot for Logarithmic
p4 <- ggplot(df, aes(x= X, y= Logarithmic)) +
  geom_point(color = "yellow") +
  geom_smooth(method = "loess", color = "red") + 
  ggtitle("X vs Logarithmic (Non-Linear)")


# Scatter plot for Linear
p5 <- ggplot(df, aes(x= X, y= Linear)) +
  geom_point(color = "black") +
  geom_smooth(method = "loess", color = "red") + 
  ggtitle("X vs Linear (Non-Linear)")

# Arrange the plot in a grid
grid.arrange(p1, p2, p3, p4, p5, ncol= 2)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

Regression

Data

set.seed(42)

n <- 50
x1 <- runif(n, 1, 100)
x2 <- rnorm(n, 0, 5)
y <- 5 + 2*x1 -3*x2 + rnorm(n, 0, 10)

df <- round(data.frame(y, x1, x2), 2)

head(df)
##        y    x1    x2
## 1 200.40 91.57 -2.15
## 2 204.08 93.77 -1.29
## 3  94.74 29.33 -8.82
## 4 155.67 83.21  2.30
## 5 132.67 64.53 -3.20
## 6 118.08 52.39  2.28

Model

model <- lm(y ~ x1 +x2, data = df)
summary(model)
## 
## Call:
## lm(formula = y ~ x1 + x2, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.2545  -6.1818  -0.1287   4.4273  20.2253 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 12.69609    2.83610   4.477 4.81e-05 ***
## x1           1.86519    0.04218  44.217  < 2e-16 ***
## x2          -3.05519    0.26124 -11.695 1.62e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.876 on 47 degrees of freedom
## Multiple R-squared:  0.9782, Adjusted R-squared:  0.9772 
## F-statistic:  1053 on 2 and 47 DF,  p-value: < 2.2e-16

Residuals vs fitted plot

plot(model$fitted.values, resid(model),
     main = "Residuals vs Fitted",
     xlab = "Fitted value",
     ylabs = "Residuals",
     pch = 19)
## Warning in plot.window(...): "ylabs" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "ylabs" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "ylabs" is not a
## graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "ylabs" is not a
## graphical parameter
## Warning in box(...): "ylabs" is not a graphical parameter
## Warning in title(...): "ylabs" is not a graphical parameter
abline(h = 0, col = "red")

cbind(df, predicted = predict(model, df), resid(model))
##         y    x1     x2  predicted resid(model)
## 1  200.40 91.57  -2.15 190.060609   10.3393913
## 2  204.08 93.77  -1.29 191.536572   12.5434277
## 3   94.74 29.33  -8.82  94.349028    0.3909719
## 4  155.67 83.21   2.30 160.871984   -5.2019836
## 5  132.67 64.53  -3.20 142.833701  -10.1637007
## 6  118.08 52.39   2.28 103.447794   14.6322060
## 7  144.85 73.92   3.52 139.816994    5.0330058
## 8   19.02 14.33   5.18  23.598439   -4.5784392
## 9  145.01 66.04  -3.04 145.161314   -0.1513138
## 10 127.09 70.80   2.52 137.052778   -9.9627783
## 11 129.51 46.32  -8.59 125.335988    4.1740118
## 12 158.98 72.19  -3.92 159.320828   -0.3408277
## 13 203.00 93.53  -4.25 200.132291    2.8677094
## 14 103.12 26.29 -12.07  98.608207    4.5117928
## 15 106.21 46.77   0.18  99.381302    6.8286981
## 16 203.95 94.06   1.03 184.989436   18.9605640
## 17 201.34 97.84  -1.81 200.716613    0.6233869
## 18  25.39 12.63   3.79  24.674324    0.7156761
## 19 125.86 48.02  -3.63 113.353072   12.5069279
## 20 127.36 56.47  -6.84 138.921128  -11.5611280
## 21 170.90 90.50   2.16 174.896978   -3.9969780
## 22  35.32 14.73  -4.06  52.574480  -17.2544805
## 23 166.55 98.90   7.22 175.105346   -8.5553458
## 24 201.71 94.72  -2.16 195.966523    5.7434769
## 25  20.02  9.16   3.28  19.760246    0.2597536
## 26 115.99 51.91   1.61 104.599479   11.3905215
## 27 106.47 39.63  -3.92  98.590096    7.8799041
## 28 152.67 90.67   7.88 157.738369   -5.0683694
## 29 104.34 45.25   3.21  87.288978   17.0510220
## 30 164.51 83.76   0.45 167.549944   -3.0399437
## 31 149.95 74.02   1.38 146.541622    3.4083779
## 32 153.18 81.29   3.40 153.930100   -0.7501003
## 33  81.27 39.42   0.45  84.847221   -3.5772211
## 34 189.44 68.83 -14.97 186.813633    2.6263667
## 35   4.70  1.39   1.42  10.950340   -6.2503404
## 36 177.17 83.46  -1.84 173.986772    3.1832276
## 37   6.75  1.73   0.93  13.081550   -6.3315500
## 38  34.53 21.56   2.91  44.019078   -9.4890784
## 39 160.47 90.75   7.00 160.576153   -0.1061529
## 40 122.43 61.57  -3.64 138.657009  -16.2270091
## 41  58.79 38.58   6.51  64.766001   -5.9760011
## 42  83.12 44.14   1.68  89.893054   -6.7730542
## 43  25.85  4.71   5.19   5.624716   20.2252835
## 44 172.33 97.38   4.60 180.274850   -7.9448502
## 45  83.05 43.74   3.60  83.281010   -0.2310099
## 46 197.31 95.80  -5.22 207.329817  -10.0198172
## 47 169.42 88.89  -0.45 179.868063  -10.4480630
## 48 125.61 64.36   3.12 123.207811    2.4021886
## 49 203.59 97.13  -4.77 208.435690   -4.8456899
## 50 137.65 62.26  -2.71 137.102666    0.5473342

VIF

vif(model)
##       x1       x2 
## 1.000154 1.000154

Breusch-pagan test

lmtest::bptest(model)
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 1.1264, df = 2, p-value = 0.5694

QQ plot

qqnorm(resid(model))
qqline(resid(model), col = "red")