data <- data.frame(
x = c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100),
y = c(15, 30, 45, 60, 75, 85, 95, 105, 110, 120),
z = c(30, NA, 40, 45, 30, 50, 30, 40, 60, 70),
m = -c(30, NA, 40, 45, 30, 50, 30, 40, 60, 70),
cat = rep(c("A", "B"), each = 5)
)
ggplot(data, aes( x= x, y= y))+
geom_point()## [1] 0.9903356
## [1] 0.6767234
## [1] -0.6767234
## x y z m
## x 1.000 0.989 0.677 -0.677
## y 0.989 1.000 0.610 -0.610
## z 0.677 0.610 1.000 -1.000
## m -0.677 -0.610 -1.000 1.000
# in data there is a categorical vector. we have to transfer is numeric value because in pearson correlation all value need to numeric
data %>%
select(where(is.numeric)) %>%
cor( use = "complete.obs") %>% # same as before
round(3)## x y z m
## x 1.000 0.989 0.677 -0.677
## y 0.989 1.000 0.610 -0.610
## z 0.677 0.610 1.000 -1.000
## m -0.677 -0.610 -1.000 1.000
data %>%
select(where(is.numeric)) %>%
cor( use = "complete.obs", method = "spearman") %>% # spearman method
round(3)## x y z m
## x 1.000 1.000 0.604 -0.604
## y 1.000 1.000 0.604 -0.604
## z 0.604 0.604 1.000 -1.000
## m -0.604 -0.604 -1.000 1.000
data %>%
select(where(is.numeric)) %>%
cor( use = "complete.obs", method = "kendall") %>% # kendall method
round(3)## x y z m
## x 1.00 1.00 0.53 -0.53
## y 1.00 1.00 0.53 -0.53
## z 0.53 0.53 1.00 -1.00
## m -0.53 -0.53 -1.00 1.00
ggcorrmat(
data = data,
colors = c("blue", "white", "green"),
title = "Correlogram for simulated data",
matrix.type = "lower",
type = "parametric", pch = ""
)## [1] -10.0000000 -9.3103448 -8.6206897 -7.9310345 -7.2413793 -6.5517241
## [7] -5.8620690 -5.1724138 -4.4827586 -3.7931034 -3.1034483 -2.4137931
## [13] -1.7241379 -1.0344828 -0.3448276 0.3448276 1.0344828 1.7241379
## [19] 2.4137931 3.1034483 3.7931034 4.4827586 5.1724138 5.8620690
## [25] 6.5517241 7.2413793 7.9310345 8.6206897 9.3103448 10.0000000
# Create different relationship
Linear <- 2 * X + rnorm(30, mean = 0, sd = 1)
Sine <- sin(X) + rnorm(30, mean = 0, sd = .2)
Exponential <- exp(X/ 10) + rnorm(30, mean = 0, sd = .2)
Quadratic <- X^2 + rnorm(30, mean = 0, sd = 5)
Logarithmic <- log(abs(X) + 1) + rnorm(30, mean = 0, sd = .2)
df <- data.frame( X, Sine, Exponential, Quadratic, Logarithmic, Linear)
head(df)## X Sine Exponential Quadratic Logarithmic Linear
## 1 -10.000000 0.63511114 0.2944325 106.96058 2.099170 -18.62904
## 2 -9.310345 0.02678392 0.4311919 84.30165 2.039061 -19.18539
## 3 -8.620690 -0.51317773 0.5386522 77.56803 2.288856 -16.87825
## 4 -7.931034 -1.11881817 0.7323859 69.85686 1.990204 -15.22921
## 5 -7.241379 -0.71716343 0.3392839 46.88363 2.108803 -14.07849
## 6 -6.551724 -0.60872466 0.7798610 38.62113 1.936124 -13.20957
# Scatter plot for sine
p1 <- ggplot(df, aes(x= X, y= Sine)) +
geom_point(color = "blue") +
geom_smooth(method = "loess", color = "red") +
ggtitle("X vs Sine (Non-Linear)")
# Scatter plot for Exponential
p2 <- ggplot(df, aes(x= X, y= Exponential)) +
geom_point(color = "green") +
geom_smooth(method = "loess", color = "red") +
ggtitle("X vs Exponential (Non-Linear)")
# Scatter plot for Quadratic
p3 <- ggplot(df, aes(x= X, y= Quadratic)) +
geom_point(color = "purple") +
geom_smooth(method = "loess", color = "red") +
ggtitle("X vs Quadratic (Non-Linear)")
# Scatter plot for Logarithmic
p4 <- ggplot(df, aes(x= X, y= Logarithmic)) +
geom_point(color = "yellow") +
geom_smooth(method = "loess", color = "red") +
ggtitle("X vs Logarithmic (Non-Linear)")
# Scatter plot for Linear
p5 <- ggplot(df, aes(x= X, y= Linear)) +
geom_point(color = "black") +
geom_smooth(method = "loess", color = "red") +
ggtitle("X vs Linear (Non-Linear)")
# Arrange the plot in a grid
grid.arrange(p1, p2, p3, p4, p5, ncol= 2)## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
set.seed(42)
n <- 50
x1 <- runif(n, 1, 100)
x2 <- rnorm(n, 0, 5)
y <- 5 + 2*x1 -3*x2 + rnorm(n, 0, 10)
df <- round(data.frame(y, x1, x2), 2)
head(df)## y x1 x2
## 1 200.40 91.57 -2.15
## 2 204.08 93.77 -1.29
## 3 94.74 29.33 -8.82
## 4 155.67 83.21 2.30
## 5 132.67 64.53 -3.20
## 6 118.08 52.39 2.28
##
## Call:
## lm(formula = y ~ x1 + x2, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.2545 -6.1818 -0.1287 4.4273 20.2253
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.69609 2.83610 4.477 4.81e-05 ***
## x1 1.86519 0.04218 44.217 < 2e-16 ***
## x2 -3.05519 0.26124 -11.695 1.62e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.876 on 47 degrees of freedom
## Multiple R-squared: 0.9782, Adjusted R-squared: 0.9772
## F-statistic: 1053 on 2 and 47 DF, p-value: < 2.2e-16
plot(model$fitted.values, resid(model),
main = "Residuals vs Fitted",
xlab = "Fitted value",
ylabs = "Residuals",
pch = 19)## Warning in plot.window(...): "ylabs" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "ylabs" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "ylabs" is not a
## graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "ylabs" is not a
## graphical parameter
## Warning in box(...): "ylabs" is not a graphical parameter
## Warning in title(...): "ylabs" is not a graphical parameter
## y x1 x2 predicted resid(model)
## 1 200.40 91.57 -2.15 190.060609 10.3393913
## 2 204.08 93.77 -1.29 191.536572 12.5434277
## 3 94.74 29.33 -8.82 94.349028 0.3909719
## 4 155.67 83.21 2.30 160.871984 -5.2019836
## 5 132.67 64.53 -3.20 142.833701 -10.1637007
## 6 118.08 52.39 2.28 103.447794 14.6322060
## 7 144.85 73.92 3.52 139.816994 5.0330058
## 8 19.02 14.33 5.18 23.598439 -4.5784392
## 9 145.01 66.04 -3.04 145.161314 -0.1513138
## 10 127.09 70.80 2.52 137.052778 -9.9627783
## 11 129.51 46.32 -8.59 125.335988 4.1740118
## 12 158.98 72.19 -3.92 159.320828 -0.3408277
## 13 203.00 93.53 -4.25 200.132291 2.8677094
## 14 103.12 26.29 -12.07 98.608207 4.5117928
## 15 106.21 46.77 0.18 99.381302 6.8286981
## 16 203.95 94.06 1.03 184.989436 18.9605640
## 17 201.34 97.84 -1.81 200.716613 0.6233869
## 18 25.39 12.63 3.79 24.674324 0.7156761
## 19 125.86 48.02 -3.63 113.353072 12.5069279
## 20 127.36 56.47 -6.84 138.921128 -11.5611280
## 21 170.90 90.50 2.16 174.896978 -3.9969780
## 22 35.32 14.73 -4.06 52.574480 -17.2544805
## 23 166.55 98.90 7.22 175.105346 -8.5553458
## 24 201.71 94.72 -2.16 195.966523 5.7434769
## 25 20.02 9.16 3.28 19.760246 0.2597536
## 26 115.99 51.91 1.61 104.599479 11.3905215
## 27 106.47 39.63 -3.92 98.590096 7.8799041
## 28 152.67 90.67 7.88 157.738369 -5.0683694
## 29 104.34 45.25 3.21 87.288978 17.0510220
## 30 164.51 83.76 0.45 167.549944 -3.0399437
## 31 149.95 74.02 1.38 146.541622 3.4083779
## 32 153.18 81.29 3.40 153.930100 -0.7501003
## 33 81.27 39.42 0.45 84.847221 -3.5772211
## 34 189.44 68.83 -14.97 186.813633 2.6263667
## 35 4.70 1.39 1.42 10.950340 -6.2503404
## 36 177.17 83.46 -1.84 173.986772 3.1832276
## 37 6.75 1.73 0.93 13.081550 -6.3315500
## 38 34.53 21.56 2.91 44.019078 -9.4890784
## 39 160.47 90.75 7.00 160.576153 -0.1061529
## 40 122.43 61.57 -3.64 138.657009 -16.2270091
## 41 58.79 38.58 6.51 64.766001 -5.9760011
## 42 83.12 44.14 1.68 89.893054 -6.7730542
## 43 25.85 4.71 5.19 5.624716 20.2252835
## 44 172.33 97.38 4.60 180.274850 -7.9448502
## 45 83.05 43.74 3.60 83.281010 -0.2310099
## 46 197.31 95.80 -5.22 207.329817 -10.0198172
## 47 169.42 88.89 -0.45 179.868063 -10.4480630
## 48 125.61 64.36 3.12 123.207811 2.4021886
## 49 203.59 97.13 -4.77 208.435690 -4.8456899
## 50 137.65 62.26 -2.71 137.102666 0.5473342
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 1.1264, df = 2, p-value = 0.5694