Linear Regression
# Read the iris measurements from CSV. Assigning to `iris` masks R's
# built-in `datasets::iris` for the rest of the session.
iris <- read.csv(file = "../datasets/iris.csv")
iris
NA
library(ggplot2)
# Scatter plot of petal width against petal length with a fitted
# least-squares line. The smoothing formula is spelled out so that
# geom_smooth() does not emit its "using formula = 'y ~ x'" message on
# every render.
ggplot(iris, aes(x = petal.length, y = petal.width)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x) +
  theme_minimal()

# Same petal scatter plot, coloured by variety. Because `color` is mapped
# in the global aes(), the linear fit is drawn separately for each variety.
# The formula is explicit to avoid geom_smooth()'s default-formula message.
ggplot(iris, aes(x = petal.length, y = petal.width, color = variety)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x) +
  theme_minimal()

# Simple linear regression: petal width explained by petal length.
lm_output <- lm(formula = petal.width ~ petal.length, data = iris)
summary(lm_output)
Call:
lm(formula = petal.width ~ petal.length, data = iris)
Residuals:
Min 1Q Median 3Q Max
-0.56515 -0.12358 -0.01898 0.13288 0.64272
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.363076 0.039762 -9.131 4.7e-16 ***
petal.length 0.415755 0.009582 43.387 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2065 on 148 degrees of freedom
Multiple R-squared: 0.9271, Adjusted R-squared: 0.9266
F-statistic: 1882 on 1 and 148 DF, p-value: < 2.2e-16
# Scatter plot of sepal width against sepal length with a fitted
# least-squares line. The formula is explicit so geom_smooth() does not
# emit its "using formula = 'y ~ x'" message.
ggplot(iris, aes(x = sepal.length, y = sepal.width)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x) +
  theme_minimal()

# Simple linear regression: sepal width explained by sepal length.
lm_output <- lm(formula = sepal.width ~ sepal.length, data = iris)
summary(lm_output)
Call:
lm(formula = sepal.width ~ sepal.length, data = iris)
Residuals:
Min 1Q Median 3Q Max
-1.1095 -0.2454 -0.0167 0.2763 1.3338
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.41895 0.25356 13.48 <2e-16 ***
sepal.length -0.06188 0.04297 -1.44 0.152
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4343 on 148 degrees of freedom
Multiple R-squared: 0.01382, Adjusted R-squared: 0.007159
F-statistic: 2.074 on 1 and 148 DF, p-value: 0.1519
# Sepal width against petal length with a degree-1 polynomial (straight
# line) least-squares fit.
ggplot(data = iris, mapping = aes(x = petal.length, y = sepal.width)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ poly(x, 1)) +
  theme_minimal()

# Simple linear regression: sepal width explained by petal length.
lm_output <- lm(formula = sepal.width ~ petal.length, data = iris)
summary(lm_output)
Call:
lm(formula = sepal.width ~ petal.length, data = iris)
Residuals:
Min 1Q Median 3Q Max
-1.08463 -0.21537 0.02116 0.21587 1.10380
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.45487 0.07610 45.402 < 2e-16 ***
petal.length -0.10579 0.01834 -5.768 4.51e-08 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3952 on 148 degrees of freedom
Multiple R-squared: 0.1836, Adjusted R-squared: 0.178
F-statistic: 33.28 on 1 and 148 DF, p-value: 4.513e-08
# Sepal width against petal length with a degree-2 polynomial
# least-squares fit.
ggplot(data = iris, mapping = aes(x = petal.length, y = sepal.width)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ poly(x, 2)) +
  theme_minimal()

# Quadratic regression of sepal width on petal length. The power terms are
# wrapped in I() so that `^` is evaluated arithmetically rather than as a
# formula operator.
lm_output <- lm(
  formula = sepal.width ~ I(petal.length) + I(petal.length^2),
  data = iris
)
summary(lm_output)
Call:
lm(formula = sepal.width ~ I(petal.length) + I(petal.length^2),
data = iris)
Residuals:
Min 1Q Median 3Q Max
-1.16966 -0.26002 0.02279 0.20965 1.02444
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.24920 0.13294 31.963 < 2e-16 ***
I(petal.length) -0.71154 0.08929 -7.969 4.06e-13 ***
I(petal.length^2) 0.08608 0.01248 6.896 1.47e-10 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3447 on 147 degrees of freedom
Multiple R-squared: 0.3831, Adjusted R-squared: 0.3747
F-statistic: 45.65 on 2 and 147 DF, p-value: 3.8e-16
# Regression of sepal width on petal length and its fifth power.
# NOTE: only the degree-1 and degree-5 terms are included; this is not a
# full degree-5 polynomial (which would also carry the x^2, x^3, x^4 terms,
# as poly(x, 5) does).
lm_output <- lm(
  formula = sepal.width ~ I(petal.length) + I(petal.length^5),
  data = iris
)
summary(lm_output)
Call:
lm(formula = sepal.width ~ I(petal.length) + I(petal.length^5),
data = iris)
Residuals:
Min 1Q Median 3Q Max
-1.11661 -0.24358 0.03151 0.20469 1.03151
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.731e+00 8.328e-02 44.805 < 2e-16 ***
I(petal.length) -2.425e-01 2.855e-02 -8.493 2.04e-14 ***
I(petal.length^5) 9.624e-05 1.638e-05 5.877 2.69e-08 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3568 on 147 degrees of freedom
Multiple R-squared: 0.3389, Adjusted R-squared: 0.3299
F-statistic: 37.68 on 2 and 147 DF, p-value: 6.172e-14
# Sepal width against petal length with a degree-5 polynomial
# least-squares fit.
ggplot(data = iris, mapping = aes(x = petal.length, y = sepal.width)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ poly(x, 5)) +
  theme_minimal()

# Sepal length against petal length with a degree-2 polynomial
# least-squares fit.
ggplot(data = iris, mapping = aes(x = petal.length, y = sepal.length)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ poly(x, 2)) +
  theme_minimal()

# Quadratic regression of sepal length on petal length, using I() so the
# squared term is computed arithmetically.
lm_output <- lm(
  formula = sepal.length ~ I(petal.length) + I(petal.length^2),
  data = iris
)
summary(lm_output)
Call:
lm(formula = sepal.length ~ I(petal.length) + I(petal.length^2),
data = iris)
Residuals:
Min 1Q Median 3Q Max
-1.0684 -0.2348 0.0121 0.2049 0.9146
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.05833 0.14036 36.038 < 2e-16 ***
I(petal.length) -0.16435 0.09427 -1.743 0.0834 .
I(petal.length^2) 0.08146 0.01318 6.181 5.96e-09 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3639 on 147 degrees of freedom
Multiple R-squared: 0.8095, Adjusted R-squared: 0.8069
F-statistic: 312.3 on 2 and 147 DF, p-value: < 2.2e-16
# Sepal length against petal length with a degree-1 polynomial (straight
# line) least-squares fit.
ggplot(data = iris, mapping = aes(x = petal.length, y = sepal.length)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ poly(x, 1)) +
  theme_minimal()

# Simple linear regression of sepal length on petal length. The I() wrapper
# is kept so the coefficient is labelled `I(petal.length)` in the summary.
lm_output <- lm(formula = sepal.length ~ I(petal.length), data = iris)
summary(lm_output)
Call:
lm(formula = sepal.length ~ I(petal.length), data = iris)
Residuals:
Min 1Q Median 3Q Max
-1.24675 -0.29657 -0.01515 0.27676 1.00269
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.30660 0.07839 54.94 <2e-16 ***
I(petal.length) 0.40892 0.01889 21.65 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4071 on 148 degrees of freedom
Multiple R-squared: 0.76, Adjusted R-squared: 0.7583
F-statistic: 468.6 on 1 and 148 DF, p-value: < 2.2e-16
Multivariate polynomial regression
# Regression of sepal length on two predictors, petal length and petal
# width, both entering as first-degree terms.
lm_output <- lm(
  formula = sepal.length ~ I(petal.length) + I(petal.width),
  data = iris
)
summary(lm_output)
Call:
lm(formula = sepal.length ~ I(petal.length) + I(petal.width),
data = iris)
Residuals:
Min 1Q Median 3Q Max
-1.18534 -0.29838 -0.02763 0.28925 1.02320
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.19058 0.09705 43.181 < 2e-16 ***
I(petal.length) 0.54178 0.06928 7.820 9.41e-13 ***
I(petal.width) -0.31955 0.16045 -1.992 0.0483 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4031 on 147 degrees of freedom
Multiple R-squared: 0.7663, Adjusted R-squared: 0.7631
F-statistic: 241 on 2 and 147 DF, p-value: < 2.2e-16
# Regression of sepal length on squared petal width and cubed petal length.
# I() makes `^` an arithmetic power inside the formula.
lm_output <- lm(
  formula = sepal.length ~ I(petal.width^2) + I(petal.length^3),
  data = iris
)
summary(lm_output)
Call:
lm(formula = sepal.length ~ I(petal.width^2) + I(petal.length^3),
data = iris)
Residuals:
Min 1Q Median 3Q Max
-0.95925 -0.25830 -0.01588 0.20670 0.97874
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.0371767 0.0455562 110.571 <2e-16 ***
I(petal.width^2) -0.0356134 0.0370512 -0.961 0.338
I(petal.length^3) 0.0101508 0.0008602 11.800 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3692 on 147 degrees of freedom
Multiple R-squared: 0.8039, Adjusted R-squared: 0.8012
F-statistic: 301.3 on 2 and 147 DF, p-value: < 2.2e-16
Cluster
# Show the full data, then the feature columns only (column 5, the class
# label, is dropped so clustering is unsupervised).
iris
iris[, -5]

# k-means starts from randomly chosen centres, so fix the seed to make the
# cluster assignment reproducible, and use several random starts (nstart)
# so that the best of them is kept instead of a single, possibly poor,
# local optimum.
set.seed(123)
cluster_result <- kmeans(iris[, -5], centers = 3, nstart = 25)
cluster_result
K-means clustering with 3 clusters of sizes 50, 38, 62
Cluster means:
sepal.length sepal.width petal.length petal.width
1 5.006000 3.428000 1.462000 0.246000
2 6.850000 3.073684 5.742105 2.071053
3 5.901613 2.748387 4.393548 1.433871
Clustering vector:
[1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 2 3 3 3
[57] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 2 2 2 2 3 2 2 2 2 2
[113] 2 3 3 2 2 2 2 3 2 3 2 3 2 2 3 3 2 2 2 2 2 3 2 2 2 2 3 2 2 2 3 2 2 2 3 2 2 3
Within cluster sum of squares by cluster:
[1] 15.15100 23.87947 39.82097
(between_SS / total_SS = 88.4 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss" "betweenss" "size"
[8] "iter" "ifault"
library(cluster)
# Draw the k-means cluster assignment for the feature columns (label
# column 5 excluded) as a 2-D cluster plot.
clusplot(iris[, -5], clus = cluster_result$cluster)

iris
library(e1071) # SVM - ML model
library(caret) # data processing

# The class label must be a factor for svm() to be fitted as a classifier.
iris$variety <- as.factor(iris$variety)

# Fix the RNG seed so the 80/20 split on `variety` (and therefore every
# prediction and metric derived from it) is identical on each run.
set.seed(123)
training_idx <- createDataPartition(iris$variety, p = 0.8, list = FALSE)
train_data <- iris[training_idx, ]
test_data <- iris[-training_idx, ]

# train_data -> ML (SVM) -> test_data
svm_model <- svm(
  variety ~ sepal.length + sepal.width + petal.length + petal.width,
  data = train_data,
  kernel = "linear"
)

# Predict the first held-out row; column 5 (variety) is dropped so only the
# four measurements are passed to the model.
test_data[1, -5]
predict(svm_model, newdata = test_data[1, -5])
8
Setosa
Levels: Setosa Versicolor Virginica
# Show held-out row 19 (label included), then predict its variety from the
# four measurements only (column 5 dropped).
test_data[19, ]
predict(object = svm_model, newdata = test_data[19, -5])
98
Versicolor
Levels: Setosa Versicolor Virginica
# Predict the variety of every held-out row from its measurements
# (column 5 dropped).
predictions <- predict(object = svm_model, newdata = test_data[, -5])
predictions
8 11 16 20 24 26 28 35 42 45
Setosa Setosa Setosa Setosa Setosa Setosa Setosa Setosa Setosa Setosa
56 65 68 70 77 79 84 90 98 99
Versicolor Versicolor Versicolor Versicolor Versicolor Versicolor Virginica Versicolor Versicolor Versicolor
112 114 121 125 126 142 143 145 146 147
Virginica Virginica Virginica Virginica Virginica Virginica Virginica Virginica Virginica Virginica
Levels: Setosa Versicolor Virginica
# Cross-tabulate predicted against actual varieties on the held-out rows
# and print the accompanying accuracy statistics.
conf_mat <- confusionMatrix(
  data = predictions,
  reference = test_data$variety
)
conf_mat
Confusion Matrix and Statistics
Reference
Prediction Setosa Versicolor Virginica
Setosa 10 0 0
Versicolor 0 9 0
Virginica 0 1 10
Overall Statistics
Accuracy : 0.9667
95% CI : (0.8278, 0.9992)
No Information Rate : 0.3333
P-Value [Acc > NIR] : 2.963e-13
Kappa : 0.95
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: Setosa Class: Versicolor Class: Virginica
Sensitivity 1.0000 0.9000 1.0000
Specificity 1.0000 1.0000 0.9500
Pos Pred Value 1.0000 1.0000 0.9091
Neg Pred Value 1.0000 0.9524 1.0000
Prevalence 0.3333 0.3333 0.3333
Detection Rate 0.3333 0.3000 0.3333
Detection Prevalence 0.3333 0.3000 0.3667
Balanced Accuracy 1.0000 0.9500 0.9750
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKCiMjIExpbmVhciBSZWdyZXNzaW9uCgpgYGB7cn0KCmlyaXMgPSByZWFkLmNzdigiLi4vZGF0YXNldHMvaXJpcy5jc3YiKQppcmlzCmBgYAoKCmBgYHtyfQpsaWJyYXJ5KGdncGxvdDIpCgpnZ3Bsb3QoaXJpcywgYWVzKHggPSBwZXRhbC5sZW5ndGgsIHkgPSBwZXRhbC53aWR0aCkpICsKICBnZW9tX3BvaW50KCkgKwogIGdlb21fc21vb3RoKG1ldGhvZCA9ICJsbSIpKwogIHRoZW1lX21pbmltYWwoKQpgYGAKCgpgYGB7cn0KZ2dwbG90KGlyaXMsIGFlcyh4ID0gcGV0YWwubGVuZ3RoLCB5ID0gcGV0YWwud2lkdGgsIGNvbG9yID0gdmFyaWV0eSkpICsKICBnZW9tX3BvaW50KCkgKwogIGdlb21fc21vb3RoKG1ldGhvZCA9ICJsbSIpKwogIHRoZW1lX21pbmltYWwoKQpgYGAKCgpgYGB7cn0KbG1fb3V0cHV0ID0gbG0oIHBldGFsLndpZHRofnBldGFsLmxlbmd0aCwgZGF0YSA9IGlyaXMpCnN1bW1hcnkobG1fb3V0cHV0KQpgYGAKCgpgYGB7cn0KZ2dwbG90KGlyaXMsIGFlcyh4ID0gc2VwYWwubGVuZ3RoLCB5ID0gc2VwYWwud2lkdGgpKSArCiAgZ2VvbV9wb2ludCgpICsKICBnZW9tX3Ntb290aChtZXRob2QgPSAibG0iKSsKICB0aGVtZV9taW5pbWFsKCkKYGBgCgoKCmBgYHtyfQpsbV9vdXRwdXQgPSBsbSggc2VwYWwud2lkdGh+c2VwYWwubGVuZ3RoLCBkYXRhID0gaXJpcykKc3VtbWFyeShsbV9vdXRwdXQpCmBgYAoKYGBge3J9CmdncGxvdChpcmlzLCBhZXMoeCA9IHBldGFsLmxlbmd0aCwgeSA9IHNlcGFsLndpZHRoKSkgKwogIGdlb21fcG9pbnQoKSArCiAgZ2VvbV9zbW9vdGgobWV0aG9kID0gImxtIiwgZm9ybXVsYSA9IHl+cG9seSh4LCAxKSkrCiAgdGhlbWVfbWluaW1hbCgpCmBgYAoKCmBgYHtyfQpsbV9vdXRwdXQgPSBsbSggc2VwYWwud2lkdGh+cGV0YWwubGVuZ3RoLCBkYXRhID0gaXJpcykKc3VtbWFyeShsbV9vdXRwdXQpCmBgYAoKCmBgYHtyfQpnZ3Bsb3QoaXJpcywgYWVzKHggPSBwZXRhbC5sZW5ndGgsIHkgPSBzZXBhbC53aWR0aCkpICsKICBnZW9tX3BvaW50KCkgKwogIGdlb21fc21vb3RoKG1ldGhvZCA9ICJsbSIsIGZvcm11bGEgPSB5fnBvbHkoeCwgMikpKwogIHRoZW1lX21pbmltYWwoKQpgYGAKCgpgYGB7cn0KbG1fb3V0cHV0ID0gbG0oIHNlcGFsLndpZHRofkkocGV0YWwubGVuZ3RoKStJKHBldGFsLmxlbmd0aF4yKSwgZGF0YSA9IGlyaXMpCnN1bW1hcnkobG1fb3V0cHV0KQpgYGAKCgpgYGB7cn0KbG1fb3V0cHV0ID0gbG0oIHNlcGFsLndpZHRofkkocGV0YWwubGVuZ3RoKStJKHBldGFsLmxlbmd0aF41KSwgZGF0YSA9IGlyaXMpCnN1bW1hcnkobG1fb3V0cHV0KQpgYGAKCmBgYHtyfQpnZ3Bsb3QoaXJpcywgYWVzKHggPSBwZXRhbC5sZW5ndGgsIHkgPSBzZXBhbC53aWR0aCkpICsKICBnZW9tX3BvaW50KCkgKwogIGdlb21fc21vb3RoKG1ldGhvZCA9ICJsbSIsIGZvcm11bGEgPSB5fnBvbHkoeCwgNSkpKwogIHRoZW1lX21p
bmltYWwoKQpgYGAKCmBgYHtyfQpnZ3Bsb3QoaXJpcywgYWVzKHggPSBwZXRhbC5sZW5ndGgsIHkgPSBzZXBhbC5sZW5ndGgpKSArCiAgZ2VvbV9wb2ludCgpICsKICBnZW9tX3Ntb290aChtZXRob2QgPSAibG0iLCBmb3JtdWxhID0geX5wb2x5KHgsIDIpKSsKICB0aGVtZV9taW5pbWFsKCkKYGBgCgoKYGBge3J9CmxtX291dHB1dCA9IGxtKCBzZXBhbC5sZW5ndGh+SShwZXRhbC5sZW5ndGgpK0kocGV0YWwubGVuZ3RoXjIpLCBkYXRhID0gaXJpcykKc3VtbWFyeShsbV9vdXRwdXQpCmBgYAoKYGBge3J9CmdncGxvdChpcmlzLCBhZXMoeCA9IHBldGFsLmxlbmd0aCwgeSA9IHNlcGFsLmxlbmd0aCkpICsKICBnZW9tX3BvaW50KCkgKwogIGdlb21fc21vb3RoKG1ldGhvZCA9ICJsbSIsIGZvcm11bGEgPSB5fnBvbHkoeCwgMSkpKwogIHRoZW1lX21pbmltYWwoKQpgYGAKCmBgYHtyfQpsbV9vdXRwdXQgPSBsbSggc2VwYWwubGVuZ3RofkkocGV0YWwubGVuZ3RoKSwgZGF0YSA9IGlyaXMpCnN1bW1hcnkobG1fb3V0cHV0KQpgYGAKCiMjIyBNdWx0aXZhcmlhdGUgcG9seW5vbWlhbCByZWdyZXNzaW9uCgpgYGB7cn0KbG1fb3V0cHV0ID0gbG0oIHNlcGFsLmxlbmd0aH5JKHBldGFsLmxlbmd0aCkrSShwZXRhbC53aWR0aCksIGRhdGEgPSBpcmlzKQpzdW1tYXJ5KGxtX291dHB1dCkKYGBgCgoKYGBge3J9CmxtX291dHB1dCA9IGxtKCBzZXBhbC5sZW5ndGh+SShwZXRhbC53aWR0aF4yKStJKHBldGFsLmxlbmd0aF4zKSwgZGF0YSA9IGlyaXMpCnN1bW1hcnkobG1fb3V0cHV0KQpgYGAKCgoKIyMgQ2x1c3RlciAKCmBgYHtyfQppcmlzCmBgYAoKCmBgYHtyfQppcmlzWyAsIC01XQpgYGAKCmBgYHtyfQpjbHVzdGVyX3Jlc3VsdCA9IGttZWFucyhpcmlzWyAsIC01XSwgY2VudGVycyA9IDMpCmNsdXN0ZXJfcmVzdWx0CmBgYAoKCmBgYHtyfQpsaWJyYXJ5KGNsdXN0ZXIpCmNsdXNwbG90KGlyaXNbLCAtNV0sIGNsdXN0ZXJfcmVzdWx0JGNsdXN0ZXIpCmBgYAoKCgpgYGB7cn0KaXJpcwpgYGAKCmBgYHtyfQpsaWJyYXJ5KGUxMDcxKSAjU1ZNIC0gTUwgbW9kZWwKbGlicmFyeShjYXJldCkgIyBkYXRhIHByb2Nlc3NpbmcKCmlyaXMkdmFyaWV0eSA9IGFzLmZhY3RvcihpcmlzJHZhcmlldHkpICMgY2hhciB0byBjYXRlZ29yeQoKdHJhaW5pbmdfaWR4ID0gY3JlYXRlRGF0YVBhcnRpdGlvbihpcmlzJHZhcmlldHksIHA9MC44LCBsaXN0ID0gRkFMU0UpCnRyYWluX2RhdGEgPSBpcmlzW3RyYWluaW5nX2lkeCwgXQp0ZXN0X2RhdGEgPSBpcmlzWy10cmFpbmluZ19pZHgsIF0KCiMgdHJhaW5fZGF0YSAtPiBNTCAoU1ZNKSAtPiB0ZXN0X2RhdGEKCgoKc3ZtX21vZGVsID0gc3ZtKHZhcmlldHl+c2VwYWwubGVuZ3RoK3NlcGFsLndpZHRoK3BldGFsLmxlbmd0aCtwZXRhbC53aWR0aCwgZGF0YT10cmFpbl9kYXRhLGtlcm5lbD0ibGluZWFyIikKCgpgYGAKCgoKCmBgYHtyfQp0ZXN0X2RhdGFbMSwgLTVdCnByZWRpY3Qoc3ZtX21vZGVsLCBuZXdkYXRhID0gdGVzdF9kYXRh
WzEsIC01XSkKYGBgCgoKCmBgYHtyfQp0ZXN0X2RhdGFbMTksIF0KYGBgCgoKYGBge3J9CnByZWRpY3Qoc3ZtX21vZGVsLCBuZXdkYXRhID0gdGVzdF9kYXRhWzE5LCAtNV0pCmBgYAoKCmBgYHtyfQpwcmVkaWN0aW9ucyA9IHByZWRpY3Qoc3ZtX21vZGVsLCBuZXdkYXRhID0gdGVzdF9kYXRhWyAsIC01XSkKcHJlZGljdGlvbnMKYGBgCgpgYGB7cn0KY29uZl9tYXQgPSBjb25mdXNpb25NYXRyaXgocHJlZGljdGlvbnMsIHRlc3RfZGF0YSR2YXJpZXR5KQpjb25mX21hdApgYGAKCg==