# Read the iris measurements from "iris.csv" (resolved against the current
# working directory) and display the resulting data frame.
# Note: this assignment creates a global `iris` that masks R's built-in
# dataset of the same name.
iris = read.csv(file = "iris.csv")
iris
## Histogram plot
library(ggplot2)

# Histogram of sepal length with fine-grained bins (0.1 wide).
ggplot(data = iris, mapping = aes(x = sepal.length)) +
  geom_histogram(
    binwidth = 0.1,
    fill = "lightblue",
    color = "black",
    alpha = 0.7
  ) +
  labs(
    title = "Histogram of Sepal Length",
    x = "Sepal Length (cm)",
    y = "Frequency"
  ) +
  # xlim(0, 8) +  # disabled: would pin the x-axis to the range 0-8
  theme_minimal()

NA
# Same histogram as before, but with coarser bins (0.3 wide) for comparison.
ggplot(data = iris, mapping = aes(x = sepal.length)) +
  geom_histogram(
    binwidth = 0.3,
    fill = "lightblue",
    color = "black",
    alpha = 0.7
  ) +
  labs(
    title = "Histogram of Sepal Length",
    x = "Sepal Length (cm)",
    y = "Frequency"
  ) +
  # xlim(0, 8) +  # disabled: would pin the x-axis to the range 0-8
  theme_minimal()

NA
# Open the help page for labs() (interactive lookup only).
help(labs)
## Correlation Matrix
# Pairwise correlations (cor()'s default method) between the first four
# columns of `iris`; `1:4` is shorthand for c(1, 2, 3, 4).
cor_mat = cor(x = iris[, 1:4])
cor_mat
sepal.length sepal.width petal.length petal.width
sepal.length 1.0000000 -0.1175698 0.8717538 0.8179411
sepal.width -0.1175698 1.0000000 -0.4284401 -0.3661259
petal.length 0.8717538 -0.4284401 1.0000000 0.9628654
petal.width 0.8179411 -0.3661259 0.9628654 1.0000000
## Heat map
library(ggcorrplot)

# Full correlation heat map.
ggcorrplot(corr = cor_mat)

# Lower triangle only.
ggcorrplot(corr = cor_mat, type = "lower")

# Upper triangle only.
ggcorrplot(corr = cor_mat, type = "upper")

# Lower triangle again, this time with a custom three-colour scale.
ggcorrplot(
  corr = cor_mat,
  type = "lower",
  colors = c("green", "white", "pink")
)


library(GGally)

# Matrix of pairwise plots over all columns of `iris`, coloured by variety.
ggpairs(data = iris, mapping = aes(colour = variety))

## Interactive plot
library(plotly)

# Interactive 3-D scatter: sepal length / sepal width / petal length,
# coloured by variety.
plot_ly(
  data = iris,
  x = ~sepal.length,
  y = ~sepal.width,
  z = ~petal.length,
  color = ~variety,
  type = "scatter3d"
)

# Violin plot of sepal length for each variety.
plot_ly(
  data = iris,
  y = ~sepal.length,
  type = "violin",
  x = ~variety
)

# Box plot with sepal length on the x-axis and variety on the y-axis.
plot_ly(
  data = iris,
  x = ~sepal.length,
  y = ~variety,
  type = "box"
)
NA
# Box plot with the axes swapped: variety on the x-axis, sepal length on the
# y-axis.
plot_ly(
  data = iris,
  y = ~sepal.length,
  x = ~variety,
  type = "box"
)
## PCA
library(stats)

# Principal component analysis of the first four columns of `iris`, with each
# variable centred and scaled to unit variance.
# prcomp() names its scaling argument `scale.` (trailing dot). The earlier
# `scale = TRUE` only took effect through partial argument matching, so the
# full argument name is spelled out here.
iris_pca = prcomp(iris[, 1:4], center = TRUE, scale. = TRUE)

# Standard deviation and (cumulative) proportion of variance per component.
summary(iris_pca)
Importance of components:
PC1 PC2 PC3 PC4
Standard deviation 1.7084 0.9560 0.38309 0.14393
Proportion of Variance 0.7296 0.2285 0.03669 0.00518
Cumulative Proportion 0.7296 0.9581 0.99482 1.00000
# Show the component standard deviations and the rotation (loadings) matrix.
print(iris_pca)
Standard deviations (1, .., p=4):
[1] 1.7083611 0.9560494 0.3830886 0.1439265
Rotation (n x k) = (4 x 4):
PC1 PC2 PC3 PC4
sepal.length 0.5210659 -0.37741762 0.7195664 0.2612863
sepal.width -0.2693474 -0.92329566 -0.2443818 -0.1235096
petal.length 0.5804131 -0.02449161 -0.1421264 -0.8014492
petal.width 0.5648565 -0.06694199 -0.6342727 0.5235971
# Keep only the scores on the first two principal components.
pc_score = as.data.frame(iris_pca$x[, 1:2])
pc_score

# Attach the variety label so the scores can be coloured by class.
pc_data = cbind(pc_score, variety = iris$variety)
pc_data

library(ggplot2)

# Static scatter plot of PC1 against PC2.
ggplot(data = pc_data, mapping = aes(x = PC1, y = PC2, color = variety)) +
  geom_point() +
  theme_minimal()

# The same plot, stored in `gg` so it can be converted to a plotly widget.
gg = ggplot(data = pc_data, mapping = aes(x = PC1, y = PC2, color = variety)) +
  geom_point() +
  theme_minimal()
gg

# Interactive version of the stored plot.
ggplotly(gg)
NA
NA
library(factoextra)

# Eigenvalue (scree) plot for the PCA.
fviz_eig(iris_pca)

# Variables plot, coloured by each variable's contribution.
fviz_pca_var(
  iris_pca,
  col.var = "contrib",
  gradient.cols = c("red", "black", "green")
)

# Individuals plot with default styling.
fviz_pca_ind(iris_pca)

# Individuals coloured by variety.
fviz_pca_ind(
  iris_pca,
  col.ind = iris$variety
)

# ... with an ellipse drawn around each variety.
fviz_pca_ind(
  iris_pca,
  col.ind = iris$variety,
  addEllipses = TRUE
)

# ... and individuals drawn as points only.
fviz_pca_ind(
  iris_pca,
  col.ind = iris$variety,
  addEllipses = TRUE,
  geom.ind = "point"
)

## Regression
# Sepal length (y) against petal length (x), points coloured by variety, with
# a linear fit and its 95% confidence band drawn in purple.
# NOTE(review): these axes are the reverse of the `petal.length ~ sepal.length`
# formula used for `lm_model` elsewhere in this file; confirm which direction
# is intended.
ggplot(
  data = iris,
  mapping = aes(x = petal.length, y = sepal.length, color = variety)
) +
  geom_point() +
  geom_smooth(
    method = "lm",
    se = TRUE,
    color = "purple",
    level = 0.95
  )

# Simple linear regression of petal length on sepal length, fitted on all rows
# of `iris`.
lm_model = lm(formula = petal.length ~ sepal.length, data = iris)

# Coefficients, residual summary and goodness-of-fit statistics.
summary(lm_model)
Call:
lm(formula = petal.length ~ sepal.length, data = iris)
Residuals:
Min 1Q Median 3Q Max
-2.47747 -0.59072 -0.00668 0.60484 2.49512
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -7.10144 0.50666 -14.02 <2e-16 ***
sepal.length 1.85843 0.08586 21.65 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.8678 on 148 degrees of freedom
Multiple R-squared: 0.76, Adjusted R-squared: 0.7583
F-statistic: 468.6 on 1 and 148 DF, p-value: < 2.2e-16
### Polynomial regression
# Same scatter plot as the linear case, but the smoother fits a second-degree
# polynomial (y ~ poly(x, 2)) with a 95% confidence band.
ggplot(
  data = iris,
  mapping = aes(x = petal.length, y = sepal.length, color = variety)
) +
  geom_point() +
  geom_smooth(
    method = "lm",
    formula = y ~ poly(x, 2),
    se = TRUE,
    color = "purple",
    level = 0.95
  )

## K-means clustering
# The first four columns of `iris`, used as the clustering input.
iris[, 1:4]

library(stats)

# With `centers = 3`, kmeans() chooses its starting centres at random, so the
# resulting partition and the cluster numbering can change from run to run.
# Fix the RNG seed so the clustering is reproducible.
set.seed(123)
k_means_cluste = kmeans(iris[, 1:4], centers = 3)
k_means_cluste
K-means clustering with 3 clusters of sizes 62, 38, 50
Cluster means:
sepal.length sepal.width petal.length petal.width
1 5.901613 2.748387 4.393548 1.433871
2 6.850000 3.073684 5.742105 2.071053
3 5.006000 3.428000 1.462000 0.246000
Clustering vector:
[1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 2 1 1 1
[57] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2 2 1 2 2 2 2 2
[113] 2 1 1 2 2 2 2 1 2 1 2 1 2 2 1 1 2 2 2 2 2 1 2 2 2 2 1 2 2 2 1 2 2 2 1 2 2 1
Within cluster sum of squares by cluster:
[1] 39.82097 23.87947 15.15100
(between_SS / total_SS = 88.4 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss" "betweenss" "size"
[8] "iter" "ifault"
library(cluster)

# Plot the k-means clusters using the same four columns that were passed to
# kmeans(). Passing the whole `iris` data frame would also hand the `variety`
# label column to clusplot(), even though it was not a clustering feature.
clusplot(iris[, 1:4], k_means_cluste$cluster)

iris
## Classification: SVM
library(e1071) # svm
library(caret) # evaluation
library(lattice)

# createDataPartition() draws the training rows at random. Seed the RNG so
# the train/test split, the fitted model and the reported metrics are
# reproducible between runs.
set.seed(123)

# Hold out roughly 20% of the rows for testing; `list = FALSE` returns the
# selected row indices as a matrix usable for row subsetting.
train_idx = createDataPartition(iris$variety, p = 0.8, list = FALSE)
train_data = iris[train_idx, ]
test_data = iris[-train_idx, ]

# Linear-kernel SVM on the four measurements. The response is wrapped in
# as.factor() so that svm() fits a classifier rather than a regression.
svm_model = svm(
  as.factor(variety) ~ sepal.length + sepal.width + petal.length + petal.width,
  data = train_data,
  kernel = "linear"
)

# Predicted variety for every held-out row.
pred = predict(svm_model, test_data)
pred
2 4 9 12 20 21 27 35 38 46
Setosa Setosa Setosa Setosa Setosa Setosa Setosa Setosa Setosa Setosa
64 68 70 71 74 85 87 94 99 100
Versicolor Versicolor Versicolor Virginica Versicolor Versicolor Versicolor Versicolor Versicolor Versicolor
112 115 126 127 130 132 136 138 139 150
Virginica Virginica Virginica Virginica Virginica Virginica Virginica Virginica Virginica Virginica
Levels: Setosa Versicolor Virginica
# The held-out rows the predictions were made for.
test_data

# Cross-tabulate predictions against the true labels and compute accuracy and
# per-class statistics.
conf_mat = confusionMatrix(
  data = pred,
  reference = as.factor(test_data$variety)
)
conf_mat
Confusion Matrix and Statistics
Reference
Prediction Setosa Versicolor Virginica
Setosa 10 0 0
Versicolor 0 9 0
Virginica 0 1 10
Overall Statistics
Accuracy : 0.9667
95% CI : (0.8278, 0.9992)
No Information Rate : 0.3333
P-Value [Acc > NIR] : 2.963e-13
Kappa : 0.95
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: Setosa Class: Versicolor Class: Virginica
Sensitivity 1.0000 0.9000 1.0000
Specificity 1.0000 1.0000 0.9500
Pos Pred Value 1.0000 1.0000 0.9091
Neg Pred Value 1.0000 0.9524 1.0000
Prevalence 0.3333 0.3333 0.3333
Detection Rate 0.3333 0.3000 0.3333
Detection Prevalence 0.3333 0.3000 0.3667
Balanced Accuracy 1.0000 0.9500 0.9750
# First held-out row: shown in full, then without column 5, then classified.
test_data[1, ]
test_data[1, -5]
predict(svm_model, newdata = test_data[1, -5])
2
Setosa
Levels: Setosa Versicolor Virginica
# Classify row 88 of `iris` with column 5 dropped (presumably the `variety`
# label; confirm against the CSV).
# NOTE(review): the variable name `sample` is the same as base R's sample()
# function; calls to sample() still resolve to the function.
sample = iris[88, -5]
sample

predict(svm_model, newdata = sample)
88
Versicolor
Levels: Setosa Versicolor Virginica
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKYGBge3J9CmlyaXMgPSByZWFkLmNzdigiaXJpcy5jc3YiKQppcmlzCmBgYAoKCiMjIEhpc3RvZ3JhbSBwbG90CmBgYHtyfQpsaWJyYXJ5KGdncGxvdDIpCgpnZ3Bsb3QoaXJpcywgYWVzKHggPSBzZXBhbC5sZW5ndGgpKSArIAogIGdlb21faGlzdG9ncmFtKGJpbndpZHRoID0gMC4xLCBmaWxsID0gImxpZ2h0Ymx1ZSIsIGNvbG9yID0gImJsYWNrIiwgYWxwaGEgPSAwLjcpICsKICBsYWJzKHRpdGxlID0gIkhpc3RvZ3JhbSBvZiBTZXBhbCBMZW5ndGgiLCB4ID0gIlNlcGFsIExlbmd0aCAoY20pIiwgeSA9ICJGcmVxdWVuY3kiKSArCiAgI3hsaW0oMCwgOCkrCiAgdGhlbWVfbWluaW1hbCgpCiAgCmBgYAoKYGBge3J9CmdncGxvdChpcmlzLCBhZXMoeCA9IHNlcGFsLmxlbmd0aCkpICsgCiAgZ2VvbV9oaXN0b2dyYW0oYmlud2lkdGggPSAwLjMsIGZpbGwgPSAibGlnaHRibHVlIiwgY29sb3IgPSAiYmxhY2siLCBhbHBoYSA9IDAuNykgKwogIGxhYnModGl0bGUgPSAiSGlzdG9ncmFtIG9mIFNlcGFsIExlbmd0aCIsIHggPSAiU2VwYWwgTGVuZ3RoIChjbSkiLCB5ID0gIkZyZXF1ZW5jeSIpICsKICAjeGxpbSgwLCA4KSsKICB0aGVtZV9taW5pbWFsKCkKICAKYGBgCgpgYGB7cn0KP2xhYnMKYGBgCgoKCiMjIENvcnJlbGF0aW9uIE1hdHJpeApgYGB7cn0KCiNjb3JfbWF0ID0gY29yKGlyaXNbICwgYygxLCAyLCAzLCA0KV0pCmNvcl9tYXQgPSBjb3IoaXJpc1sgLCAxOjRdKQpjb3JfbWF0CmBgYAoKIyMgSGVhdCBtYXAKYGBge3J9CmxpYnJhcnkoZ2djb3JycGxvdCkKCmdnY29ycnBsb3QoY29yX21hdCkKCmBgYAoKYGBge3J9CmdnY29ycnBsb3QoY29yX21hdCwgdHlwZSA9ICdsb3dlcicpCmBgYAoKYGBge3J9CmdnY29ycnBsb3QoY29yX21hdCwgdHlwZSA9ICd1cHBlcicpCmBgYAoKYGBge3J9CmdnY29ycnBsb3QoY29yX21hdCwgCiAgICAgICAgICAgdHlwZSA9ICdsb3dlcicsCiAgICAgICAgICAgY29sb3JzID0gYygiZ3JlZW4iLCAnd2hpdGUnLCAicGluayIpKQpgYGAKCgoKYGBge3IsIGVjaG89RkFMU0V9CmdnY29ycnBsb3QoY29yX21hdCwgCiAgICAgICAgICAgdHlwZSA9ICdsb3dlcicsCiAgICAgICAgICAgY29sb3JzID0gYygiZ3JlZW4iLCAnd2hpdGUnLCAicGluayIpLAogICAgICAgICAgIGxhYiA9IFRSVUUpCmBgYAoKCgpgYGB7cn0KbGlicmFyeShHR2FsbHkpCmdncGFpcnMoaXJpcywgYWVzKGNvbG91ciA9IHZhcmlldHkpKQpgYGAKCiMjIEludGVyYWN0aXZlIHBsb3QKCmBgYHtyLCBtZXNzYWdlPUZBTFNFfQpsaWJyYXJ5KHBsb3RseSkKCnBsb3RfbHkoZGF0YSA9IGlyaXMsIHggPSB+c2VwYWwubGVuZ3RoLCB5ID0gfnNlcGFsLndpZHRoLCB6PSB+cGV0YWwubGVuZ3RoLCBjb2xvciA9IH52YXJpZXR5LCB0eXBlID0gInNjYXR0ZXIzZCIpCmBgYAoKCmBgYHtyfQpwbG90X2x5KGRhdGEgPSBpcmlzLCB5PSB+c2VwYWwubGVuZ3RoLCB0eXBlPSJ2
aW9saW4iLCB4ID0gfnZhcmlldHkpCmBgYAoKCmBgYHtyfQpwbG90X2x5KGRhdGEgPSBpcmlzLCB4ID0gfnNlcGFsLmxlbmd0aCwgeT0gfnZhcmlldHksIHR5cGUgPSAnYm94JykKCmBgYAoKYGBge3J9CnBsb3RfbHkoZGF0YSA9IGlyaXMsIHkgPSB+c2VwYWwubGVuZ3RoLCB4PSB+dmFyaWV0eSwgdHlwZSA9ICdib3gnKQpgYGAKCgoKCiMjIFBDQQpgYGB7cn0KbGlicmFyeShzdGF0cykKCmlyaXNfcGNhID0gcHJjb21wKGlyaXNbLCAxOjRdLCBjZW50ZXIgPSBUUlVFLCBzY2FsZSA9IFRSVUUpCnN1bW1hcnkoaXJpc19wY2EpCmBgYAoKYGBge3J9CmlyaXNfcGNhCmBgYAoKYGBge3J9CnBjX3Njb3JlID0gYXMuZGF0YS5mcmFtZShpcmlzX3BjYSR4WyAsIGMoMSwgMildKQpwY19zY29yZQpgYGAKCgpgYGB7cn0KcGNfZGF0YSA9IGNiaW5kKHBjX3Njb3JlLCB2YXJpZXR5ID0gaXJpcyR2YXJpZXR5KQpwY19kYXRhCmBgYAoKYGBge3J9CmxpYnJhcnkoZ2dwbG90MikKZ2dwbG90KHBjX2RhdGEsIGFlcyhQQzEsIFBDMiwgY29sb3IgPSB2YXJpZXR5KSkgKyAKICBnZW9tX3BvaW50KCkgKyAKICB0aGVtZV9taW5pbWFsKCkKYGBgCgoKYGBge3J9CmdnID0gZ2dwbG90KHBjX2RhdGEsIGFlcyhQQzEsIFBDMiwgY29sb3IgPSB2YXJpZXR5KSkgKyAKICBnZW9tX3BvaW50KCkgKyAKICB0aGVtZV9taW5pbWFsKCkKZ2cKCmdncGxvdGx5KGdnKQoKCmBgYAoKCmBgYHtyfQpsaWJyYXJ5KGZhY3RvZXh0cmEpCmZ2aXpfZWlnKGlyaXNfcGNhKQpgYGAKCgoKYGBge3J9CmZ2aXpfcGNhX3ZhcihpcmlzX3BjYSwgY29sLnZhciA9ICJjb250cmliIiwgZ3JhZGllbnQuY29scyA9IGMoInJlZCIsICdibGFjaycsICdncmVlbicpKQpgYGAKCmBgYHtyfQpmdml6X3BjYV9pbmQoaXJpc19wY2EpCmBgYAoKYGBge3J9CmZ2aXpfcGNhX2luZChpcmlzX3BjYSwgY29sLmluZCA9IGlyaXMkdmFyaWV0eSkKYGBgCgpgYGB7cn0KZnZpel9wY2FfaW5kKGlyaXNfcGNhLCBjb2wuaW5kID0gaXJpcyR2YXJpZXR5LCBhZGRFbGxpcHNlcyA9IFRSVUUpCmBgYAoKYGBge3J9CmZ2aXpfcGNhX2luZChpcmlzX3BjYSwgY29sLmluZCA9IGlyaXMkdmFyaWV0eSwgYWRkRWxsaXBzZXMgPSBUUlVFLCBnZW9tLmluZCA9ICJwb2ludCIpCmBgYAoKIyMgUmVncmVzc2lvbgoKYGBge3J9CgpnZ3Bsb3QoaXJpcywgYWVzKHggPSBwZXRhbC5sZW5ndGgsIHk9IHNlcGFsLmxlbmd0aCwgY29sb3IgPSB2YXJpZXR5KSkgKwogIGdlb21fcG9pbnQoKSArIAogIGdlb21fc21vb3RoKG1ldGhvZCA9ICJsbSIsIHNlID0gVFJVRSwgY29sb3IgPSAncHVycGxlJywgbGV2ZWw9MC45NSkKCmBgYAoKYGBge3J9CmxtX21vZGVsID0gbG0ocGV0YWwubGVuZ3RoIH4gc2VwYWwubGVuZ3RoLCBkYXRhPWlyaXMpCnN1bW1hcnkobG1fbW9kZWwpCmBgYAoKIyMjIFBvbHlub21pYWwgcmVncmVzc2lvbgoKYGBge3J9CmdncGxvdChpcmlzLCBhZXMoeCA9IHBldGFsLmxlbmd0aCwgeT0gc2VwYWwubGVuZ3RoLCBjb2xvciA9IHZh
cmlldHkpKSArCiAgZ2VvbV9wb2ludCgpICsgCiAgZ2VvbV9zbW9vdGgobWV0aG9kID0gImxtIiwgZm9ybXVsYT0geX5wb2x5KHgsMiksIHNlID0gVFJVRSwgY29sb3IgPSAncHVycGxlJywgbGV2ZWw9MC45NSkKYGBgCgojIyBLLW1lYW5zIGNsdXN0ZXIKCmBgYHtyfQppcmlzWywgMTo0XQpgYGAKCmBgYHtyfQpsaWJyYXJ5KHN0YXRzKQoKa19tZWFuc19jbHVzdGUgPSBrbWVhbnMoaXJpc1ssIDE6NF0sIGNlbnRlcnMgPSAzKQprX21lYW5zX2NsdXN0ZQpgYGAKCmBgYHtyfQpsaWJyYXJ5KGNsdXN0ZXIpCmNsdXNwbG90KGlyaXMsIGtfbWVhbnNfY2x1c3RlJGNsdXN0ZXIpCmBgYAoKYGBge3J9CmlyaXMKYGBgCgojIyBDbGFzc2lmaWNhdGlvbjogU1ZNCmBgYHtyfQpsaWJyYXJ5KGUxMDcxKSAjc3ZtCmxpYnJhcnkoY2FyZXQpICNldmFsdWF0aW9uCmxpYnJhcnkobGF0dGljZSkKCnRyYWluX2lkeCA9IGNyZWF0ZURhdGFQYXJ0aXRpb24oaXJpcyR2YXJpZXR5LCBwPTAuOCwgbGlzdCA9IEZBTFNFKQp0cmFpbl9kYXRhID0gaXJpc1t0cmFpbl9pZHgsIF0KdGVzdF9kYXRhID0gaXJpc1stdHJhaW5faWR4LCBdCgoKc3ZtX21vZGVsID0gc3ZtKGFzLmZhY3Rvcih2YXJpZXR5KSB+IHNlcGFsLmxlbmd0aCtzZXBhbC53aWR0aCtwZXRhbC5sZW5ndGgrcGV0YWwud2lkdGgsIGRhdGEgPSB0cmFpbl9kYXRhLCBrZXJuZWwgPSAibGluZWFyIikKCnByZWQgPSBwcmVkaWN0KHN2bV9tb2RlbCwgdGVzdF9kYXRhKQpwcmVkCmBgYApgYGB7cn0KdGVzdF9kYXRhCmBgYAoKYGBge3J9CmNvbmZfbWF0ID0gY29uZnVzaW9uTWF0cml4KHByZWQsIGFzLmZhY3Rvcih0ZXN0X2RhdGEkdmFyaWV0eSkpCmNvbmZfbWF0CmBgYApgYGB7cn0KdGVzdF9kYXRhWzEsXQp0ZXN0X2RhdGFbMSwtNV0KcHJlZGljdChzdm1fbW9kZWwsIHRlc3RfZGF0YVsxLC01XSkKYGBgCgoKYGBge3J9CnNhbXBsZSA9IGlyaXNbODgsIC01XQpzYW1wbGUKCnByZWRpY3Qoc3ZtX21vZGVsLCBzYW1wbGUpCmBgYAoKCgoKCg==