# Read the iris measurements from "iris.csv" (path is relative to the current
# working directory) and display the resulting data frame.
# Note: this global `iris` masks the built-in datasets::iris for the session.
iris = read.csv(file = "iris.csv")
iris

Histogram plot

library(ggplot2)

# Histogram of sepal length with narrow bins (width 0.1).
ggplot(data = iris, mapping = aes(x = sepal.length)) +
  geom_histogram(
    binwidth = 0.1,
    fill = "lightblue",
    color = "black",
    alpha = 0.7
  ) +
  labs(
    title = "Histogram of Sepal Length",
    x = "Sepal Length (cm)",
    y = "Frequency"
  ) +
  # xlim(0, 8) +  # disabled: would pin the x-axis range to [0, 8]
  theme_minimal()

NA
# Same histogram of sepal length, redrawn with wider bins (width 0.3).
ggplot(data = iris, mapping = aes(x = sepal.length)) +
  geom_histogram(
    binwidth = 0.3,
    fill = "lightblue",
    color = "black",
    alpha = 0.7
  ) +
  labs(
    title = "Histogram of Sepal Length",
    x = "Sepal Length (cm)",
    y = "Frequency"
  ) +
  # xlim(0, 8) +  # disabled: would pin the x-axis range to [0, 8]
  theme_minimal()

NA
# Open the help page for labs() (interactive documentation lookup).
help("labs")

Correlation Matrix


# Pairwise correlation matrix of the first four columns of iris
# (the same selection could be written as iris[, c(1, 2, 3, 4)]).
cor_mat = cor(x = iris[, seq_len(4)])
cor_mat
             sepal.length sepal.width petal.length petal.width
sepal.length    1.0000000  -0.1175698    0.8717538   0.8179411
sepal.width    -0.1175698   1.0000000   -0.4284401  -0.3661259
petal.length    0.8717538  -0.4284401    1.0000000   0.9628654
petal.width     0.8179411  -0.3661259    0.9628654   1.0000000

Heat map

library(ggcorrplot)

# Heat map of the full correlation matrix with ggcorrplot's default settings.
ggcorrplot(corr = cor_mat)

# Lower triangle only.
ggcorrplot(corr = cor_mat, type = "lower")

# Upper triangle only.
ggcorrplot(corr = cor_mat, type = "upper")

# Lower triangle again, with a custom three-colour scale.
ggcorrplot(
  corr = cor_mat,
  type = "lower",
  colors = c("green", "white", "pink")
)

library(GGally)
# Pairs plot of the iris columns, coloured by variety.
ggpairs(data = iris, mapping = aes(colour = variety))

Interactive plot

library(plotly)

# Interactive 3-D scatter: sepal length vs sepal width vs petal length,
# coloured by variety.
plot_ly(
  data = iris,
  x = ~sepal.length,
  y = ~sepal.width,
  z = ~petal.length,
  color = ~variety,
  type = "scatter3d"
)

# Violin plot of sepal length for each variety.
plot_ly(data = iris, x = ~variety, y = ~sepal.length, type = "violin")

# Box plot with sepal length on the x-axis and variety on the y-axis.
plot_ly(data = iris, x = ~sepal.length, y = ~variety, type = "box")
NA
# Box plot with the axes swapped: variety on the x-axis, sepal length on the y-axis.
plot_ly(data = iris, x = ~variety, y = ~sepal.length, type = "box")

PCA

library(stats)

# Principal component analysis of the four measurement columns (columns 1:4).
# Each variable is centred and scaled to unit variance before the rotation.
# prcomp()'s scaling argument is named `scale.` (with a trailing dot); it is
# spelled out exactly so the call does not depend on partial argument matching.
iris_pca = prcomp(iris[, 1:4], center = TRUE, scale. = TRUE)

# Standard deviation and (cumulative) proportion of variance per component.
summary(iris_pca)
Importance of components:
                          PC1    PC2     PC3     PC4
Standard deviation     1.7084 0.9560 0.38309 0.14393
Proportion of Variance 0.7296 0.2285 0.03669 0.00518
Cumulative Proportion  0.7296 0.9581 0.99482 1.00000
# Print the fitted PCA object: component standard deviations and the rotation matrix.
iris_pca
Standard deviations (1, .., p=4):
[1] 1.7083611 0.9560494 0.3830886 0.1439265

Rotation (n x k) = (4 x 4):
                    PC1         PC2        PC3        PC4
sepal.length  0.5210659 -0.37741762  0.7195664  0.2612863
sepal.width  -0.2693474 -0.92329566 -0.2443818 -0.1235096
petal.length  0.5804131 -0.02449161 -0.1421264 -0.8014492
petal.width   0.5648565 -0.06694199 -0.6342727  0.5235971
# Keep the scores on the first two principal components as a data frame.
pc_score = as.data.frame(iris_pca$x[, 1:2])
pc_score

# Attach each observation's variety so the scores can be coloured by it.
pc_data = cbind(pc_score, variety = iris$variety)
pc_data
library(ggplot2)
# Static scatter plot of the first two PC scores, coloured by variety.
ggplot(data = pc_data, mapping = aes(x = PC1, y = PC2, color = variety)) +
  geom_point() +
  theme_minimal()

# Build the same plot again, this time storing it in `gg` so it can be reused.
gg = ggplot(data = pc_data, mapping = aes(x = PC1, y = PC2, color = variety)) +
  geom_point() +
  theme_minimal()
gg

# Convert the stored ggplot into an interactive plotly widget.
ggplotly(gg)
NA
NA
library(factoextra)
# Scree plot of the PCA eigenvalues.
fviz_eig(iris_pca)

# Variable plot, coloured by each variable's contribution on a
# red / black / green gradient.
fviz_pca_var(
  iris_pca,
  col.var = "contrib",
  gradient.cols = c("red", "black", "green")
)

# Individuals plot with default styling.
fviz_pca_ind(iris_pca)

# Individuals coloured by variety.
fviz_pca_ind(iris_pca, col.ind = iris$variety)

# ... with an ellipse drawn around each variety.
fviz_pca_ind(iris_pca, col.ind = iris$variety, addEllipses = TRUE)

# ... and drawn as points only.
fviz_pca_ind(
  iris_pca,
  col.ind = iris$variety,
  addEllipses = TRUE,
  geom.ind = "point"
)

Regression


# Scatter of sepal length (y) against petal length (x), points coloured by
# variety, overlaid with a purple linear fit and its 95% confidence band.
ggplot(
  data = iris,
  mapping = aes(x = petal.length, y = sepal.length, color = variety)
) +
  geom_point() +
  geom_smooth(method = "lm", level = 0.95, se = TRUE, color = "purple")

# Simple linear regression with petal.length as the response and
# sepal.length as the single predictor, followed by its summary table.
# NOTE(review): the scatter plot above puts petal.length on x and sepal.length
# on y, i.e. the reverse of this model's direction — confirm which is intended.
lm_model = lm(formula = petal.length ~ sepal.length, data = iris)
summary(lm_model)

Call:
lm(formula = petal.length ~ sepal.length, data = iris)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.47747 -0.59072 -0.00668  0.60484  2.49512 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -7.10144    0.50666  -14.02   <2e-16 ***
sepal.length  1.85843    0.08586   21.65   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.8678 on 148 degrees of freedom
Multiple R-squared:   0.76, Adjusted R-squared:  0.7583 
F-statistic: 468.6 on 1 and 148 DF,  p-value: < 2.2e-16

Polynomial regression

# Same scatter as the linear case, but the overlaid fit is a degree-2
# polynomial in x (y ~ poly(x, 2)), drawn in purple with a 95% confidence band.
ggplot(
  data = iris,
  mapping = aes(x = petal.length, y = sepal.length, color = variety)
) +
  geom_point() +
  geom_smooth(
    method = "lm",
    formula = y ~ poly(x, 2),
    level = 0.95,
    se = TRUE,
    color = "purple"
  )

K-means cluster

# Show the four measurement columns that are clustered below.
iris[, 1:4]
library(stats)

# k-means starts from randomly chosen centres, so a single unseeded start can
# return a different (and occasionally worse) partition on every run.
# Fix the RNG seed and keep the best of 25 random starts so that the clustering
# is reproducible. Cluster numbering is arbitrary and may differ between seeds.
set.seed(42)
k_means_cluste = kmeans(iris[, 1:4], centers = 3, nstart = 25)
k_means_cluste
K-means clustering with 3 clusters of sizes 62, 38, 50

Cluster means:
  sepal.length sepal.width petal.length petal.width
1     5.901613    2.748387     4.393548    1.433871
2     6.850000    3.073684     5.742105    2.071053
3     5.006000    3.428000     1.462000    0.246000

Clustering vector:
  [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 2 1 1 1
 [57] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2 2 1 2 2 2 2 2
[113] 2 1 1 2 2 2 2 1 2 1 2 1 2 2 1 1 2 2 2 2 2 1 2 2 2 2 1 2 2 2 1 2 2 2 1 2 2 1

Within cluster sum of squares by cluster:
[1] 39.82097 23.87947 15.15100
 (between_SS / total_SS =  88.4 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss" "betweenss"    "size"        
[8] "iter"         "ifault"      
library(cluster)
# Draw the k-means clusters on a 2-D projection of the data.
# Only the four measurement columns that kmeans() was fitted on are passed:
# handing clusplot() the whole data frame would also feed the `variety` label
# column into the projection, which the clustering itself never saw.
clusplot(iris[, 1:4], k_means_cluste$cluster)

# Display the full iris data frame again (all columns, including variety).
iris

Classification: SVM

library(e1071) #svm
library(caret) #evaluation
library(lattice)

# Split the rows into training and test sets: createDataPartition() is asked
# for p = 0.8 of the rows (as a matrix of row indices), the rest form the test set.
# NOTE(review): no RNG seed is set, so the split — and every result derived
# from it below — changes from run to run.
train_idx = createDataPartition(y = iris$variety, p = 0.8, list = FALSE)
test_data = iris[-train_idx, ]
train_data = iris[train_idx, ]

# Linear-kernel SVM that classifies variety from the four measurements.
svm_model = svm(
  as.factor(variety) ~ sepal.length + sepal.width + petal.length + petal.width,
  data = train_data,
  kernel = "linear"
)

# Predicted variety for each held-out row.
pred = predict(svm_model, newdata = test_data)
pred
         2          4          9         12         20         21         27         35         38         46 
    Setosa     Setosa     Setosa     Setosa     Setosa     Setosa     Setosa     Setosa     Setosa     Setosa 
        64         68         70         71         74         85         87         94         99        100 
Versicolor Versicolor Versicolor  Virginica Versicolor Versicolor Versicolor Versicolor Versicolor Versicolor 
       112        115        126        127        130        132        136        138        139        150 
 Virginica  Virginica  Virginica  Virginica  Virginica  Virginica  Virginica  Virginica  Virginica  Virginica 
Levels: Setosa Versicolor Virginica
# Display the held-out test rows that the predictions above refer to.
test_data
# Compare the predicted varieties with the true test labels and print the
# confusion matrix together with caret's accuracy statistics.
conf_mat = confusionMatrix(
  data = pred,
  reference = as.factor(test_data$variety)
)
conf_mat
Confusion Matrix and Statistics

            Reference
Prediction   Setosa Versicolor Virginica
  Setosa         10          0         0
  Versicolor      0          9         0
  Virginica       0          1        10

Overall Statistics
                                          
               Accuracy : 0.9667          
                 95% CI : (0.8278, 0.9992)
    No Information Rate : 0.3333          
    P-Value [Acc > NIR] : 2.963e-13       
                                          
                  Kappa : 0.95            
                                          
 Mcnemar's Test P-Value : NA              

Statistics by Class:

                     Class: Setosa Class: Versicolor Class: Virginica
Sensitivity                 1.0000            0.9000           1.0000
Specificity                 1.0000            1.0000           0.9500
Pos Pred Value              1.0000            1.0000           0.9091
Neg Pred Value              1.0000            0.9524           1.0000
Prevalence                  0.3333            0.3333           0.3333
Detection Rate              0.3333            0.3000           0.3333
Detection Prevalence        0.3333            0.3000           0.3667
Balanced Accuracy           1.0000            0.9500           0.9750
# Inspect the first test row, first in full and then without column 5
# (presumably the variety label), and classify that single observation.
test_data[1, ]
test_data[1, -5]
predict(svm_model, newdata = test_data[1, -5])
     2 
Setosa 
Levels: Setosa Versicolor Virginica
# Take row 88 of iris without column 5 (presumably the variety label) and
# classify it with the fitted SVM.
# NOTE(review): row 88 may be part of the training set, so this is not
# necessarily an out-of-sample prediction.
sample = iris[88, -5]
sample

predict(svm_model, newdata = sample)
        88 
Versicolor 
Levels: Setosa Versicolor Virginica
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKYGBge3J9CmlyaXMgPSByZWFkLmNzdigiaXJpcy5jc3YiKQppcmlzCmBgYAoKCiMjIEhpc3RvZ3JhbSBwbG90CmBgYHtyfQpsaWJyYXJ5KGdncGxvdDIpCgpnZ3Bsb3QoaXJpcywgYWVzKHggPSBzZXBhbC5sZW5ndGgpKSArIAogIGdlb21faGlzdG9ncmFtKGJpbndpZHRoID0gMC4xLCBmaWxsID0gImxpZ2h0Ymx1ZSIsIGNvbG9yID0gImJsYWNrIiwgYWxwaGEgPSAwLjcpICsKICBsYWJzKHRpdGxlID0gIkhpc3RvZ3JhbSBvZiBTZXBhbCBMZW5ndGgiLCB4ID0gIlNlcGFsIExlbmd0aCAoY20pIiwgeSA9ICJGcmVxdWVuY3kiKSArCiAgI3hsaW0oMCwgOCkrCiAgdGhlbWVfbWluaW1hbCgpCiAgCmBgYAoKYGBge3J9CmdncGxvdChpcmlzLCBhZXMoeCA9IHNlcGFsLmxlbmd0aCkpICsgCiAgZ2VvbV9oaXN0b2dyYW0oYmlud2lkdGggPSAwLjMsIGZpbGwgPSAibGlnaHRibHVlIiwgY29sb3IgPSAiYmxhY2siLCBhbHBoYSA9IDAuNykgKwogIGxhYnModGl0bGUgPSAiSGlzdG9ncmFtIG9mIFNlcGFsIExlbmd0aCIsIHggPSAiU2VwYWwgTGVuZ3RoIChjbSkiLCB5ID0gIkZyZXF1ZW5jeSIpICsKICAjeGxpbSgwLCA4KSsKICB0aGVtZV9taW5pbWFsKCkKICAKYGBgCgpgYGB7cn0KP2xhYnMKYGBgCgoKCiMjIENvcnJlbGF0aW9uIE1hdHJpeApgYGB7cn0KCiNjb3JfbWF0ID0gY29yKGlyaXNbICwgYygxLCAyLCAzLCA0KV0pCmNvcl9tYXQgPSBjb3IoaXJpc1sgLCAxOjRdKQpjb3JfbWF0CmBgYAoKIyMgSGVhdCBtYXAKYGBge3J9CmxpYnJhcnkoZ2djb3JycGxvdCkKCmdnY29ycnBsb3QoY29yX21hdCkKCmBgYAoKYGBge3J9CmdnY29ycnBsb3QoY29yX21hdCwgdHlwZSA9ICdsb3dlcicpCmBgYAoKYGBge3J9CmdnY29ycnBsb3QoY29yX21hdCwgdHlwZSA9ICd1cHBlcicpCmBgYAoKYGBge3J9CmdnY29ycnBsb3QoY29yX21hdCwgCiAgICAgICAgICAgdHlwZSA9ICdsb3dlcicsCiAgICAgICAgICAgY29sb3JzID0gYygiZ3JlZW4iLCAnd2hpdGUnLCAicGluayIpKQpgYGAKCgoKYGBge3IsIGVjaG89RkFMU0V9CmdnY29ycnBsb3QoY29yX21hdCwgCiAgICAgICAgICAgdHlwZSA9ICdsb3dlcicsCiAgICAgICAgICAgY29sb3JzID0gYygiZ3JlZW4iLCAnd2hpdGUnLCAicGluayIpLAogICAgICAgICAgIGxhYiA9IFRSVUUpCmBgYAoKCgpgYGB7cn0KbGlicmFyeShHR2FsbHkpCmdncGFpcnMoaXJpcywgYWVzKGNvbG91ciA9IHZhcmlldHkpKQpgYGAKCiMjIEludGVyYWN0aXZlIHBsb3QKCmBgYHtyLCBtZXNzYWdlPUZBTFNFfQpsaWJyYXJ5KHBsb3RseSkKCnBsb3RfbHkoZGF0YSA9IGlyaXMsIHggPSB+c2VwYWwubGVuZ3RoLCB5ID0gfnNlcGFsLndpZHRoLCB6PSB+cGV0YWwubGVuZ3RoLCBjb2xvciA9IH52YXJpZXR5LCB0eXBlID0gInNjYXR0ZXIzZCIpCmBgYAoKCmBgYHtyfQpwbG90X2x5KGRhdGEgPSBpcmlzLCB5PSB+c2VwYWwubGVuZ3RoLCB0eXBlPSJ2
aW9saW4iLCB4ID0gfnZhcmlldHkpCmBgYAoKCmBgYHtyfQpwbG90X2x5KGRhdGEgPSBpcmlzLCB4ID0gfnNlcGFsLmxlbmd0aCwgeT0gfnZhcmlldHksIHR5cGUgPSAnYm94JykKCmBgYAoKYGBge3J9CnBsb3RfbHkoZGF0YSA9IGlyaXMsIHkgPSB+c2VwYWwubGVuZ3RoLCB4PSB+dmFyaWV0eSwgdHlwZSA9ICdib3gnKQpgYGAKCgoKCiMjIFBDQQpgYGB7cn0KbGlicmFyeShzdGF0cykKCmlyaXNfcGNhID0gcHJjb21wKGlyaXNbLCAxOjRdLCBjZW50ZXIgPSBUUlVFLCBzY2FsZSA9IFRSVUUpCnN1bW1hcnkoaXJpc19wY2EpCmBgYAoKYGBge3J9CmlyaXNfcGNhCmBgYAoKYGBge3J9CnBjX3Njb3JlID0gYXMuZGF0YS5mcmFtZShpcmlzX3BjYSR4WyAsIGMoMSwgMildKQpwY19zY29yZQpgYGAKCgpgYGB7cn0KcGNfZGF0YSA9IGNiaW5kKHBjX3Njb3JlLCB2YXJpZXR5ID0gaXJpcyR2YXJpZXR5KQpwY19kYXRhCmBgYAoKYGBge3J9CmxpYnJhcnkoZ2dwbG90MikKZ2dwbG90KHBjX2RhdGEsIGFlcyhQQzEsIFBDMiwgY29sb3IgPSB2YXJpZXR5KSkgKyAKICBnZW9tX3BvaW50KCkgKyAKICB0aGVtZV9taW5pbWFsKCkKYGBgCgoKYGBge3J9CmdnID0gZ2dwbG90KHBjX2RhdGEsIGFlcyhQQzEsIFBDMiwgY29sb3IgPSB2YXJpZXR5KSkgKyAKICBnZW9tX3BvaW50KCkgKyAKICB0aGVtZV9taW5pbWFsKCkKZ2cKCmdncGxvdGx5KGdnKQoKCmBgYAoKCmBgYHtyfQpsaWJyYXJ5KGZhY3RvZXh0cmEpCmZ2aXpfZWlnKGlyaXNfcGNhKQpgYGAKCgoKYGBge3J9CmZ2aXpfcGNhX3ZhcihpcmlzX3BjYSwgY29sLnZhciA9ICJjb250cmliIiwgZ3JhZGllbnQuY29scyA9IGMoInJlZCIsICdibGFjaycsICdncmVlbicpKQpgYGAKCmBgYHtyfQpmdml6X3BjYV9pbmQoaXJpc19wY2EpCmBgYAoKYGBge3J9CmZ2aXpfcGNhX2luZChpcmlzX3BjYSwgY29sLmluZCA9IGlyaXMkdmFyaWV0eSkKYGBgCgpgYGB7cn0KZnZpel9wY2FfaW5kKGlyaXNfcGNhLCBjb2wuaW5kID0gaXJpcyR2YXJpZXR5LCBhZGRFbGxpcHNlcyA9IFRSVUUpCmBgYAoKYGBge3J9CmZ2aXpfcGNhX2luZChpcmlzX3BjYSwgY29sLmluZCA9IGlyaXMkdmFyaWV0eSwgYWRkRWxsaXBzZXMgPSBUUlVFLCBnZW9tLmluZCA9ICJwb2ludCIpCmBgYAoKIyMgUmVncmVzc2lvbgoKYGBge3J9CgpnZ3Bsb3QoaXJpcywgYWVzKHggPSBwZXRhbC5sZW5ndGgsIHk9IHNlcGFsLmxlbmd0aCwgY29sb3IgPSB2YXJpZXR5KSkgKwogIGdlb21fcG9pbnQoKSArIAogIGdlb21fc21vb3RoKG1ldGhvZCA9ICJsbSIsIHNlID0gVFJVRSwgY29sb3IgPSAncHVycGxlJywgbGV2ZWw9MC45NSkKCmBgYAoKYGBge3J9CmxtX21vZGVsID0gbG0ocGV0YWwubGVuZ3RoIH4gc2VwYWwubGVuZ3RoLCBkYXRhPWlyaXMpCnN1bW1hcnkobG1fbW9kZWwpCmBgYAoKIyMjIFBvbHlub21pYWwgcmVncmVzc2lvbgoKYGBge3J9CmdncGxvdChpcmlzLCBhZXMoeCA9IHBldGFsLmxlbmd0aCwgeT0gc2VwYWwubGVuZ3RoLCBjb2xvciA9IHZh
cmlldHkpKSArCiAgZ2VvbV9wb2ludCgpICsgCiAgZ2VvbV9zbW9vdGgobWV0aG9kID0gImxtIiwgZm9ybXVsYT0geX5wb2x5KHgsMiksIHNlID0gVFJVRSwgY29sb3IgPSAncHVycGxlJywgbGV2ZWw9MC45NSkKYGBgCgojIyBLLW1lYW5zIGNsdXN0ZXIKCmBgYHtyfQppcmlzWywgMTo0XQpgYGAKCmBgYHtyfQpsaWJyYXJ5KHN0YXRzKQoKa19tZWFuc19jbHVzdGUgPSBrbWVhbnMoaXJpc1ssIDE6NF0sIGNlbnRlcnMgPSAzKQprX21lYW5zX2NsdXN0ZQpgYGAKCmBgYHtyfQpsaWJyYXJ5KGNsdXN0ZXIpCmNsdXNwbG90KGlyaXMsIGtfbWVhbnNfY2x1c3RlJGNsdXN0ZXIpCmBgYAoKYGBge3J9CmlyaXMKYGBgCgojIyBDbGFzc2lmaWNhdGlvbjogU1ZNCmBgYHtyfQpsaWJyYXJ5KGUxMDcxKSAjc3ZtCmxpYnJhcnkoY2FyZXQpICNldmFsdWF0aW9uCmxpYnJhcnkobGF0dGljZSkKCnRyYWluX2lkeCA9IGNyZWF0ZURhdGFQYXJ0aXRpb24oaXJpcyR2YXJpZXR5LCBwPTAuOCwgbGlzdCA9IEZBTFNFKQp0cmFpbl9kYXRhID0gaXJpc1t0cmFpbl9pZHgsIF0KdGVzdF9kYXRhID0gaXJpc1stdHJhaW5faWR4LCBdCgoKc3ZtX21vZGVsID0gc3ZtKGFzLmZhY3Rvcih2YXJpZXR5KSB+IHNlcGFsLmxlbmd0aCtzZXBhbC53aWR0aCtwZXRhbC5sZW5ndGgrcGV0YWwud2lkdGgsIGRhdGEgPSB0cmFpbl9kYXRhLCBrZXJuZWwgPSAibGluZWFyIikKCnByZWQgPSBwcmVkaWN0KHN2bV9tb2RlbCwgdGVzdF9kYXRhKQpwcmVkCmBgYApgYGB7cn0KdGVzdF9kYXRhCmBgYAoKYGBge3J9CmNvbmZfbWF0ID0gY29uZnVzaW9uTWF0cml4KHByZWQsIGFzLmZhY3Rvcih0ZXN0X2RhdGEkdmFyaWV0eSkpCmNvbmZfbWF0CmBgYApgYGB7cn0KdGVzdF9kYXRhWzEsXQp0ZXN0X2RhdGFbMSwtNV0KcHJlZGljdChzdm1fbW9kZWwsIHRlc3RfZGF0YVsxLC01XSkKYGBgCgoKYGBge3J9CnNhbXBsZSA9IGlyaXNbODgsIC01XQpzYW1wbGUKCnByZWRpY3Qoc3ZtX21vZGVsLCBzYW1wbGUpCmBgYAoKCgoKCg==