# Read the iris measurements from the CSV file in the working directory and
# display the resulting data frame. The chunks below refer to it as `iris`.
iris <- read.csv(file = "iris.csv")
iris
# ggplot2 has to be attached before the first ggplot() call; nothing earlier
# in this notebook loads it.
library(ggplot2)

# Box plot of sepal length for each variety, one filled box per class.
ggplot(data = iris, aes(x = variety, y = sepal.length, fill = variety)) +
  geom_boxplot() +
  labs(
    title = "This plot is created using ggplot",
    x = "Class",
    y = "Sepal Length",
    caption = "Source: Iris dataset"
  )

# Violin plot of sepal length for each variety, filled by variety.
ggplot(iris, aes(variety, sepal.length, fill = variety)) +
  geom_violin() +
  labs(
    x = "Class",
    y = "Sepal Length",
    title = "This plot is created using ggplot",
    caption = "Source: Iris dataset"
  )

# Per-column summary of the iris data frame.
summary(object = iris)
sepal.length sepal.width petal.length petal.width variety
Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100 Length:150
1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300 Class :character
Median :5.800 Median :3.000 Median :4.350 Median :1.300 Mode :character
Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
# Correlation between sepal length and sepal width.
with(iris, cor(sepal.length, sepal.width))
[1] -0.1175698
# Correlation of sepal length with itself.
with(iris, cor(sepal.length, sepal.length))
[1] 1
# Pairwise correlations between the first four columns of iris, then print
# the resulting matrix.
cor_matrix <- cor(x = iris[, 1:4])
cor_matrix
sepal.length sepal.width petal.length petal.width
sepal.length 1.0000000 -0.1175698 0.8717538 0.8179411
sepal.width -0.1175698 1.0000000 -0.4284401 -0.3661259
petal.length 0.8717538 -0.4284401 1.0000000 0.9628654
petal.width 0.8179411 -0.3661259 0.9628654 1.0000000
library(ggcorrplot)

# Correlation heat map with default settings.
ggcorrplot(corr = cor_matrix)

# Lower triangle only.
ggcorrplot(corr = cor_matrix, type = "lower")

# Upper triangle only.
ggcorrplot(corr = cor_matrix, type = "upper")

# Lower triangle with a custom three-colour scale.
ggcorrplot(
  corr = cor_matrix,
  type = "lower",
  colors = c("purple", "white", "red")
)

# Same settings as the previous plot, with `lab = TRUE` added.
ggcorrplot(
  corr = cor_matrix,
  type = "lower",
  colors = c("purple", "white", "red"),
  lab = TRUE
)

library(GGally)

# Pairs plot over the columns of iris, coloured by variety.
ggpairs(data = iris, mapping = aes(colour = variety))

Interactive plots
library(plotly)

# Interactive violin plot of sepal length, stored in `fig` and then displayed.
fig <- plot_ly(iris, y = ~sepal.length, type = "violin")
fig
library(plotly)

# Violin plot of sepal length, rendered directly without storing the widget.
iris %>%
  plot_ly(y = ~sepal.length, type = "violin")
NA
NA
NA
# Box plot of sepal length over all rows.
plot_ly(data = iris, y = ~sepal.length, type = "box")

# Histogram of sepal length.
plot_ly(data = iris, x = ~sepal.length, type = "histogram")

# Box plot of sepal length, one box per variety.
plot_ly(data = iris, x = ~variety, y = ~sepal.length, type = "box")
Principal Component Analysis (PCA)
# dimension = axis = features
library(stats)

# Principal component analysis on every column except the fifth (the class
# label), with centring and scaling switched on.
# The argument is spelled `scale.` in full: prcomp() has no argument named
# `scale`, so the shorter spelling only worked through partial matching.
iris_pca <- prcomp(iris[, -5], center = TRUE, scale. = TRUE)
iris_pca
Standard deviations (1, .., p=4):
[1] 1.7083611 0.9560494 0.3830886 0.1439265
Rotation (n x k) = (4 x 4):
PC1 PC2 PC3 PC4
sepal.length 0.5210659 -0.37741762 0.7195664 0.2612863
sepal.width -0.2693474 -0.92329566 -0.2443818 -0.1235096
petal.length 0.5804131 -0.02449161 -0.1421264 -0.8014492
petal.width 0.5648565 -0.06694199 -0.6342727 0.5235971
# Importance of each principal component of the fitted PCA.
summary(object = iris_pca)
Importance of components:
PC1 PC2 PC3 PC4
Standard deviation 1.7084 0.9560 0.38309 0.14393
Proportion of Variance 0.7296 0.2285 0.03669 0.00518
Cumulative Proportion 0.7296 0.9581 0.99482 1.00000
# Scores of every observation on the first two principal components.
pc_12 <- data.frame(iris_pca$x[, c("PC1", "PC2")])
head(pc_12)

# Add the class label as a `variety` column next to the scores.
pc_12_class <- cbind(pc_12, variety = iris[["variety"]])
pc_12_class
# Scatter plot of PC1 against PC2, coloured by variety.
ggplot(
  data = pc_12_class,
  mapping = aes(x = PC1, y = PC2, colour = variety)
) +
  geom_point() +
  theme_minimal()

library(factoextra)

# Eigenvalue plot for the fitted PCA, with `addlabels` switched on.
fviz_eig(
  iris_pca,
  addlabels = TRUE
)

# Variable plot for the fitted PCA, coloured by "contrib".
fviz_pca_var(iris_pca, col.var = "contrib")

library(corrplot)

# Extract the variable-level PCA results and plot their `cos2` component.
# NOTE(review): the variable is called `var`, which is also the name of
# stats::var(); consider a more specific name.
var <- get_pca_var(iris_pca)
corrplot(corr = var$cos2)

# Individuals plot for the fitted PCA: points only, coloured by variety,
# with `addEllipses` switched on.
fviz_pca_ind(
  iris_pca,
  col.ind = iris$variety,
  geom.ind = "point",
  addEllipses = TRUE
)

Regression
library(datasets)

# Load the iris data set via data(). From here on `iris` no longer refers to
# the data frame read from iris.csv: the columns are now named Sepal.Length,
# Petal.Length, Species, ... as used by the code below.
data(iris)

# Simple linear regression of sepal length on petal length.
lm_model <- lm(formula = Sepal.Length ~ Petal.Length, data = iris)
summary(lm_model)
Call:
lm(formula = Sepal.Length ~ Petal.Length, data = iris)
Residuals:
Min 1Q Median 3Q Max
-1.24675 -0.29657 -0.01515 0.27676 1.00269
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.30660 0.07839 54.94 <2e-16 ***
Petal.Length 0.40892 0.01889 21.65 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4071 on 148 degrees of freedom
Multiple R-squared: 0.76, Adjusted R-squared: 0.7583
F-statistic: 468.6 on 1 and 148 DF, p-value: < 2.2e-16
# Predictor (petal length) and response (sepal length) as plain vectors,
# followed by a base-graphics scatter plot of the two.
x <- iris[["Petal.Length"]]
y <- iris[["Sepal.Length"]]
plot(x, y)

# Fitted values of the regression for the observations it was trained on.
pred <- predict(lm_model)

# Permutation that puts x in ascending order (indices, not the actual values),
# so the fitted line is drawn from left to right. order() returns it directly,
# replacing sort(x, index.return = T)$ix and its reassignable `T` shorthand.
ix <- order(x)

plot(x, y)
lines(x[ix], pred[ix])

NA
NA
# Scatter plot with a single linear fit and its 95% band over all rows.
ggplot(data = iris, mapping = aes(x = Petal.Length, y = Sepal.Length)) +
  geom_point() +
  geom_smooth(method = "lm", level = 0.95)

# Same plot with the colour mapped to Species, giving one linear fit per
# species.
ggplot(
  data = iris,
  mapping = aes(x = Petal.Length, y = Sepal.Length, color = Species)
) +
  geom_point() +
  geom_smooth(method = "lm", level = 0.95)

# Per-species fit using a second-degree polynomial in x instead of a line.
ggplot(
  data = iris,
  mapping = aes(x = Petal.Length, y = Sepal.Length, color = Species)
) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ poly(x, 2), level = 0.95)

Clustering
# k-means with three clusters on the four measurement columns (1:4).
# NOTE(review): no seed is set before this call and kmeans() chooses its
# starting centres at random when `centers` is a number, so cluster numbering
# (and possibly the partition) can differ between runs. Consider set.seed().
kmeans_result <- kmeans(x = iris[, 1:4], centers = 3)
kmeans_result
K-means clustering with 3 clusters of sizes 50, 38, 62
Cluster means:
Sepal.Length Sepal.Width Petal.Length Petal.Width
1 5.006000 3.428000 1.462000 0.246000
2 6.850000 3.073684 5.742105 2.071053
3 5.901613 2.748387 4.393548 1.433871
Clustering vector:
[1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 2 3 3 3
[57] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 2 2 2 2 3 2 2 2 2 2
[113] 2 3 3 2 2 2 2 3 2 3 2 3 2 2 3 3 2 2 2 2 2 3 2 2 2 2 3 2 2 2 3 2 2 2 3 2 2 3
Within cluster sum of squares by cluster:
[1] 15.15100 23.87947 39.82097
(between_SS / total_SS = 88.4 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss" "betweenss" "size"
[8] "iter" "ifault"
library(cluster)

# Draw the k-means partition using the same four measurement columns that
# were clustered. Passing the whole data frame would also hand column 5
# (Species) to clusplot(), so the class label would take part in the
# projection even though it was not used for the clustering.
clusplot(iris[, 1:4], kmeans_result$cluster)

Classification
library(lattice)
library(e1071)
library(caret)

# Split on Species with p = 0.8: the selected row indices form the training
# set, all remaining rows form the test set.
# NOTE(review): no seed is set, so the split presumably differs between
# runs; confirm whether reproducibility is needed.
train_ix <- createDataPartition(iris$Species, p = 0.8, list = FALSE)
train_data <- iris[train_ix, ]
test_data <- iris[-train_ix, ]

train_data
test_data

# Linear-kernel SVM predicting Species from three of the measurements.
# NOTE(review): Petal.Width is not among the predictors; confirm this is
# intended.
svm_model <- svm(
  Species ~ Sepal.Length + Sepal.Width + Petal.Length,
  data = train_data,
  kernel = "linear"
)
# Show the 24th row of the test set, then predict its class from the same
# row with column 5 dropped.
test_data[24, ]
predict(object = svm_model, newdata = test_data[24, -5])
120
virginica
Levels: setosa versicolor virginica
# Predict the class of every test row and compare the predictions against
# the true Species labels.
predictions <- predict(object = svm_model, newdata = test_data)
conf_max <- confusionMatrix(
  data = predictions,
  reference = test_data$Species
)
conf_max
Confusion Matrix and Statistics
Reference
Prediction setosa versicolor virginica
setosa 10 0 0
versicolor 0 10 1
virginica 0 0 9
Overall Statistics
Accuracy : 0.9667
95% CI : (0.8278, 0.9992)
No Information Rate : 0.3333
P-Value [Acc > NIR] : 2.963e-13
Kappa : 0.95
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: setosa Class: versicolor Class: virginica
Sensitivity 1.0000 1.0000 0.9000
Specificity 1.0000 0.9500 1.0000
Pos Pred Value 1.0000 0.9091 1.0000
Neg Pred Value 1.0000 1.0000 0.9524
Prevalence 0.3333 0.3333 0.3333
Detection Rate 0.3333 0.3333 0.3000
Detection Prevalence 0.3333 0.3667 0.3000
Balanced Accuracy 1.0000 0.9750 0.9500
# Confusion-matrix table converted to a data frame; the plot below uses its
# Prediction, Reference and Freq columns.
cm <- as.data.frame(conf_max$table)

# Heat map of the counts: tile colour and text label both show Freq.
ggplot(
  data = cm,
  mapping = aes(x = Prediction, y = Reference, fill = Freq)
) +
  geom_tile() +
  geom_text(mapping = aes(label = Freq)) +
  scale_fill_gradient(low = "white", high = "skyblue")

LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKCgpgYGB7cn0KaXJpcyA9IHJlYWQuY3N2KCJpcmlzLmNzdiIpCmlyaXMKYGBgCgoKYGBge3J9CgpnZ3Bsb3QoZGF0YSA9IGlyaXMsIGFlcyh4PSB2YXJpZXR5LCB5ID0gc2VwYWwubGVuZ3RoLCBmaWxsID0gdmFyaWV0eSkpICsgCiAgZ2VvbV9ib3hwbG90KCkgKwogIGxhYnModGl0bGUgPSAiVGhpcyBwbG90IGlzIGNyZWF0ZWQgdXNpbmcgZ2dwbG90IiwKICAgICAgIHggPSAiQ2xhc3MiLAogICAgICAgeSA9ICJTZXBhbCBMZW5ndGgiLAogICAgICAgY2FwdGlvbiA9ICJTb3VyY2U6IElyaXMgZGF0YXNldCIpCmBgYAoKCmBgYHtyfQpnZ3Bsb3QoZGF0YSA9IGlyaXMsIGFlcyh4PSB2YXJpZXR5LCB5ID0gc2VwYWwubGVuZ3RoLCBmaWxsID0gdmFyaWV0eSkpICsgCiAgZ2VvbV92aW9saW4oKSArCiAgbGFicyh0aXRsZSA9ICJUaGlzIHBsb3QgaXMgY3JlYXRlZCB1c2luZyBnZ3Bsb3QiLAogICAgICAgeCA9ICJDbGFzcyIsCiAgICAgICB5ID0gIlNlcGFsIExlbmd0aCIsCiAgICAgICBjYXB0aW9uID0gIlNvdXJjZTogSXJpcyBkYXRhc2V0IikKYGBgCgpgYGB7cn0Kc3VtbWFyeShpcmlzKQpgYGAKCgoKYGBge3J9CmNvcihpcmlzJHNlcGFsLmxlbmd0aCwgaXJpcyRzZXBhbC53aWR0aCkKY29yKGlyaXMkc2VwYWwubGVuZ3RoLCBpcmlzJHNlcGFsLmxlbmd0aCkKYGBgCgoKCgpgYGB7cn0KY29yX21hdHJpeCA9IGNvcihpcmlzWyAsMTo0XSkKY29yX21hdHJpeApgYGAKCmBgYHtyfQpsaWJyYXJ5KGdnY29ycnBsb3QpCmdnY29ycnBsb3QoY29yX21hdHJpeCkKYGBgCgpgYGB7cn0KZ2djb3JycGxvdChjb3JfbWF0cml4LCB0eXBlID0gImxvd2VyIikKYGBgCgoKYGBge3J9CmdnY29ycnBsb3QoY29yX21hdHJpeCwgdHlwZSA9ICJ1cHBlciIpCmBgYAoKCmBgYHtyfQpnZ2NvcnJwbG90KGNvcl9tYXRyaXgsIAogICAgICAgICAgIHR5cGUgPSAibG93ZXIiLAogICAgICAgICAgIGNvbG9ycyA9IGMoInB1cnBsZSIsICJ3aGl0ZSIsICJyZWQiKSkKYGBgCgpgYGB7cn0KZ2djb3JycGxvdChjb3JfbWF0cml4LCAKICAgICAgICAgICB0eXBlID0gImxvd2VyIiwKICAgICAgICAgICBjb2xvcnMgPSBjKCJwdXJwbGUiLCAid2hpdGUiLCAicmVkIiksCiAgICAgICAgICAgbGFiID0gVFJVRSkKYGBgCgoKYGBge3J9CmxpYnJhcnkoR0dhbGx5KQpnZ3BhaXJzKGlyaXMsIGFlcyhjb2xvdXIgPSB2YXJpZXR5KSkKYGBgCgojIEludGVyYWN0aXZlIHBsb3RzCmBgYHtyfQpsaWJyYXJ5KHBsb3RseSkKCmZpZyA9IGlyaXMgJT4lCiAgcGxvdF9seSh5ID0gfnNlcGFsLmxlbmd0aCwgdHlwZSA9ICd2aW9saW4nKQogIApmaWcKYGBgCgpgYGB7cn0KCmxpYnJhcnkocGxvdGx5KQpwbG90X2x5KGlyaXMsIHkgPSB+c2VwYWwubGVuZ3RoLCB0eXBlID0gJ3Zpb2xpbicpCmBgYAoKCmBgYHtyfQpwbG90X2x5KGlyaXMsIHkgPSB+c2VwYWwubGVuZ3RoLCB0eXBlID0gJ2JveCcpCmBgYAoKYGBg
e3J9CnBsb3RfbHkoaXJpcywgeCA9IH5zZXBhbC5sZW5ndGgsIHR5cGUgPSAnaGlzdG9ncmFtJykKYGBgCgoKYGBge3J9CnBsb3RfbHkoaXJpcywgeCA9IH52YXJpZXR5LCB5ID0gfiBzZXBhbC5sZW5ndGgsIHR5cGUgPSAnYm94JykKYGBgCgoKIyBQcmluY2lwYWwgQ29tcG9uZW50IEFuYWx5c2lzIChQQ0EpCgpgYGB7cn0KIyBkaW1lbnNpb24gPSBheGlzID0gZmVhdHVyZXMKbGlicmFyeShzdGF0cykKCmlyaXNfcGNhID0gcHJjb21wKGlyaXNbICwgLTVdLCBzY2FsZSA9IFRSVUUsIGNlbnRlciA9IFRSVUUpCmlyaXNfcGNhCmBgYAoKYGBge3J9CnN1bW1hcnkoaXJpc19wY2EpCmBgYAoKYGBge3J9CnBjXzEyID0gZGF0YS5mcmFtZShpcmlzX3BjYSR4WyAsIDE6Ml0pCmhlYWQocGNfMTIpCgpwY18xMl9jbGFzcyA9IGNiaW5kKHBjXzEyLCB2YXJpZXR5ID0gaXJpcyR2YXJpZXR5KQpwY18xMl9jbGFzcwpgYGAKCmBgYHtyfQpnZ3Bsb3QocGNfMTJfY2xhc3MsIGFlcyhQQzEsIFBDMiwgY29sb3I9dmFyaWV0eSkpICsgCiAgZ2VvbV9wb2ludCgpICsgCiAgdGhlbWVfbWluaW1hbCgpCmBgYAoKYGBge3J9CmxpYnJhcnkoZmFjdG9leHRyYSkKZnZpel9laWcoaXJpc19wY2EsIGFkZGxhYmVscyA9IFRSVUUpCmBgYAoKCmBgYHtyfQpmdml6X3BjYV92YXIoaXJpc19wY2EsCiAgICAgICAgICAgICBjb2wudmFyID0gImNvbnRyaWIiKQpgYGAKCmBgYHtyfQpsaWJyYXJ5KCJjb3JycGxvdCIpCnZhciA9IGdldF9wY2FfdmFyKGlyaXNfcGNhKQpjb3JycGxvdCh2YXIkY29zMikKYGBgCgoKCmBgYHtyfQpmdml6X3BjYV9pbmQoaXJpc19wY2EsCiAgICAgICAgICAgICBnZW9tLmluZCA9ICJwb2ludCIsCiAgICAgICAgICAgICBjb2wuaW5kID0gaXJpcyR2YXJpZXR5LAogICAgICAgICAgICAgYWRkRWxsaXBzZXMgPSBUUlVFKQpgYGAKCiMgUmVncmVzc2lvbgoKYGBge3J9CgpsaWJyYXJ5KGRhdGFzZXRzKQpkYXRhKGlyaXMpCgpsbV9tb2RlbCA9IGxtKFNlcGFsLkxlbmd0aCB+IFBldGFsLkxlbmd0aCwgZGF0YSA9IGlyaXMpCnN1bW1hcnkobG1fbW9kZWwpCmBgYAoKYGBge3J9CiMgcCA8IDAuMDUKCnggPSBpcmlzJFBldGFsLkxlbmd0aAp5ID0gaXJpcyRTZXBhbC5MZW5ndGgKCnBsb3QoeCwgeSkKYGBgCgoKYGBge3J9CnByZWQgPSBwcmVkaWN0KGxtX21vZGVsKQppeCA9IHNvcnQoeCwgaW5kZXgucmV0dXJuID0gVCkkaXggIyBzb3J0IGFuZCByZXR1cm4gaW5kZXggbm90IHRoZSBhY3J0dWFsIHZhbHVlcwoKcGxvdCh4LCB5KQpsaW5lcyh4W2l4XSwgcHJlZFtpeF0pCgoKYGBgCgoKYGBge3J9CmdncGxvdChpcmlzLCBhZXMoeCA9IFBldGFsLkxlbmd0aCwgeT1TZXBhbC5MZW5ndGgpKSArIAogIGdlb21fcG9pbnQoKSArCiAgZ2VvbV9zbW9vdGgobWV0aG9kID0gImxtIiwgbGV2ZWwgPSAwLjk1KQpgYGAKCmBgYHtyfQpnZ3Bsb3QoaXJpcywgYWVzKHggPSBQZXRhbC5MZW5ndGgsIHk9U2VwYWwuTGVuZ3RoLCBjb2xvciA9IFNwZWNpZXMpKSArIAogIGdlb21f
cG9pbnQoKSArCiAgZ2VvbV9zbW9vdGgobWV0aG9kID0gImxtIiwgbGV2ZWwgPSAwLjk1KQpgYGAKCmBgYHtyfQpnZ3Bsb3QoaXJpcywgYWVzKHggPSBQZXRhbC5MZW5ndGgsIHk9U2VwYWwuTGVuZ3RoLCBjb2xvciA9IFNwZWNpZXMpKSArIAogIGdlb21fcG9pbnQoKSArCiAgZ2VvbV9zbW9vdGgobWV0aG9kID0gImxtIiwgZm9ybXVsYSA9IHl+cG9seSh4LCAyKSwgbGV2ZWwgPSAwLjk1KQpgYGAKCiMgQ2x1c3RlcmluZwpgYGB7cn0Ka21lYW5zX3Jlc3VsdCA9IGttZWFucyhpcmlzWyAsIDE6NF0sIGNlbnRlcnMgPSAzICkKa21lYW5zX3Jlc3VsdApgYGAKCgpgYGB7cn0KbGlicmFyeShjbHVzdGVyKQpjbHVzcGxvdChpcmlzLCBrbWVhbnNfcmVzdWx0JGNsdXN0ZXIpCmBgYAoKCiMgQ2xhc3NpZmljYXRpb24KYGBge3J9CmxpYnJhcnkobGF0dGljZSkKbGlicmFyeShlMTA3MSkKbGlicmFyeShjYXJldCkKCnRyYWluX2l4ID0gY3JlYXRlRGF0YVBhcnRpdGlvbihpcmlzJFNwZWNpZXMsIHAgPSAwLjgsIGxpc3QgPSBGQUxTRSkKdHJhaW5fZGF0YSA9IGlyaXNbdHJhaW5faXgsIF0KdGVzdF9kYXRhID0gaXJpc1stdHJhaW5faXgsIF0KCnRyYWluX2RhdGEKdGVzdF9kYXRhCgoKc3ZtX21vZGVsID0gc3ZtKFNwZWNpZXMgfiBTZXBhbC5MZW5ndGgrU2VwYWwuV2lkdGgrUGV0YWwuTGVuZ3RoLCBkYXRhID0gdHJhaW5fZGF0YSwga2VybmVsID0gImxpbmVhciIpCgpgYGAKCmBgYHtyfQp0ZXN0X2RhdGFbMjQsIF0KYGBgCgoKYGBge3J9CgpwcmVkaWN0KHN2bV9tb2RlbCwgbmV3ZGF0YSA9IHRlc3RfZGF0YVsyNCwgLTVdKQpgYGAKCgpgYGB7cn0KcHJlZGljdGlvbnMgPSBwcmVkaWN0KHN2bV9tb2RlbCwgbmV3ZGF0YSA9IHRlc3RfZGF0YSkKY29uZl9tYXggPSBjb25mdXNpb25NYXRyaXgocHJlZGljdGlvbnMsIHRlc3RfZGF0YSRTcGVjaWVzKQpjb25mX21heApgYGAKCmBgYHtyfQoKY20gPSBhcy5kYXRhLmZyYW1lKGNvbmZfbWF4JHRhYmxlKQoKZ2dwbG90KGNtLCBhZXMoUHJlZGljdGlvbiwgUmVmZXJlbmNlLCBmaWxsID0gRnJlcSkpICsgCiAgZ2VvbV90aWxlKCkgKwogIGdlb21fdGV4dChhZXMobGFiZWwgPSBGcmVxKSkgKyAKICBzY2FsZV9maWxsX2dyYWRpZW50KGxvdz0id2hpdGUiLCBoaWdoPSJza3libHVlIikKYGBgCgo=