ggplot
a) Scatter Plot
library(ggplot2)

# Treat Outcome as a discrete label so it can be matched to the manual colours.
diabetes$Outcome <- as.character(diabetes$Outcome)

# Scatter plot of Glucose against BMI, one point per row, coloured by Outcome.
# NOTE(review): the glucose axis is labelled in mmol/L -- confirm that this is
# the unit actually stored in the Glucose column.
ggplot(diabetes, aes(x = BMI, y = Glucose, color = Outcome)) +
  geom_point(size = 1.5) +
  labs(
    title = 'BMI Vs Glucose (Using ggplot)',
    x = 'BMI (Kg/m^2)',
    y = 'Glucose (mmol/L)',
    caption = 'Source: Iskulghar'
  ) +
  scale_color_manual(values = c("0" = "maroon", "1" = "royalblue")) +
  theme_minimal() +
  theme(
    legend.position = "top",
    # No trailing comma after the last argument, so that no empty argument is
    # passed to theme().
    text = element_text(colour = 'darkslategray', size = 13)
  )

The scatter plot shows the relationship between BMI and glucose level for
suspected diabetes patients. Each dot represents one patient and is
coloured according to the outcome: no diabetes (0) or diabetes (1).
b) Box Plot
# Outcome is used as a discrete grouping variable on the x axis.
diabetes$Outcome <- as.character(diabetes$Outcome)

# Columns to summarise, one box plot each.
boxplot_columns <- c(
  "Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
  "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"
)

# Draw one box plot per column, split by Outcome. The x axis carries the
# Outcome groups and the y axis carries the measurement, so the axis labels
# and the title name exactly those variables.
for (column in boxplot_columns) {
  outcome_boxplot <- ggplot(
    diabetes,
    # .data[[column]] looks the column up by its name held in `column`.
    aes(x = Outcome, y = .data[[column]], fill = Outcome)
  ) +
    geom_boxplot() +
    labs(
      title = paste("Boxplot of", column, "by Outcome"),
      x = "Outcome",
      y = column,
      caption = "Source: Iskulghar"
    ) +
    theme(
      legend.position = "top",
      text = element_text(colour = 'black', size = 14)
    )
  # Plots created inside a loop are not auto-printed, so print explicitly.
  print(outcome_boxplot)
}

Each box plot illustrates the distribution of one column of the
“diabetes” dataset, split by outcome. The box spans the interquartile
range, with a line inside it marking the median. The whiskers extend to
the smallest and largest values that lie within 1.5 times the interquartile
range of the box. Any point that falls outside the whiskers is considered
an outlier.
Interactive violin and box plots
# Loading the package once is enough for every plot below.
library(plotly)

# NOTE(review): SkinThickness is not plotted in either series below -- confirm
# that this omission is intentional.

# Interactive violin plots of each measurement, split by Outcome.
plot_ly(data = diabetes, x = ~Outcome, y = ~Pregnancies, type = 'violin')
plot_ly(data = diabetes, x = ~Outcome, y = ~Glucose, type = 'violin')
plot_ly(data = diabetes, x = ~Outcome, y = ~BloodPressure, type = 'violin')
plot_ly(data = diabetes, x = ~Outcome, y = ~Insulin, type = 'violin')
plot_ly(data = diabetes, x = ~Outcome, y = ~BMI, type = 'violin')
plot_ly(data = diabetes, x = ~Outcome, y = ~DiabetesPedigreeFunction, type = 'violin')
# Age (not a second BMI plot), so the violin series covers the same columns
# as the box series.
plot_ly(data = diabetes, x = ~Outcome, y = ~Age, type = 'violin')

# Interactive box plots of the same measurements, split by Outcome.
plot_ly(data = diabetes, x = ~Outcome, y = ~Pregnancies, type = 'box')
plot_ly(data = diabetes, x = ~Outcome, y = ~Glucose, type = 'box')
plot_ly(data = diabetes, x = ~Outcome, y = ~BloodPressure, type = 'box')
plot_ly(data = diabetes, x = ~Outcome, y = ~Insulin, type = 'box')
plot_ly(data = diabetes, x = ~Outcome, y = ~BMI, type = 'box')
plot_ly(data = diabetes, x = ~Outcome, y = ~DiabetesPedigreeFunction, type = 'box')
plot_ly(data = diabetes, x = ~Outcome, y = ~Age, type = 'box')
Correlation matrix
# Pairwise correlations between the first eight columns of `diabetes`
# (cor() defaults to the Pearson coefficient).
cor_matrix <- cor(diabetes[, seq_len(8)])
print(cor_matrix)
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction
Pregnancies 1.00000000 0.12945867 0.14128198 -0.08167177 -0.07353461 0.01768309 -0.03352267
Glucose 0.12945867 1.00000000 0.15258959 0.05732789 0.33135711 0.22107107 0.13733730
BloodPressure 0.14128198 0.15258959 1.00000000 0.20737054 0.08893338 0.28180529 0.04126495
SkinThickness -0.08167177 0.05732789 0.20737054 1.00000000 0.43678257 0.39257320 0.18392757
Insulin -0.07353461 0.33135711 0.08893338 0.43678257 1.00000000 0.19785906 0.18507093
BMI 0.01768309 0.22107107 0.28180529 0.39257320 0.19785906 1.00000000 0.14064695
DiabetesPedigreeFunction -0.03352267 0.13733730 0.04126495 0.18392757 0.18507093 0.14064695 1.00000000
Age 0.54434123 0.26351432 0.23952795 -0.11397026 -0.04216295 0.03624187 0.03356131
Age
Pregnancies 0.54434123
Glucose 0.26351432
BloodPressure 0.23952795
SkinThickness -0.11397026
Insulin -0.04216295
BMI 0.03624187
DiabetesPedigreeFunction 0.03356131
Age 1.00000000
Correlation matrix plot
library(ggcorrplot)

# Heatmap of the lower triangle of the correlation matrix, with each
# coefficient printed in its cell. The three colours are the low, middle and
# high ends of the colour scale.
ggcorrplot(
  corr = cor_matrix,
  type = "lower",
  lab = TRUE,
  colors = c("blue", "white", "maroon")
)

Pair plot
library(GGally)

# Matrix of pairwise plots for every column of `diabetes`, coloured by Outcome.
ggpairs(data = diabetes, mapping = aes(colour = Outcome))

PCA
library(stats)

# Principal component analysis on every column except the 9th (presumably
# Outcome -- confirm), with each variable centred and scaled to unit variance.
# The argument is spelled `scale.` in full instead of relying on partial
# matching of `scale`.
diabetes_pca <- prcomp(diabetes[, -9], center = TRUE, scale. = TRUE)
diabetes_pca
Standard deviations (1, .., p=8):
[1] 1.4471973 1.3157546 1.0147068 0.9356971 0.8731234 0.8262133 0.6479322 0.6359733
Rotation (n x k) = (8 x 8):
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
Pregnancies -0.1284321 0.5937858 -0.01308692 0.08069115 -0.4756057 0.193598168 -0.58879003 -0.117840984
Glucose -0.3930826 0.1740291 0.46792282 -0.40432871 0.4663280 0.094161756 -0.06015291 -0.450355256
BloodPressure -0.3600026 0.1838921 -0.53549442 0.05598649 0.3279531 -0.634115895 -0.19211793 0.011295538
SkinThickness -0.4398243 -0.3319653 -0.23767380 0.03797608 -0.4878621 0.009589438 0.28221253 -0.566283799
Insulin -0.4350262 -0.2507811 0.33670893 -0.34994376 -0.3469348 -0.270650609 -0.13200992 0.548621381
BMI -0.4519413 -0.1009598 -0.36186463 0.05364595 0.2532038 0.685372179 -0.03536644 0.341517637
DiabetesPedigreeFunction -0.2706114 -0.1220690 0.43318905 0.83368010 0.1198105 -0.085784088 -0.08609107 0.008258731
Age -0.1980271 0.6205885 0.07524755 0.07120060 -0.1092900 -0.033357170 0.71208542 0.211661979
# Scores of every observation on the first two principal components.
pca_12 <- data.frame(diabetes_pca$x[, seq_len(2)])
head(pca_12)

# Attach the Outcome label to the two score columns and display the result.
pca_12_out <- cbind(pca_12, Outcome = diabetes$Outcome)
pca_12_out
Scree plot (bar plot of the variance explained by each principal component)
library(factoextra)

# Scree plot of the PCA, with the value printed on top of each bar.
fviz_eig(
  diabetes_pca,
  addlabels = TRUE
)

Contribution plot (Circular plot)
# Variable plot of the PCA, with each variable coloured by its contribution.
fviz_pca_var(
  diabetes_pca,
  col.var = "contrib"
)

Contribution plot as heatmap
library(corrplot)

# Extract the per-variable PCA results and plot the cos2 matrix.
# NOTE(review): the section heading speaks of contributions, but the matrix
# plotted here is `cos2`; `var$contrib` holds the contributions -- confirm
# which one is intended.
var <- get_pca_var(diabetes_pca)
corrplot(var$cos2)

Cluster plot
# Plot the individuals on the principal components, one point each, coloured
# by Outcome with an ellipse per group. Outcome is passed as a factor because
# group colouring and ellipses are documented for a factor grouping variable;
# at this point in the script the column is a character vector.
fviz_pca_ind(
  diabetes_pca,
  geom.ind = "point",
  col.ind = factor(diabetes$Outcome),
  addEllipses = TRUE
)

SVM Model
library(lattice)
library(e1071)
library(caret)

# Encode the class label as a factor BEFORE splitting, so that the training
# and test subsets both carry a factor Outcome (converting after the split
# would leave the subsets unchanged).
diabetes$Outcome <- as.factor(diabetes$Outcome)

# Fix the random seed so that the random 80/20 split is reproducible.
set.seed(123)
train_ix <- createDataPartition(diabetes$Outcome, p = 0.8, list = FALSE)
train_data <- diabetes[train_ix, ]
test_data <- diabetes[-train_ix, ]
train_data
test_data

# Linear-kernel SVM predicting Outcome from the eight measurements.
svm_model <- svm(
  Outcome ~ Pregnancies + Glucose + BloodPressure + SkinThickness + Insulin +
    BMI + DiabetesPedigreeFunction + Age,
  data = train_data,
  kernel = "linear"
)

# Show one held-out row, then predict its class from all columns but the 9th.
test_data[2, ]
predict(svm_model, newdata = test_data[2, -9])
5
1
Levels: 0 1
Confusion matrix
# Predict the class of every held-out row.
predictions <- predict(svm_model, newdata = test_data)

# confusionMatrix() expects two factors with the same levels, so the reference
# labels are coerced onto the levels of the predictions.
# NOTE(review): by default the first factor level is reported as the
# 'Positive' class; pass `positive = "1"` if the metrics should describe the
# diabetes class instead.
confusion_mat <- confusionMatrix(
  predictions,
  factor(test_data$Outcome, levels = levels(predictions))
)
confusion_mat
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 87 21
1 13 32
Accuracy : 0.7778
95% CI : (0.7036, 0.8409)
No Information Rate : 0.6536
P-Value [Acc > NIR] : 0.000586
Kappa : 0.4912
Mcnemar's Test P-Value : 0.229949
Sensitivity : 0.8700
Specificity : 0.6038
Pos Pred Value : 0.8056
Neg Pred Value : 0.7111
Prevalence : 0.6536
Detection Rate : 0.5686
Detection Prevalence : 0.7059
Balanced Accuracy : 0.7369
'Positive' Class : 0
# Flatten the confusion table into a data frame with one row per cell.
com <- as.data.frame(confusion_mat$table)

# Tile plot of the confusion matrix: fill encodes the cell count and the count
# is also printed in each tile.
ggplot(data = com, mapping = aes(x = Prediction, y = Reference, fill = Freq)) +
  geom_tile() +
  geom_text(mapping = aes(label = Freq)) +
  scale_fill_gradient(low = "navy", high = "aquamarine")

US Admission dataset
# Read the admissions data from the working directory and display it.
us_ad <- read.csv(file = "US Admission.csv")
us_ad
Removal of Serial.No column
# Keep every column except the first one (the serial number) and display the
# result.
us_ad_new <- us_ad[, -1]
us_ad_new
Pair plot
library(GGally)

# Matrix of pairwise plots for the admissions data. Setting
# cardinality_threshold to NULL switches off the check on the number of levels
# in character / factor columns.
ggpairs(data = us_ad_new, cardinality_threshold = NULL)

Linear regression
# Chance of admission is the response in every plot, so it goes on the
# vertical axis; each predictor goes on the horizontal axis.
y <- us_ad_new$Chance.of.Admit

# Predictor columns (names) with their axis labels (values), and the point
# colour used for each plot, in the same order.
predictor_labels <- c(
  GRE.Score = "GRE Score",
  TOEFL.Score = "TOEFL Score",
  University.Rating = "University Rating",
  SOP = "SOP",
  LOR = "LOR",
  CGPA = "CGPA",
  Research = "Research"
)
point_colours <- c("blue", "red", "purple", "maroon", "navy", "brown", "black")

for (i in seq_along(predictor_labels)) {
  x <- us_ad_new[[names(predictor_labels)[i]]]
  plot(
    y ~ x,
    xlab = predictor_labels[[i]],
    ylab = "Chance of Admission",
    main = paste(
      "Linear Regression of", predictor_labels[[i]], "and Chance of Admission"
    ),
    pch = 20,
    col = point_colours[i]
  )
  # Fit the simple linear regression and draw its line over the scatter, so
  # the plot actually shows the regression named in its title.
  abline(lm(y ~ x), lwd = 2)
}

Polynomial regression
library(ggplot2)

# Variables plotted against GRE score, one plot each.
poly_targets <- c(
  "TOEFL.Score", "University.Rating", "SOP", "LOR", "CGPA", "Research"
)

# Scatter of each variable against GRE score with a second-degree polynomial
# fit and its 95% confidence band.
# The colour mapping has to live inside aes(): given as a bare argument to
# ggplot() it is silently ignored. It is attached to the points only, so the
# single fitted curve is computed over all rows and keeps its default colour.
for (target in poly_targets) {
  poly_plot <- ggplot(us_ad_new, aes(x = GRE.Score, y = .data[[target]])) +
    geom_point(aes(color = Chance.of.Admit)) +
    geom_smooth(method = "lm", formula = y ~ poly(x, 2), level = 0.95) +
    # Label the axis with the column name rather than the lookup expression.
    labs(y = target) +
    theme_minimal()
  # Plots created inside a loop are not auto-printed, so print explicitly.
  print(poly_plot)
}

Regression model
library(datasets)

# `us_ad_new` already exists in the workspace, so no data() call is made:
# data() only loads datasets shipped with packages and would just warn that
# the data set 'us_ad_new' was not found.

# Multiple linear regression of the chance of admission on the seven
# applicant attributes.
lm_model <- lm(
  Chance.of.Admit ~ GRE.Score + TOEFL.Score + University.Rating + SOP + LOR +
    CGPA + Research,
  data = us_ad_new
)
summary(lm_model)
Call:
lm(formula = Chance.of.Admit ~ GRE.Score + TOEFL.Score + University.Rating +
SOP + LOR + CGPA + Research, data = us_ad_new)
Residuals:
Min 1Q Median 3Q Max
-0.26259 -0.02103 0.01005 0.03628 0.15928
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.2594325 0.1247307 -10.097 < 2e-16 ***
GRE.Score 0.0017374 0.0005979 2.906 0.00387 **
TOEFL.Score 0.0029196 0.0010895 2.680 0.00768 **
University.Rating 0.0057167 0.0047704 1.198 0.23150
SOP -0.0033052 0.0055616 -0.594 0.55267
LOR 0.0223531 0.0055415 4.034 6.6e-05 ***
CGPA 0.1189395 0.0122194 9.734 < 2e-16 ***
Research 0.0245251 0.0079598 3.081 0.00221 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.06378 on 392 degrees of freedom
Multiple R-squared: 0.8035, Adjusted R-squared: 0.8
F-statistic: 228.9 on 7 and 392 DF, p-value: < 2.2e-16
