Susmita Barua

Student, University of Chittagong

The ‘diabetes’ dataset records several ‘features’ for each suspected patient, together with an outcome coded as either ‘No diabetes = 0’ or ‘Diabetes = 1’.

# NOTE(review): `diabetes` must already be loaded when this chunk runs; the
# read.csv() call that creates it appears further down in this document.
# Flag every column that holds numeric data. vapply() guarantees a named
# logical vector, whereas sapply() can change its return type.
nume_column <- vapply(diabetes, is.numeric, logical(1))
# Keep only the names whose flag is TRUE. names(nume_column) on its own lists
# every column, numeric or not.
nume_column_name <- names(nume_column)[nume_column]
nume_column_name
[1] "Pregnancies"              "Glucose"                  "BloodPressure"            "SkinThickness"           
[5] "Insulin"                  "BMI"                      "DiabetesPedigreeFunction" "Age"                     
[9] "Outcome"                 

There are a total of 9 numerical columns and 0 categorical columns in the ‘diabetes’ dataset. The target variable of the dataset is the ‘Outcome’ column.

# Load the diabetes table from the CSV file in the working directory and
# display it.
diabetes <- read.csv(file = "diabetes.csv")
print(diabetes)

Basic Plot

a) Scatter plot

# The factor level index of Outcome is used as the colour number of each point.
result_color <- as.numeric(factor(diabetes$Outcome))

# Bold axis labels, built once and handed to plot() below.
bmi_label <- substitute(paste(bold("BMI (kg/m^2)")))
glucose_label <- substitute(paste(bold("Glucose (mmol/L)")))

# Glucose (vertical axis) against BMI (horizontal axis), small filled dots.
plot(
  x = diabetes$BMI,
  y = diabetes$Glucose,
  pch = 20,
  col = result_color,
  xlab = bmi_label,
  ylab = glucose_label,
  main = "BMI Vs Glucose",
  cex.main = 1.5,
  col.main = "black"
)

The scatter plot shows the relationship between the BMI and the glucose level of suspected diabetes patients. Each dot is one patient, coloured by outcome: no diabetes (0) or diabetes (1).

b) Histogram Plot

# Frequency histogram of the Insulin column, drawn with pink bars.
hist(
  x = diabetes$Insulin,
  col = "pink",
  xlab = "Insulin Level (IU/mL)",
  main = "Histogram plot of Insulin"
)

The histogram is a graphical representation of the distribution of insulin levels in the ‘diabetes’ dataset. Here, the x-axis represents the insulin level ranges and the y-axis shows the number of individuals falling within each range.

c) Box Plot

# Single box-and-whisker plot of the Age column with bold axis labels.
age_axis_label <- substitute(paste(bold("Age")))
years_axis_label <- substitute(paste(bold("Years")))
boxplot(
  x = diabetes$Age,
  col = "aquamarine",
  main = "Box plot",
  xlab = age_axis_label,
  ylab = years_axis_label
)

The boxplot displays key statistics such as median, quartiles and potential outliers. This boxplot represents the age distribution of ‘diabetes’ dataset showing their maximum and minimum value within a certain range, the interquartile range with a line inside indicating the median age. The points beyond the maximum range are considered as outliers.

ggplot

a) Scatter Plot

library(ggplot2)

# Treat Outcome as a discrete label so it gets a discrete colour scale whose
# two colours are set by hand below.
diabetes$Outcome <- as.character(diabetes$Outcome)

# Glucose against BMI, one point per row, coloured by Outcome.
ggplot(diabetes, aes(x = BMI, y = Glucose, color = Outcome)) +
  geom_point(size = 1.5) +
  labs(
    title = "BMI Vs Glucose (Using ggplot)",
    x = "BMI (kg/m^2)", # "kg" spelled as in the base-graphics version
    y = "Glucose (mmol/L)",
    caption = "Source: Iskulghar"
  ) +
  scale_color_manual(values = c("0" = "maroon", "1" = "royalblue")) +
  theme_minimal() +
  # No trailing comma after the last argument: an empty argument is not
  # accepted by every ggplot2 version.
  theme(
    legend.position = "top",
    text = element_text(colour = "darkslategray", size = 13)
  )

The association between a suspected diabetes patient’s BMI and glucose level is shown in a scatter plot. Based on the two attributes, each dot represents the conclusion as either no diabetes (0) or diabetes (1).

b) Box Plot

# Outcome is used as a discrete grouping variable on the x-axis.
diabetes$Outcome <- as.character(diabetes$Outcome)

# Build one box plot of `feature`, split and filled by Outcome.
#   data    : data frame with an `Outcome` column and a column named `feature`.
#   feature : column name (string) plotted on the y-axis.
# Returns the ggplot object; the caller decides when to print it.
outcome_boxplot <- function(data, feature) {
  ggplot(data, aes(x = Outcome, y = .data[[feature]], fill = Outcome)) +
    geom_boxplot() +
    labs(
      title = paste0("Boxplot of ", feature, " by Outcome"),
      # Axis labels now describe what is actually mapped to each axis.
      x = "Outcome",
      y = feature,
      caption = "Source: Iskulghar"
    ) +
    theme(
      legend.position = "top",
      text = element_text(colour = "black", size = 14)
    )
}

# The eight feature columns, in the order they were plotted originally.
boxplot_features <- c(
  "Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
  "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"
)

# ggplot objects are not auto-printed inside a loop, hence the explicit print().
for (feature in boxplot_features) {
  print(outcome_boxplot(diabetes, feature))
}

Each boxplot shows, for one column of the ‘diabetes’ dataset split by outcome, the interquartile range as a box with a line marking the median, and whiskers extending to the minimum and maximum values within the whisker range. Any points that fall outside that range are considered outliers.

Interactive violin and box plots

# plotly only needs to be attached once for all the widgets below.
library(plotly)

# Interactive violin plots: one feature per widget, split by Outcome.
# The calls are kept as separate top-level expressions so that each widget is
# auto-printed.
plot_ly(data = diabetes, x = ~Outcome, y = ~Pregnancies, type = "violin")
plot_ly(data = diabetes, x = ~Outcome, y = ~Glucose, type = "violin")
plot_ly(data = diabetes, x = ~Outcome, y = ~BloodPressure, type = "violin")
plot_ly(data = diabetes, x = ~Outcome, y = ~Insulin, type = "violin")
plot_ly(data = diabetes, x = ~Outcome, y = ~BMI, type = "violin")
plot_ly(data = diabetes, x = ~Outcome, y = ~DiabetesPedigreeFunction, type = "violin")
# Was a second BMI violin; Age is the feature that closes the box-plot series
# below and was missing from this series.
plot_ly(data = diabetes, x = ~Outcome, y = ~Age, type = "violin")

# Interactive box plots for the same features.
plot_ly(data = diabetes, x = ~Outcome, y = ~Pregnancies, type = "box")
plot_ly(data = diabetes, x = ~Outcome, y = ~Glucose, type = "box")
plot_ly(data = diabetes, x = ~Outcome, y = ~BloodPressure, type = "box")
plot_ly(data = diabetes, x = ~Outcome, y = ~Insulin, type = "box")
plot_ly(data = diabetes, x = ~Outcome, y = ~BMI, type = "box")
plot_ly(data = diabetes, x = ~Outcome, y = ~DiabetesPedigreeFunction, type = "box")
plot_ly(data = diabetes, x = ~Outcome, y = ~Age, type = "box")

Correlation matrix

# Correlation matrix of the first eight columns of `diabetes`, printed in full.
feature_columns <- diabetes[, 1:8]
cor_matrix <- cor(x = feature_columns)
print(cor_matrix)
                         Pregnancies    Glucose BloodPressure SkinThickness     Insulin        BMI DiabetesPedigreeFunction
Pregnancies               1.00000000 0.12945867    0.14128198   -0.08167177 -0.07353461 0.01768309              -0.03352267
Glucose                   0.12945867 1.00000000    0.15258959    0.05732789  0.33135711 0.22107107               0.13733730
BloodPressure             0.14128198 0.15258959    1.00000000    0.20737054  0.08893338 0.28180529               0.04126495
SkinThickness            -0.08167177 0.05732789    0.20737054    1.00000000  0.43678257 0.39257320               0.18392757
Insulin                  -0.07353461 0.33135711    0.08893338    0.43678257  1.00000000 0.19785906               0.18507093
BMI                       0.01768309 0.22107107    0.28180529    0.39257320  0.19785906 1.00000000               0.14064695
DiabetesPedigreeFunction -0.03352267 0.13733730    0.04126495    0.18392757  0.18507093 0.14064695               1.00000000
Age                       0.54434123 0.26351432    0.23952795   -0.11397026 -0.04216295 0.03624187               0.03356131
                                 Age
Pregnancies               0.54434123
Glucose                   0.26351432
BloodPressure             0.23952795
SkinThickness            -0.11397026
Insulin                  -0.04216295
BMI                       0.03624187
DiabetesPedigreeFunction  0.03356131
Age                       1.00000000

Correlation matrix plot

library(ggcorrplot)
# Lower-triangle plot of the correlation matrix with the coefficient printed
# in each cell; the colour vector is passed in blue, white, maroon order.
ggcorrplot(
  corr = cor_matrix,
  type = "lower",
  lab = TRUE,
  colors = c("blue", "white", "maroon")
)

Pair plot

library(GGally)
# Pairwise plot matrix of every column of `diabetes`, coloured by Outcome.
ggpairs(data = diabetes, mapping = aes(colour = Outcome))

PCA

library(stats)

# Principal component analysis of the feature columns only: the Outcome label
# is dropped by name rather than by position.
pca_features <- diabetes[, setdiff(names(diabetes), "Outcome")]
# The argument is spelled `scale.` in prcomp(); `scale = TRUE` only worked
# through partial argument matching.
diabetes_pca <- prcomp(pca_features, center = TRUE, scale. = TRUE)
diabetes_pca
Standard deviations (1, .., p=8):
[1] 1.4471973 1.3157546 1.0147068 0.9356971 0.8731234 0.8262133 0.6479322 0.6359733

Rotation (n x k) = (8 x 8):
                                PC1        PC2         PC3         PC4        PC5          PC6         PC7          PC8
Pregnancies              -0.1284321  0.5937858 -0.01308692  0.08069115 -0.4756057  0.193598168 -0.58879003 -0.117840984
Glucose                  -0.3930826  0.1740291  0.46792282 -0.40432871  0.4663280  0.094161756 -0.06015291 -0.450355256
BloodPressure            -0.3600026  0.1838921 -0.53549442  0.05598649  0.3279531 -0.634115895 -0.19211793  0.011295538
SkinThickness            -0.4398243 -0.3319653 -0.23767380  0.03797608 -0.4878621  0.009589438  0.28221253 -0.566283799
Insulin                  -0.4350262 -0.2507811  0.33670893 -0.34994376 -0.3469348 -0.270650609 -0.13200992  0.548621381
BMI                      -0.4519413 -0.1009598 -0.36186463  0.05364595  0.2532038  0.685372179 -0.03536644  0.341517637
DiabetesPedigreeFunction -0.2706114 -0.1220690  0.43318905  0.83368010  0.1198105 -0.085784088 -0.08609107  0.008258731
Age                      -0.1980271  0.6205885  0.07524755  0.07120060 -0.1092900 -0.033357170  0.71208542  0.211661979
# Scores of every observation on the first two principal components.
pca_scores <- diabetes_pca$x
pca_12 <- data.frame(pca_scores[, 1:2])
head(pca_12)
# Same two score columns with the Outcome label attached as a third column.
pca_12_out <- cbind(pca_12, Outcome = diabetes$Outcome)
pca_12_out

Bar plot of PCAs

library(factoextra)
# Scree plot of the PCA result with a value label on each bar.
fviz_eig(X = diabetes_pca, addlabels = TRUE)

Contribution plot (Circular plot)

# Variable plot of the PCA result, with variables coloured by "contrib".
fviz_pca_var(X = diabetes_pca, col.var = "contrib")

Contribution plot as heatmap

library("corrplot")
# Extract the per-variable PCA results and plot their `cos2` component.
var <- get_pca_var(diabetes_pca)
corrplot(var[["cos2"]])

Cluster plot

# Individuals plot of the PCA result: points only, coloured by Outcome, with
# an ellipse added per group.
fviz_pca_ind(
  X = diabetes_pca,
  geom.ind = "point",
  col.ind = diabetes$Outcome,
  addEllipses = TRUE
)

SVM Model

library(lattice)
library(e1071)
library(caret)

# Fix the RNG seed so the random 80/20 split, and the model fitted on it, is
# the same on every run.
set.seed(123)

# Convert the label to a factor BEFORE splitting. Converting afterwards only
# changes `diabetes`; `train_data` and `test_data` would keep the old column
# type, and the confusion-matrix step compares factor predictions with
# `test_data$Outcome`.
diabetes$Outcome <- as.factor(diabetes$Outcome)

# Row indices of the training partition (80% of the rows).
train_ix <- createDataPartition(diabetes$Outcome, p = 0.8, list = FALSE)
train_data <- diabetes[train_ix, ]
test_data <- diabetes[-train_ix, ]

train_data
test_data

# Linear-kernel SVM predicting Outcome from the eight feature columns.
svm_model <- svm(
  Outcome ~ Pregnancies + Glucose + BloodPressure + SkinThickness +
    Insulin + BMI + DiabetesPedigreeFunction + Age,
  data = train_data,
  kernel = "linear"
)

# Show the second held-out row, then predict its class with the ninth column
# (Outcome) removed.
test_data[2, ]
predict(svm_model, newdata = test_data[2, -9])
5 
1 
Levels: 0 1

Confusion matrix

# Predict every held-out row and cross-tabulate the predictions against the
# recorded Outcome.
predictions <- predict(object = svm_model, newdata = test_data)
confusion_mat <- confusionMatrix(
  data = predictions,
  reference = test_data$Outcome
)
confusion_mat
Confusion Matrix and Statistics

          Reference
Prediction  0  1
         0 87 21
         1 13 32
                                          
               Accuracy : 0.7778          
                 95% CI : (0.7036, 0.8409)
    No Information Rate : 0.6536          
    P-Value [Acc > NIR] : 0.000586        
                                          
                  Kappa : 0.4912          
                                          
 Mcnemar's Test P-Value : 0.229949        
                                          
            Sensitivity : 0.8700          
            Specificity : 0.6038          
         Pos Pred Value : 0.8056          
         Neg Pred Value : 0.7111          
             Prevalence : 0.6536          
         Detection Rate : 0.5686          
   Detection Prevalence : 0.7059          
      Balanced Accuracy : 0.7369          
                                          
       'Positive' Class : 0               
                                          
# Long-format copy of the confusion table: one row per Prediction/Reference
# pair with its count in `Freq`.
com <- as.data.frame(confusion_mat$table)

# Heatmap of the counts, with the count written on each tile.
ggplot(data = com, mapping = aes(x = Prediction, y = Reference, fill = Freq)) +
  geom_tile() +
  geom_text(mapping = aes(label = Freq)) +
  scale_fill_gradient(low = "navy", high = "aquamarine")

US Admission dataset

# Load the admissions table from the CSV file in the working directory and
# display it.
us_ad <- read.csv(file = "US Admission.csv")
print(us_ad)

Removal of Serial.No column

# Copy of the admissions table without its first column.
us_ad_new <- us_ad[, -1]
print(us_ad_new)

Pair plot

library(GGally)
# Pairwise plot matrix of every remaining admissions column;
# cardinality_threshold is passed as NULL.
ggpairs(data = us_ad_new, cardinality_threshold = NULL)

Linear regression

# Scatter plot of Chance.of.Admit against one predictor, with the fitted
# simple linear regression line drawn on top.
#   data    : data frame holding `Chance.of.Admit` and the column `feature`.
#   feature : column name (string) of the predictor.
#   label   : human-readable predictor name used in the axis label and title.
#   colour  : point colour.
# Returns the fitted lm object invisibly.
plot_admit_regression <- function(data, feature, label, colour) {
  # The chance of admission is the response, so it goes on the vertical axis.
  model_formula <- reformulate(feature, response = "Chance.of.Admit")
  plot(
    model_formula,
    data = data,
    xlab = label,
    ylab = "Chance of Admission",
    main = paste("Linear Regression of", label, "and Chance of Admission"),
    pch = 20,
    col = colour
  )
  # The plots are titled "Linear Regression", so fit and draw the line.
  fit <- lm(model_formula, data = data)
  abline(fit, lwd = 2)
  invisible(fit)
}

# Predictor columns with their display labels and point colours, in the
# original plotting order.
admit_features <- c(
  "GRE.Score", "TOEFL.Score", "University.Rating", "SOP",
  "LOR", "CGPA", "Research"
)
admit_labels <- c(
  "GRE Score", "TOEFL Score", "University Rating", "SOP",
  "LOR", "CGPA", "Research"
)
admit_colours <- c(
  "blue", "red", "purple", "maroon", "navy", "brown", "black"
)

for (i in seq_along(admit_features)) {
  plot_admit_regression(
    us_ad_new,
    feature = admit_features[i],
    label = admit_labels[i],
    colour = admit_colours[i]
  )
}

Polynomial regression

library(ggplot2)
# Scatter plot of one column against GRE.Score with a degree-2 polynomial
# least-squares fit and its 95% confidence band.
#   data    : data frame with `GRE.Score`, `Chance.of.Admit` and `feature`.
#   feature : column name (string) plotted on the y-axis.
# Returns the ggplot object; the caller decides when to print it.
plot_gre_quadratic <- function(data, feature) {
  ggplot(data, aes(x = GRE.Score, y = .data[[feature]])) +
    # The colour mapping has to live inside aes(). Passed as a plain argument
    # to ggplot() it was silently ignored. It is mapped on the points only, so
    # the smoother still fits a single curve.
    geom_point(aes(colour = Chance.of.Admit)) +
    geom_smooth(method = "lm", formula = y ~ poly(x, 2), level = 0.95) +
    # Keep the column name as the axis title instead of ".data[[feature]]".
    labs(y = feature) +
    theme_minimal()
}

# Columns plotted against GRE.Score, in the original order.
gre_partner_features <- c(
  "TOEFL.Score", "University.Rating", "SOP", "LOR", "CGPA", "Research"
)

# ggplot objects are not auto-printed inside a loop, hence the explicit print().
for (feature in gre_partner_features) {
  print(plot_gre_quadratic(us_ad_new, feature))
}

Regression model

library(datasets)
# `us_ad_new` is a data frame created earlier in this document, not a packaged
# data set, so it needs no data() call; that call only raised
# "data set 'us_ad_new' not found".
# Multiple linear regression of the chance of admission on the seven
# predictor columns.
lm_model <- lm(
  Chance.of.Admit ~ GRE.Score + TOEFL.Score + University.Rating +
    SOP + LOR + CGPA + Research,
  data = us_ad_new
)
summary(lm_model)

Call:
lm(formula = Chance.of.Admit ~ GRE.Score + TOEFL.Score + University.Rating + 
    SOP + LOR + CGPA + Research, data = us_ad_new)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.26259 -0.02103  0.01005  0.03628  0.15928 

Coefficients:
                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)       -1.2594325  0.1247307 -10.097  < 2e-16 ***
GRE.Score          0.0017374  0.0005979   2.906  0.00387 ** 
TOEFL.Score        0.0029196  0.0010895   2.680  0.00768 ** 
University.Rating  0.0057167  0.0047704   1.198  0.23150    
SOP               -0.0033052  0.0055616  -0.594  0.55267    
LOR                0.0223531  0.0055415   4.034  6.6e-05 ***
CGPA               0.1189395  0.0122194   9.734  < 2e-16 ***
Research           0.0245251  0.0079598   3.081  0.00221 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.06378 on 392 degrees of freedom
Multiple R-squared:  0.8035,    Adjusted R-squared:    0.8 
F-statistic: 228.9 on 7 and 392 DF,  p-value: < 2.2e-16
