Export a Data Set Related to Diabetes Risk Factors in Various Groups

#Part 1

#1.Dataset

# Read the diabetes data from "diabetes.csv" (resolved against the working
# directory) into a data frame and display it.
diabetes <- read.csv(file = "diabetes.csv")
diabetes

Data Exploration

# Compact overview of the data frame: dimensions, column names and types.
str(object = diabetes)

## 'data.frame':    768 obs. of  9 variables:
##  $ Pregnancies             : int  6 1 8 1 0 5 3 10 2 8 ...
##  $ Glucose                 : int  148 85 183 89 137 116 78 115 197 125 ...
##  $ BloodPressure           : int  72 66 64 66 40 74 50 0 70 96 ...
##  $ SkinThickness           : int  35 29 0 23 35 0 32 0 45 0 ...
##  $ Insulin                 : int  0 0 0 94 168 0 88 0 543 0 ...
##  $ BMI                     : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ DiabetesPedigreeFunction: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : int  50 31 32 21 33 30 26 29 53 54 ...
##  $ Outcome                 : int  1 0 1 0 1 0 1 0 1 1 ...

# Report the (first) class of every column as a named character vector.
# vapply() pins the result type; sapply() silently switches to a list when a
# column carries more than one class.
vapply(diabetes, function(column) class(column)[1L], character(1))

##              Pregnancies                  Glucose            BloodPressure 
##                "integer"                "integer"                "integer" 
##            SkinThickness                  Insulin                      BMI 
##                "integer"                "integer"                "numeric" 
## DiabetesPedigreeFunction                      Age                  Outcome 
##                "numeric"                "integer"                "integer"

#Target variable

# Check the class of the target column before converting it.
class(diabetes[["Outcome"]])

## [1] "integer"

#Convert target column as character

# Replace the integer target column with its character representation,
# then display the updated data frame.
diabetes[["Outcome"]] <- as.character(diabetes[["Outcome"]])
diabetes

Basic Plot

a.Scatter plot

library(ggplot2)

# Glucose against Age, one point per row, coloured by Outcome.
ggplot(data = diabetes, mapping = aes(Age, Glucose, colour = Outcome)) +
  geom_point() +
  labs(title = "visualization of age and Glucose ") +
  theme(legend.position = "top")

# Insulin against Age, coloured by Outcome.
ggplot(data = diabetes, mapping = aes(Age, Insulin, colour = Outcome)) +
  geom_point() +
  labs(title = "visualization of age and insulin ") +
  theme(legend.position = "top")

# Blood pressure against Age, coloured by Outcome.
ggplot(data = diabetes, mapping = aes(Age, BloodPressure, colour = Outcome)) +
  geom_point() +
  labs(title = "visualization of age and Blood pressure ") +
  theme(legend.position = "top")

#Boxplot

# Box plots of a measurement across age groups, split by Outcome.
#
# Age is an integer column. Mapping it straight to x makes geom_boxplot()
# draw a single box per Outcome (placed at one arbitrary age) instead of
# showing how the measurement changes with age. Age is therefore binned into
# 10-wide groups with cut_width() and the bins form a discrete x axis.
#
# y_var: name of the `diabetes` column to summarise (character scalar).
# title: plot title (character scalar).
# Returns a ggplot object; calling it at top level prints the plot.
plot_box_by_age <- function(y_var, title) {
  ggplot(
    diabetes,
    aes(
      x = cut_width(Age, width = 10, boundary = 20),
      y = .data[[y_var]],
      color = Outcome,
      fill = Outcome
    )
  ) +
    # Semi-transparent fill keeps the median line visible, since outline and
    # fill share the same colour.
    geom_boxplot(alpha = 0.5) +
    labs(title = title, x = "Age group", y = y_var) +
    theme(legend.position = "top")
}

plot_box_by_age("Pregnancies", "visualization of age and Pregnancies")

plot_box_by_age("SkinThickness", "visualization of age and SkinThickness")

plot_box_by_age("BMI", "visualization of age and BMI")

plot_box_by_age("Glucose", "visualization of age and Glucose")

plot_box_by_age("BloodPressure", "visualization of age and BloodPressure")

plot_box_by_age("Insulin", "visualization of age and Insulin")

#Violin Plot

# Violin plot of one feature for each Outcome value.
#
# The titles are built in one place so the spelling is consistent
# ("Pregnancies", "diabetes").
#
# y_var: name of the `diabetes` column to plot (character scalar).
# label: human-readable feature name used in the title; defaults to y_var.
# Returns a ggplot object; calling it at top level prints the plot.
plot_violin_by_outcome <- function(y_var, label = y_var) {
  ggplot(diabetes, aes(x = Outcome, y = .data[[y_var]], fill = Outcome)) +
    geom_violin() +
    labs(
      title = paste("visualization of", label, "in diabetes outcome"),
      x = "Outcome",
      y = y_var
    ) +
    theme(legend.position = "top")
}

plot_violin_by_outcome("Pregnancies")

plot_violin_by_outcome("BloodPressure", "Blood pressure")

plot_violin_by_outcome("BMI")

plot_violin_by_outcome("SkinThickness", "Skin thickness")

plot_violin_by_outcome("Age")

plot_violin_by_outcome("Insulin")

plot_violin_by_outcome("DiabetesPedigreeFunction", "Diabetes Pedigree Function")

Part 2

#Interactive plot a.Interactive violin plot of 2-3 features

Ages by Outcome #violin plot

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

# Interactive violin plot of Age for each Outcome value.
plot_ly(diabetes, x = ~Outcome, y = ~Age, type = "violin")

Glucose level by Outcome

library(plotly)
# Interactive violin plot of Glucose for each Outcome value.
plot_ly(diabetes, x = ~Outcome, y = ~Glucose, type = "violin")

Insulin secretion by outcome

library(plotly)
# Interactive violin plot of Insulin for each Outcome value.
plot_ly(diabetes, x = ~Outcome, y = ~Insulin, type = "violin")

b.Interactive boxplot Pregnancies by outcome

library(plotly)
# Interactive box plot of Pregnancies for each Outcome value.
plot_ly(diabetes, x = ~Outcome, y = ~Pregnancies, type = "box")

BMI rate by outcome

library(plotly)
# Interactive box plot of BMI for each Outcome value.
plot_ly(diabetes, x = ~Outcome, y = ~BMI, type = "box")

Blood pressure in diabetic patient by outcome

library(plotly)
# Interactive box plot of BloodPressure for each Outcome value.
plot_ly(diabetes, x = ~Outcome, y = ~BloodPressure, type = "box")

c.Calculate Correlation matrix

# Correlation matrix of the first eight columns (every column except
# Outcome, which is the ninth), stored and then displayed.
cor_mat <- cor(x = diabetes[, seq_len(8L)])
cor_mat

##                          Pregnancies    Glucose BloodPressure SkinThickness
## Pregnancies               1.00000000 0.12945867    0.14128198   -0.08167177
## Glucose                   0.12945867 1.00000000    0.15258959    0.05732789
## BloodPressure             0.14128198 0.15258959    1.00000000    0.20737054
## SkinThickness            -0.08167177 0.05732789    0.20737054    1.00000000
## Insulin                  -0.07353461 0.33135711    0.08893338    0.43678257
## BMI                       0.01768309 0.22107107    0.28180529    0.39257320
## DiabetesPedigreeFunction -0.03352267 0.13733730    0.04126495    0.18392757
## Age                       0.54434123 0.26351432    0.23952795   -0.11397026
##                              Insulin        BMI DiabetesPedigreeFunction
## Pregnancies              -0.07353461 0.01768309              -0.03352267
## Glucose                   0.33135711 0.22107107               0.13733730
## BloodPressure             0.08893338 0.28180529               0.04126495
## SkinThickness             0.43678257 0.39257320               0.18392757
## Insulin                   1.00000000 0.19785906               0.18507093
## BMI                       0.19785906 1.00000000               0.14064695
## DiabetesPedigreeFunction  0.18507093 0.14064695               1.00000000
## Age                      -0.04216295 0.03624187               0.03356131
##                                  Age
## Pregnancies               0.54434123
## Glucose                   0.26351432
## BloodPressure             0.23952795
## SkinThickness            -0.11397026
## Insulin                  -0.04216295
## BMI                       0.03624187
## DiabetesPedigreeFunction  0.03356131
## Age                       1.00000000

The correlation matrix showed that the number of pregnancies increased with age, with the highest link being found between age and the number of pregnancies. Conversely, weak-negative correlations were discovered between a number of factors, suggesting less pronounced or trustworthy links between those variables, and age does not appear to be significantly connected with diabetes pedigree function.

d.Plot Correlation matrix (lower triangle) with values

e.Pair plot of all feature

library(GGally)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

# Pair plot of every column against every other, coloured by Outcome.
ggpairs(data = diabetes, mapping = aes(colour = Outcome))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

f.Principal component analysis(PCA)

library(stats)
# Principal component analysis of the eight feature columns, centred and
# scaled to unit variance. The scaling argument of prcomp() is named
# `scale.`; writing `scale = TRUE` only works through partial argument
# matching, so the full name is used here.
Diabetes_pca <- prcomp(diabetes[, 1:8], center = TRUE, scale. = TRUE)
Diabetes_pca

## Standard deviations (1, .., p=8):
## [1] 1.4471973 1.3157546 1.0147068 0.9356971 0.8731234 0.8262133 0.6479322
## [8] 0.6359733
## 
## Rotation (n x k) = (8 x 8):
##                                 PC1        PC2         PC3         PC4
## Pregnancies              -0.1284321  0.5937858 -0.01308692  0.08069115
## Glucose                  -0.3930826  0.1740291  0.46792282 -0.40432871
## BloodPressure            -0.3600026  0.1838921 -0.53549442  0.05598649
## SkinThickness            -0.4398243 -0.3319653 -0.23767380  0.03797608
## Insulin                  -0.4350262 -0.2507811  0.33670893 -0.34994376
## BMI                      -0.4519413 -0.1009598 -0.36186463  0.05364595
## DiabetesPedigreeFunction -0.2706114 -0.1220690  0.43318905  0.83368010
## Age                      -0.1980271  0.6205885  0.07524755  0.07120060
##                                 PC5          PC6         PC7          PC8
## Pregnancies              -0.4756057  0.193598168 -0.58879003 -0.117840984
## Glucose                   0.4663280  0.094161756 -0.06015291 -0.450355256
## BloodPressure             0.3279531 -0.634115895 -0.19211793  0.011295538
## SkinThickness            -0.4878621  0.009589438  0.28221253 -0.566283799
## Insulin                  -0.3469348 -0.270650609 -0.13200992  0.548621381
## BMI                       0.2532038  0.685372179 -0.03536644  0.341517637
## DiabetesPedigreeFunction  0.1198105 -0.085784088 -0.08609107  0.008258731
## Age                      -0.1092900 -0.033357170  0.71208542  0.211661979

# Standard deviation and (cumulative) proportion of variance per component.
summary(object = Diabetes_pca)

## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5     PC6     PC7
## Standard deviation     1.4472 1.3158 1.0147 0.9357 0.87312 0.82621 0.64793
## Proportion of Variance 0.2618 0.2164 0.1287 0.1094 0.09529 0.08533 0.05248
## Cumulative Proportion  0.2618 0.4782 0.6069 0.7163 0.81164 0.89697 0.94944
##                            PC8
## Standard deviation     0.63597
## Proportion of Variance 0.05056
## Cumulative Proportion  1.00000

Interpreting the proportion of variance means assessing which components carry the most information and how much of the variability in the original data each one captures. Components 1 and 2 together explain about 48% of the variance, indicating that they extract a substantial share of the information in the original variables. Components 3, 4, and so forth, in order of precedence, explain diminishing quantities of variation.

Bar plot of PCAs

library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

# Scree plot of the PCA result.
fviz_eig(X = Diabetes_pca)

The diabetes dataset indicates that the first three components account for the majority of the variance in the data since it displays a dramatic fall in explained variance up until the third component, after which it declines more gently.

ii.Contribution plot of PCAs (Circular plot)

# Variable plot of the PCA, with each variable coloured by its contribution
# on a blue -> green -> red gradient.
fviz_pca_var(
  Diabetes_pca,
  gradient.cols = c("blue", "green", "red"),
  col.var = "contrib"
)

Cluster plot after PCA

# Individuals plot of the PCA: observations drawn as points, coloured by
# Outcome, with an ellipse added per group.
fviz_pca_ind(
  Diabetes_pca,
  col.ind = diabetes$Outcome,
  geom.ind = "point",
  addEllipses = TRUE
)

Support Vector Machine(SVM)

library(e1071)
library(caret)

## Loading required package: lattice

library(lattice)
# Split the data 80/20 into training and test sets with caret's
# createDataPartition(), which samples on the Outcome column.
# The seed is fixed because the partition is random: without it the split,
# the fitted model and every reported metric change on each run.
set.seed(123)
train_idx <- createDataPartition(diabetes$Outcome, p = 0.80, list = FALSE)
train_data <- diabetes[train_idx, ]
test_data <- diabetes[-train_idx, ]
test_data

# Fit a linear-kernel SVM on the training rows, using Outcome (as a factor)
# as the response and the eight remaining columns as predictors, then
# predict the class of every test row and display the predictions.
svm_model <- svm(
  as.factor(Outcome) ~ Pregnancies + Glucose + BloodPressure + SkinThickness +
    Insulin + BMI + DiabetesPedigreeFunction + Age,
  data = train_data,
  kernel = "linear"
)
pred <- predict(svm_model, newdata = test_data)
pred

##   3   5   8  15  25  30  33  39  46  50  52  60  63  64  69  76  77  79  81  94 
##   1   1   1   1   1   0   0   0   1   0   0   0   0   0   0   0   0   1   0   0 
##  95 101 105 115 120 122 127 132 137 138 144 146 151 154 164 165 172 178 213 216 
##   0   1   0   1   0   0   0   1   0   0   0   0   0   1   0   0   1   1   1   1 
## 218 223 227 235 236 250 251 254 257 260 277 281 288 295 298 308 309 311 313 316 
##   0   1   0   0   1   0   0   0   0   1   0   1   0   0   0   0   0   0   0   0 
## 317 323 333 337 347 349 350 351 352 359 370 371 373 377 380 386 391 397 402 404 
##   0   0   1   1   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0 
## 409 410 412 416 426 430 436 441 452 453 455 460 465 466 470 481 487 489 497 501 
##   1   1   0   1   1   0   1   1   0   0   0   1   0   0   1   1   0   0   0   0 
## 502 507 510 512 523 525 529 532 539 549 551 552 559 560 573 578 579 584 596 602 
##   0   1   0   0   0   0   0   0   0   1   0   0   1   0   0   0   0   0   1   0 
## 604 605 609 612 613 623 634 636 642 648 651 653 658 660 662 666 667 671 672 675 
##   1   1   0   1   1   1   0   0   0   1   0   0   0   0   1   0   1   1   0   0 
## 676 693 695 696 697 701 704 705 722 728 730 752 762 
##   1   0   0   0   1   0   1   0   0   0   0   0   1 
## Levels: 0 1

# Compare predictions with the true test labels.
# confusionMatrix() takes the first factor level ("0") as the positive class
# by default, which would report sensitivity/specificity for the
# non-diabetic class. The diabetic class "1" is set as positive explicitly.
conf_mat <- confusionMatrix(
  data = pred,
  reference = as.factor(test_data$Outcome),
  positive = "1"
)
conf_mat

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 88 17
##          1 12 36
##                                           
##                Accuracy : 0.8105          
##                  95% CI : (0.7393, 0.8692)
##     No Information Rate : 0.6536          
##     P-Value [Acc > NIR] : 1.46e-05        
##                                           
##                   Kappa : 0.5719          
##                                           
##  Mcnemar's Test P-Value : 0.4576          
##                                           
##             Sensitivity : 0.8800          
##             Specificity : 0.6792          
##          Pos Pred Value : 0.8381          
##          Neg Pred Value : 0.7500          
##              Prevalence : 0.6536          
##          Detection Rate : 0.5752          
##    Detection Prevalence : 0.6863          
##       Balanced Accuracy : 0.7796          
##                                           
##        'Positive' Class : 0               
##

# Convert the confusion table to a data frame and draw it as a heat map
# (white -> sky blue by count) with the count printed in every tile.
cm <- as.data.frame(conf_mat$table)

ggplot(data = cm, mapping = aes(x = Prediction, y = Reference, fill = Freq)) +
  geom_tile() +
  geom_text(mapping = aes(label = Freq)) +
  scale_fill_gradient(high = "skyblue", low = "white")

US Admission data analysis About the US Admission dataset

# Read the admission data from "US Admission.csv" (resolved against the
# working directory) into a data frame and display it.
Admission <- read.csv(file = "US Admission.csv")
Admission

# Compact overview of the data frame: dimensions, column names and types.
str(object = Admission)

## 'data.frame':    400 obs. of  9 variables:
##  $ Serial.No.       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ GRE.Score        : int  337 324 316 322 314 330 321 308 302 323 ...
##  $ TOEFL.Score      : int  118 107 104 110 103 115 109 101 102 108 ...
##  $ University.Rating: int  4 4 3 3 2 5 3 2 1 3 ...
##  $ SOP              : num  4.5 4 3 3.5 2 4.5 3 3 2 3.5 ...
##  $ LOR              : num  4.5 4.5 3.5 2.5 3 3 4 4 1.5 3 ...
##  $ CGPA             : num  9.65 8.87 8 8.67 8.21 9.34 8.2 7.9 8 8.6 ...
##  $ Research         : int  1 1 1 1 0 1 1 0 0 0 ...
##  $ Chance.of.Admit  : num  0.92 0.76 0.72 0.8 0.65 0.9 0.75 0.68 0.5 0.45 ...

Data Analysis

Remove the “Serial No” column from the dataset

# Drop the serial-number column by name and assign the result back: a bare
# `Admission[, -1]` only prints a copy and leaves Admission unchanged.
# Selecting by name also makes the step safe to re-run.
Admission <- Admission[, names(Admission) != "Serial.No.", drop = FALSE]
Admission

# Store the Research column as character.
Admission$Research <- as.character(Admission$Research)
Admission

b.Pair plot of all features

library(GGally)
# Serial.No. has already been removed above, so the whole data frame is
# plotted; points and densities are coloured by Research.
ggpairs(Admission, aes(colour = Research))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Linear regression Regression plot for chance of admission and GRE Score

# GRE.Score (y) against Chance.of.Admit (x), points coloured by Research,
# overlaid with a purple linear fit and its 95% confidence band.
ggplot(
  data = Admission,
  mapping = aes(Chance.of.Admit, GRE.Score, colour = Research)
) +
  geom_point() +
  geom_smooth(method = "lm", colour = "purple", level = 0.95, se = TRUE)

## `geom_smooth()` using formula = 'y ~ x'

Regression plot for chance of admission and TOEFL Score

# TOEFL.Score (y) against Chance.of.Admit (x), points coloured by Research,
# overlaid with a black linear fit and its 95% confidence band.
ggplot(
  data = Admission,
  mapping = aes(Chance.of.Admit, TOEFL.Score, colour = Research)
) +
  geom_point() +
  geom_smooth(method = "lm", colour = "black", level = 0.95, se = TRUE)

## `geom_smooth()` using formula = 'y ~ x'

Regression plot for chance of admission and University.Rating

# University.Rating (y) against Chance.of.Admit (x), points coloured by
# Research, overlaid with a red linear fit and its 95% confidence band.
ggplot(
  data = Admission,
  mapping = aes(Chance.of.Admit, University.Rating, colour = Research)
) +
  geom_point() +
  geom_smooth(method = "lm", colour = "red", level = 0.95, se = TRUE)

## `geom_smooth()` using formula = 'y ~ x'

Regression plot for chance of admission and SOP

# SOP (y) against Chance.of.Admit (x), points coloured by Research,
# overlaid with a green linear fit and its 95% confidence band.
ggplot(
  data = Admission,
  mapping = aes(Chance.of.Admit, SOP, colour = Research)
) +
  geom_point() +
  geom_smooth(method = "lm", colour = "green", level = 0.95, se = TRUE)

## `geom_smooth()` using formula = 'y ~ x'

Regression plot for chance of admission and LOR

# LOR (y) against Chance.of.Admit (x), points coloured by Research,
# overlaid with a yellow linear fit and its 95% confidence band.
ggplot(
  data = Admission,
  mapping = aes(Chance.of.Admit, LOR, colour = Research)
) +
  geom_point() +
  geom_smooth(method = "lm", colour = "yellow", level = 0.95, se = TRUE)

## `geom_smooth()` using formula = 'y ~ x'

Regression plot for chance of admission and CGPA

# CGPA (y) against Chance.of.Admit (x), points coloured by Research,
# overlaid with a blue linear fit and its 95% confidence band.
ggplot(
  data = Admission,
  mapping = aes(Chance.of.Admit, CGPA, colour = Research)
) +
  geom_point() +
  geom_smooth(method = "lm", colour = "blue", level = 0.95, se = TRUE)

## `geom_smooth()` using formula = 'y ~ x'

Regression plot for chance of admission and Research

# Research (y, also used for the point colour) against Chance.of.Admit (x),
# overlaid with a blue linear fit and its 95% confidence band.
ggplot(
  data = Admission,
  mapping = aes(Chance.of.Admit, Research, colour = Research)
) +
  geom_point() +
  geom_smooth(method = "lm", colour = "blue", level = 0.95, se = TRUE)

## `geom_smooth()` using formula = 'y ~ x'

In the plot above, the dependent variable (Research Experience) is binary, meaning it can only have two values. Because a linear regression model presupposes a continuous response variable, it might not be suitable for this set of data. By calculating the likelihood of each value, a logistic regression model, on the other hand, can handle binary outcomes. A logistic function is used in logistic regression, a kind of generalised linear model, to connect the predictor and response variables.

A noteworthy trend emerged from the study using linear regression plots: every component looked at in connection to the likelihood of being admitted to university showed a positive association. This indicates that the possibility of being admitted to the university tends to increase as the values of these elements do.

d.Polynomial regression polynomial regression plot for chance of admission and GRE Score

# GRE.Score (y) against Chance.of.Admit (x), points coloured by Research,
# overlaid with a purple second-degree polynomial fit and its 95% band.
ggplot(
  data = Admission,
  mapping = aes(Chance.of.Admit, GRE.Score, colour = Research)
) +
  geom_point() +
  geom_smooth(
    method = "lm",
    formula = y ~ poly(x, 2),
    colour = "purple",
    level = 0.95,
    se = TRUE
  )

polynomial regression plot for chance of admission and TOEFL Score

# TOEFL.Score (y) against Chance.of.Admit (x), points coloured by Research,
# overlaid with a red second-degree polynomial fit and its 95% band.
ggplot(
  data = Admission,
  mapping = aes(Chance.of.Admit, TOEFL.Score, colour = Research)
) +
  geom_point() +
  geom_smooth(
    method = "lm",
    formula = y ~ poly(x, 2),
    colour = "red",
    level = 0.95,
    se = TRUE
  )

polynomial regression plot for chance of admission and University.Rating

# University.Rating (y) against Chance.of.Admit (x), points coloured by
# Research, overlaid with a blue second-degree polynomial fit and 95% band.
ggplot(
  data = Admission,
  mapping = aes(Chance.of.Admit, University.Rating, colour = Research)
) +
  geom_point() +
  geom_smooth(
    method = "lm",
    formula = y ~ poly(x, 2),
    colour = "blue",
    level = 0.95,
    se = TRUE
  )

polynomial regression plot for chance of admission and SOP

# SOP (y) against Chance.of.Admit (x), points coloured by Research,
# overlaid with a red second-degree polynomial fit and its 95% band.
ggplot(
  data = Admission,
  mapping = aes(Chance.of.Admit, SOP, colour = Research)
) +
  geom_point() +
  geom_smooth(
    method = "lm",
    formula = y ~ poly(x, 2),
    colour = "red",
    level = 0.95,
    se = TRUE
  )

polynomial regression plot for chance of admission and LOR

# LOR (y) against Chance.of.Admit (x), points coloured by Research,
# overlaid with an orange second-degree polynomial fit and its 95% band.
ggplot(
  data = Admission,
  mapping = aes(Chance.of.Admit, LOR, colour = Research)
) +
  geom_point() +
  geom_smooth(
    method = "lm",
    formula = y ~ poly(x, 2),
    colour = "orange",
    level = 0.95,
    se = TRUE
  )

polynomial regression plot for chance of admission and CGPA

# CGPA (y) against Chance.of.Admit (x), points coloured by Research,
# overlaid with a black second-degree polynomial fit and its 95% band.
ggplot(
  data = Admission,
  mapping = aes(Chance.of.Admit, CGPA, colour = Research)
) +
  geom_point() +
  geom_smooth(
    method = "lm",
    formula = y ~ poly(x, 2),
    colour = "black",
    level = 0.95,
    se = TRUE
  )

polynomial regression plot for chance of admission and Research

# Research (y, also used for the point colour) against Chance.of.Admit (x),
# overlaid with a purple second-degree polynomial fit and its 95% band.
ggplot(
  data = Admission,
  mapping = aes(Chance.of.Admit, Research, colour = Research)
) +
  geom_point() +
  geom_smooth(
    method = "lm",
    formula = y ~ poly(x, 2),
    colour = "purple",
    level = 0.95,
    se = TRUE
  )

Strong reference letters, a high cumulative grade point average (CGPA), research experience, and scores on the GRE and TOEFL are all associated with better odds of being admitted to US colleges, according to the regression analysis. These results emphasise how important it is to get admission through academic achievement, language competency, excellent recommendations, steady grades, and research involvement.

Conclusion

The study looked at a sample of people’s diabetes outcomes and the relationships between different factors. The findings showed that whereas diabetes status had no effect on age or pregnancy, there was a significant correlation between these variables. Obesity may be a risk factor for diabetes, as evidenced by the positive correlation between BMI and triceps skin fold thickness. The 2-hour point revealed a skew in insulin levels, with some people having significantly higher readings than others. One severe outlier that departed from the typical range in the oral glucose tolerance test results called for additional research. The study underlined the need for additional research on the outlier and the possible mechanisms underlying the observed relationships, while also offering insightful information about the variables influencing diabetes outcomes.

Data Science Project

Basic Plot

Part 2