Export a Data Set Related to Diabetes Risk Factors on Various Groups
#Part 1
#1.Dataset
# Read the diabetes data from the CSV file in the working directory.
diabetes <- read.csv(file = "diabetes.csv")
# Echo the whole data frame so the loaded values can be inspected.
diabetes
Data Exploration
# Compact structural overview: dimensions, column names and column types.
str(object = diabetes)
## 'data.frame': 768 obs. of 9 variables:
## $ Pregnancies : int 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : int 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : int 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : int 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : int 0 0 0 94 168 0 88 0 543 0 ...
## $ BMI : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : int 50 31 32 21 33 30 26 29 53 54 ...
## $ Outcome : int 1 0 1 0 1 0 1 0 1 1 ...
# Report the class of every column of the data frame.
sapply(X = diabetes, FUN = class)
## Pregnancies Glucose BloodPressure
## "integer" "integer" "integer"
## SkinThickness Insulin BMI
## "integer" "integer" "numeric"
## DiabetesPedigreeFunction Age Outcome
## "numeric" "integer" "integer"
#Target variable
# Class of the target column before any conversion.
class(diabetes[["Outcome"]])
## [1] "integer"
#Convert target column as character
# Store Outcome as text so plotting code treats it as a discrete label
# rather than a number.
diabetes[["Outcome"]] <- as.character(diabetes[["Outcome"]])
diabetes
a.Scatter plot
# Scatter plots of three measurements against Age, coloured by Outcome.
library(ggplot2)

# Age vs. Glucose
ggplot(data = diabetes, mapping = aes(x = Age, y = Glucose, color = Outcome)) +
  geom_point() +
  labs(title = "visualization of age and Glucose ") +
  theme(legend.position = "top")

# Age vs. Insulin
ggplot(data = diabetes, mapping = aes(x = Age, y = Insulin, color = Outcome)) +
  geom_point() +
  labs(title = "visualization of age and insulin ") +
  theme(legend.position = "top")

# Age vs. BloodPressure
ggplot(
  data = diabetes,
  mapping = aes(x = Age, y = BloodPressure, color = Outcome)
) +
  geom_point() +
  labs(title = "visualization of age and Blood pressure ") +
  theme(legend.position = "top")
#Boxplot
# Box plots of each measurement by age band, split by Outcome.
#
# A box plot needs a discrete grouping on the x axis. Mapping the continuous
# Age column directly to x (with no age grouping) yields a single box per
# Outcome placed in the middle of the age range, so no age relationship is
# visible. Age is therefore cut into 10-year bands with cut_width(); within
# each band the boxes are dodged by Outcome.
#
# alpha < 1 keeps the median line visible, because the outline colour and the
# fill colour are both mapped to Outcome.
ggplot(
  diabetes,
  aes(
    x = cut_width(Age, width = 10, boundary = 0),
    y = Pregnancies,
    color = Outcome,
    fill = Outcome
  )
) +
  geom_boxplot(alpha = 0.5) +
  labs(
    title = "visualization of age and Pregnancies ",
    x = "Age (10-year bands)"
  ) +
  theme(legend.position = "top")

ggplot(
  diabetes,
  aes(
    x = cut_width(Age, width = 10, boundary = 0),
    y = SkinThickness,
    color = Outcome,
    fill = Outcome
  )
) +
  geom_boxplot(alpha = 0.5) +
  labs(
    title = "visualization of age and SkinThickness ",
    x = "Age (10-year bands)"
  ) +
  theme(legend.position = "top")

ggplot(
  diabetes,
  aes(
    x = cut_width(Age, width = 10, boundary = 0),
    y = BMI,
    color = Outcome,
    fill = Outcome
  )
) +
  geom_boxplot(alpha = 0.5) +
  labs(
    title = "visualization of age and BMI ",
    x = "Age (10-year bands)"
  ) +
  theme(legend.position = "top")

ggplot(
  diabetes,
  aes(
    x = cut_width(Age, width = 10, boundary = 0),
    y = Glucose,
    color = Outcome,
    fill = Outcome
  )
) +
  geom_boxplot(alpha = 0.5) +
  labs(
    title = "visualization of age and Glucose",
    x = "Age (10-year bands)"
  ) +
  theme(legend.position = "top")

ggplot(
  diabetes,
  aes(
    x = cut_width(Age, width = 10, boundary = 0),
    y = BloodPressure,
    color = Outcome,
    fill = Outcome
  )
) +
  geom_boxplot(alpha = 0.5) +
  labs(
    title = "visualization of age and BloodPressure ",
    x = "Age (10-year bands)"
  ) +
  theme(legend.position = "top")

ggplot(
  diabetes,
  aes(
    x = cut_width(Age, width = 10, boundary = 0),
    y = Insulin,
    color = Outcome,
    fill = Outcome
  )
) +
  geom_boxplot(alpha = 0.5) +
  labs(
    title = "visualization of age and Insulin",
    x = "Age (10-year bands)"
  ) +
  theme(legend.position = "top")
#Violin Plot
# Violin plots: distribution of each measurement within each Outcome class.
# One plot per measurement; the fill colour repeats the Outcome grouping.
ggplot(
  data = diabetes,
  mapping = aes(x = Outcome, y = Pregnancies, fill = Outcome)
) +
  geom_violin() +
  labs(title = "visualization of Pregnecies in dibateses outcome") +
  theme(legend.position = "top")

ggplot(
  data = diabetes,
  mapping = aes(x = Outcome, y = BloodPressure, fill = Outcome)
) +
  geom_violin() +
  labs(title = "visualization of Blood pressure in dibateses outcome") +
  theme(legend.position = "top")

ggplot(
  data = diabetes,
  mapping = aes(x = Outcome, y = BMI, fill = Outcome)
) +
  geom_violin() +
  labs(title = "visualization of BMI in dibateses outcome") +
  theme(legend.position = "top")

ggplot(
  data = diabetes,
  mapping = aes(x = Outcome, y = SkinThickness, fill = Outcome)
) +
  geom_violin() +
  labs(title = "visualization of Skin thickness in dibateses outcome") +
  theme(legend.position = "top")

ggplot(
  data = diabetes,
  mapping = aes(x = Outcome, y = Age, fill = Outcome)
) +
  geom_violin() +
  labs(title = "visualization of Age in dibateses outcome") +
  theme(legend.position = "top")

ggplot(
  data = diabetes,
  mapping = aes(x = Outcome, y = Insulin, fill = Outcome)
) +
  geom_violin() +
  labs(title = "visualization of Insulin in dibateses outcome") +
  theme(legend.position = "top")

ggplot(
  data = diabetes,
  mapping = aes(x = Outcome, y = DiabetesPedigreeFunction, fill = Outcome)
) +
  geom_violin() +
  labs(
    title = "visualization of Diabetes Pedigree Function in dibateses outcome"
  ) +
  theme(legend.position = "top")
#Interactive plot a.Interactive violin plot of 2-3 features
Ages by Outcome #violin plot
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Interactive violin plot of Age for each Outcome class.
plot_ly(
  data = diabetes,
  x = ~Outcome,
  y = ~Age,
  type = "violin"
)
Glucose consumption rate by Outcome
library(plotly)
# Interactive violin plot of Glucose for each Outcome class.
plot_ly(
  data = diabetes,
  x = ~Outcome,
  y = ~Glucose,
  type = "violin"
)
Insulin secretion by outcome
library(plotly)
# Interactive violin plot of Insulin for each Outcome class.
plot_ly(
  data = diabetes,
  x = ~Outcome,
  y = ~Insulin,
  type = "violin"
)
b.Interactive boxplot Pregnancies by outcome
library(plotly)
# Interactive box plot of Pregnancies for each Outcome class.
plot_ly(
  data = diabetes,
  x = ~Outcome,
  y = ~Pregnancies,
  type = "box"
)
BMI rate by outcome
library(plotly)
# Interactive box plot of BMI for each Outcome class.
plot_ly(
  data = diabetes,
  x = ~Outcome,
  y = ~BMI,
  type = "box"
)
Blood pressure in diabetic patient by outcome
library(plotly)
# Interactive box plot of BloodPressure for each Outcome class.
plot_ly(
  data = diabetes,
  x = ~Outcome,
  y = ~BloodPressure,
  type = "box"
)
c.Calculate Correlation matrix
# Pairwise correlations between the first eight columns (the predictors);
# the ninth column, Outcome, is left out.
cor_mat <- cor(x = diabetes[, 1:8])
cor_mat
## Pregnancies Glucose BloodPressure SkinThickness
## Pregnancies 1.00000000 0.12945867 0.14128198 -0.08167177
## Glucose 0.12945867 1.00000000 0.15258959 0.05732789
## BloodPressure 0.14128198 0.15258959 1.00000000 0.20737054
## SkinThickness -0.08167177 0.05732789 0.20737054 1.00000000
## Insulin -0.07353461 0.33135711 0.08893338 0.43678257
## BMI 0.01768309 0.22107107 0.28180529 0.39257320
## DiabetesPedigreeFunction -0.03352267 0.13733730 0.04126495 0.18392757
## Age 0.54434123 0.26351432 0.23952795 -0.11397026
## Insulin BMI DiabetesPedigreeFunction
## Pregnancies -0.07353461 0.01768309 -0.03352267
## Glucose 0.33135711 0.22107107 0.13733730
## BloodPressure 0.08893338 0.28180529 0.04126495
## SkinThickness 0.43678257 0.39257320 0.18392757
## Insulin 1.00000000 0.19785906 0.18507093
## BMI 0.19785906 1.00000000 0.14064695
## DiabetesPedigreeFunction 0.18507093 0.14064695 1.00000000
## Age -0.04216295 0.03624187 0.03356131
## Age
## Pregnancies 0.54434123
## Glucose 0.26351432
## BloodPressure 0.23952795
## SkinThickness -0.11397026
## Insulin -0.04216295
## BMI 0.03624187
## DiabetesPedigreeFunction 0.03356131
## Age 1.00000000
The correlation matrix showed that the number of pregnancies increased with age, with the highest link being found between age and the number of pregnancies. Conversely, weak-negative correlations were discovered between a number of factors, suggesting less pronounced or trustworthy links between those variables, and age does not appear to be significantly connected with diabetes pedigree function.
d.Plot Correlation matrix (lower triangle) with values
e.Pair plot of all feature
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Pair plot of every column of the data set, coloured by Outcome.
ggpairs(
  data = diabetes,
  mapping = aes(colour = Outcome)
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
f.Principal component analysis(PCA)
library(stats)
# Principal component analysis of the eight predictors (columns 1-8).
# The variables are centred and scaled to unit variance first, so that
# measurements recorded on larger numeric scales do not dominate the
# components.
# The argument is spelled `scale.` (with the trailing dot), which is the
# actual name in prcomp(); `scale` only worked through partial argument
# matching.
Diabetes_pca <- prcomp(diabetes[, 1:8], center = TRUE, scale. = TRUE)
Diabetes_pca
## Standard deviations (1, .., p=8):
## [1] 1.4471973 1.3157546 1.0147068 0.9356971 0.8731234 0.8262133 0.6479322
## [8] 0.6359733
##
## Rotation (n x k) = (8 x 8):
## PC1 PC2 PC3 PC4
## Pregnancies -0.1284321 0.5937858 -0.01308692 0.08069115
## Glucose -0.3930826 0.1740291 0.46792282 -0.40432871
## BloodPressure -0.3600026 0.1838921 -0.53549442 0.05598649
## SkinThickness -0.4398243 -0.3319653 -0.23767380 0.03797608
## Insulin -0.4350262 -0.2507811 0.33670893 -0.34994376
## BMI -0.4519413 -0.1009598 -0.36186463 0.05364595
## DiabetesPedigreeFunction -0.2706114 -0.1220690 0.43318905 0.83368010
## Age -0.1980271 0.6205885 0.07524755 0.07120060
## PC5 PC6 PC7 PC8
## Pregnancies -0.4756057 0.193598168 -0.58879003 -0.117840984
## Glucose 0.4663280 0.094161756 -0.06015291 -0.450355256
## BloodPressure 0.3279531 -0.634115895 -0.19211793 0.011295538
## SkinThickness -0.4878621 0.009589438 0.28221253 -0.566283799
## Insulin -0.3469348 -0.270650609 -0.13200992 0.548621381
## BMI 0.2532038 0.685372179 -0.03536644 0.341517637
## DiabetesPedigreeFunction 0.1198105 -0.085784088 -0.08609107 0.008258731
## Age -0.1092900 -0.033357170 0.71208542 0.211661979
# Standard deviation and (cumulative) proportion of variance per component.
summary(object = Diabetes_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.4472 1.3158 1.0147 0.9357 0.87312 0.82621 0.64793
## Proportion of Variance 0.2618 0.2164 0.1287 0.1094 0.09529 0.08533 0.05248
## Cumulative Proportion 0.2618 0.4782 0.6069 0.7163 0.81164 0.89697 0.94944
## PC8
## Standard deviation 0.63597
## Proportion of Variance 0.05056
## Cumulative Proportion 1.00000
Evaluating which components have the most information and how much of the variability in the original data is captured are necessary steps in interpreting the proportion of variance. About 48% of the variance can be explained by components 1 and 2, indicating that they are able to extract some information from the original variables. Components 3, 4, and so forth, in order of precedence, explain diminishing quantities of variation.
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Scree plot of the fitted PCA: explained variance for each component.
fviz_eig(X = Diabetes_pca)
The diabetes dataset indicates that the first three components account
for the majority of the variance in the data since it displays a
dramatic fall in explained variance up until the third component, after
which it declines more gently.
ii.Contribution plot of PCAs (Circular plot)
# Variable plot of the PCA; arrows are coloured by each variable's
# contribution along a blue -> green -> red gradient.
fviz_pca_var(
  Diabetes_pca,
  col.var = "contrib",
  gradient.cols = c("blue", "green", "red")
)

# Individuals plot of the PCA, drawn as points, coloured by Outcome and
# with an ellipse added around each group.
fviz_pca_ind(
  Diabetes_pca,
  geom.ind = "point",
  col.ind = diabetes$Outcome,
  addEllipses = TRUE
)
library(e1071)
library(caret)
## Loading required package: lattice
library(lattice)
# Fix the RNG seed so the random train/test split, and therefore the fitted
# model and every metric derived from it, is reproducible between runs.
# NOTE: the prediction and confusion-matrix output recorded further down in
# this document came from an earlier, unseeded run and will differ after a
# re-run.
set.seed(123)

# 80/20 train/test split; createDataPartition() samples within each Outcome
# class so both subsets keep a similar class balance.
train_idx <- createDataPartition(diabetes$Outcome, p = 0.80, list = FALSE)
train_data <- diabetes[train_idx, ]
test_data <- diabetes[-train_idx, ]
test_data

# Linear-kernel SVM classifier of Outcome on all eight predictors.
# Outcome is stored as character, so it is converted to a factor in the
# formula to request classification.
svm_model <- svm(
  as.factor(Outcome) ~ Pregnancies + Glucose + BloodPressure +
    SkinThickness + Insulin + BMI + DiabetesPedigreeFunction + Age,
  data = train_data,
  kernel = "linear"
)

# Predicted class for every held-out row.
pred <- predict(svm_model, test_data)
pred
## 3 5 8 15 25 30 33 39 46 50 52 60 63 64 69 76 77 79 81 94
## 1 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0
## 95 101 105 115 120 122 127 132 137 138 144 146 151 154 164 165 172 178 213 216
## 0 1 0 1 0 0 0 1 0 0 0 0 0 1 0 0 1 1 1 1
## 218 223 227 235 236 250 251 254 257 260 277 281 288 295 298 308 309 311 313 316
## 0 1 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0
## 317 323 333 337 347 349 350 351 352 359 370 371 373 377 380 386 391 397 402 404
## 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## 409 410 412 416 426 430 436 441 452 453 455 460 465 466 470 481 487 489 497 501
## 1 1 0 1 1 0 1 1 0 0 0 1 0 0 1 1 0 0 0 0
## 502 507 510 512 523 525 529 532 539 549 551 552 559 560 573 578 579 584 596 602
## 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0
## 604 605 609 612 613 623 634 636 642 648 651 653 658 660 662 666 667 671 672 675
## 1 1 0 1 1 1 0 0 0 1 0 0 0 0 1 0 1 1 0 0
## 676 693 695 696 697 701 704 705 722 728 730 752 762
## 1 0 0 0 1 0 1 0 0 0 0 0 1
## Levels: 0 1
# Compare the test-set predictions with the true labels.
# `positive = "1"` makes the diabetic class the event of interest, so that
# Sensitivity, Specificity and the predictive values describe how well
# diabetes is detected. Without it caret uses the first factor level ("0").
# The class-dependent statistics echoed below are recomputed from the same
# 2x2 table for positive class "1"; the table and the overall statistics are
# unchanged.
conf_mat <- confusionMatrix(
  data = pred,
  reference = as.factor(test_data$Outcome),
  positive = "1"
)
conf_mat
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 88 17
## 1 12 36
##
## Accuracy : 0.8105
## 95% CI : (0.7393, 0.8692)
## No Information Rate : 0.6536
## P-Value [Acc > NIR] : 1.46e-05
##
## Kappa : 0.5719
##
## Mcnemar's Test P-Value : 0.4576
##
## Sensitivity : 0.6792
## Specificity : 0.8800
## Pos Pred Value : 0.7500
## Neg Pred Value : 0.8381
## Prevalence : 0.3464
## Detection Rate : 0.2353
## Detection Prevalence : 0.3137
## Balanced Accuracy : 0.7796
##
## 'Positive' Class : 1
##
# Heat map of the confusion matrix: one tile per Prediction/Reference pair,
# shaded and labelled with its count.
cm <- as.data.frame(conf_mat$table)
ggplot(data = cm, mapping = aes(x = Prediction, y = Reference, fill = Freq)) +
  geom_tile() +
  geom_text(mapping = aes(label = Freq)) +
  scale_fill_gradient(low = "white", high = "skyblue")
US Admission data analysis About the US Admission dataset
# Read the US admission data from the CSV file in the working directory,
# echo it, and show its structure.
Admission <- read.csv(file = "US Admission.csv")
Admission
str(object = Admission)
## 'data.frame': 400 obs. of 9 variables:
## $ Serial.No. : int 1 2 3 4 5 6 7 8 9 10 ...
## $ GRE.Score : int 337 324 316 322 314 330 321 308 302 323 ...
## $ TOEFL.Score : int 118 107 104 110 103 115 109 101 102 108 ...
## $ University.Rating: int 4 4 3 3 2 5 3 2 1 3 ...
## $ SOP : num 4.5 4 3 3.5 2 4.5 3 3 2 3.5 ...
## $ LOR : num 4.5 4.5 3.5 2.5 3 3 4 4 1.5 3 ...
## $ CGPA : num 9.65 8.87 8 8.67 8.21 9.34 8.2 7.9 8 8.6 ...
## $ Research : int 1 1 1 1 0 1 1 0 0 0 ...
## $ Chance.of.Admit : num 0.92 0.76 0.72 0.8 0.65 0.9 0.75 0.68 0.5 0.45 ...
Data Analysis
# Show the data without its first column (Serial.No.). The result is only
# printed; Admission itself still contains that column.
Admission[, -1]
# Store Research as text so plotting code treats it as a discrete label.
Admission[["Research"]] <- as.character(Admission[["Research"]])
Admission
b.Pair plot of all features
library(GGally)
# Pair plot of every column except the first (Serial.No.), coloured by
# Research.
ggpairs(
  data = Admission[, -1],
  mapping = aes(colour = Research)
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Chance.of.Admit (x) against GRE.Score (y): points coloured by Research,
# with a purple linear-model smooth and its 95% confidence band.
ggplot(
  data = Admission,
  mapping = aes(x = Chance.of.Admit, y = GRE.Score, color = Research)
) +
  geom_point() +
  geom_smooth(method = "lm", color = "purple", se = TRUE, level = 0.95)
## `geom_smooth()` using formula = 'y ~ x'
Regression plot for chance of admission and TOEFL Score
# Chance.of.Admit (x) against TOEFL.Score (y): points coloured by Research,
# with a black linear-model smooth and its 95% confidence band.
ggplot(
  data = Admission,
  mapping = aes(x = Chance.of.Admit, y = TOEFL.Score, color = Research)
) +
  geom_point() +
  geom_smooth(method = "lm", color = "black", se = TRUE, level = 0.95)
## `geom_smooth()` using formula = 'y ~ x'
Regression plot for chance of admission and University.Rating
# Chance.of.Admit (x) against University.Rating (y): points coloured by
# Research, with a red linear-model smooth and its 95% confidence band.
ggplot(
  data = Admission,
  mapping = aes(x = Chance.of.Admit, y = University.Rating, color = Research)
) +
  geom_point() +
  geom_smooth(method = "lm", color = "red", se = TRUE, level = 0.95)
## `geom_smooth()` using formula = 'y ~ x'
Regression plot for chance of admission and SOP
# Chance.of.Admit (x) against SOP (y): points coloured by Research, with a
# green linear-model smooth and its 95% confidence band.
ggplot(
  data = Admission,
  mapping = aes(x = Chance.of.Admit, y = SOP, color = Research)
) +
  geom_point() +
  geom_smooth(method = "lm", color = "green", se = TRUE, level = 0.95)
## `geom_smooth()` using formula = 'y ~ x'
Regression plot for chance of admission and LOR
# Chance.of.Admit (x) against LOR (y): points coloured by Research, with a
# yellow linear-model smooth and its 95% confidence band.
ggplot(
  data = Admission,
  mapping = aes(x = Chance.of.Admit, y = LOR, color = Research)
) +
  geom_point() +
  geom_smooth(method = "lm", color = "yellow", se = TRUE, level = 0.95)
## `geom_smooth()` using formula = 'y ~ x'
Regression plot for chance of admission and CGPA
# Chance.of.Admit (x) against CGPA (y): points coloured by Research, with a
# blue linear-model smooth and its 95% confidence band.
ggplot(
  data = Admission,
  mapping = aes(x = Chance.of.Admit, y = CGPA, color = Research)
) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue", se = TRUE, level = 0.95)
## `geom_smooth()` using formula = 'y ~ x'
Regression plot for chance of admission and Research
# Chance.of.Admit (x) against Research (y): points coloured by Research,
# with a blue linear-model smooth requested on top. Research is a two-valued
# label here, so the y axis is discrete.
ggplot(
  data = Admission,
  mapping = aes(x = Chance.of.Admit, y = Research, color = Research)
) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue", se = TRUE, level = 0.95)
## `geom_smooth()` using formula = 'y ~ x'
In the plot above, the dependent variable (Research Experience) is
binary, meaning it can only have two values. Because a linear regression
model presupposes a continuous response variable, it might not be
suitable for this set of data. By calculating the likelihood of each
value, a logistic regression model, on the other hand, can handle binary
outcomes. A logistic function is used in logistic regression, a kind of
generalised linear model, to connect the predictor and response
variables.
A noteworthy trend emerged from the study using linear regression plots: every component looked at in connection to the likelihood of being admitted to university showed a positive association. This indicates that the possibility of being admitted to the university tends to increase as the values of these elements do.
d.Polynomial regression polynomial regression plot for chance of admission and GRE Score
# Chance.of.Admit (x) against GRE.Score (y): points coloured by Research,
# with a purple second-degree polynomial fit and its 95% confidence band.
ggplot(
  data = Admission,
  mapping = aes(x = Chance.of.Admit, y = GRE.Score, color = Research)
) +
  geom_point() +
  geom_smooth(
    method = "lm",
    formula = y ~ poly(x, 2),
    color = "purple",
    se = TRUE,
    level = 0.95
  )
polynomial regression plot for chance of admission and TOEFL Score
# Chance.of.Admit (x) against TOEFL.Score (y): points coloured by Research,
# with a red second-degree polynomial fit and its 95% confidence band.
ggplot(
  data = Admission,
  mapping = aes(x = Chance.of.Admit, y = TOEFL.Score, color = Research)
) +
  geom_point() +
  geom_smooth(
    method = "lm",
    formula = y ~ poly(x, 2),
    color = "red",
    se = TRUE,
    level = 0.95
  )
polynomial regression plot for chance of admission and University.Rating
# Chance.of.Admit (x) against University.Rating (y): points coloured by
# Research, with a blue second-degree polynomial fit and its 95% confidence
# band.
ggplot(
  data = Admission,
  mapping = aes(x = Chance.of.Admit, y = University.Rating, color = Research)
) +
  geom_point() +
  geom_smooth(
    method = "lm",
    formula = y ~ poly(x, 2),
    color = "blue",
    se = TRUE,
    level = 0.95
  )
polynomial regression plot for chance of admission and SOP
# Chance.of.Admit (x) against SOP (y): points coloured by Research, with a
# red second-degree polynomial fit and its 95% confidence band.
ggplot(
  data = Admission,
  mapping = aes(x = Chance.of.Admit, y = SOP, color = Research)
) +
  geom_point() +
  geom_smooth(
    method = "lm",
    formula = y ~ poly(x, 2),
    color = "red",
    se = TRUE,
    level = 0.95
  )
polynomial regression plot for chance of admission and LOR
# Chance.of.Admit (x) against LOR (y): points coloured by Research, with an
# orange second-degree polynomial fit and its 95% confidence band.
ggplot(
  data = Admission,
  mapping = aes(x = Chance.of.Admit, y = LOR, color = Research)
) +
  geom_point() +
  geom_smooth(
    method = "lm",
    formula = y ~ poly(x, 2),
    color = "orange",
    se = TRUE,
    level = 0.95
  )
polynomial regression plot for chance of admission and CGPA
# Chance.of.Admit (x) against CGPA (y): points coloured by Research, with a
# black second-degree polynomial fit and its 95% confidence band.
ggplot(
  data = Admission,
  mapping = aes(x = Chance.of.Admit, y = CGPA, color = Research)
) +
  geom_point() +
  geom_smooth(
    method = "lm",
    formula = y ~ poly(x, 2),
    color = "black",
    se = TRUE,
    level = 0.95
  )
polynomial regression plot for chance of admission and Research
# Chance.of.Admit (x) against Research (y): points coloured by Research,
# with a purple second-degree polynomial fit requested on top. Research is a
# two-valued label here, so the y axis is discrete.
ggplot(
  data = Admission,
  mapping = aes(x = Chance.of.Admit, y = Research, color = Research)
) +
  geom_point() +
  geom_smooth(
    method = "lm",
    formula = y ~ poly(x, 2),
    color = "purple",
    se = TRUE,
    level = 0.95
  )
Strong reference letters, a high cumulative grade point average (CGPA), research experience, and scores on the GRE and TOEFL are all associated with better odds of being admitted to US colleges, according to the regression analysis. These results emphasise how important academic achievement, language competency, excellent recommendations, steady grades, and research involvement are for gaining admission.
Conclusion
The study looked at a sample of people’s diabetes outcomes and the relationships between different factors. The findings showed that, whereas diabetes status had no effect on age or pregnancy, there was a significant correlation between age and the number of pregnancies. Obesity may be a risk factor for diabetes, as evidenced by the positive correlation between BMI and triceps skin fold thickness. The 2-hour point revealed a skew in insulin levels, with some people having significantly higher readings than others. One severe outlier that departed from the typical range in the oral glucose tolerance test results called for additional research. The study underlined the need for additional research on the outlier and the possible mechanisms underlying the observed relationships, while also offering insightful information about the variables influencing diabetes outcomes.