library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
library(readr)
library(gtsummary)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(papaja)
## Loading required package: tinylabels
library(report)
daiabet <- read_csv("C:/Users/USER/Desktop/data_science_portfolio/data_science_portfolio/daiabet.csv")
## Rows: 75 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Sex, Complications of diabetes
## dbl (7): age, hight, weight, blood sugar, blood pressure, type of diabetes, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
diabetic<-daiabet

diabetic<-rename(diabetic,Height=hight)

explore the data

diabetic<-diabetic%>%distinct()
names(diabetic)<-str_to_title(str_to_lower(str_replace_all(names(diabetic),"\\s","_")))
#age min 15 years max=77 years 
diabetic<-diabetic%>%mutate(Age_group=factor(ifelse(Age<43,"Young",ifelse(Age<58,"Middle_aged","Old"))),Bmi=Weight/(Height/100)^2,Status=factor(ifelse(Bmi<18.5,"Underweight",ifelse(Bmi<24.9,"Normalweight",ifelse(Bmi<29.9,"Overweight","Obese")))))
class(diabetic$Sex<-as.factor(diabetic$Sex))#change to categorical variable
## [1] "factor"
class(diabetic$Family_history<-as.factor(diabetic$Family_history))
## [1] "factor"
class(diabetic$Complications_of_diabetes<-as.ordered(diabetic$Complications_of_diabetes))
## [1] "ordered" "factor"
class(diabetic$Status<-as.factor(diabetic$Status))
## [1] "factor"
class(diabetic$Age_group<-as.ordered(diabetic$Age_group))
## [1] "ordered" "factor"
diabetic$Type_of_diabetes<-as.factor(diabetic$Type_of_diabetes)
summary(diabetic)
##      Sex          Age            Height          Weight        Blood_sugar   
##  female:42   Min.   :15.00   Min.   :147.0   Min.   : 50.00   Min.   :128.0  
##  male  :33   1st Qu.:47.50   1st Qu.:158.0   1st Qu.: 67.00   1st Qu.:163.0  
##              Median :55.00   Median :160.0   Median : 75.00   Median :178.0  
##              Mean   :54.17   Mean   :162.9   Mean   : 74.35   Mean   :191.9  
##              3rd Qu.:63.00   3rd Qu.:168.0   3rd Qu.: 80.00   3rd Qu.:216.5  
##              Max.   :77.00   Max.   :192.0   Max.   :110.00   Max.   :315.0  
##                                                                              
##  Blood_pressure  Type_of_diabetes   Complications_of_diabetes Family_history
##  Min.   : 90.6   1:21             eye            :15          1:43          
##  1st Qu.:110.7   2:54             Nervous        : 9          2:32          
##  Median :120.8                    no problem     : 9                        
##  Mean   :122.7                    eye and Nervous: 5                        
##  3rd Qu.:130.8                    kidney         : 5                        
##  Max.   :160.8                    digestive      : 4                        
##                                   (Other)        :28                        
##        Age_group       Bmi                 Status  
##  Middle_aged:34   Min.   :17.30   Normalweight:16  
##  Old        :30   1st Qu.:25.11   Obese       :25  
##  Young      :11   Median :28.72   Overweight  :33  
##                   Mean   :28.17   Underweight : 1  
##                   3rd Qu.:31.45                    
##                   Max.   :39.06                    
## 
str(diabetic)
## tibble [75 × 12] (S3: tbl_df/tbl/data.frame)
##  $ Sex                      : Factor w/ 2 levels "female","male": 2 2 2 1 2 1 1 1 2 1 ...
##  $ Age                      : num [1:75] 64 64 55 54 41 43 71 54 59 49 ...
##  $ Height                   : num [1:75] 160 175 168 155 173 158 155 167 170 160 ...
##  $ Weight                   : num [1:75] 90 65 77 84 66 79 73 90 50 54 ...
##  $ Blood_sugar              : num [1:75] 196 128 211 154 192 178 250 170 200 171 ...
##  $ Blood_pressure           : num [1:75] 90.6 110.6 100.6 110.5 90.6 ...
##  $ Type_of_diabetes         : Factor w/ 2 levels "1","2": 2 2 1 2 2 2 1 2 1 1 ...
##  $ Complications_of_diabetes: Ord.factor w/ 24 levels "cardiac and eye"<..: 8 17 8 8 17 22 8 14 14 17 ...
##  $ Family_history           : Factor w/ 2 levels "1","2": 1 1 1 1 2 1 1 1 1 2 ...
##  $ Age_group                : Ord.factor w/ 3 levels "Middle_aged"<..: 2 2 1 1 3 1 2 1 2 1 ...
##  $ Bmi                      : num [1:75] 35.2 21.2 27.3 35 22.1 ...
##  $ Status                   : Factor w/ 4 levels "Normalweight",..: 2 1 3 2 1 2 2 2 4 1 ...
dim(diabetic)
## [1] 75 12
diabetic%>%select(Sex,Height)%>%group_by(Sex)%>%filter(Sex=="female")%>%summarise(h=mean(Height))

change column names to standard

names(diabetic)<-str_to_title(str_to_lower(str_replace_all(names(diabetic),"\\s","_")))
#sql command
diabetic%>%ggplot(aes(Bmi,Blood_pressure))+geom_jitter(aes(color=Family_history,shape=Sex))+geom_smooth(method=lm,se=0)+facet_wrap(~Age_group)
## `geom_smooth()` using formula = 'y ~ x'

diabetic%>%ggplot(aes(Bmi,Blood_pressure))+geom_jitter(aes(color=Family_history,shape=Sex))+geom_smooth(method=lm,se=0)+facet_wrap(~Status)
## `geom_smooth()` using formula = 'y ~ x'

diabetic%>%ggplot(aes(Bmi,Blood_pressure))+geom_jitter(aes(color=Family_history,shape=Sex))+geom_smooth(method=lm,se=0)+facet_grid(Sex~Status)
## `geom_smooth()` using formula = 'y ~ x'

#sql

ran<-sqldf('select*,max(Bmi) from diabetic group by Age_group')
ran
rank<-sqldf('select*,max(Bmi) over() from diabetic group by Age_group')
rank
f<-sqldf('select*from diabetic where Age=64 and Sex="female" or Blood_pressure>90.0 and Sex="female" ')
f
#top seven most complicated cases of  diabetes for overall population
top_five_most_complicated_cases<-sqldf('select Complications_of_diabetes,sum(Blood_pressure) as pressure  from diabetic group by Complications_of_diabetes order by pressure desc limit 5 ')
top_five_most_complicated_cases
top_five_most_complicated_cases<-sqldf('select Complications_of_diabetes,sum(Blood_pressure) over(partition by Sex) as pressure  from diabetic group by Complications_of_diabetes order by pressure ')
top_five_most_complicated_cases
#top five least complicated cases of  diabetes for overall population
top_five_least_complicated_cases<-sqldf('select Complications_of_diabetes,sum(Blood_pressure) as pressure  from diabetic group by Complications_of_diabetes order by pressure asc limit 5 ')
top_five_least_complicated_cases
#complicated cases of  diabetes by gender
top_five_most_complicated_cases_by_gender<-sqldf('select Sex,Complications_of_diabetes,sum(Blood_pressure) as pressure  from diabetic  group by Sex order by pressure desc') 
top_five_most_complicated_cases_by_gender
#complicated cases of  diabetes by Age_group
top_five_most_complicated_cases_by_Age_group<-sqldf('select Age_group,Complications_of_diabetes,sum(Blood_pressure) as pressure  from diabetic  group by Age_group order by pressure desc') 
top_five_most_complicated_cases_by_Age_group
#complicated cases of  diabetes by Type_of_diabetes
top_five_most_complicated_cases_by_Type_of_diabetes<-sqldf('select Type_of_diabetes,Complications_of_diabetes,sum(Blood_pressure) as pressure  from diabetic  group by Type_of_diabetes order by pressure desc') 
top_five_most_complicated_cases_by_Type_of_diabetes
#complicated cases of  diabetes by Family_history
top_five_most_complicated_cases_by_Family_history<-sqldf('select Family_history ,Complications_of_diabetes,sum(Blood_pressure) as pressure  from diabetic  group by Family_history order by pressure desc ') 
top_five_most_complicated_cases_by_Family_history
#complicated cases of  diabetes by Weight Status
top_five_most_complicated_cases_by_Status<-sqldf('select Status,Complications_of_diabetes,sum(Blood_pressure) as pressure  from diabetic  group by Status order by pressure desc ') 
top_five_most_complicated_cases_by_Status
#blood pressure by sex
high_pressure<-sqldf('select Sex ,sum(Blood_pressure) as pressure  from diabetic group by Sex order by pressure desc')
high_pressure
#blood pressure by age group
high_pressure<-sqldf('select Age_group ,sum(Blood_pressure) as pressure  from diabetic group by  Age_group order by pressure desc ')
high_pressure
pressurey<-sqldf('select Sex,Age_group,sum(Blood_pressure) as pressure  from diabetic where Age_group="Young" group by Sex order by pressure desc') 
pressurey
pressurem<-sqldf('select Sex,Age_group,sum(Blood_pressure) as pressure  from diabetic where Age_group="Middle_aged" group by Sex order by pressure desc') 
pressurem
pressureo<-sqldf('select Sex,Age_group,sum(Blood_pressure) as pressure  from diabetic where Age_group="Old" group by Sex order by pressure desc') 
pressurem
plot(high_pressure$Age_group,high_pressure$Blood_pressure)

diabetic%>%ggplot(aes(Sex,Blood_pressure))+geom_boxplot()+facet_wrap(~Age_group)

structure and summary of the data

convert variables to categorical

diabetic$Sex<-as.factor(diabetic$Sex)
diabetic$Family_history<-as.factor(diabetic$Family_history)
diabetic$Type_of_diabetes<-as.factor(diabetic$Type_of_diabetes)
diabetic$Complications_of_diabetes<-as.factor(diabetic$Complications_of_diabetes)
str(diabetic)
## tibble [75 × 12] (S3: tbl_df/tbl/data.frame)
##  $ Sex                      : Factor w/ 2 levels "female","male": 2 2 2 1 2 1 1 1 2 1 ...
##  $ Age                      : num [1:75] 64 64 55 54 41 43 71 54 59 49 ...
##  $ Height                   : num [1:75] 160 175 168 155 173 158 155 167 170 160 ...
##  $ Weight                   : num [1:75] 90 65 77 84 66 79 73 90 50 54 ...
##  $ Blood_sugar              : num [1:75] 196 128 211 154 192 178 250 170 200 171 ...
##  $ Blood_pressure           : num [1:75] 90.6 110.6 100.6 110.5 90.6 ...
##  $ Type_of_diabetes         : Factor w/ 2 levels "1","2": 2 2 1 2 2 2 1 2 1 1 ...
##  $ Complications_of_diabetes: Ord.factor w/ 24 levels "cardiac and eye"<..: 8 17 8 8 17 22 8 14 14 17 ...
##  $ Family_history           : Factor w/ 2 levels "1","2": 1 1 1 1 2 1 1 1 1 2 ...
##  $ Age_group                : Ord.factor w/ 3 levels "Middle_aged"<..: 2 2 1 1 3 1 2 1 2 1 ...
##  $ Bmi                      : num [1:75] 35.2 21.2 27.3 35 22.1 ...
##  $ Status                   : Factor w/ 4 levels "Normalweight",..: 2 1 3 2 1 2 2 2 4 1 ...

check outliers

box<-diabetic%>%ggplot(aes(Blood_sugar,Family_history,fill=Type_of_diabetes))+geom_boxplot()+stat_summary(fun=mean)+facet_wrap(~Sex)+coord_flip()
box
## Warning: Removed 4 rows containing missing values (`geom_segment()`).
## Removed 4 rows containing missing values (`geom_segment()`).

box1<-diabetic%>%ggplot(aes(Blood_pressure,Family_history,fill=Type_of_diabetes))+geom_boxplot()+stat_summary(fun=mean)+facet_wrap(~Sex)+coord_flip()
box1
## Warning: Removed 4 rows containing missing values (`geom_segment()`).
## Removed 4 rows containing missing values (`geom_segment()`).

boxw<-diabetic%>%ggplot(aes(Weight,Family_history,fill=Type_of_diabetes))+geom_boxplot()+stat_summary(fun=mean)+facet_wrap(~Sex)+coord_flip()
boxw
## Warning: Removed 4 rows containing missing values (`geom_segment()`).
## Removed 4 rows containing missing values (`geom_segment()`).

boxH<-diabetic%>%ggplot(aes(Height,Family_history,fill=Type_of_diabetes))+geom_boxplot()+stat_summary(fun=mean)+facet_wrap(~Sex)+coord_flip()
boxH
## Warning: Removed 4 rows containing missing values (`geom_segment()`).
## Removed 4 rows containing missing values (`geom_segment()`).

boxA<-diabetic%>%ggplot(aes(Age,Family_history,fill=Type_of_diabetes))+geom_boxplot()+stat_summary(fun=mean)+facet_wrap(~Sex)+coord_flip()
boxA
## Warning: Removed 4 rows containing missing values (`geom_segment()`).
## Removed 4 rows containing missing values (`geom_segment()`).

summary

D<-diabetic%>%tbl_cross(row=Sex,col = Family_history,percent = "cell")%>%add_p(source_note=TRUE)
D
Family_history Total
1 2
Sex


    female 23 (31%) 19 (25%) 42 (56%)
    male 20 (27%) 13 (17%) 33 (44%)
Total 43 (57%) 32 (43%) 75 (100%)
Pearson’s Chi-squared test, p=0.6
DG<-diabetic%>%tbl_cross(row=Sex,col = Type_of_diabetes,percent = "cell")%>%add_p(source_note=TRUE)
DG
Type_of_diabetes Total
1 2
Sex


    female 10 (13%) 32 (43%) 42 (56%)
    male 11 (15%) 22 (29%) 33 (44%)
Total 21 (28%) 54 (72%) 75 (100%)
Pearson’s Chi-squared test, p=0.4
D<-diabetic%>%tbl_cross(row=Type_of_diabetes,col = Family_history,percent = "cell")%>%add_p(source_note=TRUE)
D
Family_history Total
1 2
Type_of_diabetes


    1 8 (11%) 13 (17%) 21 (28%)
    2 35 (47%) 19 (25%) 54 (72%)
Total 43 (57%) 32 (43%) 75 (100%)
Pearson’s Chi-squared test, p=0.036
table1<-diabetic%>%tbl_summary(by=Type_of_diabetes)%>%add_overall()%>%bold_labels()%>%add_p(test=all_continuous()~"t.test",pvalue_fun=~style_pvalue(.,digits = 2))
table1
Characteristic Overall, N = 751 1, N = 211 2, N = 541 p-value2
Sex


0.36
    female 42 (56%) 10 (48%) 32 (59%)
    male 33 (44%) 11 (52%) 22 (41%)
Age 55 (48, 63) 51 (45, 62) 55 (49, 64) 0.15
Height 160 (158, 168) 162 (159, 170) 160 (158, 167) 0.37
Weight 75 (67, 80) 69 (59, 80) 77 (69, 83) 0.017
Blood_sugar 178 (163, 217) 177 (168, 218) 178 (163, 213) 0.71
Blood_pressure 121 (111, 131) 121 (101, 130) 122 (111, 134) 0.13
Complications_of_diabetes


0.55
    cardiac and eye 1 (1.3%) 0 (0%) 1 (1.9%)
    cardiac and eye and skin 1 (1.3%) 0 (0%) 1 (1.9%)
    digestive 4 (5.3%) 1 (4.8%) 3 (5.6%)
    digestive and eye 2 (2.7%) 0 (0%) 2 (3.7%)
    digestive and eye and Nervous 1 (1.3%) 0 (0%) 1 (1.9%)
    digestive and kidney 1 (1.3%) 1 (4.8%) 0 (0%)
    digestive and rheumatism 1 (1.3%) 0 (0%) 1 (1.9%)
    eye 15 (20%) 5 (24%) 10 (19%)
    eye and digestive 3 (4.0%) 0 (0%) 3 (5.6%)
    eye and kidney 2 (2.7%) 1 (4.8%) 1 (1.9%)
    eye and Nervous 5 (6.7%) 3 (14%) 2 (3.7%)
    eye and skin 3 (4.0%) 0 (0%) 3 (5.6%)
    eye kidney 1 (1.3%) 0 (0%) 1 (1.9%)
    kidney 5 (6.7%) 1 (4.8%) 4 (7.4%)
    kidney and eye 1 (1.3%) 1 (4.8%) 0 (0%)
    kidney and Nervous 1 (1.3%) 1 (4.8%) 0 (0%)
    Nervous 9 (12%) 2 (9.5%) 7 (13%)
    Nervous and eye 4 (5.3%) 0 (0%) 4 (7.4%)
    Nervous and kidney and eye 1 (1.3%) 0 (0%) 1 (1.9%)
    Nervous and kidney and skin 1 (1.3%) 0 (0%) 1 (1.9%)
    Nervous and sexual problems 1 (1.3%) 1 (4.8%) 0 (0%)
    Nervous Gastrointestinal and skin 2 (2.7%) 0 (0%) 2 (3.7%)
    no problem 9 (12%) 4 (19%) 5 (9.3%)
    skin and eye 1 (1.3%) 0 (0%) 1 (1.9%)
Family_history


0.036
    1 43 (57%) 8 (38%) 35 (65%)
    2 32 (43%) 13 (62%) 19 (35%)
Age_group


0.37
    Middle_aged 34 (45%) 8 (38%) 26 (48%)
    Old 30 (40%) 8 (38%) 22 (41%)
    Young 11 (15%) 5 (24%) 6 (11%)
Bmi 28.7 (25.1, 31.4) 25.0 (23.0, 28.7) 29.2 (26.7, 31.9) 0.003
Status


0.031
    Normalweight 16 (21%) 8 (38%) 8 (15%)
    Obese 25 (33%) 4 (19%) 21 (39%)
    Overweight 33 (44%) 8 (38%) 25 (46%)
    Underweight 1 (1.3%) 1 (4.8%) 0 (0%)
1 n (%); Median (IQR)
2 Pearson’s Chi-squared test; Welch Two Sample t-test; Fisher’s exact test

correlation

correlation<-cor.test(diabetic$Blood_sugar,diabetic$Blood_pressure)
report(correlation)
## Effect sizes were labelled following Funder's (2019) recommendations.
## 
## The Pearson's product-moment correlation between diabetic$Blood_sugar and
## diabetic$Blood_pressure is positive, statistically significant, and medium (r =
## 0.24, 95% CI [0.01, 0.44], t(73) = 2.08, p = 0.041)
h<-diabetic%>%ggplot(aes(Blood_pressure))+geom_histogram(bins=18,fill="blue",alpha=0.5)
hd<-diabetic%>%ggplot(aes(Blood_sugar))+geom_histogram(bins=18,fill="blue",alpha=0.5)
par(mfrow=c(1,2))
h

hd

par(mfrow=c(1,1))

Plotting linear regressions

ff<-diabetic%>%ggplot(aes(Blood_sugar,Blood_pressure))+geom_point()+geom_smooth(method="lm",se=0)+facet_wrap(~Family_history)+theme_minimal()
fy<-diabetic%>%ggplot(aes(Blood_sugar,Height))+geom_point()+geom_smooth(method="lm",se=0)+facet_wrap(~Family_history)+theme_minimal()
fyr<-diabetic%>%ggplot(aes(Blood_pressure,Height))+geom_point()+geom_smooth(method="lm",se=0)+facet_wrap(~Family_history)+theme_minimal()
fyr<-diabetic%>%ggplot(aes(Blood_pressure,Weight))+geom_point()+geom_smooth(method="lm",se=0)+facet_wrap(~Family_history)+theme_minimal()
fyr<-diabetic%>%ggplot(aes(Blood_pressure,Age))+geom_point()+geom_smooth(method="lm",se=0)+facet_wrap(~Family_history)+theme_minimal()
f<-diabetic%>%ggplot(aes(Weight,Height))+geom_point()+geom_smooth(method="lm",se=0)+facet_wrap(~Family_history)+theme_minimal()
fn<-diabetic%>%ggplot(aes(Weight,Blood_sugar))+geom_point()+geom_smooth(method="lm",se=0)+facet_wrap(~Family_history)+theme_minimal()
fb<-diabetic%>%ggplot(aes(Weight,Blood_pressure))+geom_point()+geom_smooth(method="lm",se=0)+facet_wrap(~Family_history)+theme_minimal()
fx<-diabetic%>%ggplot(aes(Weight,Age))+geom_point()+geom_smooth(method="lm",se=0)+facet_wrap(~Family_history)+theme_minimal()
fu<-diabetic%>%ggplot(aes(Weight,Height))+geom_point()+geom_smooth(method="lm",se=0)+facet_wrap(~Family_history)+theme_minimal()
par(mfcol=c(2,2))
ff
## `geom_smooth()` using formula = 'y ~ x'

fy
## `geom_smooth()` using formula = 'y ~ x'

fb
## `geom_smooth()` using formula = 'y ~ x'

fx
## `geom_smooth()` using formula = 'y ~ x'

diabetic<-diabetic%>%mutate(x=1:75)
plot(diabetic$x,diabetic$Weight,type = 'l',col='blue')
  lines(diabetic$x,diabetic$Age,col='red',ylim=c(min(diabetic$x), max(diabetic$x)))

par(mfrow=c(1,2))
plot(diabetic$x,diabetic$Weight,type = 'l',col='blue')
plot(diabetic$x,diabetic$Age,type = 'l',col='red')

par(mfrow=c(2,1),mar=c(2,4,4,2))
plot(diabetic$x,diabetic$Hight,type = 'l',col='blue')
## Warning: Unknown or uninitialised column: `Hight`.
plot(diabetic$x,diabetic$Bood_pressure,type = 'l',col='red',ylim=c(min(diabetic$x), max(diabetic$x)))
## Warning: Unknown or uninitialised column: `Bood_pressure`.

par(mfrow=(c(2,2)))
fs<-diabetic%>%ggplot(aes(x,Weight))+geom_line(color="blue")+facet_wrap(~Sex)
f<-diabetic%>%ggplot(aes(x,Age))+geom_line(color="purple")+facet_wrap(~Sex)
fd<-diabetic%>%ggplot(aes(x,Blood_pressure))+geom_line(color="purple")+facet_wrap(~Sex)
fs

f

fd

par(cex=0.7, mai=c(0.1,0.1,0.2,0.1))
# define area for the histogram
par(fig=c(0.1,0.7,0.3,0.9))
hist(diabetic$Age)
#define area for the boxplot
par(fig=c(0.8,1,0,1),new=TRUE)
boxplot(diabetic$Age)
#define area for stripchart
par(fig=c(0.1,0.67,0.1,0.25),new=TRUE)
stripchart(diabetic$Age,method = "jitter")

par(mfrow=c(2,2))
hist(diabetic$Age)
boxplot(diabetic$Age,horizontal = T)
hist(diabetic$Weight)
boxplot(diabetic$Weight,horizontal = T)

hist(diabetic$Blood_sugar)
boxplot(diabetic$Blood_sugar,horizontal = T)
hist(diabetic$Height)
boxplot(diabetic$Height,horizontal = T)

c<-diabetic%>%group_by(Family_history)%>%summarise_at(.vars=vars(Height,Weight,Blood_sugar,Blood_pressure,Age),.funs=mean)
c
names(c$Blood_sugar)=c("FamilyA","familyB")
pct<-round(c$Blood_sugar/sum(c$Blood_sugar)*100)
lbls<-paste(c("FamilyA","familyB"),"",pct,"%",sep ="")
pie(c$Blood_sugar,labels =lbls,color=c$Blood_sugar,main="% Avarage Blood_pressure by Family Type")
## Warning in text.default(1.1 * P$x, 1.1 * P$y, labels[i], xpd = TRUE, adj =
## ifelse(P$x < : "color" is not a graphical parameter

## Warning in text.default(1.1 * P$x, 1.1 * P$y, labels[i], xpd = TRUE, adj =
## ifelse(P$x < : "color" is not a graphical parameter
## Warning in title(main = main, ...): "color" is not a graphical parameter

names(c$Weight)=c("FamilyA","FamilyB")
pct<-round(c$Weight/sum(c$Weight)*100)
lbls<-paste(c("FamilyA","FamilyB"),"",pct,"%",sep ="")
pie(c$Weight,labels =lbls,color=c$Weight,main="% Avarage Weight by Family Type")
## Warning in text.default(1.1 * P$x, 1.1 * P$y, labels[i], xpd = TRUE, adj =
## ifelse(P$x < : "color" is not a graphical parameter

## Warning in text.default(1.1 * P$x, 1.1 * P$y, labels[i], xpd = TRUE, adj =
## ifelse(P$x < : "color" is not a graphical parameter
## Warning in title(main = main, ...): "color" is not a graphical parameter

layout(matrix(c(1,2,3,4),nrow = 2,ncol = 2,byrow = TRUE))
plot(diabetic$x,diabetic$Weight,type = 'l',col='red',ylim=c(min(diabetic$x), max(diabetic$x)))
hist(diabetic$Weight,main = "Weight Histogram", xlab = "Weight") 
boxplot(diabetic$Weight,main = " Weight Boxplot") 
pie(c$Weight,labels =lbls,color=c$Weight,main="% Avarage Weight by Family Type")
## Warning in text.default(1.1 * P$x, 1.1 * P$y, labels[i], xpd = TRUE, adj =
## ifelse(P$x < : "color" is not a graphical parameter
## Warning in text.default(1.1 * P$x, 1.1 * P$y, labels[i], xpd = TRUE, adj =
## ifelse(P$x < : "color" is not a graphical parameter
## Warning in title(main = main, ...): "color" is not a graphical parameter

layout(matrix(c(1,2,3,4),nrow = 2,ncol = 2,byrow = TRUE))
plot(diabetic$x,diabetic$Bood_pressure,type = 'l',col='red',ylim=c(min(diabetic$x), max(diabetic$x)))
## Warning: Unknown or uninitialised column: `Bood_pressure`.
hist(diabetic$Blood_pressure, main = "Blood_pressure Histogram", xlab = "Blood_pressure") 
boxplot(diabetic$Blood_pressure, main = "Blood_pressure Boxplot") 
pie(c$Blood_sugar,labels =lbls,color=c$Blood_sugar,main="% Avarage  Blood_pressure by Family Type")
## Warning in text.default(1.1 * P$x, 1.1 * P$y, labels[i], xpd = TRUE, adj =
## ifelse(P$x < : "color" is not a graphical parameter
## Warning in text.default(1.1 * P$x, 1.1 * P$y, labels[i], xpd = TRUE, adj =
## ifelse(P$x < : "color" is not a graphical parameter
## Warning in title(main = main, ...): "color" is not a graphical parameter

linear regression

shapiro.test(diabetic$Blood_pressure)
## 
##  Shapiro-Wilk normality test
## 
## data:  diabetic$Blood_pressure
## W = 0.96409, p-value = 0.03188
shapiro.test(diabetic$Blood_sugar)
## 
##  Shapiro-Wilk normality test
## 
## data:  diabetic$Blood_sugar
## W = 0.93131, p-value = 0.0005375
shapiro.test(diabetic$Weight)
## 
##  Shapiro-Wilk normality test
## 
## data:  diabetic$Weight
## W = 0.98423, p-value = 0.4768
shapiro.test(diabetic$Height)
## 
##  Shapiro-Wilk normality test
## 
## data:  diabetic$Height
## W = 0.8918, p-value = 1.013e-05
shapiro.test(diabetic$Age)
## 
##  Shapiro-Wilk normality test
## 
## data:  diabetic$Age
## W = 0.96771, p-value = 0.0524
lm<-lm(Blood_pressure~Blood_sugar,data=diabetic)%>%tbl_regression()
lm
Characteristic Beta 95% CI1 p-value
Blood_sugar 0.08 0.00, 0.16 0.041
1 CI = Confidence Interval

Machine Learning (Logistic Regression Alogarithm)

data partitioning

set.seed(123)
ind<-sample(2,nrow(diabetic),replace = T,prob = c(0.8,0.2))
train<-diabetic[ind==1,]
test<-diabetic[ind==2,]

model

mymodel<-glm(Type_of_diabetes~Sex+Age_group+Height+Weight+Family_history+Blood_sugar+Blood_pressure,data = train,family = "binomial")

predict

p1<-predict(mymodel,train,type='response')

confusion matix

pred1=ifelse(p1>0.5,1,0)
tab1<-table(predicted=pred1,Actual=train$Type_of_diabetes)
tab1
##          Actual
## predicted  1  2
##         0  8  3
##         1 10 39

misclassification error

(1-sum(diag(tab1))/sum(tab1))*100
## [1] 21.66667

accuracy

(sum(diag(tab1))/sum(tab1))*100
## [1] 78.33333

goodness of fit

with(mymodel,pchisq(null.deviance-deviance,df.null-df.residual,lower.tail = F))
## [1] 0.1521733

for test data

p2<-predict(mymodel,test,type='response')
pred1=ifelse(p2>0.5,1,0)
tab1<-table(predicted=pred1,Actual=test$Type_of_diabetes)
#misclassification error
(1-sum(diag(tab1))/sum(tab1))*100
## [1] 26.66667
#accuracy
(sum(diag(tab1))/sum(tab1))*100
## [1] 73.33333
#goodness of fit
with(mymodel,pchisq(null.deviance-deviance,df.null-df.residual,lower.tail = F))
## [1] 0.1521733

ordered logistc regression

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## The following object is masked from 'package:gtsummary':
## 
##     select
model<-polr(Status~Sex+Age_group+Height+Weight+Family_history+Blood_sugar+Blood_pressure,data=train,Hess=T)

#calculate P values
ctable<-coef(summary(model))#coeficient
p<-pnorm(abs(ctable[,"t value"]),lower.tail = FALSE)*2
(ctable<-cbind(ctable,"p value"=p))
##                                Value  Std. Error    t value    p value
## Sexmale                  0.141413761 0.614740465  0.2300382 0.81806212
## Age_group.L             -0.152324998 0.609685391 -0.2498420 0.80270956
## Age_group.Q             -0.286915315 0.467664541 -0.6135067 0.53954137
## Height                  -0.062616070 0.039549779 -1.5832217 0.11337093
## Weight                   0.014741743 0.027057352  0.5448332 0.58586826
## Family_history2         -1.010645593 0.532862794 -1.8966338 0.05787628
## Blood_sugar              0.001226567 0.005655695  0.2168729 0.82830738
## Blood_pressure          -0.019717899 0.016665065 -1.1831876 0.23673481
## Normalweight|Obese     -13.200026717 6.979762761 -1.8911856 0.05859957
## Obese|Overweight       -11.424596488 6.927765272 -1.6491027 0.09912659
## Overweight|Underweight  -7.266107313 6.928210851 -1.0487711 0.29428348
t2<-tbl_regression(model,exponentiate=TRUE)%>%add_global_p()
t2
Characteristic OR1 95% CI1 p-value
Sex

0.8
    female
    male 1.15 0.35, 3.93
Age_group

0.8
    Age_group.L 0.86 0.25, 2.87
    Age_group.Q 0.75 0.30, 1.88
Height 0.94 0.87, 1.01 0.11
Weight 1.01 0.96, 1.07 0.6
Family_history

0.055
    1
    2 0.36 0.12, 1.02
Blood_sugar 1.00 0.99, 1.01 0.8
Blood_pressure 0.98 0.95, 1.01 0.2
1 OR = Odds Ratio, CI = Confidence Interval
#predictiions
pred<-predict(model,train)
print(pred,digits = 3)
##  [1] Overweight   Overweight   Overweight   Overweight   Overweight  
##  [6] Overweight   Obese        Overweight   Obese        Obese       
## [11] Obese        Overweight   Obese        Obese        Overweight  
## [16] Overweight   Obese        Overweight   Overweight   Overweight  
## [21] Overweight   Obese        Obese        Overweight   Obese       
## [26] Normalweight Overweight   Obese        Normalweight Overweight  
## [31] Overweight   Overweight   Obese        Normalweight Overweight  
## [36] Overweight   Overweight   Overweight   Overweight   Overweight  
## [41] Overweight   Overweight   Obese        Obese        Obese       
## [46] Overweight   Overweight   Obese        Obese        Overweight  
## [51] Overweight   Overweight   Overweight   Overweight   Overweight  
## [56] Overweight   Normalweight Obese        Overweight   Overweight  
## Levels: Normalweight Obese Overweight Underweight
#confusion matrix
(tab<-table(pred,train$Status))
##               
## pred           Normalweight Obese Overweight Underweight
##   Normalweight            4     0          0           0
##   Obese                   4     5          9           0
##   Overweight              3    15         19           1
##   Underweight             0     0          0           0
#misclassification
1-sum(diag(tab))/sum(tab)
## [1] 0.5333333
#confusion matrix for test data
pred1<-predict(model,test)
#missclasification error
tab<-table(pred1,test$Status)
1-sum(diag(tab))/sum(tab)
## [1] 0.7333333
##  other logistic regression interpretations
library(gtsummary)

t1<-tbl_regression(mymodel,exponentiate = TRUE)%>%add_global_p()
t1
Characteristic OR1 95% CI1 p-value
Sex

0.6
    female
    male 0.70 0.15, 3.15
Age_group

0.9
    Age_group.L 0.97 0.25, 4.24
    Age_group.Q 1.29 0.42, 4.10
Height 0.97 0.89, 1.06 0.5
Weight 1.06 1.00, 1.14 0.045
Family_history

0.036
    1
    2 0.26 0.07, 0.92
Blood_sugar 1.00 0.99, 1.01 >0.9
Blood_pressure 1.03 0.98, 1.07 0.3
1 OR = Odds Ratio, CI = Confidence Interval
#The odds ratio for age was 'r inline_text(regr, variable = Age)'
"{estimate} ({conf.level*100}% CI
{conf.low},{conf.high}; {p.value})"
## [1] "{estimate} ({conf.level*100}% CI\n{conf.low},{conf.high}; {p.value})"