library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
library(readr)
library(gtsummary)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ purrr 1.0.2
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(papaja)
## Loading required package: tinylabels
library(report)
daiabet <- read_csv("C:/Users/USER/Desktop/data_science_portfolio/data_science_portfolio/daiabet.csv")
## Rows: 75 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Sex, Complications of diabetes
## dbl (7): age, hight, weight, blood sugar, blood pressure, type of diabetes, ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
diabetic<-daiabet
diabetic<-rename(diabetic,Height=hight)
explore the data
diabetic<-diabetic%>%distinct()
names(diabetic)<-str_to_title(str_to_lower(str_replace_all(names(diabetic),"\\s","_")))
#age min 15 years max=77 years
diabetic<-diabetic%>%mutate(Age_group=factor(ifelse(Age<43,"Young",ifelse(Age<58,"Middle_aged","Old"))),Bmi=Weight/(Height/100)^2,Status=factor(ifelse(Bmi<18.5,"Underweight",ifelse(Bmi<24.9,"Normalweight",ifelse(Bmi<29.9,"Overweight","Obese")))))
class(diabetic$Sex<-as.factor(diabetic$Sex))#change to categorical variable
## [1] "factor"
class(diabetic$Family_history<-as.factor(diabetic$Family_history))
## [1] "factor"
class(diabetic$Complications_of_diabetes<-as.ordered(diabetic$Complications_of_diabetes))
## [1] "ordered" "factor"
class(diabetic$Status<-as.factor(diabetic$Status))
## [1] "factor"
class(diabetic$Age_group<-as.ordered(diabetic$Age_group))
## [1] "ordered" "factor"
diabetic$Type_of_diabetes<-as.factor(diabetic$Type_of_diabetes)
summary(diabetic)
## Sex Age Height Weight Blood_sugar
## female:42 Min. :15.00 Min. :147.0 Min. : 50.00 Min. :128.0
## male :33 1st Qu.:47.50 1st Qu.:158.0 1st Qu.: 67.00 1st Qu.:163.0
## Median :55.00 Median :160.0 Median : 75.00 Median :178.0
## Mean :54.17 Mean :162.9 Mean : 74.35 Mean :191.9
## 3rd Qu.:63.00 3rd Qu.:168.0 3rd Qu.: 80.00 3rd Qu.:216.5
## Max. :77.00 Max. :192.0 Max. :110.00 Max. :315.0
##
## Blood_pressure Type_of_diabetes Complications_of_diabetes Family_history
## Min. : 90.6 1:21 eye :15 1:43
## 1st Qu.:110.7 2:54 Nervous : 9 2:32
## Median :120.8 no problem : 9
## Mean :122.7 eye and Nervous: 5
## 3rd Qu.:130.8 kidney : 5
## Max. :160.8 digestive : 4
## (Other) :28
## Age_group Bmi Status
## Middle_aged:34 Min. :17.30 Normalweight:16
## Old :30 1st Qu.:25.11 Obese :25
## Young :11 Median :28.72 Overweight :33
## Mean :28.17 Underweight : 1
## 3rd Qu.:31.45
## Max. :39.06
##
str(diabetic)
## tibble [75 × 12] (S3: tbl_df/tbl/data.frame)
## $ Sex : Factor w/ 2 levels "female","male": 2 2 2 1 2 1 1 1 2 1 ...
## $ Age : num [1:75] 64 64 55 54 41 43 71 54 59 49 ...
## $ Height : num [1:75] 160 175 168 155 173 158 155 167 170 160 ...
## $ Weight : num [1:75] 90 65 77 84 66 79 73 90 50 54 ...
## $ Blood_sugar : num [1:75] 196 128 211 154 192 178 250 170 200 171 ...
## $ Blood_pressure : num [1:75] 90.6 110.6 100.6 110.5 90.6 ...
## $ Type_of_diabetes : Factor w/ 2 levels "1","2": 2 2 1 2 2 2 1 2 1 1 ...
## $ Complications_of_diabetes: Ord.factor w/ 24 levels "cardiac and eye"<..: 8 17 8 8 17 22 8 14 14 17 ...
## $ Family_history : Factor w/ 2 levels "1","2": 1 1 1 1 2 1 1 1 1 2 ...
## $ Age_group : Ord.factor w/ 3 levels "Middle_aged"<..: 2 2 1 1 3 1 2 1 2 1 ...
## $ Bmi : num [1:75] 35.2 21.2 27.3 35 22.1 ...
## $ Status : Factor w/ 4 levels "Normalweight",..: 2 1 3 2 1 2 2 2 4 1 ...
dim(diabetic)
## [1] 75 12
diabetic%>%select(Sex,Height)%>%group_by(Sex)%>%filter(Sex=="female")%>%summarise(h=mean(Height))
change column names to standard
names(diabetic)<-str_to_title(str_to_lower(str_replace_all(names(diabetic),"\\s","_")))
#sql command
diabetic%>%ggplot(aes(Bmi,Blood_pressure))+geom_jitter(aes(color=Family_history,shape=Sex))+geom_smooth(method=lm,se=0)+facet_wrap(~Age_group)
## `geom_smooth()` using formula = 'y ~ x'
diabetic%>%ggplot(aes(Bmi,Blood_pressure))+geom_jitter(aes(color=Family_history,shape=Sex))+geom_smooth(method=lm,se=0)+facet_wrap(~Status)
## `geom_smooth()` using formula = 'y ~ x'
diabetic%>%ggplot(aes(Bmi,Blood_pressure))+geom_jitter(aes(color=Family_history,shape=Sex))+geom_smooth(method=lm,se=0)+facet_grid(Sex~Status)
## `geom_smooth()` using formula = 'y ~ x'
#sql
ran<-sqldf('select*,max(Bmi) from diabetic group by Age_group')
ran
rank<-sqldf('select*,max(Bmi) over() from diabetic group by Age_group')
rank
f<-sqldf('select*from diabetic where Age=64 and Sex="female" or Blood_pressure>90.0 and Sex="female" ')
f
#top seven most complicated cases of diabetes for overall population
top_five_most_complicated_cases<-sqldf('select Complications_of_diabetes,sum(Blood_pressure) as pressure from diabetic group by Complications_of_diabetes order by pressure desc limit 5 ')
top_five_most_complicated_cases
top_five_most_complicated_cases<-sqldf('select Complications_of_diabetes,sum(Blood_pressure) over(partition by Sex) as pressure from diabetic group by Complications_of_diabetes order by pressure ')
top_five_most_complicated_cases
#top five least complicated cases of diabetes for overall population
top_five_least_complicated_cases<-sqldf('select Complications_of_diabetes,sum(Blood_pressure) as pressure from diabetic group by Complications_of_diabetes order by pressure asc limit 5 ')
top_five_least_complicated_cases
#complicated cases of diabetes by gender
top_five_most_complicated_cases_by_gender<-sqldf('select Sex,Complications_of_diabetes,sum(Blood_pressure) as pressure from diabetic group by Sex order by pressure desc')
top_five_most_complicated_cases_by_gender
#complicated cases of diabetes by Age_group
top_five_most_complicated_cases_by_Age_group<-sqldf('select Age_group,Complications_of_diabetes,sum(Blood_pressure) as pressure from diabetic group by Age_group order by pressure desc')
top_five_most_complicated_cases_by_Age_group
#complicated cases of diabetes by Type_of_diabetes
top_five_most_complicated_cases_by_Type_of_diabetes<-sqldf('select Type_of_diabetes,Complications_of_diabetes,sum(Blood_pressure) as pressure from diabetic group by Type_of_diabetes order by pressure desc')
top_five_most_complicated_cases_by_Type_of_diabetes
#complicated cases of diabetes by Family_history
top_five_most_complicated_cases_by_Family_history<-sqldf('select Family_history ,Complications_of_diabetes,sum(Blood_pressure) as pressure from diabetic group by Family_history order by pressure desc ')
top_five_most_complicated_cases_by_Family_history
#complicated cases of diabetes by Weight Status
top_five_most_complicated_cases_by_Status<-sqldf('select Status,Complications_of_diabetes,sum(Blood_pressure) as pressure from diabetic group by Status order by pressure desc ')
top_five_most_complicated_cases_by_Status
#blood pressure by sex
high_pressure<-sqldf('select Sex ,sum(Blood_pressure) as pressure from diabetic group by Sex order by pressure desc')
high_pressure
#blood pressure by age group
high_pressure<-sqldf('select Age_group ,sum(Blood_pressure) as pressure from diabetic group by Age_group order by pressure desc ')
high_pressure
pressurey<-sqldf('select Sex,Age_group,sum(Blood_pressure) as pressure from diabetic where Age_group="Young" group by Sex order by pressure desc')
pressurey
pressurem<-sqldf('select Sex,Age_group,sum(Blood_pressure) as pressure from diabetic where Age_group="Middle_aged" group by Sex order by pressure desc')
pressurem
pressureo<-sqldf('select Sex,Age_group,sum(Blood_pressure) as pressure from diabetic where Age_group="Old" group by Sex order by pressure desc')
pressurem
plot(high_pressure$Age_group,high_pressure$Blood_pressure)
diabetic%>%ggplot(aes(Sex,Blood_pressure))+geom_boxplot()+facet_wrap(~Age_group)
structure and summary of the data
convert variables to categorical
diabetic$Sex<-as.factor(diabetic$Sex)
diabetic$Family_history<-as.factor(diabetic$Family_history)
diabetic$Type_of_diabetes<-as.factor(diabetic$Type_of_diabetes)
diabetic$Complications_of_diabetes<-as.factor(diabetic$Complications_of_diabetes)
str(diabetic)
## tibble [75 × 12] (S3: tbl_df/tbl/data.frame)
## $ Sex : Factor w/ 2 levels "female","male": 2 2 2 1 2 1 1 1 2 1 ...
## $ Age : num [1:75] 64 64 55 54 41 43 71 54 59 49 ...
## $ Height : num [1:75] 160 175 168 155 173 158 155 167 170 160 ...
## $ Weight : num [1:75] 90 65 77 84 66 79 73 90 50 54 ...
## $ Blood_sugar : num [1:75] 196 128 211 154 192 178 250 170 200 171 ...
## $ Blood_pressure : num [1:75] 90.6 110.6 100.6 110.5 90.6 ...
## $ Type_of_diabetes : Factor w/ 2 levels "1","2": 2 2 1 2 2 2 1 2 1 1 ...
## $ Complications_of_diabetes: Ord.factor w/ 24 levels "cardiac and eye"<..: 8 17 8 8 17 22 8 14 14 17 ...
## $ Family_history : Factor w/ 2 levels "1","2": 1 1 1 1 2 1 1 1 1 2 ...
## $ Age_group : Ord.factor w/ 3 levels "Middle_aged"<..: 2 2 1 1 3 1 2 1 2 1 ...
## $ Bmi : num [1:75] 35.2 21.2 27.3 35 22.1 ...
## $ Status : Factor w/ 4 levels "Normalweight",..: 2 1 3 2 1 2 2 2 4 1 ...
check outliers
box<-diabetic%>%ggplot(aes(Blood_sugar,Family_history,fill=Type_of_diabetes))+geom_boxplot()+stat_summary(fun=mean)+facet_wrap(~Sex)+coord_flip()
box
## Warning: Removed 4 rows containing missing values (`geom_segment()`).
## Removed 4 rows containing missing values (`geom_segment()`).
box1<-diabetic%>%ggplot(aes(Blood_pressure,Family_history,fill=Type_of_diabetes))+geom_boxplot()+stat_summary(fun=mean)+facet_wrap(~Sex)+coord_flip()
box1
## Warning: Removed 4 rows containing missing values (`geom_segment()`).
## Removed 4 rows containing missing values (`geom_segment()`).
boxw<-diabetic%>%ggplot(aes(Weight,Family_history,fill=Type_of_diabetes))+geom_boxplot()+stat_summary(fun=mean)+facet_wrap(~Sex)+coord_flip()
boxw
## Warning: Removed 4 rows containing missing values (`geom_segment()`).
## Removed 4 rows containing missing values (`geom_segment()`).
boxH<-diabetic%>%ggplot(aes(Height,Family_history,fill=Type_of_diabetes))+geom_boxplot()+stat_summary(fun=mean)+facet_wrap(~Sex)+coord_flip()
boxH
## Warning: Removed 4 rows containing missing values (`geom_segment()`).
## Removed 4 rows containing missing values (`geom_segment()`).
boxA<-diabetic%>%ggplot(aes(Age,Family_history,fill=Type_of_diabetes))+geom_boxplot()+stat_summary(fun=mean)+facet_wrap(~Sex)+coord_flip()
boxA
## Warning: Removed 4 rows containing missing values (`geom_segment()`).
## Removed 4 rows containing missing values (`geom_segment()`).
summary
D<-diabetic%>%tbl_cross(row=Sex,col = Family_history,percent = "cell")%>%add_p(source_note=TRUE)
D
| Family_history | Total | ||
|---|---|---|---|
| 1 | 2 | ||
| Sex | |||
| female | 23 (31%) | 19 (25%) | 42 (56%) |
| male | 20 (27%) | 13 (17%) | 33 (44%) |
| Total | 43 (57%) | 32 (43%) | 75 (100%) |
| Pearson’s Chi-squared test, p=0.6 | |||
DG<-diabetic%>%tbl_cross(row=Sex,col = Type_of_diabetes,percent = "cell")%>%add_p(source_note=TRUE)
DG
| Type_of_diabetes | Total | ||
|---|---|---|---|
| 1 | 2 | ||
| Sex | |||
| female | 10 (13%) | 32 (43%) | 42 (56%) |
| male | 11 (15%) | 22 (29%) | 33 (44%) |
| Total | 21 (28%) | 54 (72%) | 75 (100%) |
| Pearson’s Chi-squared test, p=0.4 | |||
D<-diabetic%>%tbl_cross(row=Type_of_diabetes,col = Family_history,percent = "cell")%>%add_p(source_note=TRUE)
D
| Family_history | Total | ||
|---|---|---|---|
| 1 | 2 | ||
| Type_of_diabetes | |||
| 1 | 8 (11%) | 13 (17%) | 21 (28%) |
| 2 | 35 (47%) | 19 (25%) | 54 (72%) |
| Total | 43 (57%) | 32 (43%) | 75 (100%) |
| Pearson’s Chi-squared test, p=0.036 | |||
table1<-diabetic%>%tbl_summary(by=Type_of_diabetes)%>%add_overall()%>%bold_labels()%>%add_p(test=all_continuous()~"t.test",pvalue_fun=~style_pvalue(.,digits = 2))
table1
| Characteristic | Overall, N = 751 | 1, N = 211 | 2, N = 541 | p-value2 |
|---|---|---|---|---|
| Sex | 0.36 | |||
| female | 42 (56%) | 10 (48%) | 32 (59%) | |
| male | 33 (44%) | 11 (52%) | 22 (41%) | |
| Age | 55 (48, 63) | 51 (45, 62) | 55 (49, 64) | 0.15 |
| Height | 160 (158, 168) | 162 (159, 170) | 160 (158, 167) | 0.37 |
| Weight | 75 (67, 80) | 69 (59, 80) | 77 (69, 83) | 0.017 |
| Blood_sugar | 178 (163, 217) | 177 (168, 218) | 178 (163, 213) | 0.71 |
| Blood_pressure | 121 (111, 131) | 121 (101, 130) | 122 (111, 134) | 0.13 |
| Complications_of_diabetes | 0.55 | |||
| cardiac and eye | 1 (1.3%) | 0 (0%) | 1 (1.9%) | |
| cardiac and eye and skin | 1 (1.3%) | 0 (0%) | 1 (1.9%) | |
| digestive | 4 (5.3%) | 1 (4.8%) | 3 (5.6%) | |
| digestive and eye | 2 (2.7%) | 0 (0%) | 2 (3.7%) | |
| digestive and eye and Nervous | 1 (1.3%) | 0 (0%) | 1 (1.9%) | |
| digestive and kidney | 1 (1.3%) | 1 (4.8%) | 0 (0%) | |
| digestive and rheumatism | 1 (1.3%) | 0 (0%) | 1 (1.9%) | |
| eye | 15 (20%) | 5 (24%) | 10 (19%) | |
| eye and digestive | 3 (4.0%) | 0 (0%) | 3 (5.6%) | |
| eye and kidney | 2 (2.7%) | 1 (4.8%) | 1 (1.9%) | |
| eye and Nervous | 5 (6.7%) | 3 (14%) | 2 (3.7%) | |
| eye and skin | 3 (4.0%) | 0 (0%) | 3 (5.6%) | |
| eye kidney | 1 (1.3%) | 0 (0%) | 1 (1.9%) | |
| kidney | 5 (6.7%) | 1 (4.8%) | 4 (7.4%) | |
| kidney and eye | 1 (1.3%) | 1 (4.8%) | 0 (0%) | |
| kidney and Nervous | 1 (1.3%) | 1 (4.8%) | 0 (0%) | |
| Nervous | 9 (12%) | 2 (9.5%) | 7 (13%) | |
| Nervous and eye | 4 (5.3%) | 0 (0%) | 4 (7.4%) | |
| Nervous and kidney and eye | 1 (1.3%) | 0 (0%) | 1 (1.9%) | |
| Nervous and kidney and skin | 1 (1.3%) | 0 (0%) | 1 (1.9%) | |
| Nervous and sexual problems | 1 (1.3%) | 1 (4.8%) | 0 (0%) | |
| Nervous Gastrointestinal and skin | 2 (2.7%) | 0 (0%) | 2 (3.7%) | |
| no problem | 9 (12%) | 4 (19%) | 5 (9.3%) | |
| skin and eye | 1 (1.3%) | 0 (0%) | 1 (1.9%) | |
| Family_history | 0.036 | |||
| 1 | 43 (57%) | 8 (38%) | 35 (65%) | |
| 2 | 32 (43%) | 13 (62%) | 19 (35%) | |
| Age_group | 0.37 | |||
| Middle_aged | 34 (45%) | 8 (38%) | 26 (48%) | |
| Old | 30 (40%) | 8 (38%) | 22 (41%) | |
| Young | 11 (15%) | 5 (24%) | 6 (11%) | |
| Bmi | 28.7 (25.1, 31.4) | 25.0 (23.0, 28.7) | 29.2 (26.7, 31.9) | 0.003 |
| Status | 0.031 | |||
| Normalweight | 16 (21%) | 8 (38%) | 8 (15%) | |
| Obese | 25 (33%) | 4 (19%) | 21 (39%) | |
| Overweight | 33 (44%) | 8 (38%) | 25 (46%) | |
| Underweight | 1 (1.3%) | 1 (4.8%) | 0 (0%) | |
| 1 n (%); Median (IQR) | ||||
| 2 Pearson’s Chi-squared test; Welch Two Sample t-test; Fisher’s exact test | ||||
correlation
correlation<-cor.test(diabetic$Blood_sugar,diabetic$Blood_pressure)
report(correlation)
## Effect sizes were labelled following Funder's (2019) recommendations.
##
## The Pearson's product-moment correlation between diabetic$Blood_sugar and
## diabetic$Blood_pressure is positive, statistically significant, and medium (r =
## 0.24, 95% CI [0.01, 0.44], t(73) = 2.08, p = 0.041)
h<-diabetic%>%ggplot(aes(Blood_pressure))+geom_histogram(bins=18,fill="blue",alpha=0.5)
hd<-diabetic%>%ggplot(aes(Blood_sugar))+geom_histogram(bins=18,fill="blue",alpha=0.5)
par(mfrow=c(1,2))
h
hd
par(mfrow=c(1,1))
Plotting linear regressions
ff<-diabetic%>%ggplot(aes(Blood_sugar,Blood_pressure))+geom_point()+geom_smooth(method="lm",se=0)+facet_wrap(~Family_history)+theme_minimal()
fy<-diabetic%>%ggplot(aes(Blood_sugar,Height))+geom_point()+geom_smooth(method="lm",se=0)+facet_wrap(~Family_history)+theme_minimal()
fyr<-diabetic%>%ggplot(aes(Blood_pressure,Height))+geom_point()+geom_smooth(method="lm",se=0)+facet_wrap(~Family_history)+theme_minimal()
fyr<-diabetic%>%ggplot(aes(Blood_pressure,Weight))+geom_point()+geom_smooth(method="lm",se=0)+facet_wrap(~Family_history)+theme_minimal()
fyr<-diabetic%>%ggplot(aes(Blood_pressure,Age))+geom_point()+geom_smooth(method="lm",se=0)+facet_wrap(~Family_history)+theme_minimal()
f<-diabetic%>%ggplot(aes(Weight,Height))+geom_point()+geom_smooth(method="lm",se=0)+facet_wrap(~Family_history)+theme_minimal()
fn<-diabetic%>%ggplot(aes(Weight,Blood_sugar))+geom_point()+geom_smooth(method="lm",se=0)+facet_wrap(~Family_history)+theme_minimal()
fb<-diabetic%>%ggplot(aes(Weight,Blood_pressure))+geom_point()+geom_smooth(method="lm",se=0)+facet_wrap(~Family_history)+theme_minimal()
fx<-diabetic%>%ggplot(aes(Weight,Age))+geom_point()+geom_smooth(method="lm",se=0)+facet_wrap(~Family_history)+theme_minimal()
fu<-diabetic%>%ggplot(aes(Weight,Height))+geom_point()+geom_smooth(method="lm",se=0)+facet_wrap(~Family_history)+theme_minimal()
par(mfcol=c(2,2))
ff
## `geom_smooth()` using formula = 'y ~ x'
fy
## `geom_smooth()` using formula = 'y ~ x'
fb
## `geom_smooth()` using formula = 'y ~ x'
fx
## `geom_smooth()` using formula = 'y ~ x'
diabetic<-diabetic%>%mutate(x=1:75)
plot(diabetic$x,diabetic$Weight,type = 'l',col='blue')
lines(diabetic$x,diabetic$Age,col='red',ylim=c(min(diabetic$x), max(diabetic$x)))
par(mfrow=c(1,2))
plot(diabetic$x,diabetic$Weight,type = 'l',col='blue')
plot(diabetic$x,diabetic$Age,type = 'l',col='red')
par(mfrow=c(2,1),mar=c(2,4,4,2))
plot(diabetic$x,diabetic$Hight,type = 'l',col='blue')
## Warning: Unknown or uninitialised column: `Hight`.
plot(diabetic$x,diabetic$Bood_pressure,type = 'l',col='red',ylim=c(min(diabetic$x), max(diabetic$x)))
## Warning: Unknown or uninitialised column: `Bood_pressure`.
par(mfrow=(c(2,2)))
fs<-diabetic%>%ggplot(aes(x,Weight))+geom_line(color="blue")+facet_wrap(~Sex)
f<-diabetic%>%ggplot(aes(x,Age))+geom_line(color="purple")+facet_wrap(~Sex)
fd<-diabetic%>%ggplot(aes(x,Blood_pressure))+geom_line(color="purple")+facet_wrap(~Sex)
fs
f
fd
par(cex=0.7, mai=c(0.1,0.1,0.2,0.1))
# define area for the histogram
par(fig=c(0.1,0.7,0.3,0.9))
hist(diabetic$Age)
#define area for the boxplot
par(fig=c(0.8,1,0,1),new=TRUE)
boxplot(diabetic$Age)
#define area for stripchart
par(fig=c(0.1,0.67,0.1,0.25),new=TRUE)
stripchart(diabetic$Age,method = "jitter")
par(mfrow=c(2,2))
hist(diabetic$Age)
boxplot(diabetic$Age,horizontal = T)
hist(diabetic$Weight)
boxplot(diabetic$Weight,horizontal = T)
hist(diabetic$Blood_sugar)
boxplot(diabetic$Blood_sugar,horizontal = T)
hist(diabetic$Height)
boxplot(diabetic$Height,horizontal = T)
c<-diabetic%>%group_by(Family_history)%>%summarise_at(.vars=vars(Height,Weight,Blood_sugar,Blood_pressure,Age),.funs=mean)
c
names(c$Blood_sugar)=c("FamilyA","familyB")
pct<-round(c$Blood_sugar/sum(c$Blood_sugar)*100)
lbls<-paste(c("FamilyA","familyB"),"",pct,"%",sep ="")
pie(c$Blood_sugar,labels =lbls,color=c$Blood_sugar,main="% Avarage Blood_pressure by Family Type")
## Warning in text.default(1.1 * P$x, 1.1 * P$y, labels[i], xpd = TRUE, adj =
## ifelse(P$x < : "color" is not a graphical parameter
## Warning in text.default(1.1 * P$x, 1.1 * P$y, labels[i], xpd = TRUE, adj =
## ifelse(P$x < : "color" is not a graphical parameter
## Warning in title(main = main, ...): "color" is not a graphical parameter
names(c$Weight)=c("FamilyA","FamilyB")
pct<-round(c$Weight/sum(c$Weight)*100)
lbls<-paste(c("FamilyA","FamilyB"),"",pct,"%",sep ="")
pie(c$Weight,labels =lbls,color=c$Weight,main="% Avarage Weight by Family Type")
## Warning in text.default(1.1 * P$x, 1.1 * P$y, labels[i], xpd = TRUE, adj =
## ifelse(P$x < : "color" is not a graphical parameter
## Warning in text.default(1.1 * P$x, 1.1 * P$y, labels[i], xpd = TRUE, adj =
## ifelse(P$x < : "color" is not a graphical parameter
## Warning in title(main = main, ...): "color" is not a graphical parameter
layout(matrix(c(1,2,3,4),nrow = 2,ncol = 2,byrow = TRUE))
plot(diabetic$x,diabetic$Weight,type = 'l',col='red',ylim=c(min(diabetic$x), max(diabetic$x)))
hist(diabetic$Weight,main = "Weight Histogram", xlab = "Weight")
boxplot(diabetic$Weight,main = " Weight Boxplot")
pie(c$Weight,labels =lbls,color=c$Weight,main="% Avarage Weight by Family Type")
## Warning in text.default(1.1 * P$x, 1.1 * P$y, labels[i], xpd = TRUE, adj =
## ifelse(P$x < : "color" is not a graphical parameter
## Warning in text.default(1.1 * P$x, 1.1 * P$y, labels[i], xpd = TRUE, adj =
## ifelse(P$x < : "color" is not a graphical parameter
## Warning in title(main = main, ...): "color" is not a graphical parameter
layout(matrix(c(1,2,3,4),nrow = 2,ncol = 2,byrow = TRUE))
plot(diabetic$x,diabetic$Bood_pressure,type = 'l',col='red',ylim=c(min(diabetic$x), max(diabetic$x)))
## Warning: Unknown or uninitialised column: `Bood_pressure`.
hist(diabetic$Blood_pressure, main = "Blood_pressure Histogram", xlab = "Blood_pressure")
boxplot(diabetic$Blood_pressure, main = "Blood_pressure Boxplot")
pie(c$Blood_sugar,labels =lbls,color=c$Blood_sugar,main="% Avarage Blood_pressure by Family Type")
## Warning in text.default(1.1 * P$x, 1.1 * P$y, labels[i], xpd = TRUE, adj =
## ifelse(P$x < : "color" is not a graphical parameter
## Warning in text.default(1.1 * P$x, 1.1 * P$y, labels[i], xpd = TRUE, adj =
## ifelse(P$x < : "color" is not a graphical parameter
## Warning in title(main = main, ...): "color" is not a graphical parameter
linear regression
shapiro.test(diabetic$Blood_pressure)
##
## Shapiro-Wilk normality test
##
## data: diabetic$Blood_pressure
## W = 0.96409, p-value = 0.03188
shapiro.test(diabetic$Blood_sugar)
##
## Shapiro-Wilk normality test
##
## data: diabetic$Blood_sugar
## W = 0.93131, p-value = 0.0005375
shapiro.test(diabetic$Weight)
##
## Shapiro-Wilk normality test
##
## data: diabetic$Weight
## W = 0.98423, p-value = 0.4768
shapiro.test(diabetic$Height)
##
## Shapiro-Wilk normality test
##
## data: diabetic$Height
## W = 0.8918, p-value = 1.013e-05
shapiro.test(diabetic$Age)
##
## Shapiro-Wilk normality test
##
## data: diabetic$Age
## W = 0.96771, p-value = 0.0524
lm<-lm(Blood_pressure~Blood_sugar,data=diabetic)%>%tbl_regression()
lm
| Characteristic | Beta | 95% CI1 | p-value |
|---|---|---|---|
| Blood_sugar | 0.08 | 0.00, 0.16 | 0.041 |
| 1 CI = Confidence Interval | |||
data partitioning
set.seed(123)
ind<-sample(2,nrow(diabetic),replace = T,prob = c(0.8,0.2))
train<-diabetic[ind==1,]
test<-diabetic[ind==2,]
model
mymodel<-glm(Type_of_diabetes~Sex+Age_group+Height+Weight+Family_history+Blood_sugar+Blood_pressure,data = train,family = "binomial")
predict
p1<-predict(mymodel,train,type='response')
confusion matix
pred1=ifelse(p1>0.5,1,0)
tab1<-table(predicted=pred1,Actual=train$Type_of_diabetes)
tab1
## Actual
## predicted 1 2
## 0 8 3
## 1 10 39
misclassification error
(1-sum(diag(tab1))/sum(tab1))*100
## [1] 21.66667
accuracy
(sum(diag(tab1))/sum(tab1))*100
## [1] 78.33333
goodness of fit
with(mymodel,pchisq(null.deviance-deviance,df.null-df.residual,lower.tail = F))
## [1] 0.1521733
p2<-predict(mymodel,test,type='response')
pred1=ifelse(p2>0.5,1,0)
tab1<-table(predicted=pred1,Actual=test$Type_of_diabetes)
#misclassification error
(1-sum(diag(tab1))/sum(tab1))*100
## [1] 26.66667
#accuracy
(sum(diag(tab1))/sum(tab1))*100
## [1] 73.33333
#goodness of fit
with(mymodel,pchisq(null.deviance-deviance,df.null-df.residual,lower.tail = F))
## [1] 0.1521733
ordered logistc regression
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## The following object is masked from 'package:gtsummary':
##
## select
model<-polr(Status~Sex+Age_group+Height+Weight+Family_history+Blood_sugar+Blood_pressure,data=train,Hess=T)
#calculate P values
ctable<-coef(summary(model))#coeficient
p<-pnorm(abs(ctable[,"t value"]),lower.tail = FALSE)*2
(ctable<-cbind(ctable,"p value"=p))
## Value Std. Error t value p value
## Sexmale 0.141413761 0.614740465 0.2300382 0.81806212
## Age_group.L -0.152324998 0.609685391 -0.2498420 0.80270956
## Age_group.Q -0.286915315 0.467664541 -0.6135067 0.53954137
## Height -0.062616070 0.039549779 -1.5832217 0.11337093
## Weight 0.014741743 0.027057352 0.5448332 0.58586826
## Family_history2 -1.010645593 0.532862794 -1.8966338 0.05787628
## Blood_sugar 0.001226567 0.005655695 0.2168729 0.82830738
## Blood_pressure -0.019717899 0.016665065 -1.1831876 0.23673481
## Normalweight|Obese -13.200026717 6.979762761 -1.8911856 0.05859957
## Obese|Overweight -11.424596488 6.927765272 -1.6491027 0.09912659
## Overweight|Underweight -7.266107313 6.928210851 -1.0487711 0.29428348
t2<-tbl_regression(model,exponentiate=TRUE)%>%add_global_p()
t2
| Characteristic | OR1 | 95% CI1 | p-value |
|---|---|---|---|
| Sex | 0.8 | ||
| female | — | — | |
| male | 1.15 | 0.35, 3.93 | |
| Age_group | 0.8 | ||
| Age_group.L | 0.86 | 0.25, 2.87 | |
| Age_group.Q | 0.75 | 0.30, 1.88 | |
| Height | 0.94 | 0.87, 1.01 | 0.11 |
| Weight | 1.01 | 0.96, 1.07 | 0.6 |
| Family_history | 0.055 | ||
| 1 | — | — | |
| 2 | 0.36 | 0.12, 1.02 | |
| Blood_sugar | 1.00 | 0.99, 1.01 | 0.8 |
| Blood_pressure | 0.98 | 0.95, 1.01 | 0.2 |
| 1 OR = Odds Ratio, CI = Confidence Interval | |||
#predictiions
pred<-predict(model,train)
print(pred,digits = 3)
## [1] Overweight Overweight Overweight Overweight Overweight
## [6] Overweight Obese Overweight Obese Obese
## [11] Obese Overweight Obese Obese Overweight
## [16] Overweight Obese Overweight Overweight Overweight
## [21] Overweight Obese Obese Overweight Obese
## [26] Normalweight Overweight Obese Normalweight Overweight
## [31] Overweight Overweight Obese Normalweight Overweight
## [36] Overweight Overweight Overweight Overweight Overweight
## [41] Overweight Overweight Obese Obese Obese
## [46] Overweight Overweight Obese Obese Overweight
## [51] Overweight Overweight Overweight Overweight Overweight
## [56] Overweight Normalweight Obese Overweight Overweight
## Levels: Normalweight Obese Overweight Underweight
#confusion matrix
(tab<-table(pred,train$Status))
##
## pred Normalweight Obese Overweight Underweight
## Normalweight 4 0 0 0
## Obese 4 5 9 0
## Overweight 3 15 19 1
## Underweight 0 0 0 0
#misclassification
1-sum(diag(tab))/sum(tab)
## [1] 0.5333333
#confusion matrix for test data
pred1<-predict(model,test)
#missclasification error
tab<-table(pred1,test$Status)
1-sum(diag(tab))/sum(tab)
## [1] 0.7333333
## other logistic regression interpretations
library(gtsummary)
t1<-tbl_regression(mymodel,exponentiate = TRUE)%>%add_global_p()
t1
| Characteristic | OR1 | 95% CI1 | p-value |
|---|---|---|---|
| Sex | 0.6 | ||
| female | — | — | |
| male | 0.70 | 0.15, 3.15 | |
| Age_group | 0.9 | ||
| Age_group.L | 0.97 | 0.25, 4.24 | |
| Age_group.Q | 1.29 | 0.42, 4.10 | |
| Height | 0.97 | 0.89, 1.06 | 0.5 |
| Weight | 1.06 | 1.00, 1.14 | 0.045 |
| Family_history | 0.036 | ||
| 1 | — | — | |
| 2 | 0.26 | 0.07, 0.92 | |
| Blood_sugar | 1.00 | 0.99, 1.01 | >0.9 |
| Blood_pressure | 1.03 | 0.98, 1.07 | 0.3 |
| 1 OR = Odds Ratio, CI = Confidence Interval | |||
#The odds ratio for age was 'r inline_text(regr, variable = Age)'
"{estimate} ({conf.level*100}% CI
{conf.low},{conf.high}; {p.value})"
## [1] "{estimate} ({conf.level*100}% CI\n{conf.low},{conf.high}; {p.value})"