#Clear R environment
rm(list = ls())
# Working directory
setwd("C:/Users/USER/Desktop/JOOUST")
# Load data set in Rstudio
library(readxl)
data_1 <- read_excel("data_1.xlsx")
View(data_1)DATA SCIENCE SKILLS
Data wrangling
str(data_1)tibble [227 × 6] (S3: tbl_df/tbl/data.frame)
$ IdNumber : num [1:227] 1 2 3 4 5 6 7 8 9 10 ...
$ Education : chr [1:227] "Secondary" "Secondary" "Secondary" "Secondary" ...
$ Marital : chr [1:227] "Married" "Married" "Married" "Single" ...
$ Height : num [1:227] 168 138 157 151 158 158 164 171 163 156 ...
$ Gender : chr [1:227] "Female" "Female" "Male" "Male" ...
$ Agefirstsex: chr [1:227] "17+years" "17+years" "17+years" "<17years" ...
Load packages for data processing, visualization and Analysis
library(tidyverse)── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.0 ✔ tibble 3.2.1
✔ lubridate 1.9.3 ✔ tidyr 1.3.1
✔ purrr 1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(table1)
Attaching package: 'table1'
The following objects are masked from 'package:base':
units, units<-
library(summarytools)
Attaching package: 'summarytools'
The following objects are masked from 'package:table1':
label, label<-
The following object is masked from 'package:tibble':
view
Convert Character to factor variables
data_1$Education<-as.factor(data_1$Education)
data_1$Marital<-as.factor(data_1$Marital)
data_1$Gender<-as.factor(data_1$Gender)
data_1$Agefirstsex<-as.factor(data_1$Agefirstsex)Loading the second data set
library(readxl)
data_2 <- read_excel("data_2.xlsx")Explore data_2
str(data_2)tibble [227 × 7] (S3: tbl_df/tbl/data.frame)
$ IdNumber : num [1:227] 1 2 3 4 5 6 7 8 9 10 ...
$ Date : POSIXct[1:227], format: "2009-12-05" "2009-12-05" ...
$ casestatus: num [1:227] 1 0 1 0 1 0 1 0 1 0 ...
$ Age : num [1:227] 30 30 23 24 27 25 33 30 63 63 ...
$ Occupation: num [1:227] 4 2 1 1 4 1 1 2 4 3 ...
$ Church : num [1:227] 7 7 4 3 4 2 3 3 3 7 ...
$ Weight : num [1:227] 69 48 55 39 44.5 75 74 74.7 66 83 ...
Data processing
Value label for categorical variables
data_2$casestatus<-ordered(data_2$casestatus,levels=c(0,1),labels=c("No","Yes"))
data_2$Church<-ordered(data_2$Church,levels=c(1,2,3,4,5,6,7),labels=c("Anglican","Apostolic","Atheist","Methodist","Pentecostal","Catholic","Other"))Convert character to factor variables
data_2$Date<-as.Date(data_2$Date)
data_2$casestatus<-as.factor(data_2$casestatus)
data_2$Church<-as.factor(data_2$Church)Delete a column
Dropvar<-names(data_2)%in% c("Occupation")
data_2<-data_2[!Dropvar]Merging data set
sti<-merge(data_2,data_1,by="IdNumber")Dealing with duplicates
# Identify duplicates
sum(duplicated(sti$IdNumber))[1] 3
# Remove duplicates
sti<-sti%>%distinct(IdNumber,.keep_all = TRUE)Missing data
Explore each and every variable to identify if there is any missing values
# Exploring categorical variables
sti%>%count(casestatus,sort = TRUE) casestatus n
1 Yes 113
2 No 112
3 <NA> 1
sti%>%count(Church,sort = TRUE) Church n
1 Apostolic 52
2 Atheist 45
3 Catholic 41
4 Pentecostal 36
5 Other 23
6 Anglican 16
7 Methodist 13
sti%>%count(Education,sort = TRUE) Education n
1 Secondary 188
2 Primary 26
3 Tertiary 11
4 None 1
sti%>%count(Marital,sort = TRUE) Marital n
1 Married 149
2 Single 43
3 Divorcee 16
4 Co-habiting 9
5 Widowed 9
sti%>%count(Agefirstsex,sort = TRUE) Agefirstsex n
1 17+years 179
2 <17years 41
3 <NA> 6
sti%>%count(Gender,sort = TRUE) Gender n
1 Male 118
2 Female 108
#Exploring continuous variables
summary(sti$Age) Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
16.00 23.00 26.00 27.98 32.00 63.00 1
summary(sti$Weight) Min. 1st Qu. Median Mean 3rd Qu. Max.
32.80 45.50 51.00 53.74 60.88 100.00
summary(sti$Height) Min. 1st Qu. Median Mean 3rd Qu. Max.
138.0 156.0 160.5 161.0 166.0 191.0
Replacing missing data in categorical variables
sti<-sti%>%
mutate(casestatus=replace_na(casestatus,"No"))
table(sti$casestatus)
No Yes
113 113
Dealing with missing data in continuous variables
sti<-sti%>%
mutate(Age=replace_na(Age,48))
summary(sti$Age) Min. 1st Qu. Median Mean 3rd Qu. Max.
16.00 23.00 26.50 28.07 32.00 63.00
Data Manipulation
sti<-sti%>%
rename(STI=casestatus)%>%
mutate(BMI=Weight/(Height/100)^2)Rounding BMI to 1decimal places
sti$BMI<-round(as.numeric(sti$BMI),1)Convert continuous variables to categorical
Participants Age group
# Age group
range(sti$Age)[1] 16 63
sti<-within(sti,{
Agegroup<-NA
Agegroup[Age>=16 & Age<=25]<-"16-25"
Agegroup[Age>=26 & Age<=35]<-"26-35"
Agegroup[Age>=36 & Age<=45]<-"36-45"
Agegroup[Age>=46 & Age<=55]<-"46-55"
Agegroup[Age>=56]<-"56+"
})Participants BMI categories
# BMI categories
range(sti$BMI)[1] 13.4 36.3
sti<-within(sti,{
BMIGroup<-NA
BMIGroup[BMI<=18.4]<-"Malnourished"
BMIGroup[BMI>=18.5 & BMI<=24.9]<-"Normal"
BMIGroup[BMI>=25 & BMI<=29.9]<-"Over weight"
BMIGroup[BMI>=30]<-"Obesity"
})
sti$BMIGroup<-as.factor(sti$BMIGroup)Variable labels
library(expss)Loading required package: maditr
To get total summary skip 'by' argument: take_all(mtcars, mean)
Attaching package: 'maditr'
The following objects are masked from 'package:dplyr':
between, coalesce, first, last
The following object is masked from 'package:purrr':
transpose
The following object is masked from 'package:readr':
cols
Use 'expss_output_rnotebook()' to display tables inside R Notebooks.
To return to the console output, use 'expss_output_default()'.
Attaching package: 'expss'
The following objects are masked from 'package:stringr':
fixed, regex
The following objects are masked from 'package:dplyr':
compute, contains, na_if, recode, vars, where
The following objects are masked from 'package:purrr':
keep, modify, modify_if, when
The following objects are masked from 'package:tidyr':
contains, nest
The following object is masked from 'package:ggplot2':
vars
sti<-apply_labels(sti,
Age="Age in Years",
Education="Level of Education",
Weight=" Weight in Kgs",
Marital="Marital Status",
Height="Height in Cm",
Agefirstsex="Age at first sex",
BMI="Body Mass Index in Kg/M2",
Agegroup="Participants age group in years",
BMIGroup="Participants body mass index categories",
STI="Sexual Transmitted Infection")Exploratory Data Analysis
EDA for categorical variables
Exploring categorical variables using tables
# Explore Education level
freq(sti$Education,report.nas = FALSE)Frequencies
sti$Education
Label: Level of Education
Type: Factor
Freq % % Cum.
--------------- ------ -------- --------
None 1 0.44 0.44
Primary 26 11.50 11.95
Secondary 188 83.19 95.13
Tertiary 11 4.87 100.00
Total 226 100.00 100.00
freq(sti$STI,report.nas = FALSE)Frequencies
sti$STI
Label: Sexual Transmitted Infection
Type: Ordered Factor
Freq % % Cum.
----------- ------ -------- --------
No 113 50.00 50.00
Yes 113 50.00 100.00
Total 226 100.00 100.00
freq(sti$Marital,report.nas = FALSE)Frequencies
sti$Marital
Label: Marital Status
Type: Factor
Freq % % Cum.
----------------- ------ -------- --------
Co-habiting 9 3.98 3.98
Divorcee 16 7.08 11.06
Married 149 65.93 76.99
Single 43 19.03 96.02
Widowed 9 3.98 100.00
Total 226 100.00 100.00
freq(sti$Gender,report.nas = FALSE)Frequencies
sti$Gender
Type: Factor
Freq % % Cum.
------------ ------ -------- --------
Female 108 47.79 47.79
Male 118 52.21 100.00
Total 226 100.00 100.00
freq(sti$Agefirstsex,report.nas = FALSE)Frequencies
sti$Agefirstsex
Label: Age at first sex
Type: Factor
Freq % % Cum.
-------------- ------ -------- --------
<17years 41 18.64 18.64
17+years 179 81.36 100.00
Total 220 100.00 100.00
freq(sti$Agegroup,report.nas = FALSE)Frequencies
sti$Agegroup
Label: Participants age group in years
Type: Character
Freq % % Cum.
----------- ------ -------- --------
16-25 98 43.36 43.36
26-35 100 44.25 87.61
36-45 22 9.73 97.35
46-55 4 1.77 99.12
56+ 2 0.88 100.00
Total 226 100.00 100.00
freq(sti$BMIGroup,report.nas = FALSE)Frequencies
sti$BMIGroup
Label: Participants body mass index categories
Type: Factor
Freq % % Cum.
------------------ ------ -------- --------
Malnourished 89 39.38 39.38
Normal 103 45.58 84.96
Obesity 8 3.54 88.50
Over weight 26 11.50 100.00
Total 226 100.00 100.00
Exploring Categorical variables using bar graphs
# Age group
DF_1<-tibble(
Agegroup=c("16-25","26-35","36-45","46-55","56 +"),
Counts=c(98,100,22,4,2)
)
ggplot(data = DF_1,aes(Agegroup,Counts,fill = Agegroup))+
geom_bar(stat = "identity",width = 0.8,show.legend = FALSE)+
geom_text(aes(label = Counts),size=4,vjust=-0.1)+
theme_classic()+labs(title = "Participants Age group",
y="Frequency",
x="Age group in years")library(ggthemes)
DF_2<-tibble(
Education=c("A","B","C","D"),
Counts=c(188,26,11,1)
)
ggplot(data=DF_2,aes(Education,Counts,fill = "Education"))+geom_bar(stat = "identity",width=0.77,show.legend = FALSE)+
geom_text(aes(label=c("Secondary","Primary","Tertiary","None")),size = 4,hjust=-0.1,color="black")+coord_flip()+theme_classic()+
ylim(0,240)+labs(title = "Participants Education level",
y="Frequency",
x="Education Level")Counts<-table(sti$BMIGroup)
barplot(Counts,
main = "Participants Body Index",
xlab = "Body Mass Index",
legend=rownames(Counts),
col = c("red","blue","green","violet"))library(forcats)
sti%>%
mutate(Church=fct_infreq(Church))%>%
mutate(Church=fct_rev(Church))%>%
ggplot(aes(Church))+geom_bar(fill="blue")+coord_flip()+
theme_classic()+
labs(title = "Participant churches",
y="Frequency",
x="Churches")EDA for continuous variables
library(psych)
Attaching package: 'psych'
The following objects are masked from 'package:ggplot2':
%+%, alpha
Sumstatistics<-sti%>%
select(Age,Weight,Height,BMI)
# Summary Statistics
describe(Sumstatistics) vars n mean sd median trimmed mad min max range skew
Age 1 226 28.07 7.18 26.5 27.21 5.19 16.0 63.0 47.0 1.57
Weight 2 226 53.74 11.70 51.0 52.66 10.38 32.8 100.0 67.2 1.05
Height 3 226 160.96 8.00 160.5 160.81 8.15 138.0 191.0 53.0 0.31
BMI 4 226 20.74 4.21 20.2 20.36 4.15 13.4 36.3 22.9 0.90
kurtosis se
Age 4.19 0.48
Weight 1.45 0.78
Height 0.65 0.53
BMI 0.84 0.28
Testing Normality of Continuous variables using histogram and Shapiro wilk test
Histogram
ggplot(sti,aes(x=Age))+
geom_histogram(fill="blue",color="white")+
geom_vline(aes(xintercept=mean(Age)),color="red",
linewidth=1,linetype="dashed")+
theme_classic()+
labs(title = "Histogram showing Age distribution",
y= "Counts",
x=" Age in years")`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Non-graphical Method
# Shapiro wilk test
library(e1071)
shapiro.test(sti$Age)
Shapiro-Wilk normality test
data: sti$Age
W = 0.88919, p-value = 7.855e-12
Body Mass Index
# Body Mass Index
hist(sti$BMI,probability=T,col = "blue", main="BMI distribution",xlab="Participant BMI(Kg/M^2)")
lines(density(sti$BMI),col="black")Weight
# Weight
ggplot(sti,aes(x=Weight))+
geom_histogram(fill="blue",color="white")+
theme_classic()+
labs(title = "Distribution of participants weight",
y="Counts",
x= "Weight in Kgs")`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Height
# Height
Stat<-sti%>%
summarise(
mean_Height=mean(Height),
SD_Height=sd(Height),
Median_Produc=median(Height),
conf.25=quantile(Height,0.25),
conf.75=quantile(Height,0.75),
con.low=quantile(Height,0.025),
conf.high=quantile(Height,0.975)
)
###Plot summary stat
ggplot(sti,aes(x=Height))+
geom_histogram(fill="blue",color="white")+
geom_vline(data =Stat,
aes(xintercept=mean_Height),
linetype="dashed",color="black")+
geom_vline(data=Stat,
aes(xintercept=mean_Height+SD_Height),
linetype="dashed",color="red")+
geom_vline(data=Stat,
aes(xintercept=mean_Height-SD_Height),
linetype="dashed",color="red")+
geom_vline(data=Stat,
aes(xintercept=mean_Height+SD_Height*2),
linetype="dotted",color="red")+
geom_vline(data=Stat,
aes(xintercept=mean_Height-SD_Height*2),
linetype="dotted",color="red")+
theme_classic()+
labs(title = "Distribution of participants height",
y="Frequency",
x="Height in Cm")`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Dealing with Outliers in continuous variables using box plot
boxplot(sti$Age,col = "blue")boxplot(sti$Weight,col="red")boxplot(sti$Height,col = "green")boxplot(sti$BMI,col="violet")How to identify outliers
To identify outlier within the data set you need to calculate minimum value and maximum value by using this formula; Max value=Q3+1.5IQR while Min value=Q1-1.5IQR.
To identify outliers in Age Variables
#Age
summary(sti$Age) Min. 1st Qu. Median Mean 3rd Qu. Max.
16.00 23.00 26.50 28.07 32.00 63.00
IQR(sti$Age)VALUES:
9
minValue<-23- 1.5*9
minValue[1] 9.5
maxValue<-32+1.5*9
maxValue[1] 45.5
# Printing out outliers
sti%>%
select(IdNumber,Age)%>%
filter(Age>45.5) IdNumber Age
1 9 63
2 10 63
3 27 48
4 185 46
5 186 50
6 214 46
To indenfy outliers in Weight Variable
# Weight
summary(sti$Weight) Min. 1st Qu. Median Mean 3rd Qu. Max.
32.80 45.50 51.00 53.74 60.88 100.00
IQR(sti$Weight)VALUES:
15.375
minValue<-32.8- 1.5*15.375
minValue[1] 9.7375
maxValue<-60.88+1.5*15.375
maxValue[1] 83.9425
# Printing out outliers
sti%>%
select(IdNumber,Weight)%>%
filter(Weight>83.9425) IdNumber Weight
1 12 94.0
2 34 91.4
3 35 100.0
4 89 95.5
To identify outliers in Body Mass Index Variable
# BMI
summary(sti$BMI) Min. 1st Qu. Median Mean 3rd Qu. Max.
13.40 17.52 20.20 20.74 23.10 36.30
IQR(sti$BMI)VALUES:
5.575
minValue<-17.52- 1.5*5.575
minValue[1] 9.1575
maxValue<-23.1+1.5*5.575
maxValue[1] 31.4625
# Printing out outliers
sti%>%
select(IdNumber,BMI)%>%
filter(BMI>31.4625) IdNumber BMI
1 10 34.1
2 12 32.5
3 13 32.8
4 34 33.2
5 35 36.3
6 89 33.0
To identify outliers in Height Variable
# Height
summary(sti$Height) Min. 1st Qu. Median Mean 3rd Qu. Max.
138.0 156.0 160.5 161.0 166.0 191.0
IQR(sti$Height)VALUES:
10
minValue<-156- 1.5*10
minValue[1] 141
maxValue<-166+1.5*10
maxValue[1] 181
# Printing out outliers
sti%>%
select(IdNumber,Height)%>%
filter(Height<141|Height>181) IdNumber Height
1 2 138
2 32 182
3 98 191
4 116 182
5 215 182
Bivariate Analysis
In bivariate analysis, we treated STI as the outcome variable. Where participants infected with STI was recorded 1 and 0 otherwise. The rest of the variables were treated as exposure variables.
Chi-Square statistics was used to test the relationship between the outcome and exposure variables.
Age group Vs STI
# Age group
ctable(
x=sti$Agegroup,
y=sti$STI,
chisq = TRUE,
headings=FALSE
)Warning in chisq.test(freq_table_min): Chi-squared approximation may be
incorrect
---------- ----- ------------- ------------- --------------
STI No Yes Total
Agegroup
16-25 50 (51.0%) 48 (49.0%) 98 (100.0%)
26-35 47 (47.0%) 53 (53.0%) 100 (100.0%)
36-45 13 (59.1%) 9 (40.9%) 22 (100.0%)
46-55 2 (50.0%) 2 (50.0%) 4 (100.0%)
56+ 1 (50.0%) 1 (50.0%) 2 (100.0%)
Total 113 (50.0%) 113 (50.0%) 226 (100.0%)
---------- ----- ------------- ------------- --------------
----------------------------
Chi.squared df p.value
------------- ---- ---------
1.1281 4 0.8898
----------------------------
Marital Status Vs STI
# Marital Status
ctable(
x=sti$Marital,
y=sti$STI,
chisq = TRUE,
headings=FALSE
)Warning in chisq.test(freq_table_min): Chi-squared approximation may be
incorrect
------------- ----- ------------- ------------- --------------
STI No Yes Total
Marital
Co-habiting 1 (11.1%) 8 (88.9%) 9 (100.0%)
Divorcee 12 (75.0%) 4 (25.0%) 16 (100.0%)
Married 75 (50.3%) 74 (49.7%) 149 (100.0%)
Single 18 (41.9%) 25 (58.1%) 43 (100.0%)
Widowed 7 (77.8%) 2 (22.2%) 9 (100.0%)
Total 113 (50.0%) 113 (50.0%) 226 (100.0%)
------------- ----- ------------- ------------- --------------
----------------------------
Chi.squared df p.value
------------- ---- ---------
13.3685 4 0.0096
----------------------------
BMI Categories Vs STI
# BMI Group
ctable(
x=sti$BMIGroup,
y=sti$STI,
chisq = TRUE,
headings=FALSE
)Warning in chisq.test(freq_table_min): Chi-squared approximation may be
incorrect
-------------- ----- ------------- ------------- --------------
STI No Yes Total
BMIGroup
Malnourished 46 (51.7%) 43 (48.3%) 89 (100.0%)
Normal 52 (50.5%) 51 (49.5%) 103 (100.0%)
Obesity 4 (50.0%) 4 (50.0%) 8 (100.0%)
Over weight 11 (42.3%) 15 (57.7%) 26 (100.0%)
Total 113 (50.0%) 113 (50.0%) 226 (100.0%)
-------------- ----- ------------- ------------- --------------
----------------------------
Chi.squared df p.value
------------- ---- ---------
0.7262 3 0.867
----------------------------
Data Visualization
DV<-sti%>%
dplyr::group_by(STI,BMIGroup)%>%
dplyr::tally()%>%
dplyr::mutate(percentage=n/sum(n))ggplot(data=DV,aes(x=BMIGroup,y=n,fill=STI))+
geom_bar(stat = "identity",width = 0.7)+
geom_text(aes(label = paste0(sprintf("%1.1f",percentage*100),"%")),
position = position_stack(vjust = 0.5),colour="white")+
theme_classic()+
labs(title = "BMI Cataegories among STI cases",
y= "Frequency",
x="Body Mass Index Categories")DV_1<-sti%>%
dplyr::group_by(STI,Gender)%>%
dplyr::tally()%>%
dplyr::mutate(percentage=n/sum(n))ggplot(data=DV_1,aes(x=Gender,y=n,fill=STI))+
geom_bar(stat = "identity",width = 0.7)+
geom_text(aes(label = paste0(sprintf("%1.1f",percentage*100),"%")),
position = position_stack(vjust = 0.5),colour="white")+
theme_classic()+
labs(title = "STI cases among Gender",
y= "Frequency",
x="Gender")Data visualization for Continuous
Correlation HeatMap
library(corrplot)corrplot 0.95 loaded
corr <- sti %>%
select(Age, BMI, Weight, Height) %>%
replace(is.na(.), 0)
correlation = cor(corr)
corrplot(correlation, type="upper", method="color", addCoef.col = "black")Data visualization for categorical vs continuous
ggplot(sti,aes(Church,Height,fill =Church))+geom_boxplot(show.legend = FALSE)+
theme_classic()+
labs(title = "Height distribution ")Summary Statistics for categorical variables
table1(~Gender+Education + Marital+Agegroup+BMIGroup|STI,data=sti,overall = "Total")| No (N=113) |
Yes (N=113) |
Total (N=226) |
|
|---|---|---|---|
| Gender | |||
| Female | 55 (48.7%) | 53 (46.9%) | 108 (47.8%) |
| Male | 58 (51.3%) | 60 (53.1%) | 118 (52.2%) |
| Level of Education | |||
| None | 0 (0%) | 1 (0.9%) | 1 (0.4%) |
| Primary | 12 (10.6%) | 14 (12.4%) | 26 (11.5%) |
| Secondary | 97 (85.8%) | 91 (80.5%) | 188 (83.2%) |
| Tertiary | 4 (3.5%) | 7 (6.2%) | 11 (4.9%) |
| Marital Status | |||
| Co-habiting | 1 (0.9%) | 8 (7.1%) | 9 (4.0%) |
| Divorcee | 12 (10.6%) | 4 (3.5%) | 16 (7.1%) |
| Married | 75 (66.4%) | 74 (65.5%) | 149 (65.9%) |
| Single | 18 (15.9%) | 25 (22.1%) | 43 (19.0%) |
| Widowed | 7 (6.2%) | 2 (1.8%) | 9 (4.0%) |
| Participants age group in years | |||
| 16-25 | 50 (44.2%) | 48 (42.5%) | 98 (43.4%) |
| 26-35 | 47 (41.6%) | 53 (46.9%) | 100 (44.2%) |
| 36-45 | 13 (11.5%) | 9 (8.0%) | 22 (9.7%) |
| 46-55 | 2 (1.8%) | 2 (1.8%) | 4 (1.8%) |
| 56+ | 1 (0.9%) | 1 (0.9%) | 2 (0.9%) |
| Participants body mass index categories | |||
| Malnourished | 46 (40.7%) | 43 (38.1%) | 89 (39.4%) |
| Normal | 52 (46.0%) | 51 (45.1%) | 103 (45.6%) |
| Obesity | 4 (3.5%) | 4 (3.5%) | 8 (3.5%) |
| Over weight | 11 (9.7%) | 15 (13.3%) | 26 (11.5%) |
Summary statistics for continuous variables
table1(~Age+Height+BMI+Weight|STI,data=sti,overall = "Total")| No (N=113) |
Yes (N=113) |
Total (N=226) |
|
|---|---|---|---|
| Age in Years | |||
| Mean (SD) | 28.3 (7.24) | 27.9 (7.15) | 28.1 (7.18) |
| Median [Min, Max] | 27.0 [16.0, 63.0] | 26.0 [17.0, 63.0] | 26.5 [16.0, 63.0] |
| Height in Cm | |||
| Mean (SD) | 161 (8.45) | 161 (7.57) | 161 (8.00) |
| Median [Min, Max] | 161 [138, 191] | 160 [145, 182] | 161 [138, 191] |
| Body Mass Index in Kg/M2 | |||
| Mean (SD) | 20.4 (4.09) | 21.1 (4.31) | 20.7 (4.21) |
| Median [Min, Max] | 19.7 [13.4, 34.1] | 20.4 [13.8, 36.3] | 20.2 [13.4, 36.3] |
| Weight in Kgs | |||
| Mean (SD) | 52.8 (11.6) | 54.7 (11.8) | 53.7 (11.7) |
| Median [Min, Max] | 50.2 [33.0, 94.0] | 51.7 [32.8, 100] | 51.0 [32.8, 100] |
Part two: Hypothesis testing using T.test
T test is one of a parametric test used in hypothesis testing if and only if the distribution of data is symmetric but if the data the data is either skewed to left or right wilcoxon sign rank test is the suitable. For this this we to hypothesize mean distribution of participants.
In hypothesis testing, the following assumption must be achieved:
1.Large representative of sample size
2.Normality assumption
HT<-sti%>%
select(Height,Gender,STI)Test normality assumption using Shapiro wilk test
Height variable is normally distributed since P-value is greater than 0.05
shapiro.test(HT$Height)
Shapiro-Wilk normality test
data: HT$Height
W = 0.98898, p-value = 0.0811
Filter outliers because outliers interfere with the distribution of mean as a measure of central tendency.
HT_1<-HT%>%
select(Gender,STI,Height)%>%
filter(Height>=141 & Height<=181)Again, check the height distribution using histogram and boxplot
hist(HT_1$Height,col="violet")boxplot(HT_1$Height,col = "red")HT_1%>%count(Gender) Gender n
1 Female 104
2 Male 117
mean(HT_1$Height)[1] 160.6425
One sided T test
Ho: Mean height of female participants>=160.6425
HT_1%>%
filter(Gender=="Female")%>%
select(Height)%>%
t.test(mu=160.6425)
One Sample t-test
data: .
t = 0.0021974, df = 103, p-value = 0.9983
alternative hypothesis: true mean is not equal to 160.6425
95 percent confidence interval:
159.0821 162.2064
sample estimates:
mean of x
160.6442
Ho: Mean height of male participants<=160.6425
HT_1%>%
filter(Gender=="Male")%>%
select(Height)%>%
t.test(mu=160.6425)
One Sample t-test
data: .
t = -0.0024323, df = 116, p-value = 0.9981
alternative hypothesis: true mean is not equal to 160.6425
95 percent confidence interval:
159.4404 161.8416
sample estimates:
mean of x
160.641
Two sided t test
Ho: Mean Height of participants infected with STI equal to mean height of participant not infected
HT_1%>%
t.test(Height~STI,data=.,
alternative = "two.sided")
Welch Two Sample t-test
data: Height by STI
t = -0.27768, df = 218.96, p-value = 0.7815
alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
95 percent confidence interval:
-2.204145 1.659748
sample estimates:
mean in group No mean in group Yes
160.5046 160.7768
Linear Regression Analysis
Simple linear Regression
Model1<-lm(BMI~Weight,data = sti)
summary(Model1)
Call:
lm(formula = BMI ~ Weight, data = sti)
Residuals:
LABEL: Body Mass Index in Kg/M2
VALUES:
-6.3411, -1.136, -0.0497, 1.1774, 6.2973
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.5491 0.6050 5.866 1.59e-08 ***
Weight 0.3199 0.0110 29.075 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.93 on 224 degrees of freedom
Multiple R-squared: 0.7905, Adjusted R-squared: 0.7896
F-statistic: 845.4 on 1 and 224 DF, p-value: < 2.2e-16
Multiple linear regression model
Model2<-lm(BMI~Weight+Height,data = sti)
summary(Model2)
Call:
lm(formula = BMI ~ Weight + Height, data = sti)
Residuals:
LABEL: Body Mass Index in Kg/M2
VALUES:
-1.29656, -0.16805, 0.01601, 0.14119, 1.60557
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 40.934623 0.473437 86.46 <2e-16 ***
Weight 0.379233 0.002125 178.49 <2e-16 ***
Height -0.252087 0.003105 -81.18 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.35 on 223 degrees of freedom
Multiple R-squared: 0.9931, Adjusted R-squared: 0.9931
F-statistic: 1.615e+04 on 2 and 223 DF, p-value: < 2.2e-16
Logistic regression Analysis
Simple logistic regression model
Model4<-glm(STI~Marital,data = sti,family = "binomial")
summary(Model4)
Call:
glm(formula = STI ~ Marital, family = "binomial", data = sti)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.079 1.061 1.961 0.0499 *
MaritalDivorcee -3.178 1.208 -2.632 0.0085 **
MaritalMarried -2.093 1.073 -1.950 0.0512 .
MaritalSingle -1.751 1.105 -1.585 0.1130
MaritalWidowed -3.332 1.330 -2.506 0.0122 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 313.30 on 225 degrees of freedom
Residual deviance: 298.83 on 221 degrees of freedom
AIC: 308.83
Number of Fisher Scoring iterations: 4
# Odd Ratio
exp(coef(Model4)) (Intercept) MaritalDivorcee MaritalMarried MaritalSingle MaritalWidowed
7.99999999 0.04166667 0.12333333 0.17361111 0.03571429
Multiple logistic regression
Model5<-glm(STI~Marital+Education+Gender,data = sti,family = "binomial")
summary(Model5)
Call:
glm(formula = STI ~ Marital + Education + Gender, family = "binomial",
data = sti)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 14.56607 882.74337 0.017 0.9868
MaritalDivorcee -3.05274 1.21771 -2.507 0.0122 *
MaritalMarried -1.91514 1.08594 -1.764 0.0778 .
MaritalSingle -1.59963 1.13760 -1.406 0.1597
MaritalWidowed -3.20350 1.33830 -2.394 0.0167 *
EducationPrimary -12.33707 882.74410 -0.014 0.9888
EducationSecondary -12.68971 882.74403 -0.014 0.9885
EducationTertiary -12.28755 882.74428 -0.014 0.9889
GenderMale -0.04266 0.29341 -0.145 0.8844
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 313.30 on 225 degrees of freedom
Residual deviance: 297.61 on 217 degrees of freedom
AIC: 315.61
Number of Fisher Scoring iterations: 13
# odd ratio
exp(coef(Model5)) (Intercept) MaritalDivorcee MaritalMarried MaritalSingle
2.118180e+06 4.722955e-02 1.473213e-01 2.019713e-01
MaritalWidowed EducationPrimary EducationSecondary EducationTertiary
4.061959e-02 4.386082e-06 3.082676e-06 4.608747e-06
GenderMale
9.582389e-01
Survival Analysis
Terminologies
Censoring means Incomplete data. This might occurs if patient is lost to follow up or withdrew from study.
Kaplan- Meier estimator is a non-parametric statistics that estimate the survival function (Kaplan and Paul Meier, 1958).The statistics gives the probability that an individual patient can survive at a particular time t.
Log-rank test Use to compare two Kaplan-Meire curves
Recall that: The Kaplan-Meire estimator of surviving at t=0 is 1 and when t goes to infinity the survival Kaplan-Meire goes to 0.
#Clear R environment
rm(list = ls())
# Working directory
setwd("C:/Users/USER/Desktop/JOOUST")
# Load data set
library(readr)
ovarian <- read_csv("ovarian.csv")Rows: 26 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
dbl (6): futime, fustat, age, resid.ds, rx, ecog.ps
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Load the required packages for this survival analysis
library(survival)
Attaching package: 'survival'
The following object is masked _by_ '.GlobalEnv':
ovarian
library(survminer)Loading required package: ggpubr
Attaching package: 'ggpubr'
The following object is masked from 'package:expss':
compare_means
Attaching package: 'survminer'
The following object is masked from 'package:survival':
myeloma
library(dplyr)Data management
# Dichotomize age and change data labels
ovarian$rx <- factor(ovarian$rx,
levels = c("1", "2"),
labels = c("A", "B"))
ovarian$resid.ds <- factor(ovarian$resid.ds,
levels = c("1", "2"),
labels = c("no", "yes"))
ovarian$ecog.ps <- factor(ovarian$ecog.ps,
levels = c("1", "2"),
labels = c("good", "bad"))Explore the covariate
hist(ovarian$age,col = "blue",
main = "Simple histogram shows Age distribution")Age of patients seems to be bi modal
ovarian <- ovarian %>% mutate(age_group = ifelse(age >=50, "old", "young"))
ovarian$age_group <- factor(ovarian$age_group)# Fit survival data using the Kaplan-Meier method
surv_object <- Surv(time = ovarian$futime, event = ovarian$fustat)
surv_object [1] 59 115 156 421+ 431 448+ 464 475 477+ 563 638 744+
[13] 769+ 770+ 803+ 855+ 1040+ 1106+ 1129+ 1206+ 1227+ 268 329 353
[25] 365 377+
fit1 <- survfit(surv_object ~ rx, data = ovarian)
summary(fit1)Call: survfit(formula = surv_object ~ rx, data = ovarian)
rx=A
time n.risk n.event survival std.err lower 95% CI upper 95% CI
59 13 1 0.923 0.0739 0.789 1.000
115 12 1 0.846 0.1001 0.671 1.000
156 11 1 0.769 0.1169 0.571 1.000
268 10 1 0.692 0.1280 0.482 0.995
329 9 1 0.615 0.1349 0.400 0.946
431 8 1 0.538 0.1383 0.326 0.891
638 5 1 0.431 0.1467 0.221 0.840
rx=B
time n.risk n.event survival std.err lower 95% CI upper 95% CI
353 13 1 0.923 0.0739 0.789 1.000
365 12 1 0.846 0.1001 0.671 1.000
464 9 1 0.752 0.1256 0.542 1.000
475 8 1 0.658 0.1407 0.433 1.000
563 7 1 0.564 0.1488 0.336 0.946
ggsurvplot(fit1, data = ovarian, pval = TRUE)# Examine prdictive value of residual disease status
fit2 <- survfit(surv_object ~ resid.ds, data = ovarian)
ggsurvplot(fit2, data = ovarian, pval = TRUE)Cox proportional regression model
Cox describe the probability of an hazard if the patient survived at a particular time t. In simple terms it estimate instantaneous risk of death. Also you need hazard function to consider covariates when comparing survival of patient group.
# Fit a Cox proportional hazards model
fit.coxph <- coxph(surv_object ~ rx + resid.ds + age_group + ecog.ps,
data = ovarian)
summary(fit.coxph)Call:
coxph(formula = surv_object ~ rx + resid.ds + age_group + ecog.ps,
data = ovarian)
n= 26, number of events= 12
coef exp(coef) se(coef) z Pr(>|z|)
rxB -1.3814 0.2512 0.6448 -2.142 0.0322 *
resid.dsyes 1.4470 4.2503 0.7292 1.984 0.0472 *
age_groupyoung -2.2013 0.1107 1.1069 -1.989 0.0467 *
ecog.psbad 0.5859 1.7966 0.6329 0.926 0.3546
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
exp(coef) exp(-coef) lower .95 upper .95
rxB 0.2512 3.9803 0.07100 0.8891
resid.dsyes 4.2503 0.2353 1.01801 17.7453
age_groupyoung 0.1107 9.0368 0.01264 0.9686
ecog.psbad 1.7966 0.5566 0.51966 6.2113
Concordance= 0.782 (se = 0.065 )
Likelihood ratio test= 12.19 on 4 df, p=0.02
Wald test = 9.02 on 4 df, p=0.06
Score (logrank) test = 11.97 on 4 df, p=0.02