DATA SCIENCE SKILLS

Author

OJALA BRIAN OLOO

#Clear R environment
rm(list = ls())
# Working directory
setwd("C:/Users/USER/Desktop/JOOUST")
# Load data set in Rstudio
library(readxl)
data_1 <- read_excel("data_1.xlsx")
View(data_1)

Data wrangling

str(data_1)
tibble [227 × 6] (S3: tbl_df/tbl/data.frame)
 $ IdNumber   : num [1:227] 1 2 3 4 5 6 7 8 9 10 ...
 $ Education  : chr [1:227] "Secondary" "Secondary" "Secondary" "Secondary" ...
 $ Marital    : chr [1:227] "Married" "Married" "Married" "Single" ...
 $ Height     : num [1:227] 168 138 157 151 158 158 164 171 163 156 ...
 $ Gender     : chr [1:227] "Female" "Female" "Male" "Male" ...
 $ Agefirstsex: chr [1:227] "17+years" "17+years" "17+years" "<17years" ...

Load packages for data processing, visualization and Analysis

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.0     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(table1)

Attaching package: 'table1'

The following objects are masked from 'package:base':

    units, units<-
library(summarytools)

Attaching package: 'summarytools'

The following objects are masked from 'package:table1':

    label, label<-

The following object is masked from 'package:tibble':

    view

Convert Character to factor variables

data_1$Education<-as.factor(data_1$Education)
data_1$Marital<-as.factor(data_1$Marital)
data_1$Gender<-as.factor(data_1$Gender)
data_1$Agefirstsex<-as.factor(data_1$Agefirstsex)

Loading the second data set

library(readxl)
data_2 <- read_excel("data_2.xlsx")

Explore data_2

str(data_2)
tibble [227 × 7] (S3: tbl_df/tbl/data.frame)
 $ IdNumber  : num [1:227] 1 2 3 4 5 6 7 8 9 10 ...
 $ Date      : POSIXct[1:227], format: "2009-12-05" "2009-12-05" ...
 $ casestatus: num [1:227] 1 0 1 0 1 0 1 0 1 0 ...
 $ Age       : num [1:227] 30 30 23 24 27 25 33 30 63 63 ...
 $ Occupation: num [1:227] 4 2 1 1 4 1 1 2 4 3 ...
 $ Church    : num [1:227] 7 7 4 3 4 2 3 3 3 7 ...
 $ Weight    : num [1:227] 69 48 55 39 44.5 75 74 74.7 66 83 ...

Data processing

Value label for categorical variables

data_2$casestatus<-ordered(data_2$casestatus,levels=c(0,1),labels=c("No","Yes"))
data_2$Church<-ordered(data_2$Church,levels=c(1,2,3,4,5,6,7),labels=c("Anglican","Apostolic","Atheist","Methodist","Pentecostal","Catholic","Other"))

Convert character to factor variables

data_2$Date<-as.Date(data_2$Date)
data_2$casestatus<-as.factor(data_2$casestatus)
data_2$Church<-as.factor(data_2$Church)

Delete a column

Dropvar<-names(data_2)%in% c("Occupation")
data_2<-data_2[!Dropvar]

Merging data set

sti<-merge(data_2,data_1,by="IdNumber")

Dealing with duplicates

# Identify duplicates
sum(duplicated(sti$IdNumber))
[1] 3
# Remove duplicates
sti<-sti%>%distinct(IdNumber,.keep_all = TRUE)

Missing data

Explore each and every variable to identify if there is any missing values

# Exploring categorical variables
sti%>%count(casestatus,sort = TRUE)
  casestatus   n
1        Yes 113
2         No 112
3       <NA>   1
sti%>%count(Church,sort = TRUE)
       Church  n
1   Apostolic 52
2     Atheist 45
3    Catholic 41
4 Pentecostal 36
5       Other 23
6    Anglican 16
7   Methodist 13
sti%>%count(Education,sort = TRUE)
  Education   n
1 Secondary 188
2   Primary  26
3  Tertiary  11
4      None   1
sti%>%count(Marital,sort = TRUE)
      Marital   n
1     Married 149
2      Single  43
3    Divorcee  16
4 Co-habiting   9
5     Widowed   9
sti%>%count(Agefirstsex,sort = TRUE)
  Agefirstsex   n
1    17+years 179
2    <17years  41
3        <NA>   6
sti%>%count(Gender,sort = TRUE)
  Gender   n
1   Male 118
2 Female 108
#Exploring continuous variables
summary(sti$Age)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  16.00   23.00   26.00   27.98   32.00   63.00       1 
summary(sti$Weight)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  32.80   45.50   51.00   53.74   60.88  100.00 
summary(sti$Height)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  138.0   156.0   160.5   161.0   166.0   191.0 

Replacing missing data in categorical variables

sti<-sti%>%
  mutate(casestatus=replace_na(casestatus,"No"))
table(sti$casestatus)

 No Yes 
113 113 

Dealing with missing data in continuous variables

sti<-sti%>%
  mutate(Age=replace_na(Age,48))
summary(sti$Age)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  16.00   23.00   26.50   28.07   32.00   63.00 

Data Manipulation

sti<-sti%>%
  rename(STI=casestatus)%>%
  mutate(BMI=Weight/(Height/100)^2)

Rounding BMI to 1decimal places

sti$BMI<-round(as.numeric(sti$BMI),1)

Convert continuous variables to categorical

Participants Age group

# Age group
range(sti$Age)
[1] 16 63
sti<-within(sti,{
  Agegroup<-NA
  Agegroup[Age>=16 & Age<=25]<-"16-25"
  Agegroup[Age>=26 & Age<=35]<-"26-35"
  Agegroup[Age>=36 & Age<=45]<-"36-45"
  Agegroup[Age>=46 & Age<=55]<-"46-55"
  Agegroup[Age>=56]<-"56+"
})

Participants BMI categories

# BMI categories
range(sti$BMI)
[1] 13.4 36.3
sti<-within(sti,{
  BMIGroup<-NA
  BMIGroup[BMI<=18.4]<-"Malnourished"
  BMIGroup[BMI>=18.5 & BMI<=24.9]<-"Normal"
  BMIGroup[BMI>=25 & BMI<=29.9]<-"Over weight"
  BMIGroup[BMI>=30]<-"Obesity"
})
sti$BMIGroup<-as.factor(sti$BMIGroup)

Variable labels

library(expss)
Loading required package: maditr

To get total summary skip 'by' argument: take_all(mtcars, mean)

Attaching package: 'maditr'
The following objects are masked from 'package:dplyr':

    between, coalesce, first, last
The following object is masked from 'package:purrr':

    transpose
The following object is masked from 'package:readr':

    cols

Use 'expss_output_rnotebook()' to display tables inside R Notebooks.
 To return to the console output, use 'expss_output_default()'.

Attaching package: 'expss'
The following objects are masked from 'package:stringr':

    fixed, regex
The following objects are masked from 'package:dplyr':

    compute, contains, na_if, recode, vars, where
The following objects are masked from 'package:purrr':

    keep, modify, modify_if, when
The following objects are masked from 'package:tidyr':

    contains, nest
The following object is masked from 'package:ggplot2':

    vars
sti<-apply_labels(sti,
                  Age="Age in Years",
                  Education="Level of Education",
                  Weight=" Weight in Kgs",
                  Marital="Marital Status",
                  Height="Height in Cm",
                  Agefirstsex="Age at first sex",
                  BMI="Body Mass Index in Kg/M2",
                  Agegroup="Participants age group in  years",
                  BMIGroup="Participants body mass index categories",
                  STI="Sexual Transmitted Infection")

Exploratory Data Analysis

EDA for categorical variables

Exploring categorical variables using tables

# Explore Education level
freq(sti$Education,report.nas = FALSE)
Frequencies  
sti$Education  
Label: Level of Education  
Type: Factor  

                  Freq        %   % Cum.
--------------- ------ -------- --------
           None      1     0.44     0.44
        Primary     26    11.50    11.95
      Secondary    188    83.19    95.13
       Tertiary     11     4.87   100.00
          Total    226   100.00   100.00
freq(sti$STI,report.nas = FALSE)
Frequencies  
sti$STI  
Label: Sexual Transmitted Infection  
Type: Ordered Factor  

              Freq        %   % Cum.
----------- ------ -------- --------
         No    113    50.00    50.00
        Yes    113    50.00   100.00
      Total    226   100.00   100.00
freq(sti$Marital,report.nas = FALSE)
Frequencies  
sti$Marital  
Label: Marital Status  
Type: Factor  

                    Freq        %   % Cum.
----------------- ------ -------- --------
      Co-habiting      9     3.98     3.98
         Divorcee     16     7.08    11.06
          Married    149    65.93    76.99
           Single     43    19.03    96.02
          Widowed      9     3.98   100.00
            Total    226   100.00   100.00
freq(sti$Gender,report.nas = FALSE)
Frequencies  
sti$Gender  
Type: Factor  

               Freq        %   % Cum.
------------ ------ -------- --------
      Female    108    47.79    47.79
        Male    118    52.21   100.00
       Total    226   100.00   100.00
freq(sti$Agefirstsex,report.nas = FALSE)
Frequencies  
sti$Agefirstsex  
Label: Age at first sex  
Type: Factor  

                 Freq        %   % Cum.
-------------- ------ -------- --------
      <17years     41    18.64    18.64
      17+years    179    81.36   100.00
         Total    220   100.00   100.00
freq(sti$Agegroup,report.nas = FALSE)
Frequencies  
sti$Agegroup  
Label: Participants age group in  years  
Type: Character  

              Freq        %   % Cum.
----------- ------ -------- --------
      16-25     98    43.36    43.36
      26-35    100    44.25    87.61
      36-45     22     9.73    97.35
      46-55      4     1.77    99.12
        56+      2     0.88   100.00
      Total    226   100.00   100.00
freq(sti$BMIGroup,report.nas = FALSE)
Frequencies  
sti$BMIGroup  
Label: Participants body mass index categories  
Type: Factor  

                     Freq        %   % Cum.
------------------ ------ -------- --------
      Malnourished     89    39.38    39.38
            Normal    103    45.58    84.96
           Obesity      8     3.54    88.50
       Over weight     26    11.50   100.00
             Total    226   100.00   100.00

Exploring Categorical variables using bar graphs

# Age group
DF_1<-tibble(
   Agegroup=c("16-25","26-35","36-45","46-55","56 +"),
   Counts=c(98,100,22,4,2)
 )
ggplot(data = DF_1,aes(Agegroup,Counts,fill = Agegroup))+
   geom_bar(stat = "identity",width = 0.8,show.legend = FALSE)+
   geom_text(aes(label = Counts),size=4,vjust=-0.1)+
   theme_classic()+labs(title = "Participants Age group",
                        y="Frequency",
                        x="Age group  in years")

library(ggthemes)
DF_2<-tibble(
   Education=c("A","B","C","D"),
   Counts=c(188,26,11,1)
 )
ggplot(data=DF_2,aes(Education,Counts,fill = "Education"))+geom_bar(stat = "identity",width=0.77,show.legend = FALSE)+
  geom_text(aes(label=c("Secondary","Primary","Tertiary","None")),size = 4,hjust=-0.1,color="black")+coord_flip()+theme_classic()+
  ylim(0,240)+labs(title = "Participants Education level",
                   y="Frequency",
                   x="Education Level")

Counts<-table(sti$BMIGroup)
barplot(Counts,
        main = "Participants Body Index",
        xlab = "Body Mass Index",
        legend=rownames(Counts),
        col = c("red","blue","green","violet"))

library(forcats)
sti%>%
  mutate(Church=fct_infreq(Church))%>%
  mutate(Church=fct_rev(Church))%>%
  ggplot(aes(Church))+geom_bar(fill="blue")+coord_flip()+
  theme_classic()+
  labs(title = "Participant churches",
       y="Frequency",
       x="Churches")

EDA for continuous variables

library(psych)

Attaching package: 'psych'
The following objects are masked from 'package:ggplot2':

    %+%, alpha
Sumstatistics<-sti%>%
  select(Age,Weight,Height,BMI)
# Summary Statistics
describe(Sumstatistics)
       vars   n   mean    sd median trimmed   mad   min   max range skew
Age       1 226  28.07  7.18   26.5   27.21  5.19  16.0  63.0  47.0 1.57
Weight    2 226  53.74 11.70   51.0   52.66 10.38  32.8 100.0  67.2 1.05
Height    3 226 160.96  8.00  160.5  160.81  8.15 138.0 191.0  53.0 0.31
BMI       4 226  20.74  4.21   20.2   20.36  4.15  13.4  36.3  22.9 0.90
       kurtosis   se
Age        4.19 0.48
Weight     1.45 0.78
Height     0.65 0.53
BMI        0.84 0.28

Testing Normality of Continuous variables using histogram and Shapiro wilk test

Histogram

ggplot(sti,aes(x=Age))+
  geom_histogram(fill="blue",color="white")+
  geom_vline(aes(xintercept=mean(Age)),color="red",
             linewidth=1,linetype="dashed")+
  theme_classic()+
  labs(title = "Histogram showing Age distribution",
       y= "Counts",
       x=" Age in years")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Non-graphical Method

# Shapiro wilk test
library(e1071)
shapiro.test(sti$Age)

    Shapiro-Wilk normality test

data:  sti$Age
W = 0.88919, p-value = 7.855e-12

Body Mass Index

# Body Mass Index
hist(sti$BMI,probability=T,col = "blue", main="BMI distribution",xlab="Participant BMI(Kg/M^2)")
lines(density(sti$BMI),col="black")

Weight

# Weight
ggplot(sti,aes(x=Weight))+
  geom_histogram(fill="blue",color="white")+
  theme_classic()+
  labs(title = "Distribution of participants weight",
       y="Counts",
       x= "Weight in Kgs")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Height

# Height
Stat<-sti%>%
  summarise(
    mean_Height=mean(Height),
    SD_Height=sd(Height),
    Median_Produc=median(Height),
    conf.25=quantile(Height,0.25),
    conf.75=quantile(Height,0.75),
    con.low=quantile(Height,0.025),
    conf.high=quantile(Height,0.975)
  )
###Plot summary stat
ggplot(sti,aes(x=Height))+
  geom_histogram(fill="blue",color="white")+
  geom_vline(data =Stat,
             aes(xintercept=mean_Height),
             linetype="dashed",color="black")+
  geom_vline(data=Stat,
             aes(xintercept=mean_Height+SD_Height),
             linetype="dashed",color="red")+
  geom_vline(data=Stat,
             aes(xintercept=mean_Height-SD_Height),
             linetype="dashed",color="red")+
  geom_vline(data=Stat,
             aes(xintercept=mean_Height+SD_Height*2),
             linetype="dotted",color="red")+
  geom_vline(data=Stat,
             aes(xintercept=mean_Height-SD_Height*2),
             linetype="dotted",color="red")+
  theme_classic()+
  labs(title = "Distribution of participants height",
       y="Frequency",
       x="Height in Cm")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Dealing with Outliers in continuous variables using box plot

boxplot(sti$Age,col = "blue")

boxplot(sti$Weight,col="red")

boxplot(sti$Height,col = "green")

boxplot(sti$BMI,col="violet")

How to identify outliers

To identify outlier within the data set you need to calculate minimum value and maximum value by using this formula; Max value=Q3+1.5IQR while Min value=Q1-1.5IQR.

To identify outliers in Age Variables

 #Age
summary(sti$Age)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  16.00   23.00   26.50   28.07   32.00   63.00 
IQR(sti$Age)
VALUES:
9
minValue<-23- 1.5*9
minValue
[1] 9.5
maxValue<-32+1.5*9
maxValue
[1] 45.5
# Printing out outliers
sti%>%
  select(IdNumber,Age)%>%
  filter(Age>45.5)
  IdNumber Age
1        9  63
2       10  63
3       27  48
4      185  46
5      186  50
6      214  46

To indenfy outliers in Weight Variable

# Weight
summary(sti$Weight)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  32.80   45.50   51.00   53.74   60.88  100.00 
IQR(sti$Weight)
VALUES:
15.375
minValue<-32.8- 1.5*15.375
minValue
[1] 9.7375
maxValue<-60.88+1.5*15.375
maxValue
[1] 83.9425
# Printing out outliers
sti%>%
  select(IdNumber,Weight)%>%
  filter(Weight>83.9425)
  IdNumber Weight
1       12   94.0
2       34   91.4
3       35  100.0
4       89   95.5

To identify outliers in Body Mass Index Variable

# BMI
summary(sti$BMI)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  13.40   17.52   20.20   20.74   23.10   36.30 
IQR(sti$BMI)
VALUES:
5.575
minValue<-17.52- 1.5*5.575
minValue
[1] 9.1575
maxValue<-23.1+1.5*5.575
maxValue
[1] 31.4625
# Printing out outliers
sti%>%
  select(IdNumber,BMI)%>%
  filter(BMI>31.4625)
  IdNumber  BMI
1       10 34.1
2       12 32.5
3       13 32.8
4       34 33.2
5       35 36.3
6       89 33.0

To identify outliers in Height Variable

# Height
summary(sti$Height)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  138.0   156.0   160.5   161.0   166.0   191.0 
IQR(sti$Height)
VALUES:
10
minValue<-156- 1.5*10
minValue
[1] 141
maxValue<-166+1.5*10
maxValue
[1] 181
# Printing out outliers
sti%>%
  select(IdNumber,Height)%>%
  filter(Height<141|Height>181)
  IdNumber Height
1        2    138
2       32    182
3       98    191
4      116    182
5      215    182

Bivariate Analysis

In bivariate analysis, we treated STI as the outcome variable. Where participants infected with STI was recorded 1 and 0 otherwise. The rest of the variables were treated as exposure variables.

Chi-Square statistics was used to test the relationship between the outcome and exposure variables.

Age group Vs STI

# Age group
ctable(
  x=sti$Agegroup,
  y=sti$STI,
  chisq = TRUE,
  headings=FALSE
)
Warning in chisq.test(freq_table_min): Chi-squared approximation may be
incorrect

---------- ----- ------------- ------------- --------------
             STI            No           Yes          Total
  Agegroup                                                 
     16-25          50 (51.0%)    48 (49.0%)    98 (100.0%)
     26-35          47 (47.0%)    53 (53.0%)   100 (100.0%)
     36-45          13 (59.1%)     9 (40.9%)    22 (100.0%)
     46-55           2 (50.0%)     2 (50.0%)     4 (100.0%)
       56+           1 (50.0%)     1 (50.0%)     2 (100.0%)
     Total         113 (50.0%)   113 (50.0%)   226 (100.0%)
---------- ----- ------------- ------------- --------------

----------------------------
 Chi.squared   df   p.value 
------------- ---- ---------
   1.1281      4    0.8898  
----------------------------

Marital Status Vs STI

# Marital Status
ctable(
  x=sti$Marital,
  y=sti$STI,
  chisq = TRUE,
  headings=FALSE
)
Warning in chisq.test(freq_table_min): Chi-squared approximation may be
incorrect

------------- ----- ------------- ------------- --------------
                STI            No           Yes          Total
      Marital                                                 
  Co-habiting           1 (11.1%)     8 (88.9%)     9 (100.0%)
     Divorcee          12 (75.0%)     4 (25.0%)    16 (100.0%)
      Married          75 (50.3%)    74 (49.7%)   149 (100.0%)
       Single          18 (41.9%)    25 (58.1%)    43 (100.0%)
      Widowed           7 (77.8%)     2 (22.2%)     9 (100.0%)
        Total         113 (50.0%)   113 (50.0%)   226 (100.0%)
------------- ----- ------------- ------------- --------------

----------------------------
 Chi.squared   df   p.value 
------------- ---- ---------
   13.3685     4    0.0096  
----------------------------

BMI Categories Vs STI

# BMI Group
ctable(
  x=sti$BMIGroup,
  y=sti$STI,
  chisq = TRUE,
  headings=FALSE
)
Warning in chisq.test(freq_table_min): Chi-squared approximation may be
incorrect

-------------- ----- ------------- ------------- --------------
                 STI            No           Yes          Total
      BMIGroup                                                 
  Malnourished          46 (51.7%)    43 (48.3%)    89 (100.0%)
        Normal          52 (50.5%)    51 (49.5%)   103 (100.0%)
       Obesity           4 (50.0%)     4 (50.0%)     8 (100.0%)
   Over weight          11 (42.3%)    15 (57.7%)    26 (100.0%)
         Total         113 (50.0%)   113 (50.0%)   226 (100.0%)
-------------- ----- ------------- ------------- --------------

----------------------------
 Chi.squared   df   p.value 
------------- ---- ---------
   0.7262      3     0.867  
----------------------------

Data Visualization

DV<-sti%>%
  dplyr::group_by(STI,BMIGroup)%>%
  dplyr::tally()%>%
  dplyr::mutate(percentage=n/sum(n))
ggplot(data=DV,aes(x=BMIGroup,y=n,fill=STI))+
geom_bar(stat = "identity",width = 0.7)+
geom_text(aes(label = paste0(sprintf("%1.1f",percentage*100),"%")),
                 position = position_stack(vjust = 0.5),colour="white")+
  theme_classic()+
  labs(title = "BMI Cataegories among STI cases",
       y= "Frequency",
       x="Body Mass Index Categories")

DV_1<-sti%>%
  dplyr::group_by(STI,Gender)%>%
  dplyr::tally()%>%
  dplyr::mutate(percentage=n/sum(n))
ggplot(data=DV_1,aes(x=Gender,y=n,fill=STI))+
geom_bar(stat = "identity",width = 0.7)+
geom_text(aes(label = paste0(sprintf("%1.1f",percentage*100),"%")),
                 position = position_stack(vjust = 0.5),colour="white")+
  theme_classic()+
  labs(title = "STI cases among Gender",
       y= "Frequency",
       x="Gender")


Data visualization for Continuous

Correlation HeatMap

library(corrplot)
corrplot 0.95 loaded
corr <- sti %>% 
  select(Age, BMI, Weight, Height) %>% 
  replace(is.na(.), 0)
correlation = cor(corr)
corrplot(correlation, type="upper", method="color", addCoef.col = "black")

Data visualization for categorical vs continuous

ggplot(sti,aes(Church,Height,fill =Church))+geom_boxplot(show.legend = FALSE)+
  theme_classic()+
  labs(title = "Height distribution ")

Summary Statistics for categorical variables

table1(~Gender+Education + Marital+Agegroup+BMIGroup|STI,data=sti,overall = "Total")
No
(N=113)
Yes
(N=113)
Total
(N=226)
Gender
Female 55 (48.7%) 53 (46.9%) 108 (47.8%)
Male 58 (51.3%) 60 (53.1%) 118 (52.2%)
Level of Education
None 0 (0%) 1 (0.9%) 1 (0.4%)
Primary 12 (10.6%) 14 (12.4%) 26 (11.5%)
Secondary 97 (85.8%) 91 (80.5%) 188 (83.2%)
Tertiary 4 (3.5%) 7 (6.2%) 11 (4.9%)
Marital Status
Co-habiting 1 (0.9%) 8 (7.1%) 9 (4.0%)
Divorcee 12 (10.6%) 4 (3.5%) 16 (7.1%)
Married 75 (66.4%) 74 (65.5%) 149 (65.9%)
Single 18 (15.9%) 25 (22.1%) 43 (19.0%)
Widowed 7 (6.2%) 2 (1.8%) 9 (4.0%)
Participants age group in years
16-25 50 (44.2%) 48 (42.5%) 98 (43.4%)
26-35 47 (41.6%) 53 (46.9%) 100 (44.2%)
36-45 13 (11.5%) 9 (8.0%) 22 (9.7%)
46-55 2 (1.8%) 2 (1.8%) 4 (1.8%)
56+ 1 (0.9%) 1 (0.9%) 2 (0.9%)
Participants body mass index categories
Malnourished 46 (40.7%) 43 (38.1%) 89 (39.4%)
Normal 52 (46.0%) 51 (45.1%) 103 (45.6%)
Obesity 4 (3.5%) 4 (3.5%) 8 (3.5%)
Over weight 11 (9.7%) 15 (13.3%) 26 (11.5%)

Summary statistics for continuous variables

table1(~Age+Height+BMI+Weight|STI,data=sti,overall = "Total")
No
(N=113)
Yes
(N=113)
Total
(N=226)
Age in Years
Mean (SD) 28.3 (7.24) 27.9 (7.15) 28.1 (7.18)
Median [Min, Max] 27.0 [16.0, 63.0] 26.0 [17.0, 63.0] 26.5 [16.0, 63.0]
Height in Cm
Mean (SD) 161 (8.45) 161 (7.57) 161 (8.00)
Median [Min, Max] 161 [138, 191] 160 [145, 182] 161 [138, 191]
Body Mass Index in Kg/M2
Mean (SD) 20.4 (4.09) 21.1 (4.31) 20.7 (4.21)
Median [Min, Max] 19.7 [13.4, 34.1] 20.4 [13.8, 36.3] 20.2 [13.4, 36.3]
Weight in Kgs
Mean (SD) 52.8 (11.6) 54.7 (11.8) 53.7 (11.7)
Median [Min, Max] 50.2 [33.0, 94.0] 51.7 [32.8, 100] 51.0 [32.8, 100]

Part two: Hypothesis testing using T.test

T test is one of a parametric test used in hypothesis testing if and only if the distribution of data is symmetric but if the data the data is either skewed to left or right wilcoxon sign rank test is the suitable. For this this we to hypothesize mean distribution of participants.

In hypothesis testing, the following assumption must be achieved:

1.Large representative of sample size

2.Normality assumption

HT<-sti%>%
  select(Height,Gender,STI)

Test normality assumption using Shapiro wilk test

Height variable is normally distributed since P-value is greater than 0.05

shapiro.test(HT$Height)

    Shapiro-Wilk normality test

data:  HT$Height
W = 0.98898, p-value = 0.0811

Filter outliers because outliers interfere with the distribution of mean as a measure of central tendency.

HT_1<-HT%>%
  select(Gender,STI,Height)%>%
  filter(Height>=141 & Height<=181)

Again, check the height distribution using histogram and boxplot

hist(HT_1$Height,col="violet")

boxplot(HT_1$Height,col = "red")

HT_1%>%count(Gender)
  Gender   n
1 Female 104
2   Male 117
mean(HT_1$Height)
[1] 160.6425

One sided T test

Ho: Mean height of female participants>=160.6425

HT_1%>%
  filter(Gender=="Female")%>%
  select(Height)%>%
  t.test(mu=160.6425)

    One Sample t-test

data:  .
t = 0.0021974, df = 103, p-value = 0.9983
alternative hypothesis: true mean is not equal to 160.6425
95 percent confidence interval:
 159.0821 162.2064
sample estimates:
mean of x 
 160.6442 

Ho: Mean height of male participants<=160.6425

HT_1%>%
  filter(Gender=="Male")%>%
  select(Height)%>%
  t.test(mu=160.6425)

    One Sample t-test

data:  .
t = -0.0024323, df = 116, p-value = 0.9981
alternative hypothesis: true mean is not equal to 160.6425
95 percent confidence interval:
 159.4404 161.8416
sample estimates:
mean of x 
  160.641 

Two sided t test

Ho: Mean Height of participants infected with STI equal to mean height of participant not infected

HT_1%>%
  t.test(Height~STI,data=.,
         alternative = "two.sided")

    Welch Two Sample t-test

data:  Height by STI
t = -0.27768, df = 218.96, p-value = 0.7815
alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
95 percent confidence interval:
 -2.204145  1.659748
sample estimates:
 mean in group No mean in group Yes 
         160.5046          160.7768 

Linear Regression Analysis

Simple linear Regression

Model1<-lm(BMI~Weight,data = sti)
summary(Model1)

Call:
lm(formula = BMI ~ Weight, data = sti)

Residuals:
LABEL: Body Mass Index in Kg/M2 
VALUES:
-6.3411, -1.136, -0.0497, 1.1774, 6.2973

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   3.5491     0.6050   5.866 1.59e-08 ***
Weight        0.3199     0.0110  29.075  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.93 on 224 degrees of freedom
Multiple R-squared:  0.7905,    Adjusted R-squared:  0.7896 
F-statistic: 845.4 on 1 and 224 DF,  p-value: < 2.2e-16

Multiple linear regression model

Model2<-lm(BMI~Weight+Height,data = sti)
summary(Model2)

Call:
lm(formula = BMI ~ Weight + Height, data = sti)

Residuals:
LABEL: Body Mass Index in Kg/M2 
VALUES:
-1.29656, -0.16805, 0.01601, 0.14119, 1.60557

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 40.934623   0.473437   86.46   <2e-16 ***
Weight       0.379233   0.002125  178.49   <2e-16 ***
Height      -0.252087   0.003105  -81.18   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.35 on 223 degrees of freedom
Multiple R-squared:  0.9931,    Adjusted R-squared:  0.9931 
F-statistic: 1.615e+04 on 2 and 223 DF,  p-value: < 2.2e-16

Logistic regression Analysis

Simple logistic regression model

Model4<-glm(STI~Marital,data = sti,family = "binomial")
summary(Model4)

Call:
glm(formula = STI ~ Marital, family = "binomial", data = sti)

Coefficients:
                Estimate Std. Error z value Pr(>|z|)   
(Intercept)        2.079      1.061   1.961   0.0499 * 
MaritalDivorcee   -3.178      1.208  -2.632   0.0085 **
MaritalMarried    -2.093      1.073  -1.950   0.0512 . 
MaritalSingle     -1.751      1.105  -1.585   0.1130   
MaritalWidowed    -3.332      1.330  -2.506   0.0122 * 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 313.30  on 225  degrees of freedom
Residual deviance: 298.83  on 221  degrees of freedom
AIC: 308.83

Number of Fisher Scoring iterations: 4
# Odd Ratio
exp(coef(Model4))
    (Intercept) MaritalDivorcee  MaritalMarried   MaritalSingle  MaritalWidowed 
     7.99999999      0.04166667      0.12333333      0.17361111      0.03571429 

Multiple logistic regression

Model5<-glm(STI~Marital+Education+Gender,data = sti,family = "binomial")
summary(Model5)

Call:
glm(formula = STI ~ Marital + Education + Gender, family = "binomial", 
    data = sti)

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)  
(Intercept)         14.56607  882.74337   0.017   0.9868  
MaritalDivorcee     -3.05274    1.21771  -2.507   0.0122 *
MaritalMarried      -1.91514    1.08594  -1.764   0.0778 .
MaritalSingle       -1.59963    1.13760  -1.406   0.1597  
MaritalWidowed      -3.20350    1.33830  -2.394   0.0167 *
EducationPrimary   -12.33707  882.74410  -0.014   0.9888  
EducationSecondary -12.68971  882.74403  -0.014   0.9885  
EducationTertiary  -12.28755  882.74428  -0.014   0.9889  
GenderMale          -0.04266    0.29341  -0.145   0.8844  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 313.30  on 225  degrees of freedom
Residual deviance: 297.61  on 217  degrees of freedom
AIC: 315.61

Number of Fisher Scoring iterations: 13
# odd ratio
exp(coef(Model5))
       (Intercept)    MaritalDivorcee     MaritalMarried      MaritalSingle 
      2.118180e+06       4.722955e-02       1.473213e-01       2.019713e-01 
    MaritalWidowed   EducationPrimary EducationSecondary  EducationTertiary 
      4.061959e-02       4.386082e-06       3.082676e-06       4.608747e-06 
        GenderMale 
      9.582389e-01 

Survival Analysis

Terminologies

Censoring means Incomplete data. This might occurs if patient is lost to follow up or withdrew from study.

Kaplan- Meier estimator is a non-parametric statistics that estimate the survival function (Kaplan and Paul Meier, 1958).The statistics gives the probability that an individual patient can survive at a particular time t.

Log-rank test Use to compare two Kaplan-Meire curves

Recall that: The Kaplan-Meire estimator of surviving at t=0 is 1 and when t goes to infinity the survival Kaplan-Meire goes to 0.

#Clear R environment
rm(list = ls())
# Working directory
setwd("C:/Users/USER/Desktop/JOOUST")
# Load data set
library(readr)
ovarian <- read_csv("ovarian.csv")
Rows: 26 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
dbl (6): futime, fustat, age, resid.ds, rx, ecog.ps

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Load the required packages for this survival analysis

library(survival)

Attaching package: 'survival'
The following object is masked _by_ '.GlobalEnv':

    ovarian
library(survminer)
Loading required package: ggpubr

Attaching package: 'ggpubr'
The following object is masked from 'package:expss':

    compare_means

Attaching package: 'survminer'
The following object is masked from 'package:survival':

    myeloma
library(dplyr)

Data management

# Dichotomize age and change data labels
ovarian$rx <- factor(ovarian$rx,
 levels = c("1", "2"),
 labels = c("A", "B"))
ovarian$resid.ds <- factor(ovarian$resid.ds,
 levels = c("1", "2"),
 labels = c("no", "yes"))
ovarian$ecog.ps <- factor(ovarian$ecog.ps,
 levels = c("1", "2"),
 labels = c("good", "bad"))

Explore the covariate

hist(ovarian$age,col = "blue",
     main = "Simple histogram shows Age distribution")

Age of patients seems to be bi modal

ovarian <- ovarian %>% mutate(age_group = ifelse(age >=50, "old", "young"))
ovarian$age_group <- factor(ovarian$age_group)
# Fit survival data using the Kaplan-Meier method
surv_object <- Surv(time = ovarian$futime, event = ovarian$fustat)
surv_object
 [1]   59   115   156   421+  431   448+  464   475   477+  563   638   744+
[13]  769+  770+  803+  855+ 1040+ 1106+ 1129+ 1206+ 1227+  268   329   353 
[25]  365   377+
fit1 <- survfit(surv_object ~ rx, data = ovarian)
summary(fit1)
Call: survfit(formula = surv_object ~ rx, data = ovarian)

                rx=A 
 time n.risk n.event survival std.err lower 95% CI upper 95% CI
   59     13       1    0.923  0.0739        0.789        1.000
  115     12       1    0.846  0.1001        0.671        1.000
  156     11       1    0.769  0.1169        0.571        1.000
  268     10       1    0.692  0.1280        0.482        0.995
  329      9       1    0.615  0.1349        0.400        0.946
  431      8       1    0.538  0.1383        0.326        0.891
  638      5       1    0.431  0.1467        0.221        0.840

                rx=B 
 time n.risk n.event survival std.err lower 95% CI upper 95% CI
  353     13       1    0.923  0.0739        0.789        1.000
  365     12       1    0.846  0.1001        0.671        1.000
  464      9       1    0.752  0.1256        0.542        1.000
  475      8       1    0.658  0.1407        0.433        1.000
  563      7       1    0.564  0.1488        0.336        0.946
ggsurvplot(fit1, data = ovarian, pval = TRUE)

# Examine prdictive value of residual disease status
fit2 <- survfit(surv_object ~ resid.ds, data = ovarian)
ggsurvplot(fit2, data = ovarian, pval = TRUE)

Cox proportional regression model

Cox describe the probability of an hazard if the patient survived at a particular time t. In simple terms it estimate instantaneous risk of death. Also you need hazard function to consider covariates when comparing survival of patient group.

# Fit a Cox proportional hazards model
fit.coxph <- coxph(surv_object ~ rx + resid.ds + age_group + ecog.ps,
 data = ovarian)
summary(fit.coxph)
Call:
coxph(formula = surv_object ~ rx + resid.ds + age_group + ecog.ps, 
    data = ovarian)

  n= 26, number of events= 12 

                  coef exp(coef) se(coef)      z Pr(>|z|)  
rxB            -1.3814    0.2512   0.6448 -2.142   0.0322 *
resid.dsyes     1.4470    4.2503   0.7292  1.984   0.0472 *
age_groupyoung -2.2013    0.1107   1.1069 -1.989   0.0467 *
ecog.psbad      0.5859    1.7966   0.6329  0.926   0.3546  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

               exp(coef) exp(-coef) lower .95 upper .95
rxB               0.2512     3.9803   0.07100    0.8891
resid.dsyes       4.2503     0.2353   1.01801   17.7453
age_groupyoung    0.1107     9.0368   0.01264    0.9686
ecog.psbad        1.7966     0.5566   0.51966    6.2113

Concordance= 0.782  (se = 0.065 )
Likelihood ratio test= 12.19  on 4 df,   p=0.02
Wald test            = 9.02  on 4 df,   p=0.06
Score (logrank) test = 11.97  on 4 df,   p=0.02