DATA PROCESSING ,ANALYSIS & PRESENTATION

Author

BRIAN OLOO OJALA(0743670039)

# Clear R environment
rm(list=ls())
# Set Working directory
setwd("C:/Analysis")

Import data set from excel to R

#Load set first data set called Sti_dataset1
library(readr)
Sti_dataset1 <- read_csv("Sti dataset1.csv")
Rows: 226 Columns: 7
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): Date
dbl (6): IdNumber, Weight, Occupation, Church, Gender, Typeofsti

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Data Processing

#1 Value label
Sti_dataset1$Occupation<-ordered(Sti_dataset1$Occupation,levels=
                                   c(1,2,3,4),labels=c("Formal","Informal","Student","Unemployed"))

Sti_dataset1$Church<-ordered(Sti_dataset1$Church,levels=
                                   c(1,2,3,4,5,6,7),labels=c("Anglican","Apostolic","Atheist","Methodist","Pentecostal" ,"Roman catholic","Other"))
Sti_dataset1$Gender<-ordered(Sti_dataset1$Gender,levels=
                                   c(1,2),labels=c("Male","Female"))
#Load second data set called sti_dataset2
library(readxl)
Sti_dataset2 <- read_excel("Sti dataset2.xlsx")
#1 Value label
Sti_dataset2$casestatus<-ordered(Sti_dataset2$casestatus,levels=
                               c(0,1),labels=c("Not infected","Infected"))
Sti_dataset2$MaritalStatus<-ordered(Sti_dataset2$MaritalStatus,levels=
                                   c(1,2,3,4,5),labels=c("Co-habiting","Divorce","Married","Single","Windowed"))

Combine the two data set using merge function

# Combining data sets
Sti<-merge(Sti_dataset1,Sti_dataset2,by="IdNumber")

Delete EducationLevel and TypeofSti from the Sti data set

# Delete TypeofSti and EducationLevel
Dropvar<-names(Sti)%in%c("EducationLevel","Typeofsti")
Sti<-Sti[!Dropvar]

Data manipulation

# Create a new column
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ purrr     1.0.2
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Sti<-Sti%>%
  mutate(BMI=Weight/(Height/100)^2)%>%
  mutate(Date=as.Date(Date),
         Occupation=as.factor(Occupation),
         Church=as.factor(Church),
         Gender=as.factor(Gender),
         MaritalStatus=as.factor(MaritalStatus),
         casestatus=as.factor(casestatus))
# Round off BMI 2dp
Sti$BMI<-round(as.numeric(Sti$BMI),2)
# Create Age category from participant Age labels
Sti<-within(Sti,{
  Age.cat<-NA
  Age.cat[Age>=16 & Age<=25]<-"16-25"
  Age.cat[Age>25 & Age<=34]<-"26-34"
  Age.cat[Age>34 & Age<=44]<-"35-44"
  Age.cat[Age>44 & Age<=54]<-"45-54"
  Age.cat[Age>54 & Age<=64]<-"55-64"
})
# Variable label
library(expss)
Loading required package: maditr

To aggregate data: take(mtcars, mean_mpg = mean(mpg), by = am)

Attaching package: 'maditr'
The following objects are masked from 'package:dplyr':

    between, coalesce, first, last
The following object is masked from 'package:purrr':

    transpose
The following object is masked from 'package:readr':

    cols

Use 'expss_output_rnotebook()' to display tables inside R Notebooks.
 To return to the console output, use 'expss_output_default()'.

Attaching package: 'expss'
The following objects are masked from 'package:stringr':

    fixed, regex
The following objects are masked from 'package:dplyr':

    compute, contains, na_if, recode, vars, where
The following objects are masked from 'package:purrr':

    keep, modify, modify_if, when
The following objects are masked from 'package:tidyr':

    contains, nest
The following object is masked from 'package:ggplot2':

    vars
Sti<-apply_labels(Sti,
                  Age="Age in Years",
                  Weight="Weight in Kgs",
                  Height="Height in Cm",
                  Occupation="Occupation status",
                  Gender="Sex",
                  casestatus="STI infections",
                  MaritalStatus="Marital Status"
)

Explanatory Data Analysis:

Qualitative variable: Bar chart, Pie chart, Tables

Quantitative variable: Box plot , Scatter plot ,Histogram, correlation heat map

Quantitative vs Qualitative: Box plot

EDA for Qualitative variable

Churches

library(stringr)
df<-Sti%>%
  dplyr::group_by(Church)%>%
  dplyr::mutate(Church=str_to_title(Church))%>%
  tally()
ggplot(df,aes(x=reorder(Church,n),y=n))+
  geom_bar(stat = "identity",fill="blue",color="black")+
  geom_text(aes(label = n),hjust=1.45)+coord_flip()+
  theme_classic()+labs(x="Church",y="Count",title = "Participant religion")

df1<-Sti%>%
  dplyr::group_by(Gender)%>%
  dplyr::mutate(Gender=str_to_title(Gender))%>%
  tally()
ggplot(df1,aes(x=reorder(Gender,n),y=n))+
  geom_bar(stat = "identity",fill="red",color="black")+
  geom_text(aes(label = n),hjust=1.5)+
  theme_classic()+labs(x="Gender",y="Count",title = "Partipant with their Gender")

ggplot(Sti,aes(x=Age))+
  geom_histogram(fill="blue",color="white")+
  geom_vline(aes(xintercept=mean(Age)),color="red",
             linewidth=1,linetype="dashed")+
  theme_classic()+
  labs(title = "Histogram showing Age distribution",
       y= "Counts",
       x=" Age in years")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Box Plot

boxplot(Sti$Age,col = "blue")

ggplot(Sti,aes(Age.cat,BMI,fill =Gender))+geom_boxplot()+
  theme_classic()+
  labs(title = "BMI distribution ")

Scatter Plot

Sti%>%
  ggplot(aes(Weight,BMI,colour = Gender))+ geom_point(method ="lm",se = FALSE)
Warning in geom_point(method = "lm", se = FALSE): Ignoring unknown parameters:
`method` and `se`

Descriptive Statistics

library(table1)

Attaching package: 'table1'
The following objects are masked from 'package:base':

    units, units<-
table1(~casestatus+MaritalStatus+Gender+Occupation+Church+Age.cat,data = Sti)
Overall
(N=226)
STI infections
Not infected 162 (71.7%)
Infected 64 (28.3%)
Marital Status
Co-habiting 9 (4.0%)
Divorce 16 (7.1%)
Married 149 (65.9%)
Single 43 (19.0%)
Windowed 9 (4.0%)
Sex
Male 108 (47.8%)
Female 118 (52.2%)
Occupation status
Formal 91 (40.3%)
Informal 45 (19.9%)
Student 14 (6.2%)
Unemployed 76 (33.6%)
Church
Anglican 16 (7.1%)
Apostolic 52 (23.0%)
Atheist 45 (19.9%)
Methodist 13 (5.8%)
Pentecostal 36 (15.9%)
Roman catholic 41 (18.1%)
Other 23 (10.2%)
Age.cat
16-25 98 (43.4%)
26-34 96 (42.5%)
35-44 25 (11.1%)
45-54 5 (2.2%)
55-64 2 (0.9%)
table1(~Age+Height+BMI+Weight|Gender,data = Sti)
Male
(N=108)
Female
(N=118)
Overall
(N=226)
Age in Years
Mean (SD) 27.0 (5.77) 29.0 (8.18) 28.1 (7.18)
Median [Min, Max] 26.0 [16.0, 46.0] 27.0 [19.0, 63.0] 26.5 [16.0, 63.0]
Height in Cm
Mean (SD) 162 (8.48) 160 (7.54) 161 (8.00)
Median [Min, Max] 162 [138, 182] 160 [143, 191] 161 [138, 191]
BMI
Mean (SD) 20.7 (4.12) 20.8 (4.31) 20.7 (4.21)
Median [Min, Max] 20.1 [13.4, 36.3] 20.3 [13.5, 34.1] 20.2 [13.4, 36.3]
Weight in Kgs
Mean (SD) 54.0 (11.9) 53.5 (11.5) 53.7 (11.7)
Median [Min, Max] 51.9 [33.0, 100] 50.5 [32.8, 94.0] 51.0 [32.8, 100]
table1(~Age+Height+BMI+Weight,data = Sti)
Overall
(N=226)
Age in Years
Mean (SD) 28.1 (7.18)
Median [Min, Max] 26.5 [16.0, 63.0]
Height in Cm
Mean (SD) 161 (8.00)
Median [Min, Max] 161 [138, 191]
BMI
Mean (SD) 20.7 (4.21)
Median [Min, Max] 20.2 [13.4, 36.3]
Weight in Kgs
Mean (SD) 53.7 (11.7)
Median [Min, Max] 51.0 [32.8, 100]
table1(~casestatus+MaritalStatus+Occupation+Church+Age.cat|Gender,data = Sti)
Male
(N=108)
Female
(N=118)
Overall
(N=226)
STI infections
Not infected 76 (70.4%) 86 (72.9%) 162 (71.7%)
Infected 32 (29.6%) 32 (27.1%) 64 (28.3%)
Marital Status
Co-habiting 3 (2.8%) 6 (5.1%) 9 (4.0%)
Divorce 11 (10.2%) 5 (4.2%) 16 (7.1%)
Married 68 (63.0%) 81 (68.6%) 149 (65.9%)
Single 21 (19.4%) 22 (18.6%) 43 (19.0%)
Windowed 5 (4.6%) 4 (3.4%) 9 (4.0%)
Occupation status
Formal 40 (37.0%) 51 (43.2%) 91 (40.3%)
Informal 17 (15.7%) 28 (23.7%) 45 (19.9%)
Student 6 (5.6%) 8 (6.8%) 14 (6.2%)
Unemployed 45 (41.7%) 31 (26.3%) 76 (33.6%)
Church
Anglican 13 (12.0%) 3 (2.5%) 16 (7.1%)
Apostolic 27 (25.0%) 25 (21.2%) 52 (23.0%)
Atheist 14 (13.0%) 31 (26.3%) 45 (19.9%)
Methodist 8 (7.4%) 5 (4.2%) 13 (5.8%)
Pentecostal 17 (15.7%) 19 (16.1%) 36 (15.9%)
Roman catholic 17 (15.7%) 24 (20.3%) 41 (18.1%)
Other 12 (11.1%) 11 (9.3%) 23 (10.2%)
Age.cat
16-25 52 (48.1%) 46 (39.0%) 98 (43.4%)
26-34 46 (42.6%) 50 (42.4%) 96 (42.5%)
35-44 9 (8.3%) 16 (13.6%) 25 (11.1%)
45-54 1 (0.9%) 4 (3.4%) 5 (2.2%)
55-64 0 (0%) 2 (1.7%) 2 (0.9%)

Inferential Statistics (Measure of Association

# Chi-square test statistics
library(summarytools)

Attaching package: 'summarytools'
The following objects are masked from 'package:table1':

    label, label<-
The following object is masked from 'package:tibble':

    view
ctable(
  x=Sti$Age.cat,
  y=Sti$casestatus,
  chisq = TRUE,
  headings=FALSE,
  totals = FALSE
)
Warning in chisq.test(freq_table_min): Chi-squared approximation may be
incorrect

--------- ------------ -------------- ------------
            casestatus   Not infected     Infected
  Age.cat                                         
    16-25                 65 ( 66.3%)   33 (33.7%)
    26-34                 76 ( 79.2%)   20 (20.8%)
    35-44                 16 ( 64.0%)    9 (36.0%)
    45-54                  3 ( 60.0%)    2 (40.0%)
    55-64                  2 (100.0%)    0 ( 0.0%)
--------- ------------ -------------- ------------

----------------------------
 Chi.squared   df   p.value 
------------- ---- ---------
    5.887      4    0.2077  
----------------------------
#====Fisher exact test
fisher.test(Sti$Age.cat,Sti$casestatus)

    Fisher's Exact Test for Count Data

data:  Sti$Age.cat and Sti$casestatus
p-value = 0.1768
alternative hypothesis: two.sided
## Casestatus vs Marital Status
ctable(
  x=Sti$MaritalStatus,
  y=Sti$casestatus,
  chisq = TRUE,
  headings=FALSE,
  totals = FALSE
)
Warning in chisq.test(freq_table_min): Chi-squared approximation may be
incorrect

--------------- ------------ -------------- ------------
                  casestatus   Not infected     Infected
  MaritalStatus                                         
    Co-habiting                   8 (88.9%)    1 (11.1%)
        Divorce                   6 (37.5%)   10 (62.5%)
        Married                 115 (77.2%)   34 (22.8%)
         Single                  29 (67.4%)   14 (32.6%)
       Windowed                   4 (44.4%)    5 (55.6%)
--------------- ------------ -------------- ------------

----------------------------
 Chi.squared   df   p.value 
------------- ---- ---------
   16.4121     4    0.0025  
----------------------------
fisher.test(Sti$MaritalStatus,Sti$casestatus)

    Fisher's Exact Test for Count Data

data:  Sti$MaritalStatus and Sti$casestatus
p-value = 0.002931
alternative hypothesis: two.sided

Simple Linear Regression

data<-Sti%>%
  select(Weight,BMI,Age,Height)
Model<-lm(BMI~Weight,data = data)
summary(Model)

Call:
lm(formula = BMI ~ Weight, data = data)

Residuals:
    Min      1Q  Median      3Q     Max 
-6.2943 -1.1245 -0.0642  1.1738  6.2964 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   3.5406     0.6050   5.852 1.71e-08 ***
Weight        0.3201     0.0110  29.092  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.93 on 224 degrees of freedom
Multiple R-squared:  0.7907,    Adjusted R-squared:  0.7898 
F-statistic: 846.4 on 1 and 224 DF,  p-value: < 2.2e-16
#Diagnostic plot
plot(Model,1)

#================
Model1<-lm(BMI~Weight+Height+Age,data = data)
summary(Model1)

Call:
lm(formula = BMI ~ Weight + Height + Age, data = data)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.24786 -0.16848  0.00388  0.13767  1.60679 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 40.834531   0.487916  83.692   <2e-16 ***
Weight       0.379247   0.002131 177.944   <2e-16 ***
Height      -0.251903   0.003108 -81.054   <2e-16 ***
Age          0.002554   0.003263   0.783    0.435    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.3488 on 222 degrees of freedom
Multiple R-squared:  0.9932,    Adjusted R-squared:  0.9931 
F-statistic: 1.085e+04 on 3 and 222 DF,  p-value: < 2.2e-16
# Diagonistic 
plot(Model1,1)

data%>%
  ggplot(aes(Weight,BMI))+geom_smooth(method = lm,se=F)+ geom_point()+
  labs(title = "Weight Against BMI",y="Body mass index(Kg/M^2)",x="Weight(Kgs)")+ theme_classic()
`geom_smooth()` using formula = 'y ~ x'