DATA PROCESSING ,ANALYSIS & PRESENTATION

Author

BRIAN OLOO OJALA(0743670039)

# Clear R environment
rm(list=ls())
# Set Working directory
setwd("C:/Analysis")

Import data set from excel to R

#Load set first data set called Sti_dataset1
library(readr)
Sti_dataset1 <- read_csv("Sti dataset1.csv")

Rows: 226 Columns: 7
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): Date
dbl (6): IdNumber, Weight, Occupation, Church, Gender, Typeofsti

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Data Processing

#1 Value label
Sti_dataset1$Occupation<-ordered(Sti_dataset1$Occupation,levels=
                                   c(1,2,3,4),labels=c("Formal","Informal","Student","Unemployed"))

Sti_dataset1$Church<-ordered(Sti_dataset1$Church,levels=
                                   c(1,2,3,4,5,6,7),labels=c("Anglican","Apostolic","Atheist","Methodist","Pentecostal" ,"Roman catholic","Other"))
Sti_dataset1$Gender<-ordered(Sti_dataset1$Gender,levels=
                                   c(1,2),labels=c("Male","Female"))

#Load second data set called sti_dataset2
library(readxl)
Sti_dataset2 <- read_excel("Sti dataset2.xlsx")
#1 Value label
Sti_dataset2$casestatus<-ordered(Sti_dataset2$casestatus,levels=
                               c(0,1),labels=c("Not infected","Infected"))
Sti_dataset2$MaritalStatus<-ordered(Sti_dataset2$MaritalStatus,levels=
                                   c(1,2,3,4,5),labels=c("Co-habiting","Divorce","Married","Single","Windowed"))

Combine the two data set using merge function

# Combining data sets
Sti<-merge(Sti_dataset1,Sti_dataset2,by="IdNumber")

Delete EducationLevel and TypeofSti from the Sti data set

# Delete TypeofSti and EducationLevel
Dropvar<-names(Sti)%in%c("EducationLevel","Typeofsti")
Sti<-Sti[!Dropvar]

Data manipulation

# Create a new column
library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ purrr     1.0.2
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Sti<-Sti%>%
  mutate(BMI=Weight/(Height/100)^2)%>%
  mutate(Date=as.Date(Date),
         Occupation=as.factor(Occupation),
         Church=as.factor(Church),
         Gender=as.factor(Gender),
         MaritalStatus=as.factor(MaritalStatus),
         casestatus=as.factor(casestatus))
# Round off BMI 2dp
Sti$BMI<-round(as.numeric(Sti$BMI),2)
# Create Age category from participant Age labels
Sti<-within(Sti,{
  Age.cat<-NA
  Age.cat[Age>=16 & Age<=25]<-"16-25"
  Age.cat[Age>25 & Age<=34]<-"26-34"
  Age.cat[Age>34 & Age<=44]<-"35-44"
  Age.cat[Age>44 & Age<=54]<-"45-54"
  Age.cat[Age>54 & Age<=64]<-"55-64"
})

# Variable label
library(expss)

Loading required package: maditr


To aggregate data: take(mtcars, mean_mpg = mean(mpg), by = am)


Attaching package: 'maditr'

The following objects are masked from 'package:dplyr':

    between, coalesce, first, last

The following object is masked from 'package:purrr':

    transpose

The following object is masked from 'package:readr':

    cols


Use 'expss_output_rnotebook()' to display tables inside R Notebooks.
 To return to the console output, use 'expss_output_default()'.


Attaching package: 'expss'

The following objects are masked from 'package:stringr':

    fixed, regex

The following objects are masked from 'package:dplyr':

    compute, contains, na_if, recode, vars, where

The following objects are masked from 'package:purrr':

    keep, modify, modify_if, when

The following objects are masked from 'package:tidyr':

    contains, nest

The following object is masked from 'package:ggplot2':

    vars

Sti<-apply_labels(Sti,
                  Age="Age in Years",
                  Weight="Weight in Kgs",
                  Height="Height in Cm",
                  Occupation="Occupation status",
                  Gender="Sex",
                  casestatus="STI infections",
                  MaritalStatus="Marital Status"
)

Explanatory Data Analysis:

Qualitative variable: Bar chart, Pie chart, Tables

Quantitative variable: Box plot , Scatter plot ,Histogram, correlation heat map

Quantitative vs Qualitative: Box plot

EDA for Qualitative variable

Churches

library(stringr)
df<-Sti%>%
  dplyr::group_by(Church)%>%
  dplyr::mutate(Church=str_to_title(Church))%>%
  tally()
ggplot(df,aes(x=reorder(Church,n),y=n))+
  geom_bar(stat = "identity",fill="blue",color="black")+
  geom_text(aes(label = n),hjust=1.45)+coord_flip()+
  theme_classic()+labs(x="Church",y="Count",title = "Participant religion")

df1<-Sti%>%
  dplyr::group_by(Gender)%>%
  dplyr::mutate(Gender=str_to_title(Gender))%>%
  tally()
ggplot(df1,aes(x=reorder(Gender,n),y=n))+
  geom_bar(stat = "identity",fill="red",color="black")+
  geom_text(aes(label = n),hjust=1.5)+
  theme_classic()+labs(x="Gender",y="Count",title = "Partipant with their Gender")

ggplot(Sti,aes(x=Age))+
  geom_histogram(fill="blue",color="white")+
  geom_vline(aes(xintercept=mean(Age)),color="red",
             linewidth=1,linetype="dashed")+
  theme_classic()+
  labs(title = "Histogram showing Age distribution",
       y= "Counts",
       x=" Age in years")

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Box Plot

boxplot(Sti$Age,col = "blue")

ggplot(Sti,aes(Age.cat,BMI,fill =Gender))+geom_boxplot()+
  theme_classic()+
  labs(title = "BMI distribution ")

Scatter Plot

Sti%>%
  ggplot(aes(Weight,BMI,colour = Gender))+ geom_point(method ="lm",se = FALSE)

Warning in geom_point(method = "lm", se = FALSE): Ignoring unknown parameters:
`method` and `se`

Descriptive Statistics

library(table1)


Attaching package: 'table1'

The following objects are masked from 'package:base':

    units, units<-

table1(~casestatus+MaritalStatus+Gender+Occupation+Church+Age.cat,data = Sti)

	Overall (N=226)
STI infections
Not infected	162 (71.7%)
Infected	64 (28.3%)
Marital Status
Co-habiting	9 (4.0%)
Divorce	16 (7.1%)
Married	149 (65.9%)
Single	43 (19.0%)
Windowed	9 (4.0%)
Sex
Male	108 (47.8%)
Female	118 (52.2%)
Occupation status
Formal	91 (40.3%)
Informal	45 (19.9%)
Student	14 (6.2%)
Unemployed	76 (33.6%)
Church
Anglican	16 (7.1%)
Apostolic	52 (23.0%)
Atheist	45 (19.9%)
Methodist	13 (5.8%)
Pentecostal	36 (15.9%)
Roman catholic	41 (18.1%)
Other	23 (10.2%)
Age.cat
16-25	98 (43.4%)
26-34	96 (42.5%)
35-44	25 (11.1%)
45-54	5 (2.2%)
55-64	2 (0.9%)

table1(~Age+Height+BMI+Weight|Gender,data = Sti)

	Male (N=108)	Female (N=118)	Overall (N=226)
Age in Years
Mean (SD)	27.0 (5.77)	29.0 (8.18)	28.1 (7.18)
Median [Min, Max]	26.0 [16.0, 46.0]	27.0 [19.0, 63.0]	26.5 [16.0, 63.0]
Height in Cm
Mean (SD)	162 (8.48)	160 (7.54)	161 (8.00)
Median [Min, Max]	162 [138, 182]	160 [143, 191]	161 [138, 191]
BMI
Mean (SD)	20.7 (4.12)	20.8 (4.31)	20.7 (4.21)
Median [Min, Max]	20.1 [13.4, 36.3]	20.3 [13.5, 34.1]	20.2 [13.4, 36.3]
Weight in Kgs
Mean (SD)	54.0 (11.9)	53.5 (11.5)	53.7 (11.7)
Median [Min, Max]	51.9 [33.0, 100]	50.5 [32.8, 94.0]	51.0 [32.8, 100]

table1(~Age+Height+BMI+Weight,data = Sti)

	Overall (N=226)
Age in Years
Mean (SD)	28.1 (7.18)
Median [Min, Max]	26.5 [16.0, 63.0]
Height in Cm
Mean (SD)	161 (8.00)
Median [Min, Max]	161 [138, 191]
BMI
Mean (SD)	20.7 (4.21)
Median [Min, Max]	20.2 [13.4, 36.3]
Weight in Kgs
Mean (SD)	53.7 (11.7)
Median [Min, Max]	51.0 [32.8, 100]

table1(~casestatus+MaritalStatus+Occupation+Church+Age.cat|Gender,data = Sti)

	Male (N=108)	Female (N=118)	Overall (N=226)
STI infections
Not infected	76 (70.4%)	86 (72.9%)	162 (71.7%)
Infected	32 (29.6%)	32 (27.1%)	64 (28.3%)
Marital Status
Co-habiting	3 (2.8%)	6 (5.1%)	9 (4.0%)
Divorce	11 (10.2%)	5 (4.2%)	16 (7.1%)
Married	68 (63.0%)	81 (68.6%)	149 (65.9%)
Single	21 (19.4%)	22 (18.6%)	43 (19.0%)
Windowed	5 (4.6%)	4 (3.4%)	9 (4.0%)
Occupation status
Formal	40 (37.0%)	51 (43.2%)	91 (40.3%)
Informal	17 (15.7%)	28 (23.7%)	45 (19.9%)
Student	6 (5.6%)	8 (6.8%)	14 (6.2%)
Unemployed	45 (41.7%)	31 (26.3%)	76 (33.6%)
Church
Anglican	13 (12.0%)	3 (2.5%)	16 (7.1%)
Apostolic	27 (25.0%)	25 (21.2%)	52 (23.0%)
Atheist	14 (13.0%)	31 (26.3%)	45 (19.9%)
Methodist	8 (7.4%)	5 (4.2%)	13 (5.8%)
Pentecostal	17 (15.7%)	19 (16.1%)	36 (15.9%)
Roman catholic	17 (15.7%)	24 (20.3%)	41 (18.1%)
Other	12 (11.1%)	11 (9.3%)	23 (10.2%)
Age.cat
16-25	52 (48.1%)	46 (39.0%)	98 (43.4%)
26-34	46 (42.6%)	50 (42.4%)	96 (42.5%)
35-44	9 (8.3%)	16 (13.6%)	25 (11.1%)
45-54	1 (0.9%)	4 (3.4%)	5 (2.2%)
55-64	0 (0%)	2 (1.7%)	2 (0.9%)

Inferential Statistics (Measure of Association

# Chi-square test statistics
library(summarytools)


Attaching package: 'summarytools'

The following objects are masked from 'package:table1':

    label, label<-

The following object is masked from 'package:tibble':

    view

ctable(
  x=Sti$Age.cat,
  y=Sti$casestatus,
  chisq = TRUE,
  headings=FALSE,
  totals = FALSE
)

Warning in chisq.test(freq_table_min): Chi-squared approximation may be
incorrect


--------- ------------ -------------- ------------
            casestatus   Not infected     Infected
  Age.cat                                         
    16-25                 65 ( 66.3%)   33 (33.7%)
    26-34                 76 ( 79.2%)   20 (20.8%)
    35-44                 16 ( 64.0%)    9 (36.0%)
    45-54                  3 ( 60.0%)    2 (40.0%)
    55-64                  2 (100.0%)    0 ( 0.0%)
--------- ------------ -------------- ------------

----------------------------
 Chi.squared   df   p.value 
------------- ---- ---------
    5.887      4    0.2077  
----------------------------

#====Fisher exact test
fisher.test(Sti$Age.cat,Sti$casestatus)


    Fisher's Exact Test for Count Data

data:  Sti$Age.cat and Sti$casestatus
p-value = 0.1768
alternative hypothesis: two.sided

## Casestatus vs Marital Status
ctable(
  x=Sti$MaritalStatus,
  y=Sti$casestatus,
  chisq = TRUE,
  headings=FALSE,
  totals = FALSE
)

Warning in chisq.test(freq_table_min): Chi-squared approximation may be
incorrect


--------------- ------------ -------------- ------------
                  casestatus   Not infected     Infected
  MaritalStatus                                         
    Co-habiting                   8 (88.9%)    1 (11.1%)
        Divorce                   6 (37.5%)   10 (62.5%)
        Married                 115 (77.2%)   34 (22.8%)
         Single                  29 (67.4%)   14 (32.6%)
       Windowed                   4 (44.4%)    5 (55.6%)
--------------- ------------ -------------- ------------

----------------------------
 Chi.squared   df   p.value 
------------- ---- ---------
   16.4121     4    0.0025  
----------------------------

fisher.test(Sti$MaritalStatus,Sti$casestatus)


    Fisher's Exact Test for Count Data

data:  Sti$MaritalStatus and Sti$casestatus
p-value = 0.002931
alternative hypothesis: two.sided

Simple Linear Regression

data<-Sti%>%
  select(Weight,BMI,Age,Height)
Model<-lm(BMI~Weight,data = data)
summary(Model)


Call:
lm(formula = BMI ~ Weight, data = data)

Residuals:
    Min      1Q  Median      3Q     Max 
-6.2943 -1.1245 -0.0642  1.1738  6.2964 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   3.5406     0.6050   5.852 1.71e-08 ***
Weight        0.3201     0.0110  29.092  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.93 on 224 degrees of freedom
Multiple R-squared:  0.7907,    Adjusted R-squared:  0.7898 
F-statistic: 846.4 on 1 and 224 DF,  p-value: < 2.2e-16

#Diagnostic plot
plot(Model,1)

#================
Model1<-lm(BMI~Weight+Height+Age,data = data)
summary(Model1)


Call:
lm(formula = BMI ~ Weight + Height + Age, data = data)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.24786 -0.16848  0.00388  0.13767  1.60679 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 40.834531   0.487916  83.692   <2e-16 ***
Weight       0.379247   0.002131 177.944   <2e-16 ***
Height      -0.251903   0.003108 -81.054   <2e-16 ***
Age          0.002554   0.003263   0.783    0.435    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.3488 on 222 degrees of freedom
Multiple R-squared:  0.9932,    Adjusted R-squared:  0.9931 
F-statistic: 1.085e+04 on 3 and 222 DF,  p-value: < 2.2e-16

# Diagonistic 
plot(Model1,1)

data%>%
  ggplot(aes(Weight,BMI))+geom_smooth(method = lm,se=F)+ geom_point()+
  labs(title = "Weight Against BMI",y="Body mass index(Kg/M^2)",x="Weight(Kgs)")+ theme_classic()

`geom_smooth()` using formula = 'y ~ x'