# Clear R environment
rm(list=ls())
# Set Working directory
setwd("C:/Analysis")DATA PROCESSING ,ANALYSIS & PRESENTATION
Import data set from excel to R
#Load set first data set called Sti_dataset1
library(readr)
Sti_dataset1 <- read_csv("Sti dataset1.csv")Rows: 226 Columns: 7
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): Date
dbl (6): IdNumber, Weight, Occupation, Church, Gender, Typeofsti
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Data Processing
#1 Value label
Sti_dataset1$Occupation<-ordered(Sti_dataset1$Occupation,levels=
c(1,2,3,4),labels=c("Formal","Informal","Student","Unemployed"))
Sti_dataset1$Church<-ordered(Sti_dataset1$Church,levels=
c(1,2,3,4,5,6,7),labels=c("Anglican","Apostolic","Atheist","Methodist","Pentecostal" ,"Roman catholic","Other"))
Sti_dataset1$Gender<-ordered(Sti_dataset1$Gender,levels=
c(1,2),labels=c("Male","Female"))#Load second data set called sti_dataset2
library(readxl)
Sti_dataset2 <- read_excel("Sti dataset2.xlsx")
#1 Value label
Sti_dataset2$casestatus<-ordered(Sti_dataset2$casestatus,levels=
c(0,1),labels=c("Not infected","Infected"))
Sti_dataset2$MaritalStatus<-ordered(Sti_dataset2$MaritalStatus,levels=
c(1,2,3,4,5),labels=c("Co-habiting","Divorce","Married","Single","Windowed"))Combine the two data set using merge function
# Combining data sets
Sti<-merge(Sti_dataset1,Sti_dataset2,by="IdNumber")Delete EducationLevel and TypeofSti from the Sti data set
# Delete TypeofSti and EducationLevel
Dropvar<-names(Sti)%in%c("EducationLevel","Typeofsti")
Sti<-Sti[!Dropvar]Data manipulation
# Create a new column
library(tidyverse)── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ purrr 1.0.2
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.2 ✔ tibble 3.2.1
✔ lubridate 1.9.3 ✔ tidyr 1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Sti<-Sti%>%
mutate(BMI=Weight/(Height/100)^2)%>%
mutate(Date=as.Date(Date),
Occupation=as.factor(Occupation),
Church=as.factor(Church),
Gender=as.factor(Gender),
MaritalStatus=as.factor(MaritalStatus),
casestatus=as.factor(casestatus))
# Round off BMI 2dp
Sti$BMI<-round(as.numeric(Sti$BMI),2)
# Create Age category from participant Age labels
Sti<-within(Sti,{
Age.cat<-NA
Age.cat[Age>=16 & Age<=25]<-"16-25"
Age.cat[Age>25 & Age<=34]<-"26-34"
Age.cat[Age>34 & Age<=44]<-"35-44"
Age.cat[Age>44 & Age<=54]<-"45-54"
Age.cat[Age>54 & Age<=64]<-"55-64"
})# Variable label
library(expss)Loading required package: maditr
To aggregate data: take(mtcars, mean_mpg = mean(mpg), by = am)
Attaching package: 'maditr'
The following objects are masked from 'package:dplyr':
between, coalesce, first, last
The following object is masked from 'package:purrr':
transpose
The following object is masked from 'package:readr':
cols
Use 'expss_output_rnotebook()' to display tables inside R Notebooks.
To return to the console output, use 'expss_output_default()'.
Attaching package: 'expss'
The following objects are masked from 'package:stringr':
fixed, regex
The following objects are masked from 'package:dplyr':
compute, contains, na_if, recode, vars, where
The following objects are masked from 'package:purrr':
keep, modify, modify_if, when
The following objects are masked from 'package:tidyr':
contains, nest
The following object is masked from 'package:ggplot2':
vars
Sti<-apply_labels(Sti,
Age="Age in Years",
Weight="Weight in Kgs",
Height="Height in Cm",
Occupation="Occupation status",
Gender="Sex",
casestatus="STI infections",
MaritalStatus="Marital Status"
)Explanatory Data Analysis:
Qualitative variable: Bar chart, Pie chart, Tables
Quantitative variable: Box plot , Scatter plot ,Histogram, correlation heat map
Quantitative vs Qualitative: Box plot
EDA for Qualitative variable
Churches
library(stringr)
df<-Sti%>%
dplyr::group_by(Church)%>%
dplyr::mutate(Church=str_to_title(Church))%>%
tally()
ggplot(df,aes(x=reorder(Church,n),y=n))+
geom_bar(stat = "identity",fill="blue",color="black")+
geom_text(aes(label = n),hjust=1.45)+coord_flip()+
theme_classic()+labs(x="Church",y="Count",title = "Participant religion")df1<-Sti%>%
dplyr::group_by(Gender)%>%
dplyr::mutate(Gender=str_to_title(Gender))%>%
tally()
ggplot(df1,aes(x=reorder(Gender,n),y=n))+
geom_bar(stat = "identity",fill="red",color="black")+
geom_text(aes(label = n),hjust=1.5)+
theme_classic()+labs(x="Gender",y="Count",title = "Partipant with their Gender")ggplot(Sti,aes(x=Age))+
geom_histogram(fill="blue",color="white")+
geom_vline(aes(xintercept=mean(Age)),color="red",
linewidth=1,linetype="dashed")+
theme_classic()+
labs(title = "Histogram showing Age distribution",
y= "Counts",
x=" Age in years")`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Box Plot
boxplot(Sti$Age,col = "blue")ggplot(Sti,aes(Age.cat,BMI,fill =Gender))+geom_boxplot()+
theme_classic()+
labs(title = "BMI distribution ")Scatter Plot
Sti%>%
ggplot(aes(Weight,BMI,colour = Gender))+ geom_point(method ="lm",se = FALSE)Warning in geom_point(method = "lm", se = FALSE): Ignoring unknown parameters:
`method` and `se`
Descriptive Statistics
library(table1)
Attaching package: 'table1'
The following objects are masked from 'package:base':
units, units<-
table1(~casestatus+MaritalStatus+Gender+Occupation+Church+Age.cat,data = Sti)| Overall (N=226) |
|
|---|---|
| STI infections | |
| Not infected | 162 (71.7%) |
| Infected | 64 (28.3%) |
| Marital Status | |
| Co-habiting | 9 (4.0%) |
| Divorce | 16 (7.1%) |
| Married | 149 (65.9%) |
| Single | 43 (19.0%) |
| Windowed | 9 (4.0%) |
| Sex | |
| Male | 108 (47.8%) |
| Female | 118 (52.2%) |
| Occupation status | |
| Formal | 91 (40.3%) |
| Informal | 45 (19.9%) |
| Student | 14 (6.2%) |
| Unemployed | 76 (33.6%) |
| Church | |
| Anglican | 16 (7.1%) |
| Apostolic | 52 (23.0%) |
| Atheist | 45 (19.9%) |
| Methodist | 13 (5.8%) |
| Pentecostal | 36 (15.9%) |
| Roman catholic | 41 (18.1%) |
| Other | 23 (10.2%) |
| Age.cat | |
| 16-25 | 98 (43.4%) |
| 26-34 | 96 (42.5%) |
| 35-44 | 25 (11.1%) |
| 45-54 | 5 (2.2%) |
| 55-64 | 2 (0.9%) |
table1(~Age+Height+BMI+Weight|Gender,data = Sti)| Male (N=108) |
Female (N=118) |
Overall (N=226) |
|
|---|---|---|---|
| Age in Years | |||
| Mean (SD) | 27.0 (5.77) | 29.0 (8.18) | 28.1 (7.18) |
| Median [Min, Max] | 26.0 [16.0, 46.0] | 27.0 [19.0, 63.0] | 26.5 [16.0, 63.0] |
| Height in Cm | |||
| Mean (SD) | 162 (8.48) | 160 (7.54) | 161 (8.00) |
| Median [Min, Max] | 162 [138, 182] | 160 [143, 191] | 161 [138, 191] |
| BMI | |||
| Mean (SD) | 20.7 (4.12) | 20.8 (4.31) | 20.7 (4.21) |
| Median [Min, Max] | 20.1 [13.4, 36.3] | 20.3 [13.5, 34.1] | 20.2 [13.4, 36.3] |
| Weight in Kgs | |||
| Mean (SD) | 54.0 (11.9) | 53.5 (11.5) | 53.7 (11.7) |
| Median [Min, Max] | 51.9 [33.0, 100] | 50.5 [32.8, 94.0] | 51.0 [32.8, 100] |
table1(~Age+Height+BMI+Weight,data = Sti)| Overall (N=226) |
|
|---|---|
| Age in Years | |
| Mean (SD) | 28.1 (7.18) |
| Median [Min, Max] | 26.5 [16.0, 63.0] |
| Height in Cm | |
| Mean (SD) | 161 (8.00) |
| Median [Min, Max] | 161 [138, 191] |
| BMI | |
| Mean (SD) | 20.7 (4.21) |
| Median [Min, Max] | 20.2 [13.4, 36.3] |
| Weight in Kgs | |
| Mean (SD) | 53.7 (11.7) |
| Median [Min, Max] | 51.0 [32.8, 100] |
table1(~casestatus+MaritalStatus+Occupation+Church+Age.cat|Gender,data = Sti)| Male (N=108) |
Female (N=118) |
Overall (N=226) |
|
|---|---|---|---|
| STI infections | |||
| Not infected | 76 (70.4%) | 86 (72.9%) | 162 (71.7%) |
| Infected | 32 (29.6%) | 32 (27.1%) | 64 (28.3%) |
| Marital Status | |||
| Co-habiting | 3 (2.8%) | 6 (5.1%) | 9 (4.0%) |
| Divorce | 11 (10.2%) | 5 (4.2%) | 16 (7.1%) |
| Married | 68 (63.0%) | 81 (68.6%) | 149 (65.9%) |
| Single | 21 (19.4%) | 22 (18.6%) | 43 (19.0%) |
| Windowed | 5 (4.6%) | 4 (3.4%) | 9 (4.0%) |
| Occupation status | |||
| Formal | 40 (37.0%) | 51 (43.2%) | 91 (40.3%) |
| Informal | 17 (15.7%) | 28 (23.7%) | 45 (19.9%) |
| Student | 6 (5.6%) | 8 (6.8%) | 14 (6.2%) |
| Unemployed | 45 (41.7%) | 31 (26.3%) | 76 (33.6%) |
| Church | |||
| Anglican | 13 (12.0%) | 3 (2.5%) | 16 (7.1%) |
| Apostolic | 27 (25.0%) | 25 (21.2%) | 52 (23.0%) |
| Atheist | 14 (13.0%) | 31 (26.3%) | 45 (19.9%) |
| Methodist | 8 (7.4%) | 5 (4.2%) | 13 (5.8%) |
| Pentecostal | 17 (15.7%) | 19 (16.1%) | 36 (15.9%) |
| Roman catholic | 17 (15.7%) | 24 (20.3%) | 41 (18.1%) |
| Other | 12 (11.1%) | 11 (9.3%) | 23 (10.2%) |
| Age.cat | |||
| 16-25 | 52 (48.1%) | 46 (39.0%) | 98 (43.4%) |
| 26-34 | 46 (42.6%) | 50 (42.4%) | 96 (42.5%) |
| 35-44 | 9 (8.3%) | 16 (13.6%) | 25 (11.1%) |
| 45-54 | 1 (0.9%) | 4 (3.4%) | 5 (2.2%) |
| 55-64 | 0 (0%) | 2 (1.7%) | 2 (0.9%) |
Inferential Statistics (Measure of Association
# Chi-square test statistics
library(summarytools)
Attaching package: 'summarytools'
The following objects are masked from 'package:table1':
label, label<-
The following object is masked from 'package:tibble':
view
ctable(
x=Sti$Age.cat,
y=Sti$casestatus,
chisq = TRUE,
headings=FALSE,
totals = FALSE
)Warning in chisq.test(freq_table_min): Chi-squared approximation may be
incorrect
--------- ------------ -------------- ------------
casestatus Not infected Infected
Age.cat
16-25 65 ( 66.3%) 33 (33.7%)
26-34 76 ( 79.2%) 20 (20.8%)
35-44 16 ( 64.0%) 9 (36.0%)
45-54 3 ( 60.0%) 2 (40.0%)
55-64 2 (100.0%) 0 ( 0.0%)
--------- ------------ -------------- ------------
----------------------------
Chi.squared df p.value
------------- ---- ---------
5.887 4 0.2077
----------------------------
#====Fisher exact test
fisher.test(Sti$Age.cat,Sti$casestatus)
Fisher's Exact Test for Count Data
data: Sti$Age.cat and Sti$casestatus
p-value = 0.1768
alternative hypothesis: two.sided
## Casestatus vs Marital Status
ctable(
x=Sti$MaritalStatus,
y=Sti$casestatus,
chisq = TRUE,
headings=FALSE,
totals = FALSE
)Warning in chisq.test(freq_table_min): Chi-squared approximation may be
incorrect
--------------- ------------ -------------- ------------
casestatus Not infected Infected
MaritalStatus
Co-habiting 8 (88.9%) 1 (11.1%)
Divorce 6 (37.5%) 10 (62.5%)
Married 115 (77.2%) 34 (22.8%)
Single 29 (67.4%) 14 (32.6%)
Windowed 4 (44.4%) 5 (55.6%)
--------------- ------------ -------------- ------------
----------------------------
Chi.squared df p.value
------------- ---- ---------
16.4121 4 0.0025
----------------------------
fisher.test(Sti$MaritalStatus,Sti$casestatus)
Fisher's Exact Test for Count Data
data: Sti$MaritalStatus and Sti$casestatus
p-value = 0.002931
alternative hypothesis: two.sided
Simple Linear Regression
data<-Sti%>%
select(Weight,BMI,Age,Height)
Model<-lm(BMI~Weight,data = data)
summary(Model)
Call:
lm(formula = BMI ~ Weight, data = data)
Residuals:
Min 1Q Median 3Q Max
-6.2943 -1.1245 -0.0642 1.1738 6.2964
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.5406 0.6050 5.852 1.71e-08 ***
Weight 0.3201 0.0110 29.092 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.93 on 224 degrees of freedom
Multiple R-squared: 0.7907, Adjusted R-squared: 0.7898
F-statistic: 846.4 on 1 and 224 DF, p-value: < 2.2e-16
#Diagnostic plot
plot(Model,1)#================
Model1<-lm(BMI~Weight+Height+Age,data = data)
summary(Model1)
Call:
lm(formula = BMI ~ Weight + Height + Age, data = data)
Residuals:
Min 1Q Median 3Q Max
-1.24786 -0.16848 0.00388 0.13767 1.60679
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 40.834531 0.487916 83.692 <2e-16 ***
Weight 0.379247 0.002131 177.944 <2e-16 ***
Height -0.251903 0.003108 -81.054 <2e-16 ***
Age 0.002554 0.003263 0.783 0.435
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.3488 on 222 degrees of freedom
Multiple R-squared: 0.9932, Adjusted R-squared: 0.9931
F-statistic: 1.085e+04 on 3 and 222 DF, p-value: < 2.2e-16
# Diagonistic
plot(Model1,1)data%>%
ggplot(aes(Weight,BMI))+geom_smooth(method = lm,se=F)+ geom_point()+
labs(title = "Weight Against BMI",y="Body mass index(Kg/M^2)",x="Weight(Kgs)")+ theme_classic()`geom_smooth()` using formula = 'y ~ x'