setwd("D:/My Projects")
getwd()
STIData <- read.csv("D:/My Projects/STIData.csv")
View(STIData)
##Checking Duplicates
dups<-duplicated(STIData$IdNumber)
table(dups)

STIData$IdNumber [duplicated(STIData$IdNumber)]
STIData<- STIData[order(STIData$IdNumber),]
View(STIData[STIData$IdNumber==51,])            
STIData<-STIData[!(STIData$IdNumber==51 & STIData$A1Age ==23),] 

##Cleaning Inconsistencies
table(STIData$CaseStatus)
STIData<-STIData[order(-STIData$CaseStatus),]
View(STIData[STIData$CaseStatus==3,])
STIData$CaseStatus[STIData$IdNumber==1 & STIData$A1Age==30]=1
STIData$CaseStatus[STIData$IdNumber==31 & STIData$A1Age==23]=1
table(STIData$CaseStatus) 

##Handling Missing
View(STIData[is.na(STIData$Height),]) 

attach(STIData)
summary(Height)
STIData$Height[STIData$IdNumber==187]=161

##Checking out of range
summary(Weight)
summary(A1Age) 

##Creating new variables
STIData$AgeCat<-NA
STIData$AgeCat[STIData$A1Age<=35]=1
STIData$AgeCat[STIData$A1Age>=35]=0
STIData$AgeCat<-factor(STIData$AgeCat, levels = c(1,0),labels = c("16-34 years","35-63 years"))
table(STIData$AgeCat) 

STIData$occupation<-NA
STIData$occupation[STIData$A2Occupation=="1 unemployed"|STIData$A2Occupation== "4 student"]=1
STIData$occupation[STIData$A2Occupation =="2 informal"|STIData$A2Occupation =="3 formal"]=0
STIData$occupation<-factor(STIData$occupation, levels = c(1,0),labels = c("Unemployed", "Employed"))

STIData$church <- NA
STIData$church[STIData$A3Church=="7 roman catholic" |STIData$A3Church=="2 apostolic"| STIData$A3Church=="3 methodist"|STIData$A3Church=="4 anglican"|STIData$A3Church=="5 pentecostal"]=1
STIData$church[STIData$A3Church=="6 atheist" |STIData$A3Church=="8 other "]=0

STIData$church<-factor(STIData$church, levels = c(1,0),labels = c("Believers", "Non-Believers"))

STIData$education<-NA                                 
STIData$education[STIData$A4LevelOfEducation=="1 none"|STIData$A4LevelOfEducation=="2 primary"]=1
STIData$education[STIData$A4LevelOfEducation=="3 secondary" |STIData$A4LevelOfEducation=="4 tertiary"]=0
STIData$education <- factor(STIData$education, levels = c(1,0), labels = c("Primary & Below","Secondary & Above"))

STIData$marital<-NA
STIData$marital[STIData$A5MaritalStatus=="2 married"]=1
STIData$marital[STIData$A5MaritalStatus=="1 single"|STIData$A5MaritalStatus=="5 widowed"|STIData$A5MaritalStatus=="3 co-habiting"|STIData$A5MaritalStatus=="4 divorcee"]=0

STIData$marital<-factor(STIData$marital, levels = c(1,0),labels = c("Married", "Not Married"))






## Creating new varible and handling missing
table(Sex) 

STIData$Sex_Cat<-NA
STIData$Sex_Cat[STIData$Sex=="Female"]=1
STIData$Sex_Cat[STIData$Sex=="Male"]=0
STIData$Sex_Cat<-factor(STIData$Sex_Cat,levels = c(1,0),labels = c("Female","Male"))
class(STIData$Sex_Cat)
table(STIData$Sex_Cat) 

## Handling missing 
View(STIData[is.na(STIData$Sex_Cat),]) 
STIData$Sex_Cat[STIData$IdNumber==48]="Male"
STIData$Sex_Cat[STIData$IdNumber==213]="Female"
table(STIData$Sex_Cat) 


# Calculating BMI 
STIData$BMI <- STIData$Weight / ((STIData$Height / 100) ^ 2)

# Creating BMI_Cat 
STIData$BMI_Cat <- NA

# Assigning BMI categories using WHO Standards
STIData$BMI_Cat[STIData$BMI < 18.5]="Underweight"
STIData$BMI_Cat[STIData$BMI >= 18.5 & STIData$BMI < 25]= "Normal"
STIData$BMI_Cat[STIData$BMI >= 25 & STIData$BMI < 30]= "Overweight"
STIData$BMI_Cat[STIData$BMI >= 30]="Obese"
table(STIData$BMI_Cat)

#Saving STIData as a CSV file
write.csv(STIData, file = "cleaned_STIData.csv", row.names = FALSE)

# 2.Generating frequency tables
attach(STIData)
table(AgeCat)
table(Sex_Cat)
table(occupation)
table(church)
table(education)
table(marital)
table(BMI_Cat)


###Histogram and pie chart
library(ggplot2)
class(A1Age)
hist(STIData$A1Age, main="Age Histogram", ylab="Frequency",col ="blue", xlab="AgeCat")

##Pie chart
BMI <- table(BMI_Cat) 
pie(BMI,
    main = "Pie Chart of BMI",
    col = c('blue','pink','yellow','red'),
    labels = paste0(names(BMI),round(prop.table(BMI) * 100, 1), "%"))
    
# 3.Prevalence of STI
#Creating variable STI
STIData$STI<-NA
STIData$STI[STIData$CaseStatus==1]=1
STIData$STI[STIData$CaseStatus==2]=0
STIData$STI=factor(STIData$STI,levels=c(1,0),labels=c("Positive","Negative")) 

# Frequency distribution table
sti <- table(STIData$STI)
cat("Frequency Distribution of STI:\n")
print(sti)
#Calculating prevalence of sti
sti_prop <- prop.table(sti)
prevalence <- sti_prop["Positive"] * 100
cat("\nSTI Prevalence: ", round(prevalence, 1), "%\n")


# Generating pie chart using frequency distribution
sti <- table(STIData$STI)
pie(sti, 
    main = "STI Prevalence",
    col = c("red", "blue"),
    labels = paste(names(sti), "\n", round(prop.table(sti) * 100, 1), "%"))
    
# 4. Establishing association between STI and other factors
# a) socio-demographic
attach(STIData)
var1 <- c("AgeCat","Sex_Cat","occupation","church","education","marital","BMI_Cat")

chi2 <- function(var, STIData, STI) {
  #creating contingency table
  table_data <- table(STIData[[var]], STIData[[STI]])
  
  #calculating row percentages
  row_percent <- prop.table(table_data, margin = 1) * 100
  
  #chi-square test
  chi2_test <- chisq.test(table_data)
  
  #results
  cat("\nVariable:", var, "\n")
  cat("Contingency Table:\n")
  print(table_data)
  cat("\nRow Percentages:\n")
  print(round(row_percent, 2))
  cat("\nChi-Square Test P-value:", chi2_test$p.value, "\n")
  cat("-----------------------\n")
}

for (var in var1) {
  chi2(var, STIData, "STI")
}

#4b. Social Capital

# Contingency table
tab <- table(STIData$D1BurialSociety, STIData$STI)
print(tab)
# Row percentages
prop.table(tab, margin = 1) * 100
# P-value
chisq.test(tab)$p.value 

tab <- table(STIData$D1religiousgrp, STIData$STI)
print(tab)
prop.table(tab, margin = 1) * 100
chisq.test(tab)$p.value

tab <- table(STIData$D1savingsClub, STIData$STI)
print(tab)
prop.table(tab, margin = 1) * 100
chisq.test(tab)$p.value

tab <- table(STIData$D1tradersAssoc, STIData$STI)
print(tab)
prop.table(tab, margin = 1) * 100
chisq.test(tab)$p.value

tab <- table(STIData$Belong, STIData$STI)
print(tab)
prop.table(tab, margin = 1) * 100
chisq.test(tab)$p.value

tab <- table(STIData$ReceiveHelp, STIData$STI)
print(tab)
prop.table(tab, margin = 1) * 100
chisq.test(tab)$p.value

#4c. Risky sexual Behavior
attach(STIData)
STIData$usecondom<-NA
STIData$usecondom[STIData$N12UseCondom=='1 yes']=1
STIData$usecondom[STIData$N12UseCondom=='2 no' | STIData$N12UseCondom=='2 No' ]=0
STIData$usecondom=factor(STIData$usecondom,levels=c(1,0),labels=c("yes","no")) 

summary(N16HowOldIs)
STIData$howold<-NA
STIData$howold[STIData$N16HowOldIs<=29]=1
STIData$howold[STIData$N16HowOldIs >=29]=0
STIData$howold<-factor(STIData$howold, levels = c(1,0),labels = c("16-28 years","29-57 years"))

summary(N2SexDebut)
STIData$firstsex<-NA
STIData$firstsex[STIData$N2SexDebut<=19]=1
STIData$firstsex[STIData$N2SexDebut>=19]=0
STIData$firstsex<-factor(STIData$firstsex, levels = c(1,0),labels = c("12-18 years","19-32 years"))

var3<-c("N10givereceiveforsex","N11Usedcondom","usecondom","N13TakenAlcohol","N14DoYouHave","N15LivingTogether","howold","firstsex","N3HadAnSti","N9Relationship","AgeFirstSex","AlcoholUse","SexPartner1year","SexPartner3month","SexPartnerLife3","LastPartnerSpouse")

chi2 <- function(var, STIData, STI) {
  #creating contingency table
  table_data <- table(STIData[[var]], STIData[[STI]])
  
  #calculating row percentages
  row_percent <- prop.table(table_data, margin = 1) * 100
  
  #chi-square test
  chi2_test <- chisq.test(table_data)
  
  #results
  cat("\nVariable:", var, "\n")
  cat("Contingency Table:\n")
  print(table_data)
  cat("\nRow Percentages:\n")
  print(round(row_percent, 2))
  cat("\nChi-Square Test P-value:", chi2_test$p.value, "\n")
  cat("-----------------------\n")
}

for (var in var3) {
  chi2(var, STIData, "STI")
}

# 5. Multivariable logistic regression

significant <- c("SexPartnerLife3","SexPartner3month","SexPartner1year","AlcoholUse","N3HadAnSti","N15LivingTogether","N13TakenAlcohol","Belong")


Model <- glm(STI ~ SexPartnerLife3+SexPartner3month+SexPartner1year+AlcoholUse+N3HadAnSti+N15LivingTogether+N13TakenAlcohol+Belong,family=binomial(link=logit),data=STIData)
summary(Model)
coef(Model)
OR <- exp(coef(Model))
OR
CI <- exp(confint(Model))
CI

# 6. 
library(ggplot2)
install.packages("dplyr")
library(dplyr)

# Boxplot of BMI by Sex
ggplot(STIData, aes(x = Sex_Cat, y = BMI_Cat, fill = Sex_Cat)) +
  geom_boxplot() +
  labs(title = "Distribution of BMI by Sex",
       x = "Sex_Cat",
       y = "BMI_Cat") +
  theme_minimal() +
  theme(legend.position = "none")


# Density plot of BMI
ggplot(STIData, aes(x = BMI_Cat)) +
  geom_density(fill='violet', alpha = 0.7) +
  labs(title = "Density Plot of BMI",
       x = "BMI_Cat",
       y = "Density") +
  theme_minimal() 

# Faceted bar plot of STI prevalence by AgeGroup and Sex
ggplot(STIData, aes(x = Sex_Cat, fill = STI)) +
  geom_bar(position = "fill") +
  facet_wrap(~ AgeCat) +
  labs(title = "STI Prevalence by Age Group and Sex",
       x = "Sex_Cat",
       y = "Proportion",
       fill = "STI Status") +
  theme_minimal() +
  scale_y_continuous(labels = scales::percent)  # Show y-axis as percentages

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.