setwd("D:/My Projects")
getwd()
STIData <- read.csv("D:/My Projects/STIData.csv")
View(STIData)
##Checking Duplicates
dups<-duplicated(STIData$IdNumber)
table(dups)
STIData$IdNumber [duplicated(STIData$IdNumber)]
STIData<- STIData[order(STIData$IdNumber),]
View(STIData[STIData$IdNumber==51,])
STIData<-STIData[!(STIData$IdNumber==51 & STIData$A1Age ==23),]
##Cleaning Inconsistencies
table(STIData$CaseStatus)
STIData<-STIData[order(-STIData$CaseStatus),]
View(STIData[STIData$CaseStatus==3,])
STIData$CaseStatus[STIData$IdNumber==1 & STIData$A1Age==30]=1
STIData$CaseStatus[STIData$IdNumber==31 & STIData$A1Age==23]=1
table(STIData$CaseStatus)
##Handling Missing
View(STIData[is.na(STIData$Height),])
attach(STIData)
summary(Height)
STIData$Height[STIData$IdNumber==187]=161
##Checking out of range
summary(Weight)
summary(A1Age)
##Creating new variables
STIData$AgeCat<-NA
STIData$AgeCat[STIData$A1Age<=35]=1
STIData$AgeCat[STIData$A1Age>=35]=0
STIData$AgeCat<-factor(STIData$AgeCat, levels = c(1,0),labels = c("16-34 years","35-63 years"))
table(STIData$AgeCat)
STIData$occupation<-NA
STIData$occupation[STIData$A2Occupation=="1 unemployed"|STIData$A2Occupation== "4 student"]=1
STIData$occupation[STIData$A2Occupation =="2 informal"|STIData$A2Occupation =="3 formal"]=0
STIData$occupation<-factor(STIData$occupation, levels = c(1,0),labels = c("Unemployed", "Employed"))
STIData$church <- NA
STIData$church[STIData$A3Church=="7 roman catholic" |STIData$A3Church=="2 apostolic"| STIData$A3Church=="3 methodist"|STIData$A3Church=="4 anglican"|STIData$A3Church=="5 pentecostal"]=1
STIData$church[STIData$A3Church=="6 atheist" |STIData$A3Church=="8 other "]=0
STIData$church<-factor(STIData$church, levels = c(1,0),labels = c("Believers", "Non-Believers"))
STIData$education<-NA
STIData$education[STIData$A4LevelOfEducation=="1 none"|STIData$A4LevelOfEducation=="2 primary"]=1
STIData$education[STIData$A4LevelOfEducation=="3 secondary" |STIData$A4LevelOfEducation=="4 tertiary"]=0
STIData$education <- factor(STIData$education, levels = c(1,0), labels = c("Primary & Below","Secondary & Above"))
STIData$marital<-NA
STIData$marital[STIData$A5MaritalStatus=="2 married"]=1
STIData$marital[STIData$A5MaritalStatus=="1 single"|STIData$A5MaritalStatus=="5 widowed"|STIData$A5MaritalStatus=="3 co-habiting"|STIData$A5MaritalStatus=="4 divorcee"]=0
STIData$marital<-factor(STIData$marital, levels = c(1,0),labels = c("Married", "Not Married"))
## Creating new varible and handling missing
table(Sex)
STIData$Sex_Cat<-NA
STIData$Sex_Cat[STIData$Sex=="Female"]=1
STIData$Sex_Cat[STIData$Sex=="Male"]=0
STIData$Sex_Cat<-factor(STIData$Sex_Cat,levels = c(1,0),labels = c("Female","Male"))
class(STIData$Sex_Cat)
table(STIData$Sex_Cat)
## Handling missing
View(STIData[is.na(STIData$Sex_Cat),])
STIData$Sex_Cat[STIData$IdNumber==48]="Male"
STIData$Sex_Cat[STIData$IdNumber==213]="Female"
table(STIData$Sex_Cat)
# Calculating BMI
STIData$BMI <- STIData$Weight / ((STIData$Height / 100) ^ 2)
# Creating BMI_Cat
STIData$BMI_Cat <- NA
# Assigning BMI categories using WHO Standards
STIData$BMI_Cat[STIData$BMI < 18.5]="Underweight"
STIData$BMI_Cat[STIData$BMI >= 18.5 & STIData$BMI < 25]= "Normal"
STIData$BMI_Cat[STIData$BMI >= 25 & STIData$BMI < 30]= "Overweight"
STIData$BMI_Cat[STIData$BMI >= 30]="Obese"
table(STIData$BMI_Cat)
#Saving STIData as a CSV file
write.csv(STIData, file = "cleaned_STIData.csv", row.names = FALSE)
# 2.Generating frequency tables
attach(STIData)
table(AgeCat)
table(Sex_Cat)
table(occupation)
table(church)
table(education)
table(marital)
table(BMI_Cat)
###Histogram and pie chart
library(ggplot2)
class(A1Age)
hist(STIData$A1Age, main="Age Histogram", ylab="Frequency",col ="blue", xlab="AgeCat")
##Pie chart
BMI <- table(BMI_Cat)
pie(BMI,
main = "Pie Chart of BMI",
col = c('blue','pink','yellow','red'),
labels = paste0(names(BMI),round(prop.table(BMI) * 100, 1), "%"))
# 3.Prevalence of STI
#Creating variable STI
STIData$STI<-NA
STIData$STI[STIData$CaseStatus==1]=1
STIData$STI[STIData$CaseStatus==2]=0
STIData$STI=factor(STIData$STI,levels=c(1,0),labels=c("Positive","Negative"))
# Frequency distribution table
sti <- table(STIData$STI)
cat("Frequency Distribution of STI:\n")
print(sti)
#Calculating prevalence of sti
sti_prop <- prop.table(sti)
prevalence <- sti_prop["Positive"] * 100
cat("\nSTI Prevalence: ", round(prevalence, 1), "%\n")
# Generating pie chart using frequency distribution
sti <- table(STIData$STI)
pie(sti,
main = "STI Prevalence",
col = c("red", "blue"),
labels = paste(names(sti), "\n", round(prop.table(sti) * 100, 1), "%"))
# 4. Establishing association between STI and other factors
# a) socio-demographic
attach(STIData)
var1 <- c("AgeCat","Sex_Cat","occupation","church","education","marital","BMI_Cat")
chi2 <- function(var, STIData, STI) {
#creating contingency table
table_data <- table(STIData[[var]], STIData[[STI]])
#calculating row percentages
row_percent <- prop.table(table_data, margin = 1) * 100
#chi-square test
chi2_test <- chisq.test(table_data)
#results
cat("\nVariable:", var, "\n")
cat("Contingency Table:\n")
print(table_data)
cat("\nRow Percentages:\n")
print(round(row_percent, 2))
cat("\nChi-Square Test P-value:", chi2_test$p.value, "\n")
cat("-----------------------\n")
}
for (var in var1) {
chi2(var, STIData, "STI")
}
#4b. Social Capital
# Contingency table
tab <- table(STIData$D1BurialSociety, STIData$STI)
print(tab)
# Row percentages
prop.table(tab, margin = 1) * 100
# P-value
chisq.test(tab)$p.value
tab <- table(STIData$D1religiousgrp, STIData$STI)
print(tab)
prop.table(tab, margin = 1) * 100
chisq.test(tab)$p.value
tab <- table(STIData$D1savingsClub, STIData$STI)
print(tab)
prop.table(tab, margin = 1) * 100
chisq.test(tab)$p.value
tab <- table(STIData$D1tradersAssoc, STIData$STI)
print(tab)
prop.table(tab, margin = 1) * 100
chisq.test(tab)$p.value
tab <- table(STIData$Belong, STIData$STI)
print(tab)
prop.table(tab, margin = 1) * 100
chisq.test(tab)$p.value
tab <- table(STIData$ReceiveHelp, STIData$STI)
print(tab)
prop.table(tab, margin = 1) * 100
chisq.test(tab)$p.value
#4c. Risky sexual Behavior
attach(STIData)
STIData$usecondom<-NA
STIData$usecondom[STIData$N12UseCondom=='1 yes']=1
STIData$usecondom[STIData$N12UseCondom=='2 no' | STIData$N12UseCondom=='2 No' ]=0
STIData$usecondom=factor(STIData$usecondom,levels=c(1,0),labels=c("yes","no"))
summary(N16HowOldIs)
STIData$howold<-NA
STIData$howold[STIData$N16HowOldIs<=29]=1
STIData$howold[STIData$N16HowOldIs >=29]=0
STIData$howold<-factor(STIData$howold, levels = c(1,0),labels = c("16-28 years","29-57 years"))
summary(N2SexDebut)
STIData$firstsex<-NA
STIData$firstsex[STIData$N2SexDebut<=19]=1
STIData$firstsex[STIData$N2SexDebut>=19]=0
STIData$firstsex<-factor(STIData$firstsex, levels = c(1,0),labels = c("12-18 years","19-32 years"))
var3<-c("N10givereceiveforsex","N11Usedcondom","usecondom","N13TakenAlcohol","N14DoYouHave","N15LivingTogether","howold","firstsex","N3HadAnSti","N9Relationship","AgeFirstSex","AlcoholUse","SexPartner1year","SexPartner3month","SexPartnerLife3","LastPartnerSpouse")
chi2 <- function(var, STIData, STI) {
#creating contingency table
table_data <- table(STIData[[var]], STIData[[STI]])
#calculating row percentages
row_percent <- prop.table(table_data, margin = 1) * 100
#chi-square test
chi2_test <- chisq.test(table_data)
#results
cat("\nVariable:", var, "\n")
cat("Contingency Table:\n")
print(table_data)
cat("\nRow Percentages:\n")
print(round(row_percent, 2))
cat("\nChi-Square Test P-value:", chi2_test$p.value, "\n")
cat("-----------------------\n")
}
for (var in var3) {
chi2(var, STIData, "STI")
}
# 5. Multivariable logistic regression
significant <- c("SexPartnerLife3","SexPartner3month","SexPartner1year","AlcoholUse","N3HadAnSti","N15LivingTogether","N13TakenAlcohol","Belong")
Model <- glm(STI ~ SexPartnerLife3+SexPartner3month+SexPartner1year+AlcoholUse+N3HadAnSti+N15LivingTogether+N13TakenAlcohol+Belong,family=binomial(link=logit),data=STIData)
summary(Model)
coef(Model)
OR <- exp(coef(Model))
OR
CI <- exp(confint(Model))
CI
# 6.
library(ggplot2)
install.packages("dplyr")
library(dplyr)
# Boxplot of BMI by Sex
ggplot(STIData, aes(x = Sex_Cat, y = BMI_Cat, fill = Sex_Cat)) +
geom_boxplot() +
labs(title = "Distribution of BMI by Sex",
x = "Sex_Cat",
y = "BMI_Cat") +
theme_minimal() +
theme(legend.position = "none")
# Density plot of BMI
ggplot(STIData, aes(x = BMI_Cat)) +
geom_density(fill='violet', alpha = 0.7) +
labs(title = "Density Plot of BMI",
x = "BMI_Cat",
y = "Density") +
theme_minimal()
# Faceted bar plot of STI prevalence by AgeGroup and Sex
ggplot(STIData, aes(x = Sex_Cat, fill = STI)) +
geom_bar(position = "fill") +
facet_wrap(~ AgeCat) +
labs(title = "STI Prevalence by Age Group and Sex",
x = "Sex_Cat",
y = "Proportion",
fill = "STI Status") +
theme_minimal() +
scale_y_continuous(labels = scales::percent) # Show y-axis as percentages
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.