This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
install.packages("caTools")
Error in install.packages : Updating loaded packages
library(MASS)
library(tidyverse)
library(caTools)
data1 <- read.csv("data_synth.csv", header=TRUE, stringsAsFactors=FALSE)
#column_list<-list("age","single","inschool","arv_start","arv_dura","lifetime")
data<-data1%>%dplyr::select(age,single,inschool,arv_start,arv_dura,lifetime,male)%>%drop_na(arv_dura,arv_start)
summary(data)
age single inschool arv_start arv_dura
Min. :15.00 Min. :0.000 Min. :0.0000 Min. : 1.000 Min. : 0.00
1st Qu.:16.00 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.: 1.000 1st Qu.:10.00
Median :17.00 Median :1.000 Median :1.0000 Median : 1.000 Median :14.00
Mean :17.22 Mean :0.962 Mean :0.6899 Mean : 4.848 Mean :12.11
3rd Qu.:19.00 3rd Qu.:1.000 3rd Qu.:1.0000 3rd Qu.: 7.000 3rd Qu.:16.00
Max. :23.00 Max. :1.000 Max. :1.0000 Max. :20.000 Max. :19.00
lifetime male
Min. :0.0000 Min. :0.0000
1st Qu.:0.0000 1st Qu.:0.0000
Median :0.0000 Median :0.0000
Mean :0.3101 Mean :0.3608
3rd Qu.:1.0000 3rd Qu.:1.0000
Max. :1.0000 Max. :1.0000
rename1<-function(x){
return (paste("Average",x,sep=" "))
}
rename2<-function(x){
return (paste("Deviation",x,sep=" "))
}
rename3<-function(x)
{
if (x==0)
{
return("Female")
}
else
{
return("Male")
}
}
print(rename1("male"))
[1] "Average male"
avg_data <- data %>% group_by(male) %>% summarise_each(funs(mean)) #data is skewed towards females
Warning: `summarise_each()` was deprecated in dplyr 0.7.0.
ℹ Please use `across()` instead.
Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
Warning: `funs()` was deprecated in dplyr 0.8.0.
ℹ Please use a list of either functions or lambdas:
# Simple named list: list(mean = mean, median = median)
# Auto named with `tibble::lst()`: tibble::lst(mean, median)
# Using lambdas list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
avg_data<-avg_data %>%rename_with(rename1)%>%rename(c("Sex"="Average male"))
print(avg_data)
dev_data <- data %>% group_by(male) %>% summarise_each(funs(sd)) #data is skewed towards females
Warning: `summarise_each()` was deprecated in dplyr 0.7.0.
ℹ Please use `across()` instead.
Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
Warning: `funs()` was deprecated in dplyr 0.8.0.
ℹ Please use a list of either functions or lambdas:
# Simple named list: list(mean = mean, median = median)
# Auto named with `tibble::lst()`: tibble::lst(mean, median)
# Using lambdas list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
dev_data<-dev_data %>%rename_with(rename2)%>%rename(c("Sex"="Deviation male"))
print(dev_data)
results_table<-left_join(dev_data,avg_data,by="Sex")
for (name in column_list)
{
results_table<-results_table%>%relocate(paste("Average",name,sep=" "),.before=paste("Deviation",name,sep=" "))
}
results_table["Sex"]<-results_table%>%dplyr::select(Sex)%>%mutate(Sex=if_else(Sex==0,"Female","Male"))
print(results_table)
write.csv(results_table,"./results/task1.csv")
#inschool's relationship (chisq test)
in_school_data<-data1%>%dplyr::select(inschool,hiv_talk_friend)
in_school_data<-table(in_school_data$inschool,in_school_data$hiv_talk_friend)
print(in_school_data)
0 1
0 16 37
1 71 43
print(chisq.test(in_school_data))
Pearson's Chi-squared test with Yates' continuity correction
data: in_school_data
X-squared = 13.673, df = 1, p-value = 0.0002176
#inschool does look pretty significant
#age spearmans corellation
df_plot<-data1%>%dplyr::select(age,hiv_talk_friend)%>%group_by(age)%>%summarise(percentage=mean(hiv_talk_friend))
age<-df_plot["age"]$age
hiv_talk_friend<-df_plot["percentage"]$percentage
plot(age,hiv_talk_friend)
#age is significant
lifetime_data<-data1%>%dplyr::select(lifetime,hiv_talk_friend)
lifetime_data<-table(lifetime_data$lifetime,lifetime_data$hiv_talk_friend)
print(lifetime_data)
0 1
0 75 38
1 12 42
print(chisq.test(lifetime_data))
Pearson's Chi-squared test with Yates' continuity correction
data: lifetime_data
X-squared = 26.797, df = 1, p-value = 2.26e-07
#lifetime sex is vary significant
single_data<-data1%>%dplyr::select(single,hiv_talk_friend)
single_data<-table(single_data$single,single_data$hiv_talk_friend)
print(single_data)
0 1
0 0 6
1 87 74
print(chisq.test(single_data))
Warning in chisq.test(single_data) :
Chi-squared approximation may be incorrect
Pearson's Chi-squared test with Yates' continuity correction
data: single_data
X-squared = 4.7761, df = 1, p-value = 0.02886
#being single is significant
male_data<-data1%>%dplyr::select(male,hiv_talk_friend)
male_data<-table(male_data$male,male_data$hiv_talk_friend)
print(male_data)
0 1
0 49 58
1 38 22
print(chisq.test(male_data))
Pearson's Chi-squared test with Yates' continuity correction
data: male_data
X-squared = 4.0619, df = 1, p-value = 0.04386
#not that significant
edu_data<-data1%>%dplyr::select(education_4cat,hiv_talk_friend)
edu_data<-table(edu_data$education_4cat,edu_data$hiv_talk_friend)
print(edu_data)
0 1
0 0 2
1 37 41
2 45 26
3 5 11
print(chisq.test(edu_data))
Warning in chisq.test(edu_data) :
Chi-squared approximation may be incorrect
Pearson's Chi-squared test
data: edu_data
X-squared = 9.2625, df = 3, p-value = 0.026
# Splitting dataset
split <- sample.split(data1, SplitRatio = 0.8)
train_reg <- subset(data1, split == "TRUE")
test_reg <- subset(data1, split == "FALSE")
#head(test_reg)
model<-glm(hiv_talk_friend~single+lifetime+age+inschool,family = "binomial",data=train_reg)
summary(model)
Call:
glm(formula = hiv_talk_friend ~ single + lifetime + age + inschool,
family = "binomial", data = train_reg)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 8.0303 952.9790 0.008 0.99328
single -15.6733 952.9759 -0.016 0.98688
lifetime 1.0246 0.5361 1.911 0.05597 .
age 0.4154 0.1457 2.851 0.00436 **
inschool -0.1922 0.5289 -0.363 0.71628
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 180.31 on 130 degrees of freedom
Residual deviance: 139.15 on 126 degrees of freedom
AIC: 149.15
Number of Fisher Scoring iterations: 15
metric<-function(threshold)
{
print("")
print(paste("Threshold value choosen : ",threshold))
predict_reg <- predict(model,
test_reg, type = "response")
#print(predict_reg)
predict_reg <- ifelse(predict_reg >threshold, 1, 0)
# Evaluating model accuracy
# using confusion matrix
cm<-table(test_reg$hiv_talk_friend, predict_reg)
missing_classerr <- mean(predict_reg != test_reg$hiv_talk_friend)
print(paste('Accuracy =', 1 - missing_classerr))
all_p=sum(predict_reg==1,na.rm=TRUE)
tp=sum(predict_reg[(test_reg$hiv_talk_friend==1)]==1,na.rm=TRUE)
fn=sum(test_reg$hiv_talk_friend[predict_reg==0],na.rm=TRUE)
print(paste("Precision = ",tp/all_p))
print(paste("Recall = ",tp/(tp+fn)))
print("")
}
metric(0.2)
[1] ""
[1] "Threshold value choosen : 0.2"
[1] "Accuracy = 0.527777777777778"
[1] "Precision = 0.571428571428571"
[1] "Recall = 0.761904761904762"
[1] ""
metric(0.3)
[1] ""
[1] "Threshold value choosen : 0.3"
[1] "Accuracy = 0.666666666666667"
[1] "Precision = 0.714285714285714"
[1] "Recall = 0.714285714285714"
[1] ""
metric(0.4)
[1] ""
[1] "Threshold value choosen : 0.4"
[1] "Accuracy = 0.666666666666667"
[1] "Precision = 0.736842105263158"
[1] "Recall = 0.666666666666667"
[1] ""
metric(0.5)
[1] ""
[1] "Threshold value choosen : 0.5"
[1] "Accuracy = 0.694444444444444"
[1] "Precision = 0.857142857142857"
[1] "Recall = 0.571428571428571"
[1] ""
metric(0.6)
[1] ""
[1] "Threshold value choosen : 0.6"
[1] "Accuracy = 0.638888888888889"
[1] "Precision = 0.833333333333333"
[1] "Recall = 0.476190476190476"
[1] ""
df_plot<-data1%>%dplyr::select(age,hiv_talk_friend)%>%group_by(age)%>%summarise(percentage=mean(hiv_talk_friend))
age<-df_plot["age"]$age
hiv_talk_friend<-df_plot["percentage"]$percentage
plot(age,hiv_talk_friend,pch="o", col="blue")
predict_reg<-predict(model,data1,type = "response")
age_cont<-data1%>%dplyr::select(age)
names(predict_reg)<-NULL
data1["predy"]<-predict_reg
df_predy<-data1%>%dplyr::select(age,predy)%>%group_by(age)%>%summarise(percentage=mean(predy))
predy<-df_predy$percentage
#head(df_predy)
points(df_predy$age,predy,col="red", pch="P")
lines(smooth.spline(df_predy$age,predy),col="orange",lwd=2)
legend(19.5,0.4,legend=c("Predicted Percentage","Ground Truth Percentage"),col=c("red","blue"),pch=c("P","o"),lty=c(1,0))
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.