•Download the Covid-19 dataset from the URL: https://www.kaggle.com/kimjihoo/coronavirusdataset The dataset is online and publicly available with a free Kaggle account.
•This dataset was released by Korea Centers for Disease Control and Prevention, and it contains information the Covid-19 cases in South Korea.
The aim of the final project is to predict mortality among confirmed CoVID-19 patients in South Korea using logistic regression and support vector machines.
library(readr)
PatientInfo <- read_csv("./PatientInfo.csv")
## Warning: One or more parsing issues, see `problems()` for details
## Rows: 5165 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): sex, age, country, province, city, infection_case, state
## dbl (3): patient_id, infected_by, contact_number
## date (4): symptom_onset_date, confirmed_date, released_date, deceased_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#View(PatientInfo)
### Delete patient with missing information
PatientInfo=PatientInfo[complete.cases(PatientInfo[ ,"age"]),]
PatientInfo=PatientInfo[complete.cases(PatientInfo[ ,"sex"]),]
PatientInfo=PatientInfo[complete.cases(PatientInfo[ ,"infection_case"]),]
PatientInfo=PatientInfo[complete.cases(PatientInfo[ ,"province"]),]
PatientInfo=PatientInfo[complete.cases(PatientInfo[ ,"state"]),]
#training data
dt = sort(sample(nrow(PatientInfo),nrow(PatientInfo)*0.80 ))
train <- PatientInfo[dt,]
test <- PatientInfo[-dt,]
Compare their performances on the test data. If you are doing the project, then you can train only one model (either logistic regression or support vector machines). As a hint, you could follow a similar approach to the following paper (in terms of choosing the predictors etc.): [Predicting CoVID-19 community mortality risk using machine learning and development of an online prognostic tool, 2020] by Das et al. However, this paper is given as an example, you do not have to follow it. Other approaches are also welcome.
##selection of variables (According to Das et al. (2020), PeerJ, DOI 10.7717/peerj.10083 we use four features: Age, Exposure, Provience and Sex; for Age we transform the date into 10 groups so we encode the date into the model, for Sex we transform Male/Female to 1/2 and the Provioence to be 1 if it is Seoul and -1 otherwise. Same for the infection_case to be 1 if the exposure is because it is from contact with patient and 0 otherwise)
train$sex = as.numeric(as.factor(train$sex))
test$sex = as.numeric(as.factor(test$sex))
train$age = as.numeric(as.factor(train$age))
test$age = as.numeric(as.factor(test$age))
train$province = ifelse(train$province=="Seoul", 1,0)
test$province = ifelse(test$province=="Seoul", 1, 0)
train$infection_case = ifelse(train$infection_case=="contact with patient", 1, 0)
test$infection_case = ifelse(test$infection_case=="contact with patient", 1, 0)
# train$infection_case = as.numeric(as.factor(train$infection_case))
# test$infection_case = as.numeric(as.factor(test$infection_case))
train$state = ifelse(train$state=="deceased", 1, -1)
test$state = ifelse(test$state=="deceased", 1, -1)
table(train$state)
##
## -1 1
## 2333 31
The data is imbalanced. We oversample the positive class to improve the performnace:
library(caTools)
library(ROSE)
## Loaded ROSE 0.0-4
cols <- c(2:3, 5, 7,14)
train <-train[,cols]
test <-test[,cols]
train <- ovun.sample(state ~.,data = train,N= 5000, seed=1,
method="over")$data
test <- ovun.sample(state ~.,data = test,N= 1000, seed=1,
method="over")$data
table(train$state)
##
## -1 1
## 2333 2667
table(test$state)
##
## -1 1
## 587 413
We next test our model on the “test data”.
If \(f(x) > 0\), we classify the input \(x\) as \(+1\) (Deceased), if \(f(x) < 0\), we classify the input \(x\) as \(-1\) (Not Deceased). The percent of the test data can you classify correctly:
train$state = ifelse(train$state==1, 1, 0)
test$state = ifelse(test$state==1, 1, 0)
mylogit <- glm(state ~ age+sex+province+infection_case, data = train, family = "binomial")
summary(mylogit)
##
## Call:
## glm(formula = state ~ age + sex + province + infection_case,
## family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.0632 -0.4369 0.1988 0.5428 1.7420
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.64290 0.27704 -31.197 <2e-16 ***
## age 1.03671 0.02879 36.014 <2e-16 ***
## sex 1.61323 0.09201 17.533 <2e-16 ***
## province -0.26843 0.11175 -2.402 0.0163 *
## infection_case -1.49173 0.09569 -15.589 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6909.1 on 4999 degrees of freedom
## Residual deviance: 3851.3 on 4995 degrees of freedom
## AIC: 3861.3
##
## Number of Fisher Scoring iterations: 5
confint(mylogit)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -9.1957549 -8.1093884
## age 0.9812855 1.0941664
## sex 1.4345238 1.7953258
## province -0.4881636 -0.0499505
## infection_case -1.6807066 -1.3054923
# Make predictions
summary(mylogit)$coef
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.6429012 0.27704499 -31.196742 1.179546e-213
## age 1.0367149 0.02878666 36.013728 5.100743e-284
## sex 1.6132260 0.09201200 17.532778 8.053713e-69
## province -0.2684324 0.11174548 -2.402177 1.629783e-02
## infection_case -1.4917310 0.09569289 -15.588734 8.683787e-55
prob_pred = predict(mylogit, type = 'response', newdata =test)
y_pred = ifelse(prob_pred > 0.5, 1, 0)
mean(y_pred == test$state)
## [1] 0.809
accuracy.meas(test$state, y_pred, threshold = 0.5)
##
## Call:
## accuracy.meas(response = test$state, predicted = y_pred, threshold = 0.5)
##
## Examples are labelled as positive when predicted is greater than 0.5
##
## precision: 0.684
## recall: 1.000
## F: 0.406
The confusion matrix:
table(y_pred ,test$state)
##
## y_pred 0 1
## 0 396 0
## 1 191 413
Next, we visualize the results on the training data
library(ggplot2)
train$state <- factor(train$state)
ggplot(train, aes(sex, 10*age, colour = state)) + geom_point()+scale_color_manual(breaks = c("0", "1"), values=c("red", "green"))+ ggtitle("Sex versus Age in the train set") + geom_abline(intercept = 100, slope = -27, color='purple',size=2)
library(ggplot2)
test$state <- factor(test$state)
ggplot(test, aes(sex, 10*age, colour = state)) + geom_point()+scale_color_manual(breaks = c("0", "1"), values=c("red", "green"))+ ggtitle("Sex versus Age in the train set") + geom_abline(intercept = 100, slope = -27, color='purple',size=1.2)
roc.curve(test$state, y_pred, plotit = TRUE, add.roc = FALSE,
n.thresholds=100)
## Area under the curve (AUC): 0.837