#Session Information
sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 18363)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_South Africa.1252
## [2] LC_CTYPE=English_South Africa.1252
## [3] LC_MONETARY=English_South Africa.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_South Africa.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] compiler_3.5.2 magrittr_1.5 tools_3.5.2 htmltools_0.3.6
## [5] yaml_2.2.0 Rcpp_1.0.1 stringi_1.4.3 rmarkdown_1.12
## [9] knitr_1.22 stringr_1.4.0 xfun_0.6 digest_0.6.18
## [13] evaluate_0.13
(Disclaimer - this is work in progress. Just attempted to compare logistic regression and a decision tree to predict whether the number of cases in a country can be used to predict recovery of a patient infected by coronavirus. )
The coronavirus package provides a tidy format dataset of the 2019 Novel Coronavirus COVID-19 (2019-nCoV) epidemic. The packaged was released by Rami Krispin on https://github.com/RamiKrispin/coronavirus
The raw data was pulled from the Johns Hopkins University Center for Systems Science and Engineering (JHU CCSE) Coronavirus repository.
The provided data relates to people that have been confirmed as infected by the coronavirus, died or recovered from the virus. The dataset has the columns “Province.State”, “Country.Region”, “Lat”,“Long”,“date” and “cases”.I want to investigate the following:
Is the chance of recovery lower/higher for someone infected by corona virus based on the number of cases in a country or where they were infected ?
A target variable “recovery” was computed with 1 indicating a recovered person and 0 indicating someone who is confirmed as infected or died from coronavirus.
The package by Rami Krispin is currently available on Guthub and installation is as follows:
# install.packages("devtools")
#devtools::install_github("RamiKrispin/coronavirus", force = TRUE)
The package contains a single dataset called coronavirus
library(coronavirus)
data("coronavirus")
Just to explore the data and sense-check the loaded data,let us look at the column names.
names(coronavirus)
## [1] "Province.State" "Country.Region" "Lat" "Long"
## [5] "date" "cases" "type"
Lets check the first 6 rows of data.
head(coronavirus)
## Province.State Country.Region Lat Long date cases
## 1 Japan 35.67620 139.6503 2020-01-21 1
## 2 Thailand 13.75630 100.5018 2020-01-21 2
## 3 Beijing Mainland China 40.18238 116.4142 2020-01-21 10
## 4 Chongqing Mainland China 30.05718 107.8740 2020-01-21 5
## 5 Guangdong Mainland China 23.33841 113.4220 2020-01-21 17
## 6 Hubei Mainland China 30.97564 112.2707 2020-01-21 270
## type
## 1 confirmed
## 2 confirmed
## 3 confirmed
## 4 confirmed
## 5 confirmed
## 6 confirmed
Check the shape of the data.
dim(coronavirus)
## [1] 1207 7
##Total Cases by region and type (top 20) Here is an example of a summary total cases by region and type (top 20):
It can be seen that confirmed case in Mainland CHina account for 59 823 occurrences.
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
summary_df <- coronavirus %>% group_by(Country.Region, type) %>%
summarise(total_cases = sum(cases)) %>%
arrange(-total_cases)
summary_df %>% head(87)
## # A tibble: 55 x 3
## # Groups: Country.Region [29]
## Country.Region type total_cases
## <chr> <chr> <dbl>
## 1 Mainland China confirmed 59823
## 2 Mainland China recovered 6205
## 3 Mainland China death 1367
## 4 Others confirmed 175
## 5 Singapore confirmed 58
## 6 Hong Kong confirmed 53
## 7 Thailand confirmed 33
## 8 Japan confirmed 28
## 9 South Korea confirmed 28
## 10 Malaysia confirmed 19
## # ... with 45 more rows
#Create New Column recovery
recovery <- c(1:1207)
cbind(coronavirus,recovery)
coronavirus$recovery[coronavirus$type == "recovered"]<-1
## Warning: Unknown or uninitialised column: 'recovery'.
coronavirus$recovery[coronavirus$type != "recovered" ]<-0
#New Names
names(coronavirus)
## [1] "Province.State" "Country.Region" "Lat" "Long"
## [5] "date" "cases" "type" "recovery"
#Tabulation of recovery
table(coronavirus$recovery)
##
## 0 1
## 821 386
##Splitting Training & Testing Data
Split-out validation dataset.Given that there is only one data set, it is paramount that we randomly split the data set into a training set and testing set.
For reproducibility, we’ll set our seed to initialize the random number generator.The CATools package contains sample.split command to split the data with a split ratio of 0.75 implying we’ll put 75% of the data in the training set, which we’ll use to build the model, and 25% of the data in the testing
library(caTools)
## Warning: package 'caTools' was built under R version 3.5.3
# Randomly split data
set.seed(1)
split = sample.split(coronavirus$recovery, SplitRatio = 0.85)
The subset function was used to create the training and testing sets.Training set will be called dfTrain and testing set dfTest.
dfTrain = subset(coronavirus, split == TRUE)
dfTest = subset(coronavirus, split == FALSE)
Check the dimensions of both the training and testing sets
dim(dfTrain)
## [1] 1026 8
dim(dfTest)
## [1] 181 8
##Tabulate recovery variable outcomes
table(dfTrain$recovery)
##
## 0 1
## 698 328
table(dfTest$recovery)
##
## 0 1
## 123 58
Change the variables from integer to factor
dfTrain$recovery<-factor(dfTrain$recovery)
str(dfTrain)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1026 obs. of 8 variables:
## $ Province.State: chr "" "" "Beijing" "Guangdong" ...
## $ Country.Region: chr "Japan" "Thailand" "Mainland China" "Mainland China" ...
## $ Lat : num 35.7 13.8 40.2 23.3 27.6 ...
## $ Long : num 140 101 116 113 116 ...
## $ date : Date, format: "2020-01-21" "2020-01-21" ...
## $ cases : num 1 2 10 17 2 1 1 9 2 1 ...
## $ type : chr "confirmed" "confirmed" "confirmed" "confirmed" ...
## $ recovery : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##Baseline Model The baseline model has an accuracy of 32%.This is what we’ll try to beat with the logistic regression model.
table(dfTrain$recovery)
##
## 0 1
## 698 328
290/906
## [1] 0.3200883
#Feature Selection
library(caret)
## Warning: package 'caret' was built under R version 3.5.3
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.3
set.seed(7)
# prepare training scheme
control <- trainControl(method="repeatedcv", number=10, repeats=3)
# train the model
model <- train(recovery~cases + Country.Region, data=dfTrain, method="glm", preProcess="scale", trControl=control)
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Country.RegionSpain
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Country.RegionBelgium
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Country.RegionSweden
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Country.RegionCambodia
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Country.RegionRussia
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Country.RegionCambodia,
## Country.RegionSweden
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Country.RegionSpain
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Country.RegionRussia
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Country.RegionBelgium
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Country.RegionCambodia
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Country.RegionSri Lanka
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Country.RegionBelgium,
## Country.RegionRussia
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Country.RegionSpain
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Country.RegionNepal
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Country.RegionSweden
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# estimate variable importance
importance <- varImp(model, scale=FALSE)
# summarize importance
print(importance)
## glm variable importance
##
## only 20 most important variables shown (out of 29)
##
## Overall
## cases 2.01587
## `Country.RegionMainland China` 1.69378
## Country.RegionNepal 1.30759
## Country.RegionFinland 1.30759
## `Country.RegionSri Lanka` 1.30759
## `Country.RegionSouth Korea` 1.24029
## Country.RegionVietnam 1.06177
## Country.RegionMacau 1.03460
## Country.RegionPhilippines 0.99792
## Country.RegionUK 0.77396
## Country.RegionThailand 0.70468
## Country.RegionMalaysia 0.70048
## Country.RegionCanada 0.59798
## Country.RegionSingapore 0.52209
## Country.RegionFrance 0.45925
## Country.RegionJapan 0.39645
## Country.RegionUS 0.39387
## `Country.RegionHong Kong` 0.26992
## Country.RegionTaiwan 0.07211
## Country.RegionGermany 0.01572
# plot importance
plot(importance)
##Logistic Regression Model The logistic regression method assumes that:
#Linearity assumption
logistic = glm(recovery ~ cases + Country.Region , data=dfTrain, family="binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logistic)
##
## Call:
## glm(formula = recovery ~ cases + Country.Region, family = "binomial",
## data = dfTrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1774 -0.9543 -0.8541 1.4067 2.6488
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.299e+00 1.049e+00 -2.192 0.0284
## cases -3.692e-03 1.831e-03 -2.016 0.0438
## Country.RegionBelgium -1.426e+01 2.400e+03 -0.006 0.9953
## Country.RegionCambodia 1.887e+01 2.400e+03 0.008 0.9937
## Country.RegionCanada 9.167e-01 1.533e+00 0.598 0.5499
## Country.RegionFinland 2.302e+00 1.761e+00 1.308 0.1910
## Country.RegionFrance 6.965e-01 1.517e+00 0.459 0.6461
## Country.RegionGermany -1.426e+01 9.069e+02 -0.016 0.9875
## Country.RegionHong Kong -3.973e-01 1.472e+00 -0.270 0.7872
## Country.RegionIndia -1.426e+01 1.385e+03 -0.010 0.9918
## Country.RegionItaly -1.426e+01 1.697e+03 -0.008 0.9933
## Country.RegionJapan 5.144e-01 1.297e+00 0.396 0.6918
## Country.RegionMacau 1.388e+00 1.342e+00 1.035 0.3009
## Country.RegionMainland China 1.781e+00 1.052e+00 1.694 0.0903
## Country.RegionMalaysia 9.200e-01 1.313e+00 0.700 0.4836
## Country.RegionNepal 2.302e+00 1.761e+00 1.308 0.1910
## Country.RegionOthers -1.411e+01 1.199e+03 -0.012 0.9906
## Country.RegionPhilippines 1.609e+00 1.612e+00 0.998 0.3183
## Country.RegionRussia 1.887e+01 2.400e+03 0.008 0.9937
## Country.RegionSingapore 6.385e-01 1.223e+00 0.522 0.6016
## Country.RegionSouth Korea 1.499e+00 1.209e+00 1.240 0.2149
## Country.RegionSpain -1.426e+01 2.400e+03 -0.006 0.9953
## Country.RegionSri Lanka 2.302e+00 1.761e+00 1.308 0.1910
## Country.RegionSweden -1.426e+01 2.400e+03 -0.006 0.9953
## Country.RegionTaiwan 1.072e-01 1.487e+00 0.072 0.9425
## Country.RegionThailand 9.255e-01 1.313e+00 0.705 0.4810
## Country.RegionUK 1.207e+00 1.560e+00 0.774 0.4390
## Country.RegionUnited Arab Emirates -1.426e+01 1.385e+03 -0.010 0.9918
## Country.RegionUS 5.110e-01 1.297e+00 0.394 0.6937
## Country.RegionVietnam 1.325e+00 1.248e+00 1.062 0.2883
##
## (Intercept) *
## cases *
## Country.RegionBelgium
## Country.RegionCambodia
## Country.RegionCanada
## Country.RegionFinland
## Country.RegionFrance
## Country.RegionGermany
## Country.RegionHong Kong
## Country.RegionIndia
## Country.RegionItaly
## Country.RegionJapan
## Country.RegionMacau
## Country.RegionMainland China .
## Country.RegionMalaysia
## Country.RegionNepal
## Country.RegionOthers
## Country.RegionPhilippines
## Country.RegionRussia
## Country.RegionSingapore
## Country.RegionSouth Korea
## Country.RegionSpain
## Country.RegionSri Lanka
## Country.RegionSweden
## Country.RegionTaiwan
## Country.RegionThailand
## Country.RegionUK
## Country.RegionUnited Arab Emirates
## Country.RegionUS
## Country.RegionVietnam
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1285.9 on 1025 degrees of freedom
## Residual deviance: 1221.7 on 996 degrees of freedom
## AIC: 1281.7
##
## Number of Fisher Scoring iterations: 15
##Multicollinearity None of the predictor variables showed multicollinearity as they all had low VIF value below 5.
car::vif(logistic)
## GVIF Df GVIF^(1/(2*Df))
## cases 1.017389 1 1.008657
## Country.Region 1.017389 28 1.000308
##Check for Influential Outliers
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.5.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble 2.1.3 v purrr 0.3.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## Warning: package 'tibble' was built under R version 3.5.3
## Warning: package 'tidyr' was built under R version 3.5.3
## Warning: package 'readr' was built under R version 3.5.3
## Warning: package 'purrr' was built under R version 3.5.3
## Warning: package 'stringr' was built under R version 3.5.3
## Warning: package 'forcats' was built under R version 3.5.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
library(broom)
## Warning: package 'broom' was built under R version 3.5.3
plot(logistic, which = 4, id.n = 3)
# Extract model results
logistic.data <- augment(logistic) %>%
mutate(index = 1:n())
logistic.data %>% top_n(3, .cooksd)
## # A tibble: 3 x 11
## recovery cases Country.Region .fitted .se.fit .resid .hat .sigma
## <fct> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 1 Sweden -16.6 2400. -3.57e-4 1.000 NaN
## 2 0 1 Belgium -16.6 2400. -3.57e-4 1.000 NaN
## 3 1 802 Mainland China -3.48 1.44 2.65e+0 0.0600 1.10
## # ... with 3 more variables: .cooksd <dbl>, .std.resid <dbl>, index <int>
ggplot(logistic.data, aes(index, .std.resid)) +
geom_point(aes(color = recovery), alpha = .5) +
theme_bw()
## Warning: Removed 3 rows containing missing values (geom_point).
logistic.data %>%
filter(abs(.std.resid) > 3)
## # A tibble: 2 x 11
## recovery cases Country.Region .fitted .se.fit .resid .hat .sigma
## <fct> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 1 Sweden -16.6 2400. -3.57e-4 1.000 NaN
## 2 0 1 Belgium -16.6 2400. -3.57e-4 1.000 NaN
## # ... with 3 more variables: .cooksd <dbl>, .std.resid <dbl>, index <int>
##Making predictions on Training set
predicted = data.frame(probability.of.recovery= logistic$fitted.value, recovery=dfTrain$recovery)
predicted = predicted[order(predicted$probability.of.recovery, decreasing = FALSE),]
predicted$rank = 1:nrow(predicted)
library(ggplot2)
library(cowplot)
## Warning: package 'cowplot' was built under R version 3.5.3
##
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
## default ggplot2 theme anymore. To recover the previous
## behavior, execute:
## theme_set(theme_cowplot())
## ********************************************************
ggplot(data=predicted, aes(x=rank, y=probability.of.recovery)) +
geom_point(aes(colour=recovery), alpha=1, shape=4, stroke=2)+
xlab("Index") +
ylab("Predicted Probability of Recovery")
#More predictions
predictTrain = predict(logistic, type="response")
summary(predictTrain)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.3300 0.3641 0.3197 0.3710 1.0000
tapply(predictTrain, dfTrain$recovery, mean)
## 0 1
## 0.3036068 0.3539099
The threshold of 0.5 was chosen as it had the highest Accuracy compared to 0.2 and 0.7 thresholds.
table(dfTrain$recovery, predictTrain > 0.4)
##
## FALSE TRUE
## 0 695 3
## 1 323 5
a = 695
b = 3
c = 323
d = 5
(a+d)/(a+b+c+d)
## [1] 0.6822612
b/(b+d)
## [1] 0.375
a/(a+c)
## [1] 0.6827112
b/(b+c)
## [1] 0.009202454
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.5.3
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.5.3
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
ROCRpred = prediction(predictTrain, dfTrain$recovery)
ROCRperf = performance(ROCRpred, "tpr", "fpr")
plot(ROCRperf, colorize=TRUE, print.cutoffs.at=seq(0,1,by=0.1), text.adj=c(-0.2,1.7))
predictTest = predict(logistic, type = "response", newdata = dfTest)
table(dfTest$recovery,predictTest >= 0.4)
##
## FALSE TRUE
## 0 121 2
## 1 58 0
m = 121
n = 2
o = 58
p = 0
(m+n)/(m+n+o+p)
## [1] 0.679558
n/(n+p)
## [1] 1
m/(m+o)
## [1] 0.6759777
n/(n+o)
## [1] 0.03333333
library(rpart, warn.conflicts = FALSE)
library(rpart.plot, warn.conflicts = FALSE)
## Warning: package 'rpart.plot' was built under R version 3.5.3
library(rattle, warn.conflicts = FALSE)
## Warning: package 'rattle' was built under R version 3.5.3
## Rattle: A free graphical interface for data science with R.
## Version 5.2.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(RColorBrewer, warn.conflicts = FALSE)
# Changing the prior probabilities of recovery
tree_prior = rpart(recovery ~ cases + Country.Region, method = "class",
data = dfTrain,
parms = list(prior = c(0.7,0.3)), # changing the probabilities of recovery
control = rpart.control(cp = 0.001))
prp(tree_prior)
# cp and xerror values.
printcp(tree_prior)
##
## Classification tree:
## rpart(formula = recovery ~ cases + Country.Region, data = dfTrain,
## method = "class", parms = list(prior = c(0.7, 0.3)), control = rpart.control(cp = 0.001))
##
## Variables actually used in tree construction:
## [1] cases Country.Region
##
## Root node error: 307.8/1026 = 0.3
##
## n= 1026
##
## CP nsplit rel error xerror xstd
## 1 0.0026801 0 1.00000 1.0000 0.045543
## 2 0.0010000 6 0.98138 1.0268 0.046167
plotcp(tree_prior)
tree_min = tree_prior$cptable[which.min(tree_prior$cptable[,"xerror"]),"CP"]
# pruning the tree for increased model performance.
ptree_prior = prune(tree_prior, cp = tree_min) # pruning the tree.
prp(ptree_prior)
pred_prior = predict(ptree_prior, newdata = dfTest, type = "class") # making predictions.
confmat_prior = table(dfTest$recovery,pred_prior) # making the confusion matrix.
confmat_prior
## pred_prior
## 0 1
## 0 123 0
## 1 58 0
acc_prior = sum(diag(confmat_prior)) / sum(confmat_prior)
acc_prior
## [1] 0.679558
#Prp
prp(ptree_prior)
#Loss Matrix
# Including a loss matrix
tree_loss_matrix = rpart(recovery ~ cases + Country.Region, method = "class",
data = dfTrain,
parms = list(loss = matrix(c(0,10,1,0),ncol = 2)),
control = rpart.control(cp = 0.001))
printcp(tree_loss_matrix)
##
## Classification tree:
## rpart(formula = recovery ~ cases + Country.Region, data = dfTrain,
## method = "class", parms = list(loss = matrix(c(0, 10, 1,
## 0), ncol = 2)), control = rpart.control(cp = 0.001))
##
## Variables actually used in tree construction:
## [1] cases Country.Region
##
## Root node error: 698/1026 = 0.68031
##
## n= 1026
##
## CP nsplit rel error xerror xstd
## 1 0.0157593 0 1.00000 10.0000 0.21401
## 2 0.0100287 5 0.91691 9.2393 0.22094
## 3 0.0085960 7 0.89685 8.9570 0.22299
## 4 0.0060888 9 0.87966 8.8152 0.22389
## 5 0.0057307 13 0.85530 8.7421 0.22439
## 6 0.0028653 18 0.82235 8.4069 0.22592
## 7 0.0014327 19 0.81948 8.3095 0.22626
## 8 0.0010000 20 0.81805 8.2092 0.22666
#Plot Tree Loss Matrix
plotcp(tree_loss_matrix)
#Another plot
ptree_loss_matrix = prune(tree_loss_matrix, cp = 0.0011)
prp(tree_loss_matrix)
#Confmat
pred_loss_matrix = predict(ptree_loss_matrix,newdata = dfTest, type = "class")
confmat_loss_matrix = table(dfTest$recovery,pred_loss_matrix)
confmat_loss_matrix
## pred_loss_matrix
## 0 1
## 0 19 104
## 1 3 55
#Acc loss matrix
acc_loss_matrix = sum(diag(confmat_loss_matrix)) / sum(confmat_loss_matrix)
acc_loss_matrix
## [1] 0.4088398
case_weights = ifelse(dfTrain$recovery == 0,1,3)
tree_weights = rpart(recovery ~ cases + Country.Region, method = "class",
data = dfTrain,
control = rpart.control(cp = 0.001,minsplit = 5,minbucket = 2),
weights = case_weights)
plotcp(tree_weights)
ptree_weights = prune(tree_weights, cp = 0.00183101)
prp(ptree_weights,extra = 1)
pred_weights = predict(ptree_weights, newdata = dfTest,type = "class")
confmat_weights = table(dfTest$recovery,pred_weights)
confmat_weights
## pred_weights
## 0 1
## 0 51 72
## 1 10 48
#Acc weights
acc_weights = sum(diag(confmat_weights)) / sum(confmat_weights)
acc_weights
## [1] 0.5469613
It can be seen that most of the cases in the data did not recover. A logistic regression and a decision tree algorithm were each applied to the data in an effort to optimally predict whether a patient would recover based on the frequency of cases in their country.
From the results in the analysis it can be seen that the logistic regression method produced better results than Decision trees with respect to Accuracy at 68% success rate at prediction.