library(readr)
library(knitr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(outliers)
library(MVN)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
## sROC 0.1-2 loaded
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
There exist some conspiracy theories that new cellular 5G wireless technology is in some way linked to the spread of COVID-19. It stands to reason that the countries that have so far introduced 5G technology might also be the countries with the highest rates of mobile phone usage (this is an assumption of the project).
I have therefore obtained data with a view to exploring the relationship between the incidence of COVID-19 so far in 2020, and the rate of mobile phone usage in most countries around the world.
I use various techniques to preprocess the data, and to join and mutate variables so that there exist ways to meaningfully compare the rate of mobile phone subscription per 100 in the poulation with the incidence of COVID-19 per population.
Total Population Data: https://data.worldbank.org/indicator/SP.POP.TOTL
Mobile cellular subscriptions (per 100 people) https://data.worldbank.org/indicator/IT.CEL.SETS.P2
COVID-19 Infections: https://www.kaggle.com/imdevskp/corona-virus-report?select=covid_19_clean_complete.csv
covid <- read_csv("Data/covid_19_clean_complete.csv")
## Parsed with column specification:
## cols(
## `Province/State` = col_character(),
## `Country/Region` = col_character(),
## Lat = col_double(),
## Long = col_double(),
## Date = col_character(),
## Confirmed = col_double(),
## Deaths = col_double(),
## Recovered = col_double(),
## `WHO Region` = col_character()
## )
phones <- read_csv("Data/API_IT.CEL.SETS.P2_DS2_en_csv_v2_1121611.csv", skip=4)
## Warning: Missing column names filled in: 'X65' [65]
## Parsed with column specification:
## cols(
## .default = col_double(),
## `Country Name` = col_character(),
## `Country Code` = col_character(),
## `Indicator Name` = col_character(),
## `Indicator Code` = col_character(),
## `1961` = col_logical(),
## `1962` = col_logical(),
## `1963` = col_logical(),
## `1964` = col_logical(),
## `1966` = col_logical(),
## `1967` = col_logical(),
## `1968` = col_logical(),
## `1969` = col_logical(),
## `1971` = col_logical(),
## `1972` = col_logical(),
## `1973` = col_logical(),
## `1974` = col_logical(),
## `2019` = col_logical(),
## X65 = col_logical()
## )
## See spec(...) for full column specifications.
pop <- read_csv("Data/API_SP.POP.TOTL_DS2_en_csv_v2_1120881.csv", skip=4)
## Warning: Missing column names filled in: 'X65' [65]
## Parsed with column specification:
## cols(
## .default = col_double(),
## `Country Name` = col_character(),
## `Country Code` = col_character(),
## `Indicator Name` = col_character(),
## `Indicator Code` = col_character(),
## `2019` = col_logical(),
## X65 = col_logical()
## )
## See spec(...) for full column specifications.
head(covid)
## # A tibble: 6 x 9
## `Province/State` `Country/Region` Lat Long Date Confirmed Deaths
## <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 <NA> Afghanistan 33 65 1/22~ 0 0
## 2 <NA> Albania 41.2 20.2 1/22~ 0 0
## 3 <NA> Algeria 28.0 1.66 1/22~ 0 0
## 4 <NA> Andorra 42.5 1.52 1/22~ 0 0
## 5 <NA> Angola -11.2 17.9 1/22~ 0 0
## 6 <NA> Antigua and Bar~ 17.1 -61.8 1/22~ 0 0
## # ... with 2 more variables: Recovered <dbl>, `WHO Region` <chr>
head(phones)
## # A tibble: 6 x 65
## `Country Name` `Country Code` `Indicator Name` `Indicator Code` `1960` `1961`
## <chr> <chr> <chr> <chr> <dbl> <lgl>
## 1 Aruba ABW Mobile cellular~ IT.CEL.SETS.P2 0 NA
## 2 Afghanistan AFG Mobile cellular~ IT.CEL.SETS.P2 0 NA
## 3 Angola AGO Mobile cellular~ IT.CEL.SETS.P2 0 NA
## 4 Albania ALB Mobile cellular~ IT.CEL.SETS.P2 0 NA
## 5 Andorra AND Mobile cellular~ IT.CEL.SETS.P2 0 NA
## 6 Arab World ARB Mobile cellular~ IT.CEL.SETS.P2 NA NA
## # ... with 59 more variables: `1962` <lgl>, `1963` <lgl>, `1964` <lgl>,
## # `1965` <dbl>, `1966` <lgl>, `1967` <lgl>, `1968` <lgl>, `1969` <lgl>,
## # `1970` <dbl>, `1971` <lgl>, `1972` <lgl>, `1973` <lgl>, `1974` <lgl>,
## # `1975` <dbl>, `1976` <dbl>, `1977` <dbl>, `1978` <dbl>, `1979` <dbl>,
## # `1980` <dbl>, `1981` <dbl>, `1982` <dbl>, `1983` <dbl>, `1984` <dbl>,
## # `1985` <dbl>, `1986` <dbl>, `1987` <dbl>, `1988` <dbl>, `1989` <dbl>,
## # `1990` <dbl>, `1991` <dbl>, `1992` <dbl>, `1993` <dbl>, `1994` <dbl>,
## # `1995` <dbl>, `1996` <dbl>, `1997` <dbl>, `1998` <dbl>, `1999` <dbl>,
## # `2000` <dbl>, `2001` <dbl>, `2002` <dbl>, `2003` <dbl>, `2004` <dbl>,
## # `2005` <dbl>, `2006` <dbl>, `2007` <dbl>, `2008` <dbl>, `2009` <dbl>,
## # `2010` <dbl>, `2011` <dbl>, `2012` <dbl>, `2013` <dbl>, `2014` <dbl>,
## # `2015` <dbl>, `2016` <dbl>, `2017` <dbl>, `2018` <dbl>, `2019` <lgl>,
## # X65 <lgl>
Analysis of the data sets shows that the COVID-19 data is in long format, with each country’s daily reports of COVID-19 confirmed cases, deaths and recoveries recorded as an observation. On the other hand, the population and mobile phone data is in wide format, showing the rate of mobile phone subscriptions per 100 population each year since 1960 in a seperate column.
To address this, I can aggregate the COVID-19 data by country prior to merging the datasets to provide summation/totals for cases.
covid2 <- aggregate(cbind(covid$Confirmed, covid$Deaths, covid$Recovered), by=list(Category=covid$'Country/Region'), FUN=sum) %>%
rename(Country = Category, Confirmed = V1, Deaths = V2, Recovered = V3)
head(covid2)
## Country Confirmed Deaths Recovered
## 1 Afghanistan 303643 6712 31482
## 2 Albania 50228 1893 32569
## 3 Algeria 314591 28423 152649
## 4 Andorra 47851 2627 26344
## 5 Angola 2546 163 650
## 6 Antigua and Barbuda 1532 164 745
For the mobile phone dataset, I am only interested in the last 10 or so years. No data is recorded for 2019, so I will subset to include only 2008 to 2018 as follows:
phones2 <- phones %>% select(c("Country Name", "Country Code", "2008", "2009", "2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018"))
For the population, I am only interested in the most recent complete estimates from 2018:
pop2 <- pop %>% select(c("Country Name", "2018")) %>% rename(Population = "2018")
I now combine the population data with the phone data:
phonepop <- left_join(phones2, pop2, by = "Country Name")
Noting that the United States is called the Us in the COVID-19 dataset, I rename this observation and combine the new phone and population dataset with the complete COVID-19 dataset. Some other countries do not necesarily match, such as the Diamond Princess (which is a ship).
phonepop[phonepop$`Country Name` == "United States", "Country Name"] <- "US"
combo <- left_join(covid2, phonepop, by = c("Country" = "Country Name"))
head(combo)
## Country Confirmed Deaths Recovered Country Code 2008
## 1 Afghanistan 303643 6712 31482 AFG 28.49300
## 2 Albania 50228 1893 32569 ALB 61.93245
## 3 Algeria 314591 28423 152649 DZA 77.83184
## 4 Andorra 47851 2627 26344 AND 76.55672
## 5 Angola 2546 163 650 AGO 31.21990
## 6 Antigua and Barbuda 1532 164 745 ATG 159.94941
## 2009 2010 2011 2012 2013 2014 2015
## 1 36.97858 35.00313 45.81363 49.22798 52.08358 55.15951 57.27107
## 2 82.86920 91.32805 105.85291 120.10586 126.93697 115.99794 117.65922
## 3 92.63014 91.11307 97.14818 100.38468 103.61014 111.23861 108.80894
## 4 76.42281 77.55568 77.66726 77.48068 79.14799 83.62390 91.44351
## 5 36.01901 40.26060 49.84677 50.92060 51.06592 52.15898 49.79322
## 6 155.54031 190.81429 197.20121 140.89416 124.95957 129.68713 188.10252
## 2016 2017 2018 Population
## 1 61.05464 65.92913 59.12085 37172386
## 2 116.74444 125.71035 94.17700 2866376
## 3 116.00421 110.76725 111.66479 42228429
## 4 98.49283 104.33241 107.28255 77006
## 5 45.07629 44.68611 43.13052 30809762
## 6 190.42178 192.81957 NA 96286
Now that I am dealing with my combined dataset, I cosnider the structure andclasses of the data.
str(combo)
## 'data.frame': 188 obs. of 17 variables:
## $ Country : chr "Afghanistan" "Albania" "Algeria" "Andorra" ...
## $ Confirmed : num 303643 50228 314591 47851 2546 ...
## $ Deaths : num 6712 1893 28423 2627 163 ...
## $ Recovered : num 31482 32569 152649 26344 650 ...
## $ Country Code: chr "AFG" "ALB" "DZA" "AND" ...
## $ 2008 : num 28.5 61.9 77.8 76.6 31.2 ...
## $ 2009 : num 37 82.9 92.6 76.4 36 ...
## $ 2010 : num 35 91.3 91.1 77.6 40.3 ...
## $ 2011 : num 45.8 105.9 97.1 77.7 49.8 ...
## $ 2012 : num 49.2 120.1 100.4 77.5 50.9 ...
## $ 2013 : num 52.1 126.9 103.6 79.1 51.1 ...
## $ 2014 : num 55.2 116 111.2 83.6 52.2 ...
## $ 2015 : num 57.3 117.7 108.8 91.4 49.8 ...
## $ 2016 : num 61.1 116.7 116 98.5 45.1 ...
## $ 2017 : num 65.9 125.7 110.8 104.3 44.7 ...
## $ 2018 : num 59.1 94.2 111.7 107.3 43.1 ...
## $ Population : num 37172386 2866376 42228429 77006 30809762 ...
Some of the variables have been identified as the incorrect class, so I correct this as follows, using the Lubridate package for the dates:
combo$`Country` <- as.factor(combo$`Country`)
combo$`Country Code` <- as.factor(combo$`Country Code`)
str(combo)
## 'data.frame': 188 obs. of 17 variables:
## $ Country : Factor w/ 188 levels "Afghanistan",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Confirmed : num 303643 50228 314591 47851 2546 ...
## $ Deaths : num 6712 1893 28423 2627 163 ...
## $ Recovered : num 31482 32569 152649 26344 650 ...
## $ Country Code: Factor w/ 163 levels "AFG","AGO","ALB",..: 1 3 45 4 2 8 6 7 9 10 ...
## $ 2008 : num 28.5 61.9 77.8 76.6 31.2 ...
## $ 2009 : num 37 82.9 92.6 76.4 36 ...
## $ 2010 : num 35 91.3 91.1 77.6 40.3 ...
## $ 2011 : num 45.8 105.9 97.1 77.7 49.8 ...
## $ 2012 : num 49.2 120.1 100.4 77.5 50.9 ...
## $ 2013 : num 52.1 126.9 103.6 79.1 51.1 ...
## $ 2014 : num 55.2 116 111.2 83.6 52.2 ...
## $ 2015 : num 57.3 117.7 108.8 91.4 49.8 ...
## $ 2016 : num 61.1 116.7 116 98.5 45.1 ...
## $ 2017 : num 65.9 125.7 110.8 104.3 44.7 ...
## $ 2018 : num 59.1 94.2 111.7 107.3 43.1 ...
## $ Population : num 37172386 2866376 42228429 77006 30809762 ...
As mentioned in the previous section, the original datsets contain both long and wide format data, with years as variables and dates as observations. For the purposes of this analysis, I have aggregated the long data into summary information by country, and combined it with the long format data to further prepare variables.
The 2018 mobile phone subscription data for some countries is missing, but these have 2017 values. As such I check for the number of NAs, then impute values from 2017 data to correct for this, assuming little change from 2017 to 2018. Due to errors in R handling variables named for years, I rename the 2018 and 2017 variables to X18 and X17.
sum(is.na(combo$`2018`))
## [1] 42
combo <- combo %>% rename(X18 = "2018", X17 = "2017") %>% mutate(X18 = ifelse(is.na(X18), X17, X18))
sum(is.na(combo$X18))
## [1] 26
The second is.na sum calculation shows that the number of NA entries reduced from 42 to 26. This is approx 14% of the observations. Whilst less than 5% would be ideal, this will not be possible because not all the countries have data on mobile phone usage, and may not have matched during the earlier left_join because of different spelling names and regional complexities. There are also non-country observations, such as the Diamond Princess cruise ship.
I will create a variable that shows the change in the rate of mobile phone subscriptions per 100 population over the period from 2008 to 2018. This will be expressed as a proportion change over the 2008 rate. For example if the rate was 50 per 100 people in 2008, and has risen to 60 (in 2018), this would represent a 20% increase in the mobile phone subscription rate in the country.
combo <- combo %>% rename(Now = X18, Before = "2008") %>% mutate(PhoneChange = (Now - Before)/Before)
I will also create a variable that shows the rate of confirmed COVID-19 infections per 100 population, which may be compared to the “Now” (2018) value, which is also a rate per 100 population (but of mobile phone usage). This variable may be very useful for those wanting to analyse any possible correlation between the rate of mobile phone use and the rate of COVID-19 infection.
combo <- combo %>% mutate(COVRate = Confirmed/Population*100)
head(combo)
## Country Confirmed Deaths Recovered Country Code Before
## 1 Afghanistan 303643 6712 31482 AFG 28.49300
## 2 Albania 50228 1893 32569 ALB 61.93245
## 3 Algeria 314591 28423 152649 DZA 77.83184
## 4 Andorra 47851 2627 26344 AND 76.55672
## 5 Angola 2546 163 650 AGO 31.21990
## 6 Antigua and Barbuda 1532 164 745 ATG 159.94941
## 2009 2010 2011 2012 2013 2014 2015
## 1 36.97858 35.00313 45.81363 49.22798 52.08358 55.15951 57.27107
## 2 82.86920 91.32805 105.85291 120.10586 126.93697 115.99794 117.65922
## 3 92.63014 91.11307 97.14818 100.38468 103.61014 111.23861 108.80894
## 4 76.42281 77.55568 77.66726 77.48068 79.14799 83.62390 91.44351
## 5 36.01901 40.26060 49.84677 50.92060 51.06592 52.15898 49.79322
## 6 155.54031 190.81429 197.20121 140.89416 124.95957 129.68713 188.10252
## 2016 X17 Now Population PhoneChange COVRate
## 1 61.05464 65.92913 59.12085 37172386 1.0749251 0.816850982
## 2 116.74444 125.71035 94.17700 2866376 0.5206406 1.752317212
## 3 116.00421 110.76725 111.66479 42228429 0.4346929 0.744974434
## 4 98.49283 104.33241 107.28255 77006 0.4013472 62.139313820
## 5 45.07629 44.68611 43.13052 30809762 0.3815071 0.008263615
## 6 190.42178 192.81957 192.81957 96286 0.2055034 1.591093202
The missing value scan conducted on the 2018 has been completed. I will now subset the data to select only the relevant information for analysis of outliers:
combo <- combo %>% select(Country, Population, Confirmed, COVRate, Now, PhoneChange) %>% rename(PhoneRate = Now)
head(combo)
## Country Population Confirmed COVRate PhoneRate PhoneChange
## 1 Afghanistan 37172386 303643 0.816850982 59.12085 1.0749251
## 2 Albania 2866376 50228 1.752317212 94.17700 0.5206406
## 3 Algeria 42228429 314591 0.744974434 111.66479 0.4346929
## 4 Andorra 77006 47851 62.139313820 107.28255 0.4013472
## 5 Angola 30809762 2546 0.008263615 43.13052 0.3815071
## 6 Antigua and Barbuda 96286 1532 1.591093202 192.81957 0.2055034
I check the confirmed cases for outliers:
z.scores <- combo$Confirmed %>% scores(type = "z")
z.scores %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.2224 -0.2206 -0.2120 0.0000 -0.1336 12.3300
which( abs(z.scores) >3 )
## [1] 180
This identifies observation 180, which is the USA. Unfortunately, I know this is the accurate figure of COVID-19 cases and not an outlier we would want to exclude or distort, so I will not impute another value.
I now complete univariate scanning of the remaining data, which finds no outliers. I do not scan the population data for outliers, as this variable is complete and accurate.
z.scores <- combo$COVRate %>% scores(type = "z")
z.scores %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## NA NA NA NaN NA NA 188
which( abs(z.scores) >3 )
## integer(0)
z.scores <- combo$PhoneRate %>% scores(type = "z")
z.scores %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## NA NA NA NaN NA NA 188
which( abs(z.scores) >3 )
## integer(0)
z.scores <- combo$PhoneChange %>% scores(type = "z")
z.scores %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## NA NA NA NaN NA NA 188
which( abs(z.scores) >3 )
## integer(0)
Whist the “PhoneRate” data is actually fairly symmetrically distributed, the COVRate is quite right skewed.
hist(combo$PhoneRate, main="Rate of mobile phone subscriptions per 100", col="red")
hist(combo$COVRate, main ="Pre-transformation COVID-19 case rate per 100", col="red")
I therefore use a BoxCox transformation to transform the COVID-19 case data.
combo$COVRate_boxcox <- BoxCox(combo$COVRate,lambda = "auto")
hist(combo$COVRate_boxcox, main ="Transformed COVID-19 case rate per 100", col="red")
An analysis could now take place using the data following pre-processing.