Coronavirus Disease 2019 (COVID-19) is one of the coronavirus family that is usually common in animals. COVID-19 affects the respiratory and cause symptoms such as: cough, fever, shortness of breath, muscle aches, sore throat, unexplained loss of taste or smell, diarrhea and headache. COVID-19 can be severe, and some cases have caused death. There are 4,013,728 confirmed cases and 278,993 Confirme deaths of COVID-19
1.Determine the effective factors/features on spreading Coronavirus. 2.Detect the factor(s) that may lead to increase the fatality rate. 3.Build a classifier that can estimate the probability of being infected.
Data cleaning, prepration, statistical analysis metrics such as: mean, median, and correlation.The analysis include graphs; histograms, box plots, barplot, and time series plot to make better view.Backward selection method to choose the most effective features indicated using the changes in model performance.Random Forest algorithm to detect the most important features in relation with the infection and deaths. and finally isNeural Network Model.
if(!require(dplyr)){
install.packages("dplyr")
library(dplyr)
}
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 3.6.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
if(!require(ggplot2)){
install.packages("ggplot2")
library(ggplot2)
}
## Loading required package: ggplot2
if(!require(keras)){
install.packages("keras")
library(keras)
}
## Loading required package: keras
if(!require(fastDummies)){
install.packages("fastDummies")
library(fastDummies)
}
## Loading required package: fastDummies
## Warning: package 'fastDummies' was built under R version 3.6.3
if(!require(caTools)){
install.packages("caTools")
library(caTools)
}
## Loading required package: caTools
## Warning: package 'caTools' was built under R version 3.6.3
if(!require(caret)){
install.packages("caret")
library(caret)
}
## Loading required package: caret
## Warning: package 'caret' was built under R version 3.6.3
## Loading required package: lattice
if(!require(randomForest)){
install.packages("randomForest")
library(randomForest)
}
## Loading required package: randomForest
## Warning: package 'randomForest' was built under R version 3.6.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
if(!require(e1071)){
install.packages("e1071")
library(e1071)
}
## Loading required package: e1071
## Warning: package 'e1071' was built under R version 3.6.3
if(!require(rjson)){
install.packages("rjson")
library(rjson)
}
## Loading required package: rjson
1.COVID-19 global report 2. Countries’ space or size and GDP dataset 3. Countries detailed reports for literacy, 4. foods dataset 5. Countries detailed reports for production, 6. healthcare quality care quality, education dataset
#Import the data
global_data <- read.csv('train.csv')
#Display the first few fows.
head(global_data)
## Id Province.State Country.Region Lat Long Date ConfirmedCases
## 1 1 Afghanistan 33 65 2020-01-22 0
## 2 2 Afghanistan 33 65 2020-01-23 0
## 3 3 Afghanistan 33 65 2020-01-24 0
## 4 4 Afghanistan 33 65 2020-01-25 0
## 5 5 Afghanistan 33 65 2020-01-26 0
## 6 6 Afghanistan 33 65 2020-01-27 0
## Fatalities
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
#Remove the ID column
global_data = global_data[-c(1)]
str(global_data)
## 'data.frame': 17892 obs. of 7 variables:
## $ Province.State: Factor w/ 129 levels "","Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Country.Region: Factor w/ 163 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Lat : num 33 33 33 33 33 33 33 33 33 33 ...
## $ Long : num 65 65 65 65 65 65 65 65 65 65 ...
## $ Date : Factor w/ 63 levels "2020-01-22","2020-01-23",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ ConfirmedCases: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Fatalities : num 0 0 0 0 0 0 0 0 0 0 ...
#The date in this data is defined as factor and to do time series analysis, we need to convert it into date type
global_data$Date = as.character(global_data$Date)
global_data$Date = as.Date(global_data$Date, format = "%Y-%m-%d")
str (global_data)
## 'data.frame': 17892 obs. of 7 variables:
## $ Province.State: Factor w/ 129 levels "","Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Country.Region: Factor w/ 163 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Lat : num 33 33 33 33 33 33 33 33 33 33 ...
## $ Long : num 65 65 65 65 65 65 65 65 65 65 ...
## $ Date : Date, format: "2020-01-22" "2020-01-23" ...
## $ ConfirmedCases: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Fatalities : num 0 0 0 0 0 0 0 0 0 0 ...
# Analyze the data using decribtive statistics. we will start by analyzing the continues data
confirmed_countries = aggregate(global_data$ConfirmedCases,list(global_data$Country.Region),sum)
head(confirmed_countries)
## Group.1 x
## 1 Afghanistan 363
## 2 Albania 851
## 3 Algeria 1485
## 4 Andorra 720
## 5 Antigua and Barbuda 16
## 6 Argentina 1666
#Gather all the fatalities in each country
death_countries = aggregate(global_data$Fatalities,list(global_data$Country.Region),sum)
head(death_countries)
## Group.1 x
## 1 Afghanistan 3
## 2 Albania 26
## 3 Algeria 113
## 4 Andorra 3
## 5 Antigua and Barbuda 0
## 6 Argentina 41
Countries_means = death_countries
# Renaming the x column to mean_deaths
colnames(Countries_means)[2] <-"Fatalities"
# Adding the mean infection to the data frame called (Countries_means).
Countries_means['ConfirmedCases'] = confirmed_countries$x
head(Countries_means)
## Group.1 Fatalities ConfirmedCases
## 1 Afghanistan 3 363
## 2 Albania 26 851
## 3 Algeria 113 1485
## 4 Andorra 3 720
## 5 Antigua and Barbuda 0 16
## 6 Argentina 41 1666
#import the geo info for the countries
geo = read.csv('Counties geo.csv')
colnames(geo)[2] <-"Area"
head(geo)
## Country Area Population Continent
## 1 NA NA
## 2 Andorra 468 77006 EU
## 3 United Arab Emirates 82880 9630959 AS
## 4 Afghanistan 647500 37172386 AS
## 5 Antigua and Barbuda 443 96286 <NA>
## 6 Anguilla 102 13254 <NA>
# Match the geographical data with the first data (global_data)
global_data$Area <- geo$Area[match(global_data$Country.Region, geo$Country)]
global_data$population <- geo$Population[match(global_data$Country.Region, geo$Country)]
global_data$Continent <- geo$Continent[match(global_data$Country.Region, geo$Country)]
head (global_data)
## Province.State Country.Region Lat Long Date ConfirmedCases Fatalities
## 1 Afghanistan 33 65 2020-01-22 0 0
## 2 Afghanistan 33 65 2020-01-23 0 0
## 3 Afghanistan 33 65 2020-01-24 0 0
## 4 Afghanistan 33 65 2020-01-25 0 0
## 5 Afghanistan 33 65 2020-01-26 0 0
## 6 Afghanistan 33 65 2020-01-27 0 0
## Area population Continent
## 1 647500 37172386 AS
## 2 647500 37172386 AS
## 3 647500 37172386 AS
## 4 647500 37172386 AS
## 5 647500 37172386 AS
## 6 647500 37172386 AS
#population per kilometer
global_data$pop_per_k = global_data$population / global_data$Area
head(global_data)
## Province.State Country.Region Lat Long Date ConfirmedCases Fatalities
## 1 Afghanistan 33 65 2020-01-22 0 0
## 2 Afghanistan 33 65 2020-01-23 0 0
## 3 Afghanistan 33 65 2020-01-24 0 0
## 4 Afghanistan 33 65 2020-01-25 0 0
## 5 Afghanistan 33 65 2020-01-26 0 0
## 6 Afghanistan 33 65 2020-01-27 0 0
## Area population Continent pop_per_k
## 1 647500 37172386 AS 57.40909
## 2 647500 37172386 AS 57.40909
## 3 647500 37172386 AS 57.40909
## 4 647500 37172386 AS 57.40909
## 5 647500 37172386 AS 57.40909
## 6 647500 37172386 AS 57.40909
#Gather the of the population per kilometer for each country.
mean_numbers_per_k = aggregate(global_data$pop_per_k,list(global_data$Country.Region),mean)
head(mean_numbers_per_k)
## Group.1 x
## 1 Afghanistan 57.40909
## 2 Albania 99.70697
## 3 Algeria 17.73008
## 4 Andorra 164.54274
## 5 Antigua and Barbuda 217.34989
## 6 Argentina 16.08105
#Merge the population ratio to our countries_means data.
Countries_means['population_ration'] = mean_numbers_per_k$x
#rename the cloumn Group.1 to Country
colnames(Countries_means)[1] <-"Country"
#Display the head
head(Countries_means)
## Country Fatalities ConfirmedCases population_ration
## 1 Afghanistan 3 363 57.40909
## 2 Albania 26 851 99.70697
## 3 Algeria 113 1485 17.73008
## 4 Andorra 3 720 164.54274
## 5 Antigua and Barbuda 0 16 217.34989
## 6 Argentina 41 1666 16.08105
# Remove the outliers
#Make a copy of our data (Countries_means)
country = Countries_means
#Search for outliers in population_ration.
outliers <- boxplot(Countries_means$population_ration, plot=FALSE)$out
Countries_means<- Countries_means[-which(Countries_means$population_ration %in% outliers),]
#Check for outliers in Fatalities
outliers <- boxplot(Countries_means$Fatalities, plot=FALSE)$out
Countries_means<- Countries_means[-which(Countries_means$Fatalities %in% outliers),]
#Check for outliers in Confirmed cases
outliers <- boxplot(Countries_means$ConfirmedCases, plot=FALSE)$out
Countries_means<- Countries_means[-which(Countries_means$ConfirmedCases %in% outliers),]
hist(Countries_means$Fatalities)
The histogram plot showa that the most frequent deaths is between 0 and 5 deaths. We can tell that the data of the Fatalities cases is not normally distributed, which will help us in our descriptive analysis
hist(Countries_means$ConfirmedCases )
The histogram plot that the most frequent infections is between 0 and 500 cases. We can tell that the data of the Confirmed cases is not normally distributed, which will help us in our descriptive analysis
hist(Countries_means$population_ration )
The histogram plot shows the most frequent population per kilometer is between 0 and 100. We can tell that the data of the Fatalities cases is not normally distributed, which will help us in our descriptive analysis
#Find the confirmed and deaths cases by continents.
Continents_casses_sum = aggregate(global_data$ConfirmedCases,list(global_data$Continent),sum)
Continents_deaths_sum = aggregate(global_data$Fatalities,list(global_data$Continent),sum)
#Combine the two columns in a new data frame called (Continents_means)
Continents_means = Continents_casses_sum
Continents_means['Fatalities'] = Continents_deaths_sum$x
#Renaming the columns names
colnames(Continents_means)[2] <-"Confirmed"
colnames(Continents_means)[1] <-"continent"
#Display the head
head(Continents_means)
## continent Confirmed Fatalities
## 1 AF 10588 306
## 2 AS 3888236 139594
## 3 EU 1449744 69018
## 4 OC 11598 91
## 5 SA 26023 322
#Get the difference between the Confirmed and Fatalities.
Continents_means$Difference = Continents_means$Confirmed / Continents_means$Fatalities
head(Continents_means)
## continent Confirmed Fatalities Difference
## 1 AF 10588 306 34.60131
## 2 AS 3888236 139594 27.85389
## 3 EU 1449744 69018 21.00530
## 4 OC 11598 91 127.45055
## 5 SA 26023 322 80.81677
ggplot(data=Continents_means, aes(x=continent, y=Fatalities)) +
geom_bar(stat="identity")
we have indication that Asia is the highest country with Fatalities, Europe is in the second place
ggplot(data=Continents_means, aes(x=continent, y=Confirmed)) +
geom_bar(stat="identity")
we have indication that Asia is the highest country with infected cases, Europe is in the second place.
ggplot(data=Continents_means, aes(x=continent, y=Difference)) +
geom_bar(stat="identity")
#Time series of the confirmed cases
ggplot(global_data, aes(x=Date, y=ConfirmedCases)) +
geom_line( color="steelblue") +
geom_point() +
xlab("")
#Seprate the confirmed cases and fatalities cases
GlobalConfirmedCasses = aggregate(global_data$ConfirmedCases,list(global_data$Date),sum)
GlobalDeathCases = aggregate(global_data$Fatalities,list(global_data$Date),sum)
#Combine them in a new data frame and rename the columns
GlobalConfimred_Deaths = GlobalConfirmedCasses
GlobalConfimred_Deaths['Totaldeaths'] = GlobalDeathCases$x
colnames(GlobalConfimred_Deaths)[2] <-"casses"
colnames(GlobalConfimred_Deaths)[1] <-"Date"
#Display the head
head(GlobalConfimred_Deaths)
## Date casses Totaldeaths
## 1 2020-01-22 539 17
## 2 2020-01-23 627 18
## 3 2020-01-24 901 25
## 4 2020-01-25 1347 41
## 5 2020-01-26 1959 53
## 6 2020-01-27 2694 79
ggplot(GlobalConfimred_Deaths, aes(x=Date, y=casses)) +
geom_line( color="steelblue") +
geom_point() +
xlab("")
As shown above, the number of cases increase over the time.
ggplot(GlobalConfimred_Deaths, aes(x=Date, y=Totaldeaths)) +
geom_line( color="steelblue") +
geom_point() +
xlab("")
As we have another strong indication that as the time passes both death cases and infection cases increase all around the world. we can indicate a strong relationship between the infection and the deaths as both of the trends have the same shape
#Import another dataset that has useful information about gender development, expected life, the GDP of countries and life expectancy.
general_countries <- read.csv('gender_development.csv')
#Rename columns names
colnames(general_countries)[3] <-"GENDER_DEVELOPMENT"
colnames(general_countries)[6] <-"FEMALE_LIFE_EXPECTANCY_ON_BIRTH"
colnames(general_countries)[7] <-"MALE_LIFE_EXPECTANCY_ON_BIRTH"
colnames(general_countries)[8] <-"FEMALE_YEARS_LEARNING"
colnames(general_countries)[9] <-"MALE_YEARS_LEARNING"
colnames(general_countries)[12] <-"GDP"
#Change some columns to be numeric
general_countries$GENDER_DEVELOPMENT = as.numeric(as.character(general_countries$GENDER_DEVELOPMENT))
## Warning: NAs introduced by coercion
general_countries$ FEMALE_LIFE_EXPECTANCY_ON_BIRTH=as.numeric(as.character(general_countries$FEMALE_LIFE_EXPECTANCY_ON_BIRTH))
## Warning: NAs introduced by coercion
general_countries$ MALE_LIFE_EXPECTANCY_ON_BIRTH= as.numeric(as.character(general_countries$MALE_LIFE_EXPECTANCY_ON_BIRTH))
## Warning: NAs introduced by coercion
general_countries$FEMALE_YEARS_LEARNING = as.numeric(as.character(general_countries$FEMALE_YEARS_LEARNING))
## Warning: NAs introduced by coercion
general_countries$MALE_YEARS_LEARNING = as.numeric(as.character(general_countries$MALE_YEARS_LEARNING))
## Warning: NAs introduced by coercion
general_countries$GDP = as.numeric(as.character(general_countries$GDP))
## Warning: NAs introduced by coercion
#Display the head
head(general_countries)
## GDI.Rank Country GENDER_DEVELOPMENT Human.Development.Index..Female.
## 1 1 Norway 0.996 0.94
## 2 2 Australia 0.976 0.922
## 3 3 Switzerland 0.950 0.898
## 4 4 Denmark 0.977 0.912
## 5 5 Netherlands 0.947 0.893
## 6 6 Germany 0.963 0.901
## Human.Development.Index..Male. FEMALE_LIFE_EXPECTANCY_ON_BIRTH
## 1 0.944 83.6
## 2 0.945 84.5
## 3 0.945 85.0
## 4 0.934 82.2
## 5 0.943 83.3
## 6 0.936 83.3
## MALE_LIFE_EXPECTANCY_ON_BIRTH FEMALE_YEARS_LEARNING MALE_YEARS_LEARNING
## 1 79.5 18.2 16.8
## 2 80.3 20.7 19.7
## 3 80.8 15.7 15.9
## 4 78.3 19.3 18.1
## 5 79.7 18.0 17.9
## 6 78.5 16.3 16.6
## Mean.Years.of.Education..Female. Mean.Years.of.Education..Male. GDP
## 1 12.7 12.5 57140
## 2 13.1 12.9 33688
## 3 11.5 13.1 44132
## 4 12.8 12.7 36439
## 5 11.6 12.2 29500
## 6 12.9 13.8 34886
## Estimated.Gross.National.Income.per.Capita..Male.
## 1 72825
## 2 50914
## 3 69077
## 4 51727
## 5 61641
## 6 53290
#Get the difference of expected education among males and females
general_countries$EXPECTED_EDUCATION_YEARS = (general_countries$FEMALE_YEARS_LEARNING + general_countries$MALE_YEARS_LEARNING)/2
##Get the difference of expected life on birth among males and females
general_countries$EXPECTED_lIFE_ON_BIRTH = (general_countries$FEMALE_LIFE_EXPECTANCY_ON_BIRTH + general_countries$MALE_LIFE_EXPECTANCY_ON_BIRTH )/2
#Add and match GDP, EXPECTED EDUCATION YEARS, EXPECTED LIFE ON BIRTH to the COUNTRIES_MEANS data.
Countries_means$GDP = general_countries$GDP[match(Countries_means$Country, general_countries$Country)]
Countries_means$EXPECTED_EDUCATION_YEARS<-general_countries$EXPECTED_EDUCATION_YEARS[match(Countries_means$Country, Countries_means$Country)]
Countries_means$EXPECTED_lIFE_ON_BIRTH<-general_countries$EXPECTED_lIFE_ON_BIRTH[match(Countries_means$Country, general_countries$Country)]
#Display the head
head(Countries_means)
## Country Fatalities ConfirmedCases population_ration GDP
## 1 Afghanistan 3 363 57.40909 506
## 2 Albania 26 851 99.70697 7217
## 4 Andorra 3 720 164.54274 NA
## 5 Antigua and Barbuda 0 16 217.34989 NA
## 6 Argentina 41 1666 16.08105 14202
## 7 Armenia 0 1370 99.05289 6042
## EXPECTED_EDUCATION_YEARS EXPECTED_lIFE_ON_BIRTH
## 1 17.50 60.40
## 2 20.20 77.90
## 4 15.80 NA
## 5 18.70 76.00
## 6 17.95 76.25
## 7 16.45 74.75
str(Countries_means)
## 'data.frame': 107 obs. of 7 variables:
## $ Country : Factor w/ 163 levels "Afghanistan",..: 1 2 4 5 6 7 11 15 17 18 ...
## $ Fatalities : num 3 26 3 0 41 0 12 0 0 0 ...
## $ ConfirmedCases : num 363 851 720 16 1666 ...
## $ population_ration : num 57.4 99.7 164.5 217.3 16.1 ...
## $ GDP : num 506 7217 NA NA 14202 ...
## $ EXPECTED_EDUCATION_YEARS: num 17.5 20.2 15.8 18.7 17.9 ...
## $ EXPECTED_lIFE_ON_BIRTH : num 60.4 77.9 NA 76 76.2 ...
To calculate the relationship between the continuous variables, we have used the spearman correlation as the data is not normally distributed which is important condition to use the standard correlation (pearson).
cor(Countries_means$ConfirmedCases, Countries_means$GDP, method = "spearman", use = "complete.obs")
## [1] 0.5531839
The Correlation between the GDP (Growth Development Progress) and the infection is : 0.5531839 (medium positive relationship). GDP is an economical definition but I find it to be indication that rich countries have specific features that participate in the spreading of the COVID-19 like : food, education, etc.
cor(Countries_means$ConfirmedCases, Countries_means$EXPECTED_lIFE_ON_BIRTH, method = "spearman", use = "complete.obs")
## [1] 0.5781181
Correlation between the EXPECTED_lIFE_ON_BIRTH and the infection is : 0.5781181 (also medium positive relationship). We can think of the feature EXPECTED_lIFE_ON_BIRTH is an Indecation to the quality of healthcare system in the country, eventhough it has a medium relationship with the spreading.
cor(Countries_means$EXPECTED_EDUCATION_YEARS, Countries_means$ConfirmedCases, method = "spearman", use = "complete.obs")
## [1] -0.09366682
The correlation between the EXPECTED_EDUCATION_YEARS and the infection is : -0.09366682 (weak negative relationship).
#Import another data that has information the gender and age of COVID19 patients
data2 = read.csv('metadata.csv')
#Remve unnecassary features
data2 = data2[c(2,3,4,5)]
head(data2)
## offset sex age finding
## 1 0 M 65 COVID-19
## 2 3 M 65 COVID-19
## 3 5 M 65 COVID-19
## 4 6 M 65 COVID-19
## 5 0 F 52 COVID-19
## 6 5 F 52 COVID-19
data2<-data2[(data2$finding=="COVID-19"),]
#Show the distribution of COVID-19 among males and females
barplot(prop.table(table(data2$sex)))
This barplot shows that males have higher infection rate than females.
hist(data2$age)
People between 40 to 70 years of age are more infected with COVID-19. The highest infection rate is in the age range of 40 to 70 years old, Which makes sense as the immunity becomes weaker as we advance in age.
#more graph to show the relationship between age and gender
ggplot(data2,
aes(x = age,
fill = sex)) +
geom_histogram(position = "stack")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 46 rows containing non-finite values (stat_bin).
#Import another data that has useful information about the usage of fat and dairy supplement all over the countries.
data3 = read.csv('Fat_Supply_Quantity_Data.csv')
#Select important columns only
data3 = data3[c(1,3,4,6,7,8,9,10,11,12,13)]
#Display the data
head(data3)
## Country Animal.Products Animal.fats Cereals...Excluding.Beer
## 1 Afghanistan 21.6397 6.2224 8.0353
## 2 Albania 32.0002 3.4172 2.6734
## 3 Algeria 14.4175 0.8972 4.2035
## 4 Angola 15.3041 1.3130 6.5545
## 5 Antigua and Barbuda 27.7033 4.6686 3.2153
## 6 Argentina 30.3572 3.3076 1.3316
## Eggs Fish..Seafood Fruits...Excluding.Wine Meat Miscellaneous
## 1 0.6859 0.0327 0.4246 6.1244 0.0163
## 2 1.6448 0.1445 0.6418 8.7428 0.0170
## 3 1.2171 0.2008 0.5772 3.8961 0.0439
## 4 0.1539 1.4155 0.3488 11.0268 0.0308
## 5 0.3872 1.5263 1.2177 14.3202 0.0898
## 6 1.5706 0.1664 0.2091 19.2693 0.0000
## Milk...Excluding.Butter Offals
## 1 8.2803 0.3103
## 2 17.7576 0.2933
## 3 8.0934 0.1067
## 4 1.2309 0.1539
## 5 6.6607 0.1347
## 6 5.8512 0.1878
#Rename columns names:
colnames(data3)[2] <-"animal_products"
colnames(data3)[3] <-"animal_fats"
colnames(data3)[4] <-"Cerial_excluding_beer"
colnames(data3)[5] <-"Eggs"
colnames(data3)[6] <-"Fish"
colnames(data3)[7] <-"Fruits"
colnames(data3)[8] <-"Meat"
colnames(data3)[9] <-"Miscellaneous"
colnames(data3)[10] <-"Milk"
#Match and add these columns to our previous data (Countries_means)
Countries_means$animal_products <- data3$animal_products[match(Countries_means$Country, data3$Country)]
Countries_means$animal_fats <- data3$animal_fats[match(Countries_means$Country, data3$Country)]
Countries_means$Cerial_excluding_beer <- data3$Cerial_excluding_beer[match(Countries_means$Country, data3$Country)]
Countries_means$Eggs <- data3$Eggs[match(Countries_means$Country, data3$Country)]
Countries_means$Fish <- data3$Fish[match(Countries_means$Country, data3$Country)]
Countries_means$Fruits <- data3$Fruits[match(Countries_means$Country, data3$Country)]
Countries_means$Meat <- data3$Meat[match(Countries_means$Country, data3$Country)]
Countries_means$Miscellaneous <- data3$Miscellaneous[match(Countries_means$Country, data3$Country)]
Countries_means$Milk <- data3$Milk[match(Countries_means$Country, data3$Country)]
head(Countries_means)
## Country Fatalities ConfirmedCases population_ration GDP
## 1 Afghanistan 3 363 57.40909 506
## 2 Albania 26 851 99.70697 7217
## 4 Andorra 3 720 164.54274 NA
## 5 Antigua and Barbuda 0 16 217.34989 NA
## 6 Argentina 41 1666 16.08105 14202
## 7 Armenia 0 1370 99.05289 6042
## EXPECTED_EDUCATION_YEARS EXPECTED_lIFE_ON_BIRTH animal_products animal_fats
## 1 17.50 60.40 21.6397 6.2224
## 2 20.20 77.90 32.0002 3.4172
## 4 15.80 NA NA NA
## 5 18.70 76.00 27.7033 4.6686
## 6 17.95 76.25 30.3572 3.3076
## 7 16.45 74.75 29.6642 6.2619
## Cerial_excluding_beer Eggs Fish Fruits Meat Miscellaneous Milk
## 1 8.0353 0.6859 0.0327 0.4246 6.1244 0.0163 8.2803
## 2 2.6734 1.6448 0.1445 0.6418 8.7428 0.0170 17.7576
## 4 NA NA NA NA NA NA NA
## 5 3.2153 0.3872 1.5263 1.2177 14.3202 0.0898 6.6607
## 6 1.3316 1.5706 0.1664 0.2091 19.2693 0.0000 5.8512
## 7 2.5068 1.6196 0.2218 0.5468 10.8165 0.0361 10.4709
data4 = read.csv('countries_worldGDP.csv')
head(data4)
## Country Region Population Area..sq..mi..
## 1 Afghanistan ASIA (EX. NEAR EAST) 31056997 647500
## 2 Albania EASTERN EUROPE 3581655 28748
## 3 Algeria NORTHERN AFRICA 32930091 2381740
## 4 American Samoa OCEANIA 57794 199
## 5 Andorra WESTERN EUROPE 71201 468
## 6 Angola SUB-SAHARAN AFRICA 12127071 1246700
## Pop..Density..per.sq..mi.. Coastline..coast.area.ratio. Net.migration
## 1 48,0 0,00 23,06
## 2 124,6 1,26 -4,93
## 3 13,8 0,04 -0,39
## 4 290,4 58,29 -20,71
## 5 152,1 0,00 6,6
## 6 9,7 0,13 0
## Infant.mortality..per.1000.births. GDP....per.capita. Literacy....
## 1 163,07 700 36,0
## 2 21,52 4500 86,5
## 3 31 6000 70,0
## 4 9,27 8000 97,0
## 5 4,05 19000 100,0
## 6 191,19 1900 42,0
## Phones..per.1000. Arable.... Crops.... Other.... Climate Birthrate Deathrate
## 1 3,2 12,13 0,22 87,65 1 46,6 20,34
## 2 71,2 21,09 4,42 74,49 3 15,11 5,22
## 3 78,1 3,22 0,25 96,53 1 17,14 4,61
## 4 259,5 10 15 75 2 22,46 3,27
## 5 497,2 2,22 0 97,78 3 8,71 6,25
## 6 7,8 2,41 0,24 97,35 45,11 24,2
## Agriculture Industry Service
## 1 0,38 0,24 0,38
## 2 0,232 0,188 0,579
## 3 0,101 0,6 0,298
## 4
## 5
## 6 0,096 0,658 0,246
#Print the srtucture of the data.
str(data4)
## 'data.frame': 227 obs. of 20 variables:
## $ Country : Factor w/ 227 levels "Afghanistan ",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Region : Factor w/ 11 levels "ASIA (EX. NEAR EAST) ",..: 1 4 7 9 11 10 5 5 5 3 ...
## $ Population : int 31056997 3581655 32930091 57794 71201 12127071 13477 69108 39921833 2976372 ...
## $ Area..sq..mi.. : int 647500 28748 2381740 199 468 1246700 102 443 2766890 29800 ...
## $ Pop..Density..per.sq..mi.. : Factor w/ 219 levels "0,0","1,0","1,8",..: 149 24 31 103 49 209 36 52 40 219 ...
## $ Coastline..coast.area.ratio. : Factor w/ 151 levels "0,00","0,01",..: 1 53 4 129 1 11 130 111 15 1 ...
## $ Net.migration : Factor w/ 158 levels "","-0,02","-0,03",..: 138 79 20 64 154 90 117 82 98 83 ...
## $ Infant.mortality..per.1000.births.: Factor w/ 221 levels "","10,03","10,09",..: 38 63 96 208 104 49 60 46 33 66 ...
## $ GDP....per.capita. : int 700 4500 6000 8000 19000 1900 8600 11000 11200 3500 ...
## $ Literacy.... : Factor w/ 141 levels "","100,0","17,6",..: 7 84 52 120 2 14 113 90 121 132 ...
## $ Phones..per.1000. : Factor w/ 215 levels "","0,2","1,3",..: 91 190 196 74 151 187 137 162 63 51 ...
## $ Arable.... : Factor w/ 204 levels "","0","0,02",..: 40 101 129 25 81 83 2 73 42 69 ...
## $ Crops.... : Factor w/ 163 levels "","0","0,01",..: 17 134 20 88 2 19 2 137 31 104 ...
## $ Other.... : Factor w/ 210 levels "","100","33,33",..: 119 63 173 66 187 183 2 75 116 84 ...
## $ Climate : Factor w/ 7 levels "","1","1,5","2",..: 2 6 2 4 6 1 4 4 6 7 ...
## $ Birthrate : Factor w/ 221 levels "","10","10,02",..: 194 52 73 109 201 189 47 71 69 29 ...
## $ Deathrate : Factor w/ 202 levels "","10,01","10,13",..: 59 101 87 70 127 66 104 106 159 171 ...
## $ Agriculture : Factor w/ 151 levels "","0","0,001",..: 133 106 61 1 1 57 31 30 56 109 ...
## $ Industry : Factor w/ 156 levels "","0,02","0,032",..: 55 33 151 1 1 153 32 47 113 108 ...
## $ Service : Factor w/ 168 levels "","0,062","0,177",..: 28 90 13 1 1 7 154 143 77 44 ...
#Change the Country column in Data4 and Countries_means to character.
Countries_means$Country <- as.character(Countries_means$Country)
data4$Country <- as.character(data4$Country)
#Rename column 5
colnames(data4)[5] <-"population_density"
#Match and merge population_desity column to countries_means data
Countries_means$population_density<- data4$population_density[match(Countries_means$Country, data4$Country)]
data4$Country[2] == Countries_means$Country[2]
## [1] FALSE
head(Countries_means)
## Country Fatalities ConfirmedCases population_ration GDP
## 1 Afghanistan 3 363 57.40909 506
## 2 Albania 26 851 99.70697 7217
## 4 Andorra 3 720 164.54274 NA
## 5 Antigua and Barbuda 0 16 217.34989 NA
## 6 Argentina 41 1666 16.08105 14202
## 7 Armenia 0 1370 99.05289 6042
## EXPECTED_EDUCATION_YEARS EXPECTED_lIFE_ON_BIRTH animal_products animal_fats
## 1 17.50 60.40 21.6397 6.2224
## 2 20.20 77.90 32.0002 3.4172
## 4 15.80 NA NA NA
## 5 18.70 76.00 27.7033 4.6686
## 6 17.95 76.25 30.3572 3.3076
## 7 16.45 74.75 29.6642 6.2619
## Cerial_excluding_beer Eggs Fish Fruits Meat Miscellaneous Milk
## 1 8.0353 0.6859 0.0327 0.4246 6.1244 0.0163 8.2803
## 2 2.6734 1.6448 0.1445 0.6418 8.7428 0.0170 17.7576
## 4 NA NA NA NA NA NA NA
## 5 3.2153 0.3872 1.5263 1.2177 14.3202 0.0898 6.6607
## 6 1.3316 1.5706 0.1664 0.2091 19.2693 0.0000 5.8512
## 7 2.5068 1.6196 0.2218 0.5468 10.8165 0.0361 10.4709
## population_density
## 1 <NA>
## 2 <NA>
## 4 <NA>
## 5 <NA>
## 6 <NA>
## 7 <NA>
#I have combined these datasets (data4 and Countries_mean) using the countries names, one of the datasets has misspelled in the countries names so I built a function to fixand match the country name.
i=1
for (c1 in Countries_means$Country){
c_chars = strsplit(c1,'')
c_chars=c_chars[[1]]
true_numbers = list()
for (c2 in data4$Country){
c2_chars = strsplit(c2,'')
c2_chars=c2_chars[[1]]
if (length(c_chars) >= length(c2_chars)-2){
if (((c_chars[1] == c2_chars[1]) & (c_chars[2] == c2_chars[2]) ) || ((c_chars[3] == c2_chars[3]) & (c_chars[2] == c2_chars[2]))){
counter = 0
for (char in c2_chars){
setuation =is.null(c_chars[char])
if (setuation == FALSE ){
counter = counter+1
}
}
}
}
true_numbers[[i]] <- counter
replacement_index = match(c(max(unlist(true_numbers))),true_numbers)
}
replacement_word = data4$Country[replacement_index]
Countries_means$Country[i]= replacement_word
i=i+1
}
#Match and merge columns from data4 to Countries_mean data
Countries_means$population_density<- data4$population_density[match(Countries_means$Country, data4$Country)]
Countries_means$population_density <- gsub(',', '.', Countries_means$population_density)
Countries_means$Literacy<- data4$Literacy[match(Countries_means$Country, data4$Country)]
Countries_means$Literacy <- gsub(',', '.', Countries_means$Literacy)
Countries_means$Climate<- data4$Climate[match(Countries_means$Country, data4$Country)]
Countries_means$Climate <- gsub(',', '.', Countries_means$Climate)
Countries_means$Industry<- data4$Industry[match(Countries_means$Country, data4$Country)]
Countries_means$Industry <- gsub(',', '.', Countries_means$Industry)
Countries_means$Agriculture<- data4$Agriculture[match(Countries_means$Country, data4$Country)]
Countries_means$Agriculture <- gsub(',', '.', Countries_means$Agriculture)
#Change the Literacy, population_density, Climate, Industry, and Agriculture columns to be numeric
Countries_means$Literacy = as.numeric(as.character(Countries_means$Literacy))
Countries_means$population_density = as.numeric(as.character(Countries_means$population_density))
Countries_means$Climate = as.numeric(as.character(Countries_means$Climate))
Countries_means$Industry = as.numeric(as.character(Countries_means$Industry))
Countries_means$Agriculture = as.numeric(as.character(Countries_means$Agriculture))
#Rename some columns
colnames(data4)[10] <-"Literacy"
colnames(data4)[15] <-"Climate"
colnames(data4)[19] <-"Industry"
colnames(data4)[18] <-"Agriculture"
head(Countries_means)
## Country Fatalities ConfirmedCases population_ration GDP
## 1 Afghanistan 3 363 57.40909 506
## 2 Albania 26 851 99.70697 7217
## 4 Algeria 3 720 164.54274 NA
## 5 American Samoa 0 16 217.34989 NA
## 6 Andorra 41 1666 16.08105 14202
## 7 Angola 0 1370 99.05289 6042
## EXPECTED_EDUCATION_YEARS EXPECTED_lIFE_ON_BIRTH animal_products animal_fats
## 1 17.50 60.40 21.6397 6.2224
## 2 20.20 77.90 32.0002 3.4172
## 4 15.80 NA NA NA
## 5 18.70 76.00 27.7033 4.6686
## 6 17.95 76.25 30.3572 3.3076
## 7 16.45 74.75 29.6642 6.2619
## Cerial_excluding_beer Eggs Fish Fruits Meat Miscellaneous Milk
## 1 8.0353 0.6859 0.0327 0.4246 6.1244 0.0163 8.2803
## 2 2.6734 1.6448 0.1445 0.6418 8.7428 0.0170 17.7576
## 4 NA NA NA NA NA NA NA
## 5 3.2153 0.3872 1.5263 1.2177 14.3202 0.0898 6.6607
## 6 1.3316 1.5706 0.1664 0.2091 19.2693 0.0000 5.8512
## 7 2.5068 1.6196 0.2218 0.5468 10.8165 0.0361 10.4709
## population_density Literacy Climate Industry Agriculture
## 1 48.0 36.0 1 0.240 0.380
## 2 124.6 86.5 3 0.188 0.232
## 4 13.8 70.0 1 0.600 0.101
## 5 290.4 97.0 2 NA NA
## 6 152.1 100.0 3 NA NA
## 7 9.7 42.0 NA 0.658 0.096
i=1
for ( c in Countries_means ){
col_name= names(Countries_means[i])
if (col_name == "Country"){
print('text')
}else{
plot(data = Countries_means, x=c, y=Countries_means$mean_infection, ,
xlab=col_name, ylab="mean_infection", pch=19)
}
i=i+1
}
## [1] "text"
## Warning in plot.window(...): "data" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "data" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in box(...): "data" is not a graphical parameter
## Warning in title(...): "data" is not a graphical parameter
## Warning in plot.window(...): "data" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "data" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in box(...): "data" is not a graphical parameter
## Warning in title(...): "data" is not a graphical parameter
## Warning in plot.window(...): "data" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "data" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in box(...): "data" is not a graphical parameter
## Warning in title(...): "data" is not a graphical parameter
## Warning in plot.window(...): "data" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "data" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in box(...): "data" is not a graphical parameter
## Warning in title(...): "data" is not a graphical parameter
## Warning in plot.window(...): "data" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "data" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in box(...): "data" is not a graphical parameter
## Warning in title(...): "data" is not a graphical parameter
## Warning in plot.window(...): "data" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "data" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in box(...): "data" is not a graphical parameter
## Warning in title(...): "data" is not a graphical parameter
## Warning in plot.window(...): "data" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "data" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in box(...): "data" is not a graphical parameter
## Warning in title(...): "data" is not a graphical parameter
## Warning in plot.window(...): "data" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "data" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in box(...): "data" is not a graphical parameter
## Warning in title(...): "data" is not a graphical parameter
## Warning in plot.window(...): "data" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "data" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in box(...): "data" is not a graphical parameter
## Warning in title(...): "data" is not a graphical parameter
## Warning in plot.window(...): "data" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "data" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in box(...): "data" is not a graphical parameter
## Warning in title(...): "data" is not a graphical parameter
## Warning in plot.window(...): "data" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "data" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in box(...): "data" is not a graphical parameter
## Warning in title(...): "data" is not a graphical parameter
## Warning in plot.window(...): "data" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "data" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in box(...): "data" is not a graphical parameter
## Warning in title(...): "data" is not a graphical parameter
## Warning in plot.window(...): "data" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "data" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in box(...): "data" is not a graphical parameter
## Warning in title(...): "data" is not a graphical parameter
## Warning in plot.window(...): "data" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "data" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in box(...): "data" is not a graphical parameter
## Warning in title(...): "data" is not a graphical parameter
## Warning in plot.window(...): "data" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "data" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in box(...): "data" is not a graphical parameter
## Warning in title(...): "data" is not a graphical parameter
## Warning in plot.window(...): "data" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "data" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in box(...): "data" is not a graphical parameter
## Warning in title(...): "data" is not a graphical parameter
## Warning in plot.window(...): "data" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "data" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in box(...): "data" is not a graphical parameter
## Warning in title(...): "data" is not a graphical parameter
## Warning in plot.window(...): "data" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "data" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in box(...): "data" is not a graphical parameter
## Warning in title(...): "data" is not a graphical parameter
## Warning in plot.window(...): "data" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "data" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in box(...): "data" is not a graphical parameter
## Warning in title(...): "data" is not a graphical parameter
## Warning in plot.window(...): "data" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "data" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter
## Warning in box(...): "data" is not a graphical parameter
## Warning in title(...): "data" is not a graphical parameter
corelations = data.frame(cor(Countries_means[,2:21], use = "complete.obs"))
corelations
## Fatalities ConfirmedCases population_ration
## Fatalities 1.000000000 0.51477105 0.22842732
## ConfirmedCases 0.514771048 1.00000000 0.10699096
## population_ration 0.228427325 0.10699096 1.00000000
## GDP 0.004612264 0.39114807 -0.14319804
## EXPECTED_EDUCATION_YEARS 0.152101518 -0.04175775 0.14187334
## EXPECTED_lIFE_ON_BIRTH 0.227465578 0.39880304 0.04830087
## animal_products 0.115622879 0.29824625 -0.20657686
## animal_fats 0.234887692 0.34031001 -0.01724759
## Cerial_excluding_beer -0.091364889 -0.20224662 -0.02285233
## Eggs 0.323032099 0.47562183 -0.12070160
## Fish -0.070970880 -0.04374120 0.12217717
## Fruits 0.030825961 -0.02368161 0.50905317
## Meat -0.138464053 0.03406397 -0.25905295
## Miscellaneous -0.285431616 -0.33117262 -0.01124536
## Milk 0.161369435 0.24559219 -0.14617169
## population_density 0.031061822 -0.13482838 -0.10540466
## Literacy 0.198867355 -0.22306465 0.12254749
## Climate -0.041080931 -0.05753895 -0.03410104
## Industry -0.037573387 0.05587653 0.13039275
## Agriculture -0.056119568 0.15620824 -0.13626929
## GDP EXPECTED_EDUCATION_YEARS
## Fatalities 0.004612264 0.152101518
## ConfirmedCases 0.391148072 -0.041757748
## population_ration -0.143198036 0.141873336
## GDP 1.000000000 -0.123517994
## EXPECTED_EDUCATION_YEARS -0.123517994 1.000000000
## EXPECTED_lIFE_ON_BIRTH 0.639342069 -0.006362098
## animal_products 0.636156105 0.041472736
## animal_fats 0.632141691 0.052615903
## Cerial_excluding_beer -0.550717401 0.006771626
## Eggs 0.451255122 -0.029103173
## Fish -0.043653181 0.174997955
## Fruits -0.143637764 0.081346342
## Meat 0.483572190 -0.089061015
## Miscellaneous 0.050856242 0.086024322
## Milk 0.229459300 0.111238702
## population_density -0.182096219 0.032229351
## Literacy -0.160245341 -0.007845262
## Climate -0.080634415 -0.313575888
## Industry 0.053246360 -0.084248559
## Agriculture 0.224726528 0.028569313
## EXPECTED_lIFE_ON_BIRTH animal_products animal_fats
## Fatalities 0.227465578 0.115622879 0.23488769
## ConfirmedCases 0.398803044 0.298246254 0.34031001
## population_ration 0.048300867 -0.206576864 -0.01724759
## GDP 0.639342069 0.636156105 0.63214169
## EXPECTED_EDUCATION_YEARS -0.006362098 0.041472736 0.05261590
## EXPECTED_lIFE_ON_BIRTH 1.000000000 0.650711087 0.51370596
## animal_products 0.650711087 1.000000000 0.66895504
## animal_fats 0.513705963 0.668955041 1.00000000
## Cerial_excluding_beer -0.501999114 -0.545771653 -0.43232842
## Eggs 0.612926076 0.496072977 0.36081344
## Fish -0.005158596 -0.166379300 -0.22311398
## Fruits -0.033385147 -0.159875677 -0.18341499
## Meat 0.370810680 0.768378696 0.27708505
## Miscellaneous 0.080494813 0.009060422 -0.24000336
## Milk 0.460083085 0.684283351 0.25545016
## population_density -0.178032351 -0.024204581 -0.13608843
## Literacy -0.126622141 -0.191933522 -0.16780668
## Climate -0.029880097 -0.092352501 -0.06786627
## Industry 0.107450012 -0.152558822 -0.13437955
## Agriculture 0.143644016 0.302028406 0.30399549
## Cerial_excluding_beer Eggs Fish
## Fatalities -0.091364889 0.32303210 -0.070970880
## ConfirmedCases -0.202246622 0.47562183 -0.043741196
## population_ration -0.022852332 -0.12070160 0.122177170
## GDP -0.550717401 0.45125512 -0.043653181
## EXPECTED_EDUCATION_YEARS 0.006771626 -0.02910317 0.174997955
## EXPECTED_lIFE_ON_BIRTH -0.501999114 0.61292608 -0.005158596
## animal_products -0.545771653 0.49607298 -0.166379300
## animal_fats -0.432328425 0.36081344 -0.223113982
## Cerial_excluding_beer 1.000000000 -0.19203031 0.120735933
## Eggs -0.192030309 1.00000000 -0.127679256
## Fish 0.120735933 -0.12767926 1.000000000
## Fruits -0.024232451 -0.06463785 0.029067410
## Meat -0.413265075 0.26803110 0.029577488
## Miscellaneous -0.079993779 -0.29554144 0.358335493
## Milk -0.343454377 0.33095296 -0.381545811
## population_density 0.075740284 -0.18282827 0.004689693
## Literacy 0.070018502 0.02665923 0.059133829
## Climate 0.073046614 0.08781734 -0.139383408
## Industry 0.138327045 0.16375622 -0.015219924
## Agriculture -0.246519772 -0.03194723 -0.075089484
## Fruits Meat Miscellaneous Milk
## Fatalities 0.030825961 -0.13846405 -0.285431616 0.161369435
## ConfirmedCases -0.023681610 0.03406397 -0.331172615 0.245592188
## population_ration 0.509053174 -0.25905295 -0.011245357 -0.146171693
## GDP -0.143637764 0.48357219 0.050856242 0.229459300
## EXPECTED_EDUCATION_YEARS 0.081346342 -0.08906102 0.086024322 0.111238702
## EXPECTED_lIFE_ON_BIRTH -0.033385147 0.37081068 0.080494813 0.460083085
## animal_products -0.159875677 0.76837870 0.009060422 0.684283351
## animal_fats -0.183414989 0.27708505 -0.240003357 0.255450165
## Cerial_excluding_beer -0.024232451 -0.41326507 -0.079993779 -0.343454377
## Eggs -0.064637854 0.26803110 -0.295541444 0.330952960
## Fish 0.029067410 0.02957749 0.358335493 -0.381545811
## Fruits 1.000000000 -0.12064236 -0.092726535 -0.049766803
## Meat -0.120642362 1.00000000 0.294378212 0.260681075
## Miscellaneous -0.092726535 0.29437821 1.000000000 -0.119880776
## Milk -0.049766803 0.26068108 -0.119880776 1.000000000
## population_density -0.004744694 -0.02727151 -0.076801254 0.124852262
## Literacy 0.139117067 -0.10468484 0.059751041 -0.167671678
## Climate 0.013484015 -0.11410168 -0.178702431 0.001650653
## Industry -0.016064694 -0.10385136 0.143579939 -0.114904691
## Agriculture -0.106645786 0.21558103 -0.048683724 0.161394634
## population_density Literacy Climate
## Fatalities 0.031061822 0.198867355 -0.041080931
## ConfirmedCases -0.134828382 -0.223064652 -0.057538953
## population_ration -0.105404656 0.122547491 -0.034101043
## GDP -0.182096219 -0.160245341 -0.080634415
## EXPECTED_EDUCATION_YEARS 0.032229351 -0.007845262 -0.313575888
## EXPECTED_lIFE_ON_BIRTH -0.178032351 -0.126622141 -0.029880097
## animal_products -0.024204581 -0.191933522 -0.092352501
## animal_fats -0.136088435 -0.167806680 -0.067866267
## Cerial_excluding_beer 0.075740284 0.070018502 0.073046614
## Eggs -0.182828272 0.026659228 0.087817335
## Fish 0.004689693 0.059133829 -0.139383408
## Fruits -0.004744694 0.139117067 0.013484015
## Meat -0.027271508 -0.104684836 -0.114101675
## Miscellaneous -0.076801254 0.059751041 -0.178702431
## Milk 0.124852262 -0.167671678 0.001650653
## population_density 1.000000000 0.063834181 -0.050977052
## Literacy 0.063834181 1.000000000 0.365118244
## Climate -0.050977052 0.365118244 1.000000000
## Industry -0.180273709 0.115843930 -0.061973541
## Agriculture -0.152581035 -0.747046522 -0.204220518
## Industry Agriculture
## Fatalities -0.03757339 -0.05611957
## ConfirmedCases 0.05587653 0.15620824
## population_ration 0.13039275 -0.13626929
## GDP 0.05324636 0.22472653
## EXPECTED_EDUCATION_YEARS -0.08424856 0.02856931
## EXPECTED_lIFE_ON_BIRTH 0.10745001 0.14364402
## animal_products -0.15255882 0.30202841
## animal_fats -0.13437955 0.30399549
## Cerial_excluding_beer 0.13832705 -0.24651977
## Eggs 0.16375622 -0.03194723
## Fish -0.01521992 -0.07508948
## Fruits -0.01606469 -0.10664579
## Meat -0.10385136 0.21558103
## Miscellaneous 0.14357994 -0.04868372
## Milk -0.11490469 0.16139463
## population_density -0.18027371 -0.15258104
## Literacy 0.11584393 -0.74704652
## Climate -0.06197354 -0.20422052
## Industry 1.00000000 -0.31631339
## Agriculture -0.31631339 1.00000000
#write.table(corrlations, "D:/mydata.csv", sep=",")
#aggregate(Countries_means$ConfirmedCases,list(Countries_means$Climate),sum)
#Remove some columns with weak relationship to the output.
#Countries_means = Countries_means[-c(2,3,12,13,14,17,19,20,21)]
#Replace the NA values with the mean.
#NA2mean <- function(x) replace(x, is.na(x), mean(x, na.rm = TRUE))
#Countries_means[2:12]=replace(Countries_means[2:12], TRUE, sapply(Countries_means[2:12], NA2mean))
#Sum on NA values
#sum(is.na(Countries_means))
#Import the data
final_data2 = read.csv('master_dataset.csv' ,header = FALSE)
#Display the fist few rows of the data
head(final_data2)
## V1 V2 V3 V4 V5 V6 V7 V8
## 1 survey_date region country ip_latitude ip_longitude ip_accuracy sex age
## 2 3/26/2020 <NA> CA 43.7626 -79.4654 100 male 20_30
## 3 3/25/2020 <NA> CA 43.244 -79.8536 100 male 80_90
## 4 3/25/2020 <NA> CA 51.1195 -113.9604 5 female 30_40
## 5 3/26/2020 <NA> CA 45.518 -73.6985 5 male 60_70
## 6 3/25/2020 <NA> CA 48.4194 -123.4261 1 female 50_60
## V9 V10 V11 V12 V13 V14 V15 V16 V17
## 1 height weight bmi blood_type smoking alcohol cannabis amphetamines cocaine
## 2 178 88 27.7 bn
## 3 184 94 27.7 an
## 4 158 54 21.6 unknown
## 5 172 96 32.4 unknown
## 6 166 82 29.7 unknown
## V18 V19 V20 V21 V22 V23
## 1 lsd mdma contacts_count house_count text_working rate_government_action
## 2 4 1
## 3 4 1
## 4 2 0
## 5 5 0
## 6 2 1
## V24 V25 V26
## 1 rate_reducing_risk_single rate_reducing_risk_house rate_reducing_mask
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## V27 V28 V29 V30 V31
## 1 covid19_positive covid19_symptoms covid19_contact asthma kidney_disease
## 2 0 0 0 0 0
## 3 0 0 0 0 0
## 4 0 0 0 0 0
## 5 0 0 0 0 0
## 6 0 0 0 0 0
## V32 V33 V34 V35 V36
## 1 compromised_immune heart_disease lung_disease diabetes hiv_positive
## 2 0 0 0 0 0
## 3 1 0 1 1 0
## 4 0 0 0 0 0
## 5 0 0 0 1 0
## 6 1 0 0 0 0
## V37 V38 V39 V40
## 1 hypertension other_chronic prescription_medication opinion_infection
## 2 0 0
## 3 1 0
## 4 0 0
## 5 1 0
## 6 0 1
## V41 V42 V43
## 1 opinion_mortality risk_infection risk_mortality
## 2 9 0.5
## 3 9 64.824
## 4 9 0.5
## 5 9 21.658
## 6 9 12.061
#any values with empty string as NA should be removed
final_data2 = subset(final_data2, V7!="" || V8!=""|| V9!="" || V10!="" || V11!="" || V12!="" || V13!="" || V14!="" || V15!="" || V16!="" || V17!="" || V18!="" || V19!="" || V21!="" || V27!="" || V30!="" || V31!="" || V32!="" || V33!="" || V34!="" || V35!="" || V36!="" || V37!="" || V38!="")
#any values with empty string as unknown should be removed
final_data2 = subset(final_data2, V12!="unknown")
#Create a function to substitue the header
header_rename <- function(df) {
names(df) <- as.character(unlist(df[1,]))
df[-1,]
}
final_data2=header_rename(final_data2)
final_data2 = subset(final_data2, smoking!="")
#Import another data that has almost the same features but from json file.
result = fromJSON(file ="convertcsvy.json")
final_data <- do.call(rbind, result)
json_data_frame <- as.data.frame(final_data)
final_data2 <- rbind(final_data2, json_data_frame)
## Warning in `[<-.factor`(`*tmp*`, ri, value = list(`1` = "NA", `2` = "NA", :
## invalid factor level, NA generated
## Warning in `[<-.factor`(`*tmp*`, ri, value = list(`1` = "NA", `2` = "NA", :
## invalid factor level, NA generated
## Warning in `[<-.factor`(`*tmp*`, ri, value = list(`1` = "NA", `2` = "NA", :
## invalid factor level, NA generated
## Warning in `[<-.factor`(`*tmp*`, ri, value = list(`1` = "NA", `2` = "NA", :
## invalid factor level, NA generated
## Warning in `[<-.factor`(`*tmp*`, ri, value = list(`1` = "NA", `2` = "NA", :
## invalid factor level, NA generated
## Warning in `[<-.factor`(`*tmp*`, ri, value = list(`1` = "NA", `2` = "NA", :
## invalid factor level, NA generated
#Remove un-important/irrelevant features
final_data2 = final_data2[-c(1,2,4,5,6,44,20,22,23,24,25,26,29,39,40,41,42,43)]
#change all values to string
bob <- data.frame(lapply(final_data2, as.character), stringsAsFactors=FALSE)
# Divide the positive and negative cases
final_data2_p = final_data2[final_data2$covid19_positive =="1",]
final_data2_n = final_data2[final_data2$covid19_positive =="0",]
#Reduce the negative cases to 770. Negative cases are very large than positive cases which will cause biase in the data due to in quality in number of psotive and negative cases.
final_data2_n = final_data2_n[1:770,]
new_data <- rbind(final_data2_p, final_data2_n)
#Plot histogram
plotHistFunc <- function(x, na.rm = TRUE) {
nm <- names(x)
for (i in seq_along(nm)) {
print(ggplot(x,aes_string(x = nm[i],fill = "covid19_positive")) + geom_bar()) }
}
plotHistFunc(new_data)
#change height, weight, and bmi to numberic. It's important to change it to character first before numberic to aviod mistakes.
new_data$height = as.numeric(as.character(new_data$height))
new_data$weight = as.numeric(as.character(new_data$weight))
new_data$bmi = as.numeric(as.character(new_data$bmi))
new_data$covid19_positive <- factor(new_data$covid19_positive)
new_data$sex = as.factor(as.numeric(new_data$sex))
new_data$blood_type = as.factor(as.numeric(new_data$blood_type))
new_data$smoking = as.factor(as.numeric(new_data$smoking))
new_data$alcohol = as.factor(as.numeric(new_data$alcohol))
new_data$cannabis = as.factor(as.numeric(new_data$cannabis))
new_data$amphetamines = as.factor(as.numeric(new_data$amphetamines))
new_data$cocaine = as.factor(as.numeric(new_data$cocaine))
new_data$age = as.factor(as.numeric(new_data$age))
#Scalling and dummy variables
new_data2 = new_data
new_data2$height = scale(new_data2$height)
new_data2$weight = scale(new_data2$weight)
new_data2$bmi = scale(new_data2$bmi)
new_data2$cannabis = scale(as.numeric(new_data2$cannabis))
new_data2$amphetamines = scale(as.numeric(new_data2$amphetamines))
new_data2$cocaine = scale(as.numeric(new_data2$cocaine))
new_data2$mdma = scale(as.numeric(as.character(new_data2$mdma)))
new_data2$lsd = scale(as.numeric(as.character(new_data2$lsd)))
#Dummy variables
new_data2 <- fastDummies::dummy_cols(new_data2)
#Remove categorical features after we did the dummy variables.
new_data2 = new_data2[-c(1,2,3,7,8,9,14,15,16,17,18,19,20,21,22,23,24,25,26)]
new_data2 = new_data2[-c(1,2,6,7)]
#names(new_data2)
new_data3 = new_data2
new_data2 =new_data3
new_data2 = new_data2 [-c(241)]
## set the seed to make your partition reproducible
new_data2$covid19_positive_1 = as.factor(as.character(new_data2$covid19_positive_1 ))
#install.packages('caTools')
library(caTools)
smp_size <- floor(0.8 * nrow(new_data2))
## set the seed to make your partition reproducible
set.seed(123)
train_ind <- sample(seq_len(nrow(new_data2)), size = smp_size)
train <- new_data2[train_ind, ]
test <- new_data2[-train_ind, ]
#install.packages('caret')
library(caret)
library(randomForest)
#Random Forest Classifier
rf <- randomForest(
covid19_positive_1 ~ .,
data=train,
importance=TRUE,
prOximity=TRUE,
na.action=na.roughfix
)
# making predictions of x_test
y_pred = predict(rf ,test )
# producing the confusion Matrix
confusionMatrix(y_pred, test$covid19_positive)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 135 82
## 1 12 79
##
## Accuracy : 0.6948
## 95% CI : (0.6401, 0.7458)
## No Information Rate : 0.5227
## P-Value [Acc > NIR] : 5.981e-10
##
## Kappa : 0.4008
##
## Mcnemar's Test P-Value : 1.105e-12
##
## Sensitivity : 0.9184
## Specificity : 0.4907
## Pos Pred Value : 0.6221
## Neg Pred Value : 0.8681
## Prevalence : 0.4773
## Detection Rate : 0.4383
## Detection Prevalence : 0.7045
## Balanced Accuracy : 0.7045
##
## 'Positive' Class : 0
##
https://www.geonames.org/countries/
Data source:https://www.kaggle.com/ahmedmd/corona-observations?select=convertcsvy.json
Data source: https://www.covid19survivalcalculator.com/en/download