From our data source we have two datasets: one with observations of individuals from different countries, and another with country-level indices such as population, GDP and life expectancy.

# Show the first rows of each input table before combining them.
# NOTE(review): presumably Data holds the individual-level observations and
# Info.by.state the country-level indices -- confirm against the data source.
head(Data)
head(Info.by.state)

Q1: Create a new dataset in which all of this information is displayed.

# Combine the two tables on their shared "Country" column. A full join keeps
# the rows of both inputs and fills columns without a match with NA.
New_data <- dplyr::full_join(Data, Info.by.state, by = "Country")
head(New_data)

# View() opens an interactive data viewer. Only call it in an interactive
# session so that rendering the notebook non-interactively (e.g. knitting to
# a Word document) does not depend on a viewer being available.
if (interactive()) {
  View(New_data)
}

The function below (`GGally::ggpairs`) is not working because the data contain missing values: observations with NA are removed, which leaves too few observations. So we need to deal with the missing values (NA) first.

library(ggplot2)
# Attempted pairs plot coloured by country (left disabled because it fails):
# GGally::ggpairs(New_data, mapping = aes(color = Country), na.omit = TRUE)
# Error in cor.test.default(x, y, method = method, use = use) :
#   not enough finite observations

# Row positions at which the Age variable is NA.
which(is.na(New_data[["Age"]]))
[1]  7 21
# Goal: fill the missing Age values with the average age.
# Step 1: store the positions of the missing Age values in an index vector.
Ind_1 <- which(is.na(New_data[["Age"]]))

# Step 2: substitute the NAs with the average of the Age variable.
# The mean is taken over the whole column, NAs included, so the value
# written back is itself NA (see the printed result).
New_data[["Age"]][Ind_1] <- mean(New_data[["Age"]])
New_data[["Age"]][Ind_1]
[1] NA NA

Q: What is happening? A: Some functions, such as mean(), return NA when the input contains missing observations. So we need to add the argument na.rm = TRUE.

# Replace the missing Age values with the mean of the observed ages;
# na.rm = TRUE drops the NAs before averaging.
New_data[["Age"]][Ind_1] <- mean(New_data[["Age"]], na.rm = TRUE)
New_data[["Age"]][Ind_1]
[1] 49.07143 49.07143

We will do the same for the other variables that have missing values.

# Index vector with the positions of the missing Salary values; these are the
# entries to be substituted with the average of the Salary variable.
Ind_2 <- which(is.na(New_data[["Salary"]]))
Ind_2
[1]  5 11 17 25
# Replace the missing Salary values with the mean of the observed salaries.
# The target must be the Salary column: Ind_2 indexes the NAs of Salary, so
# writing to any other column would overwrite valid data there and leave the
# Salary NAs in place.
New_data$Salary[Ind_2] <- mean(New_data$Salary, na.rm = TRUE)
New_data$Salary[Ind_2]
[1] 62230.77 62230.77 62230.77 62230.77
# Index vector with the positions of the missing Purchased values; any such
# entries would be substituted with the mode of the Purchased variable.
Ind_3 <- which(is.na(New_data[["Purchased"]]))
Ind_3
integer(0)

Another way of showing the positions of the missing values (NA) in all variables of the dataset:

# For every column (MARGIN = 2) of the logical NA matrix, report the row
# positions that are missing.
apply(X = is.na(New_data), MARGIN = 2, FUN = which)
$Country
integer(0)

$Age
integer(0)

$Salary
[1]  5 11 17 25

$Purchased
integer(0)

$Pop
integer(0)

$GDP
integer(0)

$LifeEx
integer(0)

Unfortunately, the function calls commented out below are not working because of the small number of observations:

# Pairs plots coloured by a grouping variable (left disabled because they fail):
# GGally::ggpairs(New_data, mapping = aes(color = Country))
# GGally::ggpairs(New_data[, c(2, 3)], mapping = aes(color = Purchased))
#
# Error in cor.test.default(x, y, method = method, use = use) :
#   not enough finite observations

# Pairs plot of all variables without a colour grouping.
GGally::ggpairs(data = New_data)

 plot: [1,1] [>---------------------------------------------------------------]  2% est: 0s 
 plot: [1,2] [==>-------------------------------------------------------------]  4% est: 2s Warning: Removed 2 rows containing non-finite values (stat_boxplot).

 plot: [1,3] [===>------------------------------------------------------------]  6% est: 4s Warning: Removed 4 rows containing non-finite values (stat_boxplot).

 plot: [1,4] [====>-----------------------------------------------------------]  8% est: 4s 
 plot: [1,5] [======>---------------------------------------------------------] 10% est: 4s 
 plot: [1,6] [=======>--------------------------------------------------------] 12% est: 4s 
 plot: [1,7] [========>-------------------------------------------------------] 14% est: 4s 
 plot: [2,1] [=========>------------------------------------------------------] 16% est: 5s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 2 rows containing non-finite values (stat_bin).

 plot: [2,2] [===========>----------------------------------------------------] 18% est: 6s Warning: Removed 2 rows containing non-finite values (stat_density).

 plot: [2,3] [============>---------------------------------------------------] 20% est: 6s Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm,  :
  Removed 6 rows containing missing values

 plot: [2,4] [=============>--------------------------------------------------] 22% est: 5s Warning: Removed 2 rows containing non-finite values (stat_boxplot).

 plot: [2,5] [===============>------------------------------------------------] 24% est: 5s Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm,  :
  Removed 2 rows containing missing values

 plot: [2,6] [================>-----------------------------------------------] 27% est: 5s Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm,  :
  Removed 2 rows containing missing values

 plot: [2,7] [=================>----------------------------------------------] 29% est: 5s Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm,  :
  Removed 2 rows containing missing values

 plot: [3,1] [===================>--------------------------------------------] 31% est: 4s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 4 rows containing non-finite values (stat_bin).

 plot: [3,2] [====================>-------------------------------------------] 33% est: 5s Warning: Removed 6 rows containing missing values (geom_point).

 plot: [3,3] [=====================>------------------------------------------] 35% est: 5s Warning: Removed 4 rows containing non-finite values (stat_density).

 plot: [3,4] [=======================>----------------------------------------] 37% est: 4s Warning: Removed 4 rows containing non-finite values (stat_boxplot).

 plot: [3,5] [========================>---------------------------------------] 39% est: 4s Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm,  :
  Removed 4 rows containing missing values

 plot: [3,6] [=========================>--------------------------------------] 41% est: 4s Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm,  :
  Removed 4 rows containing missing values

 plot: [3,7] [==========================>-------------------------------------] 43% est: 4s Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm,  :
  Removed 4 rows containing missing values

 plot: [4,1] [============================>-----------------------------------] 45% est: 4s 
 plot: [4,2] [=============================>----------------------------------] 47% est: 4s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 2 rows containing non-finite values (stat_bin).

 plot: [4,3] [==============================>---------------------------------] 49% est: 4s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 4 rows containing non-finite values (stat_bin).

 plot: [4,4] [================================>-------------------------------] 51% est: 3s 
 plot: [4,5] [=================================>------------------------------] 53% est: 3s 
 plot: [4,6] [==================================>-----------------------------] 55% est: 3s 
 plot: [4,7] [====================================>---------------------------] 57% est: 3s 
 plot: [5,1] [=====================================>--------------------------] 59% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [5,2] [======================================>-------------------------] 61% est: 3s Warning: Removed 2 rows containing missing values (geom_point).

 plot: [5,3] [=======================================>------------------------] 63% est: 3s Warning: Removed 4 rows containing missing values (geom_point).

 plot: [5,4] [=========================================>----------------------] 65% est: 2s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [5,5] [==========================================>---------------------] 67% est: 2s 
 plot: [5,6] [===========================================>--------------------] 69% est: 2s 
 plot: [5,7] [=============================================>------------------] 71% est: 2s 
 plot: [6,1] [==============================================>-----------------] 73% est: 2s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [6,2] [===============================================>----------------] 76% est: 2s Warning: Removed 2 rows containing missing values (geom_point).

 plot: [6,3] [=================================================>--------------] 78% est: 2s Warning: Removed 4 rows containing missing values (geom_point).

 plot: [6,4] [==================================================>-------------] 80% est: 1s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [6,5] [===================================================>------------] 82% est: 1s 
 plot: [6,6] [=====================================================>----------] 84% est: 1s 
 plot: [6,7] [======================================================>---------] 86% est: 1s 
 plot: [7,1] [=======================================================>--------] 88% est: 1s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [7,2] [========================================================>-------] 90% est: 1s Warning: Removed 2 rows containing missing values (geom_point).

 plot: [7,3] [==========================================================>-----] 92% est: 1s Warning: Removed 4 rows containing missing values (geom_point).

 plot: [7,4] [===========================================================>----] 94% est: 0s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [7,5] [============================================================>---] 96% est: 0s 
 plot: [7,6] [==============================================================>-] 98% est: 0s 
 plot: [7,7] [================================================================]100% est: 0s 
                                                                                            

Q: Use the information for the states Italy and Greece and obtain some results.

Extract information for Italy and Greece:

library(dplyr)
# Keep only the rows whose Country is Italy or Greece.
# %in% tests each row for membership in the set. `==` against a length-2
# vector would instead recycle c("Italy", "Greece") down the column and
# compare row by row, silently dropping rows that fall on the "wrong" position.
Data_2 <- New_data %>%
  filter(Country %in% c("Italy", "Greece"))
Data_2

Q: Create a new dataset only for the states Italy and Greece containing the individual observations (do not display state information such as life expectancy, GDP, population).

# Keep only the individual-level columns of the Italy/Greece subset,
# leaving out the country-level columns.
Data_3 <- select(Data_2, Country, Age, Salary, Purchased)
Data_3
LS0tDQp0aXRsZTogIk1pc3Npbmcgb2JzZXJ2YXRpb24iDQpzdWJ0aXRsZTogTWFuaXB1bGF0ZSBkYXRhDQpvdXRwdXQ6DQogIGh0bWxfbm90ZWJvb2s6IGRlZmF1bHQNCiAgd29yZF9kb2N1bWVudDogZGVmYXVsdA0KLS0tDQoNCg0KT2J0YWluaW5nIGRhdGEgZnJvbSBhIHNvdXJjZSB3ZSBoYXZlIHR3byBkYXRhc2V0LiBPbmUgd2l0aCBvYnNlcnZhdGlvbnMgb2YgaW5kaXZpZHVhbHMgZnJvbSBkaWZmZXJlbnQgY291bnRyaWVzIGFuZCBhbm90aGVyIG9uZSB3aXRoIHRoZSBjb3VudHJ5IGluZGljZXMgc3VjaCBhczogcG9wdWxhdGlvbiwgR0RQIGFuZCBsaWZlIGV4cGVjdGFuY3kuDQoNCmBgYHtyfQ0KaGVhZChEYXRhKQ0KaGVhZChJbmZvLmJ5LnN0YXRlKQ0KYGBgDQoNClExOiBjcmVhdGUgYSBuZXcgZGF0YXNldCB3aGVyZSB0aGVzZSBpbmZvcm1hdGlvbiBpcyBkaXNwbGF5ZWQgDQpgYGB7cn0NCk5ld19kYXRhPC1kcGx5cjo6ZnVsbF9qb2luKERhdGEsSW5mby5ieS5zdGF0ZSxieT0iQ291bnRyeSIpDQpoZWFkKE5ld19kYXRhKQ0KVmlldyhOZXdfZGF0YSkNCmBgYA0KDQpUaGUgZnVuY3Rpb24gaXMgbm90IHdvcmtpbmcgYmVjYXVzZSB3ZSBoYXZlIG1pc3NpbmcgdmFsdWVzIHdoaWNoIGFyZSByZW1vdmluZyBvYnNlcnZhdGlvbnMgYW5kIGJ5IHRoaXMgd2F5IG1ha2luZyBsZXNzIG9ic2VydmF0aW9ucy4NClNvIHdlIG5lZWQgdG8gZGVhbCB3aXRoIG1pc3NpbmcgdmFsdWVzIE5BLg0KYGBge3J9DQpsaWJyYXJ5KGdncGxvdDIpDQojIEdHYWxseTo6Z2dwYWlycyhOZXdfZGF0YSxtYXBwaW5nID0gYWVzKGNvbG9yID0gQ291bnRyeSksbmEub21pdD1UUlVFKQ0KIyBFcnJvciBpbiBjb3IudGVzdC5kZWZhdWx0KHgsIHksIG1ldGhvZCA9IG1ldGhvZCwgdXNlID0gdXNlKSA6IG5vdCBlbm91Z2ggZmluaXRlIG9ic2VydmF0aW9ucw0KYGBgDQoNCmBgYHtyfQ0Kd2hpY2goaXMubmEoTmV3X2RhdGEkQWdlKSkjIHdpbGwgZGlzcGxheSBwb3NpdGlvbnMgb2YgTkEgaW4gQWdlIHZhcmlhYmxlDQpgYGANCmBgYHtyfQ0KIyBsZXQgZmlsbCB0aGVzZSBOQSB3aXRoIHRoZSBhdmVyYWdlIGFnZQ0KIyBmaXJzdCB3ZSB3aWxsIGNyZWF0ZSBhbiBpbmRleCB2ZWN0b3IgDQpJbmRfMTwtd2hpY2goaXMubmEoTmV3X2RhdGEkQWdlKSkNCg0KIyB0aGVuIHdlIHdpbGwgc3Vic3RpdHV0ZSBOQS1zIHdpdGggdGVoIGF2ZXJhZ2Ugb2YgdGhlIEFnZSB2YXJpYWJsZQ0KTmV3X2RhdGEkQWdlW0luZF8xXT1tZWFuKE5ld19kYXRhJEFnZSkNCk5ld19kYXRhJEFnZVtJbmRfMV0NCmBgYA0KUTpXaGF0IGlzIGhhcHBlbmluZz8NCkE6IHNvbWUgZnVuY3Rpb25zIHN1Y2ggYXMgdGhlIG1lYW4gKCkgYXJlIG5vdCBwZXJmb3JtaW5nIGlmIHRoZXJlIGFyZSBtaXNzaW5nIG9ic2VydmF0aW9ucy4gU28gd2UgbmVlZCB0byBhZGQgdGhlIGFyZ3VtZW50IG5hLnJtPVRSVUUNCg0KYGBge3J9DQpOZXdfZGF0YSRBZ2VbSW5kXzFdPW1lYW4oTmV3X2RhdGEkQWdlLG5hLnJtID0gVFJV
RSkNCk5ld19kYXRhJEFnZVtJbmRfMV0NCmBgYA0KVGhlIHNhbWUgd2Ugd2lsbCBkbyBmb3IgdGhlIG90aGVyIHZhcmlhYmxlcyB3aGVuIHdlIGhhdmUgbWlzc2luZyB2YWx1ZXMuDQoNCmBgYHtyfQ0KSW5kXzI8LXdoaWNoKGlzLm5hKE5ld19kYXRhJFNhbGFyeSkpDQojIHRoZW4gd2Ugd2lsbCBzdWJzdGl0dXRlIE5BLXMgd2l0aCB0aGUgYXZlcmFnZSBvZiB0aGUgU2FsYXJ5IHZhcmlhYmxlDQpJbmRfMg0KYGBgDQoNCmBgYHtyfQ0KTmV3X2RhdGEkQWdlW0luZF8yXT1tZWFuKE5ld19kYXRhJFNhbGFyeSxuYS5ybSA9IFRSVUUpDQpOZXdfZGF0YSRBZ2VbSW5kXzJdDQpgYGANCg0KYGBge3J9DQpJbmRfMzwtd2hpY2goaXMubmEoTmV3X2RhdGEkUHVyY2hhc2VkKSkNCiMgdGhlbiB3ZSB3aWxsIHN1YnN0aXR1dGUgTkEtcyB3aXRoIHRoZSBtb2RlIG9mIHRoZSBQdXJjaGFzZWQgdmFyaWFibGUNCkluZF8zDQpgYGANCg0KDQpBbm90aGVyIHdheSBvZiBzaG93aW5nIG1pc3NpbmcgdmFsdWVzIE5BIGluIGFsbCB2YXJpYWJsZXMgb2YgdGVoIG1hdHJpeDoNCmBgYHtyfQ0KIyBhcHBseShpcy5uYShOZXdfZGF0YSksIDIsIHdoaWNoKQ0KYGBgDQoNClVuZm9ydHVuYXRlbHkgdGhlIGZ1bmN0aW9uIGhlcmUgaXMgbm90IHdvcmtpbmcgYmVjYXVzZSBvZiB0aGUgc21hbGwgbnVtYmVyIG9mIG9ic2VydmF0aW9uDQpgYGB7cn0NCiMgR0dhbGx5OjpnZ3BhaXJzKE5ld19kYXRhLG1hcHBpbmcgPSBhZXMoY29sb3I9Q291bnRyeSkpDQojIEdHYWxseTo6Z2dwYWlycyhOZXdfZGF0YVssYygyLDMpXSxtYXBwaW5nID0gYWVzKGNvbG9yPVB1cmNoYXNlZCkpDQojDQojRXJyb3IgaW4gY29yLnRlc3QuZGVmYXVsdCh4LCB5LCBtZXRob2QgPSBtZXRob2QsIHVzZSA9IHVzZSkgOiBub3QgZW5vdWdoIGZpbml0ZSBvYnNlcnZhdGlvbnMNCg0KYGBgDQoNCmBgYHtyfQ0KR0dhbGx5OjpnZ3BhaXJzKE5ld19kYXRhKQ0KYGBgDQoNClE6VXNlIGluZm9ybWF0aW9uIGZvciBzdGF0ZXMgOiBJdGFseSBhbmQgR3JlZWNlLiBPYnRhaW4gc29tZSByZXN1bHRzDQoNCkV4dHJhY3QgaW5mb3JtYXRpb24gZm9yIEl0YWx5IGFuZCBHcmVlY2U6DQoNCmBgYHtyfQ0KbGlicmFyeShkcGx5cikNCkRhdGFfMjwtTmV3X2RhdGEgJT4lIGZpbHRlcihDb3VudHJ5PT1jKCJJdGFseSIsIkdyZWVjZSIpKQ0KRGF0YV8yDQpgYGANCg0KUTpDcmVhdGUgYSBuZXcgZGF0YXNldCBvbmx5IGZvciBzdGF0ZXMgSXRhbHkgYW5kIEdyZWVjZSBmb3IgaW5kaXZpZHVhbCBvYnNlcnZhdGlvbnMgKGRvDQpub3QgZGlzcGxheSBzdGF0ZSBpbmZvcm1hdGlvbiBzdWNoIGFzIGxpZmUgZXhwZWN0YW5jeSwgZ2RwLCBwb3B1bGF0aW9uKQ0KDQpgYGB7cn0NCkRhdGFfMzwtIERhdGFfMiAlPiUgc2VsZWN0IChDb3VudHJ5LEFnZSwgU2FsYXJ5LCBQdXJjaGFzZWQpDQpEYXRhXzMNCmBgYA0KDQo=