true

Find interesting dataset and prepare short report (in R Markdown) which will consists: - short description of the dataset, - 6 plots where you use labels, smoothing line, scatterplots and other interesting tweaks you can find on ggplot extentions website. - brief comments which describes obtained results.

Then, edit theme of the graphs and all scales of the graph and prepare publication-ready plots.

Introduction

This dataset is compiled by the US Government between 1970 - 2017. It has a total of twenty columns which represent economic indicators.

Source: https://www.kaggle.com/fernandol/countries-of-the-world

Data Variables

  1. Country | Countries’ name
  2. Region | Africa, Asia, Middle East, Americas…
  3. Population | Countries’ population
  4. Area | The square mile (sq. mi.)
  5. Pop. Density | Population density (per sq. mi.)
  6. Coastline | The boundary between sea and land(coast/area ratio)
  7. Net migration | The difference between the number of immigrants and the number of emigrants
  8. Infant mortality | The death of an infant before his or her first birthday (per 1000 births)
  9. GDP | Gross domestic product (USD per capita)
  10. Literacy (%) | proportion of the population that have the ability to read and write.
  11. Phones | (per 1000)
  12. Arable | Rate of land used for or right for growing crops
  13. Crops |
  14. Other |
  15. Climate |
    • 1 - Dry tropical or tundra and ice, classification B and E.
    • 2 - Wet tropical, classification A.
    • 3 - Temperate humid subtropical and temperate continental, classification Cfa, Cwa, and D.
    • 4- Dry hot summers and wet winters.
  16. Birth rate | Rate of birth %
  17. Death rate | Rate of death %
  18. Agriculture |
  19. Industry | part of industry in the economy
  20. Service | part of service in the economy
library(ggplot2)
library(dplyr)
## 
## 載入套件:'dplyr'
## 下列物件被遮斷自 'package:stats':
## 
##     filter, lag
## 下列物件被遮斷自 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(lubridate)
## 
## 載入套件:'lubridate'
## 下列物件被遮斷自 'package:base':
## 
##     date, intersect, setdiff, union
library(readr)
library(scales)
## 
## 載入套件:'scales'
## 下列物件被遮斷自 'package:readr':
## 
##     col_factor
library(ggrepel)
library(ggalt)
## Registered S3 methods overwritten by 'ggalt':
##   method                  from   
##   grid.draw.absoluteGrob  ggplot2
##   grobHeight.absoluteGrob ggplot2
##   grobWidth.absoluteGrob  ggplot2
##   grobX.absoluteGrob      ggplot2
##   grobY.absoluteGrob      ggplot2
library(stringr)

world_gdp <- read.csv('/Users/jeank4723/Desktop/Advance VR/1/Data/world_gdp.csv')

colnames(world_gdp)
##  [1] "Country"                            "Region"                            
##  [3] "Population"                         "Area..sq..mi.."                    
##  [5] "Pop..Density..per.sq..mi.."         "Coastline..coast.area.ratio."      
##  [7] "Net.migration"                      "Infant.mortality..per.1000.births."
##  [9] "GDP....per.capita."                 "Literacy...."                      
## [11] "Phones..per.1000."                  "Arable...."                        
## [13] "Crops...."                          "Other...."                         
## [15] "Climate"                            "Birthrate"                         
## [17] "Deathrate"                          "Agriculture"                       
## [19] "Industry"                           "Service"
world_gdp <- world_gdp %>%
    mutate(Region = factor(Region, levels = c("ASIA (EX. NEAR EAST)         ",
                                              "EASTERN EUROPE                     ",
                                              "NORTHERN AFRICA                    ", 
                                              "OCEANIA                            ",
                                              "WESTERN EUROPE                     ", 
                                              "SUB-SAHARAN AFRICA                 ",
                                              "LATIN AMER. & CARIB    ",             
                                              "C.W. OF IND. STATES ",
                                              "NEAR EAST                          ", 
                                              "NORTHERN AMERICA                   ",
                                              "BALTICS                            "), 
              labels = c("ASIA",
                         "EAST_EUROPE",
                         "NOR_AFRICA",
                         "OCEANIA",
                         "WEST_EUROPE", 
                         "SUB-SAHARAN AFRICA",
                         "LATIN AMER. & CARIB",             
                         "CW_IND. STATES",
                         "NEAR EAST", 
                         "NOR_AMERICA",
                         "BALTICS")))

colnames(world_gdp) <- c("Country", 
                         "Region", 
                         "Population", 
                         "Area", 
                         "Pop_Density", 
                         "Coastline_ratio", 
                         "Net_migration", 
                         "Infant_mortality", 
                         "GDP", 
                         "Literacy", 
                         "Phones_rate", 
                         "Arable_rate", 
                         "Crops_rate", 
                         "Other_rate", 
                         "Climate", 
                         "Birthrate", 
                         "Deathrate", 
                         "Agriculture", 
                         "Industry",
                         "Service")
  

head(world_gdp)
##           Country             Region Population    Area Pop_Density
## 1    Afghanistan                ASIA   31056997  647500        48,0
## 2        Albania         EAST_EUROPE    3581655   28748       124,6
## 3        Algeria          NOR_AFRICA   32930091 2381740        13,8
## 4 American Samoa             OCEANIA      57794     199       290,4
## 5        Andorra         WEST_EUROPE      71201     468       152,1
## 6         Angola  SUB-SAHARAN AFRICA   12127071 1246700         9,7
##   Coastline_ratio Net_migration Infant_mortality   GDP Literacy Phones_rate
## 1            0,00         23,06           163,07   700     36,0         3,2
## 2            1,26         -4,93            21,52  4500     86,5        71,2
## 3            0,04         -0,39               31  6000     70,0        78,1
## 4           58,29        -20,71             9,27  8000     97,0       259,5
## 5            0,00           6,6             4,05 19000    100,0       497,2
## 6            0,13             0           191,19  1900     42,0         7,8
##   Arable_rate Crops_rate Other_rate Climate Birthrate Deathrate Agriculture
## 1       12,13       0,22      87,65       1      46,6     20,34        0,38
## 2       21,09       4,42      74,49       3     15,11      5,22       0,232
## 3        3,22       0,25      96,53       1     17,14      4,61       0,101
## 4          10         15         75       2     22,46      3,27            
## 5        2,22          0      97,78       3      8,71      6,25            
## 6        2,41       0,24      97,35             45,11      24,2       0,096
##   Industry Service
## 1     0,24    0,38
## 2    0,188   0,579
## 3      0,6   0,298
## 4                 
## 5                 
## 6    0,658   0,246
# world_gdp <- na.omit(world_gdp)


world_pop <- data.frame(world_gdp$Country)

world_gdp$Pop_Density <- str_replace_all(world_gdp$Pop_Density, ',','.')
world_gdp$Coastline_ratio <- str_replace_all(world_gdp$Coastline_ratio, ',','.')
world_gdp$Net_migration <- str_replace_all(world_gdp$Net_migration, ',','.')
world_gdp$Infant_mortality <- str_replace_all(world_gdp$Infant_mortality, ',','.')
world_gdp$Literacy <- str_replace_all(world_gdp$Literacy, ',','.')
world_gdp$Phones_rate <- str_replace_all(world_gdp$Phones_rate, ',','.')
world_gdp$Arable_rate <- str_replace_all(world_gdp$Arable_rate, ',','.')
world_gdp$Crops_rate <- str_replace_all(world_gdp$Crops_rate, ',','.')
world_gdp$Other_rate <- str_replace_all(world_gdp$Other_rate, ',','.')
world_gdp$Birthrate <- str_replace_all(world_gdp$Birthrate, ',','.')
world_gdp$Deathrate <- str_replace_all(world_gdp$Deathrate, ',','.')
world_gdp$Agriculture <- str_replace_all(world_gdp$Agriculture, ',','.')
world_gdp$Industry <- str_replace_all(world_gdp$Industry, ',','.')
world_gdp$Service <- str_replace_all(world_gdp$Service, ',','.')


# Population per million
world_gdp$Population <- as.integer(world_gdp$Population/100000)
world_gdp$Pop_Density <- as.numeric(world_gdp$Pop_Density)
world_gdp$Coastline_ratio <- as.numeric(world_gdp$Coastline_ratio)
world_gdp$Net_migration <- as.numeric(world_gdp$Net_migration)
world_gdp$Infant_mortality <- as.numeric(world_gdp$Infant_mortality)
world_gdp$Literacy <- as.numeric(world_gdp$Literacy)
world_gdp$Phones_rate <- as.numeric(world_gdp$Phones_rate)
world_gdp$Arable_rate <- as.numeric(world_gdp$Arable_rate)
world_gdp$Crops_rate <- as.numeric(world_gdp$Crops_rate)
world_gdp$Other_rate <- as.numeric(world_gdp$Other_rate)
world_gdp$Birthrate <- as.numeric(world_gdp$Birthrate)
world_gdp$Deathrate <- as.numeric(world_gdp$Deathrate)
world_gdp$Agriculture <- as.numeric(world_gdp$Agriculture)
world_gdp$Industry <- as.numeric(world_gdp$Industry)
world_gdp$Service <- as.numeric(world_gdp$Service)

world_gdp$Climate <- as.factor(world_gdp$Climate)

tail(world_gdp)
##                Country             Region Population   Area Pop_Density
## 222 Wallis and Futuna             OCEANIA          0    274        58.5
## 223         West Bank           NEAR EAST         24   5860       419.9
## 224    Western Sahara          NOR_AFRICA          2 266000         1.0
## 225             Yemen           NEAR EAST        214 527970        40.6
## 226            Zambia  SUB-SAHARAN AFRICA        115 752614        15.3
## 227          Zimbabwe  SUB-SAHARAN AFRICA        122 390580        31.3
##     Coastline_ratio Net_migration Infant_mortality  GDP Literacy Phones_rate
## 222           47.08            NA               NA 3700     50.0       118.6
## 223            0.00          2.98            19.62  800       NA       145.2
## 224            0.42            NA               NA   NA       NA          NA
## 225            0.36          0.00            61.50  800     50.2        37.2
## 226            0.00          0.00            88.29  800     80.6         8.2
## 227            0.00          0.00            67.69 1900     90.7        26.8
##     Arable_rate Crops_rate Other_rate Climate Birthrate Deathrate Agriculture
## 222        5.00      25.00      70.00       2        NA        NA          NA
## 223       16.90      18.97      64.13       3     31.67      3.92       0.090
## 224        0.02       0.00      99.98       1        NA        NA          NA
## 225        2.78       0.24      96.98       1     42.89      8.30       0.135
## 226        7.08       0.03      92.90       2     41.00     19.93       0.220
## 227        8.32       0.34      91.34       2     28.01     21.84       0.179
##     Industry Service
## 222       NA      NA
## 223    0.280   0.630
## 224       NA   0.400
## 225    0.472   0.393
## 226    0.290   0.489
## 227    0.243   0.579

6 plots

1. GDP and Population

In the plot, we can observe that there are two countries in the red circle, India and China, who have high population. However, their population density are close to 4000 per square meter which is not higher than mean value. In the blue circle, USA and Japan, which circled those countries which have high GDP and average population.

world_gdp_countries <- world_gdp  %>%
  group_by(Country) %>%
  summarise(Population = sum(Population, na.rm = TRUE), GDP = mean(GDP, na.rm = TRUE), Pop_Density = Pop_Density, Region) 

high_gdp <- world_gdp_countries %>% 
  filter(GDP > 20000, GDP <= 100000, Population > 1000, Population < 5000)

high_pop <- world_gdp_countries %>% 
  filter(GDP > 100, GDP < 10000, Population > 10000, Population < 50000)


p1 <- ggplot(world_gdp_countries, aes(x = GDP, y = Population))

p1 +
  geom_point(aes(size = Pop_Density, color = Region)) +
  geom_text_repel(aes(label = Country ), size = 2) +
  labs(title = 'Population and GDP',
       x = 'GDP' , y = 'Population (per million)')  + 
  theme_bw() + 
  geom_text_repel(aes(label = Country)) +
  geom_encircle(data = high_gdp, color = "blue", size =  1, expand = 0.09) +
  geom_encircle(data = high_pop, color = "red", size =  1, expand = 0.09) 
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text_repel).

## Warning: Removed 1 rows containing missing values (geom_text_repel).
## Warning: ggrepel: 216 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 219 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

2. Climate and Literacy in EUROPE

In this plot, we can see that the GDP of the countries is not closely related to the literacy. In addition, the plot shows that all the countries in Europe have their literacy higher than average except one country Albania.

EUR_gdp <- world_gdp %>%
  filter(Region == 'WEST_EUROPE'| Region == 'EAST_EUROPE') %>%
  summarise(GDP = GDP, Literacy = Literacy, Country = Country, Climate = Climate)

# EUR_gdp <- EUR_gdp %>%
#     mutate(Climate = factor(Climate, levels = c("1","2","3","4",""), 
#               labels = c("1","2","3","4","Unknown")))


p2 <- ggplot(data = EUR_gdp, aes(x = GDP, y = Literacy, color = Climate))

p2 +
  geom_point(aes(size = GDP)) + 
  geom_text_repel(aes(label = Country)) +
   geom_hline(yintercept = median(world_gdp$Literacy, na.rm = TRUE), 
             linetype = 'longdash') 
## Warning: Removed 8 rows containing missing values (geom_point).
## Warning: Removed 8 rows containing missing values (geom_text_repel).
## Warning: ggrepel: 8 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

3. Agricultural Situation

The plot shows that the counties have higher arable rate of land used for or right for growing crops do not have high crops rate. The each point represent each region, there is no point in red square in the plot so we know that there is no region has high crops rate and higher arable rate at the same time.

world_farm <- world_gdp %>%
  group_by(Region) %>%
  summarise(Arable_rate = mean(Arable_rate, na.rm = TRUE), Crops_rate = mean(Crops_rate, na.rm = TRUE), GDP = mean(GDP, na.rm = TRUE), Coastline_ratio = mean(Coastline_ratio, na.rm = TRUE))

p3 <- ggplot(world_farm, aes(x = Arable_rate, y = Crops_rate))

world_sel <- world_farm %>% 
  filter(Arable_rate > 5, Arable_rate <= 8.2, Crops_rate > 800, Crops_rate < 1100)

p3 +
  geom_point(aes(size = GDP/1000, color = Region)) +
  labs(title = 'Agricultural Situation', color = 'Region',
       x = 'Arable Rate' , y = 'Crops Rate', size = 'GDP')  + 
  theme_bw()  +
  geom_text_repel(aes(label = Region ), size = 2) +
  # geom_encircle(data = crime_sel, color = "red", size =  2, expand = 0.09) +
  # theme(legend.position = 'top') +
  geom_vline(xintercept = mean(world_gdp$Arable_rate, na.rm = TRUE), 
             linetype = 'longdash') +
  geom_hline(yintercept = mean(world_gdp$Crops_rate, na.rm = TRUE), 
             linetype = 'longdash') +
  annotate("rect", xmin = mean(world_gdp$Arable_rate, na.rm = TRUE), xmax = Inf, 
           ymin = mean(world_gdp$Crops_rate, na.rm = TRUE), ymax = Inf, alpha = 0.1, fill = "red") +
  annotate("rect", xmin = -Inf, xmax = mean(world_gdp$Arable_rate, na.rm = TRUE), 
           ymin = -Inf, ymax = mean(world_gdp$Crops_rate, na.rm = TRUE), alpha = 0.1, fill = "green") +
  annotate("text", label = "Fertile Area ", 
           x = mean(world_gdp$Arable_rate, na.rm = TRUE), 
           y = mean(world_gdp$Crops_rate, na.rm = TRUE), size = 4,  hjust = 0, vjust = -1, colour = "red") +
  annotate("text", label = "Infertile Area", 
           x = mean(world_gdp$Arable_rate, na.rm = TRUE), 
           y = mean(world_gdp$Crops_rate, na.rm = TRUE), size = 4, hjust = 1, vjust = 1, colour = "darkgreen")

4. Infant Mortality and GDP

Before running the plot, I assume that the country with high infant mortality has low GDP. Because if a country has high value of the infant mortality which means they do not have proper health care in their country also mean they do not productive, they have low GDP. According to the plot, my assumption is correct 50%. We can observe that high infant mortality lead to low GDP. However, low infant mortality could not lead to any result of GDP.

world_high_infant_mortality <- world_gdp  %>%
  filter(world_gdp$Infant_mortality > mean(world_gdp$Infant_mortality, na.rm = TRUE)) %>% 
  summarise(GDP = GDP, Country = Country, Infant_mortality, Region) 

p4 <- ggplot(world_high_infant_mortality, aes(x = Infant_mortality, y = GDP))

p4 +
  geom_point(aes(color = Region)) +  
  
  geom_text_repel(aes(label = Country ), size = 3) +
  annotate("rect", 
           xmin = 100, 
           xmax = Inf, 
           ymin = 6000, 
           ymax = -Inf, 
           alpha = 0.1, fill = "red") +
  
  annotate("rect", 
           xmin = -Inf, 
           xmax = 100, 
           ymin = Inf, 
           ymax = 6000, 
           alpha = 0.1, fill = "green") +
  
  labs(title = "Infant Mortality and GDP",
       x = "Infant Mortality",
       y = "GDP")
## Warning: ggrepel: 49 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

5. Infant Mortality and Literacy

I randomly pick the countries in this plot. There is 143 observations. The horizontal line indicated mean of literacy and vertical line indicated to mean of infant mortality. In this plot, we can see that the counties with high infant mortality have low level of literacy which mean literacy and infant mortality are closely related.

world_high_infant_mortality <- world_gdp  %>%
  filter(str_detect(Country, "^[A-M]")) %>% 
  summarise(GDP = GDP, Country = Country, Infant_mortality, Region, Literacy) 

p5 <- ggplot(world_high_infant_mortality, aes(x = Infant_mortality, y = Literacy))

p5 +
  geom_point(aes(color = Region)) +  
  
  geom_text_repel(aes(label = Country ), size = 3) +
  
  geom_vline(xintercept = mean(world_gdp$Infant_mortality, na.rm = TRUE), 
             linetype = 'longdash') +
  geom_hline(yintercept = mean(world_gdp$Literacy, na.rm = TRUE), 
             linetype = 'longdash') +
  
  labs(title = "Infant Mortality and Literacy",
       x = "Infant Mortality",
       y = "Literacy") 
## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing missing values (geom_text_repel).
## Warning: ggrepel: 81 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

  # annotate("rect", 
  #          xmin = mean(world_gdp$Infant_mortality, na.rm = TRUE), 
  #          xmax = Inf, 
  #          ymin = mean(world_gdp$Literacy, na.rm = TRUE), 
  #          ymax = -Inf) +
  # 
  # annotate("rect", 
  #          xmin = -Inf, 
  #          xmax = mean(world_gdp$Infant_mortality, na.rm = TRUE), 
  #          ymin = Inf, 
  #          ymax = mean(world_gdp$Literacy, na.rm = TRUE)) +

6. Birthrate and Deathrate

According to the plot, the result shows that the countries with high GDP have low birth rate and low death rate. On the contrary, the countries with high birth rate and high death rate have low GDP.

world_birth_death <- world_gdp  %>%
  filter(str_detect(Country, "^[N-Z]")) %>% 
  summarise(Country = Country, Birthrate, Region, Deathrate, GDP = GDP) 


p6 <- ggplot(world_birth_death, aes(x = Birthrate, y = Deathrate))

p6 +
  geom_point(aes(size = GDP/1000, color = Region)) +  
  
  geom_text_repel(aes(label = Country ), size = 3) +
  
  geom_vline(xintercept = median(world_gdp$Birthrate, na.rm = TRUE), 
             linetype = 'longdash') +
  geom_hline(yintercept = median(world_gdp$Deathrate, na.rm = TRUE), 
             linetype = 'longdash') +
  labs(title = "Birth Rate and Death Rate",
       x = "Birth Rate",
       y = "Death Rate") 
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing missing values (geom_text_repel).
## Warning: ggrepel: 49 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps