Find interesting dataset and prepare short report (in R Markdown) which will consists: - short description of the dataset, - 6 plots where you use labels, smoothing line, scatterplots and other interesting tweaks you can find on ggplot extentions website. - brief comments which describes obtained results.
Then, edit theme of the graphs and all scales of the graph and prepare publication-ready plots.
This dataset is compiled by the US Government between 1970 - 2017. It has a total of twenty columns which represent economic indicators.
Source: https://www.kaggle.com/fernandol/countries-of-the-world
library(ggplot2)
library(dplyr)
##
## 載入套件:'dplyr'
## 下列物件被遮斷自 'package:stats':
##
## filter, lag
## 下列物件被遮斷自 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(lubridate)
##
## 載入套件:'lubridate'
## 下列物件被遮斷自 'package:base':
##
## date, intersect, setdiff, union
library(readr)
library(scales)
##
## 載入套件:'scales'
## 下列物件被遮斷自 'package:readr':
##
## col_factor
library(ggrepel)
library(ggalt)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
library(stringr)
world_gdp <- read.csv('/Users/jeank4723/Desktop/Advance VR/1/Data/world_gdp.csv')
colnames(world_gdp)
## [1] "Country" "Region"
## [3] "Population" "Area..sq..mi.."
## [5] "Pop..Density..per.sq..mi.." "Coastline..coast.area.ratio."
## [7] "Net.migration" "Infant.mortality..per.1000.births."
## [9] "GDP....per.capita." "Literacy...."
## [11] "Phones..per.1000." "Arable...."
## [13] "Crops...." "Other...."
## [15] "Climate" "Birthrate"
## [17] "Deathrate" "Agriculture"
## [19] "Industry" "Service"
world_gdp <- world_gdp %>%
mutate(Region = factor(Region, levels = c("ASIA (EX. NEAR EAST) ",
"EASTERN EUROPE ",
"NORTHERN AFRICA ",
"OCEANIA ",
"WESTERN EUROPE ",
"SUB-SAHARAN AFRICA ",
"LATIN AMER. & CARIB ",
"C.W. OF IND. STATES ",
"NEAR EAST ",
"NORTHERN AMERICA ",
"BALTICS "),
labels = c("ASIA",
"EAST_EUROPE",
"NOR_AFRICA",
"OCEANIA",
"WEST_EUROPE",
"SUB-SAHARAN AFRICA",
"LATIN AMER. & CARIB",
"CW_IND. STATES",
"NEAR EAST",
"NOR_AMERICA",
"BALTICS")))
colnames(world_gdp) <- c("Country",
"Region",
"Population",
"Area",
"Pop_Density",
"Coastline_ratio",
"Net_migration",
"Infant_mortality",
"GDP",
"Literacy",
"Phones_rate",
"Arable_rate",
"Crops_rate",
"Other_rate",
"Climate",
"Birthrate",
"Deathrate",
"Agriculture",
"Industry",
"Service")
head(world_gdp)
## Country Region Population Area Pop_Density
## 1 Afghanistan ASIA 31056997 647500 48,0
## 2 Albania EAST_EUROPE 3581655 28748 124,6
## 3 Algeria NOR_AFRICA 32930091 2381740 13,8
## 4 American Samoa OCEANIA 57794 199 290,4
## 5 Andorra WEST_EUROPE 71201 468 152,1
## 6 Angola SUB-SAHARAN AFRICA 12127071 1246700 9,7
## Coastline_ratio Net_migration Infant_mortality GDP Literacy Phones_rate
## 1 0,00 23,06 163,07 700 36,0 3,2
## 2 1,26 -4,93 21,52 4500 86,5 71,2
## 3 0,04 -0,39 31 6000 70,0 78,1
## 4 58,29 -20,71 9,27 8000 97,0 259,5
## 5 0,00 6,6 4,05 19000 100,0 497,2
## 6 0,13 0 191,19 1900 42,0 7,8
## Arable_rate Crops_rate Other_rate Climate Birthrate Deathrate Agriculture
## 1 12,13 0,22 87,65 1 46,6 20,34 0,38
## 2 21,09 4,42 74,49 3 15,11 5,22 0,232
## 3 3,22 0,25 96,53 1 17,14 4,61 0,101
## 4 10 15 75 2 22,46 3,27
## 5 2,22 0 97,78 3 8,71 6,25
## 6 2,41 0,24 97,35 45,11 24,2 0,096
## Industry Service
## 1 0,24 0,38
## 2 0,188 0,579
## 3 0,6 0,298
## 4
## 5
## 6 0,658 0,246
# world_gdp <- na.omit(world_gdp)
world_pop <- data.frame(world_gdp$Country)
world_gdp$Pop_Density <- str_replace_all(world_gdp$Pop_Density, ',','.')
world_gdp$Coastline_ratio <- str_replace_all(world_gdp$Coastline_ratio, ',','.')
world_gdp$Net_migration <- str_replace_all(world_gdp$Net_migration, ',','.')
world_gdp$Infant_mortality <- str_replace_all(world_gdp$Infant_mortality, ',','.')
world_gdp$Literacy <- str_replace_all(world_gdp$Literacy, ',','.')
world_gdp$Phones_rate <- str_replace_all(world_gdp$Phones_rate, ',','.')
world_gdp$Arable_rate <- str_replace_all(world_gdp$Arable_rate, ',','.')
world_gdp$Crops_rate <- str_replace_all(world_gdp$Crops_rate, ',','.')
world_gdp$Other_rate <- str_replace_all(world_gdp$Other_rate, ',','.')
world_gdp$Birthrate <- str_replace_all(world_gdp$Birthrate, ',','.')
world_gdp$Deathrate <- str_replace_all(world_gdp$Deathrate, ',','.')
world_gdp$Agriculture <- str_replace_all(world_gdp$Agriculture, ',','.')
world_gdp$Industry <- str_replace_all(world_gdp$Industry, ',','.')
world_gdp$Service <- str_replace_all(world_gdp$Service, ',','.')
# Population per million
world_gdp$Population <- as.integer(world_gdp$Population/100000)
world_gdp$Pop_Density <- as.numeric(world_gdp$Pop_Density)
world_gdp$Coastline_ratio <- as.numeric(world_gdp$Coastline_ratio)
world_gdp$Net_migration <- as.numeric(world_gdp$Net_migration)
world_gdp$Infant_mortality <- as.numeric(world_gdp$Infant_mortality)
world_gdp$Literacy <- as.numeric(world_gdp$Literacy)
world_gdp$Phones_rate <- as.numeric(world_gdp$Phones_rate)
world_gdp$Arable_rate <- as.numeric(world_gdp$Arable_rate)
world_gdp$Crops_rate <- as.numeric(world_gdp$Crops_rate)
world_gdp$Other_rate <- as.numeric(world_gdp$Other_rate)
world_gdp$Birthrate <- as.numeric(world_gdp$Birthrate)
world_gdp$Deathrate <- as.numeric(world_gdp$Deathrate)
world_gdp$Agriculture <- as.numeric(world_gdp$Agriculture)
world_gdp$Industry <- as.numeric(world_gdp$Industry)
world_gdp$Service <- as.numeric(world_gdp$Service)
world_gdp$Climate <- as.factor(world_gdp$Climate)
tail(world_gdp)
## Country Region Population Area Pop_Density
## 222 Wallis and Futuna OCEANIA 0 274 58.5
## 223 West Bank NEAR EAST 24 5860 419.9
## 224 Western Sahara NOR_AFRICA 2 266000 1.0
## 225 Yemen NEAR EAST 214 527970 40.6
## 226 Zambia SUB-SAHARAN AFRICA 115 752614 15.3
## 227 Zimbabwe SUB-SAHARAN AFRICA 122 390580 31.3
## Coastline_ratio Net_migration Infant_mortality GDP Literacy Phones_rate
## 222 47.08 NA NA 3700 50.0 118.6
## 223 0.00 2.98 19.62 800 NA 145.2
## 224 0.42 NA NA NA NA NA
## 225 0.36 0.00 61.50 800 50.2 37.2
## 226 0.00 0.00 88.29 800 80.6 8.2
## 227 0.00 0.00 67.69 1900 90.7 26.8
## Arable_rate Crops_rate Other_rate Climate Birthrate Deathrate Agriculture
## 222 5.00 25.00 70.00 2 NA NA NA
## 223 16.90 18.97 64.13 3 31.67 3.92 0.090
## 224 0.02 0.00 99.98 1 NA NA NA
## 225 2.78 0.24 96.98 1 42.89 8.30 0.135
## 226 7.08 0.03 92.90 2 41.00 19.93 0.220
## 227 8.32 0.34 91.34 2 28.01 21.84 0.179
## Industry Service
## 222 NA NA
## 223 0.280 0.630
## 224 NA 0.400
## 225 0.472 0.393
## 226 0.290 0.489
## 227 0.243 0.579
In the plot, we can observe that there are two countries in the red circle, India and China, who have high population. However, their population density are close to 4000 per square meter which is not higher than mean value. In the blue circle, USA and Japan, which circled those countries which have high GDP and average population.
world_gdp_countries <- world_gdp %>%
group_by(Country) %>%
summarise(Population = sum(Population, na.rm = TRUE), GDP = mean(GDP, na.rm = TRUE), Pop_Density = Pop_Density, Region)
high_gdp <- world_gdp_countries %>%
filter(GDP > 20000, GDP <= 100000, Population > 1000, Population < 5000)
high_pop <- world_gdp_countries %>%
filter(GDP > 100, GDP < 10000, Population > 10000, Population < 50000)
p1 <- ggplot(world_gdp_countries, aes(x = GDP, y = Population))
p1 +
geom_point(aes(size = Pop_Density, color = Region)) +
geom_text_repel(aes(label = Country ), size = 2) +
labs(title = 'Population and GDP',
x = 'GDP' , y = 'Population (per million)') +
theme_bw() +
geom_text_repel(aes(label = Country)) +
geom_encircle(data = high_gdp, color = "blue", size = 1, expand = 0.09) +
geom_encircle(data = high_pop, color = "red", size = 1, expand = 0.09)
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text_repel).
## Warning: Removed 1 rows containing missing values (geom_text_repel).
## Warning: ggrepel: 216 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 219 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
In this plot, we can see that the GDP of the countries is not closely related to the literacy. In addition, the plot shows that all the countries in Europe have their literacy higher than average except one country Albania.
EUR_gdp <- world_gdp %>%
filter(Region == 'WEST_EUROPE'| Region == 'EAST_EUROPE') %>%
summarise(GDP = GDP, Literacy = Literacy, Country = Country, Climate = Climate)
# EUR_gdp <- EUR_gdp %>%
# mutate(Climate = factor(Climate, levels = c("1","2","3","4",""),
# labels = c("1","2","3","4","Unknown")))
p2 <- ggplot(data = EUR_gdp, aes(x = GDP, y = Literacy, color = Climate))
p2 +
geom_point(aes(size = GDP)) +
geom_text_repel(aes(label = Country)) +
geom_hline(yintercept = median(world_gdp$Literacy, na.rm = TRUE),
linetype = 'longdash')
## Warning: Removed 8 rows containing missing values (geom_point).
## Warning: Removed 8 rows containing missing values (geom_text_repel).
## Warning: ggrepel: 8 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
The plot shows that the counties have higher arable rate of land used for or right for growing crops do not have high crops rate. The each point represent each region, there is no point in red square in the plot so we know that there is no region has high crops rate and higher arable rate at the same time.
world_farm <- world_gdp %>%
group_by(Region) %>%
summarise(Arable_rate = mean(Arable_rate, na.rm = TRUE), Crops_rate = mean(Crops_rate, na.rm = TRUE), GDP = mean(GDP, na.rm = TRUE), Coastline_ratio = mean(Coastline_ratio, na.rm = TRUE))
p3 <- ggplot(world_farm, aes(x = Arable_rate, y = Crops_rate))
world_sel <- world_farm %>%
filter(Arable_rate > 5, Arable_rate <= 8.2, Crops_rate > 800, Crops_rate < 1100)
p3 +
geom_point(aes(size = GDP/1000, color = Region)) +
labs(title = 'Agricultural Situation', color = 'Region',
x = 'Arable Rate' , y = 'Crops Rate', size = 'GDP') +
theme_bw() +
geom_text_repel(aes(label = Region ), size = 2) +
# geom_encircle(data = crime_sel, color = "red", size = 2, expand = 0.09) +
# theme(legend.position = 'top') +
geom_vline(xintercept = mean(world_gdp$Arable_rate, na.rm = TRUE),
linetype = 'longdash') +
geom_hline(yintercept = mean(world_gdp$Crops_rate, na.rm = TRUE),
linetype = 'longdash') +
annotate("rect", xmin = mean(world_gdp$Arable_rate, na.rm = TRUE), xmax = Inf,
ymin = mean(world_gdp$Crops_rate, na.rm = TRUE), ymax = Inf, alpha = 0.1, fill = "red") +
annotate("rect", xmin = -Inf, xmax = mean(world_gdp$Arable_rate, na.rm = TRUE),
ymin = -Inf, ymax = mean(world_gdp$Crops_rate, na.rm = TRUE), alpha = 0.1, fill = "green") +
annotate("text", label = "Fertile Area ",
x = mean(world_gdp$Arable_rate, na.rm = TRUE),
y = mean(world_gdp$Crops_rate, na.rm = TRUE), size = 4, hjust = 0, vjust = -1, colour = "red") +
annotate("text", label = "Infertile Area",
x = mean(world_gdp$Arable_rate, na.rm = TRUE),
y = mean(world_gdp$Crops_rate, na.rm = TRUE), size = 4, hjust = 1, vjust = 1, colour = "darkgreen")
Before running the plot, I assume that the country with high infant mortality has low GDP. Because if a country has high value of the infant mortality which means they do not have proper health care in their country also mean they do not productive, they have low GDP. According to the plot, my assumption is correct 50%. We can observe that high infant mortality lead to low GDP. However, low infant mortality could not lead to any result of GDP.
world_high_infant_mortality <- world_gdp %>%
filter(world_gdp$Infant_mortality > mean(world_gdp$Infant_mortality, na.rm = TRUE)) %>%
summarise(GDP = GDP, Country = Country, Infant_mortality, Region)
p4 <- ggplot(world_high_infant_mortality, aes(x = Infant_mortality, y = GDP))
p4 +
geom_point(aes(color = Region)) +
geom_text_repel(aes(label = Country ), size = 3) +
annotate("rect",
xmin = 100,
xmax = Inf,
ymin = 6000,
ymax = -Inf,
alpha = 0.1, fill = "red") +
annotate("rect",
xmin = -Inf,
xmax = 100,
ymin = Inf,
ymax = 6000,
alpha = 0.1, fill = "green") +
labs(title = "Infant Mortality and GDP",
x = "Infant Mortality",
y = "GDP")
## Warning: ggrepel: 49 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
I randomly pick the countries in this plot. There is 143 observations. The horizontal line indicated mean of literacy and vertical line indicated to mean of infant mortality. In this plot, we can see that the counties with high infant mortality have low level of literacy which mean literacy and infant mortality are closely related.
world_high_infant_mortality <- world_gdp %>%
filter(str_detect(Country, "^[A-M]")) %>%
summarise(GDP = GDP, Country = Country, Infant_mortality, Region, Literacy)
p5 <- ggplot(world_high_infant_mortality, aes(x = Infant_mortality, y = Literacy))
p5 +
geom_point(aes(color = Region)) +
geom_text_repel(aes(label = Country ), size = 3) +
geom_vline(xintercept = mean(world_gdp$Infant_mortality, na.rm = TRUE),
linetype = 'longdash') +
geom_hline(yintercept = mean(world_gdp$Literacy, na.rm = TRUE),
linetype = 'longdash') +
labs(title = "Infant Mortality and Literacy",
x = "Infant Mortality",
y = "Literacy")
## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing missing values (geom_text_repel).
## Warning: ggrepel: 81 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# annotate("rect",
# xmin = mean(world_gdp$Infant_mortality, na.rm = TRUE),
# xmax = Inf,
# ymin = mean(world_gdp$Literacy, na.rm = TRUE),
# ymax = -Inf) +
#
# annotate("rect",
# xmin = -Inf,
# xmax = mean(world_gdp$Infant_mortality, na.rm = TRUE),
# ymin = Inf,
# ymax = mean(world_gdp$Literacy, na.rm = TRUE)) +
According to the plot, the result shows that the countries with high GDP have low birth rate and low death rate. On the contrary, the countries with high birth rate and high death rate have low GDP.
world_birth_death <- world_gdp %>%
filter(str_detect(Country, "^[N-Z]")) %>%
summarise(Country = Country, Birthrate, Region, Deathrate, GDP = GDP)
p6 <- ggplot(world_birth_death, aes(x = Birthrate, y = Deathrate))
p6 +
geom_point(aes(size = GDP/1000, color = Region)) +
geom_text_repel(aes(label = Country ), size = 3) +
geom_vline(xintercept = median(world_gdp$Birthrate, na.rm = TRUE),
linetype = 'longdash') +
geom_hline(yintercept = median(world_gdp$Deathrate, na.rm = TRUE),
linetype = 'longdash') +
labs(title = "Birth Rate and Death Rate",
x = "Birth Rate",
y = "Death Rate")
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing missing values (geom_text_repel).
## Warning: ggrepel: 49 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps