#Import packages & dataset 
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
# Read the gapminder table from a local CSV file (comma-separated, first row
# is the header) and preview the first ten rows.
# NOTE(review): the path is specific to one machine; adjust before running
# elsewhere.
gapminder <- read.csv(
  file = "C:/Users/User/Downloads/GAPMINDER DATASET/gapminder_full.csv",
  sep = ",",
  header = TRUE
)
head(gapminder, n = 10)
##        country year population continent life_exp  gdp_cap
## 1  Afghanistan 1952    8425333      Asia   28.801 779.4453
## 2  Afghanistan 1957    9240934      Asia   30.332 820.8530
## 3  Afghanistan 1962   10267083      Asia   31.997 853.1007
## 4  Afghanistan 1967   11537966      Asia   34.020 836.1971
## 5  Afghanistan 1972   13079460      Asia   36.088 739.9811
## 6  Afghanistan 1977   14880372      Asia   38.438 786.1134
## 7  Afghanistan 1982   12881816      Asia   39.854 978.0114
## 8  Afghanistan 1987   13867957      Asia   40.822 852.3959
## 9  Afghanistan 1992   16317921      Asia   41.674 649.3414
## 10 Afghanistan 1997   22227415      Asia   41.763 635.3414
# Keep only the rows of gapminder recorded for the year 1957.
gapminder_1957 <- filter(gapminder, year == 1957)
# Scatter plot for 1957: population on the x-axis, life expectancy on the
# y-axis.
gapminder_1957 %>%
  ggplot(mapping = aes(x = population, y = life_exp)) +
  geom_point()

# Scatter plot for 1957: population on the x-axis, GDP per capita on the
# y-axis.
ggplot(
  data = gapminder_1957,
  mapping = aes(x = population, y = gdp_cap)
) +
  geom_point()

# Scatter plot for 1957: GDP per capita on the x-axis, life expectancy on the
# y-axis.
gapminder_1957 %>%
  ggplot(mapping = aes(x = gdp_cap, y = life_exp)) +
  geom_point()

#Adding log scale
# Scatter plot for 1957: population on the x-axis (log10 scale), life
# expectancy on the y-axis.
ggplot(
  data = gapminder_1957,
  mapping = aes(x = population, y = life_exp)
) +
  geom_point() +
  scale_x_log10()

Population increases as life expectancy increases.

# Scatter plot for 1957: population on the x-axis, GDP per capita on the
# y-axis, both on log10 scales.
gapminder_1957 %>%
  ggplot(mapping = aes(x = population, y = gdp_cap)) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10()

Population rises with an increase in GDP per capita.

#Using the color and size aesthetic
# Scatter plot for 1957: population on the x-axis (log10 scale), life
# expectancy on the y-axis, points coloured by continent.
ggplot(
  data = gapminder_1957,
  mapping = aes(x = population, y = life_exp, color = continent)
) +
  geom_point() +
  scale_x_log10()

Africans have lower life expectancy. Europeans have higher life expectancy.

#Using the color and size aesthetic
# Scatter plot for 1957: population on the x-axis (log10 scale), life
# expectancy on the y-axis, colour showing the continent and point size
# showing GDP per capita.
gapminder_1957 %>%
  ggplot(
    mapping = aes(
      x = population,
      y = life_exp,
      color = continent,
      size = gdp_cap
    )
  ) +
  geom_point() +
  scale_x_log10()

Africans had lower life expectancy with smaller gdp_cap. Asians had bigger gdp_cap and a moderate life expectancy.

#Using faceting
# Scatter plot for 1957: population on the x-axis (log10 scale), life
# expectancy on the y-axis, one panel per continent.
ggplot(
  data = gapminder_1957,
  mapping = aes(x = population, y = life_exp)
) +
  geom_point() +
  scale_x_log10() +
  facet_wrap(vars(continent))

# Scatter plot over the full dataset: GDP per capita on the x-axis (log10
# scale), life expectancy on the y-axis, colour showing the continent, point
# size showing the population, one panel per year.
gapminder %>%
  ggplot(
    mapping = aes(
      x = gdp_cap,
      y = life_exp,
      color = continent,
      size = population
    )
  ) +
  geom_point() +
  scale_x_log10() +
  facet_wrap(~year)

Life expectancy and gdp_cap increase globally over the years. Asians tend to have bigger gdp_cap.

#VISUALIZING WITH SUMMARIZED DATA
# by_year: one row per year holding the median life expectancy across all
# countries; preview the first ten rows.
by_year <- gapminder %>%
  group_by(year) %>%
  summarise(medianLifeExp = median(life_exp))
head(by_year, n = 10)
## # A tibble: 10 × 2
##     year medianLifeExp
##    <int>         <dbl>
##  1  1952          45.1
##  2  1957          48.4
##  3  1962          50.9
##  4  1967          53.8
##  5  1972          56.5
##  6  1977          59.7
##  7  1982          62.4
##  8  1987          65.8
##  9  1992          67.7
## 10  1997          69.4
# Scatter plot of median life expectancy against year, using by_year.
# expand_limits(y = 0) forces the y-axis to include zero.
by_year %>%
  ggplot(mapping = aes(x = year, y = medianLifeExp)) +
  geom_point() +
  expand_limits(y = 0)

#expand_limits(y=0) function was added since the y-axis did not start from zero.
# by_year_continent: median GDP per capita for every year/continent pair.
# `.groups = "drop"` returns an ungrouped tibble; without it the result stays
# grouped by year and summarise() prints a message about the leftover grouping.
by_year_continent <- gapminder %>%
  group_by(year, continent) %>%
  summarise(medianGdpPercap = median(gdp_cap), .groups = "drop")
head(by_year_continent, 10)
## # A tibble: 10 × 3
##     year continent medianGdpPercap
##    <int> <chr>               <dbl>
##  1  1952 Africa               987.
##  2  1952 Americas            3048.
##  3  1952 Asia                1207.
##  4  1952 Europe              5142.
##  5  1952 Oceania            10298.
##  6  1957 Africa              1024.
##  7  1957 Americas            3781.
##  8  1957 Asia                1548.
##  9  1957 Europe              6067.
## 10  1957 Oceania            11599.
# Scatter plot of median GDP per capita against year, coloured by continent,
# using by_year_continent. expand_limits(y = 0) forces the y-axis to include
# zero, since it would not otherwise start at the origin.
ggplot(
  data = by_year_continent,
  mapping = aes(x = year, y = medianGdpPercap, color = continent)
) +
  geom_point() +
  expand_limits(y = 0)

The median gdp_cap increased over the years, though Africa had the lowest values and Oceania the highest.

# by_continent_2007: for the 2007 rows only, the median GDP per capita and
# median life expectancy of each continent. Printed in full afterwards.
by_continent_2007 <- gapminder %>%
  filter(year == 2007) %>%
  group_by(continent) %>%
  summarise(
    medianGdpPercap = median(gdp_cap),
    medianLifeExp = median(life_exp)
  )
by_continent_2007
## # A tibble: 5 × 3
##   continent medianGdpPercap medianLifeExp
##   <chr>               <dbl>         <dbl>
## 1 Africa              1452.          52.9
## 2 Americas            8948.          72.9
## 3 Asia                4471.          72.4
## 4 Europe             28054.          78.6
## 5 Oceania            29810.          80.7
# Scatter plot of the 2007 continent summary: median GDP per capita on the
# x-axis, median life expectancy on the y-axis (extended to include zero),
# coloured by continent.
by_continent_2007 %>%
  ggplot(
    mapping = aes(x = medianGdpPercap, y = medianLifeExp, color = continent)
  ) +
  geom_point() +
  expand_limits(y = 0)

#Creating Line Plots
# by_year_Gdp: one row per year holding the median GDP per capita across all
# countries; preview the first ten rows.
by_year_Gdp <- gapminder %>%
  group_by(year) %>%
  summarise(medianGdpPercap = median(gdp_cap))
head(by_year_Gdp, n = 10)
## # A tibble: 10 × 2
##     year medianGdpPercap
##    <int>           <dbl>
##  1  1952           1969.
##  2  1957           2173.
##  3  1962           2335.
##  4  1967           2678.
##  5  1972           3339.
##  6  1977           3799.
##  7  1982           4216.
##  8  1987           4280.
##  9  1992           4386.
## 10  1997           4782.
# Line plot of median GDP per capita against year, using by_year_Gdp, with
# the y-axis extended to include zero.
ggplot(
  data = by_year_Gdp,
  mapping = aes(x = year, y = medianGdpPercap)
) +
  geom_line() +
  expand_limits(y = 0)

# by_year_continent_Gdp: median GDP per capita for every year/continent pair.
# `.groups = "drop"` returns an ungrouped tibble; without it the result stays
# grouped by year and summarise() prints a message about the leftover grouping.
by_year_continent_Gdp <- gapminder %>%
  group_by(year, continent) %>%
  summarise(medianGdpPercap = median(gdp_cap), .groups = "drop")
head(by_year_continent_Gdp, 10)
## # A tibble: 10 × 3
##     year continent medianGdpPercap
##    <int> <chr>               <dbl>
##  1  1952 Africa               987.
##  2  1952 Americas            3048.
##  3  1952 Asia                1207.
##  4  1952 Europe              5142.
##  5  1952 Oceania            10298.
##  6  1957 Africa              1024.
##  7  1957 Americas            3781.
##  8  1957 Asia                1548.
##  9  1957 Europe              6067.
## 10  1957 Oceania            11599.
# Line plot of median GDP per capita against year, one coloured line per
# continent, using by_year_continent_Gdp; the y-axis is extended to include
# zero.
by_year_continent_Gdp %>%
  ggplot(mapping = aes(x = year, y = medianGdpPercap, color = continent)) +
  geom_line() +
  expand_limits(y = 0)

#Creating Barplots
# by_continent_1957: median GDP per capita of each continent, computed from
# the 1957 rows only. Printed in full afterwards.
by_continent_1957 <- gapminder %>%
  filter(year == 1957) %>%
  group_by(continent) %>%
  summarise(medianGdpPercap = median(gdp_cap))
by_continent_1957
## # A tibble: 5 × 2
##   continent medianGdpPercap
##   <chr>               <dbl>
## 1 Africa              1024.
## 2 Americas            3781.
## 3 Asia                1548.
## 4 Europe              6067.
## 5 Oceania            11599.
# Bar chart of the 1957 continent summary: one bar per continent, bar height
# equal to the median GDP per capita.
by_continent_1957 %>%
  ggplot(mapping = aes(x = continent, y = medianGdpPercap)) +
  geom_col()

#Creating Histogram
# gapminder_1957_pop: the 1957 rows plus a pop_by_mil column holding the
# population expressed in millions; preview the first ten rows.
gapminder_1957_pop <- mutate(
  filter(gapminder, year == 1957),
  pop_by_mil = population / 1e6
)
head(gapminder_1957_pop, n = 10)
##        country year population continent life_exp    gdp_cap pop_by_mil
## 1  Afghanistan 1957    9240934      Asia   30.332   820.8530   9.240934
## 2      Albania 1957    1476505    Europe   59.280  1942.2842   1.476505
## 3      Algeria 1957   10270856    Africa   45.685  3013.9760  10.270856
## 4       Angola 1957    4561361    Africa   31.999  3827.9405   4.561361
## 5    Argentina 1957   19610538  Americas   64.399  6856.8562  19.610538
## 6    Australia 1957    9712569   Oceania   70.330 10949.6496   9.712569
## 7      Austria 1957    6965860    Europe   67.480  8842.5980   6.965860
## 8      Bahrain 1957     138655      Asia   53.832 11635.7995   0.138655
## 9   Bangladesh 1957   51365468      Asia   39.348   661.6375  51.365468
## 10     Belgium 1957    8989111    Europe   69.240  9714.9606   8.989111
# Histogram of population in millions for 1957, using 50 bins on a log10
# x-axis.
ggplot(data = gapminder_1957_pop, mapping = aes(x = pop_by_mil)) +
  geom_histogram(bins = 50) +
  scale_x_log10()

# gapminder_1962: only the rows of gapminder recorded for the year 1962;
# preview the first ten rows.
gapminder_1962 <- filter(gapminder, year == 1962)
head(gapminder_1962, n = 10)
##        country year population continent life_exp    gdp_cap
## 1  Afghanistan 1962   10267083      Asia   31.997   853.1007
## 2      Albania 1962    1728137    Europe   64.820  2312.8890
## 3      Algeria 1962   11000948    Africa   48.303  2550.8169
## 4       Angola 1962    4826015    Africa   34.000  4269.2767
## 5    Argentina 1962   21283783  Americas   65.142  7133.1660
## 6    Australia 1962   10794968   Oceania   70.930 12217.2269
## 7      Austria 1962    7129864    Europe   69.540 10750.7211
## 8      Bahrain 1962     171863      Asia   56.923 12753.2751
## 9   Bangladesh 1962   56839289      Asia   41.216   686.3416
## 10     Belgium 1962    9218400    Europe   70.250 10991.2068
# Box plot of GDP per capita (log10 y-axis) for each continent in 1962.
gapminder_1962 %>%
  ggplot(mapping = aes(x = continent, y = gdp_cap)) +
  geom_boxplot() +
  scale_y_log10()