week6homework2

Load the required libraries and the nations dataset

# install.packages("plotly")
library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.1.1     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(readr)
library(ggplot2)
library(dplyr)
library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

# Locate nations.csv: prefer a copy in the working directory so the script
# runs on any machine, and fall back to the original author's local path.
nations_path <- "nations.csv"
if (!file.exists(nations_path)) {
  nations_path <- "C:/Users/wrxio/projects/datasets/nations.csv"
}
nations <- read_csv(nations_path)

## Rows: 5275 Columns: 10

## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (5): iso2c, iso3c, country, region, income
## dbl (5): year, gdp_percap, population, birth_rate, neonat_mortal_rate
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Preview the first rows to confirm the file was parsed as expected
nations %>% head()

## # A tibble: 6 x 10
##   iso2c iso3c country  year gdp_percap population birth_rate neonat_mortal_rate
##   <chr> <chr> <chr>   <dbl>      <dbl>      <dbl>      <dbl>              <dbl>
## 1 AD    AND   Andorra  1996         NA      64291       10.9                2.8
## 2 AD    AND   Andorra  1994         NA      62707       10.9                3.2
## 3 AD    AND   Andorra  2003         NA      74783       10.3                2  
## 4 AD    AND   Andorra  1990         NA      54511       11.9                4.3
## 5 AD    AND   Andorra  2009         NA      85474        9.9                1.7
## 6 AD    AND   Andorra  2011         NA      82326       NA                  1.6
## # ... with 2 more variables: region <chr>, income <chr>

Get some information about the nations dataset

# Inspect the parsed data: dimensions, column names, column types and the
# first few values of each column (the most informative single check)
nations %>% str()

## spec_tbl_df [5,275 x 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ iso2c             : chr [1:5275] "AD" "AD" "AD" "AD" ...
##  $ iso3c             : chr [1:5275] "AND" "AND" "AND" "AND" ...
##  $ country           : chr [1:5275] "Andorra" "Andorra" "Andorra" "Andorra" ...
##  $ year              : num [1:5275] 1996 1994 2003 1990 2009 ...
##  $ gdp_percap        : num [1:5275] NA NA NA NA NA NA NA NA NA NA ...
##  $ population        : num [1:5275] 64291 62707 74783 54511 85474 ...
##  $ birth_rate        : num [1:5275] 10.9 10.9 10.3 11.9 9.9 NA 10.9 9.8 11.8 11.2 ...
##  $ neonat_mortal_rate: num [1:5275] 2.8 3.2 2 4.3 1.7 1.6 2 1.7 2.1 2.1 ...
##  $ region            : chr [1:5275] "Europe & Central Asia" "Europe & Central Asia" "Europe & Central Asia" "Europe & Central Asia" ...
##  $ income            : chr [1:5275] "High income" "High income" "High income" "High income" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   iso2c = col_character(),
##   ..   iso3c = col_character(),
##   ..   country = col_character(),
##   ..   year = col_double(),
##   ..   gdp_percap = col_double(),
##   ..   population = col_double(),
##   ..   birth_rate = col_double(),
##   ..   neonat_mortal_rate = col_double(),
##   ..   region = col_character(),
##   ..   income = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

# Other quick inspections, kept here for reference:
#typeof(nations)         # storage type of the nations dataset - list
#attributes(nations)     # attributes of the nations dataset

#tail(nations)           # last six rows of the nations dataset
nations %>% dim()       # number of rows and columns of the nations dataset

## [1] 5275   10

#nrow(nations)           # number of rows of the nations dataset
#ncol(nations)           # number of columns of the nations dataset
nations %>% names()     # column names of the nations dataset

##  [1] "iso2c"              "iso3c"              "country"           
##  [4] "year"               "gdp_percap"         "population"        
##  [7] "birth_rate"         "neonat_mortal_rate" "region"            
## [10] "income"

#sapply(nations, class)  # the class of each column of nations dataset

Clean up the dataset

Remove the rows with NA values in gdp_percap and year

# Keep only the rows where both gdp_percap and year are present
# (conditions passed separately to filter() are combined with AND)
nations_nona <- filter(nations, !is.na(gdp_percap), !is.na(year))
head(nations_nona)

## # A tibble: 6 x 10
##   iso2c iso3c country     year gdp_percap population birth_rate neonat_mortal_r~
##   <chr> <chr> <chr>      <dbl>      <dbl>      <dbl>      <dbl>            <dbl>
## 1 AE    ARE   United Ar~  1991     73037.    1913190       24.6              7.9
## 2 AE    ARE   United Ar~  1993     71960.    2127863       22.4              7.3
## 3 AE    ARE   United Ar~  2001     83534.    3217865       15.8              5.5
## 4 AE    ARE   United Ar~  1992     73154.    2019014       23.5              7.6
## 5 AE    ARE   United Ar~  1994     74684.    2238281       21.3              6.9
## 6 AE    ARE   United Ar~  2007     75427.    6010100       12.8              4.7
## # ... with 2 more variables: region <chr>, income <chr>

# Check the result
#str(nations_nona) 
#dim(nations_nona)

Compute and Add a New Variable to a Dataframe in R with mutate() - get the following sample code from https://www.marsja.se/how-to-add-a-column-to-dataframe-in-r-with-tibble-dplyr/

First, create a new variable in the data, using mutate() from dplyr, giving the GDP of each country in trillions of dollars: multiply gdp_percap by population and divide by a trillion.

# Insert new column with mutate - sample code
#dataf <- dataf %>% 
#  mutate(DepressionIndex = mean(c_across(Depr1:Depr5))) %>%
#  head()

# Figure out GDP in trillions of dollars: per-capita GDP times population,
# divided by 10^12.
# The pipeline must end at mutate(): a trailing %>% would feed the data into
# the str() call below and overwrite nations_nona with str()'s NULL result.
nations_nona <- nations_nona %>%
  mutate(GDP = (gdp_percap * population) / 10^12)

# Check the result
str(nations_nona)

## spec_tbl_df [4,509 x 11] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ iso2c             : chr [1:4509] "AE" "AE" "AE" "AE" ...
##  $ iso3c             : chr [1:4509] "ARE" "ARE" "ARE" "ARE" ...
##  $ country           : chr [1:4509] "United Arab Emirates" "United Arab Emirates" "United Arab Emirates" "United Arab Emirates" ...
##  $ year              : num [1:4509] 1991 1993 2001 1992 1994 ...
##  $ gdp_percap        : num [1:4509] 73037 71960 83534 73154 74684 ...
##  $ population        : num [1:4509] 1913190 2127863 3217865 2019014 2238281 ...
##  $ birth_rate        : num [1:4509] 24.6 22.4 15.8 23.5 21.3 ...
##  $ neonat_mortal_rate: num [1:4509] 7.9 7.3 5.5 7.6 6.9 4.7 5.1 6.4 4.9 5.6 ...
##  $ region            : chr [1:4509] "Middle East & North Africa" "Middle East & North Africa" "Middle East & North Africa" "Middle East & North Africa" ...
##  $ income            : chr [1:4509] "High income" "High income" "High income" "High income" ...
##  $ GDP               : num [1:4509] 0.14 0.153 0.269 0.148 0.167 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   iso2c = col_character(),
##   ..   iso3c = col_character(),
##   ..   country = col_character(),
##   ..   year = col_double(),
##   ..   gdp_percap = col_double(),
##   ..   population = col_double(),
##   ..   birth_rate = col_double(),
##   ..   neonat_mortal_rate = col_double(),
##   ..   region = col_character(),
##   ..   income = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

#dim(nations_nona)

Draw first chart

Map variables in the data onto the X and Y axes and change the axes labels and theme

# Change the theme
# Base plot of per-capita GDP against year with axis labels and a minimal
# theme. No geom layer is added here, so only the empty panel is drawn.
# The y axis shows gdp_percap, so it is labelled as per-capita GDP rather
# than total GDP in trillions.
ggplot(nations, aes(x = year, y = gdp_percap)) +
  xlab("year") +
  ylab("GDP per capita ($)") +
  theme_minimal(base_size = 12)

# Include all the countries: one point per country and year.
# The y axis shows gdp_percap, so it is labelled as per-capita GDP rather
# than total GDP in trillions.
p1 <- ggplot(nations, aes(x = year, y = gdp_percap)) +
  labs(
    title = "GDP VERSUS YEAR FOR EACH COUNTRY",
    caption = "Source: The World Bank and Siadi"
  ) +
  xlab("year") +
  ylab("GDP per capita ($)") +
  theme_minimal(base_size = 12)
# Rows with a missing gdp_percap are dropped by geom_point() with a warning
p1 + geom_point()

## Warning: Removed 766 rows containing missing values (geom_point).

Select the related countries

#nations_nona %>% filter(country == "China" | country == "Japan" | country == "Germany"  | country == "United States")
#nations_nona_ft <- nations_nona %>% filter(country == "China" | country == "Japan" | country == "Germany")

# Check the results
#head(nations_nona_ft)
#tail(nations_nona_ft)
#str(nations_nona_ft) 
#dim(nations_nona_ft)

Shall select the related countries

# Scatterplot base for the selected countries only, so the data matches the
# plot title. Countries are matched on their ISO 3166 alpha-3 codes (iso3c)
# rather than on names, which avoids spelling mismatches.
# NOTE(review): selection assumed to be China, Germany, Japan and the
# United States - adjust the codes if a different set is wanted.
# The y axis shows gdp_percap, so it is labelled as per-capita GDP.
p2 <- nations %>%
  filter(iso3c %in% c("CHN", "DEU", "JPN", "USA")) %>%
  ggplot(aes(x = year, y = gdp_percap)) +
  labs(
    title = "GDP VERSUS YEAR FOR THE SELECTED COUNTRIES",
    caption = "Source: The World Bank and Siadi"
  ) +
  xlab("year") +
  ylab("GDP per capita ($)") +
  theme_minimal(base_size = 12)
p2 + geom_point()

## Warning: Removed 766 rows containing missing values (geom_point).

The scatterplot appears to show a correlation; limit the axes to the years 1990-2015 and to values between 0 and 100,000 for a closer look

# Restrict both axes: years 1990-2015 on x, values 0-100,000 on y.
# Setting scale limits this way drops the points that fall outside them
# (they are reported as removed rows) rather than merely zooming in.
p3 <- p2 + lims(x = c(1990, 2015), y = c(0, 10^5))
p3 + geom_point()

## Warning: Removed 781 rows containing missing values (geom_point).

Add a smoother in red with a confidence interval

# Points plus a red smoother; the smoothing method is left at ggplot2's
# default and the confidence band is shown
p4 <- p3 +
  geom_point() +
  geom_smooth(colour = "red")
p4

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

## Warning: Removed 781 rows containing non-finite values (stat_smooth).

## Warning: Removed 781 rows containing missing values (geom_point).

Add a linear regression with confidence interval

# Points plus a straight-line (least-squares) fit of y on x, with the
# confidence band left on
p5 <- p3 +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x)
p5

## Warning: Removed 781 rows containing non-finite values (stat_smooth).

## Warning: Removed 781 rows containing missing values (geom_point).

Add a title, make the regression line thin and dot-dashed, and remove the confidence interval band

# Points plus a thin dot-dashed linear fit without the confidence band,
# and a replacement plot title
p6 <- p3 +
  geom_point() +
  geom_smooth(
    method = "lm",
    formula = y ~ x,
    se = FALSE,
    linetype = "dotdash",
    size = 0.3
  ) +
  labs(title = "GDP VERSUS YEAR IN THE SELECTED COUNTRIES")
p6

## Warning: Removed 781 rows containing non-finite values (stat_smooth).

## Warning: Removed 781 rows containing missing values (geom_point).

The second chart

For the second chart, using dplyr you will need to group_by region and year, and then summarise on your mutated value for GDP using summarise(GDP = sum(gdp, na.rm = TRUE)). (There will be null values, or NAs, in this data, so you will need to use na.rm = TRUE.)

# Change the theme
# Base plot of per-capita GDP against year with axis labels and a minimal
# theme. No geom layer is added here, so only the empty panel is drawn.
# The y axis shows gdp_percap, so it is labelled as per-capita GDP rather
# than total GDP in trillions.
ggplot(nations, aes(x = year, y = gdp_percap)) +
  xlab("year") +
  ylab("GDP per capita ($)") +
  theme_minimal(base_size = 12)

# Second chart: total GDP per region and year as a stacked area chart.
# geom_area() needs one value per series and year, so the country rows are
# first aggregated: GDP in $ trillion is computed per country
# (gdp_percap * population / 10^12) and summed within each region and year.
# na.rm = TRUE skips countries with a missing gdp_percap or population.
gdp_by_region <- nations %>%
  mutate(GDP = gdp_percap * population / 10^12) %>%
  group_by(region, year) %>%
  summarise(GDP = sum(GDP, na.rm = TRUE), .groups = "drop")

# One filled band per region, stacked on top of each other
p21 <- ggplot(gdp_by_region, aes(x = year, y = GDP, fill = region)) +
  labs(
    title = "GDP VERSUS YEAR FOR EACH REGION",
    caption = "Source: The World Bank and Siadi"
  ) +
  xlab("year") +
  ylab("GDP ($ trillion)") +
  theme_minimal(base_size = 12)
p21 + geom_area()

## Warning: Removed 766 rows containing missing values (position_stack).