# Fetch the raw CSV text of the UN migrant stock table from GitHub.
u <- getURL(
  "https://raw.githubusercontent.com/MundyMSDS/DATA607/master/UN_MigrantStockTotal_2015.csv"
)

# Parse the downloaded text, skipping its first 15 lines
# (presumably spreadsheet preamble above the header row -- confirm).
un_data <- read_csv(file = u, skip = 15)

# Preview the first rows of the parsed table.
head(un_data)
## # A tibble: 6 x 32
## X1 X2 X3 X4 X5 `1990` `1995` `2000` `2005` `2010` `2015`
## <dbl> <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 1 WORLD <NA> 900 <NA> 18 83~ 17 85~ 15 82~ 13 27~ 15 37~ 19 57~
## 2 2 Deve~ (b) 901 <NA> 2 014~ 3 609~ 2 997~ 2 361~ 2 046~ 1 954~
## 3 3 Deve~ (c) 902 <NA> 16 82~ 14 24~ 12 83~ 10 91~ 13 32~ 17 62~
## 4 4 Leas~ (d) 941 <NA> 5 048~ 5 160~ 3 047~ 2 363~ 1 957~ 3 443~
## 5 5 Less~ <NA> 934 <NA> 11 77~ 9 084~ 9 783~ 8 551~ 11 36~ 14 17~
## 6 6 Sub-~ (e) 947 <NA> 5 516~ 5 747~ 3 421~ 2 555~ 2 215~ 3 638~
## # ... with 21 more variables: `1990_1` <chr>, `1995_1` <chr>,
## # `2000_1` <chr>, `2005_1` <chr>, `2010_1` <dbl>, `2015_1` <dbl>,
## # `1990-1995` <chr>, `1995-2000` <chr>, `2000-2005` <chr>,
## # `2005-2010` <chr>, `2010-2015` <chr>, X23 <lgl>, X24 <lgl>, X25 <lgl>,
## # X26 <lgl>, X27 <lgl>, X28 <lgl>, X29 <lgl>, X30 <lgl>, X31 <lgl>,
## # X32 <lgl>
un_data <- un_data %>%
  # Give the identifier columns readable names.
  rename(id = X1, country = X2, loc_code = X4) %>%
  # Remove the columns that are not needed for the analysis: X3, the
  # duplicated "_1" year columns, the "-" period columns and every remaining
  # unnamed "X" column.
  select(
    -X3,
    -ends_with("_1"),
    -contains("-"),
    -starts_with("X")
  )
# Render the first rows of the trimmed table as a styled kable.
un_data %>%
  head() %>%
  kable() %>%
  kable_styling()

| id | country | loc_code | 1990 | 1995 | 2000 | 2005 | 2010 | 2015 |
|---|---|---|---|---|---|---|---|---|
| 1 | WORLD | 900 | 18 836 571 | 17 853 840 | 15 827 803 | 13 276 733 | 15 370 755 | 19 577 474 |
| 2 | Developed regions | 901 | 2 014 564 | 3 609 670 | 2 997 256 | 2 361 229 | 2 046 917 | 1 954 224 |
| 3 | Developing regions | 902 | 16 822 007 | 14 244 170 | 12 830 547 | 10 915 504 | 13 323 838 | 17 623 250 |
| 4 | Least developed countries | 941 | 5 048 391 | 5 160 131 | 3 047 488 | 2 363 782 | 1 957 884 | 3 443 582 |
| 5 | Less developed regions excluding least developed countries | 934 | 11 773 616 | 9 084 039 | 9 783 059 | 8 551 722 | 11 365 954 | 14 179 668 |
| 6 | Sub-Saharan Africa | 947 | 5 516 042 | 5 747 830 | 3 421 165 | 2 555 099 | 2 215 890 | 3 638 433 |
un_data <- un_data %>%
  # Label each key region row (identified by its location code) with its own
  # name; every other row gets NA so that fill() can carry the region name
  # down to the rows listed beneath it.
  mutate(
    region = if_else(
      loc_code %in% c(903, 935, 908, 904, 909, 905),
      country,
      NA_character_
    )
  ) %>%
  # Drop the first six rows (WORLD and the development-group aggregates).
  filter(id > 6) %>%
  # Propagate each region label downward over the NA entries.
  fill(region) %>%
  # Keep only rows whose location code is below 900; the region and
  # aggregate rows seen in the data all carry codes of 900 or more.
  filter(loc_code < 900) %>%
  # Convert the space-grouped numeric strings (e.g. "18 836 571") to doubles.
  mutate_at(
    vars(`1990`, `1995`, `2000`, `2005`, `2010`, `2015`),
    function(x) as.double(str_replace_all(x, " ", ""))
  ) %>%
  # Keep only rows with a non-missing, non-zero value in every year column.
  # 1995 is checked together with the other years: it used to be left out of
  # both checks, which let rows with a missing or zero 1995 value through.
  filter_at(
    vars(`1990`, `1995`, `2000`, `2005`, `2010`, `2015`),
    all_vars(!is.na(.) & . != 0)
  ) %>%
  # Reshape to long format: one row per country and year.
  gather(
    `1990`, `1995`, `2000`, `2005`, `2010`, `2015`,
    key = "year", value = "refugees"
  ) %>%
  arrange(country, year)
## # A tibble: 112 x 3
## country region data
## <chr> <chr> <list>
## 1 Albania Europe <tibble [6 x 4]>
## 2 Algeria Africa <tibble [6 x 4]>
## 3 Angola Africa <tibble [6 x 4]>
## 4 Argentina Latin America and the Caribbean <tibble [6 x 4]>
## 5 Australia Oceania <tibble [6 x 4]>
## 6 Austria Europe <tibble [6 x 4]>
## 7 Bahrain Asia <tibble [6 x 4]>
## 8 Bangladesh Asia <tibble [6 x 4]>
## 9 Belgium Europe <tibble [6 x 4]>
## 10 Belize Latin America and the Caribbean <tibble [6 x 4]>
## # ... with 102 more rows
#### Here I fit a model for each country using the country_model function and add the model residuals. This will enable me to plot the model residuals.
# Fit a linear trend of refugee counts over time for one country.
#
# df: a data frame with a `year` column and a numeric `refugees` column.
# Returns the fitted `lm` object.
country_model <- function(df) {
  # `year` is a character column in the gathered data. Left as text, lm()
  # treats it as a categorical predictor with one level per observation,
  # reproduces the data exactly and leaves residuals of ~0. Coercing it to a
  # number fits a straight-line trend instead. Going through as.character()
  # keeps the coercion correct if `year` is ever a factor.
  lm(refugees ~ as.numeric(as.character(year)), data = df)
}
# Fit one model per country from its nested data, attach the residuals of
# that model to the same data, then order the rows by region and country.
by_country <- by_country %>%
  mutate(
    model = map(data, country_model),
    resids = map2(data, model, add_residuals)
  ) %>%
  arrange(region, country)

by_country
## # A tibble: 112 x 5
## country region data model resids
## <chr> <chr> <list> <list> <list>
## 1 Algeria Africa <tibble [6 x 4]> <S3: l~ <tibble [6 x 5~
## 2 Angola Africa <tibble [6 x 4]> <S3: l~ <tibble [6 x 5~
## 3 Benin Africa <tibble [6 x 4]> <S3: l~ <tibble [6 x 5~
## 4 Botswana Africa <tibble [6 x 4]> <S3: l~ <tibble [6 x 5~
## 5 Burkina Faso Africa <tibble [6 x 4]> <S3: l~ <tibble [6 x 5~
## 6 Burundi Africa <tibble [6 x 4]> <S3: l~ <tibble [6 x 5~
## 7 "C\xf4te d'Ivoire" Africa <tibble [6 x 4]> <S3: l~ <tibble [6 x 5~
## 8 Cameroon Africa <tibble [6 x 4]> <S3: l~ <tibble [6 x 5~
## 9 Central African Republic Africa <tibble [6 x 4]> <S3: l~ <tibble [6 x 5~
## 10 Congo Africa <tibble [6 x 4]> <S3: l~ <tibble [6 x 5~
## # ... with 102 more rows
## # A tibble: 672 x 7
## country region id loc_code year refugees resid
## <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 Algeria Africa 40 12 1990 169107 -1.16e-10
## 2 Algeria Africa 40 12 1995 192489 -2.91e-11
## 3 Algeria Africa 40 12 2000 167453 -5.82e-11
## 4 Algeria Africa 40 12 2005 94101 -5.82e-11
## 5 Algeria Africa 40 12 2010 94144 -4.37e-11
## 6 Algeria Africa 40 12 2015 94144 -2.91e-11
## 7 Angola Africa 30 24 1990 12000 -9.09e-12
## 8 Angola Africa 30 24 1995 11404 -3.64e-12
## 9 Angola Africa 30 24 2000 12579 -3.64e-12
## 10 Angola Africa 30 24 2005 13867 -3.64e-12
## # ... with 662 more rows
# Residuals by year: one line per country plus a smoothed trend, converted
# to an interactive plotly widget.
p <- ggplotly(
  ggplot(data = resids, mapping = aes(x = year, y = resid)) +
    geom_line(mapping = aes(group = country), alpha = 1 / 2) +
    geom_smooth(se = FALSE)
)
p
#### Here I calculate the model using the country_model function and add the model residuals. This will enable me to plot the model residuals. I will use plotly so that I can explore the plots interactively to identify insights.
# Static version of the residual plot: one line per country with a
# smoothed trend drawn on top.
ggplot(data = resids, mapping = aes(x = year, y = resid)) +
  geom_line(mapping = aes(group = country), alpha = 1 / 2) +
  geom_smooth(se = FALSE)