Exploratory Question Analysis

Q6 - Which countries will have more immigrants in the future, and which will have fewer? A scatter plot with a fitted linear regression line is used to observe the immigration pattern.

Import the necessary libraries

library(tidyverse) #For data manipulation and ggplot
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr) #For data manipulation
library(ggpubr) #For sub plotting

Import the Dataset

# Load the immigration table from CSV. As the preview below shows, it holds
# one row per country with descriptive columns (Country, Continent, Region,
# DevName), one count column per year (X1980 ... X2013) and a Total column.
DF <- read.csv(
  file = '/Users/salahkaf/Desktop/UpdatedDF.csv',
  header = TRUE
)
head(DF, n = 6L) # Preview the first six rows
##          Country Continent          Region            DevName X1980 X1981 X1982
## 1    Afghanistan      Asia   Southern Asia Developing regions    16    39    39
## 2        Albania    Europe Southern Europe  Developed regions     1     0     0
## 3        Algeria    Africa Northern Africa Developing regions    80    67    71
## 4 American Samoa   Oceania       Polynesia Developing regions     0     1     0
## 5        Andorra    Europe Southern Europe  Developed regions     0     0     0
## 6         Angola    Africa   Middle Africa Developing regions     1     3     6
##   X1983 X1984 X1985 X1986 X1987 X1988 X1989 X1990 X1991 X1992 X1993 X1994 X1995
## 1    47    71   340   496   741   828  1076  1028  1378  1170   713   858  1537
## 2     0     0     0     1     2     2     3     3    21    56    96    71    63
## 3    69    63    44    69   132   242   434   491   872   795   717   595  1106
## 4     0     0     0     0     1     0     1     2     0     0     0     0     0
## 5     0     0     0     2     0     0     0     3     0     1     0     0     0
## 6     6     4     3     5     5    11     6     8    23    26    22     8    26
##   X1996 X1997 X1998 X1999 X2000 X2001 X2002 X2003 X2004 X2005 X2006 X2007 X2008
## 1  2212  2555  1999  2395  3326  4067  3697  3479  2978  3436  3009  2652  2111
## 2   113   307   574  1264  1816  1602  1021   853  1450  1223   856   702   560
## 3  2054  1842  2292  2389  2867  3418  3406  3072  3616  3626  4807  3623  4005
## 4     0     0     0     0     0     0     0     0     0     0     1     0     0
## 5     0     0     2     0     0     1     0     2     0     0     1     1     0
## 6    38    27    58    49    70   169   168   165   268   295   184   106    76
##   X2009 X2010 X2011 X2012 X2013 Total
## 1  1746  1758  2203  2635  2004 58639
## 2   716   561   539   620   603 15699
## 3  5393  4752  4325  3774  4331 69439
## 4     0     0     0     0     0     6
## 5     0     0     0     1     1    15
## 6    62    61    39    70    45  2113

Select a country or total as an option

# Select the row for one country, keeping only the yearly count columns.
# "China" is the value currently analysed; replace it with any other entry of
# DF$Country to study a different row (the purpose is to automate this option).
# Year columns are matched by name (X1980, X1981, ...) instead of the fixed
# positions 5:38, so the selection survives added or reordered columns.
# %in% never yields NA, so rows with a missing Country are not picked up, and
# drop = FALSE guarantees the result stays a data frame.
Q6DF <- DF[
  DF$Country %in% "China",
  grep("^X[0-9]{4}$", names(DF)),
  drop = FALSE
]
Q6DF
##    X1980 X1981 X1982 X1983 X1984 X1985 X1986 X1987 X1988 X1989 X1990 X1991
## 37  5123  6682  3308  1863  1527  1816  1960  2643  2758  4323  8076 14255
##    X1992 X1993 X1994 X1995 X1996 X1997 X1998 X1999 X2000 X2001 X2002 X2003
## 37 10846  9817 13128 14398 19415 20475 21049 30069 35529 36434 31961 36439
##    X2004 X2005 X2006 X2007 X2008 X2009 X2010 X2011 X2012 X2013
## 37 36619 42584 33518 27642 30037 29622 30391 28502 33024 34129

Data wrangling

#### Transposing the subsetted dataset
# Reshape the single wide row (one column per year) into long format with one
# row per year. pivot_longer() is used instead of the superseded gather(), and
# everything() avoids hard-coding the number of year columns.
Q6DF <- pivot_longer(
  Q6DF,
  cols = everything(),
  names_to = "Year",
  names_prefix = "X", # Removing the leading X from the year column names
  names_transform = list(Year = as.integer), # Store years as integers
  values_to = "Total"
)
# pivot_longer() returns a tibble; convert back so Q6DF stays a plain
# data.frame and prints the same way as before.
Q6DF <- as.data.frame(Q6DF)
Q6DF # Final DF ready to be plotted
##    Year Total
## 1  1980  5123
## 2  1981  6682
## 3  1982  3308
## 4  1983  1863
## 5  1984  1527
## 6  1985  1816
## 7  1986  1960
## 8  1987  2643
## 9  1988  2758
## 10 1989  4323
## 11 1990  8076
## 12 1991 14255
## 13 1992 10846
## 14 1993  9817
## 15 1994 13128
## 16 1995 14398
## 17 1996 19415
## 18 1997 20475
## 19 1998 21049
## 20 1999 30069
## 21 2000 35529
## 22 2001 36434
## 23 2002 31961
## 24 2003 36439
## 25 2004 36619
## 26 2005 42584
## 27 2006 33518
## 28 2007 27642
## 29 2008 30037
## 30 2009 29622
## 31 2010 30391
## 32 2011 28502
## 33 2012 33024
## 34 2013 34129
# Scatter plot of the yearly immigrant counts with a fitted linear model line
# (geom_smooth(method = "lm")) to show the overall trend.
Q6Regplot <- ggplot(data = Q6DF, aes(x = Year, y = Total)) +
  geom_point() +
  geom_smooth(method = "lm") +
  # Year is an integer column, so the x axis must use a continuous scale;
  # a discrete scale would not place these numeric breaks on the axis.
  scale_x_continuous(breaks = seq(1980, 2013, by = 2)) + # A jump of 2 years
  scale_y_continuous(breaks = seq(10000, 50000, by = 5000)) +
  ggtitle("Linear Regression plot") +
  theme(plot.title = element_text(hjust = 0.5)) + # Centre the title
  xlab("Years") + ylab("Number of Immigrants")
Q6Regplot
## `geom_smooth()` using formula 'y ~ x'