Exploratory Question Analysis
Q6 - Which countries will have more immigrants in the future, and which will have fewer? A scatter plot with a fitted linear regression line is used to observe the immigration pattern over time.
Import the necessary libraries
library(tidyverse) #For data manipulation and ggplot
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr) #For data manipulation
library(ggpubr) #For sub plotting
Import the Dataset
# Load the immigration dataset from disk and preview its first rows.
# The preview printed below shows one row per country and one X<year> column
# per year, followed by a Total column.
DF <- read.csv(file = "/Users/salahkaf/Desktop/UpdatedDF.csv")
head(x = DF)
## Country Continent Region DevName X1980 X1981 X1982
## 1 Afghanistan Asia Southern Asia Developing regions 16 39 39
## 2 Albania Europe Southern Europe Developed regions 1 0 0
## 3 Algeria Africa Northern Africa Developing regions 80 67 71
## 4 American Samoa Oceania Polynesia Developing regions 0 1 0
## 5 Andorra Europe Southern Europe Developed regions 0 0 0
## 6 Angola Africa Middle Africa Developing regions 1 3 6
## X1983 X1984 X1985 X1986 X1987 X1988 X1989 X1990 X1991 X1992 X1993 X1994 X1995
## 1 47 71 340 496 741 828 1076 1028 1378 1170 713 858 1537
## 2 0 0 0 1 2 2 3 3 21 56 96 71 63
## 3 69 63 44 69 132 242 434 491 872 795 717 595 1106
## 4 0 0 0 0 1 0 1 2 0 0 0 0 0
## 5 0 0 0 2 0 0 0 3 0 1 0 0 0
## 6 6 4 3 5 5 11 6 8 23 26 22 8 26
## X1996 X1997 X1998 X1999 X2000 X2001 X2002 X2003 X2004 X2005 X2006 X2007 X2008
## 1 2212 2555 1999 2395 3326 4067 3697 3479 2978 3436 3009 2652 2111
## 2 113 307 574 1264 1816 1602 1021 853 1450 1223 856 702 560
## 3 2054 1842 2292 2389 2867 3418 3406 3072 3616 3626 4807 3623 4005
## 4 0 0 0 0 0 0 0 0 0 0 1 0 0
## 5 0 0 2 0 0 1 0 2 0 0 1 1 0
## 6 38 27 58 49 70 169 168 165 268 295 184 106 76
## X2009 X2010 X2011 X2012 X2013 Total
## 1 1746 1758 2203 2635 2004 58639
## 2 716 561 539 620 603 15699
## 3 5393 4752 4325 3774 4331 69439
## 4 0 0 0 0 0 6
## 5 0 0 0 1 1 15
## 6 62 61 39 70 45 2113
Select a country (or the overall total) to analyse
# Country whose yearly series is analysed. This is the single "user input" of
# the script; change it here to analyse another country.
# NOTE(review): the original comment named "Total" as the input, presumably a
# row labelled "Total" in the Country column -- confirm it exists in the data.
selected_country <- "China"

# Pick the yearly columns by name (X1980, X1981, ...) rather than by position,
# so the selection survives added or reordered metadata columns.
year_cols <- grep("^X[0-9]{4}$", names(DF), value = TRUE)

# %in% never yields NA, so rows with a missing Country are not dragged in.
Q6DF <- DF[DF$Country %in% selected_country, year_cols, drop = FALSE]

# Fail loudly on a mistyped country instead of plotting an empty data frame.
if (nrow(Q6DF) == 0L) {
  stop("No rows found for country: ", selected_country, call. = FALSE)
}
Q6DF
## X1980 X1981 X1982 X1983 X1984 X1985 X1986 X1987 X1988 X1989 X1990 X1991
## 37 5123 6682 3308 1863 1527 1816 1960 2643 2758 4323 8076 14255
## X1992 X1993 X1994 X1995 X1996 X1997 X1998 X1999 X2000 X2001 X2002 X2003
## 37 10846 9817 13128 14398 19415 20475 21049 30069 35529 36434 31961 36439
## X2004 X2005 X2006 X2007 X2008 X2009 X2010 X2011 X2012 X2013
## 37 36619 42584 33518 27642 30037 29622 30391 28502 33024 34129
Data wrangling
#### Reshaping the subsetted dataset from wide to long
# Turn the wide selection (one column per year) into long format: one row per
# year with the columns Year and Total.
# pivot_longer() replaces the superseded gather(); everything() avoids
# hard-coding the number of year columns.
Q6DF <- pivot_longer(
  Q6DF,
  cols = everything(),
  names_to = "Year",
  values_to = "Total"
)
Q6DF <- as.data.frame(Q6DF) # Keep a plain data.frame so every row is printed
# Strip only the leading "X" that read.csv() prepends to numeric column names,
# then store the year as an integer ("X1980" -> 1980L).
Q6DF$Year <- as.integer(sub("^X", "", Q6DF$Year))
Q6DF # Final DF ready to be plotted
## Year Total
## 1 1980 5123
## 2 1981 6682
## 3 1982 3308
## 4 1983 1863
## 5 1984 1527
## 6 1985 1816
## 7 1986 1960
## 8 1987 2643
## 9 1988 2758
## 10 1989 4323
## 11 1990 8076
## 12 1991 14255
## 13 1992 10846
## 14 1993 9817
## 15 1994 13128
## 16 1995 14398
## 17 1996 19415
## 18 1997 20475
## 19 1998 21049
## 20 1999 30069
## 21 2000 35529
## 22 2001 36434
## 23 2002 31961
## 24 2003 36439
## 25 2004 36619
## 26 2005 42584
## 27 2006 33518
## 28 2007 27642
## 29 2008 30037
## 30 2009 29622
## 31 2010 30391
## 32 2011 28502
## 33 2012 33024
## 34 2013 34129
# Scatter plot of yearly immigrant counts with a fitted linear trend line.
Q6Regplot <- ggplot(data = Q6DF, aes(x = Year, y = Total)) +
  geom_point() +
  geom_smooth(method = "lm") + # Least-squares line with its confidence band
  # Year is an integer (continuous) variable, so a continuous scale is needed
  # for the requested breaks to be drawn; a discrete scale does not draw them.
  scale_x_continuous(breaks = seq(1980, 2013, by = 2)) + # A tick every 2 years
  scale_y_continuous(breaks = seq(10000, 50000, by = 5000)) +
  ggtitle("Linear Regression plot") +
  theme(plot.title = element_text(hjust = 0.5)) + # Centre the title
  xlab("Years") + ylab("Number of Immigrants")
Q6Regplot
## `geom_smooth()` using formula 'y ~ x'
