Descriptive and Exploratory Question Analysis

Q4 - What are the immigrant counts for the top 5 immigrant source countries from 1980 to 2013? Visualized with a line chart and an area plot.

Import the necessary libraries

library(tidyverse) #For data manipulation and ggplot
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr) #For Data manipulation
library(ggpubr) #For sub plotting

Import the Dataset

# Load the immigration table from the local CSV export
DF <- read.csv(file = '/Users/salahkaf/Desktop/UpdatedDF.csv')
# Preview the first six rows
head(DF, n = 6L)
##          Country Continent          Region            DevName X1980 X1981 X1982
## 1    Afghanistan      Asia   Southern Asia Developing regions    16    39    39
## 2        Albania    Europe Southern Europe  Developed regions     1     0     0
## 3        Algeria    Africa Northern Africa Developing regions    80    67    71
## 4 American Samoa   Oceania       Polynesia Developing regions     0     1     0
## 5        Andorra    Europe Southern Europe  Developed regions     0     0     0
## 6         Angola    Africa   Middle Africa Developing regions     1     3     6
##   X1983 X1984 X1985 X1986 X1987 X1988 X1989 X1990 X1991 X1992 X1993 X1994 X1995
## 1    47    71   340   496   741   828  1076  1028  1378  1170   713   858  1537
## 2     0     0     0     1     2     2     3     3    21    56    96    71    63
## 3    69    63    44    69   132   242   434   491   872   795   717   595  1106
## 4     0     0     0     0     1     0     1     2     0     0     0     0     0
## 5     0     0     0     2     0     0     0     3     0     1     0     0     0
## 6     6     4     3     5     5    11     6     8    23    26    22     8    26
##   X1996 X1997 X1998 X1999 X2000 X2001 X2002 X2003 X2004 X2005 X2006 X2007 X2008
## 1  2212  2555  1999  2395  3326  4067  3697  3479  2978  3436  3009  2652  2111
## 2   113   307   574  1264  1816  1602  1021   853  1450  1223   856   702   560
## 3  2054  1842  2292  2389  2867  3418  3406  3072  3616  3626  4807  3623  4005
## 4     0     0     0     0     0     0     0     0     0     0     1     0     0
## 5     0     0     2     0     0     1     0     2     0     0     1     1     0
## 6    38    27    58    49    70   169   168   165   268   295   184   106    76
##   X2009 X2010 X2011 X2012 X2013 Total
## 1  1746  1758  2203  2635  2004 58639
## 2   716   561   539   620   603 15699
## 3  5393  4752  4325  3774  4331 69439
## 4     0     0     0     0     0     6
## 5     0     0     0     1     1    15
## 6    62    61    39    70    45  2113

Cleaning the dataset

# Keep every row except positions 196 and 197, which the original note
# identifies as the "unknown" and "Total" rows.
# NOTE(review): the positions are hard-coded and assume the CSV row order.
DFQ4 <- DF[!(seq_len(nrow(DF)) %in% c(196, 197)), ]

Sorting in descending order of Total to find the top 5 countries

# Rank the countries by their Total column, largest first
DFtop5 <- DFQ4 %>% arrange(desc(Total))
# Keep the five highest-ranked rows; column 1 is Country and
# columns 5 to 38 are the yearly counts X1980..X2013
DFtop5 <- DFtop5[seq_len(5), c(1, 5:38)]
DFtop5
##                                                Country X1980 X1981 X1982 X1983
## 1                                                India  8880  8670  8147  7338
## 2                                                China  5123  6682  3308  1863
## 3 United Kingdom of Great Britain and Northern Ireland 22045 24796 20620 10015
## 4                                          Philippines  6051  5921  5249  4562
## 5                                             Pakistan   978   972  1201   900
##   X1984 X1985 X1986 X1987 X1988 X1989 X1990 X1991 X1992 X1993 X1994 X1995 X1996
## 1  5704  4211  7150 10189 11522 10343 12041 13734 13673 21496 18620 18489 23859
## 2  1527  1816  1960  2643  2758  4323  8076 14255 10846  9817 13128 14398 19415
## 3 10170  9564  9470 21337 27359 23795 31668 23380 34123 33720 39231 30145 29322
## 4  3801  3150  4166  7360  8639 11865 12509 12718 13670 20479 19532 15864 13692
## 5   668   514   691  1072  1334  2261  2470  3079  4071  4777  4666  4994  9125
##   X1997 X1998 X1999 X2000 X2001 X2002 X2003 X2004 X2005 X2006 X2007 X2008 X2009
## 1 22268 17241 18974 28572 31223 31889 27155 28235 36210 33848 28742 28261 29456
## 2 20475 21049 30069 35529 36434 31961 36439 36619 42584 33518 27642 30037 29622
## 3 22965 10367  7045  8840 11728  8046  6797  7533  7258  7140  8216  8979  8876
## 4 11549  8735  9734 10763 13836 11707 12758 14004 18139 18400 19837 24887 28573
## 5 13073  9068  9979 15400 16708 15110 13205 13399 14314 13127 10124  8994  7217
##   X2010 X2011 X2012 X2013
## 1 34235 27509 30933 33087
## 2 30391 28502 33024 34129
## 3  8724  6204  6195  5827
## 4 38617 36765 34315 29544
## 5  6811  7468 11227 12603

Manipulate the table to make it suitable for visualization

# Transpose so that years run down the rows and countries across the columns
Q4DF <- as.data.frame(t(DFtop5))
# Move the row labels (the former column names) into a leading Year column
Q4DF <- cbind(Year = rownames(Q4DF), Q4DF)
rownames(Q4DF) <- seq_len(nrow(Q4DF)) # Replace the labels with a numeric index
Q4DF$Year <- gsub("X", "", as.character(Q4DF$Year)) # "X1980" -> "1980"
# Drop the first row: it holds the country names, not a year of counts
Q4DF <- Q4DF[-1, ]
# Short column labels, listed in the same order as the rows of DFtop5
names(Q4DF) <- c("Year", "India", "China", "UK", "Philippines", "Pakistan")
# Transposing a mixed-type data frame yields text, so convert every column
# to integer in one pass
Q4DF[] <- lapply(Q4DF, as.integer)
Q4DF # Final DF
##    Year India China    UK Philippines Pakistan
## 2  1980  8880  5123 22045        6051      978
## 3  1981  8670  6682 24796        5921      972
## 4  1982  8147  3308 20620        5249     1201
## 5  1983  7338  1863 10015        4562      900
## 6  1984  5704  1527 10170        3801      668
## 7  1985  4211  1816  9564        3150      514
## 8  1986  7150  1960  9470        4166      691
## 9  1987 10189  2643 21337        7360     1072
## 10 1988 11522  2758 27359        8639     1334
## 11 1989 10343  4323 23795       11865     2261
## 12 1990 12041  8076 31668       12509     2470
## 13 1991 13734 14255 23380       12718     3079
## 14 1992 13673 10846 34123       13670     4071
## 15 1993 21496  9817 33720       20479     4777
## 16 1994 18620 13128 39231       19532     4666
## 17 1995 18489 14398 30145       15864     4994
## 18 1996 23859 19415 29322       13692     9125
## 19 1997 22268 20475 22965       11549    13073
## 20 1998 17241 21049 10367        8735     9068
## 21 1999 18974 30069  7045        9734     9979
## 22 2000 28572 35529  8840       10763    15400
## 23 2001 31223 36434 11728       13836    16708
## 24 2002 31889 31961  8046       11707    15110
## 25 2003 27155 36439  6797       12758    13205
## 26 2004 28235 36619  7533       14004    13399
## 27 2005 36210 42584  7258       18139    14314
## 28 2006 33848 33518  7140       18400    13127
## 29 2007 28742 27642  8216       19837    10124
## 30 2008 28261 30037  8979       24887     8994
## 31 2009 29456 29622  8876       28573     7217
## 32 2010 34235 30391  8724       38617     6811
## 33 2011 27509 28502  6204       36765     7468
## 34 2012 30933 33024  6195       34315    11227
## 35 2013 33087 34129  5827       29544    12603

Plotting the dataset

# Country labels (defined here but not referenced by the plot code below)
colors <- c("India", "China", "UK", "Philippines", "Pakistan")

# Line chart with one "twodash" line per country. The constant string mapped
# to `color` in each layer becomes that line's legend key.
Q4plot_multiplelines <- ggplot(Q4DF, aes(x = Year, group = 1)) +
  geom_line(aes(y = India, color = "India"), linetype = "twodash") +
  geom_line(aes(y = China, color = "China"), linetype = "twodash") +
  geom_line(aes(y = UK, color = "UK"), linetype = "twodash") +
  geom_line(aes(y = Philippines, color = "Philippines"), linetype = "twodash") +
  geom_line(aes(y = Pakistan, color = "Pakistan"), linetype = "twodash") +
  # Axis ticks: every 5 years on x, every 10,000 immigrants on y
  scale_x_continuous(breaks = seq(1980, 2013, by = 5)) +
  scale_y_continuous(breaks = seq(10000, 50000, by = 10000)) +
  # Title and axis labels in a single call
  labs(
    title = "Number of Immigrants per Year",
    x = "Year",
    y = "Immigrants"
  ) +
  # Centre the title above the panel
  theme(plot.title = element_text(hjust = 0.5))
Q4plot_multiplelines

Plotting the area plot

# Area chart of yearly immigrant counts for the five countries in Q4DF.
# Each geom_area() is its own layer, so the areas overlap instead of stacking
# on top of one another; transparency keeps the overlapped areas visible.
#
# `alpha` is a fixed setting, so it is passed outside aes(). Inside aes() the
# constant would be treated as data: the alpha scale would rescale it (so the
# drawn opacity would not be 0.65) and an extra "alpha" legend would appear.
Q4plotarea <- ggplot(data = Q4DF, aes(x = Year, group = 1)) +
  geom_area(aes(y = India, fill = "India"), alpha = 0.65) +
  geom_area(aes(y = China, fill = "China"), alpha = 0.65) +
  geom_area(aes(y = UK, fill = "UK"), alpha = 0.65) +
  geom_area(aes(y = Philippines, fill = "Philippines"), alpha = 0.65) +
  geom_area(aes(y = Pakistan, fill = "Pakistan"), alpha = 0.65) +
  scale_x_continuous(breaks = seq(1980, 2013, by = 5)) + # A jump of 5 years
  scale_y_continuous(breaks = seq(10000, 50000, by = 10000)) +
  ggtitle("Number of Immigrants per Year") +
  theme(plot.title = element_text(hjust = 0.5)) + # Centre the title
  labs(x = "Year", y = "Immigrants")
Q4plotarea