Descriptive Analysis Question

Q2 - How many immigrants in total came to Canada from each continent between 1980 and 2013? The answer is visualized with a pie chart and a line chart.

Part 1 - Pie Chart

Import the necessary libraries

library(tidyverse) #For data manipulation and ggplot
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr) #For Data manipulation
library(ggpubr) #For sub plotting

Import the Dataset

# Load the immigration dataset into DF and preview its first rows.
# The file holds Country, Continent, Region and DevName columns, one
# column per year (X1980 ... X2013) and a Total column.
DF <- read.csv(file = "/Users/salahkaf/Desktop/UpdatedDF.csv")
head(DF)
##          Country Continent          Region            DevName X1980 X1981 X1982
## 1    Afghanistan      Asia   Southern Asia Developing regions    16    39    39
## 2        Albania    Europe Southern Europe  Developed regions     1     0     0
## 3        Algeria    Africa Northern Africa Developing regions    80    67    71
## 4 American Samoa   Oceania       Polynesia Developing regions     0     1     0
## 5        Andorra    Europe Southern Europe  Developed regions     0     0     0
## 6         Angola    Africa   Middle Africa Developing regions     1     3     6
##   X1983 X1984 X1985 X1986 X1987 X1988 X1989 X1990 X1991 X1992 X1993 X1994 X1995
## 1    47    71   340   496   741   828  1076  1028  1378  1170   713   858  1537
## 2     0     0     0     1     2     2     3     3    21    56    96    71    63
## 3    69    63    44    69   132   242   434   491   872   795   717   595  1106
## 4     0     0     0     0     1     0     1     2     0     0     0     0     0
## 5     0     0     0     2     0     0     0     3     0     1     0     0     0
## 6     6     4     3     5     5    11     6     8    23    26    22     8    26
##   X1996 X1997 X1998 X1999 X2000 X2001 X2002 X2003 X2004 X2005 X2006 X2007 X2008
## 1  2212  2555  1999  2395  3326  4067  3697  3479  2978  3436  3009  2652  2111
## 2   113   307   574  1264  1816  1602  1021   853  1450  1223   856   702   560
## 3  2054  1842  2292  2389  2867  3418  3406  3072  3616  3626  4807  3623  4005
## 4     0     0     0     0     0     0     0     0     0     0     1     0     0
## 5     0     0     2     0     0     1     0     2     0     0     1     1     0
## 6    38    27    58    49    70   169   168   165   268   295   184   106    76
##   X2009 X2010 X2011 X2012 X2013 Total
## 1  1746  1758  2203  2635  2004 58639
## 2   716   561   539   620   603 15699
## 3  5393  4752  4325  3774  4331 69439
## 4     0     0     0     0     0     6
## 5     0     0     0     1     1    15
## 6    62    61    39    70    45  2113

Wrangling the table part 1

# Drop rows 196 and 197, which have no continent, so they stay out of the
# per-continent aggregation.
# NOTE: the removal is positional, so running this chunk twice drops two
# further rows.
DF <- DF[-(196:197), ]
# Add up the Total column within each continent and convert the resulting
# table into a two-column data frame (Var1 = continent, Freq = sum).
Q2DF1 <- as.data.frame.table(with(DF, tapply(Total, Continent, sum)))
Q2DF1
##                              Var1    Freq
## 1                          Africa  618948
## 2                            Asia 3317794
## 3                          Europe 1410947
## 4 Latin America and the Caribbean  765148
## 5                Northern America  241142
## 6                         Oceania   55174

Wrangling the table part 2

# Build the pie-chart table: one row per continent holding its total and a
# "<continent> <share>%" label that is used as the legend text.
# The continent names are read from Q2DF1 itself (column Var1, produced by
# as.data.frame.table) instead of being retyped, so the labels cannot drift
# out of step with the row order of the totals.
# `names`, `percentage` and `lebals` keep their original names in case later
# chunks refer to them.
names <- as.character(Q2DF1$Var1) # Names of the continents
# Share of each continent in the grand total, in percent (2 decimals)
percentage <- round(Q2DF1$Freq / sum(Q2DF1$Freq) * 100, 2)
lebals <- paste0(names, " ", percentage, "%") # e.g. "Africa 9.66%"
# Keep only the totals and the labels, under the column names the plot uses.
Q2DF1 <- data.frame(
  Total = Q2DF1$Freq,
  Continent = lebals,
  stringsAsFactors = FALSE
)
Q2DF1 # Final DF for part 1
##     Total                              Continent
## 1  618948                           Africa 9.66%
## 2 3317794                            Asia 51.77%
## 3 1410947                          Europe 22.01%
## 4  765148 Latin America and the Caribbean 11.94%
## 5  241142                 Northern America 3.76%
## 6   55174                          Oceania 0.86%

Plot the Pie Chart

# Pie chart of each continent's total: a single stacked bar (x = "") wrapped
# into polar coordinates along y, filled by the percentage-labelled continent.
Q2plot_Pie <- ggplot(Q2DF1, aes(x = "", y = Total, fill = Continent)) +
  geom_col(width = 1, color = "White") + # white borders separate the slices
  coord_polar("y", start = 0) +
  ggtitle("Percentage of Immigrants per Continent [1980-2013]") +
  theme_void() + # remove background, grid, numeric labels
  # theme_void() is a complete theme and replaces any theme() settings added
  # before it, so the title centring must come after it to take effect.
  theme(plot.title = element_text(hjust = 0.5))
Q2plot_Pie

Line plot of [Total number of immigrants per continent from 1980 to 2013]

Creating a suitable DF for the line plot - Part A

# Seed the per-year table with one row per continent; the yearly columns are
# appended to it afterwards.
Continent <- c(
  "Africa",
  "Asia",
  "Europe",
  "Latin America and the Caribbean",
  "Northern America",
  "Oceania"
)
Q2DF2 <- data.frame(Continent = Continent)
Q2DF2
##                         Continent
## 1                          Africa
## 2                            Asia
## 3                          Europe
## 4 Latin America and the Caribbean
## 5                Northern America
## 6                         Oceania
# Keep only the year columns. They are picked by name ("X" followed by four
# digits) rather than by position, so added or reordered columns in DF cannot
# slip into the yearly sums.
years <- grep("^X[0-9]{4}$", names(DF), value = TRUE)

# For every year, sum the immigrants per continent and store the sums as a
# new column of Q2DF2.
for (col in years) {
  # Sum total immigrants per continent for this year
  summation <- tapply(DF[[col]], DF$Continent, sum)
  # tapply() orders its result by the sorted continent values; index it by
  # name so each sum lands on the matching Q2DF2 row regardless of row order.
  Q2DF2[[col]] <- as.vector(summation[as.character(Q2DF2$Continent)])
}
Q2DF2
##                         Continent X1980 X1981 X1982 X1983 X1984 X1985 X1986
## 1                          Africa  3951  4363  3819  2671  2639  2650  3782
## 2                            Asia 31025 34314 30214 24696 27274 23850 28739
## 3                          Europe 39760 44802 42720 24638 22287 20844 24370
## 4 Latin America and the Caribbean 13081 15215 16769 15427 13678 15171 21179
## 5                Northern America  9378 10030  9074  7100  6661  6543  7074
## 6                         Oceania  1942  1839  1675  1018   878   920   904
##   X1987 X1988 X1989 X1990 X1991 X1992 X1993 X1994 X1995  X1996  X1997 X1998
## 1  7494  7552  9894 11012 14228 19242 16531 13072 14806  15700  14471 13791
## 2 43203 47454 60256 72829 89964 90752 98737 88852 90424 103030 106383 90929
## 3 46698 54726 60893 68301 57938 64123 62937 62531 55764  55642  48841 36719
## 4 28471 21924 25060 27942 36827 37853 33840 21341 20262  18645  17174 13830
## 5  7705  6469  6790  5895  6057  6846  7438  5902  4891   5516   4753  4437
## 6  1200  1181  1539  2075  2495  2871  2566  1967  1565   1552   1263  1021
##    X1999  X2000  X2001  X2002  X2003  X2004  X2005  X2006  X2007  X2008  X2009
## 1  15996  20346  24292  22710  23366  28192  27523  29188  28284  29890  34534
## 2 106844 134544 148083 137653 131769 134850 159253 149054 133459 139894 141434
## 3  35639  38215  42779  36798  34556  38082  35955  33053  33495  34692  35078
## 4  15088  16898  20067  19317  20263  22181  24747  24676  26011  26547  26867
## 5   5196   5433   5604   4948   5543   6990   8394   9613   9463  10190   8995
## 6   1055   1276   1818   1685   1800   1788   1585   1473   1693   1834   1860
##    X2010  X2011  X2012  X2013
## 1  40892  35441  38083  38543
## 2 163845 146894 152218 155075
## 3  33425  26778  29177  28691
## 4  28818  27856  27173  24950
## 5   8142   7677   7892   8503
## 6   1834   1548   1679   1775

Creating a suitable DF for the line plot - Part B

# Keep only the yearly columns (positions 2 to 35) of the selected continent.
# "Africa" stands in for a user-supplied choice; the intent is to automate
# this selection.
Q2DF2 <- Q2DF2[Q2DF2[["Continent"]] == "Africa", 2:35]
Q2DF2
##   X1980 X1981 X1982 X1983 X1984 X1985 X1986 X1987 X1988 X1989 X1990 X1991 X1992
## 1  3951  4363  3819  2671  2639  2650  3782  7494  7552  9894 11012 14228 19242
##   X1993 X1994 X1995 X1996 X1997 X1998 X1999 X2000 X2001 X2002 X2003 X2004 X2005
## 1 16531 13072 14806 15700 14471 13791 15996 20346 24292 22710 23366 28192 27523
##   X2006 X2007 X2008 X2009 X2010 X2011 X2012 X2013
## 1 29188 28284 29890 34534 40892 35441 38083 38543

The line-plot steps are the same as those used in Q1.

Transposing the subsetted dataset

# Reshape the one-row, one-column-per-year table into long form with a Year
# and a Total column. pivot_longer() replaces the superseded gather();
# names_prefix strips the leading "X" that read.csv() put in front of the
# year column names, so Year holds "1980", "1981", ... as character strings.
Q2DF2 <- pivot_longer(
  Q2DF2,
  cols = everything(),
  names_to = "Year",
  names_prefix = "X",
  values_to = "Total"
)
# pivot_longer() returns a tibble; convert back so Q2DF2 stays a plain
# data.frame as before.
Q2DF2 <- as.data.frame(Q2DF2)
Q2DF2 # Final DF ready to be plotted
##    Year Total
## 1  1980  3951
## 2  1981  4363
## 3  1982  3819
## 4  1983  2671
## 5  1984  2639
## 6  1985  2650
## 7  1986  3782
## 8  1987  7494
## 9  1988  7552
## 10 1989  9894
## 11 1990 11012
## 12 1991 14228
## 13 1992 19242
## 14 1993 16531
## 15 1994 13072
## 16 1995 14806
## 17 1996 15700
## 18 1997 14471
## 19 1998 13791
## 20 1999 15996
## 21 2000 20346
## 22 2001 24292
## 23 2002 22710
## 24 2003 23366
## 25 2004 28192
## 26 2005 27523
## 27 2006 29188
## 28 2007 28284
## 29 2008 29890
## 30 2009 34534
## 31 2010 40892
## 32 2011 35441
## 33 2012 38083
## 34 2013 38543
#### Plotting Code - Line plot
# Line chart of the yearly totals held in Q2DF2. Year is a character column,
# so group = 1 is needed to join all points into a single line.
Q2plot_line <- ggplot(Q2DF2, aes(x = Year, y = Total, group = 1)) +
  geom_line(color = "Green") + # connecting line
  geom_point() + # one marker per year
  scale_x_discrete(breaks = seq(1980, 2013, by = 5)) + # label every 5th year
  scale_y_continuous(breaks = seq(10000, 50000, by = 5000)) +
  labs(
    title = "Number of Immigrants per Year",
    x = "Years",
    y = "Number of Immigrants"
  ) +
  theme(plot.title = element_text(hjust = 0.5)) # centre the title
Q2plot_line