Descriptive and Exploratory Question Analysis
Q4 - What are the immigrant counts for the top 5 immigrant countries from 1980 to 2013? Visualized with a line chart and an area plot.
Import the necessary libraries
library(tidyverse) #For data manipulation and ggplot
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr) #For Data manipulation
library(ggpubr) #For sub plotting
Import the Dataset
# Read the prepared immigration table from disk and preview its first rows.
# NOTE(review): the absolute path is machine-specific, so this only runs
# where that file exists.
DF <- read.csv(file = "/Users/salahkaf/Desktop/UpdatedDF.csv")
head(x = DF)
## Country Continent Region DevName X1980 X1981 X1982
## 1 Afghanistan Asia Southern Asia Developing regions 16 39 39
## 2 Albania Europe Southern Europe Developed regions 1 0 0
## 3 Algeria Africa Northern Africa Developing regions 80 67 71
## 4 American Samoa Oceania Polynesia Developing regions 0 1 0
## 5 Andorra Europe Southern Europe Developed regions 0 0 0
## 6 Angola Africa Middle Africa Developing regions 1 3 6
## X1983 X1984 X1985 X1986 X1987 X1988 X1989 X1990 X1991 X1992 X1993 X1994 X1995
## 1 47 71 340 496 741 828 1076 1028 1378 1170 713 858 1537
## 2 0 0 0 1 2 2 3 3 21 56 96 71 63
## 3 69 63 44 69 132 242 434 491 872 795 717 595 1106
## 4 0 0 0 0 1 0 1 2 0 0 0 0 0
## 5 0 0 0 2 0 0 0 3 0 1 0 0 0
## 6 6 4 3 5 5 11 6 8 23 26 22 8 26
## X1996 X1997 X1998 X1999 X2000 X2001 X2002 X2003 X2004 X2005 X2006 X2007 X2008
## 1 2212 2555 1999 2395 3326 4067 3697 3479 2978 3436 3009 2652 2111
## 2 113 307 574 1264 1816 1602 1021 853 1450 1223 856 702 560
## 3 2054 1842 2292 2389 2867 3418 3406 3072 3616 3626 4807 3623 4005
## 4 0 0 0 0 0 0 0 0 0 0 1 0 0
## 5 0 0 2 0 0 1 0 2 0 0 1 1 0
## 6 38 27 58 49 70 169 168 165 268 295 184 106 76
## X2009 X2010 X2011 X2012 X2013 Total
## 1 1746 1758 2203 2635 2004 58639
## 2 716 561 539 620 603 15699
## 3 5393 4752 4325 3774 4331 69439
## 4 0 0 0 0 0 6
## 5 0 0 0 1 1 15
## 6 62 61 39 70 45 2113
Cleaning the dataset
# Drop rows 196 and 197 by position. Per the original note these are the
# "unknown" and "Total" rows -- TODO confirm the positions against the CSV.
DFQ4 <- DF[-(196:197), ]
Sorting in descending order of Total to find the top 5 countries
# Rank the countries by their Total column, largest first, then keep the
# first five rows with column 1 (Country) and columns 5-38 (the yearly
# counts X1980..X2013).
DFtop5 <- DFQ4 %>% arrange(desc(Total))
DFtop5 <- DFtop5[seq_len(5), c(1, 5:38)]
DFtop5
## Country X1980 X1981 X1982 X1983
## 1 India 8880 8670 8147 7338
## 2 China 5123 6682 3308 1863
## 3 United Kingdom of Great Britain and Northern Ireland 22045 24796 20620 10015
## 4 Philippines 6051 5921 5249 4562
## 5 Pakistan 978 972 1201 900
## X1984 X1985 X1986 X1987 X1988 X1989 X1990 X1991 X1992 X1993 X1994 X1995 X1996
## 1 5704 4211 7150 10189 11522 10343 12041 13734 13673 21496 18620 18489 23859
## 2 1527 1816 1960 2643 2758 4323 8076 14255 10846 9817 13128 14398 19415
## 3 10170 9564 9470 21337 27359 23795 31668 23380 34123 33720 39231 30145 29322
## 4 3801 3150 4166 7360 8639 11865 12509 12718 13670 20479 19532 15864 13692
## 5 668 514 691 1072 1334 2261 2470 3079 4071 4777 4666 4994 9125
## X1997 X1998 X1999 X2000 X2001 X2002 X2003 X2004 X2005 X2006 X2007 X2008 X2009
## 1 22268 17241 18974 28572 31223 31889 27155 28235 36210 33848 28742 28261 29456
## 2 20475 21049 30069 35529 36434 31961 36439 36619 42584 33518 27642 30037 29622
## 3 22965 10367 7045 8840 11728 8046 6797 7533 7258 7140 8216 8979 8876
## 4 11549 8735 9734 10763 13836 11707 12758 14004 18139 18400 19837 24887 28573
## 5 13073 9068 9979 15400 16708 15110 13205 13399 14314 13127 10124 8994 7217
## X2010 X2011 X2012 X2013
## 1 34235 27509 30933 33087
## 2 30391 28502 33024 34129
## 3 8724 6204 6195 5827
## 4 38617 36765 34315 29544
## 5 6811 7468 11227 12603
Manipulate the table to make it suitable for visualization
# Reshape the top-5 table so that each row is a year and each column a
# country, which is the layout the plots below read from.

# Transpose and turn the resulting matrix back into a data frame; the old
# column names (Country, X1980, ...) become the row names.
Q4DF <- as.data.frame(t(DFtop5))

# Promote those row names to a regular Year column and renumber the rows.
Q4DF <- cbind(Year = rownames(Q4DF), Q4DF)
rownames(Q4DF) <- seq_len(nrow(Q4DF))

# Strip the "X" prefix that read.csv put in front of the year column names.
Q4DF$Year <- gsub("X", "", as.character(Q4DF$Year))

# Drop the first ROW: it holds the country names, not yearly counts.
Q4DF <- Q4DF[-1, ]

# Short column labels, in the same order as the rows of DFtop5.
colnames(Q4DF) <- c("Year", "India", "China", "UK", "Philippines", "Pakistan")

# Every column is text after the transpose; convert them all to integer
# while keeping the data frame's row and column names.
Q4DF[] <- lapply(Q4DF, as.integer)

Q4DF # Final DF
## Year India China UK Philippines Pakistan
## 2 1980 8880 5123 22045 6051 978
## 3 1981 8670 6682 24796 5921 972
## 4 1982 8147 3308 20620 5249 1201
## 5 1983 7338 1863 10015 4562 900
## 6 1984 5704 1527 10170 3801 668
## 7 1985 4211 1816 9564 3150 514
## 8 1986 7150 1960 9470 4166 691
## 9 1987 10189 2643 21337 7360 1072
## 10 1988 11522 2758 27359 8639 1334
## 11 1989 10343 4323 23795 11865 2261
## 12 1990 12041 8076 31668 12509 2470
## 13 1991 13734 14255 23380 12718 3079
## 14 1992 13673 10846 34123 13670 4071
## 15 1993 21496 9817 33720 20479 4777
## 16 1994 18620 13128 39231 19532 4666
## 17 1995 18489 14398 30145 15864 4994
## 18 1996 23859 19415 29322 13692 9125
## 19 1997 22268 20475 22965 11549 13073
## 20 1998 17241 21049 10367 8735 9068
## 21 1999 18974 30069 7045 9734 9979
## 22 2000 28572 35529 8840 10763 15400
## 23 2001 31223 36434 11728 13836 16708
## 24 2002 31889 31961 8046 11707 15110
## 25 2003 27155 36439 6797 12758 13205
## 26 2004 28235 36619 7533 14004 13399
## 27 2005 36210 42584 7258 18139 14314
## 28 2006 33848 33518 7140 18400 13127
## 29 2007 28742 27642 8216 19837 10124
## 30 2008 28261 30037 8979 24887 8994
## 31 2009 29456 29622 8876 28573 7217
## 32 2010 34235 30391 8724 38617 6811
## 33 2011 27509 28502 6204 36765 7468
## 34 2012 30933 33024 6195 34315 11227
## 35 2013 33087 34129 5827 29544 12603
Plotting the dataset
# Country labels (defined here but not referenced by the plot below).
colors <- c("India", "China", "UK", "Philippines", "Pakistan")

# Line chart with one layer per country. Mapping a constant string to
# colour inside aes() makes ggplot build a legend keyed by country name.
Q4plot_multiplelines <- ggplot(Q4DF, aes(x = Year, group = 1)) +
  geom_line(aes(y = India, colour = "India"), linetype = "twodash") +
  geom_line(aes(y = China, colour = "China"), linetype = "twodash") +
  geom_line(aes(y = UK, colour = "UK"), linetype = "twodash") +
  geom_line(aes(y = Philippines, colour = "Philippines"), linetype = "twodash") +
  geom_line(aes(y = Pakistan, colour = "Pakistan"), linetype = "twodash") +
  scale_x_continuous(breaks = seq(1980, 2013, by = 5)) + # a tick every 5 years
  scale_y_continuous(breaks = seq(10000, 50000, by = 10000)) +
  labs(title = "Number of Immigrants per Year", x = "Year", y = "Immigrants") +
  theme(plot.title = element_text(hjust = 0.5)) # centre the title
Q4plot_multiplelines

Plotting the area plot
# Overlapping area plot with one semi-transparent layer per country.
# fill is mapped to a constant label inside aes() so ggplot builds a legend
# keyed by country name. alpha is a fixed setting, so it is passed outside
# aes(): inside aes() it would be treated as a mapped variable, adding an
# extra "alpha" legend and not applying 0.65 as the actual opacity.
Q4plotarea <- ggplot(data = Q4DF, aes(x = Year, group = 1)) +
  geom_area(aes(y = India, fill = "India"), alpha = 0.65) +
  geom_area(aes(y = China, fill = "China"), alpha = 0.65) +
  geom_area(aes(y = UK, fill = "UK"), alpha = 0.65) +
  geom_area(aes(y = Philippines, fill = "Philippines"), alpha = 0.65) +
  geom_area(aes(y = Pakistan, fill = "Pakistan"), alpha = 0.65) +
  scale_x_continuous(breaks = seq(1980, 2013, by = 5)) + # a tick every 5 years
  scale_y_continuous(breaks = seq(10000, 50000, by = 10000)) +
  ggtitle("Number of Immigrants per Year") +
  theme(plot.title = element_text(hjust = 0.5)) + # centre the title
  labs(x = "Year", y = "Immigrants")
Q4plotarea
