Required packages

library(readr)
## Warning: package 'readr' was built under R version 3.6.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.6.3
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.6.3
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.6.3
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(ggplot2)
library(outliers)
library(forecast)
## Warning: package 'forecast' was built under R version 3.6.3
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(knitr)
## Warning: package 'knitr' was built under R version 3.6.3
library(stringr)
## Warning: package 'stringr' was built under R version 3.6.3

Executive Summary

Data Information

Tidy & Manipulate Data I :-

world_pop <- read_csv("world_pop.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Country = col_character(),
##   `Country Code` = col_character(),
##   `Indicator Name` = col_character(),
##   `Indicator Code` = col_character()
## )
## See spec(...) for full column specifications.
head(world_pop,10) 
country<- read_csv("Countrywise Quarterly International Air Traffic To And From The Indian Territory.csv")
## Parsed with column specification:
## cols(
##   YEAR = col_double(),
##   QUARTER = col_character(),
##   `COUNTRY NAME` = col_character(),
##   `PASSENGERS TO INDIA` = col_double(),
##   `PASSENGERS FROM INDIA` = col_double(),
##   `FREIGHT TO INDIA` = col_double(),
##   `FREIGHT FROM INDIA` = col_double()
## )
head(country,10)
World<-world_pop %>%gather(year,Population,5:61)  #data was untidy hence gathering process done. 
head(World,10)
class(country$YEAR)
## [1] "numeric"
country$YEAR<-as.factor(country$YEAR)
World$Country<-toupper(World$Country) #changing country names to uppercase for join function.

flight<-left_join(country,World,by=c("COUNTRY NAME"="Country","YEAR"="year"))
flight<-flight %>% rename(`FLIGHT FROM INDIA`=`FREIGHT FROM INDIA`,`FLIGHT TO INDIA`=`FREIGHT TO INDIA`) # spelling correctiong using dplyr.
head(flight,10)

Understand

dim(flight) #dimension of data frame
## [1] 499  11
summary(flight)
##      YEAR             QUARTER          COUNTRY NAME       PASSENGERS TO INDIA
##  Length:499         Length:499         Length:499         Min.   :      0    
##  Class :character   Class :character   Class :character   1st Qu.:   5921    
##  Mode  :character   Mode  :character   Mode  :character   Median :  21144    
##                                                           Mean   : 114450    
##                                                           3rd Qu.: 108567    
##                                                           Max.   :2384541    
##                                                                              
##  PASSENGERS FROM INDIA FLIGHT TO INDIA   FLIGHT FROM INDIA   Country Code      
##  Min.   :      0       Min.   :    0.0   Min.   :     0.00   Length:499        
##  1st Qu.:   6913       1st Qu.:    4.0   1st Qu.:    51.15   Class :character  
##  Median :  21846       Median :  255.7   Median :   588.40   Mode  :character  
##  Mean   : 119050       Mean   : 2576.1   Mean   :  4041.46                     
##  3rd Qu.: 112085       3rd Qu.: 2150.1   3rd Qu.:  3076.30                     
##  Max.   :2480816       Max.   :35914.1   Max.   :108131.40                     
##                                                                                
##  Indicator Name     Indicator Code       Population       
##  Length:499         Length:499         Min.   :9.342e+04  
##  Class :character   Class :character   1st Qu.:5.607e+06  
##  Mode  :character   Mode  :character   Median :2.898e+07  
##                                        Mean   :7.166e+07  
##                                        3rd Qu.:6.886e+07  
##                                        Max.   :1.379e+09  
##                                        NA's   :138
typeof(flight$QUARTER) #checking type of data.
## [1] "character"
typeof(flight$`Country Code`)
## [1] "character"
typeof(flight$`Indicator Code`)
## [1] "character"
typeof(flight$YEAR)
## [1] "character"
typeof(flight$Population)
## [1] "double"
typeof(flight$`PASSENGERS FROM INDIA`)
## [1] "double"
typeof(flight$`FLIGHT FROM INDIA`)
## [1] "double"
typeof(flight$`Indicator Name`)
## [1] "character"
flight$YEAR<-as.factor(flight$YEAR)
flight$QUARTER<-as.factor(flight$QUARTER)   # changing data types where required.
flight$`Country Code`<-as.factor(flight$`Country Code`)
flight$`Indicator Name`<-as.factor(flight$`Indicator Name`)
flight$`Indicator Code`<-as.factor(flight$`Indicator Code`)
levels(flight$QUARTER) # labelling or ordering  one of the factor variable.
## [1] "Q1" "Q2" "Q3" "Q4"
flight$QUARTER<- factor(flight$QUARTER,levels = c("Q1", "Q2", "Q3", "Q4"),labels=c("Q-1","Q-2","Q-3","Q-4"))

Tidy & Manipulate Data II

flight<-mutate(flight,Difference =(flight$`FLIGHT FROM INDIA`-flight$`FLIGHT TO INDIA`))
head(flight,10)

Scan I

sum(is.na(flight)) # finding actual sum of missing values in dataframe.
## [1] 552
flight$`Country Code`<- impute(flight$`Country Code`, fun= mode) # removing null 
sum(is.na(flight$`Country Code`))  
## [1] 0
flight$`Indicator Name`<- impute(flight$`Indicator Name`, fun= mode)
sum(is.na(flight$`Country Code`))
## [1] 0
flight$`Indicator Code`<-impute(flight$`Indicator Code`,fun=mode)
sum(is.na(flight$`Indicator Code`))
## [1] 0
flight$Population<-impute(flight$Population,fun=mean)
sum(is.na(flight$Population))
## [1] 0
sum(is.na(flight$Difference))
## [1] 0
sum(is.na(flight)) 
## [1] 0
flight$QUARTER<-impute(flight$QUARTER,fun=mode)
sum(is.na(flight$QUARTER))
## [1] 0
sum(is.na(flight))
## [1] 0
sum(is.infinite(flight$Difference))   # Finding special INF values
## [1] 0
sum(is.infinite(flight$YEAR))
## [1] 0
sum(is.infinite(flight$Population))
## [1] 0
sum(is.infinite(flight$`PASSENGERS TO INDIA`))
## [1] 0
sum(is.infinite(flight$`COUNTRY NAME`))
## [1] 0
sum(is.infinite(flight$`Indicator Code`))
## [1] 0
sum(is.infinite(flight$`FLIGHT TO INDIA`))  #No special values ( INF ...)
## [1] 0
sum(is.nan(flight$Difference))   # Finding special NAN values
## [1] 0
sum(is.nan(flight$YEAR))
## [1] 0
sum(is.nan(flight$Population))
## [1] 0
sum(is.nan(flight$`PASSENGERS TO INDIA`))
## [1] 0
sum(is.nan(flight$`COUNTRY NAME`))
## [1] 0
sum(is.nan(flight$`Indicator Code`))
## [1] 0
sum(is.nan(flight$`FLIGHT TO INDIA`)) # No special NAN values 
## [1] 0

Scan II

class(flight$Population)
## [1] "impute"
flight$Population<-as.numeric(flight$Population)

pindiato<-flight$`PASSENGERS TO INDIA` %>% boxplot(main="box pot",ylab="PASSENGERS TO INDIA",col="yellow",horizontal = TRUE)

fromindia<-flight$`FLIGHT FROM INDIA` %>% boxplot(main="box pot",ylab="FLIGHTS FROM INDIA",col="red",horizontal = TRUE)

pfromindia<-flight$`PASSENGERS FROM INDIA` %>% boxplot(main="box pot",ylab="PASSENGERS FROM INDIA",col="blue",horizontal = TRUE)

indiato<-flight$`FLIGHT TO INDIA`%>% boxplot(main="box pot",ylab="FLIGHTS TO INDIA",col="green",horizontal = TRUE)

popoutlier<-flight$Population %>% boxplot(main="box pot",ylab=" Population",col="pink",horizontal = TRUE)

# detecting outliers in population
sum(popoutlier$out)
## [1] 12529875956
sum(pindiato$out)
## [1] 40112418
sum(fromindia$out)
## [1] 1503252
sum(indiato$out)
## [1] 981997.1
#excluding outliers by capping method. 
cap <- function(x){
  quantiles <- quantile( x, c(.05, 0.25, 0.75, .95 ) )
  x[ x < quantiles[2] - 1.5*IQR(x) ] <- quantiles[1]
  x[ x > quantiles[3] + 1.5*IQR(x) ] <- quantiles[4]
  x
}

Diamonds_sub <- flight %>%  dplyr::select(Population,`FLIGHT TO INDIA`,`FLIGHT FROM INDIA`,`PASSENGERS TO INDIA`,`PASSENGERS FROM INDIA`)
cappopulation<- Diamonds_sub$Population %>% cap()
cappopulation %>% boxplot(main=" population (cap)",col="pink",horizontal = TRUE)

capflight1<- Diamonds_sub$`FLIGHT TO INDIA` %>% cap()
capflight2<- Diamonds_sub$`FLIGHT FROM INDIA` %>% cap()
cappass1<- Diamonds_sub$`PASSENGERS TO INDIA` %>% cap()
cappass1 %>% boxplot(main="Passengers to India(CAP)",col="yellow",horizontal = TRUE)

cappass2 <- Diamonds_sub$`PASSENGERS FROM INDIA` %>% cap()
cappass2 %>% boxplot(horizontal=TRUE,main="Passengers from India(cap)",col="blue")

datasummary<-sapply(Diamonds_sub,FUN = cap) # we can also do above step using sapply method in combined way.
summary(Diamonds_sub)
##    Population        FLIGHT TO INDIA   FLIGHT FROM INDIA   PASSENGERS TO INDIA
##  Min.   :9.342e+04   Min.   :    0.0   Min.   :     0.00   Min.   :      0    
##  1st Qu.:8.747e+06   1st Qu.:    4.0   1st Qu.:    51.15   1st Qu.:   5921    
##  Median :6.060e+07   Median :  255.7   Median :   588.40   Median :  21144    
##  Mean   :7.166e+07   Mean   : 2576.1   Mean   :  4041.46   Mean   : 114450    
##  3rd Qu.:7.166e+07   3rd Qu.: 2150.1   3rd Qu.:  3076.30   3rd Qu.: 108567    
##  Max.   :1.379e+09   Max.   :35914.1   Max.   :108131.40   Max.   :2384541    
##  PASSENGERS FROM INDIA
##  Min.   :      0      
##  1st Qu.:   6913      
##  Median :  21846      
##  Mean   : 119050      
##  3rd Qu.: 112085      
##  Max.   :2480816
summary(datasummary) # compare with above summary.
##    Population        FLIGHT TO INDIA   FLIGHT FROM INDIA  PASSENGERS TO INDIA
##  Min.   :    93419   Min.   :    0.0   Min.   :    0.00   Min.   :     0     
##  1st Qu.:  8747358   1st Qu.:    4.0   1st Qu.:   51.15   1st Qu.:  5921     
##  Median : 60600590   Median :  255.7   Median :  588.40   Median : 21144     
##  Mean   : 51178561   Mean   : 2398.5   Mean   : 3161.12   Mean   : 86828     
##  3rd Qu.: 71660319   3rd Qu.: 2150.1   3rd Qu.: 3076.30   3rd Qu.:108567     
##  Max.   :162951560   Max.   :13744.2   Max.   :16624.41   Max.   :381580     
##  PASSENGERS FROM INDIA
##  Min.   :     0       
##  1st Qu.:  6913       
##  Median : 21846       
##  Mean   : 90527       
##  3rd Qu.:112085       
##  Max.   :401412

Transform

#box-cox for reducing skewness of population
class(flight$Population)
## [1] "numeric"
flight$Population %>% hist(main="Population")

boxycoxy<-BoxCox(flight$Population,lambda = "auto")
boxycoxy %>% hist(main="BOX-COX")

#for  reducing right skewness
logpop<-log(flight$Population)
hist(logpop)

#for left skewness
square<- function(x) {x*x}
popsquare<-square(flight$Population)
hist(popsquare)

#normalizing 
#without centering
popnocenter<-scale(flight$Population,center = FALSE,scale = sd(flight$Population))
hist(popnocenter)

#with centering
popcenter<-scale(flight$Population,center = TRUE,scale = FALSE)
hist(popcenter)

References: