library(readr)
## Warning: package 'readr' was built under R version 3.6.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.6.3
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.6.3
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.6.3
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library(ggplot2)
library(outliers)
library(forecast)
## Warning: package 'forecast' was built under R version 3.6.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(knitr)
## Warning: package 'knitr' was built under R version 3.6.3
library(stringr)
## Warning: package 'stringr' was built under R version 3.6.3
We choosed two data sets (world population and flight traffic). World population data is exact description of population count of different countries for the year of 1960 - 2016 , which consist variables such as :- Country name, country code ,indicator name and code and population count for each year from (1960-2016).
The second data set is based on flight traffic in India .which consists variables such as :- Country , Quarter,Year, Passengers comming to India, Passengers comming from India, Flights departing from India and flights arriving in India.
Therefore by performing leftjoin() we got a new dataframe named flight where both data got combined on basis of common country name and year, on which we can carry out our further operations.
Data Source :- Kaggle
Link:-
(World population) www.kaggle.com/chadalee/country-wise-population-data
(Flight Traffic) www.kaggle.com/rajanand/international-air-traffic-from-and-to-india?select=Countrywise+Quarterly+International+Air+Traffic+To+And+From+The+Indian+Territory.csv
world_pop <- read_csv("world_pop.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## Country = col_character(),
## `Country Code` = col_character(),
## `Indicator Name` = col_character(),
## `Indicator Code` = col_character()
## )
## See spec(...) for full column specifications.
head(world_pop,10)
country<- read_csv("Countrywise Quarterly International Air Traffic To And From The Indian Territory.csv")
## Parsed with column specification:
## cols(
## YEAR = col_double(),
## QUARTER = col_character(),
## `COUNTRY NAME` = col_character(),
## `PASSENGERS TO INDIA` = col_double(),
## `PASSENGERS FROM INDIA` = col_double(),
## `FREIGHT TO INDIA` = col_double(),
## `FREIGHT FROM INDIA` = col_double()
## )
head(country,10)
World<-world_pop %>%gather(year,Population,5:61) #data was untidy hence gathering process done.
head(World,10)
class(country$YEAR)
## [1] "numeric"
country$YEAR<-as.factor(country$YEAR)
World$Country<-toupper(World$Country) #changing country names to uppercase for join function.
flight<-left_join(country,World,by=c("COUNTRY NAME"="Country","YEAR"="year"))
flight<-flight %>% rename(`FLIGHT FROM INDIA`=`FREIGHT FROM INDIA`,`FLIGHT TO INDIA`=`FREIGHT TO INDIA`) # spelling correctiong using dplyr.
head(flight,10)
dim(flight) #dimension of data frame
## [1] 499 11
summary(flight)
## YEAR QUARTER COUNTRY NAME PASSENGERS TO INDIA
## Length:499 Length:499 Length:499 Min. : 0
## Class :character Class :character Class :character 1st Qu.: 5921
## Mode :character Mode :character Mode :character Median : 21144
## Mean : 114450
## 3rd Qu.: 108567
## Max. :2384541
##
## PASSENGERS FROM INDIA FLIGHT TO INDIA FLIGHT FROM INDIA Country Code
## Min. : 0 Min. : 0.0 Min. : 0.00 Length:499
## 1st Qu.: 6913 1st Qu.: 4.0 1st Qu.: 51.15 Class :character
## Median : 21846 Median : 255.7 Median : 588.40 Mode :character
## Mean : 119050 Mean : 2576.1 Mean : 4041.46
## 3rd Qu.: 112085 3rd Qu.: 2150.1 3rd Qu.: 3076.30
## Max. :2480816 Max. :35914.1 Max. :108131.40
##
## Indicator Name Indicator Code Population
## Length:499 Length:499 Min. :9.342e+04
## Class :character Class :character 1st Qu.:5.607e+06
## Mode :character Mode :character Median :2.898e+07
## Mean :7.166e+07
## 3rd Qu.:6.886e+07
## Max. :1.379e+09
## NA's :138
typeof(flight$QUARTER) #checking type of data.
## [1] "character"
typeof(flight$`Country Code`)
## [1] "character"
typeof(flight$`Indicator Code`)
## [1] "character"
typeof(flight$YEAR)
## [1] "character"
typeof(flight$Population)
## [1] "double"
typeof(flight$`PASSENGERS FROM INDIA`)
## [1] "double"
typeof(flight$`FLIGHT FROM INDIA`)
## [1] "double"
typeof(flight$`Indicator Name`)
## [1] "character"
flight$YEAR<-as.factor(flight$YEAR)
flight$QUARTER<-as.factor(flight$QUARTER) # changing data types where required.
flight$`Country Code`<-as.factor(flight$`Country Code`)
flight$`Indicator Name`<-as.factor(flight$`Indicator Name`)
flight$`Indicator Code`<-as.factor(flight$`Indicator Code`)
levels(flight$QUARTER) # labelling or ordering one of the factor variable.
## [1] "Q1" "Q2" "Q3" "Q4"
flight$QUARTER<- factor(flight$QUARTER,levels = c("Q1", "Q2", "Q3", "Q4"),labels=c("Q-1","Q-2","Q-3","Q-4"))
flight<-mutate(flight,Difference =(flight$`FLIGHT FROM INDIA`-flight$`FLIGHT TO INDIA`))
head(flight,10)
sum(is.na(flight)) # finding actual sum of missing values in dataframe.
## [1] 552
flight$`Country Code`<- impute(flight$`Country Code`, fun= mode) # removing null
sum(is.na(flight$`Country Code`))
## [1] 0
flight$`Indicator Name`<- impute(flight$`Indicator Name`, fun= mode)
sum(is.na(flight$`Country Code`))
## [1] 0
flight$`Indicator Code`<-impute(flight$`Indicator Code`,fun=mode)
sum(is.na(flight$`Indicator Code`))
## [1] 0
flight$Population<-impute(flight$Population,fun=mean)
sum(is.na(flight$Population))
## [1] 0
sum(is.na(flight$Difference))
## [1] 0
sum(is.na(flight))
## [1] 0
flight$QUARTER<-impute(flight$QUARTER,fun=mode)
sum(is.na(flight$QUARTER))
## [1] 0
sum(is.na(flight))
## [1] 0
sum(is.infinite(flight$Difference)) # Finding special INF values
## [1] 0
sum(is.infinite(flight$YEAR))
## [1] 0
sum(is.infinite(flight$Population))
## [1] 0
sum(is.infinite(flight$`PASSENGERS TO INDIA`))
## [1] 0
sum(is.infinite(flight$`COUNTRY NAME`))
## [1] 0
sum(is.infinite(flight$`Indicator Code`))
## [1] 0
sum(is.infinite(flight$`FLIGHT TO INDIA`)) #No special values ( INF ...)
## [1] 0
sum(is.nan(flight$Difference)) # Finding special NAN values
## [1] 0
sum(is.nan(flight$YEAR))
## [1] 0
sum(is.nan(flight$Population))
## [1] 0
sum(is.nan(flight$`PASSENGERS TO INDIA`))
## [1] 0
sum(is.nan(flight$`COUNTRY NAME`))
## [1] 0
sum(is.nan(flight$`Indicator Code`))
## [1] 0
sum(is.nan(flight$`FLIGHT TO INDIA`)) # No special NAN values
## [1] 0
class(flight$Population)
## [1] "impute"
flight$Population<-as.numeric(flight$Population)
pindiato<-flight$`PASSENGERS TO INDIA` %>% boxplot(main="box pot",ylab="PASSENGERS TO INDIA",col="yellow",horizontal = TRUE)
fromindia<-flight$`FLIGHT FROM INDIA` %>% boxplot(main="box pot",ylab="FLIGHTS FROM INDIA",col="red",horizontal = TRUE)
pfromindia<-flight$`PASSENGERS FROM INDIA` %>% boxplot(main="box pot",ylab="PASSENGERS FROM INDIA",col="blue",horizontal = TRUE)
indiato<-flight$`FLIGHT TO INDIA`%>% boxplot(main="box pot",ylab="FLIGHTS TO INDIA",col="green",horizontal = TRUE)
popoutlier<-flight$Population %>% boxplot(main="box pot",ylab=" Population",col="pink",horizontal = TRUE)
# detecting outliers in population
sum(popoutlier$out)
## [1] 12529875956
sum(pindiato$out)
## [1] 40112418
sum(fromindia$out)
## [1] 1503252
sum(indiato$out)
## [1] 981997.1
#excluding outliers by capping method.
cap <- function(x){
quantiles <- quantile( x, c(.05, 0.25, 0.75, .95 ) )
x[ x < quantiles[2] - 1.5*IQR(x) ] <- quantiles[1]
x[ x > quantiles[3] + 1.5*IQR(x) ] <- quantiles[4]
x
}
Diamonds_sub <- flight %>% dplyr::select(Population,`FLIGHT TO INDIA`,`FLIGHT FROM INDIA`,`PASSENGERS TO INDIA`,`PASSENGERS FROM INDIA`)
cappopulation<- Diamonds_sub$Population %>% cap()
cappopulation %>% boxplot(main=" population (cap)",col="pink",horizontal = TRUE)
capflight1<- Diamonds_sub$`FLIGHT TO INDIA` %>% cap()
capflight2<- Diamonds_sub$`FLIGHT FROM INDIA` %>% cap()
cappass1<- Diamonds_sub$`PASSENGERS TO INDIA` %>% cap()
cappass1 %>% boxplot(main="Passengers to India(CAP)",col="yellow",horizontal = TRUE)
cappass2 <- Diamonds_sub$`PASSENGERS FROM INDIA` %>% cap()
cappass2 %>% boxplot(horizontal=TRUE,main="Passengers from India(cap)",col="blue")
datasummary<-sapply(Diamonds_sub,FUN = cap) # we can also do above step using sapply method in combined way.
summary(Diamonds_sub)
## Population FLIGHT TO INDIA FLIGHT FROM INDIA PASSENGERS TO INDIA
## Min. :9.342e+04 Min. : 0.0 Min. : 0.00 Min. : 0
## 1st Qu.:8.747e+06 1st Qu.: 4.0 1st Qu.: 51.15 1st Qu.: 5921
## Median :6.060e+07 Median : 255.7 Median : 588.40 Median : 21144
## Mean :7.166e+07 Mean : 2576.1 Mean : 4041.46 Mean : 114450
## 3rd Qu.:7.166e+07 3rd Qu.: 2150.1 3rd Qu.: 3076.30 3rd Qu.: 108567
## Max. :1.379e+09 Max. :35914.1 Max. :108131.40 Max. :2384541
## PASSENGERS FROM INDIA
## Min. : 0
## 1st Qu.: 6913
## Median : 21846
## Mean : 119050
## 3rd Qu.: 112085
## Max. :2480816
summary(datasummary) # compare with above summary.
## Population FLIGHT TO INDIA FLIGHT FROM INDIA PASSENGERS TO INDIA
## Min. : 93419 Min. : 0.0 Min. : 0.00 Min. : 0
## 1st Qu.: 8747358 1st Qu.: 4.0 1st Qu.: 51.15 1st Qu.: 5921
## Median : 60600590 Median : 255.7 Median : 588.40 Median : 21144
## Mean : 51178561 Mean : 2398.5 Mean : 3161.12 Mean : 86828
## 3rd Qu.: 71660319 3rd Qu.: 2150.1 3rd Qu.: 3076.30 3rd Qu.:108567
## Max. :162951560 Max. :13744.2 Max. :16624.41 Max. :381580
## PASSENGERS FROM INDIA
## Min. : 0
## 1st Qu.: 6913
## Median : 21846
## Mean : 90527
## 3rd Qu.:112085
## Max. :401412
#box-cox for reducing skewness of population
class(flight$Population)
## [1] "numeric"
flight$Population %>% hist(main="Population")
boxycoxy<-BoxCox(flight$Population,lambda = "auto")
boxycoxy %>% hist(main="BOX-COX")
#for reducing right skewness
logpop<-log(flight$Population)
hist(logpop)
#for left skewness
square<- function(x) {x*x}
popsquare<-square(flight$Population)
hist(popsquare)
#normalizing
#without centering
popnocenter<-scale(flight$Population,center = FALSE,scale = sd(flight$Population))
hist(popnocenter)
#with centering
popcenter<-scale(flight$Population,center = TRUE,scale = FALSE)
hist(popcenter)