library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(tidyr)
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library(outliers)
library(ggplot2)
library(infotheo)
library(forecast)
library(knitr)
require("knitr")
opts_knit$set(root.dir = "~/Downloads")
All the 3 datasets were related and had common variables. Therefore worldcupMatches and Worldcups tables shared common variable, using Inner Join both tables were joined using common variable ‘Year’ and saved under a new table ‘world_cup’. Further, World_cup and worldcupPlayers also had common variables ‘roundID’ and ‘matchID’ which were used to inner join the tables. The tables formed was then saved under ‘world_cup1’. Head() function was used to check if all the variables are present in the new table ‘world_cup1’.
WorldCupMatches <- read.csv("~/Downloads/fifa-world-cup/WorldCupMatches.csv")
head(WorldCupMatches, n=2)
WorldCupPlayers <- read.csv("~/Downloads/fifa-world-cup/WorldCupPlayers.csv")
head(WorldCupPlayers, n=2)
WorldCups <- read.csv("~/Downloads/fifa-world-cup/WorldCups.csv")
head(WorldCups, n=2)
world_cup <- inner_join(WorldCupMatches, WorldCups, by = "Year")
world_cup1 <- inner_join(world_cup, WorldCupPlayers, by = c("RoundID","MatchID"))
head(world_cup1, n=2)
#understand
str(world_cup1)
## 'data.frame': 39256 obs. of 36 variables:
## $ Year : int 1930 1930 1930 1930 1930 1930 1930 1930 1930 1930 ...
## $ Datetime : Factor w/ 603 levels "","01 Jul 1950 - 15:00 ",..: 206 206 206 206 206 206 206 206 206 206 ...
## $ Stage : Factor w/ 24 levels "","Final","First round",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ Stadium : Factor w/ 182 levels "","Arena Amazonia",..: 126 126 126 126 126 126 126 126 126 126 ...
## $ City : Factor w/ 152 levels "","Alicante ",..: 85 85 85 85 85 85 85 85 85 85 ...
## $ Home.Team.Name : Factor w/ 79 levels "","Algeria","Angola",..: 25 25 25 25 25 25 25 25 25 25 ...
## $ Home.Team.Goals : int 4 4 4 4 4 4 4 4 4 4 ...
## $ Away.Team.Goals : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Away.Team.Name : Factor w/ 84 levels "","Algeria","Angola",..: 47 47 47 47 47 47 47 47 47 47 ...
## $ Win.conditions : Factor w/ 44 levels ""," "," win on penalties (2 - 3) ",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Attendance.x : int 4444 4444 4444 4444 4444 4444 4444 4444 4444 4444 ...
## $ Half.time.Home.Goals: int 3 3 3 3 3 3 3 3 3 3 ...
## $ Half.time.Away.Goals: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Referee : Factor w/ 367 levels "","ABD EL FATAH Essam (EGY)",..: 193 193 193 193 193 193 193 193 193 193 ...
## $ Assistant.1 : Factor w/ 388 levels "","ABDUL HAMID Halim (MAS)",..: 88 88 88 88 88 88 88 88 88 88 ...
## $ Assistant.2 : Factor w/ 409 levels "","ABDUL HAMID Halim (MAS)",..: 293 293 293 293 293 293 293 293 293 293 ...
## $ RoundID : int 201 201 201 201 201 201 201 201 201 201 ...
## $ MatchID : int 1096 1096 1096 1096 1096 1096 1096 1096 1096 1096 ...
## $ Home.Team.Initials : Factor w/ 78 levels "","ALG","ANG",..: 26 26 26 26 26 26 26 26 26 26 ...
## $ Away.Team.Initials : Factor w/ 83 levels "","ALG","ANG",..: 48 48 48 48 48 48 48 48 48 48 ...
## $ Country : Factor w/ 15 levels "Argentina","Brazil",..: 14 14 14 14 14 14 14 14 14 14 ...
## $ Winner : Factor w/ 9 levels "Argentina","Brazil",..: 9 9 9 9 9 9 9 9 9 9 ...
## $ Runners.Up : Factor w/ 10 levels "Argentina","Brazil",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Third : Factor w/ 14 levels "Austria","Brazil",..: 14 14 14 14 14 14 14 14 14 14 ...
## $ Fourth : Factor w/ 16 levels "Austria","Belgium",..: 16 16 16 16 16 16 16 16 16 16 ...
## $ GoalsScored : int 70 70 70 70 70 70 70 70 70 70 ...
## $ QualifiedTeams : int 13 13 13 13 13 13 13 13 13 13 ...
## $ MatchesPlayed : int 18 18 18 18 18 18 18 18 18 18 ...
## $ Attendance.y : Factor w/ 20 levels "1.045.246","1.545.791",..: 17 17 17 17 17 17 17 17 17 17 ...
## $ Team.Initials : Factor w/ 82 levels "ALG","ANG","ARG",..: 26 47 26 47 26 47 26 47 26 47 ...
## $ Coach.Name : Factor w/ 335 levels "ACOSTA Nelson (URU)",..: 50 169 50 169 50 169 50 169 50 169 ...
## $ Line.up : Factor w/ 2 levels "N","S": 2 2 2 2 2 2 2 2 2 2 ...
## $ Shirt.Number : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Player.Name : Factor w/ 7663 levels "?URI?I?","A BAUTISTA",..: 317 5512 4706 3856 1988 5981 482 2988 2035 1666 ...
## $ Position : Factor w/ 4 levels "","C","GK","GKC": 3 3 1 1 1 2 1 1 1 1 ...
## $ Event : Factor w/ 1894 levels "","G1'","G1' G42'",..: 1 1 308 527 1 1 325 1 1 1 ...
attributes(world_cup1)$class
## [1] "data.frame"
#Change Columns Name
names(world_cup1)[names(world_cup1) == "Attendance.x"] <- "Match_Attendance"
names(world_cup1)[names(world_cup1) == "Attendance.y"] <- "Total_Attendance"
#Data Type Coversion
world_cup1$Datetime <- dmy_hm(world_cup1$Datetime)
world_cup1$Match_Attendance <- as.integer(world_cup1$Match_Attendance)
world_cup1$Total_Attendance <- as.integer(world_cup1$Total_Attendance)
world_cup1$Year <- as.factor(world_cup1$Year)
#Labelled Factor Variable
world_cup1$Line.up <- world_cup1$Line.up %>% factor(levels = c("S","N"), labels = c("Line Up","Substitute"))
Dataset world_cup1 was joined with 3 datasets as mentioned above. All the 3 datasets worldcupMatches, wordcupPlayers, Worldcups were in tidy format.
#Creating new columns
world_cup1 <- mutate(world_cup1, Total_Goals = Home.Team.Goals + Away.Team.Goals)
world_cup1 <- mutate(world_cup1, Total_HalfTime_Goals = Half.time.Home.Goals + Half.time.Away.Goals)
world_cup1 <- mutate(world_cup1, Average_Goal = `GoalsScored`/ `MatchesPlayed`)
#Selecting the the required columns
world_cup1 <- world_cup1 %>% select(-(Win.conditions), -(Referee:Assistant.2),-(Home.Team.Initials: Away.Team.Initials), -(Event) )
head(world_cup1, n=2)
str(world_cup1)
## 'data.frame': 39256 obs. of 32 variables:
## $ Year : Factor w/ 20 levels "1930","1934",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Datetime : POSIXct, format: "1930-07-13 15:00:00" "1930-07-13 15:00:00" ...
## $ Stage : Factor w/ 24 levels "","Final","First round",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ Stadium : Factor w/ 182 levels "","Arena Amazonia",..: 126 126 126 126 126 126 126 126 126 126 ...
## $ City : Factor w/ 152 levels "","Alicante ",..: 85 85 85 85 85 85 85 85 85 85 ...
## $ Home.Team.Name : Factor w/ 79 levels "","Algeria","Angola",..: 25 25 25 25 25 25 25 25 25 25 ...
## $ Home.Team.Goals : int 4 4 4 4 4 4 4 4 4 4 ...
## $ Away.Team.Goals : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Away.Team.Name : Factor w/ 84 levels "","Algeria","Angola",..: 47 47 47 47 47 47 47 47 47 47 ...
## $ Match_Attendance : int 4444 4444 4444 4444 4444 4444 4444 4444 4444 4444 ...
## $ Half.time.Home.Goals: int 3 3 3 3 3 3 3 3 3 3 ...
## $ Half.time.Away.Goals: int 0 0 0 0 0 0 0 0 0 0 ...
## $ RoundID : int 201 201 201 201 201 201 201 201 201 201 ...
## $ MatchID : int 1096 1096 1096 1096 1096 1096 1096 1096 1096 1096 ...
## $ Country : Factor w/ 15 levels "Argentina","Brazil",..: 14 14 14 14 14 14 14 14 14 14 ...
## $ Winner : Factor w/ 9 levels "Argentina","Brazil",..: 9 9 9 9 9 9 9 9 9 9 ...
## $ Runners.Up : Factor w/ 10 levels "Argentina","Brazil",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Third : Factor w/ 14 levels "Austria","Brazil",..: 14 14 14 14 14 14 14 14 14 14 ...
## $ Fourth : Factor w/ 16 levels "Austria","Belgium",..: 16 16 16 16 16 16 16 16 16 16 ...
## $ GoalsScored : int 70 70 70 70 70 70 70 70 70 70 ...
## $ QualifiedTeams : int 13 13 13 13 13 13 13 13 13 13 ...
## $ MatchesPlayed : int 18 18 18 18 18 18 18 18 18 18 ...
## $ Total_Attendance : int 17 17 17 17 17 17 17 17 17 17 ...
## $ Team.Initials : Factor w/ 82 levels "ALG","ANG","ARG",..: 26 47 26 47 26 47 26 47 26 47 ...
## $ Coach.Name : Factor w/ 335 levels "ACOSTA Nelson (URU)",..: 50 169 50 169 50 169 50 169 50 169 ...
## $ Line.up : Factor w/ 2 levels "Line Up","Substitute": 1 1 1 1 1 1 1 1 1 1 ...
## $ Shirt.Number : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Player.Name : Factor w/ 7663 levels "?URI?I?","A BAUTISTA",..: 317 5512 4706 3856 1988 5981 482 2988 2035 1666 ...
## $ Position : Factor w/ 4 levels "","C","GK","GKC": 3 3 1 1 1 2 1 1 1 1 ...
## $ Total_Goals : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Total_HalfTime_Goals: int 3 3 3 3 3 3 3 3 3 3 ...
## $ Average_Goal : num 3.89 3.89 3.89 3.89 3.89 ...
#Scanning for missing Values
colSums(is.na(world_cup1))
## Year Datetime Stage
## 0 0 0
## Stadium City Home.Team.Name
## 0 0 0
## Home.Team.Goals Away.Team.Goals Away.Team.Name
## 0 0 0
## Match_Attendance Half.time.Home.Goals Half.time.Away.Goals
## 184 0 0
## RoundID MatchID Country
## 0 0 0
## Winner Runners.Up Third
## 0 0 0
## Fourth GoalsScored QualifiedTeams
## 0 0 0
## MatchesPlayed Total_Attendance Team.Initials
## 0 0 0
## Coach.Name Line.up Shirt.Number
## 0 0 0
## Player.Name Position Total_Goals
## 0 0 0
## Total_HalfTime_Goals Average_Goal
## 0 0
#Imputing values
world_cup1$Match_Attendance <- as.integer(impute(world_cup1$Match_Attendance, fun = mean))
is.specialInfinite <- function(x){
sum(is.infinite(x))}
sapply(world_cup1,is.specialInfinite)
## Year Datetime Stage
## 0 0 0
## Stadium City Home.Team.Name
## 0 0 0
## Home.Team.Goals Away.Team.Goals Away.Team.Name
## 0 0 0
## Match_Attendance Half.time.Home.Goals Half.time.Away.Goals
## 0 0 0
## RoundID MatchID Country
## 0 0 0
## Winner Runners.Up Third
## 0 0 0
## Fourth GoalsScored QualifiedTeams
## 0 0 0
## MatchesPlayed Total_Attendance Team.Initials
## 0 0 0
## Coach.Name Line.up Shirt.Number
## 0 0 0
## Player.Name Position Total_Goals
## 0 0 0
## Total_HalfTime_Goals Average_Goal
## 0 0
is.specialNan <- function(x){
sum(is.nan(x))}
sapply(world_cup1,is.specialNan)
## Year Datetime Stage
## 0 0 0
## Stadium City Home.Team.Name
## 0 0 0
## Home.Team.Goals Away.Team.Goals Away.Team.Name
## 0 0 0
## Match_Attendance Half.time.Home.Goals Half.time.Away.Goals
## 0 0 0
## RoundID MatchID Country
## 0 0 0
## Winner Runners.Up Third
## 0 0 0
## Fourth GoalsScored QualifiedTeams
## 0 0 0
## MatchesPlayed Total_Attendance Team.Initials
## 0 0 0
## Coach.Name Line.up Shirt.Number
## 0 0 0
## Player.Name Position Total_Goals
## 0 0 0
## Total_HalfTime_Goals Average_Goal
## 0 0
par(mfrow = c(2,2))
#scanning for outliers
world_cup1$GoalsScored %>% boxplot(main="Box Plot of Goals Scored", ylab="Goals", col = "grey")
world_cup1$Total_Attendance %>% boxplot(main="Box Plot of Total Attendance", ylab="Attendance", col = "grey")
world_cup1$Match_Attendance %>% boxplot(main="Box Plot of Match Attendance", ylab="Attendance", col = "grey")
world_cup1$Total_HalfTime_Goals %>% boxplot(main="Box Plot of Total Half Time Goals", ylab="Goals", col = "grey")
world_cup1$Total_Goals %>% boxplot(main="Box Plot of Total Goals", ylab="Goals", col = "grey")
world_cup1$Average_Goal %>% boxplot(main="Box Plot of Average Goals per Match", ylab="Goals", col = "grey")
#capping Outliers
cap <- function(x){
quantiles <- quantile( x, c(.05, 0.25, 0.75, .95 ) )
x[ x < quantiles[2] - 1.5*IQR(x) ] <- quantiles[1]
x[ x > quantiles[3] + 1.5*IQR(x) ] <- quantiles[4]
x}
world_cup1$Match_Attendance <- world_cup1$Match_Attendance %>% cap()
#checking again for outliers
world_cup1$Match_Attendance %>% boxplot(main="Box Plot of Match Attendance", ylab="Attendance", col = "grey")
#transformation
hist(world_cup1$Match_Attendance)
#The Log transformation
log_Match_Attendance <- log10(world_cup1$Match_Attendance)
hist(log_Match_Attendance)
ln_Match_Attendance <- log(world_cup1$Match_Attendance)
hist(ln_Match_Attendance)
#square transformation
sqrt_Match_Attendance <- sqrt(world_cup1$Match_Attendance)
hist(sqrt_Match_Attendance)
#Box-Cox transformation
boxcox_Match_Attendance<- BoxCox(world_cup1$Match_Attendance,lambda = "auto")
hist(boxcox_Match_Attendance)
#reciprocal transformation
reci_Match_Attendance <- 1/world_cup1$Match_Attendance
hist(reci_Match_Attendance)