Necessary packages are installed and library() is used to load the packages.
library(readr)
library(readxl)
library(foreign)
library(gdata)
## gdata: Unable to locate valid perl interpreter
## gdata:
## gdata: read.xls() will be unable to read Excel XLS and XLSX files
## gdata: unless the 'perl=' argument is used to specify the location
## gdata: of a valid perl intrpreter.
## gdata:
## gdata: (To avoid display of this message in the future, please
## gdata: ensure perl is installed and available on the executable
## gdata: search path.)
## gdata: Unable to load perl libaries needed by read.xls()
## gdata: to support 'XLX' (Excel 97-2004) files.
##
## gdata: Unable to load perl libaries needed by read.xls()
## gdata: to support 'XLSX' (Excel 2007+) files.
##
## gdata: Run the function 'installXLSXsupport()'
## gdata: to automatically download and install the perl
## gdata: libaries needed to support Excel XLS and XLSX formats.
##
## Attaching package: 'gdata'
## The following object is masked from 'package:stats':
##
## nobs
## The following object is masked from 'package:utils':
##
## object.size
## The following object is masked from 'package:base':
##
## startsWith
library(rvest)
## Loading required package: xml2
##
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
##
## guess_encoding
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:gdata':
##
## combine, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(knitr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(stringr)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(forecast)
## Registered S3 method overwritten by 'xts':
## method from
## as.zoo.xts zoo
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Registered S3 methods overwritten by 'forecast':
## method from
## fitted.fracdiff fracdiff
## residuals.fracdiff fracdiff
Following are the steps used for data processing :- * Fetching the data. * Understanding the data. * Tidying and Manipulating the data. * Scanning and Transforming the data.
Here we have merged two datasets namely World Cup Matches and World cup, where matches data contains all the results from the matches contested as part of the cups and World Cups Dataset displays all the information of World Cups.
Firstly, we have fetched the datasets and by looking upon both the datasets it comes into the picture that the data is not in tidy format.
TIDY AND MANIPULATE
Since World_Cup_merged holds too many entries and columns we have filtered the data for Year>1983 and have removed some unnecessary columns.
we have tidied the data further by mutating new columns - ‘Total_Goals_Scored’, ‘Total_Goals_FirstHalf’ and ‘Total_Goals_SecondHalf’ to World_Cup_merged
SCANNING
*Then we scanned for Outliers with z.scores>3 and imputed them with the mean value.
getwd()
## [1] "C:/Users/KOMAL MEHTA/Desktop/1st Semester/Data Preprocessing/Assignment3"
World_Cup_Matches <- read_csv("fifa-world-cup/WorldCupMatches.csv")
## Parsed with column specification:
## cols(
## .default = col_character(),
## Year = col_double(),
## `Home Team Goals` = col_double(),
## `Away Team Goals` = col_double(),
## Attendance = col_double(),
## `Half-time Home Goals` = col_double(),
## `Half-time Away Goals` = col_double(),
## RoundID = col_double(),
## MatchID = col_double()
## )
## See spec(...) for full column specifications.
View(World_Cup_Matches)
World_Cups <- read_csv("fifa-world-cup/WorldCups.csv")
## Parsed with column specification:
## cols(
## Year = col_double(),
## Country = col_character(),
## Winner = col_character(),
## `Runners-Up` = col_character(),
## Third = col_character(),
## Fourth = col_character(),
## GoalsScored = col_double(),
## QualifiedTeams = col_double(),
## MatchesPlayed = col_double(),
## Attendance = col_character()
## )
View(World_Cups)
str(World_Cup_Matches)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 4572 obs. of 20 variables:
## $ Year : num 1930 1930 1930 1930 1930 1930 1930 1930 1930 1930 ...
## $ Datetime : chr "13 Jul 1930 - 15:00" "13 Jul 1930 - 15:00" "14 Jul 1930 - 12:45" "14 Jul 1930 - 14:50" ...
## $ Stage : chr "Group 1" "Group 4" "Group 2" "Group 3" ...
## $ Stadium : chr "Pocitos" "Parque Central" "Parque Central" "Pocitos" ...
## $ City : chr "Montevideo" "Montevideo" "Montevideo" "Montevideo" ...
## $ Home Team Name : chr "France" "USA" "Yugoslavia" "Romania" ...
## $ Home Team Goals : num 4 3 2 3 1 3 4 3 1 1 ...
## $ Away Team Goals : num 1 0 1 1 0 0 0 0 0 0 ...
## $ Away Team Name : chr "Mexico" "Belgium" "Brazil" "Peru" ...
## $ Win conditions : chr NA NA NA NA ...
## $ Attendance : num 4444 18346 24059 2549 23409 ...
## $ Half-time Home Goals: num 3 2 2 1 0 1 0 2 0 0 ...
## $ Half-time Away Goals: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Referee : chr "LOMBARDI Domingo (URU)" "MACIAS Jose (ARG)" "TEJADA Anibal (URU)" "WARNKEN Alberto (CHI)" ...
## $ Assistant 1 : chr "CRISTOPHE Henry (BEL)" "MATEUCCI Francisco (URU)" "VALLARINO Ricardo (URU)" "LANGENUS Jean (BEL)" ...
## $ Assistant 2 : chr "REGO Gilberto (BRA)" "WARNKEN Alberto (CHI)" "BALWAY Thomas (FRA)" "MATEUCCI Francisco (URU)" ...
## $ RoundID : num 201 201 201 201 201 201 201 201 201 201 ...
## $ MatchID : num 1096 1090 1093 1098 1085 ...
## $ Home Team Initials : chr "FRA" "USA" "YUG" "ROU" ...
## $ Away Team Initials : chr "MEX" "BEL" "BRA" "PER" ...
## - attr(*, "spec")=
## .. cols(
## .. Year = col_double(),
## .. Datetime = col_character(),
## .. Stage = col_character(),
## .. Stadium = col_character(),
## .. City = col_character(),
## .. `Home Team Name` = col_character(),
## .. `Home Team Goals` = col_double(),
## .. `Away Team Goals` = col_double(),
## .. `Away Team Name` = col_character(),
## .. `Win conditions` = col_character(),
## .. Attendance = col_double(),
## .. `Half-time Home Goals` = col_double(),
## .. `Half-time Away Goals` = col_double(),
## .. Referee = col_character(),
## .. `Assistant 1` = col_character(),
## .. `Assistant 2` = col_character(),
## .. RoundID = col_double(),
## .. MatchID = col_double(),
## .. `Home Team Initials` = col_character(),
## .. `Away Team Initials` = col_character()
## .. )
str(World_Cups)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 20 obs. of 10 variables:
## $ Year : num 1930 1934 1938 1950 1954 ...
## $ Country : chr "Uruguay" "Italy" "France" "Brazil" ...
## $ Winner : chr "Uruguay" "Italy" "Italy" "Uruguay" ...
## $ Runners-Up : chr "Argentina" "Czechoslovakia" "Hungary" "Brazil" ...
## $ Third : chr "USA" "Germany" "Brazil" "Sweden" ...
## $ Fourth : chr "Yugoslavia" "Austria" "Sweden" "Spain" ...
## $ GoalsScored : num 70 70 84 88 140 126 89 89 95 97 ...
## $ QualifiedTeams: num 13 16 15 13 16 16 16 16 16 16 ...
## $ MatchesPlayed : num 18 17 18 22 26 35 32 32 32 38 ...
## $ Attendance : chr "590.549" "363.000" "375.700" "1.045.246" ...
## - attr(*, "spec")=
## .. cols(
## .. Year = col_double(),
## .. Country = col_character(),
## .. Winner = col_character(),
## .. `Runners-Up` = col_character(),
## .. Third = col_character(),
## .. Fourth = col_character(),
## .. GoalsScored = col_double(),
## .. QualifiedTeams = col_double(),
## .. MatchesPlayed = col_double(),
## .. Attendance = col_character()
## .. )
dim(World_Cup_Matches)
## [1] 4572 20
dim(World_Cups)
## [1] 20 10
World_Cup_Matches$Datetime <-dmy_hm(World_Cup_Matches$Datetime)
World_Cup_Matches
class(World_Cup_Matches$Stage)
## [1] "character"
World_Cup_Matches$Stage=factor(World_Cup_Matches$Stage,levels=c("Final","First round","Group 1",
"Group 2","Group 3",
"Group 4","Group 5",
"Group 6","Group A","Group B",
"Group C","Group D",
"Group E","Group F",
"Group G","Group H",
"Match for third place","Play-off for third place",
"Preliminary round","Quarter-finals",
"Round of 16","Semi-finals","Third place"),order=TRUE)
levels(World_Cup_Matches$Stage)
## [1] "Final" "First round"
## [3] "Group 1" "Group 2"
## [5] "Group 3" "Group 4"
## [7] "Group 5" "Group 6"
## [9] "Group A" "Group B"
## [11] "Group C" "Group D"
## [13] "Group E" "Group F"
## [15] "Group G" "Group H"
## [17] "Match for third place" "Play-off for third place"
## [19] "Preliminary round" "Quarter-finals"
## [21] "Round of 16" "Semi-finals"
## [23] "Third place"
str(World_Cup_Matches$Stage)
## Ord.factor w/ 23 levels "Final"<"First round"<..: 3 6 4 5 3 3 4 6 5 3 ...
class(World_Cup_Matches$Stage)
## [1] "ordered" "factor"
class(World_Cup_Matches$Datetime)
## [1] "POSIXct" "POSIXt"
class(World_Cups$Attendance)
## [1] "character"
World_Cups$Attendance <- str_replace_all(World_Cups$Attendance,"[.]","")
World_Cups$Attendance <- as.integer(World_Cups$Attendance)
class(World_Cups$Attendance)
## [1] "integer"
World_Cups
names(World_Cup_Matches)[names(World_Cup_Matches)=="Attendance"] <- "Attendance_per_Match"
World_Cup_Matches
names(World_Cups)[names(World_Cups)=="Attendance"] <- "Attendance_of_world_Cup_Tournament"
World_Cups
dim(World_Cup_Matches)
## [1] 4572 20
dim(World_Cups)
## [1] 20 10
World_Cups_merged<-World_Cup_Matches%>%left_join(World_Cups,by="Year")
World_Cups_merged
dim(World_Cups_merged)
## [1] 4572 29
World_Cups_merged<-unique(World_Cups_merged)
dim(World_Cups_merged)
## [1] 837 29
str(World_Cups_merged)
## Classes 'tbl_df', 'tbl' and 'data.frame': 837 obs. of 29 variables:
## $ Year : num 1930 1930 1930 1930 1930 1930 1930 1930 1930 1930 ...
## $ Datetime : POSIXct, format: "1930-07-13 15:00:00" "1930-07-13 15:00:00" ...
## $ Stage : Ord.factor w/ 23 levels "Final"<"First round"<..: 3 6 4 5 3 3 4 6 5 3 ...
## $ Stadium : chr "Pocitos" "Parque Central" "Parque Central" "Pocitos" ...
## $ City : chr "Montevideo" "Montevideo" "Montevideo" "Montevideo" ...
## $ Home Team Name : chr "France" "USA" "Yugoslavia" "Romania" ...
## $ Home Team Goals : num 4 3 2 3 1 3 4 3 1 1 ...
## $ Away Team Goals : num 1 0 1 1 0 0 0 0 0 0 ...
## $ Away Team Name : chr "Mexico" "Belgium" "Brazil" "Peru" ...
## $ Win conditions : chr NA NA NA NA ...
## $ Attendance_per_Match : num 4444 18346 24059 2549 23409 ...
## $ Half-time Home Goals : num 3 2 2 1 0 1 0 2 0 0 ...
## $ Half-time Away Goals : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Referee : chr "LOMBARDI Domingo (URU)" "MACIAS Jose (ARG)" "TEJADA Anibal (URU)" "WARNKEN Alberto (CHI)" ...
## $ Assistant 1 : chr "CRISTOPHE Henry (BEL)" "MATEUCCI Francisco (URU)" "VALLARINO Ricardo (URU)" "LANGENUS Jean (BEL)" ...
## $ Assistant 2 : chr "REGO Gilberto (BRA)" "WARNKEN Alberto (CHI)" "BALWAY Thomas (FRA)" "MATEUCCI Francisco (URU)" ...
## $ RoundID : num 201 201 201 201 201 201 201 201 201 201 ...
## $ MatchID : num 1096 1090 1093 1098 1085 ...
## $ Home Team Initials : chr "FRA" "USA" "YUG" "ROU" ...
## $ Away Team Initials : chr "MEX" "BEL" "BRA" "PER" ...
## $ Country : chr "Uruguay" "Uruguay" "Uruguay" "Uruguay" ...
## $ Winner : chr "Uruguay" "Uruguay" "Uruguay" "Uruguay" ...
## $ Runners-Up : chr "Argentina" "Argentina" "Argentina" "Argentina" ...
## $ Third : chr "USA" "USA" "USA" "USA" ...
## $ Fourth : chr "Yugoslavia" "Yugoslavia" "Yugoslavia" "Yugoslavia" ...
## $ GoalsScored : num 70 70 70 70 70 70 70 70 70 70 ...
## $ QualifiedTeams : num 13 13 13 13 13 13 13 13 13 13 ...
## $ MatchesPlayed : num 18 18 18 18 18 18 18 18 18 18 ...
## $ Attendance_of_world_Cup_Tournament: int 590549 590549 590549 590549 590549 590549 590549 590549 590549 590549 ...
#filtering the data on the basis of year and select the required columns out of that and finally getting trimmed dataset
World_Cups_merged <- World_Cups_merged%>%filter(Year>= 1983)%>% select(-('Stadium'),-('Win conditions'),-(Referee:RoundID),-('Home Team Initials':'Away Team Initials'))
World_Cups_merged
dim(World_Cups_merged)
## [1] 476 21
sum(is.na(World_Cups_merged))
## [1] 1
which(is.na(World_Cups_merged))
## [1] 4272
colSums(is.na(World_Cups_merged))
## Year Datetime
## 0 0
## Stage City
## 0 0
## Home Team Name Home Team Goals
## 0 0
## Away Team Goals Away Team Name
## 0 0
## Attendance_per_Match Half-time Home Goals
## 1 0
## Half-time Away Goals MatchID
## 0 0
## Country Winner
## 0 0
## Runners-Up Third
## 0 0
## Fourth GoalsScored
## 0 0
## QualifiedTeams MatchesPlayed
## 0 0
## Attendance_of_world_Cup_Tournament
## 0
World_Cups_merged$Attendance_per_Match <- round(na.aggregate(World_Cups_merged$Attendance_per_Match, by=World_Cups_merged$Year, FUN=mean))
World_Cups_merged
colSums(is.na(World_Cups_merged))
## Year Datetime
## 0 0
## Stage City
## 0 0
## Home Team Name Home Team Goals
## 0 0
## Away Team Goals Away Team Name
## 0 0
## Attendance_per_Match Half-time Home Goals
## 0 0
## Half-time Away Goals MatchID
## 0 0
## Country Winner
## 0 0
## Runners-Up Third
## 0 0
## Fourth GoalsScored
## 0 0
## QualifiedTeams MatchesPlayed
## 0 0
## Attendance_of_world_Cup_Tournament
## 0
sum(is.na(World_Cups_merged))
## [1] 0
sum(is.infinite(World_Cups_merged$Attendance_per_Match))
## [1] 0
sum(is.nan(World_Cups_merged$Attendance_per_Match))
## [1] 0
World_Cups_merged_imputed <- subset(World_Cups_merged, select = -c(Attendance_per_Match))
World_Cups_merged<-World_Cups_merged %>% mutate(Total_Combined_Goals = World_Cups_merged$`Home Team Goals` + World_Cups_merged$`Away Team Goals`)
World_Cups_merged<- World_Cups_merged %>% mutate(Total_Goals_FirstHalf = `Half-time Home Goals` + `Half-time Away Goals`)
World_Cups_merged<- World_Cups_merged %>% mutate(Total_Goals_SecondHalf = Total_Combined_Goals - Total_Goals_FirstHalf)
World_Cups_merged
World_Cups_merged <- World_Cups_merged %>%
gather('Home Team Name','Away Team Name', key= "Participating_Teams(Home/Away)", value = "Participating_Team_Names")
World_Cups_merged <- World_Cups_merged %>%
gather('Home Team Goals','Away Team Goals', key= "Goals_By_Teams", value = "Goals_Values")
World_Cups_merged
World_Cup_combined_Goals<- World_Cups_merged %>% group_by(Year,Participating_Team_Names)%>%summarise(Total_Goals_Scored = sum(Goals_Values))
head(World_Cup_combined_Goals)
str(World_Cups_merged_imputed)#to check no of numeric variables for which outliers has to be detected
## Classes 'tbl_df', 'tbl' and 'data.frame': 476 obs. of 20 variables:
## $ Year : num 1986 1986 1986 1986 1986 ...
## $ Datetime : POSIXct, format: "1986-05-31 12:00:00" "1986-06-01 16:00:00" ...
## $ Stage : Ord.factor w/ 23 levels "Final"<"First round"<..: 9 11 12 11 9 14 12 10 14 10 ...
## $ City : chr "Mexico City" "Leon" "Guadalajara" "Irapuato" ...
## $ Home Team Name : chr "Bulgaria" "Canada" "Spain" "Soviet Union" ...
## $ Home Team Goals : num 1 0 0 6 3 0 1 1 1 1 ...
## $ Away Team Goals : num 1 1 1 0 1 0 1 2 0 0 ...
## $ Away Team Name : chr "Italy" "France" "Brazil" "Hungary" ...
## $ Half-time Home Goals : num 0 0 0 3 2 0 0 1 0 1 ...
## $ Half-time Away Goals : num 1 0 0 0 0 0 1 2 0 0 ...
## $ MatchID : num 459 468 439 610 395 674 379 428 538 628 ...
## $ Country : chr "Mexico" "Mexico" "Mexico" "Mexico" ...
## $ Winner : chr "Argentina" "Argentina" "Argentina" "Argentina" ...
## $ Runners-Up : chr "Germany FR" "Germany FR" "Germany FR" "Germany FR" ...
## $ Third : chr "France" "France" "France" "France" ...
## $ Fourth : chr "Belgium" "Belgium" "Belgium" "Belgium" ...
## $ GoalsScored : num 132 132 132 132 132 132 132 132 132 132 ...
## $ QualifiedTeams : num 24 24 24 24 24 24 24 24 24 24 ...
## $ MatchesPlayed : num 52 52 52 52 52 52 52 52 52 52 ...
## $ Attendance_of_world_Cup_Tournament: int 2394031 2394031 2394031 2394031 2394031 2394031 2394031 2394031 2394031 2394031 ...
sum(World_Cups_merged_imputed$`Home Team Goals`)
## [1] 654
hist(World_Cups_merged_imputed$`Home Team Goals`,breaks=10)
World_Cups_merged_imputed$`Home Team Goals`%>%
boxplot(main="Box Plot of Home TEam Goals",ylab="Home TEam Goals",col="green")
summary(World_Cups_merged_imputed$`Home Team Goals`)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 1.374 2.000 8.000
boxplot.stats(World_Cups_merged_imputed$`Home Team Goals`)$out
## [1] 6 6 6 6 8 6 7
cap <- function(x){
quantiles <- quantile( x, c(.05, 0.25, 0.75, .95 ) )
x[ x < quantiles[2] - 1.5*IQR(x) ] <- quantiles[1]
x[ x > quantiles[3] + 1.5*IQR(x) ] <- quantiles[4]
x
}
World_Cups_merged_imputed$`Home Team Goals` <- World_Cups_merged_imputed$`Home Team Goals` %>% cap()
World_Cups_merged_imputed$`Home Team Goals`%>%
boxplot(main="Box Plot of Home TEam Goals without outliers",ylab="Home TEam Goals",col="green")
summary(World_Cups_merged_imputed$`Home Team Goals`)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 1.338 2.000 5.000
boxplot.stats(World_Cups_merged_imputed$`Home Team Goals`)$out
## numeric(0)
#####Capping and binning
hist(World_Cups_merged_imputed$`Away Team Goals`,breaks=10)
World_Cups_merged_imputed$`Away Team Goals`%>%
boxplot(main="Box Plot of Away TEam Goals",ylab="Away TEam Goals",col="green")
World_Cups_merged_imputed$`Away Team Goals` <- World_Cups_merged_imputed$`Away Team Goals` %>% cap()
World_Cups_merged_imputed$`Away Team Goals`%>%
boxplot(main="Box Plot of Away TEam Goals without outliers",ylab="Away TEam Goals",col="green")
boxplot.stats(World_Cups_merged_imputed$`Away Team Goals`)$out
## numeric(0)
###Numeric Variable Half-time Home Goals
hist(World_Cups_merged_imputed$`Half-time Home Goals`,breaks=10)
World_Cups_merged_imputed$`Half-time Home Goals`%>%
boxplot(main="Box Plot of Half-time Home Goals",ylab="Half-time Home Goals",col="yellow")
World_Cups_merged_imputed$`Half-time Home Goals` <- World_Cups_merged_imputed$`Half-time Home Goals` %>% cap()
World_Cups_merged_imputed$`Half-time Home Goals`%>%
boxplot(main="Box Plot of Half-time Home Goals without outliers",ylab="Half-time Home Goals",col="yellow")
summary(World_Cups_merged_imputed$`Half-time Home Goals`)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.4979 1.0000 2.0000
boxplot.stats(World_Cups_merged_imputed$`Half-time Home Goals`)$out
## numeric(0)
###Numeric Variable Half-time Home Goals
hist(World_Cups_merged_imputed$`Half-time Home Goals`,breaks=10)
World_Cups_merged_imputed$`Half-time Home Goals`%>%
boxplot(main="Box Plot of Half-time Home Goals",ylab="Half-time Home Goals",col="yellow")
World_Cups_merged_imputed$`Half-time Home Goals` <- World_Cups_merged_imputed$`Half-time Home Goals` %>% cap()
World_Cups_merged_imputed$`Half-time Home Goals`%>%
boxplot(main="Box Plot of Half-time Home Goals without outliers",ylab="Half-time Home Goals",col="yellow")
summary(World_Cups_merged_imputed$`Half-time Home Goals`)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.4979 1.0000 2.0000
boxplot.stats(World_Cups_merged_imputed$`Half-time Home Goals`)$out
## numeric(0)
hist(World_Cups_merged_imputed$`Half-time Home Goals`,breaks=20)
#checking the difference in the distribution after applying log transformation
World_Cups_merged_imputed$GoalsScored<- log10(World_Cups_merged_imputed$GoalsScored)
hist(World_Cups_merged_imputed$GoalsScored,breaks=20)
# no significant difference in distribution is seen so we try with sqrt transformations
World_Cups_merged_imputed$GoalsScored<- sqrt(World_Cups_merged_imputed$GoalsScored)
hist(World_Cups_merged_imputed$GoalsScored,breaks=20)
#By this sqrt transformation we get somewhat normal distribution, so here is the transformed data.
head(World_Cups_merged_imputed)