Required packages

Necessary packages are installed and library() is used to load the packages.

library(readr) 
library(readxl) 
library(foreign) 
library(gdata) 
## gdata: Unable to locate valid perl interpreter
## gdata: 
## gdata: read.xls() will be unable to read Excel XLS and XLSX files
## gdata: unless the 'perl=' argument is used to specify the location
## gdata: of a valid perl intrpreter.
## gdata: 
## gdata: (To avoid display of this message in the future, please
## gdata: ensure perl is installed and available on the executable
## gdata: search path.)
## gdata: Unable to load perl libaries needed by read.xls()
## gdata: to support 'XLX' (Excel 97-2004) files.
## 
## gdata: Unable to load perl libaries needed by read.xls()
## gdata: to support 'XLSX' (Excel 2007+) files.
## 
## gdata: Run the function 'installXLSXsupport()'
## gdata: to automatically download and install the perl
## gdata: libaries needed to support Excel XLS and XLSX formats.
## 
## Attaching package: 'gdata'
## The following object is masked from 'package:stats':
## 
##     nobs
## The following object is masked from 'package:utils':
## 
##     object.size
## The following object is masked from 'package:base':
## 
##     startsWith
library(rvest) 
## Loading required package: xml2
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
## 
##     guess_encoding
library(dplyr) 
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:gdata':
## 
##     combine, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(knitr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(stringr)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(forecast)
## Registered S3 method overwritten by 'xts':
##   method     from
##   as.zoo.xts zoo
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## Registered S3 methods overwritten by 'forecast':
##   method             from    
##   fitted.fracdiff    fracdiff
##   residuals.fracdiff fracdiff

Executive Summary

Following are the steps used for data processing :- * Fetching the data. * Understanding the data. * Tidying and Manipulating the data. * Scanning and Transforming the data.

TIDY AND MANIPULATE

SCANNING

*Then we scanned for Outliers with z.scores>3 and imputed them with the mean value.

Data

getwd()
## [1] "C:/Users/KOMAL MEHTA/Desktop/1st Semester/Data Preprocessing/Assignment3"
World_Cup_Matches <- read_csv("fifa-world-cup/WorldCupMatches.csv")
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   Year = col_double(),
##   `Home Team Goals` = col_double(),
##   `Away Team Goals` = col_double(),
##   Attendance = col_double(),
##   `Half-time Home Goals` = col_double(),
##   `Half-time Away Goals` = col_double(),
##   RoundID = col_double(),
##   MatchID = col_double()
## )
## See spec(...) for full column specifications.
View(World_Cup_Matches)

World_Cups <- read_csv("fifa-world-cup/WorldCups.csv")
## Parsed with column specification:
## cols(
##   Year = col_double(),
##   Country = col_character(),
##   Winner = col_character(),
##   `Runners-Up` = col_character(),
##   Third = col_character(),
##   Fourth = col_character(),
##   GoalsScored = col_double(),
##   QualifiedTeams = col_double(),
##   MatchesPlayed = col_double(),
##   Attendance = col_character()
## )
View(World_Cups)
str(World_Cup_Matches)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 4572 obs. of  20 variables:
##  $ Year                : num  1930 1930 1930 1930 1930 1930 1930 1930 1930 1930 ...
##  $ Datetime            : chr  "13 Jul 1930 - 15:00" "13 Jul 1930 - 15:00" "14 Jul 1930 - 12:45" "14 Jul 1930 - 14:50" ...
##  $ Stage               : chr  "Group 1" "Group 4" "Group 2" "Group 3" ...
##  $ Stadium             : chr  "Pocitos" "Parque Central" "Parque Central" "Pocitos" ...
##  $ City                : chr  "Montevideo" "Montevideo" "Montevideo" "Montevideo" ...
##  $ Home Team Name      : chr  "France" "USA" "Yugoslavia" "Romania" ...
##  $ Home Team Goals     : num  4 3 2 3 1 3 4 3 1 1 ...
##  $ Away Team Goals     : num  1 0 1 1 0 0 0 0 0 0 ...
##  $ Away Team Name      : chr  "Mexico" "Belgium" "Brazil" "Peru" ...
##  $ Win conditions      : chr  NA NA NA NA ...
##  $ Attendance          : num  4444 18346 24059 2549 23409 ...
##  $ Half-time Home Goals: num  3 2 2 1 0 1 0 2 0 0 ...
##  $ Half-time Away Goals: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Referee             : chr  "LOMBARDI Domingo (URU)" "MACIAS Jose (ARG)" "TEJADA Anibal (URU)" "WARNKEN Alberto (CHI)" ...
##  $ Assistant 1         : chr  "CRISTOPHE Henry (BEL)" "MATEUCCI Francisco (URU)" "VALLARINO Ricardo (URU)" "LANGENUS Jean (BEL)" ...
##  $ Assistant 2         : chr  "REGO Gilberto (BRA)" "WARNKEN Alberto (CHI)" "BALWAY Thomas (FRA)" "MATEUCCI Francisco (URU)" ...
##  $ RoundID             : num  201 201 201 201 201 201 201 201 201 201 ...
##  $ MatchID             : num  1096 1090 1093 1098 1085 ...
##  $ Home Team Initials  : chr  "FRA" "USA" "YUG" "ROU" ...
##  $ Away Team Initials  : chr  "MEX" "BEL" "BRA" "PER" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Year = col_double(),
##   ..   Datetime = col_character(),
##   ..   Stage = col_character(),
##   ..   Stadium = col_character(),
##   ..   City = col_character(),
##   ..   `Home Team Name` = col_character(),
##   ..   `Home Team Goals` = col_double(),
##   ..   `Away Team Goals` = col_double(),
##   ..   `Away Team Name` = col_character(),
##   ..   `Win conditions` = col_character(),
##   ..   Attendance = col_double(),
##   ..   `Half-time Home Goals` = col_double(),
##   ..   `Half-time Away Goals` = col_double(),
##   ..   Referee = col_character(),
##   ..   `Assistant 1` = col_character(),
##   ..   `Assistant 2` = col_character(),
##   ..   RoundID = col_double(),
##   ..   MatchID = col_double(),
##   ..   `Home Team Initials` = col_character(),
##   ..   `Away Team Initials` = col_character()
##   .. )
str(World_Cups)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 20 obs. of  10 variables:
##  $ Year          : num  1930 1934 1938 1950 1954 ...
##  $ Country       : chr  "Uruguay" "Italy" "France" "Brazil" ...
##  $ Winner        : chr  "Uruguay" "Italy" "Italy" "Uruguay" ...
##  $ Runners-Up    : chr  "Argentina" "Czechoslovakia" "Hungary" "Brazil" ...
##  $ Third         : chr  "USA" "Germany" "Brazil" "Sweden" ...
##  $ Fourth        : chr  "Yugoslavia" "Austria" "Sweden" "Spain" ...
##  $ GoalsScored   : num  70 70 84 88 140 126 89 89 95 97 ...
##  $ QualifiedTeams: num  13 16 15 13 16 16 16 16 16 16 ...
##  $ MatchesPlayed : num  18 17 18 22 26 35 32 32 32 38 ...
##  $ Attendance    : chr  "590.549" "363.000" "375.700" "1.045.246" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Year = col_double(),
##   ..   Country = col_character(),
##   ..   Winner = col_character(),
##   ..   `Runners-Up` = col_character(),
##   ..   Third = col_character(),
##   ..   Fourth = col_character(),
##   ..   GoalsScored = col_double(),
##   ..   QualifiedTeams = col_double(),
##   ..   MatchesPlayed = col_double(),
##   ..   Attendance = col_character()
##   .. )
dim(World_Cup_Matches)
## [1] 4572   20
dim(World_Cups)
## [1] 20 10

Understanding and Interpretation

World_Cup_Matches$Datetime <-dmy_hm(World_Cup_Matches$Datetime)

World_Cup_Matches
class(World_Cup_Matches$Stage)
## [1] "character"
World_Cup_Matches$Stage=factor(World_Cup_Matches$Stage,levels=c("Final","First round","Group 1",
"Group 2","Group 3",
"Group 4","Group 5",
"Group 6","Group A","Group B",
"Group C","Group D",
"Group E","Group F",
"Group G","Group H",
"Match for third place","Play-off for third place",
"Preliminary round","Quarter-finals",
"Round of 16","Semi-finals","Third place"),order=TRUE)
levels(World_Cup_Matches$Stage)
##  [1] "Final"                    "First round"             
##  [3] "Group 1"                  "Group 2"                 
##  [5] "Group 3"                  "Group 4"                 
##  [7] "Group 5"                  "Group 6"                 
##  [9] "Group A"                  "Group B"                 
## [11] "Group C"                  "Group D"                 
## [13] "Group E"                  "Group F"                 
## [15] "Group G"                  "Group H"                 
## [17] "Match for third place"    "Play-off for third place"
## [19] "Preliminary round"        "Quarter-finals"          
## [21] "Round of 16"              "Semi-finals"             
## [23] "Third place"
str(World_Cup_Matches$Stage)
##  Ord.factor w/ 23 levels "Final"<"First round"<..: 3 6 4 5 3 3 4 6 5 3 ...
class(World_Cup_Matches$Stage)
## [1] "ordered" "factor"
class(World_Cup_Matches$Datetime)
## [1] "POSIXct" "POSIXt"
class(World_Cups$Attendance)
## [1] "character"
World_Cups$Attendance <- str_replace_all(World_Cups$Attendance,"[.]","")
World_Cups$Attendance <- as.integer(World_Cups$Attendance)
class(World_Cups$Attendance)
## [1] "integer"
World_Cups

Renaming the column names

names(World_Cup_Matches)[names(World_Cup_Matches)=="Attendance"] <- "Attendance_per_Match"
World_Cup_Matches
names(World_Cups)[names(World_Cups)=="Attendance"] <- "Attendance_of_world_Cup_Tournament"
World_Cups
dim(World_Cup_Matches)
## [1] 4572   20
dim(World_Cups)
## [1] 20 10

Merging the datasets

World_Cups_merged<-World_Cup_Matches%>%left_join(World_Cups,by="Year")
World_Cups_merged
dim(World_Cups_merged)
## [1] 4572   29
World_Cups_merged<-unique(World_Cups_merged)
dim(World_Cups_merged)
## [1] 837  29
str(World_Cups_merged)
## Classes 'tbl_df', 'tbl' and 'data.frame':    837 obs. of  29 variables:
##  $ Year                              : num  1930 1930 1930 1930 1930 1930 1930 1930 1930 1930 ...
##  $ Datetime                          : POSIXct, format: "1930-07-13 15:00:00" "1930-07-13 15:00:00" ...
##  $ Stage                             : Ord.factor w/ 23 levels "Final"<"First round"<..: 3 6 4 5 3 3 4 6 5 3 ...
##  $ Stadium                           : chr  "Pocitos" "Parque Central" "Parque Central" "Pocitos" ...
##  $ City                              : chr  "Montevideo" "Montevideo" "Montevideo" "Montevideo" ...
##  $ Home Team Name                    : chr  "France" "USA" "Yugoslavia" "Romania" ...
##  $ Home Team Goals                   : num  4 3 2 3 1 3 4 3 1 1 ...
##  $ Away Team Goals                   : num  1 0 1 1 0 0 0 0 0 0 ...
##  $ Away Team Name                    : chr  "Mexico" "Belgium" "Brazil" "Peru" ...
##  $ Win conditions                    : chr  NA NA NA NA ...
##  $ Attendance_per_Match              : num  4444 18346 24059 2549 23409 ...
##  $ Half-time Home Goals              : num  3 2 2 1 0 1 0 2 0 0 ...
##  $ Half-time Away Goals              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Referee                           : chr  "LOMBARDI Domingo (URU)" "MACIAS Jose (ARG)" "TEJADA Anibal (URU)" "WARNKEN Alberto (CHI)" ...
##  $ Assistant 1                       : chr  "CRISTOPHE Henry (BEL)" "MATEUCCI Francisco (URU)" "VALLARINO Ricardo (URU)" "LANGENUS Jean (BEL)" ...
##  $ Assistant 2                       : chr  "REGO Gilberto (BRA)" "WARNKEN Alberto (CHI)" "BALWAY Thomas (FRA)" "MATEUCCI Francisco (URU)" ...
##  $ RoundID                           : num  201 201 201 201 201 201 201 201 201 201 ...
##  $ MatchID                           : num  1096 1090 1093 1098 1085 ...
##  $ Home Team Initials                : chr  "FRA" "USA" "YUG" "ROU" ...
##  $ Away Team Initials                : chr  "MEX" "BEL" "BRA" "PER" ...
##  $ Country                           : chr  "Uruguay" "Uruguay" "Uruguay" "Uruguay" ...
##  $ Winner                            : chr  "Uruguay" "Uruguay" "Uruguay" "Uruguay" ...
##  $ Runners-Up                        : chr  "Argentina" "Argentina" "Argentina" "Argentina" ...
##  $ Third                             : chr  "USA" "USA" "USA" "USA" ...
##  $ Fourth                            : chr  "Yugoslavia" "Yugoslavia" "Yugoslavia" "Yugoslavia" ...
##  $ GoalsScored                       : num  70 70 70 70 70 70 70 70 70 70 ...
##  $ QualifiedTeams                    : num  13 13 13 13 13 13 13 13 13 13 ...
##  $ MatchesPlayed                     : num  18 18 18 18 18 18 18 18 18 18 ...
##  $ Attendance_of_world_Cup_Tournament: int  590549 590549 590549 590549 590549 590549 590549 590549 590549 590549 ...

Tidy and Manipualting Data I

#filtering the data on the basis of year and select the required columns out of that and finally getting trimmed dataset 

World_Cups_merged <- World_Cups_merged%>%filter(Year>= 1983)%>% select(-('Stadium'),-('Win conditions'),-(Referee:RoundID),-('Home Team Initials':'Away Team Initials'))
World_Cups_merged
dim(World_Cups_merged)
## [1] 476  21

Scan I

sum(is.na(World_Cups_merged))
## [1] 1
which(is.na(World_Cups_merged))
## [1] 4272
colSums(is.na(World_Cups_merged))
##                               Year                           Datetime 
##                                  0                                  0 
##                              Stage                               City 
##                                  0                                  0 
##                     Home Team Name                    Home Team Goals 
##                                  0                                  0 
##                    Away Team Goals                     Away Team Name 
##                                  0                                  0 
##               Attendance_per_Match               Half-time Home Goals 
##                                  1                                  0 
##               Half-time Away Goals                            MatchID 
##                                  0                                  0 
##                            Country                             Winner 
##                                  0                                  0 
##                         Runners-Up                              Third 
##                                  0                                  0 
##                             Fourth                        GoalsScored 
##                                  0                                  0 
##                     QualifiedTeams                      MatchesPlayed 
##                                  0                                  0 
## Attendance_of_world_Cup_Tournament 
##                                  0
World_Cups_merged$Attendance_per_Match <- round(na.aggregate(World_Cups_merged$Attendance_per_Match, by=World_Cups_merged$Year, FUN=mean))
World_Cups_merged
colSums(is.na(World_Cups_merged))
##                               Year                           Datetime 
##                                  0                                  0 
##                              Stage                               City 
##                                  0                                  0 
##                     Home Team Name                    Home Team Goals 
##                                  0                                  0 
##                    Away Team Goals                     Away Team Name 
##                                  0                                  0 
##               Attendance_per_Match               Half-time Home Goals 
##                                  0                                  0 
##               Half-time Away Goals                            MatchID 
##                                  0                                  0 
##                            Country                             Winner 
##                                  0                                  0 
##                         Runners-Up                              Third 
##                                  0                                  0 
##                             Fourth                        GoalsScored 
##                                  0                                  0 
##                     QualifiedTeams                      MatchesPlayed 
##                                  0                                  0 
## Attendance_of_world_Cup_Tournament 
##                                  0
sum(is.na(World_Cups_merged))
## [1] 0
sum(is.infinite(World_Cups_merged$Attendance_per_Match))
## [1] 0
sum(is.nan(World_Cups_merged$Attendance_per_Match))
## [1] 0
World_Cups_merged_imputed <- subset(World_Cups_merged, select = -c(Attendance_per_Match))

Tidy & Manipulate Data II

World_Cups_merged<-World_Cups_merged %>% mutate(Total_Combined_Goals = World_Cups_merged$`Home Team Goals` + World_Cups_merged$`Away Team Goals`)
World_Cups_merged<- World_Cups_merged %>% mutate(Total_Goals_FirstHalf = `Half-time Home Goals` + `Half-time Away Goals`)
World_Cups_merged<- World_Cups_merged %>% mutate(Total_Goals_SecondHalf = Total_Combined_Goals - Total_Goals_FirstHalf)
World_Cups_merged

Gather the data

World_Cups_merged <- World_Cups_merged %>% 
  gather('Home Team Name','Away Team Name', key= "Participating_Teams(Home/Away)", value = "Participating_Team_Names")

World_Cups_merged <- World_Cups_merged %>% 
  gather('Home Team Goals','Away Team Goals', key= "Goals_By_Teams", value = "Goals_Values")
World_Cups_merged

Data Grouping

  • For getting total goals scored by a team for a particular year in a world cup tournament we are using group-by().
  • The reason we are using this approach is to get total number of goals scored by each team names of the participating country in entire tournament in any given year.
  • Therefore we have used summarise() and group_by().
World_Cup_combined_Goals<- World_Cups_merged %>% group_by(Year,Participating_Team_Names)%>%summarise(Total_Goals_Scored = sum(Goals_Values))
head(World_Cup_combined_Goals)

Scan II

  • We are checking the outliers for the numeric variables and through Hmisc().
  • By plotting the histogram we are trying to find out whether the values show up a normal distribution or not.
  • What if it does not show normal distribution, so for this situation we have checked the outliers using boxplot.
  • Also, if the outliers found in a boxplot method are less than 5%, then we are dealing woth the outliers through capping().
  • Capping() imputes the lower outlier values with upper outlier values.
str(World_Cups_merged_imputed)#to check no of numeric variables for which outliers has to be detected
## Classes 'tbl_df', 'tbl' and 'data.frame':    476 obs. of  20 variables:
##  $ Year                              : num  1986 1986 1986 1986 1986 ...
##  $ Datetime                          : POSIXct, format: "1986-05-31 12:00:00" "1986-06-01 16:00:00" ...
##  $ Stage                             : Ord.factor w/ 23 levels "Final"<"First round"<..: 9 11 12 11 9 14 12 10 14 10 ...
##  $ City                              : chr  "Mexico City" "Leon" "Guadalajara" "Irapuato" ...
##  $ Home Team Name                    : chr  "Bulgaria" "Canada" "Spain" "Soviet Union" ...
##  $ Home Team Goals                   : num  1 0 0 6 3 0 1 1 1 1 ...
##  $ Away Team Goals                   : num  1 1 1 0 1 0 1 2 0 0 ...
##  $ Away Team Name                    : chr  "Italy" "France" "Brazil" "Hungary" ...
##  $ Half-time Home Goals              : num  0 0 0 3 2 0 0 1 0 1 ...
##  $ Half-time Away Goals              : num  1 0 0 0 0 0 1 2 0 0 ...
##  $ MatchID                           : num  459 468 439 610 395 674 379 428 538 628 ...
##  $ Country                           : chr  "Mexico" "Mexico" "Mexico" "Mexico" ...
##  $ Winner                            : chr  "Argentina" "Argentina" "Argentina" "Argentina" ...
##  $ Runners-Up                        : chr  "Germany FR" "Germany FR" "Germany FR" "Germany FR" ...
##  $ Third                             : chr  "France" "France" "France" "France" ...
##  $ Fourth                            : chr  "Belgium" "Belgium" "Belgium" "Belgium" ...
##  $ GoalsScored                       : num  132 132 132 132 132 132 132 132 132 132 ...
##  $ QualifiedTeams                    : num  24 24 24 24 24 24 24 24 24 24 ...
##  $ MatchesPlayed                     : num  52 52 52 52 52 52 52 52 52 52 ...
##  $ Attendance_of_world_Cup_Tournament: int  2394031 2394031 2394031 2394031 2394031 2394031 2394031 2394031 2394031 2394031 ...
sum(World_Cups_merged_imputed$`Home Team Goals`)
## [1] 654
hist(World_Cups_merged_imputed$`Home Team Goals`,breaks=10)

World_Cups_merged_imputed$`Home Team Goals`%>%
boxplot(main="Box Plot of Home TEam Goals",ylab="Home TEam Goals",col="green")

summary(World_Cups_merged_imputed$`Home Team Goals`)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   1.374   2.000   8.000
boxplot.stats(World_Cups_merged_imputed$`Home Team Goals`)$out
## [1] 6 6 6 6 8 6 7
cap <- function(x){
    quantiles <- quantile( x, c(.05, 0.25, 0.75, .95 ) )
    x[ x < quantiles[2] - 1.5*IQR(x) ] <- quantiles[1]
    x[ x > quantiles[3] + 1.5*IQR(x) ] <- quantiles[4]
    x
}

World_Cups_merged_imputed$`Home Team Goals` <- World_Cups_merged_imputed$`Home Team Goals` %>% cap()
World_Cups_merged_imputed$`Home Team Goals`%>%
  boxplot(main="Box Plot of Home TEam Goals without outliers",ylab="Home TEam Goals",col="green")

summary(World_Cups_merged_imputed$`Home Team Goals`)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   1.338   2.000   5.000
boxplot.stats(World_Cups_merged_imputed$`Home Team Goals`)$out
## numeric(0)
#####Capping and binning

hist(World_Cups_merged_imputed$`Away Team Goals`,breaks=10)

World_Cups_merged_imputed$`Away Team Goals`%>%
boxplot(main="Box Plot of Away TEam Goals",ylab="Away TEam Goals",col="green")

World_Cups_merged_imputed$`Away Team Goals` <- World_Cups_merged_imputed$`Away Team Goals` %>% cap()
World_Cups_merged_imputed$`Away Team Goals`%>%
  boxplot(main="Box Plot of Away TEam Goals without outliers",ylab="Away TEam Goals",col="green")

boxplot.stats(World_Cups_merged_imputed$`Away Team Goals`)$out
## numeric(0)
###Numeric Variable Half-time Home Goals
hist(World_Cups_merged_imputed$`Half-time Home Goals`,breaks=10)

World_Cups_merged_imputed$`Half-time Home Goals`%>%
boxplot(main="Box Plot of Half-time Home Goals",ylab="Half-time Home Goals",col="yellow")

World_Cups_merged_imputed$`Half-time Home Goals` <- World_Cups_merged_imputed$`Half-time Home Goals` %>% cap()
World_Cups_merged_imputed$`Half-time Home Goals`%>%
  boxplot(main="Box Plot of Half-time Home Goals without outliers",ylab="Half-time Home Goals",col="yellow")

summary(World_Cups_merged_imputed$`Half-time Home Goals`)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.4979  1.0000  2.0000
boxplot.stats(World_Cups_merged_imputed$`Half-time Home Goals`)$out
## numeric(0)
###Numeric Variable Half-time Home Goals
hist(World_Cups_merged_imputed$`Half-time Home Goals`,breaks=10)

World_Cups_merged_imputed$`Half-time Home Goals`%>%
boxplot(main="Box Plot of Half-time Home Goals",ylab="Half-time Home Goals",col="yellow")

World_Cups_merged_imputed$`Half-time Home Goals` <- World_Cups_merged_imputed$`Half-time Home Goals` %>% cap()
World_Cups_merged_imputed$`Half-time Home Goals`%>%
  boxplot(main="Box Plot of Half-time Home Goals without outliers",ylab="Half-time Home Goals",col="yellow")

summary(World_Cups_merged_imputed$`Half-time Home Goals`)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.4979  1.0000  2.0000
boxplot.stats(World_Cups_merged_imputed$`Half-time Home Goals`)$out
## numeric(0)

Transform

  • If a numeric variable does not show normal distribution, then we are tranforming numeric values, so that normal distribution can be achieved.
  • If there is a skewness in the graph then we are converting it to normal distribution using sqrt transfomination.
  • The factors considered for dataset going into trnsformations are:i) to change the scale for better understanding of the variable, ii) to convert a non-linear relation into linear one, or iii) to decrease the skewness and convert the distribution into a normal distribution.
  • Some numeric variables here shows skewness, so we have to follows accurate transformation.
hist(World_Cups_merged_imputed$`Half-time Home Goals`,breaks=20)

#checking the difference in the distribution after applying log transformation
World_Cups_merged_imputed$GoalsScored<- log10(World_Cups_merged_imputed$GoalsScored)
hist(World_Cups_merged_imputed$GoalsScored,breaks=20)

# no significant difference in distribution is seen so we try with sqrt transformations
World_Cups_merged_imputed$GoalsScored<- sqrt(World_Cups_merged_imputed$GoalsScored)
hist(World_Cups_merged_imputed$GoalsScored,breaks=20)

#By this sqrt transformation we get somewhat normal distribution, so here is the transformed data.
head(World_Cups_merged_imputed)