Introduction

This project is the base data preparation step for further projects. The data consist of detailed attributes for every player registered in the latest edition of the FIFA 19 database. They were obtained from the Kaggle website (K.Gadiya 2019).

# Load the raw player table from the comma-separated export.
# `dec = ","` is kept so that parsing stays identical to the earlier
# `read.csv2(..., sep = ",")` call, which used the same decimal mark.
data <- read.csv(
  file = "data.csv",
  dec = ",",
  encoding = "UTF-8"
)

# Print column names, types and leading values as a quick sanity check.
str(data)
## 'data.frame':    18207 obs. of  88 variables:
##  $ X.U.FEFF.ID             : int  158023 20801 190871 193080 192985 183277 177003 176580 155862 200389 ...
##  $ Name                    : chr  "L. Messi" "Cristiano Ronaldo" "Neymar Jr" "De Gea" ...
##  $ Age                     : int  31 33 26 27 27 27 32 31 32 25 ...
##  $ Photo                   : chr  "https://cdn.sofifa.org/players/4/19/158023.png" "https://cdn.sofifa.org/players/4/19/20801.png" "https://cdn.sofifa.org/players/4/19/190871.png" "https://cdn.sofifa.org/players/4/19/193080.png" ...
##  $ Nationality             : chr  "Argentina" "Portugal" "Brazil" "Spain" ...
##  $ Flag                    : chr  "https://cdn.sofifa.org/flags/52.png" "https://cdn.sofifa.org/flags/38.png" "https://cdn.sofifa.org/flags/54.png" "https://cdn.sofifa.org/flags/45.png" ...
##  $ Overall                 : int  94 94 92 91 91 91 91 91 91 90 ...
##  $ Potential               : int  94 94 93 93 92 91 91 91 91 93 ...
##  $ Club                    : chr  "FC Barcelona" "Juventus" "Paris Saint-Germain" "Manchester United" ...
##  $ Club.Logo               : chr  "https://cdn.sofifa.org/teams/2/light/241.png" "https://cdn.sofifa.org/teams/2/light/45.png" "https://cdn.sofifa.org/teams/2/light/73.png" "https://cdn.sofifa.org/teams/2/light/11.png" ...
##  $ Value                   : chr  "\200110.5M" "\20077M" "\200118.5M" "\20072M" ...
##  $ Wage                    : chr  "\200565K" "\200405K" "\200290K" "\200260K" ...
##  $ Special                 : int  2202 2228 2143 1471 2281 2142 2280 2346 2201 1331 ...
##  $ Preferred.Foot          : chr  "Left" "Right" "Right" "Right" ...
##  $ International.Reputation: int  5 5 5 4 4 4 4 5 4 3 ...
##  $ Weak.Foot               : int  4 4 5 3 5 4 4 4 3 3 ...
##  $ Skill.Moves             : int  4 5 5 1 4 4 4 3 3 1 ...
##  $ Work.Rate               : chr  "Medium/ Medium" "High/ Low" "High/ Medium" "Medium/ Medium" ...
##  $ Body.Type               : chr  "Messi" "C. Ronaldo" "Neymar" "Lean" ...
##  $ Real.Face               : chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ Position                : chr  "RF" "ST" "LW" "GK" ...
##  $ Jersey.Number           : int  10 7 10 1 7 10 10 9 15 1 ...
##  $ Joined                  : chr  "1-Jul-04" "10-Jul-18" "3-Aug-17" "1-Jul-11" ...
##  $ Loaned.From             : chr  "" "" "" "" ...
##  $ Contract.Valid.Until    : chr  "2021" "2022" "2022" "2020" ...
##  $ Height                  : chr  "5'7" "6'2" "5'9" "6'4" ...
##  $ Weight                  : chr  "159lbs" "183lbs" "150lbs" "168lbs" ...
##  $ LS                      : chr  "88+2" "91+3" "84+3" "" ...
##  $ ST                      : chr  "88+2" "91+3" "84+3" "" ...
##  $ RS                      : chr  "88+2" "91+3" "84+3" "" ...
##  $ LW                      : chr  "92+2" "89+3" "89+3" "" ...
##  $ LF                      : chr  "93+2" "90+3" "89+3" "" ...
##  $ CF                      : chr  "93+2" "90+3" "89+3" "" ...
##  $ RF                      : chr  "93+2" "90+3" "89+3" "" ...
##  $ RW                      : chr  "92+2" "89+3" "89+3" "" ...
##  $ LAM                     : chr  "93+2" "88+3" "89+3" "" ...
##  $ CAM                     : chr  "93+2" "88+3" "89+3" "" ...
##  $ RAM                     : chr  "93+2" "88+3" "89+3" "" ...
##  $ LM                      : chr  "91+2" "88+3" "88+3" "" ...
##  $ LCM                     : chr  "84+2" "81+3" "81+3" "" ...
##  $ CM                      : chr  "84+2" "81+3" "81+3" "" ...
##  $ RCM                     : chr  "84+2" "81+3" "81+3" "" ...
##  $ RM                      : chr  "91+2" "88+3" "88+3" "" ...
##  $ LWB                     : chr  "64+2" "65+3" "65+3" "" ...
##  $ LDM                     : chr  "61+2" "61+3" "60+3" "" ...
##  $ CDM                     : chr  "61+2" "61+3" "60+3" "" ...
##  $ RDM                     : chr  "61+2" "61+3" "60+3" "" ...
##  $ RWB                     : chr  "64+2" "65+3" "65+3" "" ...
##  $ LB                      : chr  "59+2" "61+3" "60+3" "" ...
##  $ LCB                     : chr  "47+2" "53+3" "47+3" "" ...
##  $ CB                      : chr  "47+2" "53+3" "47+3" "" ...
##  $ RCB                     : chr  "47+2" "53+3" "47+3" "" ...
##  $ RB                      : chr  "59+2" "61+3" "60+3" "" ...
##  $ Crossing                : int  84 84 79 17 93 81 86 77 66 13 ...
##  $ Finishing               : int  95 94 87 13 82 84 72 93 60 11 ...
##  $ HeadingAccuracy         : int  70 89 62 21 55 61 55 77 91 15 ...
##  $ ShortPassing            : int  90 81 84 50 92 89 93 82 78 29 ...
##  $ Volleys                 : int  86 87 84 13 82 80 76 88 66 13 ...
##  $ Dribbling               : int  97 88 96 18 86 95 90 87 63 12 ...
##  $ Curve                   : int  93 81 88 21 85 83 85 86 74 13 ...
##  $ FKAccuracy              : int  94 76 87 19 83 79 78 84 72 14 ...
##  $ LongPassing             : int  87 77 78 51 91 83 88 64 77 26 ...
##  $ BallControl             : int  96 94 95 42 91 94 93 90 84 16 ...
##  $ Acceleration            : int  91 89 94 57 78 94 80 86 76 43 ...
##  $ SprintSpeed             : int  86 91 90 58 76 88 72 75 75 60 ...
##  $ Agility                 : int  91 87 96 60 79 95 93 82 78 67 ...
##  $ Reactions               : int  95 96 94 90 91 90 90 92 85 86 ...
##  $ Balance                 : int  95 70 84 43 77 94 94 83 66 49 ...
##  $ ShotPower               : int  85 95 80 31 91 82 79 86 79 22 ...
##  $ Jumping                 : int  68 95 61 67 63 56 68 69 93 76 ...
##  $ Stamina                 : int  72 88 81 43 90 83 89 90 84 41 ...
##  $ Strength                : int  59 79 49 64 75 66 58 83 83 78 ...
##  $ LongShots               : int  94 93 82 12 91 80 82 85 59 12 ...
##  $ Aggression              : int  48 63 56 38 76 54 62 87 88 34 ...
##  $ Interceptions           : int  22 29 36 30 61 41 83 41 90 19 ...
##  $ Positioning             : int  94 95 89 12 87 87 79 92 60 11 ...
##  $ Vision                  : int  94 82 87 68 94 89 92 84 63 70 ...
##  $ Penalties               : int  75 85 81 40 79 86 82 85 75 11 ...
##  $ Composure               : int  96 95 94 68 88 91 84 85 82 70 ...
##  $ Marking                 : int  33 28 27 15 68 34 60 62 87 27 ...
##  $ StandingTackle          : int  28 31 24 21 58 27 76 45 92 12 ...
##  $ SlidingTackle           : int  26 23 33 13 51 22 73 38 91 18 ...
##  $ GKDiving                : int  6 7 9 90 15 11 13 27 11 86 ...
##  $ GKHandling              : int  11 11 9 85 13 12 9 25 8 92 ...
##  $ GKKicking               : int  15 15 15 87 5 6 7 31 9 78 ...
##  $ GKPositioning           : int  14 14 15 88 10 8 14 33 7 88 ...
##  $ GKReflexes              : int  8 11 11 94 13 8 9 37 11 89 ...
##  $ Release.Clause          : chr  "\200226.5M" "\200127.1M" "\200228.1M" "\200138.6M" ...

There are 18,207 observations and 88 columns in the data. However, some fields will not be used in either of the follow-up projects, and some columns need cleaning before they are useful for further analysis.

Libraries

Data Preparation

Let’s first remove the unneeded columns from the dataset. I am removing the Photo, Flag and Club.Logo columns because they only hold image links, which are not needed in my analysis, and the Body.Type column because the data do not draw a clear distinction between body types. I will also remove the detailed per-position ratings, as we mostly focus on players’ roles (striker, defender, etc.) rather than specific positions (LW, DM, etc.).

# Columns are dropped by name so this step does not depend on where any
# column happens to sit in the raw file.

# Image links and the free-text body type are not used in the analysis.
unused_cols <- c("Photo", "Flag", "Club.Logo", "Body.Type")

# Per-position ratings (strings such as "88+2"); only the player's role is
# analysed, not every individual position.
position_rating_cols <- c(
  "LS", "ST", "RS", "LW", "LF", "CF", "RF", "RW", "LAM", "CAM", "RAM",
  "LM", "LCM", "CM", "RCM", "RM", "LWB", "LDM", "CDM", "RDM", "RWB",
  "LB", "LCB", "CB", "RCB", "RB"
)
drop_cols <- c(unused_cols, position_rating_cols)

# Fail loudly if the input schema changed rather than silently keeping or
# removing the wrong columns.
missing_cols <- setdiff(drop_cols, names(data))
if (length(missing_cols) > 0) {
  stop(
    "Columns to drop not found in data: ",
    paste(missing_cols, collapse = ", "),
    call. = FALSE
  )
}

data <- data[, !(names(data) %in% drop_cols), drop = FALSE]

Next, we need to convert Value, Wage, and Release.Clause to numeric values. To do this, we strip the non-numeric characters from each string (the currency symbol and the magnitude suffix, e.g. K or M). I also rename the first column to “ID” for simplicity.

# Convert money strings (a currency symbol, a number and an optional K/M
# magnitude suffix, e.g. "110.5M", "565K" or "0" after the symbol) into a
# number expressed in `unit`.
#
# x     Character vector of money strings.
# unit  Target unit: "M" for millions, "K" for thousands.
#
# Returns a double vector the same length as `x`. Blank or unparseable
# entries become NA. The suffix is honoured instead of blindly dropping the
# last character, so a "K" amount is not mistaken for millions and a bare
# "0" stays 0 rather than turning into NA.
parse_euro_amount <- function(x, unit = c("M", "K")) {
  unit <- match.arg(unit)
  exponent <- c(none = 0, K = 3, M = 6)

  # Byte-wise match: the leading symbol is removed whatever its encoding.
  amount <- sub("^[^0-9]*", "", x, useBytes = TRUE)
  suffix <- sub("^[0-9.]*", "", amount)
  number <- as.double(sub("[KM]$", "", amount))

  suffix[!nzchar(suffix)] <- "none"
  shift <- unname(exponent[suffix]) - exponent[[unit]]

  # Multiply or divide by an exact power of ten; when the suffix already
  # matches the target unit the number is returned unchanged.
  ifelse(shift >= 0, number * 10^shift, number / 10^(-shift))
}

# Columns are renamed by name, not by position, so the step keeps working if
# the column order changes.
data$Value <- parse_euro_amount(data$Value, unit = "M")
names(data)[names(data) == "Value"] <- "Value.in.million.euros"

data$Wage <- parse_euro_amount(data$Wage, unit = "K")
names(data)[names(data) == "Wage"] <- "Wage.in.thousand.euros"

data$Release.Clause <- parse_euro_amount(data$Release.Clause, unit = "M")
names(data)[names(data) == "Release.Clause"] <-
  "Release.clause.in.million.euros"

# The first header is mangled on import (it shows up as "X.U.FEFF.ID"), so it
# is renamed by position rather than by its unreliable name.
colnames(data)[1] <- "ID"

In the next step, we need to convert International.Reputation, Weak.Foot and Skill.Moves to factors, because they are five-level ordinal ratings rather than true integers.

# Recode the three rating columns as factors in a single pass.
rating_cols <- c("International.Reputation", "Weak.Foot", "Skill.Moves")
data[rating_cols] <- lapply(data[rating_cols], as.factor)

For simplicity of analysis, I will convert height into centimeters and weight into kilograms.

# Height is recorded as feet'inches (e.g. "5'7"); convert it to centimetres.
# Entries that are blank or lack an inches part become NA.
# Base R is used so this step does not rely on tidyr/dplyr being attached.
feet <- as.double(sub("'.*$", "", data$Height))
inches <- as.double(sub("^[^']*'?", "", data$Height))
height <- data.frame(Height = feet * 30.48 + inches * 2.54)

# Replace the text column with the numeric one. It is selected by name rather
# than by position and, as before, the new Height ends up as the last column.
data <- data[, names(data) != "Height", drop = FALSE]
data <- cbind(data, height)

# Weight is recorded in pounds with an "lbs" suffix (e.g. "159lbs").
# Strip the suffix and convert to kilograms (1 lb = 0.45359237 kg);
# previously the value was left in pounds.
weight_lbs <- as.double(sub("lbs$", "", data$Weight))
data$Weight <- weight_lbs * 0.45359237

After this, the data can finally be exported and used in further analysis.

# Show the structure of the cleaned data frame (column names, types and
# leading values) to confirm the preparation steps above.
str(data)
## 'data.frame':    18207 obs. of  58 variables:
##  $ ID                             : int  158023 20801 190871 193080 192985 183277 177003 176580 155862 200389 ...
##  $ Name                           : chr  "L. Messi" "Cristiano Ronaldo" "Neymar Jr" "De Gea" ...
##  $ Age                            : int  31 33 26 27 27 27 32 31 32 25 ...
##  $ Nationality                    : chr  "Argentina" "Portugal" "Brazil" "Spain" ...
##  $ Overall                        : int  94 94 92 91 91 91 91 91 91 90 ...
##  $ Potential                      : int  94 94 93 93 92 91 91 91 91 93 ...
##  $ Club                           : chr  "FC Barcelona" "Juventus" "Paris Saint-Germain" "Manchester United" ...
##  $ Value.in.million.euros         : num  110 77 118 72 102 ...
##  $ Wage.in.thousand.euros         : num  565 405 290 260 355 340 420 455 380 94 ...
##  $ Special                        : int  2202 2228 2143 1471 2281 2142 2280 2346 2201 1331 ...
##  $ Preferred.Foot                 : chr  "Left" "Right" "Right" "Right" ...
##  $ International.Reputation       : Factor w/ 5 levels "1","2","3","4",..: 5 5 5 4 4 4 4 5 4 3 ...
##  $ Weak.Foot                      : Factor w/ 5 levels "1","2","3","4",..: 4 4 5 3 5 4 4 4 3 3 ...
##  $ Skill.Moves                    : Factor w/ 5 levels "1","2","3","4",..: 4 5 5 1 4 4 4 3 3 1 ...
##  $ Work.Rate                      : chr  "Medium/ Medium" "High/ Low" "High/ Medium" "Medium/ Medium" ...
##  $ Real.Face                      : chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ Position                       : chr  "RF" "ST" "LW" "GK" ...
##  $ Jersey.Number                  : int  10 7 10 1 7 10 10 9 15 1 ...
##  $ Joined                         : chr  "1-Jul-04" "10-Jul-18" "3-Aug-17" "1-Jul-11" ...
##  $ Loaned.From                    : chr  "" "" "" "" ...
##  $ Contract.Valid.Until           : chr  "2021" "2022" "2022" "2020" ...
##  $ Weight                         : num  159 183 150 168 154 163 146 190 181 192 ...
##  $ Crossing                       : int  84 84 79 17 93 81 86 77 66 13 ...
##  $ Finishing                      : int  95 94 87 13 82 84 72 93 60 11 ...
##  $ HeadingAccuracy                : int  70 89 62 21 55 61 55 77 91 15 ...
##  $ ShortPassing                   : int  90 81 84 50 92 89 93 82 78 29 ...
##  $ Volleys                        : int  86 87 84 13 82 80 76 88 66 13 ...
##  $ Dribbling                      : int  97 88 96 18 86 95 90 87 63 12 ...
##  $ Curve                          : int  93 81 88 21 85 83 85 86 74 13 ...
##  $ FKAccuracy                     : int  94 76 87 19 83 79 78 84 72 14 ...
##  $ LongPassing                    : int  87 77 78 51 91 83 88 64 77 26 ...
##  $ BallControl                    : int  96 94 95 42 91 94 93 90 84 16 ...
##  $ Acceleration                   : int  91 89 94 57 78 94 80 86 76 43 ...
##  $ SprintSpeed                    : int  86 91 90 58 76 88 72 75 75 60 ...
##  $ Agility                        : int  91 87 96 60 79 95 93 82 78 67 ...
##  $ Reactions                      : int  95 96 94 90 91 90 90 92 85 86 ...
##  $ Balance                        : int  95 70 84 43 77 94 94 83 66 49 ...
##  $ ShotPower                      : int  85 95 80 31 91 82 79 86 79 22 ...
##  $ Jumping                        : int  68 95 61 67 63 56 68 69 93 76 ...
##  $ Stamina                        : int  72 88 81 43 90 83 89 90 84 41 ...
##  $ Strength                       : int  59 79 49 64 75 66 58 83 83 78 ...
##  $ LongShots                      : int  94 93 82 12 91 80 82 85 59 12 ...
##  $ Aggression                     : int  48 63 56 38 76 54 62 87 88 34 ...
##  $ Interceptions                  : int  22 29 36 30 61 41 83 41 90 19 ...
##  $ Positioning                    : int  94 95 89 12 87 87 79 92 60 11 ...
##  $ Vision                         : int  94 82 87 68 94 89 92 84 63 70 ...
##  $ Penalties                      : int  75 85 81 40 79 86 82 85 75 11 ...
##  $ Composure                      : int  96 95 94 68 88 91 84 85 82 70 ...
##  $ Marking                        : int  33 28 27 15 68 34 60 62 87 27 ...
##  $ StandingTackle                 : int  28 31 24 21 58 27 76 45 92 12 ...
##  $ SlidingTackle                  : int  26 23 33 13 51 22 73 38 91 18 ...
##  $ GKDiving                       : int  6 7 9 90 15 11 13 27 11 86 ...
##  $ GKHandling                     : int  11 11 9 85 13 12 9 25 8 92 ...
##  $ GKKicking                      : int  15 15 15 87 5 6 7 31 9 78 ...
##  $ GKPositioning                  : int  14 14 15 88 10 8 14 33 7 88 ...
##  $ GKReflexes                     : int  8 11 11 94 13 8 9 37 11 89 ...
##  $ Release.clause.in.million.euros: num  226 127 228 139 196 ...
##  $ Height                         : num  170 188 175 193 180 ...