1 Exploring “Fifa 2021 Players Dataset” available in kaggle

(For Practicing data cleaning , standard data organisation, Exploratory data Analysis and arrangement )

1.1 Loading librarys

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr)

1.2 Importing data

f21DF <- read_csv("fifa21_raw_data.csv")
## Rows: 18979 Columns: 77
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (22): photoUrl, LongName, playerUrl, Nationality, Positions, Name, Team ...
## dbl (55): Age, ↓OVA, POT, ID, BOV, Growth, Attacking, Crossing, Finishing, H...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

1.3 View of the dataset

Have a view and knowlwdge of the imported dataset:-

There are many functions in R e.g. str(), glimpse() , summary() but I needed to create a compact summary where the column names, count of columns, data type, Number of null values per columns, number of unique values per columns, Min, Max and Mean of each columns (whose data type is numeric) can be viewed in a single table. Therefore taking inspiration from the info() function of “pandas” library I have crafted an info() function here in R and it results the following data.

info <- function(data) {
    cat("Data Frame Information:\n")
    cat("Number of Rows: ", nrow(data), "\n")
    cat("Number of Columns: ", ncol(data), "\n")
    cat("\nColumn Information:\n")
    col_info <- data.frame(Column = names(data), DType = sapply(data, class), Null_Values = sapply(data,
        function(x) sum(is.na(x))), Unique_Values = sapply(data, function(x) length(unique(x))),
        Mean = round(sapply(data, function(x) if (is.numeric(x)) mean(x, na.rm = TRUE) else NA),
            4), Median = round(sapply(data, function(x) if (is.numeric(x)) median(x,
            na.rm = TRUE) else NA), 4), Min = sapply(data, function(x) if (is.numeric(x))
            min(x, na.rm = TRUE) else NA), Max = sapply(data, function(x) if (is.numeric(x))
            max(x, na.rm = TRUE) else NA), row.names = NULL  # Exclude the default row names
)
    # Create a mapping of data type abbreviations
    data_type_abbreviations <- c(integer = "int", character = "chr", numeric = "dbl")

    # Format the Data_Type column using the abbreviations
    col_info$DType <- data_type_abbreviations[col_info$DType]

    col_info
}
info(f21DF)
## Data Frame Information:
## Number of Rows:  18979 
## Number of Columns:  77 
## 
## Column Information:

1.4 Glimpse

Have a view and knowlwdge of the imported dataset by using the glimpse() function of R:-

Here we can have a glimpse of the actual data stored in the columns.

glimpse(f21DF)
## Rows: 18,979
## Columns: 77
## $ photoUrl           <chr> "https://cdn.sofifa.com/players/158/023/21_60.png",…
## $ LongName           <chr> "Lionel Messi", "C. Ronaldo dos Santos Aveiro", "Ja…
## $ playerUrl          <chr> "http://sofifa.com/player/158023/lionel-messi/21000…
## $ Nationality        <chr> "Argentina", "Portugal", "Slovenia", "Belgium", "Br…
## $ Positions          <chr> "RW ST CF", "ST LW", "GK", "CAM CM", "LW CAM", "ST"…
## $ Name               <chr> "L. Messi", "Cristiano Ronaldo", "J. Oblak", "K. De…
## $ Age                <dbl> 33, 35, 27, 29, 28, 31, 21, 27, 28, 28, 28, 28, 28,…
## $ `↓OVA`             <dbl> 93, 92, 91, 91, 91, 91, 90, 90, 90, 90, 90, 90, 89,…
## $ POT                <dbl> 93, 92, 93, 91, 91, 91, 95, 91, 90, 90, 91, 93, 89,…
## $ `Team & Contract`  <chr> "\n\n\n\nFC Barcelona\n2004 ~ 2021\n\n", "\n\n\n\nJ…
## $ ID                 <dbl> 158023, 20801, 200389, 192985, 190871, 188545, 2317…
## $ Height             <chr> "5'7\"", "6'2\"", "6'2\"", "5'11\"", "5'9\"", "6'0\…
## $ Weight             <chr> "159lbs", "183lbs", "192lbs", "154lbs", "150lbs", "…
## $ foot               <chr> "Left", "Right", "Right", "Right", "Right", "Right"…
## $ BOV                <dbl> 93, 92, 91, 91, 91, 91, 91, 90, 90, 90, 90, 90, 89,…
## $ BP                 <chr> "RW", "ST", "GK", "CAM", "LW", "ST", "ST", "GK", "R…
## $ Growth             <dbl> 0, 0, 2, 0, 0, 0, 5, 1, 0, 0, 1, 3, 0, 1, 0, 0, 0, …
## $ Joined             <chr> "Jul 1, 2004", "Jul 10, 2018", "Jul 16, 2014", "Aug…
## $ `Loan Date End`    <chr> "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N…
## $ Value              <chr> "€67.5M", "€46M", "€75M", "€87M", "€90M", "€80M", "…
## $ Wage               <chr> "€560K", "€220K", "€125K", "€370K", "€270K", "€240K…
## $ `Release Clause`   <chr> "€138.4M", "€75.9M", "€159.4M", "€161M", "€166.5M",…
## $ Attacking          <dbl> 429, 437, 95, 407, 408, 423, 408, 114, 392, 410, 31…
## $ Crossing           <dbl> 85, 84, 13, 94, 85, 71, 78, 17, 79, 76, 53, 18, 58,…
## $ Finishing          <dbl> 95, 95, 11, 82, 87, 94, 91, 13, 91, 90, 52, 14, 64,…
## $ `Heading Accuracy` <dbl> 70, 90, 15, 55, 62, 85, 73, 19, 59, 84, 87, 11, 80,…
## $ `Short Passing`    <dbl> 91, 82, 43, 94, 87, 84, 83, 45, 84, 85, 79, 61, 84,…
## $ Volleys            <dbl> 88, 86, 13, 82, 87, 89, 83, 20, 79, 75, 45, 14, 63,…
## $ Skill              <dbl> 470, 414, 109, 441, 448, 407, 394, 138, 406, 391, 3…
## $ Dribbling          <dbl> 96, 88, 12, 88, 95, 85, 92, 27, 90, 91, 70, 21, 69,…
## $ Curve              <dbl> 93, 81, 13, 85, 88, 79, 79, 19, 83, 76, 60, 18, 63,…
## $ `FK Accuracy`      <dbl> 94, 76, 14, 83, 89, 85, 63, 18, 69, 64, 70, 12, 74,…
## $ `Long Passing`     <dbl> 91, 77, 40, 93, 81, 70, 70, 44, 75, 71, 86, 63, 84,…
## $ `Ball Control`     <dbl> 96, 92, 30, 92, 95, 88, 90, 30, 89, 89, 77, 30, 79,…
## $ Movement           <dbl> 451, 431, 307, 398, 453, 407, 458, 268, 460, 460, 3…
## $ Acceleration       <dbl> 91, 87, 43, 77, 94, 77, 96, 56, 94, 95, 72, 38, 60,…
## $ `Sprint Speed`     <dbl> 80, 91, 60, 76, 89, 78, 96, 47, 92, 93, 79, 50, 69,…
## $ Agility            <dbl> 91, 87, 67, 78, 96, 77, 92, 40, 91, 93, 61, 37, 61,…
## $ Reactions          <dbl> 94, 95, 88, 91, 91, 93, 92, 88, 92, 93, 89, 86, 87,…
## $ Balance            <dbl> 95, 71, 49, 76, 83, 82, 82, 37, 91, 86, 53, 43, 66,…
## $ Power              <dbl> 389, 444, 268, 408, 357, 420, 404, 240, 393, 406, 4…
## $ `Shot Power`       <dbl> 86, 94, 59, 91, 80, 89, 86, 64, 80, 84, 81, 66, 88,…
## $ Jumping            <dbl> 68, 95, 78, 63, 62, 84, 77, 52, 69, 86, 90, 79, 87,…
## $ Stamina            <dbl> 72, 84, 41, 89, 81, 76, 86, 32, 85, 88, 75, 35, 90,…
## $ Strength           <dbl> 69, 78, 78, 74, 50, 86, 76, 78, 75, 70, 92, 78, 91,…
## $ `Long Shots`       <dbl> 94, 93, 12, 91, 84, 85, 79, 14, 84, 78, 64, 10, 81,…
## $ Mentality          <dbl> 347, 353, 140, 408, 356, 391, 341, 140, 376, 358, 3…
## $ Aggression         <dbl> 44, 63, 34, 76, 51, 81, 62, 27, 63, 75, 83, 43, 91,…
## $ Interceptions      <dbl> 40, 29, 19, 66, 36, 49, 38, 11, 55, 35, 90, 22, 87,…
## $ Positioning        <dbl> 93, 95, 11, 88, 87, 94, 91, 13, 91, 92, 47, 11, 72,…
## $ Vision             <dbl> 95, 82, 65, 94, 90, 79, 80, 66, 84, 85, 65, 70, 80,…
## $ Penalties          <dbl> 75, 84, 11, 84, 92, 88, 70, 23, 83, 71, 62, 25, 66,…
## $ Composure          <dbl> 96, 95, 68, 91, 93, 88, 84, 65, 90, 84, 90, 70, 84,…
## $ Defending          <dbl> 91, 84, 57, 186, 94, 96, 100, 50, 122, 122, 272, 48…
## $ Marking            <dbl> 32, 28, 27, 68, 35, 35, 34, 15, 38, 42, 93, 25, 84,…
## $ `Standing Tackle`  <dbl> 35, 32, 12, 65, 30, 42, 34, 19, 43, 42, 93, 13, 88,…
## $ `Sliding Tackle`   <dbl> 24, 24, 18, 53, 29, 19, 32, 16, 41, 38, 86, 10, 87,…
## $ Goalkeeping        <dbl> 54, 58, 437, 56, 59, 51, 42, 439, 62, 56, 58, 439, …
## $ `GK Diving`        <dbl> 6, 7, 87, 15, 9, 15, 13, 86, 14, 10, 13, 88, 13, 84…
## $ `GK Handling`      <dbl> 11, 11, 92, 13, 9, 6, 5, 88, 14, 10, 10, 85, 14, 89…
## $ `GK Kicking`       <dbl> 15, 15, 78, 5, 15, 12, 7, 85, 9, 15, 13, 88, 16, 74…
## $ `GK Positioning`   <dbl> 14, 14, 90, 10, 15, 8, 11, 91, 11, 7, 11, 88, 12, 8…
## $ `GK Reflexes`      <dbl> 8, 11, 90, 13, 11, 10, 6, 89, 14, 14, 11, 90, 12, 8…
## $ `Total Stats`      <dbl> 2231, 2221, 1413, 2304, 2175, 2195, 2147, 1389, 221…
## $ `Base Stats`       <dbl> 466, 464, 489, 485, 451, 457, 466, 490, 470, 469, 4…
## $ `W/F`              <chr> "4 ★", "4 ★", "3 ★", "5 ★", "5 ★", "4 ★", "4 ★", "3…
## $ SM                 <chr> "4★", "5★", "1★", "4★", "5★", "4★", "5★", "1★", "4★…
## $ `A/W`              <chr> "Medium", "High", "Medium", "High", "High", "High",…
## $ `D/W`              <chr> "Low", "Low", "Medium", "High", "Medium", "Medium",…
## $ IR                 <chr> "5 ★", "5 ★", "3 ★", "4 ★", "5 ★", "4 ★", "3 ★", "3…
## $ PAC                <dbl> 85, 89, 87, 76, 91, 78, 96, 86, 93, 94, 76, 88, 65,…
## $ SHO                <dbl> 92, 93, 92, 86, 85, 91, 86, 88, 86, 85, 60, 85, 73,…
## $ PAS                <dbl> 91, 81, 78, 93, 86, 78, 78, 85, 81, 80, 71, 88, 76,…
## $ DRI                <dbl> 95, 89, 90, 88, 94, 85, 91, 89, 90, 90, 71, 90, 72,…
## $ DEF                <dbl> 38, 35, 52, 64, 36, 43, 39, 51, 45, 44, 91, 45, 86,…
## $ PHY                <dbl> 65, 77, 90, 78, 59, 82, 76, 91, 75, 76, 86, 88, 91,…
## $ Hits               <chr> "\n372", "\n344", "\n86", "\n163", "\n273", "\n182"…

1.5 Some obvious questions to answer

1.5.0.1 What datatype are the height column? can it be changed to desired datatype? if possible please transform it.

Ans :- The datatype of height column are character (5’7”), which should be changed to numeric , otherwise no numerical analysis can be done involving those columns.By using the separate() function from “tidyr” package in the following code chunk the Height column is separated to “feet” and “inches” column with numeric data type. For the ease of analysis a separate col “height_in_cm” is also calculated from “feet and”inches” column.

temp <- separate(f21DF, Height, into = c("feet", "inches"), sep = "'|\"", convert = TRUE)
temp$Height <- f21DF$Height
f21DF <- temp  # Modified the original Data Frame
temp <- data.frame()  #Flush the temporary dataframe
# View(f21DF) head(f21DF|>select(LongName,Height,feet,inches))
f21DF$Height_in_CM <- f21DF$feet * 30.48 + f21DF$inches * 2.54
head(f21DF |>
    select(LongName, Height, feet, inches, Height_in_CM))

1.5.0.2 What datatype are the weight column? can it be changed to desired datatype? if possible please transform it.

Ans: The datatype of weight column are character, which should be changed to numeric , otherwise no numerical analysis can be done involving those columns.By using the separate() function from “tidyr” package in the following code chunk the numeric value is extracted from Weight column to a new column “weight_in_pound”. For the ease of analysis a separate col “weight_in_kg” is also calculated.

f21DF$weight_in_pound <- as.numeric(str_extract(f21DF$Weight, "\\d+"))
f21DF$weight_in_kg <- f21DF$weight_in_pound * 0.45359237
head(f21DF |>
    select(LongName, Height, feet, inches, Height_in_CM, Weight, weight_in_pound,
        weight_in_kg))

1.5.0.3 Can the “Joined” column be separated in “year”, “month” and “day”?

Ans : In the previous 1.4 section (i.e. the glimpse() section) we can see that the “Joined” column has data like “Jul 1, 2004”. By using the separate() function from “tidyr” package in the following code chunk I have spliced the “Joined” column into three different columns.

temp <- separate(f21DF, Joined, into = c("month", "day", "year"), convert = TRUE)
temp$Joined <- f21DF$Joined
f21DF <- temp  # Modified the original Data Frame
temp <- data.frame()  #Flush the temporary dataframe
head(f21DF |>
    select(LongName, Joined, month, day, year))

1.5.0.4 Can you clean and transform the value, wage and “release clause” columns into columns of numeric value?

Ans : The data in Value , Wage and “release clause” column is given in the format “€138.4M”. In the following code the the numeric value of the stated columns are separated in “value_num” , “wage_num” and “relcls_num” for further analysis.

f21DF$value_num <- as.numeric(gsub("[^0-9.]", "", f21DF$Value))
f21DF$wage_num <- as.numeric(gsub("[^0-9.]", "", f21DF$Wage))
f21DF$relcls_num <- as.numeric(gsub("[^0-9.]", "", f21DF$`Release Clause`))
head(f21DF |>
    select(LongName, Value, value_num, Wage, wage_num, `Release Clause`, relcls_num))

1.5.0.5 How can you remove the New Line characters from the Hits colums?

Ans : The data in Hits column is given in the format “”. In the following code the the numeric value of Hits column is separated in “hits_num” for further analysis.

f21DF$hits_num<- as.numeric(gsub("[^0-9.]", "", f21DF$Hits))
head(f21DF|>select(LongName,Hits,hits_num))

1.5.0.6 Separate “Team & contract” column into separate “Team” and “Contract” column ?

Ans :The data in “Team & Contract” column is given in the format “Barcelona ~ 2021”. In the following code the data is separated into “team” and “contracts” column.

# Function to clean and split a string
clean_and_split <- function(text) {
  cleaned_text <- trimws(text, whitespace = "\n")
  parts <- unlist(strsplit(cleaned_text, "\n"))
  return(parts)
}

# Apply the function to each element in the vector
result <- sapply(f21DF$`Team & Contract`, clean_and_split)
#View(result)

# Extract the team names and date ranges
team <- result[1, ]
contracts <- result[2, ]
f21DF$team<-team
f21DF$contracts<-contracts
head(f21DF|>select(LongName,`Team & Contract`,team,contracts))