(For Practicing data cleaning , standard data organisation, Exploratory data Analysis and arrangement )
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr)
f21DF <- read_csv("fifa21_raw_data.csv")
## Rows: 18979 Columns: 77
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (22): photoUrl, LongName, playerUrl, Nationality, Positions, Name, Team ...
## dbl (55): Age, ↓OVA, POT, ID, BOV, Growth, Attacking, Crossing, Finishing, H...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Have a view and knowlwdge of the imported dataset:-
There are many functions in R e.g. str(), glimpse() , summary() but I needed to create a compact summary where the column names, count of columns, data type, Number of null values per columns, number of unique values per columns, Min, Max and Mean of each columns (whose data type is numeric) can be viewed in a single table. Therefore taking inspiration from the info() function of “pandas” library I have crafted an info() function here in R and it results the following data.
info <- function(data) {
cat("Data Frame Information:\n")
cat("Number of Rows: ", nrow(data), "\n")
cat("Number of Columns: ", ncol(data), "\n")
cat("\nColumn Information:\n")
col_info <- data.frame(Column = names(data), DType = sapply(data, class), Null_Values = sapply(data,
function(x) sum(is.na(x))), Unique_Values = sapply(data, function(x) length(unique(x))),
Mean = round(sapply(data, function(x) if (is.numeric(x)) mean(x, na.rm = TRUE) else NA),
4), Median = round(sapply(data, function(x) if (is.numeric(x)) median(x,
na.rm = TRUE) else NA), 4), Min = sapply(data, function(x) if (is.numeric(x))
min(x, na.rm = TRUE) else NA), Max = sapply(data, function(x) if (is.numeric(x))
max(x, na.rm = TRUE) else NA), row.names = NULL # Exclude the default row names
)
# Create a mapping of data type abbreviations
data_type_abbreviations <- c(integer = "int", character = "chr", numeric = "dbl")
# Format the Data_Type column using the abbreviations
col_info$DType <- data_type_abbreviations[col_info$DType]
col_info
}
info(f21DF)
## Data Frame Information:
## Number of Rows: 18979
## Number of Columns: 77
##
## Column Information:
Have a view and knowlwdge of the imported dataset by using the glimpse() function of R:-
Here we can have a glimpse of the actual data stored in the columns.
glimpse(f21DF)
## Rows: 18,979
## Columns: 77
## $ photoUrl <chr> "https://cdn.sofifa.com/players/158/023/21_60.png",…
## $ LongName <chr> "Lionel Messi", "C. Ronaldo dos Santos Aveiro", "Ja…
## $ playerUrl <chr> "http://sofifa.com/player/158023/lionel-messi/21000…
## $ Nationality <chr> "Argentina", "Portugal", "Slovenia", "Belgium", "Br…
## $ Positions <chr> "RW ST CF", "ST LW", "GK", "CAM CM", "LW CAM", "ST"…
## $ Name <chr> "L. Messi", "Cristiano Ronaldo", "J. Oblak", "K. De…
## $ Age <dbl> 33, 35, 27, 29, 28, 31, 21, 27, 28, 28, 28, 28, 28,…
## $ `↓OVA` <dbl> 93, 92, 91, 91, 91, 91, 90, 90, 90, 90, 90, 90, 89,…
## $ POT <dbl> 93, 92, 93, 91, 91, 91, 95, 91, 90, 90, 91, 93, 89,…
## $ `Team & Contract` <chr> "\n\n\n\nFC Barcelona\n2004 ~ 2021\n\n", "\n\n\n\nJ…
## $ ID <dbl> 158023, 20801, 200389, 192985, 190871, 188545, 2317…
## $ Height <chr> "5'7\"", "6'2\"", "6'2\"", "5'11\"", "5'9\"", "6'0\…
## $ Weight <chr> "159lbs", "183lbs", "192lbs", "154lbs", "150lbs", "…
## $ foot <chr> "Left", "Right", "Right", "Right", "Right", "Right"…
## $ BOV <dbl> 93, 92, 91, 91, 91, 91, 91, 90, 90, 90, 90, 90, 89,…
## $ BP <chr> "RW", "ST", "GK", "CAM", "LW", "ST", "ST", "GK", "R…
## $ Growth <dbl> 0, 0, 2, 0, 0, 0, 5, 1, 0, 0, 1, 3, 0, 1, 0, 0, 0, …
## $ Joined <chr> "Jul 1, 2004", "Jul 10, 2018", "Jul 16, 2014", "Aug…
## $ `Loan Date End` <chr> "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N…
## $ Value <chr> "€67.5M", "€46M", "€75M", "€87M", "€90M", "€80M", "…
## $ Wage <chr> "€560K", "€220K", "€125K", "€370K", "€270K", "€240K…
## $ `Release Clause` <chr> "€138.4M", "€75.9M", "€159.4M", "€161M", "€166.5M",…
## $ Attacking <dbl> 429, 437, 95, 407, 408, 423, 408, 114, 392, 410, 31…
## $ Crossing <dbl> 85, 84, 13, 94, 85, 71, 78, 17, 79, 76, 53, 18, 58,…
## $ Finishing <dbl> 95, 95, 11, 82, 87, 94, 91, 13, 91, 90, 52, 14, 64,…
## $ `Heading Accuracy` <dbl> 70, 90, 15, 55, 62, 85, 73, 19, 59, 84, 87, 11, 80,…
## $ `Short Passing` <dbl> 91, 82, 43, 94, 87, 84, 83, 45, 84, 85, 79, 61, 84,…
## $ Volleys <dbl> 88, 86, 13, 82, 87, 89, 83, 20, 79, 75, 45, 14, 63,…
## $ Skill <dbl> 470, 414, 109, 441, 448, 407, 394, 138, 406, 391, 3…
## $ Dribbling <dbl> 96, 88, 12, 88, 95, 85, 92, 27, 90, 91, 70, 21, 69,…
## $ Curve <dbl> 93, 81, 13, 85, 88, 79, 79, 19, 83, 76, 60, 18, 63,…
## $ `FK Accuracy` <dbl> 94, 76, 14, 83, 89, 85, 63, 18, 69, 64, 70, 12, 74,…
## $ `Long Passing` <dbl> 91, 77, 40, 93, 81, 70, 70, 44, 75, 71, 86, 63, 84,…
## $ `Ball Control` <dbl> 96, 92, 30, 92, 95, 88, 90, 30, 89, 89, 77, 30, 79,…
## $ Movement <dbl> 451, 431, 307, 398, 453, 407, 458, 268, 460, 460, 3…
## $ Acceleration <dbl> 91, 87, 43, 77, 94, 77, 96, 56, 94, 95, 72, 38, 60,…
## $ `Sprint Speed` <dbl> 80, 91, 60, 76, 89, 78, 96, 47, 92, 93, 79, 50, 69,…
## $ Agility <dbl> 91, 87, 67, 78, 96, 77, 92, 40, 91, 93, 61, 37, 61,…
## $ Reactions <dbl> 94, 95, 88, 91, 91, 93, 92, 88, 92, 93, 89, 86, 87,…
## $ Balance <dbl> 95, 71, 49, 76, 83, 82, 82, 37, 91, 86, 53, 43, 66,…
## $ Power <dbl> 389, 444, 268, 408, 357, 420, 404, 240, 393, 406, 4…
## $ `Shot Power` <dbl> 86, 94, 59, 91, 80, 89, 86, 64, 80, 84, 81, 66, 88,…
## $ Jumping <dbl> 68, 95, 78, 63, 62, 84, 77, 52, 69, 86, 90, 79, 87,…
## $ Stamina <dbl> 72, 84, 41, 89, 81, 76, 86, 32, 85, 88, 75, 35, 90,…
## $ Strength <dbl> 69, 78, 78, 74, 50, 86, 76, 78, 75, 70, 92, 78, 91,…
## $ `Long Shots` <dbl> 94, 93, 12, 91, 84, 85, 79, 14, 84, 78, 64, 10, 81,…
## $ Mentality <dbl> 347, 353, 140, 408, 356, 391, 341, 140, 376, 358, 3…
## $ Aggression <dbl> 44, 63, 34, 76, 51, 81, 62, 27, 63, 75, 83, 43, 91,…
## $ Interceptions <dbl> 40, 29, 19, 66, 36, 49, 38, 11, 55, 35, 90, 22, 87,…
## $ Positioning <dbl> 93, 95, 11, 88, 87, 94, 91, 13, 91, 92, 47, 11, 72,…
## $ Vision <dbl> 95, 82, 65, 94, 90, 79, 80, 66, 84, 85, 65, 70, 80,…
## $ Penalties <dbl> 75, 84, 11, 84, 92, 88, 70, 23, 83, 71, 62, 25, 66,…
## $ Composure <dbl> 96, 95, 68, 91, 93, 88, 84, 65, 90, 84, 90, 70, 84,…
## $ Defending <dbl> 91, 84, 57, 186, 94, 96, 100, 50, 122, 122, 272, 48…
## $ Marking <dbl> 32, 28, 27, 68, 35, 35, 34, 15, 38, 42, 93, 25, 84,…
## $ `Standing Tackle` <dbl> 35, 32, 12, 65, 30, 42, 34, 19, 43, 42, 93, 13, 88,…
## $ `Sliding Tackle` <dbl> 24, 24, 18, 53, 29, 19, 32, 16, 41, 38, 86, 10, 87,…
## $ Goalkeeping <dbl> 54, 58, 437, 56, 59, 51, 42, 439, 62, 56, 58, 439, …
## $ `GK Diving` <dbl> 6, 7, 87, 15, 9, 15, 13, 86, 14, 10, 13, 88, 13, 84…
## $ `GK Handling` <dbl> 11, 11, 92, 13, 9, 6, 5, 88, 14, 10, 10, 85, 14, 89…
## $ `GK Kicking` <dbl> 15, 15, 78, 5, 15, 12, 7, 85, 9, 15, 13, 88, 16, 74…
## $ `GK Positioning` <dbl> 14, 14, 90, 10, 15, 8, 11, 91, 11, 7, 11, 88, 12, 8…
## $ `GK Reflexes` <dbl> 8, 11, 90, 13, 11, 10, 6, 89, 14, 14, 11, 90, 12, 8…
## $ `Total Stats` <dbl> 2231, 2221, 1413, 2304, 2175, 2195, 2147, 1389, 221…
## $ `Base Stats` <dbl> 466, 464, 489, 485, 451, 457, 466, 490, 470, 469, 4…
## $ `W/F` <chr> "4 ★", "4 ★", "3 ★", "5 ★", "5 ★", "4 ★", "4 ★", "3…
## $ SM <chr> "4★", "5★", "1★", "4★", "5★", "4★", "5★", "1★", "4★…
## $ `A/W` <chr> "Medium", "High", "Medium", "High", "High", "High",…
## $ `D/W` <chr> "Low", "Low", "Medium", "High", "Medium", "Medium",…
## $ IR <chr> "5 ★", "5 ★", "3 ★", "4 ★", "5 ★", "4 ★", "3 ★", "3…
## $ PAC <dbl> 85, 89, 87, 76, 91, 78, 96, 86, 93, 94, 76, 88, 65,…
## $ SHO <dbl> 92, 93, 92, 86, 85, 91, 86, 88, 86, 85, 60, 85, 73,…
## $ PAS <dbl> 91, 81, 78, 93, 86, 78, 78, 85, 81, 80, 71, 88, 76,…
## $ DRI <dbl> 95, 89, 90, 88, 94, 85, 91, 89, 90, 90, 71, 90, 72,…
## $ DEF <dbl> 38, 35, 52, 64, 36, 43, 39, 51, 45, 44, 91, 45, 86,…
## $ PHY <dbl> 65, 77, 90, 78, 59, 82, 76, 91, 75, 76, 86, 88, 91,…
## $ Hits <chr> "\n372", "\n344", "\n86", "\n163", "\n273", "\n182"…
Ans :- The datatype of height column are character (5’7”), which should be changed to numeric , otherwise no numerical analysis can be done involving those columns.By using the separate() function from “tidyr” package in the following code chunk the Height column is separated to “feet” and “inches” column with numeric data type. For the ease of analysis a separate col “height_in_cm” is also calculated from “feet and”inches” column.
temp <- separate(f21DF, Height, into = c("feet", "inches"), sep = "'|\"", convert = TRUE)
temp$Height <- f21DF$Height
f21DF <- temp # Modified the original Data Frame
temp <- data.frame() #Flush the temporary dataframe
# View(f21DF) head(f21DF|>select(LongName,Height,feet,inches))
f21DF$Height_in_CM <- f21DF$feet * 30.48 + f21DF$inches * 2.54
head(f21DF |>
select(LongName, Height, feet, inches, Height_in_CM))
Ans: The datatype of weight column are character, which should be changed to numeric , otherwise no numerical analysis can be done involving those columns.By using the separate() function from “tidyr” package in the following code chunk the numeric value is extracted from Weight column to a new column “weight_in_pound”. For the ease of analysis a separate col “weight_in_kg” is also calculated.
f21DF$weight_in_pound <- as.numeric(str_extract(f21DF$Weight, "\\d+"))
f21DF$weight_in_kg <- f21DF$weight_in_pound * 0.45359237
head(f21DF |>
select(LongName, Height, feet, inches, Height_in_CM, Weight, weight_in_pound,
weight_in_kg))
Ans : In the previous 1.4 section (i.e. the glimpse() section) we can see that the “Joined” column has data like “Jul 1, 2004”. By using the separate() function from “tidyr” package in the following code chunk I have spliced the “Joined” column into three different columns.
temp <- separate(f21DF, Joined, into = c("month", "day", "year"), convert = TRUE)
temp$Joined <- f21DF$Joined
f21DF <- temp # Modified the original Data Frame
temp <- data.frame() #Flush the temporary dataframe
head(f21DF |>
select(LongName, Joined, month, day, year))
Ans : The data in Value , Wage and “release clause” column is given in the format “€138.4M”. In the following code the the numeric value of the stated columns are separated in “value_num” , “wage_num” and “relcls_num” for further analysis.
f21DF$value_num <- as.numeric(gsub("[^0-9.]", "", f21DF$Value))
f21DF$wage_num <- as.numeric(gsub("[^0-9.]", "", f21DF$Wage))
f21DF$relcls_num <- as.numeric(gsub("[^0-9.]", "", f21DF$`Release Clause`))
head(f21DF |>
select(LongName, Value, value_num, Wage, wage_num, `Release Clause`, relcls_num))
Ans : The data in Hits column is given in the format “”. In the following code the the numeric value of Hits column is separated in “hits_num” for further analysis.
f21DF$hits_num<- as.numeric(gsub("[^0-9.]", "", f21DF$Hits))
head(f21DF|>select(LongName,Hits,hits_num))
Ans :The data in “Team & Contract” column is given in the format “Barcelona ~ 2021”. In the following code the data is separated into “team” and “contracts” column.
# Function to clean and split a string
clean_and_split <- function(text) {
cleaned_text <- trimws(text, whitespace = "\n")
parts <- unlist(strsplit(cleaned_text, "\n"))
return(parts)
}
# Apply the function to each element in the vector
result <- sapply(f21DF$`Team & Contract`, clean_and_split)
#View(result)
# Extract the team names and date ranges
team <- result[1, ]
contracts <- result[2, ]
f21DF$team<-team
f21DF$contracts<-contracts
head(f21DF|>select(LongName,`Team & Contract`,team,contracts))