# Load the readr package
library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ purrr 1.0.2
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2) # Load ggplot2 for visualization
# Raw GitHub location of the LeBron James shot-chart CSV.
theUrl <- "https://raw.githubusercontent.com/Kingtilon1/Assignmnet8-data607/main/1_lebron_james_shot_chart_1_2023.csv"
# Download and parse the CSV; readr guesses the type of each column.
lebron <- read_csv(file = theUrl)
## Rows: 1533 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): date, qtr, opponent, team, color
## dbl (7): top, left, shot_type, distance_ft, lebron_team_score, opponent_tea...
## lgl (2): result, lead
## time (1): time_remaining
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
The readr package can handle missing values: you specify which strings should be treated as missing when the file is read, as in the following example.
# Treat "N/A" and "missing" as NA in addition to readr's defaults ("" and "NA").
# Supplying `na` replaces the default set, so the defaults are listed explicitly;
# otherwise empty fields would no longer be read as missing.
theLebron <- read_csv(theUrl, na = c("", "NA", "N/A", "missing"))
## Rows: 1533 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): date, qtr, opponent, team, color
## dbl (7): top, left, shot_type, distance_ft, lebron_team_score, opponent_tea...
## lgl (2): result, lead
## time (1): time_remaining
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
The readr package can also guess the type of data within a column:
# Ask readr which parser it would pick for the values in the `left` column.
guess_parser(lebron[["left"]])
## [1] "double"
You can specify the column types when reading the file:
# Re-read the CSV with an explicit type for every column instead of relying on
# readr's guesses. Columns are matched by name, so they are grouped by type here.
lebron2 <- read_csv(
  file = theUrl,
  col_types = cols(
    # Text columns (time_remaining is kept as text rather than parsed as a time)
    date = col_character(),
    qtr = col_character(),
    time_remaining = col_character(),
    opponent = col_character(),
    team = col_character(),
    color = col_character(),
    # Real-valued columns
    top = col_double(),
    left = col_double(),
    distance_ft = col_double(),
    # Whole-number columns
    shot_type = col_integer(),
    lebron_team_score = col_integer(),
    opponent_team_score = col_integer(),
    season = col_integer(),
    # TRUE/FALSE columns
    result = col_logical(),
    lead = col_logical()
  )
)
I also want to mention the relocate function from the dplyr package. It is useful when you want to prioritize certain columns so that they are seen before others, or when it makes more sense to shift some columns around without removing them completely.
# Reorder columns only: place `top` and `left` immediately after `qtr`.
lebron_switch <- relocate(lebron, top, left, .after = qtr)
Here we moved the top and left columns so that they appear after the qtr column.
The readr package is great for reading rectangular data: not just csv files, but tab-separated value files as well. It also makes parsing data easy, as we’ve seen in how we imported a csv file from GitHub into our local R Markdown file for analysis.
# Task 2: Filter Rows, dplyr package
# Keep only the rows where `result` is TRUE. `result` is already a logical
# column, so it is used directly as the condition instead of being compared
# with the string "TRUE", and it is referenced through dplyr's data mask
# rather than as `lebron$result`. Rows where `result` is NA are dropped.
lebron_filtered <- lebron %>%
  filter(result)
# Task 3: Create New Variables
# Add the score margin for each row: LeBron's team score minus the opponent's.
lebron <- lebron %>%
  mutate(ScoreDifference = lebron_team_score - opponent_team_score)
# Grouping functions (dplyr package): mean score margin per quarter,
# ignoring missing values.
lebron_summary <- summarise(
  group_by(lebron, qtr),
  average_score = mean(ScoreDifference, na.rm = TRUE)
)
# Task 5: Visualize Data
# Scatter plot of the filtered shots at their (left, top) coordinates,
# with point colour mapped to the quarter.
ggplot(lebron_filtered, aes(x = left, y = top, color = qtr)) +
  geom_point() +
  labs(title = "Lebron James Shot Distribution by Quarter")