# Open the help pages for the two import functions used below.
?read.csv
?read.table

# Column classes for the 15 columns of the players file, keyed by column name.
data_types <- c(
  player = "character",
  team = "factor",
  position = "factor",
  height = "integer",
  weight = "integer",
  age = "integer",
  experience = "integer",
  college = "character",
  salary = "double",
  games = "integer",
  minutes = "integer",
  points = "integer",
  points3 = "integer",
  points2 = "integer",
  points1 = "integer"
)

# Import the same file twice, once with each reader, using identical settings
# so the two results can be compared afterwards.
nba <- read.csv(
  "~/Downloads/nba2018-players.csv",
  colClasses = data_types,
  header = TRUE,
  stringsAsFactors = FALSE
)
nba2 <- read.table(
  "~/Downloads/nba2018-players.csv",
  colClasses = data_types,
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)

# Compact structure overview. The argument is spelled `vec.len`; the previous
# spelling `ven.len` matched no formal argument and was silently ignored.
str(nba, vec.len = 1)
## NOTE: the transcript below was recorded while the argument was misspelled,
## so it shows the default number of leading values per column; with
## vec.len = 1 fewer values are printed on each line.
## 'data.frame': 477 obs. of 15 variables:
## $ player : chr "Al Horford" "Amir Johnson" "Avery Bradley" "Demetrius Jackson" ...
## $ team : Factor w/ 30 levels "ATL","BOS","BRK",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ position : Factor w/ 5 levels "C","PF","PG",..: 1 2 5 3 4 3 4 5 4 2 ...
## $ height : int 82 81 74 73 79 69 78 78 79 82 ...
## $ weight : int 245 240 180 201 205 185 235 215 225 231 ...
## $ age : int 30 29 26 22 31 27 26 21 20 29 ...
## $ experience: int 9 11 6 0 9 5 4 2 0 6 ...
## $ college : chr "University of Florida" "" "University of Texas at Austin" "University of Notre Dame" ...
## $ salary : num 26540100 12000000 8269663 1450000 1410598 ...
## $ games : int 68 80 55 5 47 76 72 29 78 78 ...
## $ minutes : int 2193 1608 1835 17 538 2569 2335 220 1341 1232 ...
## $ points : int 952 520 894 10 262 2199 999 68 515 299 ...
## $ points3 : int 86 27 108 1 39 245 157 12 46 45 ...
## $ points2 : int 293 186 251 2 56 437 176 13 146 69 ...
## $ points1 : int 108 67 68 3 33 590 176 6 85 26 ...

# Both import routes should yield exactly the same data frame.
identical(nba, nba2)
## [1] TRUE
# Distribution of the raw salary column.
hist(nba$salary)

# Create the new column 'salary2': salary divided by one million.
nba$salary2 <- as.numeric(nba$salary) / 1e6

# Plot histogram for 'salary2' in millions
hist(
  nba$salary2,
  main = "Histogram of Salary (in millions)",
  xlab = "Salary in Millions USD"
)

# Log transform with a +1 shift, i.e. log(salary2 + 1).
nba$salary2_log <- log(nba$salary2 + 1)
hist(
  nba$salary2_log,
  main = "Histogram of Log-Transformed Salary (in millions)",
  xlab = "Log of Salary in Millions USD"
)
# This graph is slightly skewed left but looks close to symmetrical once the
# log transform is performed.

# Flag very low salaries: log(salary2) < -3, which is the same condition as
# salary2 < exp(-3) (roughly 0.05 million). The test is done on the unshifted
# scale because salary2_log = log(salary2 + 1) is never negative for
# non-negative salaries, so `salary2_log < -3` could never be TRUE.
low <- nba$salary2 < exp(-3)
sum(low)

# Basic details of the players below the threshold.
low_salary_players <- nba[low, c("player", "weight", "height", "team", "position")]
low_salary_players
## NOTE: re-run to refresh the printed output; the previously recorded result
## (0 matching rows) came from comparing the shifted log against -3.
# Five-number summary (plus mean) of points scored.
summary(nba[["points"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 124.0 403.0 510.3 756.0 2558.0
hist(
  nba[["points"]],
  main = "Histogram of Points Scored",
  xlab = "Points"
)

# log(0) is -Inf, so zero-point entries are replaced by 0.1 before the log
# is taken. This overwrites the `points` column in place.
scored_nothing <- nba[["points"]] == 0
nba$points[scored_nothing] <- 0.1
nba[["log_points"]] <- log(nba[["points"]])
hist(
  nba[["log_points"]],
  main = "Histogram of Log-Transformed Points",
  xlab = "Log of Points"
)
# This graph is skewed right, which suggests that it is more frequent for
# players to score at a higher rate than the opposite.
# Salary (in millions) for each age; the title now names the plotted
# variables (it previously read "Boxplot of Points by Salary").
boxplot(
  salary2 ~ age,
  data = nba,
  xlab = "Age",
  ylab = "Salary (in millions)",
  main = "Boxplot of Salary by Age"
)

# Points scored for each age.
boxplot(
  points ~ age,
  data = nba,
  xlab = "Age",
  ylab = "Points",
  main = "Boxplot of Points by Age"
)
# Bin ages into three groups: "20-24", "25-29" and "30+".
age2 <- nba$age

# Fold every age below 20 into the youngest group. The threshold must be 20,
# not 19: the first interval below is (19, 24], which excludes 19 itself, so
# with `age2 < 19` the 19-year-olds matched no interval and became NA.
age2[age2 < 20] <- 20

age2 <- cut(
  age2,
  breaks = c(19, 24, 29, Inf),
  labels = c("20-24", "25-29", "30+"),
  right = TRUE
)
summary(age2)
## Expected after the fix, assuming the 9 NA's recorded earlier were all
## age-19 players (re-run to confirm):
## 20-24 25-29 30+
## 186 176 115
# Part 6.
# (This header used to start with the bare word `party`, which R evaluates as
# an object lookup and fails with "object 'party' not found".)

# One colour per age group, in the order of levels(age2).
palette1 <- c("#D4D62A", "#4F9D66", "#9575AB")

# Map each player's age group to its colour through the factor's integer
# codes. A player whose age2 is NA gets an NA colour.
age_colors <- palette1[as.integer(age2)]

# Salary against points, both axes on a log scale, coloured by age group.
plot(
  salary ~ points,
  data = nba,
  log = "xy",
  main = "",
  xlab = "Points (log scale)",
  ylab = "Salary (log $)",
  col = age_colors,
  pch = 19,
  cex = 0.6
)
legend(
  "bottomright",
  fill = palette1,
  legend = levels(age2),
  title = "Age Groups",
  cex = 0.8
)
# Part 7.
boxplot(
  salary2 ~ position,
  data = nba,
  xlab = "Position",
  ylab = "Salary (in millions)",
  main = "Salary by Position"
)
## For the positions: centers have fewer outliers, and their median salary is
## higher than that of the other positions. A salary difference between
## positions is visible (for example PG versus C), but the outliers even the
## pay out a little.

# Colour and display label for each position, keyed by position code.
palette_pos <- c(
  "C" = "#66c2a5", "PF" = "#fc8d62", "PG" = "#8da0cb",
  "SG" = "#e78ac3", "SF" = "#a6d854"
)
position_labels <- c(
  "C" = "Center", "PF" = "Power Forward", "PG" = "Point Guard",
  "SG" = "Shooting Guard", "SF" = "Small Forward"
)

# Per-player colours, looked up by position name.
pos_colors <- palette_pos[match(nba$position, names(palette_pos))]

# boxplot() draws one box per factor level, in level order, and takes one
# colour per box. Look the colours up by level *name*: indexing the palette
# with the factor itself would use its integer codes per row, and the palette
# order (SG before SF) differs from the level order, so boxes and legend
# would disagree.
position_levels <- levels(nba$position)
box_colors <- unname(palette_pos[position_levels])

# Make the boxplot
boxplot(
  salary2 ~ position,
  data = nba,
  main = "Salary by Position",
  xlab = "Position",
  ylab = "Salary (in millions)",
  col = box_colors,
  pch = 19,
  cex = 0.6
)

# Legend entries follow the same level order as the boxes.
legend(
  "topright",
  fill = box_colors,
  legend = unname(position_labels[position_levels]),
  title = "Position",
  cex = 0.8
)