# Remove every object returned by ls() from the current (global) environment
# so the lab starts from an empty workspace.
rm(list = ls())
EPS 700 Lab 1
Task 2: Conduct some General R Practice (1 point)
- Perform the following calculations in R
15-4
[1] 11
27+13
[1] 40
5^11
[1] 48828125
# (6 + 5) divided by the square root of 2
(6+5)/(2^(1/2))
[1] 7.778175
# One half times the square root of 200^(-2), i.e. 0.5 * (1 / 200)
(1/2)*((200^(-2))^(1/2))
[1] 0.0025
- Create and perform operations with variables
# Define the two practice variables, then divide the first by the second.
Var1 <- 6
Var2 <- 22
Var1/Var2
[1] 0.2727273
# Product of the two practice variables
Var1*Var2
[1] 132
# Var1 raised to the power Var2
Var1^Var2
[1] 1.316217e+17
# Store Var1^Var2 in Var3, then test whether it exceeds the product Var1*Var2.
Var3 <- Var1^Var2
Var3>Var1*Var2
[1] TRUE
sqrt(Var2)>sqrt(Var3-Var1)
[1] FALSE
Task 3: Importing data into R
- Install and load the following packages: ggplot2
# Try to load ggplot2; the right-hand side runs only when require() returns FALSE.
# NOTE(review): install.packages() returns NULL invisibly, so in the
# "not installed" case `FALSE || NULL` raises an error after the install.
# An explicit `if (!requireNamespace("ggplot2", quietly = TRUE))` guard avoids that.
require(ggplot2) || install.packages("ggplot2")
Loading required package: ggplot2
[1] TRUE
library(ggplot2)
- Import the Dataset
# Pick the CSV file interactively and read it into `df`
# (first row is the header; fields are comma-separated).
df <- read.csv(file.choose(), header = TRUE, sep = ",")
- Inspect the headers
head(df)
Name Country HeightInches Age Goals Assists Points Minutes GamesPlayed
1 Saka Canada 65 26 17 45 62 1333 45
2 Lapinski Canada 70 23 8 33 41 1406 49
3 Angard Canada 69 19 6 23 29 1347 55
4 Fox Canada 69 33 11 12 23 1445 65
5 Bure Canada 74 19 13 29 42 1849 72
6 Park Canada 68 32 7 17 24 2110 79
FreeAgent PlusMin
1 No -14
2 Yes 9
3 Yes 33
4 Yes 12
5 No -2
6 Yes 44
b, I actually did not see asterisks next to values, probably because this is in a quarto file
- Find the variable names
names(df)
[1] "Name" "Country" "HeightInches" "Age" "Goals"
[6] "Assists" "Points" "Minutes" "GamesPlayed" "FreeAgent"
[11] "PlusMin"
- Summarize your data
summary(df)
Name Country HeightInches Age
Length:31 Length:31 Min. :63.00 Min. :19.00
Class :character Class :character 1st Qu.:67.00 1st Qu.:23.00
Mode :character Mode :character Median :70.00 Median :25.00
Mean :69.84 Mean :25.81
3rd Qu.:72.50 3rd Qu.:29.50
Max. :77.00 Max. :33.00
Goals Assists Points Minutes GamesPlayed
Min. : 3.00 Min. : 4.00 Min. : 7.00 Min. :1239 Min. :45.00
1st Qu.: 6.50 1st Qu.:16.50 1st Qu.:25.00 1st Qu.:1494 1st Qu.:62.50
Median :11.00 Median :22.00 Median :33.00 Median :1801 Median :72.00
Mean :13.26 Mean :26.06 Mean :39.32 Mean :1729 Mean :68.94
3rd Qu.:17.00 3rd Qu.:33.00 3rd Qu.:48.50 3rd Qu.:1934 3rd Qu.:78.00
Max. :42.00 Max. :56.00 Max. :78.00 Max. :2234 Max. :82.00
FreeAgent PlusMin
Length:31 Min. :-24.00
Class :character 1st Qu.: 0.00
Mode :character Median : 9.00
Mean : 11.97
3rd Qu.: 24.50
Max. : 48.00
- Use the command describe()
# Try to load psych; the right-hand side runs only when require() returns FALSE.
# NOTE(review): install.packages() returns NULL invisibly, so in the
# "not installed" case `FALSE || NULL` raises an error after the install.
# An explicit `if (!requireNamespace("psych", quietly = TRUE))` guard avoids that.
require(psych) || install.packages("psych")
Loading required package: psych
Attaching package: 'psych'
The following objects are masked from 'package:ggplot2':
%+%, alpha
[1] TRUE
library(psych)
describe(df)
vars n mean sd median trimmed mad min max range skew
Name* 1 31 16.00 9.09 16 16.00 11.86 1 31 30 0.00
Country* 2 31 2.32 1.17 2 2.28 1.48 1 4 3 0.23
HeightInches 3 31 69.84 3.62 70 69.92 4.45 63 77 14 -0.18
Age 4 31 25.81 4.25 25 25.76 4.45 19 33 14 0.15
Goals 5 31 13.26 9.61 11 11.76 8.90 3 42 39 1.21
Assists 6 31 26.06 12.85 22 25.00 10.38 4 56 52 0.65
Points 7 31 39.32 19.51 33 37.92 13.34 7 78 71 0.63
Minutes 8 31 1728.90 291.78 1801 1728.24 366.20 1239 2234 995 -0.03
GamesPlayed 9 31 68.94 10.98 72 70.00 10.38 45 82 37 -0.68
FreeAgent* 10 31 1.61 0.50 2 1.64 0.00 1 2 1 -0.44
PlusMin 11 31 11.97 18.48 9 11.64 17.79 -24 48 72 0.18
kurtosis se
Name* -1.32 1.63
Country* -1.47 0.21
HeightInches -0.96 0.65
Age -1.25 0.76
Goals 1.06 1.73
Assists -0.44 2.31
Points -0.79 3.50
Minutes -1.23 52.41
GamesPlayed -0.87 1.97
FreeAgent* -1.86 0.09
PlusMin -0.74 3.32
If skew is greater than zero, the distribution is right-skewed: Age, Goals, Assists
If skew is less than zero, the distribution is left-skewed: HeightInches, Minutes, GamesPlayed
- Use the describeBy function, grouped by Country
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
# a. Country with the most players:
#    count the rows per Country, sort by that count in descending order,
#    and keep the first row.
most_players_country <- df %>%
  group_by(Country) %>%
  summarise(NumberOfPlayers = n()) %>%
  arrange(desc(NumberOfPlayers)) %>%
  slice(1)
# b. Country with the highest average points:
#    mean of Points per Country (NA values ignored), sorted in descending
#    order; top_n() then keeps the row(s) with the largest AveragePoints.
highest_avg_points <- df %>%
  group_by(Country) %>%
  summarize(AveragePoints = mean(Points, na.rm = TRUE)) %>%
  arrange(desc(AveragePoints)) %>%
  top_n(1, AveragePoints)
# c. Country with the lowest mean age:
#    mean of Age per Country (NA values ignored), sorted in ascending order;
#    ranking on the negated mean makes top_n() keep the smallest MeanAge.
lowest_mean_age <- df %>%
  group_by(Country) %>%
  summarize(MeanAge = mean(Age, na.rm = TRUE)) %>%
  arrange(MeanAge) %>%
  top_n(1, -MeanAge)
# d. Country with the highest variation in minutes played:
#    variation is measured as the standard deviation of Minutes per Country
#    (NA values ignored); top_n() keeps the row(s) with the largest value.
highest_variation_minutes <- df %>%
  group_by(Country) %>%
  summarize(VariationMinutes = sd(Minutes, na.rm = TRUE)) %>%
  arrange(desc(VariationMinutes)) %>%
  top_n(1, VariationMinutes)
print(most_players_country)
# A tibble: 1 × 2
Country NumberOfPlayers
<chr> <int>
1 Canada 10
print(highest_avg_points)
# A tibble: 1 × 2
Country AveragePoints
<chr> <dbl>
1 Finland 41.8
print(lowest_mean_age)
# A tibble: 1 × 2
Country MeanAge
<chr> <dbl>
1 USA 24.4
print(highest_variation_minutes)
# A tibble: 1 × 2
Country VariationMinutes
<chr> <dbl>
1 Canada 328.
Task 4: Descriptive Statistics
1. Fill out the table below for each variable in the data set.
# Summary table for Task 4: one row per variable in the data set, giving its
# definition, range, median, and measurement type. Range and Median are NA
# for the non-numeric variables.
data <- data.frame(
  Name = c("Name", "Country", "HeightInch", "Age", "Goals", "Assists",
           "Points", "Minutes", "GamesPlayed", "FreeAgent", "PlusMin"),
  Definition = c("Player Last name", "Player’s home country", "Height in inches", "Age",
                 "Goals scored in the season", "Assists registered in the season",
                 "Points (goals + assists)", "Minutes play over the course of the season",
                 "Games appeared in", "Is the player a free agent next year?",
                 "Plus-Mins rating for the player"),
  Range = c(NA, NA, 14, 14, 39, 52, 71, 995, 37, NA, 72),
  Median = c(NA, NA, 70, 25, 11, 22, 33, 1801, 72, NA, 9),
  Measurement_Type = c("Nominal", "Nominal", "Numeric", "Numeric", "Numeric",
                       "Numeric", "Numeric", "Numeric", "Numeric", "Nominal",
                       "Numeric")
)
# Replace 'N/A' with NA for proper missing values representation in R
# NOTE(review): the missing entries above are already NA, so this assignment
# leaves the table unchanged.
data[is.na(data)] <- NA
# Render the table with knitr
library(knitr)
kable(data)
Name | Definition | Range | Median | Measurement_Type |
---|---|---|---|---|
Name | Player Last name | NA | NA | Nominal |
Country | Player’s home country | NA | NA | Nominal |
HeightInch | Height in inches | 14 | 70 | Numeric |
Age | Age | 14 | 25 | Numeric |
Goals | Goals scored in the season | 39 | 11 | Numeric |
Assists | Assists registered in the season | 52 | 22 | Numeric |
Points | Points (goals + assists) | 71 | 33 | Numeric |
Minutes | Minutes play over the course of the season | 995 | 1801 | Numeric |
GamesPlayed | Games appeared in | 37 | 72 | Numeric |
FreeAgent | Is the player a free agent next year? | NA | NA | Nominal |
PlusMin | Plus-Mins rating for the player | 72 | 9 | Numeric |
What is the overall mean minutes per game of all the players?
# Make sure both columns are numeric before dividing
df$Minutes <- as.numeric(df$Minutes)
df$GamesPlayed <- as.numeric(df$GamesPlayed)
# Create the MinutesPerGame column
df <- transform(df, MinutesPerGame = Minutes / GamesPlayed)
# Sum the total minutes and total games played across all players
total_minutes <- sum(df$Minutes, na.rm = TRUE)
total_games_played <- sum(df$GamesPlayed, na.rm = TRUE)
# Calculate the overall mean minutes per game
# (total minutes over total games, not the mean of the per-player ratios)
overall_mean_minutes_per_game <- total_minutes / total_games_played
overall_mean_minutes_per_game
[1] 25.08002
How many countries are represented in the data? Which country has the most players? Which country has the highest average height?
# Number of countries represented
number_of_countries <- length(unique(df$Country))
# Country with the most players: count rows per Country and keep the largest
country_most_players <- df %>%
  group_by(Country) %>%
  summarize(NumberOfPlayers = n()) %>%
  arrange(desc(NumberOfPlayers)) %>%
  slice(1)
# Country with the highest average height: mean HeightInches per Country
# (NA values ignored), keeping the largest
country_highest_avg_height <- df %>%
  group_by(Country) %>%
  summarize(AverageHeight = mean(HeightInches, na.rm = TRUE)) %>%
  arrange(desc(AverageHeight)) %>%
  slice(1)
print(number_of_countries)
[1] 4
print(country_most_players)
# A tibble: 1 × 2 Country NumberOfPlayers <chr> <int> 1 Canada 10
print(country_highest_avg_height)
# A tibble: 1 × 2 Country AverageHeight <chr> <dbl> 1 USA 71.1
For each of the following variables, what is the mean, median, and mode?
# Function to calculate mode: returns the most frequent value of `v`.
# Each element is mapped to its position among the unique values, the
# positions are tallied, and the unique value with the largest tally is
# returned. which.max() picks the first maximum, so ties resolve to the value
# that appears first in `v`.
get_mode <- function(v) {
  uniqv <- unique(v)
  uniqv[which.max(tabulate(match(v, uniqv)))]
}
# Mean, median, and mode for each variable (NA values ignored for mean/median)
stats_age <- c(mean = mean(df$Age, na.rm = TRUE),
               median = median(df$Age, na.rm = TRUE),
               mode = get_mode(df$Age))
stats_points <- c(mean = mean(df$Points, na.rm = TRUE),
                  median = median(df$Points, na.rm = TRUE),
                  mode = get_mode(df$Points))
# NOTE(review): MinutesPerGame is a ratio; if no value repeats, the "mode"
# reported here is simply the first player's value.
stats_minutes_per_game <- c(mean = mean(df$MinutesPerGame, na.rm = TRUE),
                            median = median(df$MinutesPerGame, na.rm = TRUE),
                            mode = get_mode(df$MinutesPerGame))
stats_games_played <- c(mean = mean(df$GamesPlayed, na.rm = TRUE),
                        median = median(df$GamesPlayed, na.rm = TRUE),
                        mode = get_mode(df$GamesPlayed))
print(stats_age)
mean median mode 25.80645 25.00000 23.00000
print(stats_points)
mean median mode 39.32258 33.00000 25.00000
print(stats_minutes_per_game)
mean median mode 25.20837 25.21212 29.62222
print(stats_games_played)
mean median mode 68.93548 72.00000 72.00000
- For each of the following categories, what is the mean, median, and mode?
# Mean, median, and mode for height, minutes, and plus-minus
# (NA values ignored for mean/median; mode comes from get_mode()).
# The column is spelled out as HeightInches rather than relying on `$`
# partial matching of "HeightInch".
stats_heightinc <- c(mean = mean(df$HeightInches, na.rm = TRUE),
                     median = median(df$HeightInches, na.rm = TRUE),
                     mode = get_mode(df$HeightInches))
stats_minutes <- c(mean = mean(df$Minutes, na.rm = TRUE),
                   median = median(df$Minutes, na.rm = TRUE),
                   mode = get_mode(df$Minutes))
stats_plusmin <- c(mean = mean(df$PlusMin, na.rm = TRUE),
                   median = median(df$PlusMin, na.rm = TRUE),
                   mode = get_mode(df$PlusMin))
print(stats_heightinc)
mean median mode 69.83871 70.00000 71.00000
print(stats_minutes)
mean median mode 1728.903 1801.000 1333.000
print(stats_plusmin)
mean median mode 11.96774 9.00000 13.00000
Task 5: Creating Graphs/Charts
Create a histogram showing the distribution of Goals for all players.
library(ggplot2)
# Assuming 'Goals' is a column in your data frame 'df'
# Histogram of Goals with one bin per goal count
ggplot(df, aes(x = Goals)) +
  geom_histogram(binwidth = 1, fill = "blue", color = "black") +
  labs(title = "Histogram of Goals", x = "Goals", y = "Count") +
  theme_minimal()
The distribution lies mainly within the range of 3 to 22, with 33 and 42 as outliers. Due to the small sample size, it is hard to characterize the distribution, but it is neither normal nor uniform. Because the long tail and the outliers are on the high end (describe() reports a skew of 1.21 for Goals), I would say the shape is right-skewed.
Create a scatterplot of minutes per game
# Scatterplot of Points against MinutesPerGame, drawn with
# semi-transparent blue points.
ggplot(data = df, mapping = aes(x = MinutesPerGame, y = Points)) +
  geom_point(color = "blue", alpha = 0.5) +
  labs(
    title = "Scatterplot of Minutes per Game vs. Points",
    x = "Minutes per Game",
    y = "Points"
  ) +
  theme_minimal()
It is hard to say anything concrete about the correlation. It seems positive but not very strong.
- Create a boxplot showing assists by country
# Boxplot of Assists for each Country; x-axis labels are rotated 90 degrees.
ggplot(data = df, mapping = aes(x = Country, y = Assists)) +
  geom_boxplot(fill = "cyan") +
  labs(
    title = "Boxplot of Assists by Country",
    x = "Country",
    y = "Assists"
  ) +
  theme(axis.text.x = element_text(hjust = 1, angle = 90))
Three criteria:
The median line (inside the box) close to the center of the box.
The smallest interquartile range (IQR), which is the distance between the first and third quartiles.
Few or no outliers, which are typically indicated by dots beyond the whiskers.
Finland would be the one whose average assists are least likely to be influenced by an outlier because its data is more clustered around the median with less spread.
Feedback
1. How long did this lab take you to complete?
4h+
2. What parts, if any, were large departures from the course material?
Maybe the graphing part? Actually, all are relevant in the material
3. Which questions, if any, were unduly frustrating or challenging?
None, but it is hard to ‘present’ or ‘showcase’ the results we can easily have in the console.
4. Which questions, if any, were especially useful/interesting?
Task 5 3
The code is also publicly available at: https://rpubs.com/AlanHuang/EPS700_Lab1