# Set the CRAN mirror
options(repos = "https://cran.r-project.org")

# Install necessary packages
install.packages("rpart")
Installing package into 'C:/Users/C00303097/AppData/Local/R/win-library/4.3'
(as 'lib' is unspecified)
package 'rpart' successfully unpacked and MD5 sums checked
Warning: cannot remove prior installation of package 'rpart'
Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
C:\Users\C00303097\AppData\Local\R\win-library\4.3\00LOCK\rpart\libs\x64\rpart.dll
to C:\Users\C00303097\AppData\Local\R\win-library\4.3\rpart\libs\x64\rpart.dll:
Permission denied
Warning: restored 'rpart'
The downloaded binary packages are in
C:\Users\C00303097\AppData\Local\Temp\RtmpC8QyEh\downloaded_packages
install.packages("rpart.plot")
Installing package into 'C:/Users/C00303097/AppData/Local/R/win-library/4.3'
(as 'lib' is unspecified)
package 'rpart.plot' successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\C00303097\AppData\Local\Temp\RtmpC8QyEh\downloaded_packages
install.packages("tidyverse")
Installing package into 'C:/Users/C00303097/AppData/Local/R/win-library/4.3'
(as 'lib' is unspecified)
package 'tidyverse' successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\C00303097\AppData\Local\Temp\RtmpC8QyEh\downloaded_packages
install.packages("rattle")
Installing package into 'C:/Users/C00303097/AppData/Local/R/win-library/4.3'
(as 'lib' is unspecified)
package 'rattle' successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\C00303097\AppData\Local\Temp\RtmpC8QyEh\downloaded_packages
install.packages("TTR")
Installing package into 'C:/Users/C00303097/AppData/Local/R/win-library/4.3'
(as 'lib' is unspecified)
package 'TTR' successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\C00303097\AppData\Local\Temp\RtmpC8QyEh\downloaded_packages
install.packages("readr")
Installing package into 'C:/Users/C00303097/AppData/Local/R/win-library/4.3'
(as 'lib' is unspecified)
package 'readr' successfully unpacked and MD5 sums checked
Warning: cannot remove prior installation of package 'readr'
Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
C:\Users\C00303097\AppData\Local\R\win-library\4.3\00LOCK\readr\libs\x64\readr.dll
to C:\Users\C00303097\AppData\Local\R\win-library\4.3\readr\libs\x64\readr.dll:
Permission denied
Warning: restored 'readr'
The downloaded binary packages are in
C:\Users\C00303097\AppData\Local\Temp\RtmpC8QyEh\downloaded_packages
# Load required libraries
library(tidyverse)
Warning: package 'tidyverse' was built under R version 4.3.3
Warning: package 'ggplot2' was built under R version 4.3.3
Warning: package 'tidyr' was built under R version 4.3.3
Warning: package 'readr' was built under R version 4.3.3
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(rpart)
Warning: package 'rpart' was built under R version 4.3.3
library(rattle)
Warning: package 'rattle' was built under R version 4.3.3
Loading required package: bitops
Rattle: A free graphical interface for data science with R.
Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
Type 'rattle()' to shake, rattle, and roll your data.
library(TTR)
Warning: package 'TTR' was built under R version 4.3.3
library(readr)
library(rpart.plot)
Warning: package 'rpart.plot' was built under R version 4.3.3
pl_training <- read_csv("pl_training.csv")
Rows: 600 Columns: 12
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): team, wdl_ft, wdl_ht, home_or_away
dbl (8): ftg_diff, htg_diff, s_diff, st_diff, f_diff, c_diff, y_diff, r_diff
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pl_testing <- read_csv("pl_testing.csv")
Rows: 160 Columns: 12
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): team, wdl_ft, wdl_ht, home_or_away
dbl (8): ftg_diff, htg_diff, s_diff, st_diff, f_diff, c_diff, y_diff, r_diff
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
2a
# Load required libraries
library(rpart)
library(rpart.plot)

# Step 1: Load the data (Replace 'pl_training.csv' with your actual file name)
data <- read.csv("pl_training.csv")

# Step 2: Data Preprocessing (if needed)

# Step 3: Train the Classification Tree Model
# Assuming 'home_or_away' is the target variable to classify a team as either home or away
# and the other columns are features

# Create and train the classification tree model
model <- rpart(home_or_away ~ ., data = data, method = "class")

# Visualize the trained model with adjusted plot parameters
rpart.plot(model, yesno = 2, type = 2, extra = 101, cex = 0.6, tweak = 0.9, fallen.leaves = TRUE)
Warning: cex and tweak both specified, applying both
2bi
# Load required libraries
library(rpart)
library(rpart.plot)

# Step 1: Load the data (Replace 'pl_training.csv' with your actual file name)
data <- read.csv("pl_training.csv")

# Step 2: Train the Classification Tree Model
# Assuming 'home_or_away' is the target variable and the other columns are features

# Create and train the classification tree model
model <- rpart(home_or_away ~ ., data = data, method = "class")

# Step 3: Visualize the Model
# Visualize the trained model with adjusted plot parameters
rpart.plot(model, yesno = 2, type = 2, extra = 101, cex = 0.6, tweak = 0.9, fallen.leaves = TRUE)
Warning: cex and tweak both specified, applying both
# Step 4: Extract Rules and Assess Node Purity

# Function to extract rules from an rpart object
extract_rules <- function(tree, prefix = "") {
  if (is.null(tree)) return(NULL)
  # If the node is terminal, return the prediction
  if (is.null(tree$frame$var)) {
    rule <- paste(prefix, "=>", ifelse(tree$frame$yval == "home", "home", "away"))
    return(rule)
  }
  # Extract rules for the left and right children
  left_rule  <- extract_rules(tree$left,  paste(prefix, tree$frame$var, "<=", round(tree$frame$split, 2)))
  right_rule <- extract_rules(tree$right, paste(prefix, tree$frame$var, ">",  round(tree$frame$split, 2)))
  return(list(left_rule, right_rule))
}

# Extract rules from the model
rules <- extract_rules(model)

# Print rules
cat("Rules for predicting if a team is the home team:\n")
Rules for predicting if a team is the home team:
for (i in 1:length(rules)) {
  if (!is.null(rules[[i]])) {
    cat("i. ", rules[[i]], "\n")
  }
}
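Because an rpart fit does not store its child nodes as `$left`/`$right` components, the recursive helper above returns no rules, which is why nothing is printed under the header. A minimal alternative sketch, assuming the fitted `model` object from the chunk above, reads the leaf rules directly with rpart.plot:

# Sketch: list the decision rule for each leaf of the fitted tree
# (assumes `model` is the rpart object trained above)
library(rpart.plot)
rules_tbl <- rpart.rules(model, cover = TRUE)  # one row per leaf, with predicted class and coverage
print(rules_tbl)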
2bii
# Function to extract rules from an rpart object
extract_rules <- function(tree, prefix = "") {
  if (is.null(tree)) return(NULL)
  # If the node is terminal, return the prediction
  if (length(tree$frame$var) == 1 && tree$frame$var == "<leaf>") {
    rule <- paste(prefix, "=>", ifelse(tree$frame$yval == "away", "Away team", "Not Away team"))
    return(rule)
  }
  # Extract rules for the left and right children
  left_rule  <- extract_rules(tree$left,  paste(prefix, tree$frame$var, "<=", round(tree$frame$split, 2)))
  right_rule <- extract_rules(tree$right, paste(prefix, tree$frame$var, ">",  round(tree$frame$split, 2)))
  return(list(left_rule, right_rule))
}

# Extract rules from the model
rules <- extract_rules(model)

# Print rules
cat("Rules for predicting if a team is the away team:\n")
Rules for predicting if a team is the away team:
for (i in 1:length(rules)) {
  if (!is.null(rules[[i]])) {
    cat("i. ", rules[[i]], "\n")
  }
}
2biii
# Extract variable importance values from the model
var_importance <- model$variable.importance

# Sort the variable importance values in descending order
sorted_importance <- sort(var_importance, decreasing = TRUE)

# Print the variable importance values
cat("Variable Importance for Predicting Home or Away Team:\n")
Variable Importance for Predicting Home or Away Team:
for (i in 1:length(sorted_importance)) {
  cat(i, ". ", names(sorted_importance)[i], ": ", sorted_importance[i], "\n")
}
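A quick plot can make the importance ranking easier to read; the sketch below assumes the `sorted_importance` vector computed above:

# Sketch: visualise the sorted importance scores (assumes `sorted_importance` from above)
barplot(sorted_importance,
        horiz = TRUE, las = 1, cex.names = 0.7,
        main = "Variable importance (rpart)",
        xlab = "Importance")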
library(caret)
Warning: package 'caret' was built under R version 4.3.3
Loading required package: lattice
Attaching package: 'caret'
The following object is masked from 'package:purrr':
lift
# Step 1: Load the data (Replace 'pl_training.csv' with your actual file name)
data <- read.csv("pl_training.csv")

# Step 2: Split the data into training and testing sets
set.seed(123)  # for reproducibility
train_index <- createDataPartition(data$home_or_away, p = 0.7, list = FALSE)
train_data <- data[train_index, ]
test_data  <- data[-train_index, ]

# Step 3: Train the Classification Tree Model
# Create and train the classification tree model
model <- rpart(home_or_away ~ ., data = train_data, method = "class")

# Step 4: Assess the accuracy on training data
train_pred <- predict(model, train_data, type = "class")
train_accuracy <- mean(train_pred == train_data$home_or_away)
cat("Accuracy on training data:", train_accuracy, "\n")
Accuracy on training data: 0.7505938
# Step 5: Assess the accuracy on testing data
test_pred <- predict(model, test_data, type = "class")
test_accuracy <- mean(test_pred == test_data$home_or_away)
cat("Accuracy on testing data:", test_accuracy, "\n")
Accuracy on testing data: 0.5195531
# These results show that the classification tree is noticeably more accurate on the training data (about 75%)
# than on the held-out testing data (about 52%). Such a drop on unseen data suggests the model is overfitting
# the training dataset.
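One common response to this kind of overfitting is cost-complexity pruning. The sketch below is a minimal illustration, assuming the `model`, `train_data` and `test_data` objects created in the chunk above:

# Sketch: prune at the complexity parameter with the lowest cross-validated error
# (assumes `model`, `train_data` and `test_data` from the chunk above)
best_cp <- model$cptable[which.min(model$cptable[, "xerror"]), "CP"]
pruned_model <- prune(model, cp = best_cp)

# Re-check accuracy of the pruned tree on the held-out data
pruned_pred <- predict(pruned_model, test_data, type = "class")
cat("Pruned tree accuracy on testing data:", mean(pruned_pred == test_data$home_or_away), "\n")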
3ai
# Load the dplyr package
library(dplyr)

# Now, you can use the %>% operator
modified_data <- data %>%
  mutate(
    rpts_diff = ftg_diff,        # Example: assuming 'ftg_diff' represents the goal difference
    dist_diff = s_diff - c_diff  # Example: assuming 's_diff' is the shots difference and 'c_diff' the corners difference
  ) %>%
  select(
    team,          # Include the team variable if needed
    home_or_away,  # Ensure the target variable is kept
    rpts_diff,     # Goal difference variable
    dist_diff      # Shots difference minus corners difference
    # Add other relevant variables as needed
  )

# View the modified dataset
head(modified_data)
team home_or_away rpts_diff dist_diff
1 Watford Home 1 9
2 Southampton Home 0 -5
3 Crystal Palace Home 1 0
4 Bournemouth Home 1 -3
5 West Ham Home -3 -9
6 Leicester Home 0 1
# Convert 'home_or_away' to a factor with appropriate levels
# (note: the levels must match the case used in the data, e.g. "Home"/"Away" if the values are capitalised)
data$home_or_away <- factor(data$home_or_away, levels = c("home", "away"))

# Check the levels of the factor variable
levels(data$home_or_away)
[1] "home" "away"
3aii
Clearly state the regression equation.
$$y = \ln\left(\frac{\pi}{1-\pi}\right) = b_0 + b_1 X_1 + b_2 X_2 + \cdots + b_K X_K$$
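To turn this general form into a fitted equation, the coefficients can be read off a logistic regression fitted to the two derived predictors. A minimal sketch, assuming the `modified_data` frame from 3ai (the `home_flag` helper name is illustrative, with the home team coded as 1):

# Sketch: fit the logistic model and report the coefficients b0, b1, b2
# (assumes `modified_data` from 3ai; home team coded as 1, away as 0)
modified_data$home_flag <- ifelse(modified_data$home_or_away == "Home", 1, 0)
logit_fit <- glm(home_flag ~ rpts_diff + dist_diff,
                 data = modified_data, family = binomial(link = "logit"))
coef(logit_fit)  # intercept b0 and slopes b1 (rpts_diff), b2 (dist_diff)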
3aiii
# Extract variable importance values from the model
var_importance <- model$variable.importance

# Sort the variable importance values in descending order
sorted_importance <- sort(var_importance, decreasing = TRUE)

# Print the variable importance values
cat("Variable Importance for Predicting Home or Away Team:\n")
Variable Importance for Predicting Home or Away Team:
for (i in 1:length(sorted_importance)) {
  cat(names(sorted_importance)[i], ": ", sorted_importance[i], "\n")
}
summary(model)
Call:
rpart(formula = home_or_away ~ ., data = train_data, method = "class")
n= 421
CP nsplit rel error xerror xstd
1 0.22966507 0 1.0000000 1.1722488 0.04842304
2 0.04545455 1 0.7703349 0.8612440 0.04856881
3 0.02153110 3 0.6794258 0.9330144 0.04895355
4 0.01913876 5 0.6363636 0.8947368 0.04877996
5 0.01435407 7 0.5980861 0.8755981 0.04866612
6 0.01196172 12 0.5263158 0.8755981 0.04866612
7 0.01000000 14 0.5023923 0.8947368 0.04877996
Variable importance
team s_diff f_diff ftg_diff st_diff c_diff y_diff htg_diff
24 21 13 10 10 6 5 5
wdl_ht wdl_ft
3 3
Node number 1: 421 observations, complexity param=0.2296651
predicted class=Away expected loss=0.4964371 P(node) =1
class counts: 212 209
probabilities: 0.504 0.496
left son=2 (221 obs) right son=3 (200 obs)
Primary splits:
s_diff < 0.5 to the left, improve=11.633930, (0 missing)
ftg_diff < 0.5 to the left, improve=10.229100, (0 missing)
wdl_ft splits as LLR, improve=10.229100, (0 missing)
st_diff < -0.5 to the left, improve= 8.647258, (0 missing)
c_diff < -3.5 to the left, improve= 5.798350, (0 missing)
Surrogate splits:
st_diff < 0.5 to the left, agree=0.812, adj=0.605, (0 split)
c_diff < -0.5 to the left, agree=0.705, adj=0.380, (0 split)
team splits as RLLRLRLLRRRLLLLLRLLL, agree=0.703, adj=0.375, (0 split)
ftg_diff < 0.5 to the left, agree=0.672, adj=0.310, (0 split)
wdl_ft splits as LLR, agree=0.672, adj=0.310, (0 split)
Node number 2: 221 observations, complexity param=0.04545455
predicted class=Away expected loss=0.3846154 P(node) =0.5249406
class counts: 136 85
probabilities: 0.615 0.385
left son=4 (171 obs) right son=5 (50 obs)
Primary splits:
ftg_diff < 0.5 to the left, improve=4.933747, (0 missing)
wdl_ft splits as LLR, improve=4.933747, (0 missing)
team splits as RLRRLRLRLRLLLLLLLRRL, improve=4.101378, (0 missing)
s_diff < -12.5 to the left, improve=3.281703, (0 missing)
htg_diff < 0.5 to the left, improve=2.683966, (0 missing)
Surrogate splits:
htg_diff < 0.5 to the left, agree=0.869, adj=0.42, (0 split)
wdl_ht splits as LLR, agree=0.869, adj=0.42, (0 split)
st_diff < 0.5 to the left, agree=0.801, adj=0.12, (0 split)
team splits as LLLRLLLLLLLLLLLLLLLL, agree=0.783, adj=0.04, (0 split)
y_diff < -4.5 to the right, agree=0.778, adj=0.02, (0 split)
Node number 3: 200 observations, complexity param=0.0215311
predicted class=Home expected loss=0.38 P(node) =0.4750594
class counts: 76 124
probabilities: 0.380 0.620
left son=6 (140 obs) right son=7 (60 obs)
Primary splits:
team splits as LLRLRLRRLLLRLLRRLLRR, improve=3.687619, (0 missing)
f_diff < 0.5 to the right, improve=3.527685, (0 missing)
ftg_diff < -1.5 to the left, improve=3.240000, (0 missing)
c_diff < 5.5 to the left, improve=2.157821, (0 missing)
s_diff < 4.5 to the left, improve=1.800440, (0 missing)
Surrogate splits:
s_diff < 1.5 to the right, agree=0.745, adj=0.150, (0 split)
st_diff < -2.5 to the right, agree=0.710, adj=0.033, (0 split)
c_diff < -9.5 to the right, agree=0.710, adj=0.033, (0 split)
f_diff < 8 to the left, agree=0.705, adj=0.017, (0 split)
Node number 4: 171 observations, complexity param=0.01435407
predicted class=Away expected loss=0.3274854 P(node) =0.4061758
class counts: 115 56
probabilities: 0.673 0.327
left son=8 (37 obs) right son=9 (134 obs)
Primary splits:
s_diff < -12.5 to the left, improve=3.493884, (0 missing)
team splits as RLLRRLLLLRLLLLLLLRRL, improve=3.107897, (0 missing)
c_diff < -3.5 to the left, improve=2.635426, (0 missing)
f_diff < 0.5 to the right, improve=1.411128, (0 missing)
y_diff < -0.5 to the right, improve=1.246140, (0 missing)
Surrogate splits:
st_diff < -7.5 to the left, agree=0.842, adj=0.270, (0 split)
c_diff < -9.5 to the left, agree=0.842, adj=0.270, (0 split)
ftg_diff < -4.5 to the left, agree=0.789, adj=0.027, (0 split)
Node number 5: 50 observations, complexity param=0.04545455
predicted class=Home expected loss=0.42 P(node) =0.1187648
class counts: 21 29
probabilities: 0.420 0.580
left son=10 (25 obs) right son=11 (25 obs)
Primary splits:
team splits as LLRLLRRR--RRLLLLRRRL, improve=9.000000, (0 missing)
wdl_ht splits as LRR, improve=2.605322, (0 missing)
s_diff < -2.5 to the right, improve=1.458039, (0 missing)
htg_diff < 0.5 to the left, improve=1.388571, (0 missing)
c_diff < -5.5 to the right, improve=0.858645, (0 missing)
Surrogate splits:
f_diff < 2.5 to the left, agree=0.68, adj=0.36, (0 split)
s_diff < -4.5 to the right, agree=0.64, adj=0.28, (0 split)
y_diff < -1.5 to the left, agree=0.62, adj=0.24, (0 split)
htg_diff < 0.5 to the left, agree=0.60, adj=0.20, (0 split)
wdl_ht splits as LRR, agree=0.60, adj=0.20, (0 split)
Node number 6: 140 observations, complexity param=0.0215311
predicted class=Home expected loss=0.4428571 P(node) =0.3325416
class counts: 62 78
probabilities: 0.443 0.557
left son=12 (57 obs) right son=13 (83 obs)
Primary splits:
f_diff < 0.5 to the right, improve=3.561301, (0 missing)
ftg_diff < -1.5 to the left, improve=3.053007, (0 missing)
s_diff < 4.5 to the left, improve=2.786395, (0 missing)
c_diff < 5.5 to the left, improve=2.166733, (0 missing)
wdl_ft splits as RLR, improve=1.921921, (0 missing)
Surrogate splits:
team splits as RR-R-L--RRL-LR--RL--, agree=0.679, adj=0.211, (0 split)
c_diff < -0.5 to the left, agree=0.664, adj=0.175, (0 split)
y_diff < 1.5 to the right, agree=0.643, adj=0.123, (0 split)
s_diff < 2.5 to the left, agree=0.629, adj=0.088, (0 split)
ftg_diff < -1.5 to the left, agree=0.600, adj=0.018, (0 split)
Node number 7: 60 observations
predicted class=Home expected loss=0.2333333 P(node) =0.1425178
class counts: 14 46
probabilities: 0.233 0.767
Node number 8: 37 observations
predicted class=Away expected loss=0.1351351 P(node) =0.08788599
class counts: 32 5
probabilities: 0.865 0.135
Node number 9: 134 observations, complexity param=0.01435407
predicted class=Away expected loss=0.380597 P(node) =0.3182898
class counts: 83 51
probabilities: 0.619 0.381
left son=18 (90 obs) right son=19 (44 obs)
Primary splits:
team splits as RLLRRLLLLRLLLLRLLRRL, improve=3.560923, (0 missing)
f_diff < 2.5 to the right, improve=1.912438, (0 missing)
ftg_diff < -2.5 to the right, improve=1.813744, (0 missing)
c_diff < 4.5 to the right, improve=1.111644, (0 missing)
st_diff < -6.5 to the right, improve=1.016406, (0 missing)
Surrogate splits:
st_diff < -6.5 to the right, agree=0.716, adj=0.136, (0 split)
s_diff < -11.5 to the right, agree=0.694, adj=0.068, (0 split)
c_diff < -8.5 to the right, agree=0.687, adj=0.045, (0 split)
f_diff < 9.5 to the left, agree=0.679, adj=0.023, (0 split)
Node number 10: 25 observations
predicted class=Away expected loss=0.28 P(node) =0.05938242
class counts: 18 7
probabilities: 0.720 0.280
Node number 11: 25 observations
predicted class=Home expected loss=0.12 P(node) =0.05938242
class counts: 3 22
probabilities: 0.120 0.880
Node number 12: 57 observations, complexity param=0.01435407
predicted class=Away expected loss=0.4210526 P(node) =0.1353919
class counts: 33 24
probabilities: 0.579 0.421
left son=24 (28 obs) right son=25 (29 obs)
Primary splits:
team splits as LL-L-R--RLR-LR--LR--, improve=2.0160750, (0 missing)
st_diff < -0.5 to the left, improve=1.2351880, (0 missing)
ftg_diff < -0.5 to the left, improve=0.8894737, (0 missing)
wdl_ft splits as RLR, improve=0.8894737, (0 missing)
s_diff < 14.5 to the left, improve=0.7741676, (0 missing)
Surrogate splits:
c_diff < 0.5 to the right, agree=0.667, adj=0.321, (0 split)
ftg_diff < 1.5 to the right, agree=0.596, adj=0.179, (0 split)
htg_diff < 0.5 to the right, agree=0.596, adj=0.179, (0 split)
s_diff < 7.5 to the right, agree=0.596, adj=0.179, (0 split)
y_diff < 0.5 to the left, agree=0.596, adj=0.179, (0 split)
Node number 13: 83 observations, complexity param=0.01913876
predicted class=Home expected loss=0.3493976 P(node) =0.1971496
class counts: 29 54
probabilities: 0.349 0.651
left son=26 (20 obs) right son=27 (63 obs)
Primary splits:
s_diff < 4.5 to the left, improve=3.309543, (0 missing)
f_diff < -8.5 to the left, improve=2.795214, (0 missing)
team splits as RR-L-L--LLL-RL--RL--, improve=2.158220, (0 missing)
c_diff < 5.5 to the left, improve=1.807308, (0 missing)
st_diff < 4.5 to the left, improve=1.777832, (0 missing)
Surrogate splits:
team splits as RR-R-R--RRR-LL--RR--, agree=0.795, adj=0.15, (0 split)
c_diff < -0.5 to the left, agree=0.783, adj=0.10, (0 split)
Node number 18: 90 observations
predicted class=Away expected loss=0.3 P(node) =0.2137767
class counts: 63 27
probabilities: 0.700 0.300
Node number 19: 44 observations, complexity param=0.01435407
predicted class=Home expected loss=0.4545455 P(node) =0.1045131
class counts: 20 24
probabilities: 0.455 0.545
left son=38 (15 obs) right son=39 (29 obs)
Primary splits:
ftg_diff < -1.5 to the left, improve=2.0480670, (0 missing)
f_diff < 5.5 to the left, improve=1.6770050, (0 missing)
wdl_ft splits as RL-, improve=1.6174100, (0 missing)
y_diff < 1.5 to the right, improve=0.9546582, (0 missing)
team splits as L--LL----R----L--RR-, improve=0.6687565, (0 missing)
Surrogate splits:
htg_diff < -1.5 to the left, agree=0.750, adj=0.267, (0 split)
st_diff < -4.5 to the left, agree=0.727, adj=0.200, (0 split)
team splits as R--RR----R----L--RR-, agree=0.682, adj=0.067, (0 split)
c_diff < 2.5 to the right, agree=0.682, adj=0.067, (0 split)
y_diff < -1.5 to the left, agree=0.682, adj=0.067, (0 split)
Node number 24: 28 observations
predicted class=Away expected loss=0.2857143 P(node) =0.06650831
class counts: 20 8
probabilities: 0.714 0.286
Node number 25: 29 observations
predicted class=Home expected loss=0.4482759 P(node) =0.06888361
class counts: 13 16
probabilities: 0.448 0.552
Node number 26: 20 observations, complexity param=0.01913876
predicted class=Away expected loss=0.4 P(node) =0.04750594
class counts: 12 8
probabilities: 0.600 0.400
left son=52 (10 obs) right son=53 (10 obs)
Primary splits:
team splits as RR-L-L--LLL-RR--R---, improve=3.600000, (0 missing)
htg_diff < 0.5 to the right, improve=1.034343, (0 missing)
wdl_ht splits as RRL, improve=1.034343, (0 missing)
st_diff < 0.5 to the right, improve=0.632967, (0 missing)
y_diff < -0.5 to the left, improve=0.632967, (0 missing)
Surrogate splits:
s_diff < 3.5 to the right, agree=0.75, adj=0.5, (0 split)
st_diff < -0.5 to the right, agree=0.75, adj=0.5, (0 split)
f_diff < -2.5 to the right, agree=0.70, adj=0.4, (0 split)
ftg_diff < 1.5 to the right, agree=0.65, adj=0.3, (0 split)
htg_diff < 0.5 to the right, agree=0.65, adj=0.3, (0 split)
Node number 27: 63 observations, complexity param=0.01435407
predicted class=Home expected loss=0.2698413 P(node) =0.1496437
class counts: 17 46
probabilities: 0.270 0.730
left son=54 (7 obs) right son=55 (56 obs)
Primary splits:
f_diff < -8.5 to the left, improve=3.1111110, (0 missing)
wdl_ht splits as LLR, improve=1.6253970, (0 missing)
htg_diff < 0.5 to the left, improve=1.6253970, (0 missing)
team splits as RR-L-R--RRR-RR--RL--, improve=1.5704950, (0 missing)
ftg_diff < 1.5 to the left, improve=0.8396825, (0 missing)
Surrogate splits:
ftg_diff < -1.5 to the left, agree=0.905, adj=0.143, (0 split)
s_diff < 21.5 to the right, agree=0.905, adj=0.143, (0 split)
Node number 38: 15 observations
predicted class=Away expected loss=0.3333333 P(node) =0.03562945
class counts: 10 5
probabilities: 0.667 0.333
Node number 39: 29 observations, complexity param=0.01196172
predicted class=Home expected loss=0.3448276 P(node) =0.06888361
class counts: 10 19
probabilities: 0.345 0.655
left son=78 (22 obs) right son=79 (7 obs)
Primary splits:
f_diff < 5 to the left, improve=2.194357, (0 missing)
y_diff < 1.5 to the right, improve=1.987659, (0 missing)
c_diff < 0.5 to the left, improve=1.829764, (0 missing)
htg_diff < -0.5 to the right, improve=1.718833, (0 missing)
wdl_ht splits as LR-, improve=1.718833, (0 missing)
Surrogate splits:
team splits as L--LL----R----L--LL-, agree=0.793, adj=0.143, (0 split)
Node number 52: 10 observations
predicted class=Away expected loss=0.1 P(node) =0.02375297
class counts: 9 1
probabilities: 0.900 0.100
Node number 53: 10 observations
predicted class=Home expected loss=0.3 P(node) =0.02375297
class counts: 3 7
probabilities: 0.300 0.700
Node number 54: 7 observations
predicted class=Away expected loss=0.2857143 P(node) =0.01662708
class counts: 5 2
probabilities: 0.714 0.286
Node number 55: 56 observations
predicted class=Home expected loss=0.2142857 P(node) =0.1330166
class counts: 12 44
probabilities: 0.214 0.786
Node number 78: 22 observations, complexity param=0.01196172
predicted class=Home expected loss=0.4545455 P(node) =0.05225653
class counts: 10 12
probabilities: 0.455 0.545
left son=156 (7 obs) right son=157 (15 obs)
Primary splits:
y_diff < 1.5 to the right, improve=3.328139, (0 missing)
f_diff < 1 to the right, improve=2.194805, (0 missing)
c_diff < 0.5 to the left, improve=1.994805, (0 missing)
htg_diff < -0.5 to the right, improve=1.644134, (0 missing)
wdl_ht splits as LR-, improve=1.644134, (0 missing)
Surrogate splits:
f_diff < 1 to the right, agree=0.864, adj=0.571, (0 split)
team splits as R--RR---------R--LL-, agree=0.773, adj=0.286, (0 split)
s_diff < -1.5 to the right, agree=0.727, adj=0.143, (0 split)
st_diff < -5.5 to the left, agree=0.727, adj=0.143, (0 split)
Node number 79: 7 observations
predicted class=Home expected loss=0 P(node) =0.01662708
class counts: 0 7
probabilities: 0.000 1.000
Node number 156: 7 observations
predicted class=Away expected loss=0.1428571 P(node) =0.01662708
class counts: 6 1
probabilities: 0.857 0.143
Node number 157: 15 observations
predicted class=Home expected loss=0.2666667 P(node) =0.03562945
class counts: 4 11
probabilities: 0.267 0.733
# Extract significant predictor variables (assuming the p-value is less than 0.05)
# (coef_summary is assumed to hold the coefficient table of a fitted logistic model,
#  e.g. summary(logistic_model)$coefficients)
significant_vars <- coef_summary[coef_summary[, "Pr(>|z|)"] < 0.05, ]

# Function to calculate impact based on a coefficient
calculate_impact <- function(coefficient) {
  odds_ratio <- exp(coefficient)    # Calculate the odds ratio
  impact <- (odds_ratio - 1) * 100  # Convert to a percentage change in the odds
  return(impact)
}

# Print significant variables
print(significant_vars)
NULL
# Calculate impact for each significant predictor variable
impact_values <- sapply(significant_vars[, "Estimate"], calculate_impact)

# Print results
cat("Impact of significant predictor variables on the odds of a team being classified as the home team:\n")
Impact of significant predictor variables on the odds of a team being classified as the home team:
for (i in 1:length(impact_values)) {
  cat(names(impact_values)[i], ": ",
      ifelse(is.na(impact_values[i]), "NA", paste0(impact_values[i], "%")), "\n")
}
: NULL%
:
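For a concrete sense of what `calculate_impact()` reports (using a hypothetical coefficient, since no significant coefficients were available above): a coefficient of 0.5 corresponds to an odds ratio of $e^{0.5} \approx 1.65$, i.e. roughly a 65% increase in the odds of being classified as the home team for a one-unit increase in that predictor, with the other predictors held fixed.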
# Coefficients for each predictor variable
coefficients <- c(
  team     = 29.07178,
  s_diff   = 25.14559,
  f_diff   = 15.59095,
  ftg_diff = 12.6297,
  st_diff  = 11.86843,
  c_diff   = 7.390272,
  y_diff   = 6.520717,
  htg_diff = 5.858338,
  wdl_ht   = 3.872174,
  wdl_ft   = 3.606517
)

# Function to calculate impact based on a coefficient
calculate_impact <- function(coefficient) {
  odds_ratio <- exp(coefficient)  # Calculate the odds ratio
  return(odds_ratio)              # Return the odds ratio
}

# Calculate impact for each predictor variable
impact_values <- sapply(coefficients, calculate_impact)

# Print results
cat("Impact of significant predictor variables on the odds of a team being classified as the home team:\n")
Impact of significant predictor variables on the odds of a team being classified as the home team:
for (var in names(impact_values)) {
  cat(var, ": ", round(impact_values[var], digits = 2), "\n")
}
# Load required libraries
library(rpart)
library(rpart.plot)
library(dplyr)
library(caret)

# Step 1: Load the data (Replace 'pl_training.csv' with your actual file name)
data <- read.csv("pl_training.csv")

# Step 2: Data Preprocessing (if needed)
# Assuming no preprocessing is required

# Step 3: Train the Classification Tree Model
# Create and train the classification tree model
tree_model <- rpart(home_or_away ~ ., data = data, method = "class")

# Step 4: Train the Binary Logistic Regression Model
# Recode target variable to binary (0 and 1)
data$home_or_away <- ifelse(data$home_or_away == "home", 1, 0)

# Create and train the logistic regression model
logistic_model <- glm(home_or_away ~ ., data = data, family = binomial(link = "logit"))
Warning: glm.fit: algorithm did not converge
# Step 5: Model Evaluation
# Compare Accuracy

# Classification Tree Model
tree_accuracy <- sum(predict(tree_model, type = "class") == data$home_or_away) / nrow(data)

# Binary Logistic Regression Model
logistic_pred <- ifelse(predict(logistic_model, type = "response") > 0.5, 1, 0)
logistic_accuracy <- sum(logistic_pred == data$home_or_away) / nrow(data)

# Print Accuracy Comparison
cat("Accuracy Comparison:\n")
Accuracy Comparison:
cat("Classification Tree Model Accuracy:", tree_accuracy, "\n")
Classification Tree Model Accuracy: 0
cat("Binary Logistic Regression Model Accuracy:", logistic_accuracy, "\n")
Binary Logistic Regression Model Accuracy: 1
# Step 6: Important Predictors Comparison

# Classification Tree Model
# Extract important predictors from the tree
tree_important_predictors <- tree_model$variable.importance

# Binary Logistic Regression Model
# Extract significant predictors from the logistic regression model
logistic_significant_predictors <- summary(logistic_model)$coefficients[
  summary(logistic_model)$coefficients[, "Pr(>|z|)"] < 0.05, ]

# Print Important Predictors Comparison
cat("\nImportant Predictors Comparison:\n")
Important Predictors Comparison:
cat("Classification Tree Model Important Predictors:\n")
cat("\nBinary Logistic Regression Model Significant Predictors:\n")
Binary Logistic Regression Model Significant Predictors:
print(logistic_significant_predictors)
Estimate Std. Error z value Pr(>|z|)
The Binary Logistic Regression model appears to achieve perfect accuracy while the Classification Tree model scores zero, but both figures are artefacts of the recoding step: the target values are "Home"/"Away" while the recoding tests for lowercase "home", so the logistic model is fitted to an all-zero response (hence the convergence warning) and the tree's class-label predictions are compared against 0/1 values.
The Classification Tree model identified several important predictors, such as team, c_diff, and ftg_diff, whereas no significant predictors were shown for the Binary Logistic Regression model.
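A minimal sketch of a more like-for-like check, keeping a single factor coding of the target for both models (assuming pl_training.csv as loaded earlier):

# Sketch: compare the two models on a common factor coding of the target
# (assumes pl_training.csv as loaded earlier; "Home" is the second factor level)
data <- read.csv("pl_training.csv")
data$home_or_away <- factor(data$home_or_away)  # one coding for both models

tree_model  <- rpart(home_or_away ~ ., data = data, method = "class")
logit_model <- glm(home_or_away ~ ., data = data, family = binomial)

tree_pred  <- predict(tree_model, type = "class")
logit_pred <- ifelse(predict(logit_model, type = "response") > 0.5,
                     levels(data$home_or_away)[2], levels(data$home_or_away)[1])

cat("Tree accuracy:    ", mean(tree_pred  == data$home_or_away), "\n")
cat("Logistic accuracy:", mean(logit_pred == data$home_or_away), "\n")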
Question 2
1
# Read the CSV file into a data frame
baseball_data <- read.csv("baseball_hof.csv")

# Display the first few rows of the data frame
head(baseball_data)
Yes, scaling the data helps to ensure that the clustering algorithms treat all variables equally and produce meaningful clusters based on the actual relationships between the data points, rather than being biased by differences in variable scales.
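As a small illustration of this point (a sketch assuming the numeric columns of `baseball_data`, with the playerID column in position one excluded):

# Sketch: compare column spreads before and after scaling
# (assumes baseball_data as read above, with the first column being playerID)
num_cols <- baseball_data[, -1]
apply(num_cols, 2, sd)         # raw standard deviations differ widely across variables
apply(scale(num_cols), 2, sd)  # after scale(), every variable has sd = 1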
# Import the baseball_hof.csv file into R
baseball_data <- read.csv("baseball_hof.csv")

# Display the structure of the data
str(baseball_data)
# Scale the data
scaled_data <- scale(baseball_data[, -1])  # Exclude the playerID column for scaling

# Perform hierarchical clustering
hc <- hclust(dist(scaled_data), method = "complete")

# Plot the dendrogram
x11()
plot(hc, main = "Dendrogram of Baseball Players")

# Perform K-means clustering
set.seed(123)  # For reproducibility
kmeans_result <- kmeans(scaled_data, centers = 3)

# Add cluster labels to the original data
baseball_data$cluster <- as.factor(kmeans_result$cluster)

# Display the first few rows of the data with cluster labels
head(baseball_data)
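The choice of three centres above is one option; a quick elbow plot can help justify the number of clusters. A minimal sketch, assuming the `scaled_data` matrix from the chunk above:

# Sketch: total within-cluster sum of squares for k = 1..10 (elbow method)
# (assumes `scaled_data` from the chunk above)
set.seed(123)
wss <- sapply(1:10, function(k) kmeans(scaled_data, centers = k, nstart = 25)$tot.withinss)
plot(1:10, wss, type = "b",
     xlab = "Number of clusters k",
     ylab = "Total within-cluster sum of squares")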
# Import the baseball_hof.csv file into R
baseball_data <- read.csv("baseball_hof.csv")

# Scale the data
scaled_data <- scale(baseball_data[, -1])  # Exclude the playerID column for scaling

# Create a distance matrix using Euclidean distance
distance_matrix <- dist(scaled_data)

# Display the distance matrix
distance_matrix
# Import the baseball_hof.csv file into R
baseball_data <- read.csv("baseball_hof.csv")

# Scale the data
scaled_data <- scale(baseball_data[, -1])  # Exclude the playerID column for scaling

# Perform hierarchical clustering with method = 'ward.D'
hc <- hclust(dist(scaled_data), method = "ward.D")

# Plot the dendrogram
x11()
plot(hc, main = "Dendrogram of Baseball Players (Ward's Method)")
3c
# Install the required packages
install.packages("gplots")
Installing package into 'C:/Users/C00303097/AppData/Local/R/win-library/4.3'
(as 'lib' is unspecified)
package 'gplots' successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\C00303097\AppData\Local\Temp\RtmpC8QyEh\downloaded_packages
library(gplots) # For heatmap
Warning: package 'gplots' was built under R version 4.3.3
Attaching package: 'gplots'
The following object is masked from 'package:stats':
lowess
# Import the baseball_hof.csv file into R
baseball_data <- read.csv("baseball_hof.csv")

# Scale the data
scaled_data <- scale(baseball_data[, -1])  # Exclude the playerID column for scaling

# Perform hierarchical clustering with method = 'ward.D'
hc <- hclust(dist(scaled_data), method = "ward.D")

# Plot the dendrogram
x11()
plot(hc, main = "Dendrogram of Baseball Players (Ward's Method)")

# Display the heatmap directly in the RStudio plot window
heatmap(as.matrix(scaled_data),
        Rowv = as.dendrogram(hc),
        Colv = NA,
        scale = "none",
        main = "Heatmap of Baseball Players",
        margins = c(5, 10),
        col = colorRampPalette(c("blue", "white", "red"))(100),
        labRow = base::abbreviate(rownames(scaled_data), minlength = 15),
        labCol = base::abbreviate(colnames(scaled_data), minlength = 15),
        cexRow = 0.7,
        cexCol = 0.7)
The heatmap reveals clusters of baseball players with similar performance attributes, indicating distinct groupings based on their statistical profiles.
3d
# Install and load the 'fpc' package
install.packages("fpc")
Installing package into 'C:/Users/C00303097/AppData/Local/R/win-library/4.3'
(as 'lib' is unspecified)
package 'fpc' successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\C00303097\AppData\Local\Temp\RtmpC8QyEh\downloaded_packages
library(fpc)
Warning: package 'fpc' was built under R version 4.3.3
# Perform K-means clustering with 4 clusters
set.seed(123)  # For reproducibility
kmeans_result <- kmeans(scaled_data, centers = 4)

# Assess the quality of the clustering solution
# Compute the average silhouette width
sil_width <- cluster.stats(dist(scaled_data), kmeans_result$cluster)$avg.silwidth

# Print quality assessment metric
cat("Average Silhouette Width:", sil_width, "\n")
Average Silhouette Width: 0.2884172
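An average silhouette width of about 0.29 indicates fairly weak cluster structure; one way to check whether another k fits better is to repeat the calculation over a range of cluster counts. A minimal sketch, assuming `scaled_data` and the fpc package loaded above:

# Sketch: average silhouette width for k = 2..8
# (assumes `scaled_data` and library(fpc) from above)
set.seed(123)
d <- dist(scaled_data)
sil_by_k <- sapply(2:8, function(k) {
  cluster.stats(d, kmeans(scaled_data, centers = k, nstart = 25)$cluster)$avg.silwidth
})
names(sil_by_k) <- 2:8
print(round(sil_by_k, 3))  # higher is better; pick the k with the largest value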
3e
# Assign clusters to each player
baseball_data$Cluster <- kmeans_result$cluster

# Load required libraries
library(dplyr)
library(ggplot2)

# Calculate summary statistics for numerical variables within each cluster
summary_stats <- baseball_data %>%
  group_by(Cluster) %>%
  summarise(across(where(is.numeric), list(mean = mean, median = median, sd = sd)))

# Create visualizations for each numerical variable
for (var in c("hits", "runs", "home_runs", "rbi", "stolen_bases")) {
  # Create boxplots
  print(ggplot(baseball_data, aes(x = as.factor(Cluster), y = !!sym(var))) +
          geom_boxplot() +
          labs(title = paste("Boxplot of", var),
               x = "Cluster",
               y = var) +
          theme_minimal())

  # Create histograms
  print(ggplot(baseball_data, aes(x = !!sym(var), fill = as.factor(Cluster))) +
          geom_histogram(binwidth = 10, position = "identity", alpha = 0.7) +
          labs(title = paste("Histogram of", var),
               x = var,
               y = "Frequency",
               fill = "Cluster") +
          theme_minimal() +
          facet_wrap(~Cluster, nrow = 1))
}
# Read the data
baseball_data <- read.csv("baseball_hof.csv")

# Check the structure of the dataset
str(baseball_data)