library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(dplyr)
Let's build two pairs of variables from the dataset.
# Load the match-level data. The path is machine-specific; adjust it when
# running on another machine.
data <- read.csv("~/Downloads/archive/matches.csv")

# Derive the extra columns used by both variable pairs in a single pass.
#
# Pair 1: target_runs (explanatory) and result_margin (response).
#   win_type labels how the match was decided. Only the two outcomes that
#   are matched explicitly get a label; any other value of `result` becomes
#   NA instead of being lumped in with wicket wins.
#   NOTE(review): assumes `result` uses the literal values "runs" and
#   "wickets" -- confirm with table(data$result).
#
# Pair 2: target_runs (explanatory) and average_run_rate (response).
#   average_run_rate is target_runs per target over. Rows whose target_overs
#   is not positive get NA rather than Inf/NaN from a division by zero.
data <- data %>%
  mutate(
    win_type = case_when(
      result == "runs" ~ "Won by Runs",
      result == "wickets" ~ "Won by Wickets",
      TRUE ~ NA_character_
    ),
    average_run_rate = if_else(
      target_overs > 0,
      target_runs / target_overs,
      NA_real_
    )
  )

# Preview the columns involved in the two pairs.
data %>%
  select(
    target_runs, result_margin, win_type, target_overs, average_run_rate
  ) %>%
  head()
Let's look at some visualizations to understand these variables better.
# Scatter plot for Pair 1: target runs against the result margin, coloured by
# how the match was decided.
#
# win_type is (re)derived here with the short labels shown in the legend.
# Only the two outcomes matched explicitly are labelled; any other value of
# `result` becomes NA instead of being counted as a wicket win.
# NOTE(review): assumes `result` uses the literal values "runs" and
# "wickets" -- confirm with table(data$result).
data <- data %>%
  mutate(
    win_type = case_when(
      result == "runs" ~ "Runs",
      result == "wickets" ~ "Wickets",
      TRUE ~ NA_character_
    )
  )

# Drop rows that cannot be plotted up front, so the exclusion is visible in
# the code instead of surfacing only as a ggplot "Removed rows" warning.
data %>%
  filter(!is.na(target_runs), !is.na(result_margin)) %>%
  ggplot(aes(x = target_runs, y = result_margin)) +
  geom_point(aes(color = win_type), alpha = 0.6) +
  labs(
    title = "Target Runs vs Result Margin",
    x = "Target Runs",
    y = "Result Margin"
  ) +
  theme_minimal()
## Warning: Removed 19 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Scatter plot for Pair 2: target runs against the average run rate.
#
# average_run_rate is already derived right after the data is loaded, so it
# is not recomputed here; keeping a single definition prevents the two from
# drifting apart.
#
# Rows with missing or non-finite values are dropped explicitly, so the
# exclusion is visible in the code instead of surfacing only as a ggplot
# "Removed rows" warning.
data %>%
  filter(is.finite(target_runs), is.finite(average_run_rate)) %>%
  ggplot(aes(x = target_runs, y = average_run_rate)) +
  geom_point(alpha = 0.6, color = "blue") +
  labs(
    title = "Target Runs vs Average Run Rate",
    x = "Target Runs",
    y = "Average Run Rate"
  ) +
  theme_minimal()
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).
Let's find the correlation coefficients for these pairs. Because both variables in each pair are continuous and the relationship between them is approximately linear, we can use the Pearson correlation coefficient.
# Pair 1: Pearson correlation of target_runs with result_margin, using only
# the rows where both values are present (use = "complete.obs").
# NOTE(review): result_margin presumably pools run margins and wicket
# margins -- confirm a single pooled coefficient is what is wanted.
correlation_1 <- with(
  data,
  cor(target_runs, result_margin, use = "complete.obs", method = "pearson")
)
cat("Correlation between Target Runs and Result Margin:", correlation_1, "\n")
## Correlation between Target Runs and Result Margin: 0.3951201
# Pair 2: Pearson correlation of target_runs with average_run_rate, using
# only the rows where both values are present (use = "complete.obs").
# Keep in mind that average_run_rate is itself computed from target_runs
# (target_runs / target_overs), so part of this association is built in.
correlation_2 <- with(
  data,
  cor(target_runs, average_run_rate, use = "complete.obs", method = "pearson")
)
cat("Correlation between Target Runs and Average Run Rate:", correlation_2, "\n")
## Correlation between Target Runs and Average Run Rate: 0.8825475
# Pair 1 response: 95% confidence interval for the mean of result_margin,
# taken from a one-sample t-test on the column.
ci_result_margin <- t.test(x = data$result_margin, conf.level = 0.95)
cat(
  "95% Confidence Interval for Result Margin:\n",
  ci_result_margin[["conf.int"]],
  "\n"
)
## 95% Confidence Interval for Result Margin:
## 15.95601 18.56257
# Pair 2 response: 95% confidence interval for the mean of average_run_rate,
# taken from a one-sample t-test on the column.
ci_avg_run_rate <- t.test(x = data$average_run_rate, conf.level = 0.95)
cat(
  "95% Confidence Interval for Average Run Rate:\n",
  ci_avg_run_rate[["conf.int"]],
  "\n"
)
## 95% Confidence Interval for Average Run Rate:
## 8.304006 8.495184