# データの読み込み
d<- read.csv('https://stats.dip.jp/01_ds/data/ad_sales.csv')
library(DT)
datatable(d, caption = 'ある製品の広告媒体別費用と売上(単位:万円)')
# 列名の変更
colnames(d) <- c("TV", "Radio", "Newspaper", "Sales")
library(tidyr)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# データの整形
d_long <- d%>%
gather(key = "Media", value = "Cost", -Sales)
# 箱ひげ図の作成
ggplot(d_long, aes(x = Media, y = Cost, fill = Media)) +
geom_boxplot() +
labs(title = "Boxplot of Advertising Costs by Media", x = "Media", y = "Cost (10,000 yen)") +
theme_minimal() +
theme(legend.position = "none")

# Calculate the correlation matrix
cor_matrix <- cor(d[, -4])
cor_matrix
## TV Radio Newspaper
## TV 1.00000000 0.05480866 0.05664787
## Radio 0.05480866 1.00000000 0.35410375
## Newspaper 0.05664787 0.35410375 1.00000000
# Create a pair plot for correlation analysis
pairs(d[, -4], main = "Correlation Analysis of Advertising Costs and Sales")

# Perform correlation tests
cor.test(d$TV, d$Sales)
##
## Pearson's product-moment correlation
##
## data: d$TV and d$Sales
## t = 17.668, df = 198, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7218201 0.8308014
## sample estimates:
## cor
## 0.7822244
cor.test(d$Radio, d$Sales)
##
## Pearson's product-moment correlation
##
## data: d$Radio and d$Sales
## t = 9.9208, df = 198, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4754954 0.6620366
## sample estimates:
## cor
## 0.5762226
cor.test(d$Newspaper, d$Sales)
##
## Pearson's product-moment correlation
##
## data: d$Newspaper and d$Sales
## t = 3.2996, df = 198, p-value = 0.001148
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.0924875 0.3557712
## sample estimates:
## cor
## 0.228299
# Change column names
colnames(d) <- c("TV", "Radio", "Newspaper", "Sales")
# Set the colors and points for the plot
COL <- c("blue", "green", "red")
PCH <- 16:18
# Extract data for each media type
tv_data <- data.frame(Cost = d$TV, Sales = d$Sales)
radio_data <- data.frame(Cost = d$Radio, Sales = d$Sales)
newspaper_data <- data.frame(Cost = d$Newspaper, Sales = d$Sales)
# Create the base scatter plot
plot(0, 0, type = 'n',
xlim = c(0, max(c(d$TV, d$Radio, d$Newspaper))),
ylim = c(0, max(d$Sales)),
main = 'Correlation Analysis of Advertising Costs and Sales',
xlab = 'Advertising Costs (10,000 yen)',
ylab = 'Sales (10,000 yen)')
# Add grid lines
abline(h = seq(0, max(d$Sales), by = 50000), v = seq(0, max(c(d$TV, d$Radio, d$Newspaper)), by = 500),
lty = 2, col = gray(0.5, 0.25))
# Plot the data points for each media type
points(tv_data$Cost, tv_data$Sales, pch = PCH[1], col = COL[1])
points(radio_data$Cost, radio_data$Sales, pch = PCH[2], col = COL[2])
points(newspaper_data$Cost, newspaper_data$Sales, pch = PCH[3], col = COL[3])
# Add a legend
legend('topright', pch = PCH, col = COL, legend = c("TV", "Radio", "Newspaper"))

# Change column names
colnames(d) <- c("TV", "Radio", "Newspaper", "Sales")
# Correlation test and significance for TV advertising costs and sales
cor_test_tv <- cor.test(d$TV, d$Sales)
significance_tv <- ifelse(cor_test_tv$p.value < 0.05, 'significant', 'not significant')
cat("TV advertising costs and sales: \nCorrelation coefficient =", cor_test_tv$estimate, "\np-value =", cor_test_tv$p.value, "-", significance_tv, "\n")
## TV advertising costs and sales:
## Correlation coefficient = 0.7822244
## p-value = 1.46739e-42 - significant
# Correlation test and significance for Radio advertising costs and sales
cor_test_radio <- cor.test(d$Radio, d$Sales)
significance_radio <- ifelse(cor_test_radio$p.value < 0.05, 'significant', 'not significant')
cat("Radio advertising costs and sales: \nCorrelation coefficient =", cor_test_radio$estimate, "\np-value =", cor_test_radio$p.value, "-", significance_radio, "\n")
## Radio advertising costs and sales:
## Correlation coefficient = 0.5762226
## p-value = 4.354966e-19 - significant
# Correlation test and significance for Newspaper advertising costs and sales
cor_test_newspaper <- cor.test(d$Newspaper, d$Sales)
significance_newspaper <- ifelse(cor_test_newspaper$p.value < 0.05, 'significant', 'not significant')
cat("Newspaper advertising costs and sales: \nCorrelation coefficient =", cor_test_newspaper$estimate, "\np-value =", cor_test_newspaper$p.value, "-", significance_newspaper, "\n")
## Newspaper advertising costs and sales:
## Correlation coefficient = 0.228299
## p-value = 0.001148196 - significant
# 列名の変更
colnames(d) <- c("TV", "Radio", "Newspaper", "Sales")
# 回帰分析モデルの作成
fit <- lm(Sales ~ TV + Radio + Newspaper, data = d)
summary(fit)
##
## Call:
## lm(formula = Sales ~ TV + Radio + Newspaper, data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -88277 -8908 2418 11893 28292
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29388.894 3119.082 9.422 <2e-16 ***
## TV 45.765 1.395 32.809 <2e-16 ***
## Radio 188.530 8.611 21.893 <2e-16 ***
## Newspaper -1.037 5.871 -0.177 0.86
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16860 on 196 degrees of freedom
## Multiple R-squared: 0.8972, Adjusted R-squared: 0.8956
## F-statistic: 570.3 on 3 and 196 DF, p-value: < 2.2e-16
# テレビ広告費用の範囲に基づいて予測のための新しいデータセットを作成
tp.p <- seq(min(d$TV), max(d$TV), length.out = 100)
new_data <- data.frame(TV = tp.p, Radio = mean(d$Radio), Newspaper = mean(d$Newspaper))
# 予測と信頼区間の計算
pred <- predict(fit, newdata = new_data, interval = "prediction")
conf <- predict(fit, newdata = new_data, interval = "confidence")
# カスタムカラーの設定
COL <- c("blue", "green", "red", "gray80", "gray50")
# ベースの散布図を作成
plot(d$TV, d$Sales,
type = 'p', pch = 16, col = COL[1], ylim = c(min(d$Sales) - 5000, max(d$Sales) + 5000),
main = 'Relationship between TV Advertising Costs and Sales',
xlab = 'TV Advertising Costs (10,000 yen)', ylab = 'Sales (10,000 yen)')
# 信頼区間と予測区間のための関数
gray_area <- function(x, lwr, upr, col, alpha) {
polygon(c(x, rev(x)), c(lwr, rev(upr)), col = adjustcolor(col, alpha.f = alpha), border = NA)
}
# 信頼区間と予測区間のプロット
gray_area(tp.p, conf[, 'lwr'], conf[, 'upr'], col = COL[4], alpha = 0.2)
gray_area(tp.p, pred[, 'lwr'], pred[, 'upr'], col = COL[5], alpha = 0.2)
# フィットしたモデルのラインをプロット
lines(tp.p, conf[, 'fit'], col = COL[2], lwd = 3)
# 凡例の追加
legend('topright',
pch = c(16, NA, NA, NA), col = COL[c(1, 2, 4, 5)],
lty = c(NA, 1, NA, NA), lwd = c(NA, 3, NA, NA),
fill = c(NA, NA, adjustcolor(COL[4], alpha.f = 0.2), adjustcolor(COL[5], alpha.f = 0.2)), border = FALSE,
legend = c('TV Advertising Costs', 'Sales', '95% Confidence Interval', '95% Prediction Interval'))

set.seed(123) # 再現性のためにシードを設定
n <- nrow(d)
ii.tr <- sample(1:n, size = floor(0.8 * n))
d.tr <- d[ii.tr, ]
d.te <- d[-ii.tr, ]
# Custom color settings
COL <- c("blue", "green", "red", "gray80", "gray50")
# Create new dataset for prediction
tp.p <- seq(min(d$TV), max(d$TV), length.out = 100)
new_data <- data.frame(TV = tp.p, Radio = mean(d$Radio), Newspaper = mean(d$Newspaper))
# Calculate prediction and confidence intervals
pred <- predict(fit, newdata = new_data, interval = "prediction")
conf <- predict(fit, newdata = new_data, interval = "confidence")
# Create the base scatter plot
plot(d$TV, d$Sales,
type = 'p', pch = 16, col = COL[1], ylim = c(min(d$Sales) - 5000, max(d$Sales) + 5000),
main = 'Relationship between TV Advertising Costs and Sales',
xlab = 'TV Advertising Costs (10,000 yen)', ylab = 'Sales (10,000 yen)')
# Function for plotting confidence and prediction intervals
gray_area <- function(x, lwr, upr, col, alpha) {
polygon(c(x, rev(x)), c(lwr, rev(upr)), col = adjustcolor(col, alpha.f = alpha), border = NA)
}
# Plot confidence and prediction intervals
gray_area(tp.p, conf[, 'lwr'], conf[, 'upr'], col = COL[4], alpha = 0.2)
gray_area(tp.p, pred[, 'lwr'], pred[, 'upr'], col = COL[5], alpha = 0.2)
# Plot the fitted model line
lines(tp.p, conf[, 'fit'], col = COL[2], lwd = 3)
# Add legend
legend('topright',
pch = c(16, NA, NA, NA), col = COL[c(1, 2, 4, 5)],
lty = c(NA, 1, NA, NA), lwd = c(NA, 3, NA, NA),
fill = c(NA, NA, adjustcolor(COL[4], alpha.f = 0.2), adjustcolor(COL[5], alpha.f = 0.2)), border = FALSE,
legend = c('TV Advertising Costs', 'Sales', '95% Confidence Interval', '95% Prediction Interval'))
