# データの読み込み
d<- read.csv('https://stats.dip.jp/01_ds/data/ad_sales.csv')

library(DT)
datatable(d, caption = 'ある製品の広告媒体別費用と売上(単位:万円)')
# 列名の変更
colnames(d) <- c("TV", "Radio", "Newspaper", "Sales")
library(tidyr)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# データの整形
d_long <- d%>%
 gather(key = "Media", value = "Cost", -Sales)

# 箱ひげ図の作成
ggplot(d_long, aes(x = Media, y = Cost, fill = Media)) +
  geom_boxplot() +
  labs(title = "Boxplot of Advertising Costs by Media", x = "Media", y = "Cost (10,000 yen)") +
  theme_minimal() +
  theme(legend.position = "none")

# Calculate the correlation matrix
cor_matrix <- cor(d[, -4])
cor_matrix
##                   TV      Radio  Newspaper
## TV        1.00000000 0.05480866 0.05664787
## Radio     0.05480866 1.00000000 0.35410375
## Newspaper 0.05664787 0.35410375 1.00000000
# Create a pair plot for correlation analysis
pairs(d[, -4], main = "Correlation Analysis of Advertising Costs and Sales")

# Perform correlation tests
cor.test(d$TV, d$Sales)
## 
##  Pearson's product-moment correlation
## 
## data:  d$TV and d$Sales
## t = 17.668, df = 198, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7218201 0.8308014
## sample estimates:
##       cor 
## 0.7822244
cor.test(d$Radio, d$Sales)
## 
##  Pearson's product-moment correlation
## 
## data:  d$Radio and d$Sales
## t = 9.9208, df = 198, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4754954 0.6620366
## sample estimates:
##       cor 
## 0.5762226
cor.test(d$Newspaper, d$Sales)
## 
##  Pearson's product-moment correlation
## 
## data:  d$Newspaper and d$Sales
## t = 3.2996, df = 198, p-value = 0.001148
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.0924875 0.3557712
## sample estimates:
##      cor 
## 0.228299
# Change column names
colnames(d) <- c("TV", "Radio", "Newspaper", "Sales")

# Set the colors and points for the plot
COL <- c("blue", "green", "red")
PCH <- 16:18

# Extract data for each media type
tv_data <- data.frame(Cost = d$TV, Sales = d$Sales)
radio_data <- data.frame(Cost = d$Radio, Sales = d$Sales)
newspaper_data <- data.frame(Cost = d$Newspaper, Sales = d$Sales)

# Create the base scatter plot
plot(0, 0, type = 'n', 
     xlim = c(0, max(c(d$TV, d$Radio, d$Newspaper))), 
     ylim = c(0, max(d$Sales)),
     main = 'Correlation Analysis of Advertising Costs and Sales',
     xlab = 'Advertising Costs (10,000 yen)',
     ylab = 'Sales (10,000 yen)')

# Add grid lines
abline(h = seq(0, max(d$Sales), by = 50000), v = seq(0, max(c(d$TV, d$Radio, d$Newspaper)), by = 500), 
       lty = 2, col = gray(0.5, 0.25))

# Plot the data points for each media type
points(tv_data$Cost, tv_data$Sales, pch = PCH[1], col = COL[1])
points(radio_data$Cost, radio_data$Sales, pch = PCH[2], col = COL[2])
points(newspaper_data$Cost, newspaper_data$Sales, pch = PCH[3], col = COL[3])

# Add a legend
legend('topright', pch = PCH, col = COL, legend = c("TV", "Radio", "Newspaper"))

# Change column names
colnames(d) <- c("TV", "Radio", "Newspaper", "Sales")

# Correlation test and significance for TV advertising costs and sales
cor_test_tv <- cor.test(d$TV, d$Sales)
significance_tv <- ifelse(cor_test_tv$p.value < 0.05, 'significant', 'not significant')
cat("TV advertising costs and sales: \nCorrelation coefficient =", cor_test_tv$estimate, "\np-value =", cor_test_tv$p.value, "-", significance_tv, "\n")
## TV advertising costs and sales: 
## Correlation coefficient = 0.7822244 
## p-value = 1.46739e-42 - significant
# Correlation test and significance for Radio advertising costs and sales
cor_test_radio <- cor.test(d$Radio, d$Sales)
significance_radio <- ifelse(cor_test_radio$p.value < 0.05, 'significant', 'not significant')
cat("Radio advertising costs and sales: \nCorrelation coefficient =", cor_test_radio$estimate, "\np-value =", cor_test_radio$p.value, "-", significance_radio, "\n")
## Radio advertising costs and sales: 
## Correlation coefficient = 0.5762226 
## p-value = 4.354966e-19 - significant
# Correlation test and significance for Newspaper advertising costs and sales
cor_test_newspaper <- cor.test(d$Newspaper, d$Sales)
significance_newspaper <- ifelse(cor_test_newspaper$p.value < 0.05, 'significant', 'not significant')
cat("Newspaper advertising costs and sales: \nCorrelation coefficient =", cor_test_newspaper$estimate, "\np-value =", cor_test_newspaper$p.value, "-", significance_newspaper, "\n")
## Newspaper advertising costs and sales: 
## Correlation coefficient = 0.228299 
## p-value = 0.001148196 - significant
# 列名の変更
colnames(d) <- c("TV", "Radio", "Newspaper", "Sales")

# 回帰分析モデルの作成
fit <- lm(Sales ~ TV + Radio + Newspaper, data = d)
summary(fit)
## 
## Call:
## lm(formula = Sales ~ TV + Radio + Newspaper, data = d)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -88277  -8908   2418  11893  28292 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 29388.894   3119.082   9.422   <2e-16 ***
## TV             45.765      1.395  32.809   <2e-16 ***
## Radio         188.530      8.611  21.893   <2e-16 ***
## Newspaper      -1.037      5.871  -0.177     0.86    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16860 on 196 degrees of freedom
## Multiple R-squared:  0.8972, Adjusted R-squared:  0.8956 
## F-statistic: 570.3 on 3 and 196 DF,  p-value: < 2.2e-16
# テレビ広告費用の範囲に基づいて予測のための新しいデータセットを作成
tp.p <- seq(min(d$TV), max(d$TV), length.out = 100)
new_data <- data.frame(TV = tp.p, Radio = mean(d$Radio), Newspaper = mean(d$Newspaper))

# 予測と信頼区間の計算
pred <- predict(fit, newdata = new_data, interval = "prediction")
conf <- predict(fit, newdata = new_data, interval = "confidence")
# カスタムカラーの設定
COL <- c("blue", "green", "red", "gray80", "gray50")

# ベースの散布図を作成
plot(d$TV, d$Sales, 
     type = 'p', pch = 16, col = COL[1], ylim = c(min(d$Sales) - 5000, max(d$Sales) + 5000),
     main = 'Relationship between TV Advertising Costs and Sales', 
     xlab = 'TV Advertising Costs (10,000 yen)', ylab = 'Sales (10,000 yen)')

# 信頼区間と予測区間のための関数
gray_area <- function(x, lwr, upr, col, alpha) {
  polygon(c(x, rev(x)), c(lwr, rev(upr)), col = adjustcolor(col, alpha.f = alpha), border = NA)
}

# 信頼区間と予測区間のプロット
gray_area(tp.p, conf[, 'lwr'], conf[, 'upr'], col = COL[4], alpha = 0.2)
gray_area(tp.p, pred[, 'lwr'], pred[, 'upr'], col = COL[5], alpha = 0.2)

# フィットしたモデルのラインをプロット
lines(tp.p, conf[, 'fit'], col = COL[2], lwd = 3)

# 凡例の追加
legend('topright',
       pch = c(16, NA, NA, NA), col = COL[c(1, 2, 4, 5)],
       lty = c(NA, 1, NA, NA), lwd = c(NA, 3, NA, NA), 
       fill = c(NA, NA, adjustcolor(COL[4], alpha.f = 0.2), adjustcolor(COL[5], alpha.f = 0.2)), border = FALSE,
       legend = c('TV Advertising Costs', 'Sales', '95% Confidence Interval', '95% Prediction Interval'))

set.seed(123)  # 再現性のためにシードを設定
n <- nrow(d)
ii.tr <- sample(1:n, size = floor(0.8 * n))

d.tr <- d[ii.tr, ]
d.te <- d[-ii.tr, ]
# Custom color settings
COL <- c("blue", "green", "red", "gray80", "gray50")
# Create new dataset for prediction
tp.p <- seq(min(d$TV), max(d$TV), length.out = 100)
new_data <- data.frame(TV = tp.p, Radio = mean(d$Radio), Newspaper = mean(d$Newspaper))

# Calculate prediction and confidence intervals
pred <- predict(fit, newdata = new_data, interval = "prediction")
conf <- predict(fit, newdata = new_data, interval = "confidence")
# Create the base scatter plot
plot(d$TV, d$Sales, 
     type = 'p', pch = 16, col = COL[1], ylim = c(min(d$Sales) - 5000, max(d$Sales) + 5000),
     main = 'Relationship between TV Advertising Costs and Sales', 
     xlab = 'TV Advertising Costs (10,000 yen)', ylab = 'Sales (10,000 yen)')

# Function for plotting confidence and prediction intervals
gray_area <- function(x, lwr, upr, col, alpha) {
  polygon(c(x, rev(x)), c(lwr, rev(upr)), col = adjustcolor(col, alpha.f = alpha), border = NA)
}

# Plot confidence and prediction intervals
gray_area(tp.p, conf[, 'lwr'], conf[, 'upr'], col = COL[4], alpha = 0.2)
gray_area(tp.p, pred[, 'lwr'], pred[, 'upr'], col = COL[5], alpha = 0.2)

# Plot the fitted model line
lines(tp.p, conf[, 'fit'], col = COL[2], lwd = 3)

# Add legend
legend('topright',
       pch = c(16, NA, NA, NA), col = COL[c(1, 2, 4, 5)],
       lty = c(NA, 1, NA, NA), lwd = c(NA, 3, NA, NA), 
       fill = c(NA, NA, adjustcolor(COL[4], alpha.f = 0.2), adjustcolor(COL[5], alpha.f = 0.2)), border = FALSE,
       legend = c('TV Advertising Costs', 'Sales', '95% Confidence Interval', '95% Prediction Interval'))