par(family= "HiraKakuProN-W3")

ビジネス アナリティクス

学籍番号:23110039,氏名:松本 瑠樹亜

1.データ

d <- read.csv('https://stats.dip.jp/01_ds/data/ad_sales.csv',stringsAsFactors = FALSE)

library(DT)
datatable(d, caption = 'ある製品の広告媒体別費用と売上(単位:万円)')

2.箱ひげ図

summary(d)
##      テレビ           ラジオ            新聞             売上       
##  Min.   :   7.0   Min.   :  0.00   Min.   :   3.0   Min.   : 16000  
##  1st Qu.: 743.8   1st Qu.: 99.75   1st Qu.: 127.5   1st Qu.:103750  
##  Median :1497.5   Median :229.00   Median : 257.5   Median :129000  
##  Mean   :1470.4   Mean   :232.64   Mean   : 305.5   Mean   :140225  
##  3rd Qu.:2188.2   3rd Qu.:365.25   3rd Qu.: 451.0   3rd Qu.:174000  
##  Max.   :2964.0   Max.   :496.00   Max.   :1140.0   Max.   :270000
boxplot(d$テレビ, d$ラジオ, d$新聞,
        col = c("orange","red","yellow"),
        names = c('TV','Radio','Newspaper'),
        main = 'Boxplot of costs by advertising medium',
        ylab = 'Sales',
        xlab = 'media')

abline(h = seq(0, 3000, 500), lty = 3, col = gray(0.5, 0.25))

3.相間分析

3.1 相関分析図

library(psych)
pairs.panels(d)

cor.plot(d)
## Warning in abbreviate(dimnames(ans)[[2L]], minlength = abbr.colnames):
## abbreviate used with non-ASCII chars

r <- cor(d)
r
##            テレビ     ラジオ       新聞      売上
## テレビ 1.00000000 0.05480866 0.05664787 0.7822244
## ラジオ 0.05480866 1.00000000 0.35410375 0.5762226
## 新聞   0.05664787 0.35410375 1.00000000 0.2282990
## 売上   0.78222442 0.57622257 0.22829903 1.0000000
library(corrplot)
## corrplot 0.92 loaded
corrplot.mixed(r, lower = 'ellips', upper = 'number')

library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
kyokasho <- list(size = 11, color = 'black', family = 'UD Digi Kyokasho NK-R')


plot_ly(x = colnames(r),
        y = rownames(r),
        z = as.matrix(r),
        text = paste(r),
        type = 'heatmap') %>%
  layout(font = kyokasho,
         title = 'Heart Data',
         xaxis = list(title = 'Data'),
         yaxis = list(title = 'Data'))
# Load necessary libraries
library(ggplot2)
library(scales)
## 
## Attaching package: 'scales'
## The following objects are masked from 'package:psych':
## 
##     alpha, rescale
# Reshape the data for plotting
d_melt <- reshape2::melt(d, id.vars = '売上', variable.name = '広告媒体', value.name = '広告費')

# Create the combined scatter plot
ggplot(d_melt, aes(x = 広告費, y = 売上, color = 広告媒体)) +
  geom_point() +
  labs(title = 'Correlation diagram between advertising expenses and sales',
       x = 'Advertising cost (tens of thousands)',
       y = 'Sales (yen)') +
  scale_color_manual(values = c("テレビ" = "green", "ラジオ" = "pink", "新聞" = "orange")) +
  scale_y_continuous(labels = comma) +  # Format y-axis labels
  theme_minimal() +
  theme(legend.title = element_blank())

3.2 無相関検定

tv_test_result <- cor.test(d$テレビ, d$売上)

significance <- ifelse(tv_test_result$p.value < 0.05, '有意', '有意でない')

cat("テレビ and 売上: \n相関係数 =", tv_test_result$estimate, "\np-value =", tv_test_result$p.value, "-", significance, "\n")
## テレビ and 売上: 
## 相関係数 = 0.7822244 
## p-value = 1.46739e-42 - 有意
radio_test_result <- cor.test(d$ラジオ, d$売上)

significance <- ifelse(radio_test_result$p.value < 0.05, '有意', '有意でない')

cat("\nラジオ and 売上: \n相関係数 =", radio_test_result$estimate, "\np-value =", radio_test_result$p.value, "-", significance, "\n")
## 
## ラジオ and 売上: 
## 相関係数 = 0.5762226 
## p-value = 4.354966e-19 - 有意
np_test_result <- cor.test(d$新聞, d$売上)

significance <- ifelse(np_test_result$p.value < 0.05, '有意', '有意でない')

cat("\n新聞 and 売上: \n相関係数 =", np_test_result$estimate, "\np-value =", np_test_result$p.value, "-", significance, "\n")
## 
## 新聞 and 売上: 
## 相関係数 = 0.228299 
## p-value = 0.001148196 - 有意

5. 回帰分析

library(sjPlot)
## Learn more about sjPlot with 'browseVignettes("sjPlot")'.
fit <- lm(売上 ~ テレビ + ラジオ + 新聞, data = d)


tab_model(fit, show.stat = T, show.aic = T)
  売上
Predictors Estimates CI Statistic p
(Intercept) 29388.89 23237.62 – 35540.16 9.42 <0.001
テレビ 45.76 43.01 – 48.52 32.81 <0.001
ラジオ 188.53 171.55 – 205.51 21.89 <0.001
新聞 -1.04 -12.62 – 10.54 -0.18 0.860
Observations 200
R2 / R2 adjusted 0.897 / 0.896
AIC 4466.498
消した後
library(sjPlot)
fit1 <- lm(売上 ~ テレビ + ラジオ, data = d)


tab_model(fit1, show.stat = T, show.aic = T)
  売上
Predictors Estimates CI Statistic p
(Intercept) 29211.00 23403.43 – 35018.57 9.92 <0.001
テレビ 45.75 43.01 – 48.50 32.91 <0.001
ラジオ 187.99 172.14 – 203.85 23.38 <0.001
Observations 200
R2 / R2 adjusted 0.897 / 0.896
AIC 4464.530
plot_model(fit1, show.values = T, show.intercept = T, width = 0.1)

set.seed(123)  # Set seed for reproducibility
n <- nrow(d)
ii.tr <- sample(1:n, size = floor(0.8 * n))
d.tr <- d[ii.tr, ]
d.te <- d[-ii.tr, ]

# Fit a linear model with multiple predictors
fit <- lm(売上 ~ テレビ + ラジオ + 新聞, data = d.tr)

# Predict using the test set
tp_hat <- predict(fit, newdata = d.te)

# Calculate the RMSE
RMSE <- sqrt(mean((d.te$売上 - tp_hat)^2))
RMSE
## [1] 14380.8
# Define the get.accuracy function
get.accuracy <- function(yhat, y, digits = 2) {
  d <- data.frame(MBE  = mean(yhat - y),
                  MAE  = mean(abs(yhat - y)),
                  MAPE = mean(abs((yhat - y) / y)) * 100,
                  RMSE = sqrt(mean((yhat - y)^2)))
  return(round(d, digits))
}

# Calculate accuracy metrics
a <- get.accuracy(tp_hat, d.te$売上)
print(a)
##        MBE      MAE  MAPE    RMSE
## 1 -1805.83 12129.77 10.12 14380.8
COL <- c(rgb(255,   0,   0,  105, max = 255), # 赤
         rgb(  0,   0, 255,  105, max = 255), # 青
         rgb(  0, 155,   0,  105, max = 255), # 緑
         rgb(140, 140, 140,  105, max = 255), # 暗灰
         rgb(180, 180, 180,  105, max = 255)) # 灰
fit <- lm(売上 ~ poly(テレビ, 2), data = d)

# Create a sequence of new values for predictions
tp.p <- seq(min(d$テレビ), max(d$テレビ), length.out = 100)

# Generate predictions with confidence intervals
pred <- predict(fit, newdata = data.frame(テレビ = tp.p), interval = "predict")
conf <- predict(fit, newdata = data.frame(テレビ = tp.p), interval = "confidence")

# Plotting
matplot(x = d$テレビ, y = d$売上,
        type = 'p', pch = 16, col = COL[1], ylim = c(min(d$売上)-5000, max(d$売上)+5000),
        main = 'Sales forecasting using quadratic polynomial regression model', 
        xlab = 'Advertising costs (TV)', ylab = 'Sales (10,000 yen)')

gray.area <- function(x, lwr, upr, col) {
  polygon(c(x, rev(x)), c(lwr, rev(upr)), col = col, border = NA)
}

gray.area(tp.p, conf[, 'lwr'], conf[, 'upr'], col = COL[4])
gray.area(tp.p, pred[, 'lwr'], pred[, 'upr'], col = COL[5])

matlines(x = tp.p, y = conf[, 'fit'], col = COL[2], lwd = 3)

legend('topright',
       pch = c(16, NA, NA, NA), col = COL[-3],
       lty = c(NA, 1, NA, NA), lwd = c(NA, 3, NA, NA), 
       fill = c(NA, NA, COL[4], COL[5]), border = F,
       legend = c('テレビ', '売上', '95%信頼区間', '95%予測区間'))