par(family= "HiraKakuProN-W3")
d <- read.csv('https://stats.dip.jp/01_ds/data/ad_sales.csv',stringsAsFactors = FALSE)
library(DT)
datatable(d, caption = 'ある製品の広告媒体別費用と売上(単位:万円)')
summary(d)
## テレビ ラジオ 新聞 売上
## Min. : 7.0 Min. : 0.00 Min. : 3.0 Min. : 16000
## 1st Qu.: 743.8 1st Qu.: 99.75 1st Qu.: 127.5 1st Qu.:103750
## Median :1497.5 Median :229.00 Median : 257.5 Median :129000
## Mean :1470.4 Mean :232.64 Mean : 305.5 Mean :140225
## 3rd Qu.:2188.2 3rd Qu.:365.25 3rd Qu.: 451.0 3rd Qu.:174000
## Max. :2964.0 Max. :496.00 Max. :1140.0 Max. :270000
boxplot(d$テレビ, d$ラジオ, d$新聞,
col = c("orange","red","yellow"),
names = c('TV','Radio','Newspaper'),
main = 'Boxplot of costs by advertising medium',
ylab = 'Sales',
xlab = 'media')
abline(h = seq(0, 3000, 500), lty = 3, col = gray(0.5, 0.25))
library(psych)
pairs.panels(d)
cor.plot(d)
## Warning in abbreviate(dimnames(ans)[[2L]], minlength = abbr.colnames):
## abbreviate used with non-ASCII chars
r <- cor(d)
r
## テレビ ラジオ 新聞 売上
## テレビ 1.00000000 0.05480866 0.05664787 0.7822244
## ラジオ 0.05480866 1.00000000 0.35410375 0.5762226
## 新聞 0.05664787 0.35410375 1.00000000 0.2282990
## 売上 0.78222442 0.57622257 0.22829903 1.0000000
library(corrplot)
## corrplot 0.92 loaded
corrplot.mixed(r, lower = 'ellips', upper = 'number')
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
kyokasho <- list(size = 11, color = 'black', family = 'UD Digi Kyokasho NK-R')
plot_ly(x = colnames(r),
y = rownames(r),
z = as.matrix(r),
text = paste(r),
type = 'heatmap') %>%
layout(font = kyokasho,
title = 'Heart Data',
xaxis = list(title = 'Data'),
yaxis = list(title = 'Data'))
# Load necessary libraries
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
## The following objects are masked from 'package:psych':
##
## alpha, rescale
# Reshape the data for plotting
d_melt <- reshape2::melt(d, id.vars = '売上', variable.name = '広告媒体', value.name = '広告費')
# Create the combined scatter plot
ggplot(d_melt, aes(x = 広告費, y = 売上, color = 広告媒体)) +
geom_point() +
labs(title = 'Correlation diagram between advertising expenses and sales',
x = 'Advertising cost (tens of thousands)',
y = 'Sales (yen)') +
scale_color_manual(values = c("テレビ" = "green", "ラジオ" = "pink", "新聞" = "orange")) +
scale_y_continuous(labels = comma) + # Format y-axis labels
theme_minimal() +
theme(legend.title = element_blank())
tv_test_result <- cor.test(d$テレビ, d$売上)
significance <- ifelse(tv_test_result$p.value < 0.05, '有意', '有意でない')
cat("テレビ and 売上: \n相関係数 =", tv_test_result$estimate, "\np-value =", tv_test_result$p.value, "-", significance, "\n")
## テレビ and 売上:
## 相関係数 = 0.7822244
## p-value = 1.46739e-42 - 有意
radio_test_result <- cor.test(d$ラジオ, d$売上)
significance <- ifelse(radio_test_result$p.value < 0.05, '有意', '有意でない')
cat("\nラジオ and 売上: \n相関係数 =", radio_test_result$estimate, "\np-value =", radio_test_result$p.value, "-", significance, "\n")
##
## ラジオ and 売上:
## 相関係数 = 0.5762226
## p-value = 4.354966e-19 - 有意
np_test_result <- cor.test(d$新聞, d$売上)
significance <- ifelse(np_test_result$p.value < 0.05, '有意', '有意でない')
cat("\n新聞 and 売上: \n相関係数 =", np_test_result$estimate, "\np-value =", np_test_result$p.value, "-", significance, "\n")
##
## 新聞 and 売上:
## 相関係数 = 0.228299
## p-value = 0.001148196 - 有意
library(sjPlot)
## Learn more about sjPlot with 'browseVignettes("sjPlot")'.
fit <- lm(売上 ~ テレビ + ラジオ + 新聞, data = d)
tab_model(fit, show.stat = T, show.aic = T)
| 売上 | ||||
|---|---|---|---|---|
| Predictors | Estimates | CI | Statistic | p |
| (Intercept) | 29388.89 | 23237.62 – 35540.16 | 9.42 | <0.001 |
| テレビ | 45.76 | 43.01 – 48.52 | 32.81 | <0.001 |
| ラジオ | 188.53 | 171.55 – 205.51 | 21.89 | <0.001 |
| 新聞 | -1.04 | -12.62 – 10.54 | -0.18 | 0.860 |
| Observations | 200 | |||
| R2 / R2 adjusted | 0.897 / 0.896 | |||
| AIC | 4466.498 | |||
library(sjPlot)
fit1 <- lm(売上 ~ テレビ + ラジオ, data = d)
tab_model(fit1, show.stat = T, show.aic = T)
| 売上 | ||||
|---|---|---|---|---|
| Predictors | Estimates | CI | Statistic | p |
| (Intercept) | 29211.00 | 23403.43 – 35018.57 | 9.92 | <0.001 |
| テレビ | 45.75 | 43.01 – 48.50 | 32.91 | <0.001 |
| ラジオ | 187.99 | 172.14 – 203.85 | 23.38 | <0.001 |
| Observations | 200 | |||
| R2 / R2 adjusted | 0.897 / 0.896 | |||
| AIC | 4464.530 | |||
plot_model(fit1, show.values = T, show.intercept = T, width = 0.1)
set.seed(123) # Set seed for reproducibility
n <- nrow(d)
ii.tr <- sample(1:n, size = floor(0.8 * n))
d.tr <- d[ii.tr, ]
d.te <- d[-ii.tr, ]
# Fit a linear model with multiple predictors
fit <- lm(売上 ~ テレビ + ラジオ + 新聞, data = d.tr)
# Predict using the test set
tp_hat <- predict(fit, newdata = d.te)
# Calculate the RMSE
RMSE <- sqrt(mean((d.te$売上 - tp_hat)^2))
RMSE
## [1] 14380.8
# Define the get.accuracy function
get.accuracy <- function(yhat, y, digits = 2) {
d <- data.frame(MBE = mean(yhat - y),
MAE = mean(abs(yhat - y)),
MAPE = mean(abs((yhat - y) / y)) * 100,
RMSE = sqrt(mean((yhat - y)^2)))
return(round(d, digits))
}
# Calculate accuracy metrics
a <- get.accuracy(tp_hat, d.te$売上)
print(a)
## MBE MAE MAPE RMSE
## 1 -1805.83 12129.77 10.12 14380.8
COL <- c(rgb(255, 0, 0, 105, max = 255), # 赤
rgb( 0, 0, 255, 105, max = 255), # 青
rgb( 0, 155, 0, 105, max = 255), # 緑
rgb(140, 140, 140, 105, max = 255), # 暗灰
rgb(180, 180, 180, 105, max = 255)) # 灰
fit <- lm(売上 ~ poly(テレビ, 2), data = d)
# Create a sequence of new values for predictions
tp.p <- seq(min(d$テレビ), max(d$テレビ), length.out = 100)
# Generate predictions with confidence intervals
pred <- predict(fit, newdata = data.frame(テレビ = tp.p), interval = "predict")
conf <- predict(fit, newdata = data.frame(テレビ = tp.p), interval = "confidence")
# Plotting
matplot(x = d$テレビ, y = d$売上,
type = 'p', pch = 16, col = COL[1], ylim = c(min(d$売上)-5000, max(d$売上)+5000),
main = 'Sales forecasting using quadratic polynomial regression model',
xlab = 'Advertising costs (TV)', ylab = 'Sales (10,000 yen)')
gray.area <- function(x, lwr, upr, col) {
polygon(c(x, rev(x)), c(lwr, rev(upr)), col = col, border = NA)
}
gray.area(tp.p, conf[, 'lwr'], conf[, 'upr'], col = COL[4])
gray.area(tp.p, pred[, 'lwr'], pred[, 'upr'], col = COL[5])
matlines(x = tp.p, y = conf[, 'fit'], col = COL[2], lwd = 3)
legend('topright',
pch = c(16, NA, NA, NA), col = COL[-3],
lty = c(NA, 1, NA, NA), lwd = c(NA, 3, NA, NA),
fill = c(NA, NA, COL[4], COL[5]), border = F,
legend = c('テレビ', '売上', '95%信頼区間', '95%予測区間'))