Dataset ini menjelaskan penjualan baik di Amerika Utara, Eropa, Jepang maupun penjualan secara global berdasarkan Genre, Platform, Publisher, serta Name
vg <- read.csv("data_input/vgsales.csv",stringsAsFactors = F)
glimpse(vg)
## Rows: 16,598
## Columns: 11
## $ Rank <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17…
## $ Name <chr> "Wii Sports", "Super Mario Bros.", "Mario Kart Wii", "Wii…
## $ Platform <chr> "Wii", "NES", "Wii", "Wii", "GB", "GB", "DS", "Wii", "Wii…
## $ Year <chr> "2006", "1985", "2008", "2009", "1996", "1989", "2006", "…
## $ Genre <chr> "Sports", "Platform", "Racing", "Sports", "Role-Playing",…
## $ Publisher <chr> "Nintendo", "Nintendo", "Nintendo", "Nintendo", "Nintendo…
## $ NA_Sales <dbl> 41.49, 29.08, 15.85, 15.75, 11.27, 23.20, 11.38, 14.03, 1…
## $ EU_Sales <dbl> 29.02, 3.58, 12.88, 11.01, 8.89, 2.26, 9.23, 9.20, 7.06, …
## $ JP_Sales <dbl> 3.77, 6.81, 3.79, 3.28, 10.22, 4.22, 6.50, 2.93, 4.70, 0.…
## $ Other_Sales <dbl> 8.46, 0.77, 3.31, 2.96, 1.00, 0.58, 2.90, 2.85, 2.26, 0.4…
## $ Global_Sales <dbl> 82.74, 40.24, 35.82, 33.00, 31.37, 30.26, 30.01, 29.02, 2…
Untuk melakukan regresi linear kita hapus kolom2 yang bersifat kategorik
vg_clean <- vg %>%
select(-Publisher,-Name,-Platform,-Genre) %>%
mutate(Year = as.numeric(Year)) %>%
drop_na()
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Year = as.numeric(Year)`.
## Caused by warning:
## ! NAs introduced by coercion
str(vg_clean)
## 'data.frame': 16327 obs. of 7 variables:
## $ Rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Year : num 2006 1985 2008 2009 1996 ...
## $ NA_Sales : num 41.5 29.1 15.8 15.8 11.3 ...
## $ EU_Sales : num 29.02 3.58 12.88 11.01 8.89 ...
## $ JP_Sales : num 3.77 6.81 3.79 3.28 10.22 ...
## $ Other_Sales : num 8.46 0.77 3.31 2.96 1 0.58 2.9 2.85 2.26 0.47 ...
## $ Global_Sales: num 82.7 40.2 35.8 33 31.4 ...
kita lihat 6 data teratas ketika sudah dilakukan data wrangling
head(vg_clean)
## Rank Year NA_Sales EU_Sales JP_Sales Other_Sales Global_Sales
## 1 1 2006 41.49 29.02 3.77 8.46 82.74
## 2 2 1985 29.08 3.58 6.81 0.77 40.24
## 3 3 2008 15.85 12.88 3.79 3.31 35.82
## 4 4 2009 15.75 11.01 3.28 2.96 33.00
## 5 5 1996 11.27 8.89 10.22 1.00 31.37
## 6 6 1989 23.20 2.26 4.22 0.58 30.26
Menggunakan ggcorr kilah lihat korelasi dari setiap prediktor, ternyata global sales memiliki korelasi yang sangat tinggi terhadap NA_Sales dan EU_Sales
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
ggcorr(vg_clean, label = T)
kita lakukan linear model pada Global Sales
model_vg_clean <- lm(formula = Global_Sales ~ 1, data = vg_clean)
summary(model_vg_clean)
##
## Call:
## lm(formula = Global_Sales ~ 1, data = vg_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.53 -0.48 -0.37 -0.06 82.20
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.54023 0.01225 44.09 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.566 on 16326 degrees of freedom
str(vg_clean)
## 'data.frame': 16327 obs. of 7 variables:
## $ Rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Year : num 2006 1985 2008 2009 1996 ...
## $ NA_Sales : num 41.5 29.1 15.8 15.8 11.3 ...
## $ EU_Sales : num 29.02 3.58 12.88 11.01 8.89 ...
## $ JP_Sales : num 3.77 6.81 3.79 3.28 10.22 ...
## $ Other_Sales : num 8.46 0.77 3.31 2.96 1 0.58 2.9 2.85 2.26 0.47 ...
## $ Global_Sales: num 82.7 40.2 35.8 33 31.4 ...
boxplot(vg_clean$Global_Sales, vg_clean$NA_Sales, vg_clean$EU_Sales, vg_clean$JP_Sales, vg_clean$Other_Sales, horizontal = T)
Insight : Dari semua jenis sales, mayoritas sales berada dibawah 20
million dollar
cor(vg_clean$Global_Sales, vg_clean$NA_Sales)
## [1] 0.9412677
cor(vg_clean$Global_Sales, vg_clean$EU_Sales)
## [1] 0.903271
cor(vg_clean$Global_Sales, vg_clean$JP_Sales)
## [1] 0.6127938
cor(vg_clean$Global_Sales, vg_clean$Other_Sales)
## [1] 0.7479742
cor(vg_clean$Global_Sales, vg_clean$Rank)
## [1] -0.4268798
Berdasarkan nilai korelasi antara Global_Sales dengan Sales lainnya, dapat disimpulkan bahwa NA_Sales memiliki nilai korelasi tertinggi dibanding Sales lainnya. Sehingga kita menggunakan NA_Sales untuk visualisasi plotnya
plot(vg_clean$Global_Sales,vg_clean$NA_Sales)
model_vg_clean <- lm(formula = Global_Sales ~ NA_Sales,
data = vg_clean)
abline(model_vg_clean, col="red")
summary(model_vg_clean)
##
## Call:
## lm(formula = Global_Sales ~ NA_Sales, data = vg_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.0613 -0.1097 -0.0538 0.0177 11.6353
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.064129 0.004348 14.75 <2e-16 ***
## NA_Sales 1.793804 0.005036 356.17 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5287 on 16325 degrees of freedom
## Multiple R-squared: 0.886, Adjusted R-squared: 0.886
## F-statistic: 1.269e+05 on 1 and 16325 DF, p-value: < 2.2e-16
vg_clean_no_outlier <- vg_clean[vg_clean$Global_Sales < 50,]
plot(vg_clean$Global_Sales,vg_clean$NA_Sales)
model_vg_no_outlier <- lm(formula = Global_Sales~ NA_Sales, data = vg_clean_no_outlier)
abline(model_vg_clean, col="red")
abline(model_vg_no_outlier, col="blue")
kita bandingkan antara nilai pada Global_Sales dan NA_Sales dengan
mempertimbangkan outlier dan tidak mempertimbangkan outlier. ternyata
hasil dari garis AB linenya tidak terlalu jauh.
model_vg_clean_none <- lm(formula = Global_Sales~1 ,vg_clean)
model_vg_clean_none
##
## Call:
## lm(formula = Global_Sales ~ 1, data = vg_clean)
##
## Coefficients:
## (Intercept)
## 0.5402
plot(vg_clean)
model_vg_clean_all <- lm(formula = Global_Sales ~ .,
data = vg_clean)
abline(model_vg_clean_all, col="red")
## Warning in abline(model_vg_clean_all, col = "red"): only using the first two of
## 7 regression coefficients
plot(model_vg_clean_all)
model_forward <- step(object = model_vg_clean_none,
direction ="forward",
scope = list(lower = model_vg_clean_none, upper = model_vg_clean_all))
## Start: AIC=14641.53
## Global_Sales ~ 1
##
## Df Sum of Sq RSS AIC
## + NA_Sales 1 35460 4563 -20809.3
## + EU_Sales 1 32655 7368 -12986.1
## + Other_Sales 1 22392 17632 1259.2
## + JP_Sales 1 15029 24994 6956.3
## + Rank 1 7293 32730 11359.0
## + Year 1 224 39800 14552.1
## <none> 40023 14641.5
##
## Step: AIC=-20809.3
## Global_Sales ~ NA_Sales
##
## Df Sum of Sq RSS AIC
## + EU_Sales 1 3154.87 1408.4 -40001
## + JP_Sales 1 1776.61 2786.7 -28860
## + Other_Sales 1 1522.16 3041.1 -27433
## + Rank 1 119.46 4443.8 -21240
## + Year 1 5.15 4558.1 -20826
## <none> 4563.3 -20809
##
## Step: AIC=-40000.93
## Global_Sales ~ NA_Sales + EU_Sales
##
## Df Sum of Sq RSS AIC
## + JP_Sales 1 1140.09 268.33 -67069
## + Other_Sales 1 185.18 1223.24 -42300
## + Year 1 20.19 1388.22 -40235
## + Rank 1 17.09 1391.32 -40198
## <none> 1408.42 -40001
##
## Step: AIC=-67069.07
## Global_Sales ~ NA_Sales + EU_Sales + JP_Sales
##
## Df Sum of Sq RSS AIC
## + Other_Sales 1 267.886 0.445 -171588
## + Year 1 1.280 267.051 -67145
## + Rank 1 1.141 267.190 -67137
## <none> 268.331 -67069
##
## Step: AIC=-171588.2
## Global_Sales ~ NA_Sales + EU_Sales + JP_Sales + Other_Sales
##
## Df Sum of Sq RSS AIC
## + Rank 1 0.00178884 0.44329 -171652
## <none> 0.44507 -171588
## + Year 1 0.00001593 0.44506 -171587
##
## Step: AIC=-171652
## Global_Sales ~ NA_Sales + EU_Sales + JP_Sales + Other_Sales +
## Rank
##
## Df Sum of Sq RSS AIC
## + Year 1 0.00012492 0.44316 -171655
## <none> 0.44329 -171652
##
## Step: AIC=-171654.6
## Global_Sales ~ NA_Sales + EU_Sales + JP_Sales + Other_Sales +
## Rank + Year
colSums(is.na(vg_clean))
## Rank Year NA_Sales EU_Sales JP_Sales Other_Sales
## 0 0 0 0 0 0
## Global_Sales
## 0
summary(model_vg_no_outlier)
##
## Call:
## lm(formula = Global_Sales ~ NA_Sales, data = vg_clean_no_outlier)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.0875 -0.1138 -0.0578 0.0167 11.7996
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.073218 0.004342 16.86 <2e-16 ***
## NA_Sales 1.757308 0.005428 323.75 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.524 on 16324 degrees of freedom
## Multiple R-squared: 0.8652, Adjusted R-squared: 0.8652
## F-statistic: 1.048e+05 on 1 and 16324 DF, p-value: < 2.2e-16
summary(model_vg_clean)$r.squared
## [1] 0.8859848
summary(model_vg_no_outlier)$r.squared
## [1] 0.8652452
range(vg_clean$Global_Sales)
## [1] 0.01 82.74
res <- data.frame(
aktual = vg_clean$Global_Sales,
prediksi = model_vg_clean$fitted.values) %>%
mutate(error = prediksi - aktual)
head(res)
## aktual prediksi error
## 1 82.74 74.48905 -8.250951
## 2 40.24 52.22794 11.987944
## 3 35.82 28.49592 -7.324081
## 4 33.00 28.31654 -4.683461
## 5 31.37 20.28030 -11.089702
## 6 30.26 41.68038 11.420377
library(MLmetrics)
##
## Attaching package: 'MLmetrics'
## The following object is masked from 'package:base':
##
## Recall
library(base)
MAE(y_pred = res$prediksi,
y_true = res$aktual)
## [1] 0.200364
MAPE(y_pred = res$prediksi,
y_true = res$aktual) * 100
## [1] 96.41522
MSE(y_pred = res$prediksi, y_true = res$aktual)
## [1] 0.279493
RMSE(y_pred = res$prediksi, y_true = res$aktual)
## [1] 0.528671
model_backward <- step(object = model_vg_clean_all,direction = "backward")
## Start: AIC=-171654.6
## Global_Sales ~ Rank + Year + NA_Sales + EU_Sales + JP_Sales +
## Other_Sales
##
## Df Sum of Sq RSS AIC
## <none> 0.4 -171655
## - Year 1 0.0 0.4 -171652
## - Rank 1 0.0 0.4 -171587
## - Other_Sales 1 265.0 265.4 -67242
## - JP_Sales 1 1185.6 1186.1 -42800
## - EU_Sales 1 1265.7 1266.2 -41733
## - NA_Sales 1 4037.7 4038.2 -22797
summary(model_backward)
##
## Call:
## lm(formula = Global_Sales ~ Rank + Year + NA_Sales + EU_Sales +
## JP_Sales + Other_Sales, data = vg_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0199560 -0.0007091 -0.0002955 0.0001462 0.0203439
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.100e-02 1.463e-02 2.120 0.0341 *
## Rank 7.983e-08 9.549e-09 8.360 <2e-16 ***
## Year -1.565e-05 7.296e-06 -2.145 0.0320 *
## NA_Sales 1.000e+00 8.201e-05 12194.033 <2e-16 ***
## EU_Sales 1.000e+00 1.465e-04 6827.291 <2e-16 ***
## JP_Sales 9.999e-01 1.513e-04 6607.787 <2e-16 ***
## Other_Sales 9.998e-01 3.201e-04 3123.958 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.005211 on 16320 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 2.457e+08 on 6 and 16320 DF, p-value: < 2.2e-16
dengan menggunakan model backward maka hasil dari Adjusted R-square memiliki nilai yang sangat baik
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
vif(model_backward)
## Rank Year NA_Sales EU_Sales JP_Sales Other_Sales
## 1.259202 1.087283 2.729510 3.339252 1.336466 2.220590
tidak ada multicolinearity karena semua vif < 10