library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(jtools)
# Download the UCI "Computer Hardware" data; the file is read without a
# header row, so column names are assigned explicitly afterwards.
cpu.uci <- read.csv(
  url("https://archive.ics.uci.edu/ml/machine-learning-databases/cpu-performance/machine.data"),
  header = FALSE
)
names(cpu.uci) <- c(
  "vendor.name", "model.name",
  "MYCT", "MMIN", "MMAX", "CACH", "CHMIN", "CHMAX",
  "PRP", "ERP"
)
# Drop ERP so that only PRP remains as the performance column.
cpu.uci <- select(cpu.uci, -ERP)
Attribute Information: 1. vendor name: 30 (adviser, amdahl,apollo, basf, bti, burroughs, c.r.d, cambex, cdc, dec, dg, formation, four-phase, gould, honeywell, hp, ibm, ipl, magnuson, microdata, nas, ncr, nixdorf, perkin-elmer, prime, siemens, sperry, sratus, wang) 2. Model Name: many unique symbols 3. MYCT: machine cycle time in nanoseconds (integer) 4. MMIN: minimum main memory in kilobytes (integer) 5. MMAX: maximum main memory in kilobytes (integer) 6. CACH: cache memory in kilobytes (integer) 7. CHMIN: minimum channels in units (integer) 8. CHMAX: maximum channels in units (integer) 9. PRP: published relative performance (integer) 10. ERP: estimated relative performance from the original article (integer) –dropped
# Inspect the structure of the loaded data: column types and leading values.
str(cpu.uci)
## 'data.frame': 209 obs. of 9 variables:
## $ vendor.name: Factor w/ 30 levels "adviser","amdahl",..: 1 2 2 2 2 2 2 2 2 2 ...
## $ model.name : Factor w/ 209 levels "100","1100/61-h1",..: 30 63 64 65 66 67 75 76 77 78 ...
## $ MYCT : int 125 29 29 29 29 26 23 23 23 23 ...
## $ MMIN : int 256 8000 8000 8000 8000 8000 16000 16000 16000 32000 ...
## $ MMAX : int 6000 32000 32000 32000 16000 32000 32000 32000 64000 64000 ...
## $ CACH : int 256 32 32 32 32 64 64 64 64 128 ...
## $ CHMIN : int 16 8 8 8 8 8 16 16 16 32 ...
## $ CHMAX : int 128 32 32 32 16 32 32 32 32 64 ...
## $ PRP : int 198 269 220 172 132 318 367 489 636 1144 ...
# Print per-column summary statistics of the data set.
summary(cpu.uci)
## vendor.name model.name MYCT MMIN
## ibm : 32 100 : 1 Min. : 17.0 Min. : 64
## nas : 19 1100/61-h1: 1 1st Qu.: 50.0 1st Qu.: 768
## honeywell: 13 1100/81 : 1 Median : 110.0 Median : 2000
## ncr : 13 1100/82 : 1 Mean : 203.8 Mean : 2868
## sperry : 13 1100/83 : 1 3rd Qu.: 225.0 3rd Qu.: 4000
## siemens : 12 1100/84 : 1 Max. :1500.0 Max. :32000
## (Other) :107 (Other) :203
## MMAX CACH CHMIN CHMAX
## Min. : 64 Min. : 0.00 Min. : 0.000 Min. : 0.00
## 1st Qu.: 4000 1st Qu.: 0.00 1st Qu.: 1.000 1st Qu.: 5.00
## Median : 8000 Median : 8.00 Median : 2.000 Median : 8.00
## Mean :11796 Mean : 25.21 Mean : 4.699 Mean : 18.27
## 3rd Qu.:16000 3rd Qu.: 32.00 3rd Qu.: 6.000 3rd Qu.: 24.00
## Max. :64000 Max. :256.00 Max. :52.000 Max. :176.00
##
## PRP
## Min. : 6.0
## 1st Qu.: 27.0
## Median : 50.0
## Mean : 105.6
## 3rd Qu.: 113.0
## Max. :1150.0
##
# Show the first rows of the data set.
head(cpu.uci)
## vendor.name model.name MYCT MMIN MMAX CACH CHMIN CHMAX PRP
## 1 adviser 32/60 125 256 6000 256 16 128 198
## 2 amdahl 470v/7 29 8000 32000 32 8 32 269
## 3 amdahl 470v/7a 29 8000 32000 32 8 32 220
## 4 amdahl 470v/7b 29 8000 32000 32 8 32 172
## 5 amdahl 470v/7c 29 8000 16000 32 8 16 132
## 6 amdahl 470v/b 26 8000 32000 64 8 32 318
Membagi data menjadi data training dan data test dengan rasio 70:30.
# Split the data 70:30 into a training set and a test set.
# A fixed seed makes the random split reproducible.
set.seed(2)
# Sample from ALL rows. The previous `3:nrow(cpu.uci)` left rows 1 and 2 out
# of the candidate pool, so they were forced into the test set on every run.
# floor() makes the truncation of the fractional size (0.7 * n) explicit.
# NOTE: this changes which rows are drawn, so results computed from the split
# will differ from those obtained with the old index range.
trainingRowIndex <- sample(seq_len(nrow(cpu.uci)), floor(0.7 * nrow(cpu.uci)))
trainingData <- cpu.uci[trainingRowIndex, ]
# Every row that was not drawn for training becomes test data.
testData <- cpu.uci[-trainingRowIndex, ]
Membuat model linear regression multivariable.
# Fit a multiple linear regression of PRP on the six numeric hardware
# attributes, using the training rows only.
lm.cpu <- lm(
  formula = PRP ~ CHMAX + CHMIN + CACH + MMAX + MMIN + MYCT,
  data = trainingData
)
# Predict PRP for the held-out test rows with the fitted model.
perf.pred <- predict(lm.cpu, newdata = testData)
Referensi untuk menafsirkan nilai-nilai pada ringkasan model: http://r-statistics.co/Linear-Regression.html
# Print the fitted model's summary (residuals, coefficients, R-squared).
summary(lm.cpu)
##
## Call:
## lm(formula = PRP ~ CHMAX + CHMIN + CACH + MMAX + MMIN + MYCT,
## data = trainingData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -254.350 -24.567 6.922 25.909 309.616
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.699e+01 9.966e+00 -4.715 5.81e-06 ***
## CHMAX 2.054e+00 2.842e-01 7.227 2.99e-11 ***
## CHMIN -1.495e+00 9.207e-01 -1.624 0.106733
## CACH 1.131e+00 1.902e-01 5.948 2.09e-08 ***
## MMAX 5.019e-03 8.875e-04 5.655 8.54e-08 ***
## MMIN 9.568e-03 2.578e-03 3.711 0.000298 ***
## MYCT 4.468e-02 2.315e-02 1.930 0.055675 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 60.21 on 139 degrees of freedom
## Multiple R-squared: 0.8525, Adjusted R-squared: 0.8461
## F-statistic: 133.9 on 6 and 139 DF, p-value: < 2.2e-16
Dapat dilihat bahwa model memiliki nilai Adjusted R-squared yang baik, yaitu 0.8461 (semakin tinggi, semakin baik), dan didukung oleh nilai R-squared yang juga baik, yaitu 0.8525 (nilai di atas 0.7 dianggap baik).
Membuat dataframe dari PRP (Actual) dan perf.pred (Prediksi)
# Pair each test row's observed PRP with the model's prediction for it.
actuals.preds <- as.data.frame(
  cbind(
    actuals = testData$PRP,
    predicteds = perf.pred
  )
)
# Correlation matrix between the actual and predicted values.
cor(actuals.preds)
## actuals predicteds
## actuals 1.0000000 0.8978655
## predicteds 0.8978655 1.0000000
Berikut gambaran hasil yang didapatkan. Terlihat bahwa pada beberapa baris, nilai aktual sangat jauh dari nilai prediksinya (misalnya baris 1: 198 berbanding 519.71).
# Show the first actual/predicted pairs.
head(actuals.preds)
## actuals predicteds
## 1 198 519.7111
## 2 269 281.3998
## 3 220 281.3998
## 7 367 381.9146
## 9 636 542.5063
## 10 1144 809.7890
# Min-max accuracy: for each row, the smaller of (actual, predicted) divided
# by the larger, averaged over all rows. A value of 1 means every prediction
# equals its actual value.
# pmin()/pmax() take the element-wise extremes of the two columns directly,
# instead of apply(), which first coerces the whole data frame to a matrix.
min_max_accuracy <- mean(
  pmin(actuals.preds$actuals, actuals.preds$predicteds) /
    pmax(actuals.preds$actuals, actuals.preds$predicteds)
)
min_max_accuracy
## [1] 0.5428306
Didapatkan nilai Min-Max Accuracy sebesar 0.54 (54%). Pada Min-Max Accuracy, semakin besar nilainya, semakin baik.
# Mean absolute percentage error, expressed as a fraction (not multiplied by
# 100): the mean of |predicted - actual| / actual over all rows.
mape <- with(
  actuals.preds,
  mean(abs(predicteds - actuals) / actuals)
)
mape
## [1] 0.7024613
Didapatkan nilai MAPE sebesar 0.70 (70%) sebagai error rate. Pada MAPE, semakin kecil nilainya, semakin baik.