# Multiple linear regression model for the Computer Hardware dataset (machine.data) from https://archive.ics.uci.edu/ml/datasets/Computer+Hardware
# Import the dataset ----
# The raw file has no header row, so the column names are supplied explicitly.
# read.table() sanitises them, which is why "vendor name" and "Model Name"
# show up as vendor.name / Model.Name in the printed output below.
hurl <- url("https://archive.ics.uci.edu/ml/machine-learning-databases/cpu-performance/machine.data")
dataset <- read.table(
  hurl,
  header = FALSE, # spelled out: F is an ordinary variable and can be reassigned
  sep = ",",
  col.names = c(
    "vendor name", "Model Name", "MYCT", "MMIN", "MMAX",
    "CACH", "CHMIN", "CHMAX", "PRP", "ERP"
  )
)
# Quick look at the first rows to confirm the file parsed as expected.
head(dataset)
## vendor.name Model.Name MYCT MMIN MMAX CACH CHMIN CHMAX PRP ERP
## 1 adviser 32/60 125 256 6000 256 16 128 198 199
## 2 amdahl 470v/7 29 8000 32000 32 8 32 269 253
## 3 amdahl 470v/7a 29 8000 32000 32 8 32 220 253
## 4 amdahl 470v/7b 29 8000 32000 32 8 32 172 253
## 5 amdahl 470v/7c 29 8000 16000 32 8 16 132 132
## 6 amdahl 470v/b 26 8000 32000 64 8 32 318 290
# Feature selection ----
# Keep the six numeric hardware attributes plus the response PRP. This drops
# the two identifier columns (vendor.name, Model.Name) and ERP.
# NOTE(review): ERP is presumably excluded because it is itself an estimate of
# PRP -- confirm against the dataset description.
dataset <- dataset[c("MYCT", "MMIN", "MMAX", "CACH", "CHMIN", "CHMAX", "PRP")]
head(dataset)
## MYCT MMIN MMAX CACH CHMIN CHMAX PRP
## 1 125 256 6000 256 16 128 198
## 2 29 8000 32000 32 8 32 269
## 3 29 8000 32000 32 8 32 220
## 4 29 8000 32000 32 8 32 172
## 5 29 8000 16000 32 8 16 132
## 6 26 8000 32000 64 8 32 318
# Column types and dimensions of the reduced data frame.
str(dataset)
## 'data.frame': 209 obs. of 7 variables:
## $ MYCT : int 125 29 29 29 29 26 23 23 23 23 ...
## $ MMIN : int 256 8000 8000 8000 8000 8000 16000 16000 16000 32000 ...
## $ MMAX : int 6000 32000 32000 32000 16000 32000 32000 32000 64000 64000 ...
## $ CACH : int 256 32 32 32 32 64 64 64 64 128 ...
## $ CHMIN: int 16 8 8 8 8 8 16 16 16 32 ...
## $ CHMAX: int 128 32 32 32 16 32 32 32 32 64 ...
## $ PRP : int 198 269 220 172 132 318 367 489 636 1144 ...
# Per-column five-number summaries and means.
summary(dataset)
## MYCT MMIN MMAX CACH
## Min. : 17.0 Min. : 64 Min. : 64 Min. : 0.00
## 1st Qu.: 50.0 1st Qu.: 768 1st Qu.: 4000 1st Qu.: 0.00
## Median : 110.0 Median : 2000 Median : 8000 Median : 8.00
## Mean : 203.8 Mean : 2868 Mean :11796 Mean : 25.21
## 3rd Qu.: 225.0 3rd Qu.: 4000 3rd Qu.:16000 3rd Qu.: 32.00
## Max. :1500.0 Max. :32000 Max. :64000 Max. :256.00
## CHMIN CHMAX PRP
## Min. : 0.000 Min. : 0.00 Min. : 6.0
## 1st Qu.: 1.000 1st Qu.: 5.00 1st Qu.: 27.0
## Median : 2.000 Median : 8.00 Median : 50.0
## Mean : 4.699 Mean : 18.27 Mean : 105.6
## 3rd Qu.: 6.000 3rd Qu.: 24.00 3rd Qu.: 113.0
## Max. :52.000 Max. :176.00 Max. :1150.0
# Scatter-plot matrix of all retained columns.
plot(dataset, main = "dataset")

# Train/test split ----
# Fix the RNG seed so the split below is reproducible.
set.seed(23)
library(caTools)
# `split` is a logical vector with one entry per row: TRUE -> training set.
# NOTE(review): sample.split() is documented as preserving the ratio of labels
# in its first argument (a classification helper); PRP is numeric, so confirm
# this behaviour is intended rather than a plain random split.
split <- sample.split(dataset$PRP, SplitRatio = 0.75)
training_set <- dataset[split, ]
test_set <- dataset[!split, ]
dim(training_set)
## [1] 156 7
dim(test_set)
## [1] 53 7
# Fit the model ----
# Least-squares linear model with PRP as the response and every other column
# of the training set as a predictor (that is what `.` means in the formula).
regressor <- lm(PRP ~ ., data = training_set)
# Coefficient table, residual summary and goodness-of-fit statistics.
summary(regressor)
##
## Call:
## lm(formula = PRP ~ ., data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -188.01 -25.93 4.07 23.06 320.85
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.917e+01 9.530e+00 -5.160 7.77e-07 ***
## MYCT 3.739e-02 1.883e-02 1.986 0.048888 *
## MMIN 1.221e-02 2.620e-03 4.661 6.92e-06 ***
## MMAX 5.448e-03 7.912e-04 6.885 1.50e-10 ***
## CACH 5.561e-01 1.504e-01 3.699 0.000304 ***
## CHMIN -1.271e+00 1.119e+00 -1.136 0.257707
## CHMAX 2.130e+00 2.838e-01 7.506 5.14e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 58.64 on 149 degrees of freedom
## Multiple R-squared: 0.8379, Adjusted R-squared: 0.8314
## F-statistic: 128.4 on 6 and 149 DF, p-value: < 2.2e-16
# Diagnostic plots for the fitted linear model.
plot(regressor, main = "regressor")

# Prediction ----
# Predict PRP for the held-out rows. The response column is removed from
# newdata by name, so the model only sees the predictor columns.
y_pred <- predict(regressor, newdata = test_set[names(test_set) != "PRP"])
y_pred
## 5 7 8 9 10 11
## 178.4883587 404.8396860 404.8396860 579.1679925 857.9910583 -2.6693029
## 15 19 24 26 28 36
## -27.7042610 7.0109081 155.2826353 -12.7135587 -10.3922488 184.0099726
## 38 39 52 54 55 58
## 0.5305596 34.5342920 -8.1155684 17.0908685 0.8326399 34.7009058
## 60 68 70 73 77 80
## 34.7009058 -13.7350769 42.0802868 18.7052585 45.7941036 100.4133545
## 81 83 84 85 88 101
## -2.2976225 301.9122786 -10.7345818 6.1393367 48.0616529 -9.0929061
## 109 117 119 122 129 138
## -0.8280507 89.5567727 53.2603008 123.2104656 89.1447613 246.9773060
## 147 153 154 160 165 166
## 135.3651514 300.7148631 440.9933493 -1.5639762 104.7076232 110.3056415
## 168 181 182 183 184 193
## 184.6644813 69.0364331 -18.9282210 -9.2200591 18.6480741 332.6734583
## 195 196 198 203 204
## 212.5569245 255.8653892 393.6027350 -12.3307995 -9.2776260
# Evaluation ----
# Out-of-sample error: root mean squared difference between the test-set
# predictions and the observed PRP values.
rmse <- sqrt(mean((y_pred - test_set$PRP)^2))
rmse
## [1] 73.86591
# In-sample fit: this R-squared is taken from the summary of the model fitted
# on the training set, so it is not a test-set metric like the RMSE above.
R2 <- summary(regressor)[["r.squared"]]
R2
## [1] 0.8379149