Multiple Linear Regression

This document fits a multiple linear regression model to the Computer Hardware dataset (machine.data) from the UCI Machine Learning Repository: https://archive.ics.uci.edu/ml/datasets/Computer+Hardware

Code

Import dataset

# Import the dataset
# The UCI file is comma-separated and has no header row, so the column names
# are supplied here. read.table() passes them through make.names(), which is
# why "vendor name" and "Model Name" appear as vendor.name and Model.Name.
hurl <- url("https://archive.ics.uci.edu/ml/machine-learning-databases/cpu-performance/machine.data")
dataset <- read.table(
  hurl,
  header = FALSE, # spelled out: the shorthand F is an ordinary, reassignable variable
  sep = ",",
  col.names = c(
    "vendor name", "Model Name", "MYCT", "MMIN", "MMAX",
    "CACH", "CHMIN", "CHMAX", "PRP", "ERP"
  )
)
head(dataset)
##   vendor.name Model.Name MYCT MMIN  MMAX CACH CHMIN CHMAX PRP ERP
## 1     adviser      32/60  125  256  6000  256    16   128 198 199
## 2      amdahl     470v/7   29 8000 32000   32     8    32 269 253
## 3      amdahl    470v/7a   29 8000 32000   32     8    32 220 253
## 4      amdahl    470v/7b   29 8000 32000   32     8    32 172 253
## 5      amdahl    470v/7c   29 8000 16000   32     8    16 132 132
## 6      amdahl     470v/b   26 8000 32000   64     8    32 318 290

Feature selection

# Feature selection: keep columns 3 to 9, i.e. the six numeric hardware
# attributes plus the PRP response; vendor.name, Model.Name and ERP are dropped.
dataset <- dataset[c("MYCT", "MMIN", "MMAX", "CACH", "CHMIN", "CHMAX", "PRP")]
head(dataset)
##   MYCT MMIN  MMAX CACH CHMIN CHMAX PRP
## 1  125  256  6000  256    16   128 198
## 2   29 8000 32000   32     8    32 269
## 3   29 8000 32000   32     8    32 220
## 4   29 8000 32000   32     8    32 172
## 5   29 8000 16000   32     8    16 132
## 6   26 8000 32000   64     8    32 318
# Show the structure of the reduced data frame: dimensions and column types.
str(dataset)
## 'data.frame':    209 obs. of  7 variables:
##  $ MYCT : int  125 29 29 29 29 26 23 23 23 23 ...
##  $ MMIN : int  256 8000 8000 8000 8000 8000 16000 16000 16000 32000 ...
##  $ MMAX : int  6000 32000 32000 32000 16000 32000 32000 32000 64000 64000 ...
##  $ CACH : int  256 32 32 32 32 64 64 64 64 128 ...
##  $ CHMIN: int  16 8 8 8 8 8 16 16 16 32 ...
##  $ CHMAX: int  128 32 32 32 16 32 32 32 32 64 ...
##  $ PRP  : int  198 269 220 172 132 318 367 489 636 1144 ...
# Per-column minimum, quartiles, median, mean and maximum.
summary(dataset)
##       MYCT             MMIN            MMAX            CACH       
##  Min.   :  17.0   Min.   :   64   Min.   :   64   Min.   :  0.00  
##  1st Qu.:  50.0   1st Qu.:  768   1st Qu.: 4000   1st Qu.:  0.00  
##  Median : 110.0   Median : 2000   Median : 8000   Median :  8.00  
##  Mean   : 203.8   Mean   : 2868   Mean   :11796   Mean   : 25.21  
##  3rd Qu.: 225.0   3rd Qu.: 4000   3rd Qu.:16000   3rd Qu.: 32.00  
##  Max.   :1500.0   Max.   :32000   Max.   :64000   Max.   :256.00  
##      CHMIN            CHMAX             PRP        
##  Min.   : 0.000   Min.   :  0.00   Min.   :   6.0  
##  1st Qu.: 1.000   1st Qu.:  5.00   1st Qu.:  27.0  
##  Median : 2.000   Median :  8.00   Median :  50.0  
##  Mean   : 4.699   Mean   : 18.27   Mean   : 105.6  
##  3rd Qu.: 6.000   3rd Qu.: 24.00   3rd Qu.: 113.0  
##  Max.   :52.000   Max.   :176.00   Max.   :1150.0
# Plot the whole data frame; for an all-numeric data frame with more than two
# columns, plot() draws a pairwise scatterplot matrix.
plot(dataset, main = "dataset")

Splitting into training and test set

# Fix the RNG seed so the random split below is reproducible.
set.seed(23)
library(caTools)
# Logical mask over the rows of dataset: TRUE marks a training row
# (SplitRatio = 0.75), FALSE a test row.
split <- sample.split(dataset[["PRP"]], SplitRatio = 0.75)
training_set <- subset(dataset, split)
test_set <- subset(dataset, !split)
dim(training_set)
## [1] 156   7
# Rows and columns of the held-out test set.
dim(test_set)
## [1] 53  7

Model

# Fit an ordinary least-squares model of PRP on every other column of the
# training set, then print the coefficient table and fit statistics.
regressor <- lm(PRP ~ ., data = training_set)
summary(regressor)
## 
## Call:
## lm(formula = PRP ~ ., data = training_set)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -188.01  -25.93    4.07   23.06  320.85 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.917e+01  9.530e+00  -5.160 7.77e-07 ***
## MYCT         3.739e-02  1.883e-02   1.986 0.048888 *  
## MMIN         1.221e-02  2.620e-03   4.661 6.92e-06 ***
## MMAX         5.448e-03  7.912e-04   6.885 1.50e-10 ***
## CACH         5.561e-01  1.504e-01   3.699 0.000304 ***
## CHMIN       -1.271e+00  1.119e+00  -1.136 0.257707    
## CHMAX        2.130e+00  2.838e-01   7.506 5.14e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 58.64 on 149 degrees of freedom
## Multiple R-squared:  0.8379, Adjusted R-squared:  0.8314 
## F-statistic: 128.4 on 6 and 149 DF,  p-value: < 2.2e-16
# Draw the standard diagnostic plots for the fitted linear model.
plot(regressor, main = "regressor")

Prediction

# Predict PRP for the test rows. The response (PRP, column 7 of test_set) is
# removed from newdata so only the predictor columns are passed in.
y_pred <- predict(regressor, newdata = test_set[names(test_set) != "PRP"])
y_pred
##           5           7           8           9          10          11 
## 178.4883587 404.8396860 404.8396860 579.1679925 857.9910583  -2.6693029 
##          15          19          24          26          28          36 
## -27.7042610   7.0109081 155.2826353 -12.7135587 -10.3922488 184.0099726 
##          38          39          52          54          55          58 
##   0.5305596  34.5342920  -8.1155684  17.0908685   0.8326399  34.7009058 
##          60          68          70          73          77          80 
##  34.7009058 -13.7350769  42.0802868  18.7052585  45.7941036 100.4133545 
##          81          83          84          85          88         101 
##  -2.2976225 301.9122786 -10.7345818   6.1393367  48.0616529  -9.0929061 
##         109         117         119         122         129         138 
##  -0.8280507  89.5567727  53.2603008 123.2104656  89.1447613 246.9773060 
##         147         153         154         160         165         166 
## 135.3651514 300.7148631 440.9933493  -1.5639762 104.7076232 110.3056415 
##         168         181         182         183         184         193 
## 184.6644813  69.0364331 -18.9282210  -9.2200591  18.6480741 332.6734583 
##         195         196         198         203         204 
## 212.5569245 255.8653892 393.6027350 -12.3307995  -9.2776260

Analysis

# Root-mean-squared error of the predictions against the observed PRP values
# of the test set.
prediction_error <- test_set[["PRP"]] - y_pred
rmse <- sqrt(mean(prediction_error^2))
rmse
## [1] 73.86591
# Multiple R-squared as reported by summary(regressor). This describes the fit
# on the data the model was trained on; it is not computed from test_set.
R2 <- summary(regressor)[["r.squared"]]
R2
## [1] 0.8379149