Simple Linear Regression

This is a simple linear regression model fitted to the Computer Hardware dataset (`machine.data`) from the UCI Machine Learning Repository: https://archive.ics.uci.edu/ml/datasets/Computer+Hardware

Code

Import dataset

# Import the dataset
# The remote file has no header row, so the ten column names are supplied
# explicitly. read.table() sanitises them, which is why the spaces in the
# first two names come out as dots (vendor.name, Model.Name).
hurl <- url("https://archive.ics.uci.edu/ml/machine-learning-databases/cpu-performance/machine.data")
dataset <- read.table(
  hurl,
  header = FALSE, # spelled out: `F` is an ordinary variable that can be reassigned
  sep = ",",
  col.names = c("vendor name", "Model Name", "MYCT", "MMIN", "MMAX",
                "CACH", "CHMIN", "CHMAX", "PRP", "ERP")
)
head(dataset)
##   vendor.name Model.Name MYCT MMIN  MMAX CACH CHMIN CHMAX PRP ERP
## 1     adviser      32/60  125  256  6000  256    16   128 198 199
## 2      amdahl     470v/7   29 8000 32000   32     8    32 269 253
## 3      amdahl    470v/7a   29 8000 32000   32     8    32 220 253
## 4      amdahl    470v/7b   29 8000 32000   32     8    32 172 253
## 5      amdahl    470v/7c   29 8000 16000   32     8    16 132 132
## 6      amdahl     470v/b   26 8000 32000   64     8    32 318 290

Feature Selection

# Keep columns 3 through 9 only: the six numeric hardware attributes plus the
# PRP column. The vendor/model identifiers and ERP are dropped.
dataset <- dataset[c("MYCT", "MMIN", "MMAX", "CACH", "CHMIN", "CHMAX", "PRP")]
head(dataset)
##   MYCT MMIN  MMAX CACH CHMIN CHMAX PRP
## 1  125  256  6000  256    16   128 198
## 2   29 8000 32000   32     8    32 269
## 3   29 8000 32000   32     8    32 220
## 4   29 8000 32000   32     8    32 172
## 5   29 8000 16000   32     8    16 132
## 6   26 8000 32000   64     8    32 318
# Show the type and leading values of every retained column.
str(dataset)
## 'data.frame':    209 obs. of  7 variables:
##  $ MYCT : int  125 29 29 29 29 26 23 23 23 23 ...
##  $ MMIN : int  256 8000 8000 8000 8000 8000 16000 16000 16000 32000 ...
##  $ MMAX : int  6000 32000 32000 32000 16000 32000 32000 32000 64000 64000 ...
##  $ CACH : int  256 32 32 32 32 64 64 64 64 128 ...
##  $ CHMIN: int  16 8 8 8 8 8 16 16 16 32 ...
##  $ CHMAX: int  128 32 32 32 16 32 32 32 32 64 ...
##  $ PRP  : int  198 269 220 172 132 318 367 489 636 1144 ...
# Per-column summary statistics (min, quartiles, mean, max).
summary(dataset)
##       MYCT             MMIN            MMAX            CACH       
##  Min.   :  17.0   Min.   :   64   Min.   :   64   Min.   :  0.00  
##  1st Qu.:  50.0   1st Qu.:  768   1st Qu.: 4000   1st Qu.:  0.00  
##  Median : 110.0   Median : 2000   Median : 8000   Median :  8.00  
##  Mean   : 203.8   Mean   : 2868   Mean   :11796   Mean   : 25.21  
##  3rd Qu.: 225.0   3rd Qu.: 4000   3rd Qu.:16000   3rd Qu.: 32.00  
##  Max.   :1500.0   Max.   :32000   Max.   :64000   Max.   :256.00  
##      CHMIN            CHMAX             PRP        
##  Min.   : 0.000   Min.   :  0.00   Min.   :   6.0  
##  1st Qu.: 1.000   1st Qu.:  5.00   1st Qu.:  27.0  
##  Median : 2.000   Median :  8.00   Median :  50.0  
##  Mean   : 4.699   Mean   : 18.27   Mean   : 105.6  
##  3rd Qu.: 6.000   3rd Qu.: 24.00   3rd Qu.: 113.0  
##  Max.   :52.000   Max.   :176.00   Max.   :1150.0
# Plotting a data frame draws a scatterplot matrix of all column pairs.
plot(dataset, main = "dataset")

Splitting the dataset

library(caTools)
# Fix the RNG seed so the random split below is reproducible.
set.seed(23)
# Logical vector marking which rows go to the training set (75% requested).
split <- sample.split(dataset$PRP, SplitRatio = 0.75)
training_set <- subset(dataset, split == TRUE)
test_set <- subset(dataset, split == FALSE)
# View() opens a GUI data viewer, so only call it when a user is present;
# this keeps non-interactive runs (Rscript, knitting) from failing on it.
if (interactive()) {
  View(training_set)
  View(test_set)
}
dim(training_set)
## [1] 156   7
dim(test_set)
## [1] 53  7

Model

# Fit a single-predictor linear model: PRP explained by MMAX, using only the
# training rows.
regressor <- lm(formula = PRP ~ MMAX, data = training_set)
summary(regressor)
## 
## Call:
## lm(formula = PRP ~ MMAX, data = training_set)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -195.10  -34.94   -2.05   24.34  462.69 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.912e+01  9.345e+00  -3.116  0.00219 ** 
## MMAX         1.119e-02  5.998e-04  18.662  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 79.33 on 154 degrees of freedom
## Multiple R-squared:  0.6934, Adjusted R-squared:  0.6914 
## F-statistic: 348.3 on 1 and 154 DF,  p-value: < 2.2e-16
# Diagnostic plots for the fitted linear model.
plot(regressor, main = "regressor")

Predictions

# Predict PRP for the held-out rows. Column 7 (PRP itself) is removed from
# the data handed to predict(), so only the input attributes are passed.
y_pred <- predict(regressor, newdata = test_set[-7])
test_set[-7]
##     MYCT  MMIN  MMAX CACH CHMIN CHMAX
## 5     29  8000 16000   32     8    16
## 7     23 16000 32000   64    16    32
## 8     23 16000 32000   64    16    32
## 9     23 16000 64000   64    16    32
## 10    23 32000 64000  128    32    64
## 11   400  1000  3000    0     1     2
## 15   350    64    64    0     1     4
## 19   143  1000  2000    0     5    16
## 24   110  3100  6200    0     6    64
## 26   320   512  2000    4     1     3
## 28   320   256  3000    4     1     3
## 36    64  5240 20970   30    12    24
## 38    50  1000  4000    8     1     5
## 39    50  2000  8000    8     1     5
## 52   700   256  2000    0     1     1
## 54   200  1000  8000    0     1     2
## 55   110  1000  4000   16     1     2
## 58   800   256  8000    0     1     4
## 60   800   256  8000    0     1     4
## 68   105   256  2000    0     3    10
## 70   105  2000  4000    8     3    19
## 73   175   256  2000    0     3    24
## 77   300   768  4500    0     1    24
## 80   180   768 12000    6     1    31
## 81   330  1000  3000    0     2     4
## 83   300  1000 16000    8     2   112
## 84   330  1000  2000    0     1     2
## 85   330  1000  4000    0     3     6
## 88   140  2000  4000    8     1    20
## 101  203  1000  2000    0     1     5
## 109  900   512  1000    0     1     2
## 117  180  2000 16000   16     1     6
## 119   25  2000 12000    8     1     4
## 122   17  4000 16000   32     6    12
## 129   50  2000 16000   24     1     6
## 138  150   512  4000    0     8   128
## 147   60  4000 16000   64     5     8
## 153   35  8000 32000   64     8    24
## 154   38 16000 32000  128    16    32
## 160   56  1000  4000    0     1     6
## 165   56  4000 16000    0     1     8
## 166   38  4000  8000   32    16    32
## 168   38  8000 16000   64     4     8
## 181  160  2000  8000   32     1    13
## 182  240   512  1000    8     1     3
## 183  240   512  2000    8     1     5
## 184  105  2000  4000    8     3     8
## 193   26  8000 32000  128    24    32
## 195   50  2000 32000   24     6    26
## 196   50  2000 32000   48    26    52
## 198   50  4000 32000  112    52   104
## 203  180   262  4000    0     1     3
## 204  180   512  4000    0     1     3
# Print the predicted PRP values, named by the original row index.
y_pred
##          5          7          8          9         10         11         15 
## 149.988741 329.095836 329.095836 687.310026 687.310026   4.464226 -28.401926 
##         19         24         26         28         36         38         39 
##  -6.729967  40.285645  -6.729967   4.464226 205.623882  15.658419  60.435193 
##         52         54         55         58         60         68         70 
##  -6.729967  60.435193  15.658419  60.435193  60.435193  -6.729967  15.658419 
##         73         77         80         81         83         84         85 
##  -6.729967  21.255516 105.211967   4.464226 149.988741  -6.729967  15.658419 
##         88        101        109        117        119        122        129 
##  15.658419  -6.729967 -17.924161 149.988741 105.211967 149.988741 149.988741 
##        138        147        153        154        160        165        166 
##  15.658419 149.988741 329.095836 329.095836  15.658419 149.988741  60.435193 
##        168        181        182        183        184        193        195 
## 149.988741  60.435193 -17.924161  -6.729967  15.658419 329.095836 329.095836 
##        196        198        203        204 
## 329.095836 329.095836  15.658419  15.658419

Visualisation

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.2
# Training set: observed PRP (red points) against the fitted regression line
# (blue). Columns are referenced by bare name through `data =` rather than
# with `training_set$...` inside aes(), which ggplot2 advises against.
ggplot(data = training_set, mapping = aes(x = MMAX)) +
  geom_point(aes(y = PRP), color = "red") +
  geom_line(
    aes(y = predict(regressor, newdata = training_set[-7])),
    color = "blue"
  ) +
  ggtitle("PRP vs MMAX (Training set)") +
  xlab("MMAX") + ylab("PRP")

# Visualizing test set results
# Test-set observations (red points) against the regression line (blue).
# The line is drawn from the model's fitted values over the training-set
# MMAX range; each layer gets its own `data =` so aes() can use bare column
# names instead of `$`.
ggplot() +
  geom_point(data = test_set, aes(x = MMAX, y = PRP), color = "red") +
  geom_line(
    data = training_set,
    aes(x = MMAX, y = predict(regressor, newdata = training_set[-7])),
    color = "blue"
  ) +
  ggtitle("PRP vs MMAX (Test set)") +
  xlab("MMAX") + ylab("PRP")

Analysis

# Root-mean-squared error between the predictions and the actual PRP values
# of the test set.
rmse <- sqrt(mean((y_pred - test_set$PRP)^2))
rmse
## [1] 88.73718
# R-squared as reported by summary(regressor). The model was fitted on
# training_set, so this describes the training fit, not test-set performance.
R2 <- summary(regressor)[["r.squared"]]
R2
## [1] 0.6933942