# Simple linear regression model for the Computer Hardware dataset (machine.data) from the UCI repository: https://archive.ics.uci.edu/ml/datasets/Computer+Hardware
# Import the dataset ----
# The raw file is comma-separated and has no header row, so the column names
# are supplied explicitly. read.table() sanitises them, which is why
# "vendor name" / "Model Name" show up as vendor.name / Model.Name below.
hurl <- url("https://archive.ics.uci.edu/ml/machine-learning-databases/cpu-performance/machine.data")
dataset <- read.table(
  hurl,
  header = FALSE, # spelled out: `F` is an ordinary variable and can be reassigned
  sep = ",",
  col.names = c(
    "vendor name", "Model Name", "MYCT", "MMIN", "MMAX",
    "CACH", "CHMIN", "CHMAX", "PRP", "ERP"
  )
)
# Peek at the first rows to confirm the import worked
head(dataset)
## vendor.name Model.Name MYCT MMIN MMAX CACH CHMIN CHMAX PRP ERP
## 1 adviser 32/60 125 256 6000 256 16 128 198 199
## 2 amdahl 470v/7 29 8000 32000 32 8 32 269 253
## 3 amdahl 470v/7a 29 8000 32000 32 8 32 220 253
## 4 amdahl 470v/7b 29 8000 32000 32 8 32 172 253
## 5 amdahl 470v/7c 29 8000 16000 32 8 16 132 132
## 6 amdahl 470v/b 26 8000 32000 64 8 32 318 290
# Keep columns 3 through 9 (MYCT ... PRP), selected by name; this drops the
# vendor/model identifier columns and ERP.
dataset <- dataset[c("MYCT", "MMIN", "MMAX", "CACH", "CHMIN", "CHMAX", "PRP")]
head(dataset)
## MYCT MMIN MMAX CACH CHMIN CHMAX PRP
## 1 125 256 6000 256 16 128 198
## 2 29 8000 32000 32 8 32 269
## 3 29 8000 32000 32 8 32 220
## 4 29 8000 32000 32 8 32 172
## 5 29 8000 16000 32 8 16 132
## 6 26 8000 32000 64 8 32 318
# Check the structure: number of observations and the type of each column
str(dataset)
## 'data.frame': 209 obs. of 7 variables:
## $ MYCT : int 125 29 29 29 29 26 23 23 23 23 ...
## $ MMIN : int 256 8000 8000 8000 8000 8000 16000 16000 16000 32000 ...
## $ MMAX : int 6000 32000 32000 32000 16000 32000 32000 32000 64000 64000 ...
## $ CACH : int 256 32 32 32 32 64 64 64 64 128 ...
## $ CHMIN: int 16 8 8 8 8 8 16 16 16 32 ...
## $ CHMAX: int 128 32 32 32 16 32 32 32 32 64 ...
## $ PRP : int 198 269 220 172 132 318 367 489 636 1144 ...
# Per-column summary statistics (min, quartiles, median, mean, max)
summary(dataset)
## MYCT MMIN MMAX CACH
## Min. : 17.0 Min. : 64 Min. : 64 Min. : 0.00
## 1st Qu.: 50.0 1st Qu.: 768 1st Qu.: 4000 1st Qu.: 0.00
## Median : 110.0 Median : 2000 Median : 8000 Median : 8.00
## Mean : 203.8 Mean : 2868 Mean :11796 Mean : 25.21
## 3rd Qu.: 225.0 3rd Qu.: 4000 3rd Qu.:16000 3rd Qu.: 32.00
## Max. :1500.0 Max. :32000 Max. :64000 Max. :256.00
## CHMIN CHMAX PRP
## Min. : 0.000 Min. : 0.00 Min. : 6.0
## 1st Qu.: 1.000 1st Qu.: 5.00 1st Qu.: 27.0
## Median : 2.000 Median : 8.00 Median : 50.0
## Mean : 4.699 Mean : 18.27 Mean : 105.6
## 3rd Qu.: 6.000 3rd Qu.: 24.00 3rd Qu.: 113.0
## Max. :52.000 Max. :176.00 Max. :1150.0
# Plot the whole data frame: a scatterplot matrix of every pair of columns,
# useful for spotting which predictors relate to PRP.
plot(dataset, main = "dataset")
library(caTools)
# Split into training (75%) and test (25%) sets ----
# Fix the RNG seed so the random split is reproducible.
set.seed(23)
# sample.split() yields a logical vector; TRUE marks rows for the training set.
split <- sample.split(dataset$PRP, SplitRatio = 0.75)
training_set <- subset(dataset, split)
test_set <- subset(dataset, !split)
# View() opens a GUI data viewer, so only call it in an interactive session;
# in a batch run (Rscript, knitting) it would fail or pop up stray windows.
if (interactive()) {
  View(training_set)
  View(test_set)
}
# Sanity-check the sizes of the two partitions
dim(training_set)
## [1] 156 7
dim(test_set)
## [1] 53 7
# Fit a simple linear regression of PRP on MMAX using the training rows only,
# then print the coefficient table and the fit statistics.
regressor <- lm(formula = PRP ~ MMAX, data = training_set)
summary(regressor)
##
## Call:
## lm(formula = PRP ~ MMAX, data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -195.10 -34.94 -2.05 24.34 462.69
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.912e+01 9.345e+00 -3.116 0.00219 **
## MMAX 1.119e-02 5.998e-04 18.662 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 79.33 on 154 degrees of freedom
## Multiple R-squared: 0.6934, Adjusted R-squared: 0.6914
## F-statistic: 348.3 on 1 and 154 DF, p-value: < 2.2e-16
# Diagnostic plots for the fitted model
plot(regressor, main = "regressor")
# Predict PRP for the held-out rows; the response column is left out of newdata
y_pred <- predict(regressor, newdata = test_set[names(test_set) != "PRP"])
# Print the predictor columns of the test set for reference
test_set[names(test_set) != "PRP"]
## MYCT MMIN MMAX CACH CHMIN CHMAX
## 5 29 8000 16000 32 8 16
## 7 23 16000 32000 64 16 32
## 8 23 16000 32000 64 16 32
## 9 23 16000 64000 64 16 32
## 10 23 32000 64000 128 32 64
## 11 400 1000 3000 0 1 2
## 15 350 64 64 0 1 4
## 19 143 1000 2000 0 5 16
## 24 110 3100 6200 0 6 64
## 26 320 512 2000 4 1 3
## 28 320 256 3000 4 1 3
## 36 64 5240 20970 30 12 24
## 38 50 1000 4000 8 1 5
## 39 50 2000 8000 8 1 5
## 52 700 256 2000 0 1 1
## 54 200 1000 8000 0 1 2
## 55 110 1000 4000 16 1 2
## 58 800 256 8000 0 1 4
## 60 800 256 8000 0 1 4
## 68 105 256 2000 0 3 10
## 70 105 2000 4000 8 3 19
## 73 175 256 2000 0 3 24
## 77 300 768 4500 0 1 24
## 80 180 768 12000 6 1 31
## 81 330 1000 3000 0 2 4
## 83 300 1000 16000 8 2 112
## 84 330 1000 2000 0 1 2
## 85 330 1000 4000 0 3 6
## 88 140 2000 4000 8 1 20
## 101 203 1000 2000 0 1 5
## 109 900 512 1000 0 1 2
## 117 180 2000 16000 16 1 6
## 119 25 2000 12000 8 1 4
## 122 17 4000 16000 32 6 12
## 129 50 2000 16000 24 1 6
## 138 150 512 4000 0 8 128
## 147 60 4000 16000 64 5 8
## 153 35 8000 32000 64 8 24
## 154 38 16000 32000 128 16 32
## 160 56 1000 4000 0 1 6
## 165 56 4000 16000 0 1 8
## 166 38 4000 8000 32 16 32
## 168 38 8000 16000 64 4 8
## 181 160 2000 8000 32 1 13
## 182 240 512 1000 8 1 3
## 183 240 512 2000 8 1 5
## 184 105 2000 4000 8 3 8
## 193 26 8000 32000 128 24 32
## 195 50 2000 32000 24 6 26
## 196 50 2000 32000 48 26 52
## 198 50 4000 32000 112 52 104
## 203 180 262 4000 0 1 3
## 204 180 512 4000 0 1 3
# Predicted PRP for each test row; the names are the test rows' row labels
y_pred
## 5 7 8 9 10 11 15
## 149.988741 329.095836 329.095836 687.310026 687.310026 4.464226 -28.401926
## 19 24 26 28 36 38 39
## -6.729967 40.285645 -6.729967 4.464226 205.623882 15.658419 60.435193
## 52 54 55 58 60 68 70
## -6.729967 60.435193 15.658419 60.435193 60.435193 -6.729967 15.658419
## 73 77 80 81 83 84 85
## -6.729967 21.255516 105.211967 4.464226 149.988741 -6.729967 15.658419
## 88 101 109 117 119 122 129
## 15.658419 -6.729967 -17.924161 149.988741 105.211967 149.988741 149.988741
## 138 147 153 154 160 165 166
## 15.658419 149.988741 329.095836 329.095836 15.658419 149.988741 60.435193
## 168 181 182 183 184 193 195
## 149.988741 60.435193 -17.924161 -6.729967 15.658419 329.095836 329.095836
## 196 198 203 204
## 329.095836 329.095836 15.658419 15.658419
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.2
# Training set: observed points (red) with the fitted regression line (blue).
# Columns are referenced by bare name through `data =` rather than `df$col`.
ggplot(data = training_set, mapping = aes(x = MMAX)) +
  geom_point(aes(y = PRP), color = "red") +
  geom_line(
    aes(y = predict(regressor, newdata = training_set[-7])),
    color = "blue"
  ) +
  ggtitle("PRP vs MMAX (Training set)") +
  xlab("MMAX") +
  ylab("PRP")
# Visualizing test set results ----
# Test-set observations (red) against the regression line (blue). The line is
# still drawn from the training-set predictions: it is the same fitted model,
# so only the points change relative to the training plot.
ggplot() +
  geom_point(
    aes(x = MMAX, y = PRP),
    data = test_set,
    color = "red"
  ) +
  geom_line(
    aes(x = MMAX, y = predict(regressor, newdata = training_set[-7])),
    data = training_set,
    color = "blue"
  ) +
  ggtitle("PRP vs MMAX (Test set)") +
  xlab("MMAX") +
  ylab("PRP")
# Root-mean-squared error of the predictions on the held-out test set
rmse <- sqrt(mean((y_pred - test_set$PRP)^2))
rmse
## [1] 88.73718
# R-squared as reported by summary() for the fitted model, i.e. measured on
# the training data the model was fitted to (not a test-set metric).
R2 <- summary(regressor)[["r.squared"]]
R2
## [1] 0.6933942