Multiple Linear Regression

Computer Dataset

Assignment 13

# Load the computer price dataset into `mydata`.
# NOTE(review): the path is absolute and machine-specific; point it at the
# local copy of Computer_Data.csv when running elsewhere.
# stringsAsFactors = TRUE keeps the yes/no columns (cd, multi, premium) as
# factors on R >= 4.0 as well, where read.csv() no longer converts strings by
# default; the factor-level counts in the summary() output rely on this.
mydata <- read.csv(
  "C:\\Users\\RISHI RAHUL\\Desktop\\DS\\3 MLR\\Assignment\\Computer_Data.csv",
  stringsAsFactors = TRUE
)

# attach(mydata) was removed: the visible analysis always refers to the data
# frame explicitly (data = mydata or mydata[...]), so attaching it only added
# masking risk on the search path.
colnames(mydata)
##  [1] "price"   "speed"   "hd"      "ram"     "screen"  "cd"      "multi"  
##  [8] "premium" "ads"     "trend"
summary(mydata)
##      price          speed              hd              ram        
##  Min.   : 949   Min.   : 25.00   Min.   :  80.0   Min.   : 2.000  
##  1st Qu.:1794   1st Qu.: 33.00   1st Qu.: 214.0   1st Qu.: 4.000  
##  Median :2144   Median : 50.00   Median : 340.0   Median : 8.000  
##  Mean   :2220   Mean   : 52.01   Mean   : 416.6   Mean   : 8.287  
##  3rd Qu.:2595   3rd Qu.: 66.00   3rd Qu.: 528.0   3rd Qu.: 8.000  
##  Max.   :5399   Max.   :100.00   Max.   :2100.0   Max.   :32.000  
##      screen        cd       multi      premium         ads       
##  Min.   :14.00   no :3351   no :5386   no : 612   Min.   : 39.0  
##  1st Qu.:14.00   yes:2908   yes: 873   yes:5647   1st Qu.:162.5  
##  Median :14.00                                    Median :246.0  
##  Mean   :14.61                                    Mean   :221.3  
##  3rd Qu.:15.00                                    3rd Qu.:275.0  
##  Max.   :17.00                                    Max.   :339.0  
##      trend      
##  Min.   : 1.00  
##  1st Qu.:10.00  
##  Median :16.00  
##  Mean   :15.93  
##  3rd Qu.:21.50  
##  Max.   :35.00
# Correlation analysis on the numeric columns only.
# The categorical columns are excluded by name instead of by position
# (they are columns 6, 7 and 8 in the colnames() listing), so the selection
# keeps working if the column order of the CSV ever changes.
categorical_cols <- c("cd", "multi", "premium")
numeric_data <- mydata[, !(names(mydata) %in% categorical_cols), drop = FALSE]

# Pairwise correlation matrix; stored so it can be reused for the partial
# correlations below without recomputing it.
numeric_cor <- cor(numeric_data)
numeric_cor
##              price      speed         hd        ram      screen
## price   1.00000000  0.3009765  0.4302578  0.6227482  0.29604147
## speed   0.30097646  1.0000000  0.3723041  0.2347605  0.18907412
## hd      0.43025779  0.3723041  1.0000000  0.7777263  0.23280153
## ram     0.62274824  0.2347605  0.7777263  1.0000000  0.20895374
## screen  0.29604147  0.1890741  0.2328015  0.2089537  1.00000000
## ads     0.05454047 -0.2152321 -0.3232220 -0.1816697 -0.09391943
## trend  -0.19998694  0.4054383  0.5777901  0.2768438  0.18861444
##                ads      trend
## price   0.05454047 -0.1999869
## speed  -0.21523206  0.4054383
## hd     -0.32322200  0.5777901
## ram    -0.18166971  0.2768438
## screen -0.09391943  0.1886144
## ads     1.00000000 -0.3185525
## trend  -0.31855251  1.0000000
# Scatterplot matrix of the same numeric columns.
plot(numeric_data)

library(corpcor)
# Partial correlations derived from the correlation matrix. The result is
# printed without dimnames; rows/columns follow the order of numeric_cor.
cor2pcor(numeric_cor)
##            [,1]        [,2]        [,3]        [,4]        [,5]
## [1,]  1.0000000  0.47421976  0.27854293  0.45050159  0.33404481
## [2,]  0.4742198  1.00000000 -0.04075062 -0.22416243 -0.07534729
## [3,]  0.2785429 -0.04075062  1.00000000  0.54127682 -0.07038820
## [4,]  0.4505016 -0.22416243  0.54127682  1.00000000 -0.08767010
## [5,]  0.3340448 -0.07534729 -0.07038820 -0.08767010  1.00000000
## [6,]  0.2113623 -0.15770602 -0.20201263 -0.03917875 -0.07686047
## [7,] -0.6610516  0.45620752  0.55770935  0.08558243  0.26462477
##             [,6]        [,7]
## [1,]  0.21136235 -0.66105162
## [2,] -0.15770602  0.45620752
## [3,] -0.20201263  0.55770935
## [4,] -0.03917875  0.08558243
## [5,] -0.07686047  0.26462477
## [6,]  1.00000000  0.05126715
## [7,]  0.05126715  1.00000000
# Baseline multiple linear regression: price regressed on every other column
# of mydata.
model <- lm(
  formula = price ~ .,
  data = mydata
)
summary(model) # Multiple R-squared reported below: 0.7756
## 
## Call:
## lm(formula = price ~ ., data = mydata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1093.77  -174.24   -11.49   146.49  2001.05 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  307.98798   60.35341   5.103 3.44e-07 ***
## speed          9.32028    0.18506  50.364  < 2e-16 ***
## hd             0.78178    0.02761  28.311  < 2e-16 ***
## ram           48.25596    1.06608  45.265  < 2e-16 ***
## screen       123.08904    3.99950  30.776  < 2e-16 ***
## cdyes         60.91671    9.51559   6.402 1.65e-10 ***
## multiyes     104.32382   11.41268   9.141  < 2e-16 ***
## premiumyes  -509.22473   12.34225 -41.259  < 2e-16 ***
## ads            0.65729    0.05132  12.809  < 2e-16 ***
## trend        -51.84958    0.62871 -82.470  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 275.3 on 6249 degrees of freedom
## Multiple R-squared:  0.7756, Adjusted R-squared:  0.7752 
## F-statistic:  2399 on 9 and 6249 DF,  p-value: < 2.2e-16
# Influence and collinearity diagnostics for the baseline model, using the
# car package.
library(car)
## Warning: package 'car' was built under R version 3.5.1
## Loading required package: carData
#influence.measures(model)
# Per-observation diagnostic plots for spotting influential rows.
influenceIndexPlot(model)

# The table printed below lists the noteworthy observations
# (rows 1441, 1701, 3784 and 4478) with their StudRes, Hat and CookD values.
influencePlot(model)

##         StudRes         Hat       CookD
## 1441  7.3058529 0.002228075 0.011819949
## 1701  7.1838002 0.002464463 0.012647347
## 3784 -0.8667018 0.020972880 0.001609237
## 4478 -1.3795547 0.020060286 0.003895407
# Variance inflation factors per predictor; the largest value printed below
# is about 4.21 (hd).
vif(model)
##    speed       hd      ram   screen       cd    multi  premium      ads 
## 1.265364 4.207395 2.974628 1.081644 1.859370 1.290568 1.109388 1.217218 
##    trend 
## 2.022790
# Refit the same linear specification after dropping the four observations
# listed in the influencePlot() table (rows 1441, 1701, 3784 and 4478).
model2 <- lm(
  formula = price ~ speed + hd + ram + screen + ads + trend + cd + multi + premium,
  data = mydata[-c(1441, 1701, 3784, 4478), ]
)
summary(model2) # R-squared: 0.7774
## 
## Call:
## lm(formula = price ~ speed + hd + ram + screen + ads + trend + 
##     cd + multi + premium, data = mydata[-c(1441, 1701, 3784, 
##     4478), ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1094.21  -173.10   -10.94   146.35  1509.23 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  336.94855   59.92166   5.623 1.96e-08 ***
## speed          9.29551    0.18355  50.642  < 2e-16 ***
## hd             0.78355    0.02795  28.030  < 2e-16 ***
## ram           48.29482    1.06764  45.235  < 2e-16 ***
## screen       121.07383    3.97118  30.488  < 2e-16 ***
## ads            0.65465    0.05095  12.849  < 2e-16 ***
## trend        -51.74355    0.62676 -82.558  < 2e-16 ***
## cdyes         60.31315    9.44029   6.389 1.79e-10 ***
## multiyes     104.85186   11.31912   9.263  < 2e-16 ***
## premiumyes  -510.03064   12.24104 -41.666  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 273.1 on 6245 degrees of freedom
## Multiple R-squared:  0.7774, Adjusted R-squared:  0.777 
## F-statistic:  2423 on 9 and 6245 DF,  p-value: < 2.2e-16
# Stepwise AIC selection starting from the baseline model. In the trace
# printed below the <none> row has the lowest AIC, so no predictor is dropped.
library(MASS)

stepAIC(object = model)
## Start:  AIC=70336.65
## price ~ speed + hd + ram + screen + cd + multi + premium + ads + 
##     trend
## 
##           Df Sum of Sq       RSS   AIC
## <none>                 473783875 70337
## - cd       1   3107211 476891087 70376
## - multi    1   6335218 480119093 70418
## - ads      1  12439298 486223174 70497
## - hd       1  60768013 534551889 71090
## - screen   1  71812147 545596023 71218
## - premium  1 129062420 602846296 71843
## - ram      1 155342777 629126653 72110
## - speed    1 192316497 666100373 72467
## - trend    1 515661043 989444918 74944
## 
## Call:
## lm(formula = price ~ speed + hd + ram + screen + cd + multi + 
##     premium + ads + trend, data = mydata)
## 
## Coefficients:
## (Intercept)        speed           hd          ram       screen  
##    307.9880       9.3203       0.7818      48.2560     123.0890  
##       cdyes     multiyes   premiumyes          ads        trend  
##     60.9167     104.3238    -509.2247       0.6573     -51.8496
# Added-variable plots for the baseline model (car package).
avPlots(model)

# Log-transformed specification: log(price) on log(speed), log(hd), log(ram)
# and the remaining predictors untransformed, fitted without rows 1441, 1701,
# 3784 and 4478.
Final_Model <- lm(
  formula = log(price) ~ log(speed) + log(hd) + log(ram) + screen + ads +
    trend + cd + multi + premium,
  data = mydata[-c(1441, 1701, 3784, 4478), ]
)
summary(Final_Model) # R-squared: 0.7918
## 
## Call:
## lm(formula = log(price) ~ log(speed) + log(hd) + log(ram) + screen + 
##     ads + trend + cd + multi + premium, data = mydata[-c(1441, 
##     1701, 3784, 4478), ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.48044 -0.07549 -0.00289  0.07381  0.42541 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.492e+00  3.362e-02 163.329  < 2e-16 ***
## log(speed)   2.079e-01  4.130e-03  50.332  < 2e-16 ***
## log(hd)      1.434e-01  5.311e-03  27.003  < 2e-16 ***
## log(ram)     1.819e-01  4.447e-03  40.905  < 2e-16 ***
## screen       4.729e-02  1.719e-03  27.507  < 2e-16 ***
## ads          1.957e-04  2.166e-05   9.034  < 2e-16 ***
## trend       -2.275e-02  2.759e-04 -82.470  < 2e-16 ***
## cdyes        4.635e-02  4.087e-03  11.341  < 2e-16 ***
## multiyes     3.398e-02  4.850e-03   7.005 2.73e-12 ***
## premiumyes  -2.315e-01  5.301e-03 -43.673  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1176 on 6245 degrees of freedom
## Multiple R-squared:  0.7918, Adjusted R-squared:  0.7915 
## F-statistic:  2638 on 9 and 6245 DF,  p-value: < 2.2e-16
# Evaluate model LINE assumptions (Linearity, Independence, Normality,
# Equal variance) for the log-transformed model.
# plot() on the fitted lm object draws its diagnostic plots.
plot(Final_Model)

# Histogram of the model residuals as a visual normality check.
hist(residuals(Final_Model)) # close to normal distribution

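Final_Model (the log-transformed model fitted without the four influential observations) is the final model, with an R-squared of 0.7918, i.e. it explains about 79.18 % of the variance in log(price); model2 reaches only 0.7774.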