# Load the computer-price dataset. file.choose() opens a file picker, so select
# Computer_Data.csv when prompted.
cData <- read.csv(file.choose())    #Computer_Data.csv
# View() opens a data viewer, which is only useful in an interactive session;
# skip it when the script is run non-interactively (Rscript, knitr, CI).
if (interactive()) {
  View(cData)
}
# Print the first rows as a quick sanity check of the import.
head(cData)
##   X price speed  hd ram screen cd multi premium ads trend
## 1 1  1499    25  80   4     14 no    no     yes  94     1
## 2 2  1795    33  85   2     14 no    no     yes  94     1
## 3 3  1595    25 170   4     15 no    no     yes  94     1
## 4 4  1849    25 170   8     14 no    no      no  94     1
## 5 5  3295    33 340  16     14 no    no     yes  94     1
## 6 6  3695    66 340  16     14 no    no     yes  94     1

# install.packages("fastDummies")  # run once if the package is not installed

library(fastDummies)
## Warning: package 'fastDummies' was built under R version 3.6.1

# Creating dummy variables

# Expand the categorical yes/no columns into 0/1 indicator columns. The original
# columns are kept and one <column>_<level> column per level is appended.
cDataWDummy <- dummy_cols(
  cData,
  select_columns = c("cd", "multi", "premium")
)
# Preview the widened data frame.
head(cDataWDummy)
##   X price speed  hd ram screen cd multi premium ads trend cd_no cd_yes
## 1 1  1499    25  80   4     14 no    no     yes  94     1     1      0
## 2 2  1795    33  85   2     14 no    no     yes  94     1     1      0
## 3 3  1595    25 170   4     15 no    no     yes  94     1     1      0
## 4 4  1849    25 170   8     14 no    no      no  94     1     1      0
## 5 5  3295    33 340  16     14 no    no     yes  94     1     1      0
## 6 6  3695    66 340  16     14 no    no     yes  94     1     1      0
##   multi_no multi_yes premium_yes premium_no
## 1        1         0           1          0
## 2        1         0           1          0
## 3        1         0           1          0
## 4        1         0           0          1
## 5        1         0           1          0
## 6        1         0           1          0
# attach(cDataWDummy) removed: the steps in this script pass their data frame
# explicitly (as the first argument or via data =), so putting these columns on
# the search path is unnecessary and only risks masking objects of the same name.

# Keep the yes = 1 / no = 0 indicator columns from the dummy data

# Modelling data: the response (price), the numeric predictors and one 0/1
# indicator per categorical column (the *_yes columns).
cDataDummied <- subset(
  cDataWDummy,
  select = c(
    "price", "speed", "hd", "ram", "screen",
    "cd_yes", "multi_yes", "premium_yes", "ads", "trend"
  )
)
# attach(cDataDummied) removed: the model and plot calls in this script receive
# the data frame explicitly, and attaching it masked identically named objects
# already on the search path. Use data = cDataDummied or cDataDummied$<column>.
# Preview the first rows of the modelling data frame.
head(cDataDummied)
##   price speed  hd ram screen cd_yes multi_yes premium_yes ads trend
## 1  1499    25  80   4     14      0         0           1  94     1
## 2  1795    33  85   2     14      0         0           1  94     1
## 3  1595    25 170   4     15      0         0           1  94     1
## 4  1849    25 170   8     14      0         0           0  94     1
## 5  3295    33 340  16     14      0         0           1  94     1
## 6  3695    66 340  16     14      0         0           1  94     1
# Per-column summary statistics (min, quartiles, mean, max) of the modelling data.
summary(cDataDummied)
##      price          speed              hd              ram        
##  Min.   : 949   Min.   : 25.00   Min.   :  80.0   Min.   : 2.000  
##  1st Qu.:1794   1st Qu.: 33.00   1st Qu.: 214.0   1st Qu.: 4.000  
##  Median :2144   Median : 50.00   Median : 340.0   Median : 8.000  
##  Mean   :2220   Mean   : 52.01   Mean   : 416.6   Mean   : 8.287  
##  3rd Qu.:2595   3rd Qu.: 66.00   3rd Qu.: 528.0   3rd Qu.: 8.000  
##  Max.   :5399   Max.   :100.00   Max.   :2100.0   Max.   :32.000  
##      screen          cd_yes         multi_yes       premium_yes    
##  Min.   :14.00   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:14.00   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:1.0000  
##  Median :14.00   Median :0.0000   Median :0.0000   Median :1.0000  
##  Mean   :14.61   Mean   :0.4646   Mean   :0.1395   Mean   :0.9022  
##  3rd Qu.:15.00   3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:1.0000  
##  Max.   :17.00   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##       ads            trend      
##  Min.   : 39.0   Min.   : 1.00  
##  1st Qu.:162.5   1st Qu.:10.00  
##  Median :246.0   Median :16.00  
##  Mean   :221.3   Mean   :15.93  
##  3rd Qu.:275.0   3rd Qu.:21.50  
##  Max.   :339.0   Max.   :35.00

# Find the correlation between the output (price) and the other variables

# Scatterplot matrix of every pair of columns.
# dev.new() opens a fresh graphics device on any platform; windows() exists only
# on Windows builds of R and errors elsewhere.
dev.new()
plot(cDataDummied)

# Pairwise correlation matrix of all columns (cor()'s default method); the
# "price" row shows how strongly each variable is correlated with price.
cor(cDataDummied)
##                   price       speed          hd         ram       screen
## price        1.00000000  0.30097646  0.43025779  0.62274824  0.296041474
## speed        0.30097646  1.00000000  0.37230410  0.23476050  0.189074122
## hd           0.43025779  0.37230410  1.00000000  0.77772630  0.232801530
## ram          0.62274824  0.23476050  0.77772630  1.00000000  0.208953740
## screen       0.29604147  0.18907412  0.23280153  0.20895374  1.000000000
## cd_yes       0.19734334  0.25825980  0.50357041  0.43850441  0.129487662
## multi_yes   -0.01665139  0.08417193  0.09280483  0.04549689 -0.001740414
## premium_yes -0.08069636  0.11420791  0.19692359  0.19714459  0.018745223
## ads          0.05454047 -0.21523206 -0.32322200 -0.18166971 -0.093919429
## trend       -0.19998694  0.40543833  0.57779013  0.27684384  0.188614445
##                  cd_yes    multi_yes premium_yes         ads       trend
## price        0.19734334 -0.016651388 -0.08069636  0.05454047 -0.19998694
## speed        0.25825980  0.084171934  0.11420791 -0.21523206  0.40543833
## hd           0.50357041  0.092804830  0.19692359 -0.32322200  0.57779013
## ram          0.43850441  0.045496894  0.19714459 -0.18166971  0.27684384
## screen       0.12948766 -0.001740414  0.01874522 -0.09391943  0.18861444
## cd_yes       1.00000000  0.432179298  0.21607660 -0.06109108  0.44578018
## multi_yes    0.43217930  1.000000000  0.12477474 -0.03039426  0.21090743
## premium_yes  0.21607660  0.124774741  1.00000000 -0.15202274  0.04210738
## ads         -0.06109108 -0.030394260 -0.15202274  1.00000000 -0.31855251
## trend        0.44578018  0.210907431  0.04210738 -0.31855251  1.00000000

# Multiple linear regression modelling

# Fit price on the hardware, indicator and trend variables.
# Fixes relative to the earlier call:
#   * a stray comma after cd_yes ended the formula early, so
#     `multi_yes + premium_yes + trend` was matched to lm()'s `subset` argument
#     (i.e. used as row indices) instead of entering the model as predictors;
#   * `screen` was listed twice in the formula.
# `ads` is left out, as in the earlier call -- NOTE(review): confirm that this
# omission is intentional.
# NOTE: the recorded output that follows in this file was produced by the
# earlier call and must be regenerated by re-running the script.
m1 <- lm(
  price ~ speed + hd + ram + screen + cd_yes + multi_yes + premium_yes + trend,
  data = cDataDummied
)
summary(m1)
## 
## Call:
## lm(formula = price ~ screen + speed + hd + ram + screen + cd_yes, 
##     data = cDataDummied, subset = multi_yes + premium_yes + trend)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -596.21  -77.23   26.44  132.75  363.41 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -52.16421   58.70266  -0.889    0.374    
## screen        29.11531    3.94043   7.389 1.67e-13 ***
## speed         19.62572    0.19545 100.412  < 2e-16 ***
## hd             6.01063    0.03374 178.169  < 2e-16 ***
## ram            1.02802    0.91979   1.118    0.264    
## cd_yes      -137.14897   12.90394 -10.628  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 197.3 on 6253 degrees of freedom
## Multiple R-squared:  0.9291, Adjusted R-squared:  0.929 
## F-statistic: 1.638e+04 on 5 and 6253 DF,  p-value: < 2.2e-16

# Checking collinearity with VIF (variance inflation factor) and influence measures

# install.packages("car")  # run once if the package is not installed

library(car)
## Warning: package 'car' was built under R version 3.6.1
## Loading required package: carData
# Influence diagnostics for m1: the plot call also prints the studentized
# residual, hat value and Cook's distance of the flagged observations.
# dev.new() is the portable replacement for the Windows-only windows().
dev.new()
influencePlot(m1)

##         StudRes          Hat        CookD
## 31    0.1428445 0.0037443631 1.278353e-05
## 31.1  0.1428445 0.0037443631 1.278353e-05
## 37   -3.0252205 0.0007633193 1.163683e-03
## 37.1 -3.0252205 0.0007633193 1.163683e-03
# Variance inflation factor for each predictor of m1 (the collinearity check).
vif(m1)
##   screen    speed       hd      ram   cd_yes 
## 1.086365 1.110826 1.915760 1.962322 1.101396
# Added-variable plots for the predictors of m1.
# dev.new() is the portable replacement for the Windows-only windows().
dev.new()
avPlots(m1)

## Akaike Information Criterion (AIC), 70% remedy ## install.packages("MASS")

library(MASS)
## Warning: package 'MASS' was built under R version 3.6.1
# Stepwise model selection by AIC: starting from m1, a term is dropped whenever
# doing so lowers the AIC. This selects variables; it does not remove
# observations (influential or otherwise) from the data.
stepAIC(m1)  # prints each step and the finally selected model
## Start:  AIC=66159.05
## price ~ screen + speed + hd + ram + screen + cd_yes
## 
##          Df  Sum of Sq        RSS   AIC
## - ram     1      48618  243416616 66158
## <none>                  243367998 66159
## - screen  1    2124864  245492862 66211
## - cd_yes  1    4396587  247764585 66269
## - speed   1  392415090  635783088 72167
## - hd      1 1235486115 1478854113 77451
## 
## Step:  AIC=66158.3
## price ~ screen + speed + hd + cd_yes
## 
##          Df  Sum of Sq        RSS   AIC
## <none>                  243416616 66158
## - screen  1    2094755  245511371 66210
## - cd_yes  1    4632691  248049307 66274
## - speed   1  397763388  641180003 72218
## - hd      1 2320327061 2563743677 80893
## 
## Call:
## lm(formula = price ~ screen + speed + hd + cd_yes, data = cDataDummied, 
##     subset = multi_yes + premium_yes + trend)
## 
## Coefficients:
## (Intercept)       screen        speed           hd       cd_yes  
##     -34.616       28.050       19.598        6.036     -139.263
# Final model: price on the four predictors kept by the stepAIC() run recorded
# above, fitted on all rows of cDataDummied.
finalmodel <- lm(
  formula = price ~ screen + speed + hd + cd_yes,
  data = cDataDummied
)
# Coefficient table and goodness-of-fit statistics for the final model.
summary(finalmodel)
## 
## Call:
## lm(formula = price ~ screen + speed + hd + cd_yes, data = cDataDummied)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1275.67  -370.03   -34.07   312.39  2369.95 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -94.15335  104.23941  -0.903 0.366433    
## screen      123.23686    7.30241  16.876  < 2e-16 ***
## speed         3.95634    0.32859  12.040  < 2e-16 ***
## hd            0.79387    0.03025  26.241  < 2e-16 ***
## cd_yes      -49.72807   14.87823  -3.342 0.000836 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 505.2 on 6254 degrees of freedom
## Multiple R-squared:  0.2439, Adjusted R-squared:  0.2434 
## F-statistic: 504.2 on 4 and 6254 DF,  p-value: < 2.2e-16

# Prediction:

# Predicted price for every row of the modelling data.
# predict() on an lm fit takes the data frame through `newdata`; the earlier
# `data =` is not a recognised argument and was silently ignored, so the call
# only returned the fitted values of the training rows.
cPricePredicted <- predict(finalmodel, newdata = cDataDummied)
# Show the observed columns side by side with the prediction.
head(data.frame(cDataDummied, cPricePredicted))
##   price speed  hd ram screen cd_yes multi_yes premium_yes ads trend
## 1  1499    25  80   4     14      0         0           1  94     1
## 2  1795    33  85   2     14      0         0           1  94     1
## 3  1595    25 170   4     15      0         0           1  94     1
## 4  1849    25 170   8     14      0         0           0  94     1
## 5  3295    33 340  16     14      0         0           1  94     1
## 6  3695    66 340  16     14      0         0           1  94     1
##   cPricePredicted
## 1        1793.581
## 2        1829.201
## 3        1988.266
## 4        1865.029
## 5        2031.638
## 6        2162.197