# Load the computer-price data set. A file picker opens so the user can
# select Computer_Data.csv interactively.
cData <- read.csv(file = file.choose())
# Open the data viewer, then print the first six rows to the console.
View(cData)
head(cData, n = 6L)
## X price speed hd ram screen cd multi premium ads trend
## 1 1 1499 25 80 4 14 no no yes 94 1
## 2 2 1795 33 85 2 14 no no yes 94 1
## 3 3 1595 25 170 4 15 no no yes 94 1
## 4 4 1849 25 170 8 14 no no no 94 1
## 5 5 3295 33 340 16 14 no no yes 94 1
## 6 6 3695 66 340 16 14 no no yes 94 1
library(fastDummies)
## Warning: package 'fastDummies' was built under R version 3.6.1
# Expand the three yes/no columns into 0/1 indicator columns
# (cd_no, cd_yes, multi_no, ...); the original columns are kept alongside.
cDataWDummy <- dummy_cols(
  cData,
  select_columns = c("cd", "multi", "premium")
)
head(cDataWDummy, n = 6L)
## X price speed hd ram screen cd multi premium ads trend cd_no cd_yes
## 1 1 1499 25 80 4 14 no no yes 94 1 1 0
## 2 2 1795 33 85 2 14 no no yes 94 1 1 0
## 3 3 1595 25 170 4 15 no no yes 94 1 1 0
## 4 4 1849 25 170 8 14 no no no 94 1 1 0
## 5 5 3295 33 340 16 14 no no yes 94 1 1 0
## 6 6 3695 66 340 16 14 no no yes 94 1 1 0
## multi_no multi_yes premium_yes premium_no
## 1 1 0 1 0
## 2 1 0 1 0
## 3 1 0 1 0
## 4 1 0 0 1
## 5 1 0 1 0
## 6 1 0 1 0
# NOTE(review): attach() places the columns on the search path. The model
# calls in this section pass `data =` explicitly, so confirm whether the
# attach() calls are still needed before removing them.
attach(cDataWDummy)
# Keep the response, the numeric predictors and one indicator (the *_yes
# column) per binary variable; every other column is dropped.
cDataDummied <- cDataWDummy[
  ,
  c("price", "speed", "hd", "ram", "screen",
    "cd_yes", "multi_yes", "premium_yes", "ads", "trend"),
  drop = FALSE
]
attach(cDataDummied)
## The following objects are masked from cDataWDummy:
##
## ads, cd_yes, hd, multi_yes, premium_yes, price, ram, screen,
## speed, trend
# Preview the first six rows of the modelling data frame.
head(cDataDummied, n = 6L)
## price speed hd ram screen cd_yes multi_yes premium_yes ads trend
## 1 1499 25 80 4 14 0 0 1 94 1
## 2 1795 33 85 2 14 0 0 1 94 1
## 3 1595 25 170 4 15 0 0 1 94 1
## 4 1849 25 170 8 14 0 0 0 94 1
## 5 3295 33 340 16 14 0 0 1 94 1
## 6 3695 66 340 16 14 0 0 1 94 1
# Per-column summary statistics (min, quartiles, mean, max).
summary(object = cDataDummied)
## price speed hd ram
## Min. : 949 Min. : 25.00 Min. : 80.0 Min. : 2.000
## 1st Qu.:1794 1st Qu.: 33.00 1st Qu.: 214.0 1st Qu.: 4.000
## Median :2144 Median : 50.00 Median : 340.0 Median : 8.000
## Mean :2220 Mean : 52.01 Mean : 416.6 Mean : 8.287
## 3rd Qu.:2595 3rd Qu.: 66.00 3rd Qu.: 528.0 3rd Qu.: 8.000
## Max. :5399 Max. :100.00 Max. :2100.0 Max. :32.000
## screen cd_yes multi_yes premium_yes
## Min. :14.00 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:14.00 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:1.0000
## Median :14.00 Median :0.0000 Median :0.0000 Median :1.0000
## Mean :14.61 Mean :0.4646 Mean :0.1395 Mean :0.9022
## 3rd Qu.:15.00 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :17.00 Max. :1.0000 Max. :1.0000 Max. :1.0000
## ads trend
## Min. : 39.0 Min. : 1.00
## 1st Qu.:162.5 1st Qu.:10.00
## Median :246.0 Median :16.00
## Mean :221.3 Mean :15.93
## 3rd Qu.:275.0 3rd Qu.:21.50
## Max. :339.0 Max. :35.00
# Open a fresh graphics device for the scatterplot matrix. dev.new() replaces
# windows() because windows() exists only in Windows builds of R and fails
# on other platforms.
dev.new()
# Pairwise scatterplots of every column, followed by the correlation matrix.
plot(cDataDummied)
cor(cDataDummied)
## price speed hd ram screen
## price 1.00000000 0.30097646 0.43025779 0.62274824 0.296041474
## speed 0.30097646 1.00000000 0.37230410 0.23476050 0.189074122
## hd 0.43025779 0.37230410 1.00000000 0.77772630 0.232801530
## ram 0.62274824 0.23476050 0.77772630 1.00000000 0.208953740
## screen 0.29604147 0.18907412 0.23280153 0.20895374 1.000000000
## cd_yes 0.19734334 0.25825980 0.50357041 0.43850441 0.129487662
## multi_yes -0.01665139 0.08417193 0.09280483 0.04549689 -0.001740414
## premium_yes -0.08069636 0.11420791 0.19692359 0.19714459 0.018745223
## ads 0.05454047 -0.21523206 -0.32322200 -0.18166971 -0.093919429
## trend -0.19998694 0.40543833 0.57779013 0.27684384 0.188614445
## cd_yes multi_yes premium_yes ads trend
## price 0.19734334 -0.016651388 -0.08069636 0.05454047 -0.19998694
## speed 0.25825980 0.084171934 0.11420791 -0.21523206 0.40543833
## hd 0.50357041 0.092804830 0.19692359 -0.32322200 0.57779013
## ram 0.43850441 0.045496894 0.19714459 -0.18166971 0.27684384
## screen 0.12948766 -0.001740414 0.01874522 -0.09391943 0.18861444
## cd_yes 1.00000000 0.432179298 0.21607660 -0.06109108 0.44578018
## multi_yes 0.43217930 1.000000000 0.12477474 -0.03039426 0.21090743
## premium_yes 0.21607660 0.124774741 1.00000000 -0.15202274 0.04210738
## ads -0.06109108 -0.030394260 -0.15202274 1.00000000 -0.31855251
## trend 0.44578018 0.210907431 0.04210738 -0.31855251 1.00000000
# Full linear model of price on the hardware, indicator and trend predictors.
# Every predictor must be joined with `+` inside the formula. Previously a
# comma split the formula, so `multi_yes + premium_yes + trend` was matched to
# lm()'s `subset` argument (visible in the recorded Call) and used as row
# indices instead of entering the model. The duplicated `screen` term is
# removed as well.
# NOTE: the recorded outputs for m1 that follow in this file were produced by
# the earlier call and need to be regenerated.
m1 <- lm(
  price ~ speed + hd + ram + screen + cd_yes + multi_yes + premium_yes + trend,
  data = cDataDummied
)
summary(m1)
##
## Call:
## lm(formula = price ~ screen + speed + hd + ram + screen + cd_yes,
## data = cDataDummied, subset = multi_yes + premium_yes + trend)
##
## Residuals:
## Min 1Q Median 3Q Max
## -596.21 -77.23 26.44 132.75 363.41
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -52.16421 58.70266 -0.889 0.374
## screen 29.11531 3.94043 7.389 1.67e-13 ***
## speed 19.62572 0.19545 100.412 < 2e-16 ***
## hd 6.01063 0.03374 178.169 < 2e-16 ***
## ram 1.02802 0.91979 1.118 0.264
## cd_yes -137.14897 12.90394 -10.628 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 197.3 on 6253 degrees of freedom
## Multiple R-squared: 0.9291, Adjusted R-squared: 0.929
## F-statistic: 1.638e+04 on 5 and 6253 DF, p-value: < 2.2e-16
library(car)
## Warning: package 'car' was built under R version 3.6.1
## Loading required package: carData
# Open a fresh graphics device; dev.new() is used because windows() exists
# only in Windows builds of R.
dev.new()
# Influence plot for m1; the most noteworthy observations are also printed.
influencePlot(m1)
## StudRes Hat CookD
## 31 0.1428445 0.0037443631 1.278353e-05
## 31.1 0.1428445 0.0037443631 1.278353e-05
## 37 -3.0252205 0.0007633193 1.163683e-03
## 37.1 -3.0252205 0.0007633193 1.163683e-03
# Variance inflation factor of each predictor in m1 (collinearity check).
vif(mod = m1)
## screen speed hd ram cd_yes
## 1.086365 1.110826 1.915760 1.962322 1.101396
# Open a fresh graphics device; dev.new() is used because windows() exists
# only in Windows builds of R.
dev.new()
# Added-variable plots for each predictor in m1.
avPlots(m1)
## Akaike Information Criterion (AIC) stepwise selection ("70% remedy"); requires MASS: install.packages("MASS")
library(MASS)
## Warning: package 'MASS' was built under R version 3.6.1
# Stepwise term selection by AIC starting from m1: each step drops the
# predictor whose removal lowers AIC the most (it removes model terms, not
# data records). The selected model is printed, not stored.
stepAIC(object = m1)
## Start: AIC=66159.05
## price ~ screen + speed + hd + ram + screen + cd_yes
##
## Df Sum of Sq RSS AIC
## - ram 1 48618 243416616 66158
## <none> 243367998 66159
## - screen 1 2124864 245492862 66211
## - cd_yes 1 4396587 247764585 66269
## - speed 1 392415090 635783088 72167
## - hd 1 1235486115 1478854113 77451
##
## Step: AIC=66158.3
## price ~ screen + speed + hd + cd_yes
##
## Df Sum of Sq RSS AIC
## <none> 243416616 66158
## - screen 1 2094755 245511371 66210
## - cd_yes 1 4632691 248049307 66274
## - speed 1 397763388 641180003 72218
## - hd 1 2320327061 2563743677 80893
##
## Call:
## lm(formula = price ~ screen + speed + hd + cd_yes, data = cDataDummied,
## subset = multi_yes + premium_yes + trend)
##
## Coefficients:
## (Intercept) screen speed hd cd_yes
## -34.616 28.050 19.598 6.036 -139.263
# Refit on the full data set using the predictors retained by the stepAIC()
# run recorded above.
# NOTE(review): the formula is hard-coded; re-check it if m1 changes.
finalmodel <- lm(
  formula = price ~ screen + speed + hd + cd_yes,
  data = cDataDummied
)
summary(object = finalmodel)
##
## Call:
## lm(formula = price ~ screen + speed + hd + cd_yes, data = cDataDummied)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1275.67 -370.03 -34.07 312.39 2369.95
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -94.15335 104.23941 -0.903 0.366433
## screen 123.23686 7.30241 16.876 < 2e-16 ***
## speed 3.95634 0.32859 12.040 < 2e-16 ***
## hd 0.79387 0.03025 26.241 < 2e-16 ***
## cd_yes -49.72807 14.87823 -3.342 0.000836 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 505.2 on 6254 degrees of freedom
## Multiple R-squared: 0.2439, Adjusted R-squared: 0.2434
## F-statistic: 504.2 on 4 and 6254 DF, p-value: < 2.2e-16
# Predict prices for the rows of cDataDummied. predict.lm() receives new data
# through `newdata`; the previous `data =` spelling is not a parameter of
# predict.lm(), so it was silently absorbed by `...` and whatever was passed
# there was ignored (the fitted values were returned regardless).
cPricePredicted <- predict(finalmodel, newdata = cDataDummied)
# Show actual and predicted prices side by side.
head(data.frame(cDataDummied, cPricePredicted))
## price speed hd ram screen cd_yes multi_yes premium_yes ads trend
## 1 1499 25 80 4 14 0 0 1 94 1
## 2 1795 33 85 2 14 0 0 1 94 1
## 3 1595 25 170 4 15 0 0 1 94 1
## 4 1849 25 170 8 14 0 0 0 94 1
## 5 3295 33 340 16 14 0 0 1 94 1
## 6 3695 66 340 16 14 0 0 1 94 1
## cPricePredicted
## 1 1793.581
## 2 1829.201
## 3 1988.266
## 4 1865.029
## 5 2031.638
## 6 2162.197