# Predict the price of a computer with multiple linear regression.
comp <- read.csv(
  "C:\\Users\\prakruthi\\Desktop\\datascience assignments\\multi linear regression\\Computer_Data.csv"
)
View(comp)

# cd, multi and premium are categorical columns. Recode each one to the
# numeric code of its factor level (1, 2, ...) so it can enter the regression
# as a number, and inspect the data frame after every recoding step.
comp[["cd"]] <- as.numeric(factor(comp[["cd"]]))
View(comp)
comp[["multi"]] <- as.numeric(factor(comp[["multi"]]))
View(comp)
comp[["premium"]] <- as.numeric(factor(comp[["premium"]]))
View(comp)
# Exploratory data analysis: per-column summary statistics of the recoded data
# (the console output is reproduced in the `##` lines that follow).
summary(comp)
## X price speed hd
## Min. : 1 Min. : 949 Min. : 25.00 Min. : 80.0
## 1st Qu.:1566 1st Qu.:1794 1st Qu.: 33.00 1st Qu.: 214.0
## Median :3130 Median :2144 Median : 50.00 Median : 340.0
## Mean :3130 Mean :2220 Mean : 52.01 Mean : 416.6
## 3rd Qu.:4694 3rd Qu.:2595 3rd Qu.: 66.00 3rd Qu.: 528.0
## Max. :6259 Max. :5399 Max. :100.00 Max. :2100.0
## ram screen cd multi
## Min. : 2.000 Min. :14.00 Min. :1.000 Min. :1.000
## 1st Qu.: 4.000 1st Qu.:14.00 1st Qu.:1.000 1st Qu.:1.000
## Median : 8.000 Median :14.00 Median :1.000 Median :1.000
## Mean : 8.287 Mean :14.61 Mean :1.465 Mean :1.139
## 3rd Qu.: 8.000 3rd Qu.:15.00 3rd Qu.:2.000 3rd Qu.:1.000
## Max. :32.000 Max. :17.00 Max. :2.000 Max. :2.000
## premium ads trend
## Min. :1.000 Min. : 39.0 Min. : 1.00
## 1st Qu.:2.000 1st Qu.:162.5 1st Qu.:10.00
## Median :2.000 Median :246.0 Median :16.00
## Mean :1.902 Mean :221.3 Mean :15.93
## 3rd Qu.:2.000 3rd Qu.:275.0 3rd Qu.:21.50
## Max. :2.000 Max. :339.0 Max. :35.00
# Confirm the container type returned by read.csv().
class(comp)
## [1] "data.frame"
# Show each column's storage type together with its first few values.
str(comp)
## 'data.frame': 6259 obs. of 11 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ price : int 1499 1795 1595 1849 3295 3695 1720 1995 2225 2575 ...
## $ speed : int 25 33 25 25 33 66 25 50 50 50 ...
## $ hd : int 80 85 170 170 340 340 170 85 210 210 ...
## $ ram : int 4 2 4 8 16 16 4 2 8 4 ...
## $ screen : int 14 14 15 14 14 14 14 14 14 15 ...
## $ cd : num 1 1 1 1 1 1 2 1 1 1 ...
## $ multi : num 1 1 1 1 1 1 1 1 1 1 ...
## $ premium: num 2 2 2 1 2 2 2 2 2 2 ...
## $ ads : int 94 94 94 94 94 94 94 94 94 94 ...
## $ trend : int 1 1 1 1 1 1 1 1 1 1 ...
# Drop the X column: str(comp) shows it is only a running row index
# (1, 2, 3, ...), so it carries no information about price.
# The column is removed by name rather than by position so the selection does
# not silently pick the wrong columns if the CSV layout ever changes.
comp1 <- subset(comp, select = -X)
View(comp1)
# Scatterplot matrix of the modelling data. Plot comp1 rather than comp so the
# X row index does not add a meaningless row and column of panels.
plot(comp1)
colnames(comp1)
## [1] "price" "speed" "hd" "ram" "screen" "cd" "multi"
## [8] "premium" "ads" "trend"
# Baseline model: price regressed on all nine predictors, using every row.
model <- lm(
  price ~ speed + hd + ram + screen + cd + multi + premium + ads + trend,
  data = comp1
)
summary(model)
##
## Call:
## lm(formula = price ~ speed + hd + ram + screen + cd + multi +
## premium + ads + trend, data = comp1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1093.77 -174.24 -11.49 146.49 2001.05
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 651.97219 64.49224 10.109 < 2e-16 ***
## speed 9.32028 0.18506 50.364 < 2e-16 ***
## hd 0.78178 0.02761 28.311 < 2e-16 ***
## ram 48.25596 1.06608 45.265 < 2e-16 ***
## screen 123.08904 3.99950 30.776 < 2e-16 ***
## cd 60.91671 9.51559 6.402 1.65e-10 ***
## multi 104.32382 11.41268 9.141 < 2e-16 ***
## premium -509.22473 12.34225 -41.259 < 2e-16 ***
## ads 0.65729 0.05132 12.809 < 2e-16 ***
## trend -51.84958 0.62871 -82.470 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 275.3 on 6249 degrees of freedom
## Multiple R-squared: 0.7756, Adjusted R-squared: 0.7752
## F-statistic: 2399 on 9 and 6249 DF, p-value: < 2.2e-16
library(mvinfluence)
## Loading required package: car
## Loading required package: carData
## Loading required package: heplots
# Influence diagnostics for the baseline model. The table printed below lists
# the flagged observations with their studentized residual (StudRes), hat
# value (Hat) and Cook's distance (CookD).
influencePlot(model)
## StudRes Hat CookD
## 1441 7.3058529 0.002228075 0.011819949
## 1701 7.1838002 0.002464463 0.012647347
## 3784 -0.8667018 0.020972880 0.001609237
## 4478 -1.3795547 0.020060286 0.003895407
# Refit without row 1701, the observation with the largest Cook's distance in
# the influencePlot() table, to see how much a single point moves the fit.
m1 <- lm(
  price ~ speed + hd + ram + screen + cd + multi + premium + ads + trend,
  data = comp1[-1701, ]
)
summary(m1)
##
## Call:
## lm(formula = price ~ speed + hd + ram + screen + cd + multi +
## premium + ads + trend, data = comp1[-1701, ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1094.71 -173.87 -11.22 146.49 2004.57
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 666.84056 64.26601 10.376 < 2e-16 ***
## speed 9.31082 0.18432 50.515 < 2e-16 ***
## hd 0.77879 0.02751 28.313 < 2e-16 ***
## ram 48.40809 1.06200 45.582 < 2e-16 ***
## screen 122.10442 3.98576 30.635 < 2e-16 ***
## cd 59.58151 9.47912 6.286 3.49e-10 ***
## multi 105.13252 11.36730 9.249 < 2e-16 ***
## premium -509.37799 12.29260 -41.438 < 2e-16 ***
## ads 0.65399 0.05111 12.796 < 2e-16 ***
## trend -51.74073 0.62636 -82.605 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 274.2 on 6248 degrees of freedom
## Multiple R-squared: 0.7766, Adjusted R-squared: 0.7763
## F-statistic: 2413 on 9 and 6248 DF, p-value: < 2.2e-16
# Refit without all four observations flagged by influencePlot()
# (rows 1701, 1441, 4478 and 3784).
m2 <- lm(
  price ~ speed + hd + ram + screen + cd + multi + premium + ads + trend,
  data = comp1[-c(1701, 1441, 4478, 3784), ]
)
summary(m2)
##
## Call:
## lm(formula = price ~ speed + hd + ram + screen + cd + multi +
## premium + ads + trend, data = comp1[-c(1701, 1441, 4478,
## 3784), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1094.21 -173.10 -10.94 146.35 1509.23
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 681.81418 64.02437 10.649 < 2e-16 ***
## speed 9.29551 0.18355 50.642 < 2e-16 ***
## hd 0.78355 0.02795 28.030 < 2e-16 ***
## ram 48.29482 1.06764 45.235 < 2e-16 ***
## screen 121.07383 3.97118 30.488 < 2e-16 ***
## cd 60.31315 9.44029 6.389 1.79e-10 ***
## multi 104.85186 11.31912 9.263 < 2e-16 ***
## premium -510.03064 12.24104 -41.666 < 2e-16 ***
## ads 0.65465 0.05095 12.849 < 2e-16 ***
## trend -51.74355 0.62676 -82.558 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 273.1 on 6245 degrees of freedom
## Multiple R-squared: 0.7774, Adjusted R-squared: 0.777
## F-statistic: 2423 on 9 and 6245 DF, p-value: < 2.2e-16
# Variance inflation factors for the baseline (all-rows) model. In the output
# below the largest value is about 4.21 (hd).
vif(model)
## speed hd ram screen cd multi premium ads
## 1.265364 4.207395 2.974628 1.081644 1.859370 1.290568 1.109388 1.217218
## trend
## 2.022790
# Added-variable plots, one per predictor, for the same baseline model.
avPlots(model)
# Variant of m2 with `multi` left out of the predictors; the four flagged rows
# stay excluded.
m3 <- lm(
  price ~ speed + hd + ram + screen + cd + premium + ads + trend,
  data = comp1[-c(1701, 1441, 4478, 3784), ]
)
summary(m3)
##
## Call:
## lm(formula = price ~ speed + hd + ram + screen + cd + premium +
## ads + trend, data = comp1[-c(1701, 1441, 4478, 3784), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1089.55 -175.21 -12.48 147.26 1516.39
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 764.16647 63.83313 11.97 <2e-16 ***
## speed 9.27114 0.18478 50.17 <2e-16 ***
## hd 0.76014 0.02803 27.12 <2e-16 ***
## ram 47.78061 1.07341 44.51 <2e-16 ***
## screen 119.78793 3.99561 29.98 <2e-16 ***
## cd 96.47473 8.65349 11.15 <2e-16 ***
## premium -502.81979 12.29892 -40.88 <2e-16 ***
## ads 0.63944 0.05127 12.47 <2e-16 ***
## trend -51.24904 0.62871 -81.52 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 274.9 on 6246 degrees of freedom
## Multiple R-squared: 0.7743, Adjusted R-squared: 0.774
## F-statistic: 2678 on 8 and 6246 DF, p-value: < 2.2e-16
# Variant of m2 with `cd` left out of the predictors; the four flagged rows
# stay excluded.
m4 <- lm(
  price ~ speed + hd + ram + screen + premium + ads + multi + trend,
  data = comp1[-c(1701, 1441, 4478, 3784), ]
)
summary(m4)
##
## Call:
## lm(formula = price ~ speed + hd + ram + screen + premium + ads +
## multi + trend, data = comp1[-c(1701, 1441, 4478, 3784), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1078.73 -173.61 -9.91 148.07 1504.16
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 666.24733 64.18159 10.38 <2e-16 ***
## speed 9.34418 0.18398 50.79 <2e-16 ***
## hd 0.80532 0.02783 28.93 <2e-16 ***
## ram 49.41683 1.05645 46.78 <2e-16 ***
## screen 121.19452 3.98378 30.42 <2e-16 ***
## premium -499.57338 12.16972 -41.05 <2e-16 ***
## ads 0.71339 0.05027 14.19 <2e-16 ***
## multi 134.75644 10.33879 13.03 <2e-16 ***
## trend -50.84705 0.61279 -82.98 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 273.9 on 6246 degrees of freedom
## Multiple R-squared: 0.7759, Adjusted R-squared: 0.7756
## F-statistic: 2703 on 8 and 6246 DF, p-value: < 2.2e-16
# Final model: all nine predictors, fitted without the four flagged rows.
# This is the same specification as m2; the reduced variants m3 and m4 report
# a slightly lower adjusted R-squared (0.774 and 0.7756 versus 0.777).
final <- lm(
  price ~ speed + hd + ram + screen + cd + multi + premium + ads + trend,
  data = comp1[-c(1701, 1441, 4478, 3784), ]
)
summary(final)
##
## Call:
## lm(formula = price ~ speed + hd + ram + screen + cd + multi +
## premium + ads + trend, data = comp1[-c(1701, 1441, 4478,
## 3784), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1094.21 -173.10 -10.94 146.35 1509.23
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 681.81418 64.02437 10.649 < 2e-16 ***
## speed 9.29551 0.18355 50.642 < 2e-16 ***
## hd 0.78355 0.02795 28.030 < 2e-16 ***
## ram 48.29482 1.06764 45.235 < 2e-16 ***
## screen 121.07383 3.97118 30.488 < 2e-16 ***
## cd 60.31315 9.44029 6.389 1.79e-10 ***
## multi 104.85186 11.31912 9.263 < 2e-16 ***
## premium -510.03064 12.24104 -41.666 < 2e-16 ***
## ads 0.65465 0.05095 12.849 < 2e-16 ***
## trend -51.74355 0.62676 -82.558 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 273.1 on 6245 degrees of freedom
## Multiple R-squared: 0.7774, Adjusted R-squared: 0.777
## F-statistic: 2423 on 9 and 6245 DF, p-value: < 2.2e-16
View(comp1)
# Rows the final model was fitted on (the four flagged observations removed).
train_rows <- comp1[-c(1701, 1441, 4478, 3784), ]
# predict() for lm objects takes the data through `newdata`. An argument named
# `data` is not recognised and is silently ignored, so the prediction rows are
# passed explicitly here instead of relying on the fitted-values fallback.
prediction <- predict(final, newdata = train_rows)
# Observed values side by side with the predicted price, one row per
# training observation.
pred <- data.frame(train_rows, prediction)
View(pred)