# Predict the price of a computer with multiple linear regression

# Read the computer price data from the author's local copy of the CSV and
# open it in the data viewer.
comp <- read.csv("C:\\Users\\prakruthi\\Desktop\\datascience assignments\\multi linear regression\\Computer_Data.csv")
View(comp)

# cd, multi and premium are categorical. Replace each one by the integer code
# of its factor level (1, 2, ... in the sorted order of the distinct values)
# so it can enter the regression as a number. The viewer is refreshed after
# every conversion to check the result.
comp[["cd"]] <- as.numeric(factor(comp[["cd"]]))
View(comp)

comp[["multi"]] <- as.numeric(factor(comp[["multi"]]))
View(comp)

comp[["premium"]] <- as.numeric(factor(comp[["premium"]]))
View(comp)
# EDA: per-column summary statistics (min, quartiles, mean, max)
summary(comp)
##        X            price          speed              hd        
##  Min.   :   1   Min.   : 949   Min.   : 25.00   Min.   :  80.0  
##  1st Qu.:1566   1st Qu.:1794   1st Qu.: 33.00   1st Qu.: 214.0  
##  Median :3130   Median :2144   Median : 50.00   Median : 340.0  
##  Mean   :3130   Mean   :2220   Mean   : 52.01   Mean   : 416.6  
##  3rd Qu.:4694   3rd Qu.:2595   3rd Qu.: 66.00   3rd Qu.: 528.0  
##  Max.   :6259   Max.   :5399   Max.   :100.00   Max.   :2100.0  
##       ram             screen            cd            multi      
##  Min.   : 2.000   Min.   :14.00   Min.   :1.000   Min.   :1.000  
##  1st Qu.: 4.000   1st Qu.:14.00   1st Qu.:1.000   1st Qu.:1.000  
##  Median : 8.000   Median :14.00   Median :1.000   Median :1.000  
##  Mean   : 8.287   Mean   :14.61   Mean   :1.465   Mean   :1.139  
##  3rd Qu.: 8.000   3rd Qu.:15.00   3rd Qu.:2.000   3rd Qu.:1.000  
##  Max.   :32.000   Max.   :17.00   Max.   :2.000   Max.   :2.000  
##     premium           ads            trend      
##  Min.   :1.000   Min.   : 39.0   Min.   : 1.00  
##  1st Qu.:2.000   1st Qu.:162.5   1st Qu.:10.00  
##  Median :2.000   Median :246.0   Median :16.00  
##  Mean   :1.902   Mean   :221.3   Mean   :15.93  
##  3rd Qu.:2.000   3rd Qu.:275.0   3rd Qu.:21.50  
##  Max.   :2.000   Max.   :339.0   Max.   :35.00
# Confirm the object returned by read.csv is a data frame.
class(comp)
## [1] "data.frame"
# Inspect the column types; cd, multi and premium are numeric after recoding.
str(comp)
## 'data.frame':    6259 obs. of  11 variables:
##  $ X      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ price  : int  1499 1795 1595 1849 3295 3695 1720 1995 2225 2575 ...
##  $ speed  : int  25 33 25 25 33 66 25 50 50 50 ...
##  $ hd     : int  80 85 170 170 340 340 170 85 210 210 ...
##  $ ram    : int  4 2 4 8 16 16 4 2 8 4 ...
##  $ screen : int  14 14 15 14 14 14 14 14 14 15 ...
##  $ cd     : num  1 1 1 1 1 1 2 1 1 1 ...
##  $ multi  : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ premium: num  2 2 2 1 2 2 2 2 2 2 ...
##  $ ads    : int  94 94 94 94 94 94 94 94 94 94 ...
##  $ trend  : int  1 1 1 1 1 1 1 1 1 1 ...
# Remove the X column: it only numbers the rows and is not a predictor.
# It is dropped by name rather than by position (2:11) so the selection stays
# correct even if the column layout of the CSV changes.
comp1 <- subset(comp, select = -X)
View(comp1)
# Scatterplot matrix of the modelling variables. comp1 is plotted (not comp)
# so the row index X does not occupy a row and column of panels.
plot(comp1)

# Columns available for modelling: the response price plus nine predictors.
colnames(comp1)
##  [1] "price"   "speed"   "hd"      "ram"     "screen"  "cd"      "multi"  
##  [8] "premium" "ads"     "trend"
# Baseline model: price regressed on every remaining column, all rows.
model <- lm(
  price ~ speed + hd + ram + screen + cd + multi + premium + ads + trend,
  data = comp1
)
summary(model)
## 
## Call:
## lm(formula = price ~ speed + hd + ram + screen + cd + multi + 
##     premium + ads + trend, data = comp1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1093.77  -174.24   -11.49   146.49  2001.05 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  651.97219   64.49224  10.109  < 2e-16 ***
## speed          9.32028    0.18506  50.364  < 2e-16 ***
## hd             0.78178    0.02761  28.311  < 2e-16 ***
## ram           48.25596    1.06608  45.265  < 2e-16 ***
## screen       123.08904    3.99950  30.776  < 2e-16 ***
## cd            60.91671    9.51559   6.402 1.65e-10 ***
## multi        104.32382   11.41268   9.141  < 2e-16 ***
## premium     -509.22473   12.34225 -41.259  < 2e-16 ***
## ads            0.65729    0.05132  12.809  < 2e-16 ***
## trend        -51.84958    0.62871 -82.470  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 275.3 on 6249 degrees of freedom
## Multiple R-squared:  0.7756, Adjusted R-squared:  0.7752 
## F-statistic:  2399 on 9 and 6249 DF,  p-value: < 2.2e-16
library(mvinfluence)
## Loading required package: car
## Loading required package: carData
## Loading required package: heplots
# influencePlot(), vif() and avPlots() used in this script are car functions.
# Attach car explicitly so they do not depend on mvinfluence attaching it as
# a side effect.
library(car)
# Influence diagnostics for the baseline model: the plot marks, and the call
# prints, the observations with the most extreme studentized residuals, hat
# values and Cook's distances.
influencePlot(model)

##         StudRes         Hat       CookD
## 1441  7.3058529 0.002228075 0.011819949
## 1701  7.1838002 0.002464463 0.012647347
## 3784 -0.8667018 0.020972880 0.001609237
## 4478 -1.3795547 0.020060286 0.003895407
# Refit the full model without row 1701, the observation with the largest
# Cook's distance in the influence diagnostics.
m1 <- lm(
  price ~ speed + hd + ram + screen + cd + multi + premium + ads + trend,
  data = comp1[-1701, ]
)
summary(m1)
## 
## Call:
## lm(formula = price ~ speed + hd + ram + screen + cd + multi + 
##     premium + ads + trend, data = comp1[-1701, ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1094.71  -173.87   -11.22   146.49  2004.57 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  666.84056   64.26601  10.376  < 2e-16 ***
## speed          9.31082    0.18432  50.515  < 2e-16 ***
## hd             0.77879    0.02751  28.313  < 2e-16 ***
## ram           48.40809    1.06200  45.582  < 2e-16 ***
## screen       122.10442    3.98576  30.635  < 2e-16 ***
## cd            59.58151    9.47912   6.286 3.49e-10 ***
## multi        105.13252   11.36730   9.249  < 2e-16 ***
## premium     -509.37799   12.29260 -41.438  < 2e-16 ***
## ads            0.65399    0.05111  12.796  < 2e-16 ***
## trend        -51.74073    0.62636 -82.605  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 274.2 on 6248 degrees of freedom
## Multiple R-squared:  0.7766, Adjusted R-squared:  0.7763 
## F-statistic:  2413 on 9 and 6248 DF,  p-value: < 2.2e-16
# Refit the full model without all four observations flagged by the influence
# diagnostics (rows 1701, 1441, 4478 and 3784).
m2 <- lm(
  price ~ speed + hd + ram + screen + cd + multi + premium + ads + trend,
  data = comp1[-c(1701, 1441, 4478, 3784), ]
)
summary(m2)
## 
## Call:
## lm(formula = price ~ speed + hd + ram + screen + cd + multi + 
##     premium + ads + trend, data = comp1[-c(1701, 1441, 4478, 
##     3784), ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1094.21  -173.10   -10.94   146.35  1509.23 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  681.81418   64.02437  10.649  < 2e-16 ***
## speed          9.29551    0.18355  50.642  < 2e-16 ***
## hd             0.78355    0.02795  28.030  < 2e-16 ***
## ram           48.29482    1.06764  45.235  < 2e-16 ***
## screen       121.07383    3.97118  30.488  < 2e-16 ***
## cd            60.31315    9.44029   6.389 1.79e-10 ***
## multi        104.85186   11.31912   9.263  < 2e-16 ***
## premium     -510.03064   12.24104 -41.666  < 2e-16 ***
## ads            0.65465    0.05095  12.849  < 2e-16 ***
## trend        -51.74355    0.62676 -82.558  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 273.1 on 6245 degrees of freedom
## Multiple R-squared:  0.7774, Adjusted R-squared:  0.777 
## F-statistic:  2423 on 9 and 6245 DF,  p-value: < 2.2e-16
# Variance inflation factors of the baseline model, as a multicollinearity
# check. The largest value in the output below is hd at about 4.21.
vif(model)
##    speed       hd      ram   screen       cd    multi  premium      ads 
## 1.265364 4.207395 2.974628 1.081644 1.859370 1.290568 1.109388 1.217218 
##    trend 
## 2.022790
# Added-variable plots of the baseline model, one panel per predictor.
avPlots(model)

# Candidate model without the predictor multi, fitted on the same rows as m2
# (the four influential observations removed).
m3 <- lm(
  price ~ speed + hd + ram + screen + cd + premium + ads + trend,
  data = comp1[-c(1701, 1441, 4478, 3784), ]
)
summary(m3)
## 
## Call:
## lm(formula = price ~ speed + hd + ram + screen + cd + premium + 
##     ads + trend, data = comp1[-c(1701, 1441, 4478, 3784), ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1089.55  -175.21   -12.48   147.26  1516.39 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  764.16647   63.83313   11.97   <2e-16 ***
## speed          9.27114    0.18478   50.17   <2e-16 ***
## hd             0.76014    0.02803   27.12   <2e-16 ***
## ram           47.78061    1.07341   44.51   <2e-16 ***
## screen       119.78793    3.99561   29.98   <2e-16 ***
## cd            96.47473    8.65349   11.15   <2e-16 ***
## premium     -502.81979   12.29892  -40.88   <2e-16 ***
## ads            0.63944    0.05127   12.47   <2e-16 ***
## trend        -51.24904    0.62871  -81.52   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 274.9 on 6246 degrees of freedom
## Multiple R-squared:  0.7743, Adjusted R-squared:  0.774 
## F-statistic:  2678 on 8 and 6246 DF,  p-value: < 2.2e-16
# Candidate model without the predictor cd, fitted on the same rows as m2
# (the four influential observations removed).
m4 <- lm(
  price ~ speed + hd + ram + screen + premium + ads + multi + trend,
  data = comp1[-c(1701, 1441, 4478, 3784), ]
)
summary(m4)
## 
## Call:
## lm(formula = price ~ speed + hd + ram + screen + premium + ads + 
##     multi + trend, data = comp1[-c(1701, 1441, 4478, 3784), ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1078.73  -173.61    -9.91   148.07  1504.16 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  666.24733   64.18159   10.38   <2e-16 ***
## speed          9.34418    0.18398   50.79   <2e-16 ***
## hd             0.80532    0.02783   28.93   <2e-16 ***
## ram           49.41683    1.05645   46.78   <2e-16 ***
## screen       121.19452    3.98378   30.42   <2e-16 ***
## premium     -499.57338   12.16972  -41.05   <2e-16 ***
## ads            0.71339    0.05027   14.19   <2e-16 ***
## multi        134.75644   10.33879   13.03   <2e-16 ***
## trend        -50.84705    0.61279  -82.98   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 273.9 on 6246 degrees of freedom
## Multiple R-squared:  0.7759, Adjusted R-squared:  0.7756 
## F-statistic:  2703 on 8 and 6246 DF,  p-value: < 2.2e-16
# Final model: all nine predictors, with the four influential observations
# removed. This is exactly the fit already stored in m2 (same formula, same
# rows), so that object is reused instead of fitting the model again.
final <- m2
summary(final)
## 
## Call:
## lm(formula = price ~ speed + hd + ram + screen + cd + multi + 
##     premium + ads + trend, data = comp1[-c(1701, 1441, 4478, 
##     3784), ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1094.21  -173.10   -10.94   146.35  1509.23 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  681.81418   64.02437  10.649  < 2e-16 ***
## speed          9.29551    0.18355  50.642  < 2e-16 ***
## hd             0.78355    0.02795  28.030  < 2e-16 ***
## ram           48.29482    1.06764  45.235  < 2e-16 ***
## screen       121.07383    3.97118  30.488  < 2e-16 ***
## cd            60.31315    9.44029   6.389 1.79e-10 ***
## multi        104.85186   11.31912   9.263  < 2e-16 ***
## premium     -510.03064   12.24104 -41.666  < 2e-16 ***
## ads            0.65465    0.05095  12.849  < 2e-16 ***
## trend        -51.74355    0.62676 -82.558  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 273.1 on 6245 degrees of freedom
## Multiple R-squared:  0.7774, Adjusted R-squared:  0.777 
## F-statistic:  2423 on 9 and 6245 DF,  p-value: < 2.2e-16
View(comp1)

# Rows the final model was fitted on (the four influential observations
# removed).
fit_rows <- comp1[-c(1701, 1441, 4478, 3784), ]

# predict() for lm objects takes its data through `newdata`. The earlier
# `data = comp1` was not a recognised argument and was silently ignored, so
# the call merely returned the fitted values. Passing the fitted rows
# explicitly gives the same numbers and makes the intent visible.
prediction <- predict(final, newdata = fit_rows)

# Put each fitted row next to its predicted price.
pred <- data.frame(fit_rows, prediction)
View(pred)