Multiple Linear Regression Implementation: Predicting the Price of a Computer

# Load the historical data set that the model will be trained on.
# file.choose() opens an interactive file picker, so the user selects the CSV.
existingdata <- read.csv(file = file.choose())
cat("Dimensions of Existing Data")

## Dimensions of Existing Data

# Number of rows and columns in existingdata.
dim(x = existingdata)

## [1] 6259   11

# Heading printed ahead of the summary table.
cat("\nSummary of Existing Data")

## 
## Summary of Existing Data

# Per-column summary of existingdata: quantiles and mean for numeric columns,
# level counts for the categorical ones.
summary(object = existingdata)

##      Sr.No          price          speed              hd        
##  Min.   :   1   Min.   : 949   Min.   : 25.00   Min.   :  80.0  
##  1st Qu.:1566   1st Qu.:1794   1st Qu.: 33.00   1st Qu.: 214.0  
##  Median :3130   Median :2144   Median : 50.00   Median : 340.0  
##  Mean   :3130   Mean   :2220   Mean   : 52.01   Mean   : 416.6  
##  3rd Qu.:4694   3rd Qu.:2595   3rd Qu.: 66.00   3rd Qu.: 528.0  
##  Max.   :6259   Max.   :5399   Max.   :100.00   Max.   :2100.0  
##       ram             screen        cd       multi      premium   
##  Min.   : 2.000   Min.   :14.00   no :3351   no :5386   no : 612  
##  1st Qu.: 4.000   1st Qu.:14.00   yes:2908   yes: 873   yes:5647  
##  Median : 8.000   Median :14.00                                   
##  Mean   : 8.287   Mean   :14.61                                   
##  3rd Qu.: 8.000   3rd Qu.:15.00                                   
##  Max.   :32.000   Max.   :17.00                                   
##       ads            trend      
##  Min.   : 39.0   Min.   : 1.00  
##  1st Qu.:162.5   1st Qu.:10.00  
##  Median :246.0   Median :16.00  
##  Mean   :221.3   Mean   :15.93  
##  3rd Qu.:275.0   3rd Qu.:21.50  
##  Max.   :339.0   Max.   :35.00

# Heading printed ahead of the structure listing.
cat("\nData Types and other information of Existing Data")

## 
## Data Types and other information of Existing Data

# Column types and the first few values of every column in existingdata.
str(object = existingdata)

## 'data.frame':    6259 obs. of  11 variables:
##  $ Sr.No  : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ price  : int  1499 1795 1595 1849 3295 3695 1720 1995 2225 2575 ...
##  $ speed  : int  25 33 25 25 33 66 25 50 50 50 ...
##  $ hd     : int  80 85 170 170 340 340 170 85 210 210 ...
##  $ ram    : int  4 2 4 8 16 16 4 2 8 4 ...
##  $ screen : int  14 14 15 14 14 14 14 14 14 15 ...
##  $ cd     : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 2 1 1 1 ...
##  $ multi  : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ premium: Factor w/ 2 levels "no","yes": 2 2 2 1 2 2 2 2 2 2 ...
##  $ ads    : int  94 94 94 94 94 94 94 94 94 94 ...
##  $ trend  : int  1 1 1 1 1 1 1 1 1 1 ...

# Put the columns of existingdata on the search path.
attach(existingdata)

# Build dummy (indicator) columns for the categorical data with dummy_cols().
# dummy_data holds the original columns plus the generated indicator columns;
# attaching it exposes them (e.g. cd_yes, multi_yes, premium_yes) as bare names.
library(fastDummies)
dummy_data <- dummy_cols(existingdata)
attach(dummy_data)

## The following objects are masked from existingdata:
## 
##     ads, cd, hd, multi, premium, price, ram, screen, speed, Sr.No,
##     trend

# Replace the yes/no categorical columns (cd, multi, premium) with their
# indicator columns.
# The indicators are selected from dummy_data by name instead of being looked
# up as bare names on the search path, so this step cannot silently pick up a
# same-named object from the workspace or from another attached data set.
indicator_columns <- c("cd_yes", "multi_yes", "premium_yes")
missing_indicators <- setdiff(indicator_columns, names(dummy_data))
if (length(missing_indicators) > 0) {
  stop(
    "dummy_data is missing indicator column(s): ",
    paste(missing_indicators, collapse = ", "),
    call. = FALSE
  )
}
existingdata <- subset(existingdata, select = -c(cd, multi, premium))
existingdata <- cbind(existingdata, dummy_data[indicator_columns])
colnames(existingdata)

##  [1] "Sr.No"       "price"       "speed"       "hd"          "ram"        
##  [6] "screen"      "ads"         "trend"       "cd_yes"      "multi_yes"  
## [11] "premium_yes"

# Building a Multiple Linear Regression Model
# price is regressed on transformed numeric predictors (sqrt/log terms), on
# trend, and on the three indicator columns; the fit uses existingdata.
library(MASS)
library(carData)
multiplelinearmodel <- lm(
  formula = price ~ sqrt(log(speed)) + sqrt(hd) + sqrt(ram) + log(screen) +
    log(ads) + trend + cd_yes + multi_yes + premium_yes,
  data = existingdata
)
# Print residual quantiles, the coefficient table and the fit statistics.
summary(object = multiplelinearmodel)

## 
## Call:
## lm(formula = price ~ sqrt(log(speed)) + sqrt(hd) + sqrt(ram) + 
##     log(screen) + log(ads) + trend + cd_yes + multi_yes + premium_yes, 
##     data = existingdata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -975.26 -171.54  -16.96  143.49 1922.10 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -6735.2902   169.9550 -39.630  < 2e-16 ***
## sqrt(log(speed))  1874.1043    36.1656  51.820  < 2e-16 ***
## sqrt(hd)            37.4362     1.2031  31.115  < 2e-16 ***
## sqrt(ram)          295.1389     6.7117  43.974  < 2e-16 ***
## log(screen)       1660.5494    58.6868  28.295  < 2e-16 ***
## log(ads)           104.9492     8.1753  12.837  < 2e-16 ***
## trend              -51.5399     0.6143 -83.897  < 2e-16 ***
## cd_yes              38.2527     9.1570   4.177 2.99e-05 ***
## multi_yes           95.1909    10.8622   8.763  < 2e-16 ***
## premium_yes       -535.0075    11.7832 -45.404  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 262.5 on 6249 degrees of freedom
## Multiple R-squared:  0.7961, Adjusted R-squared:  0.7958 
## F-statistic:  2711 on 9 and 6249 DF,  p-value: < 2.2e-16

# Predicting Price
# Pick the CSV holding the records to score, build its indicator columns with
# dummy_cols(), and attach the result so those columns are on the search path.
Datatobepredicted <- read.csv(file = file.choose())
dummy_data <- dummy_cols(Datatobepredicted)
attach(dummy_data)

## The following objects are masked from dummy_data (pos = 5):
## 
##     ads, cd, cd_no, cd_yes, hd, multi, multi_no, multi_yes,
##     premium, premium_no, premium_yes, price, ram, screen, speed,
##     Sr.No, trend

## The following objects are masked from existingdata:
## 
##     ads, cd, hd, multi, premium, price, ram, screen, speed, Sr.No,
##     trend

# Replace the yes/no categorical columns of the scoring data with indicator
# columns, predict the price, and write the scored records to disk.
#
# The indicators are selected from the current dummy_data by name. Looking
# them up as bare names would fall through the search path, where the dummy
# columns of the training data are still attached; a scoring file without any
# "yes" value in one of the columns would then silently reuse training-set
# values instead of failing.
indicator_columns <- c("cd_yes", "multi_yes", "premium_yes")
missing_indicators <- setdiff(indicator_columns, names(dummy_data))
if (length(missing_indicators) > 0) {
  stop(
    "dummy_data is missing indicator column(s): ",
    paste(missing_indicators, collapse = ", "),
    call. = FALSE
  )
}
Datatobepredicted <- subset(Datatobepredicted, select = -c(cd, multi, premium))
Datatobepredicted <- cbind(Datatobepredicted, dummy_data[indicator_columns])

# NOTE(review): the predict() call is left exactly as written because the
# column header of predictedprice appears to be derived from this expression.
predictedprice <- as.data.frame(predict(multiplelinearmodel, Datatobepredicted))

# Output location is a fixed Windows path; the directory must already exist.
write.csv(
  cbind(Datatobepredicted, predictedprice),
  file = "D:\\Data Science\\Datasets\\M_Computer_Data_Predicted Price.csv",
  row.names = FALSE
)