# Multiple Linear Regression Implementation: Predicting the Price of a Computer
# Read the existing (training) data and inspect its basic structure.
# stringsAsFactors = TRUE is passed explicitly: since R 4.0 read.csv() no
# longer converts strings to factors by default, and the summary()/str()
# output recorded below (level counts for cd, multi, premium) relies on
# those columns being factors.
existingdata <- read.csv(file.choose(), stringsAsFactors = TRUE) # pick the input file interactively
cat('Dimensions of Existing Data')
## Dimensions of Existing Data
dim(existingdata) # number of rows and columns of existingdata
## [1] 6259 11
cat('\nSummary of Existing Data')
##
## Summary of Existing Data
summary(existingdata) # per-column summary of existingdata
## Sr.No price speed hd
## Min. : 1 Min. : 949 Min. : 25.00 Min. : 80.0
## 1st Qu.:1566 1st Qu.:1794 1st Qu.: 33.00 1st Qu.: 214.0
## Median :3130 Median :2144 Median : 50.00 Median : 340.0
## Mean :3130 Mean :2220 Mean : 52.01 Mean : 416.6
## 3rd Qu.:4694 3rd Qu.:2595 3rd Qu.: 66.00 3rd Qu.: 528.0
## Max. :6259 Max. :5399 Max. :100.00 Max. :2100.0
## ram screen cd multi premium
## Min. : 2.000 Min. :14.00 no :3351 no :5386 no : 612
## 1st Qu.: 4.000 1st Qu.:14.00 yes:2908 yes: 873 yes:5647
## Median : 8.000 Median :14.00
## Mean : 8.287 Mean :14.61
## 3rd Qu.: 8.000 3rd Qu.:15.00
## Max. :32.000 Max. :17.00
## ads trend
## Min. : 39.0 Min. : 1.00
## 1st Qu.:162.5 1st Qu.:10.00
## Median :246.0 Median :16.00
## Mean :221.3 Mean :15.93
## 3rd Qu.:275.0 3rd Qu.:21.50
## Max. :339.0 Max. :35.00
cat("\nData Types and other information of Existing Data")
##
## Data Types and other information of Existing Data
str(existingdata) # column types and a preview of the first values of each column
## 'data.frame': 6259 obs. of 11 variables:
## $ Sr.No : int 1 2 3 4 5 6 7 8 9 10 ...
## $ price : int 1499 1795 1595 1849 3295 3695 1720 1995 2225 2575 ...
## $ speed : int 25 33 25 25 33 66 25 50 50 50 ...
## $ hd : int 80 85 170 170 340 340 170 85 210 210 ...
## $ ram : int 4 2 4 8 16 16 4 2 8 4 ...
## $ screen : int 14 14 15 14 14 14 14 14 14 15 ...
## $ cd : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 2 1 1 1 ...
## $ multi : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ premium: Factor w/ 2 levels "no","yes": 2 2 2 1 2 2 2 2 2 2 ...
## $ ads : int 94 94 94 94 94 94 94 94 94 94 ...
## $ trend : int 1 1 1 1 1 1 1 1 1 1 ...
# Create dummy (indicator) columns for the categorical variables with
# dummy_cols() and bind the "yes" indicators to the remaining columns.
# The indicator columns are selected by name from dummy_data rather than
# being resolved through attach(), so a same-named object in the workspace
# or elsewhere on the search path cannot be picked up by mistake.
library(fastDummies)
dummy_data <- dummy_cols(existingdata)
indicator_cols <- c("cd_yes", "multi_yes", "premium_yes")
# Fail early with a clear error if an expected indicator was not generated.
stopifnot(all(indicator_cols %in% names(dummy_data)))
# Drop the original categorical columns; their "yes" indicators replace them.
existingdata <- subset(existingdata, select = -c(cd, multi, premium))
existingdata <- cbind(existingdata, dummy_data[indicator_cols])
colnames(existingdata)
## [1] "Sr.No" "price" "speed" "hd" "ram"
## [6] "screen" "ads" "trend" "cd_yes" "multi_yes"
## [11] "premium_yes"
# Fit the multiple linear regression of price on the transformed predictors
library(MASS)
library(carData)
multiplelinearmodel <- lm(
  price ~ sqrt(log(speed)) + sqrt(hd) + sqrt(ram) + log(screen) + log(ads) +
    trend + cd_yes + multi_yes + premium_yes,
  data = existingdata
)
summary(multiplelinearmodel)
##
## Call:
## lm(formula = price ~ sqrt(log(speed)) + sqrt(hd) + sqrt(ram) +
## log(screen) + log(ads) + trend + cd_yes + multi_yes + premium_yes,
## data = existingdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -975.26 -171.54 -16.96 143.49 1922.10
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6735.2902 169.9550 -39.630 < 2e-16 ***
## sqrt(log(speed)) 1874.1043 36.1656 51.820 < 2e-16 ***
## sqrt(hd) 37.4362 1.2031 31.115 < 2e-16 ***
## sqrt(ram) 295.1389 6.7117 43.974 < 2e-16 ***
## log(screen) 1660.5494 58.6868 28.295 < 2e-16 ***
## log(ads) 104.9492 8.1753 12.837 < 2e-16 ***
## trend -51.5399 0.6143 -83.897 < 2e-16 ***
## cd_yes 38.2527 9.1570 4.177 2.99e-05 ***
## multi_yes 95.1909 10.8622 8.763 < 2e-16 ***
## premium_yes -535.0075 11.7832 -45.404 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 262.5 on 6249 degrees of freedom
## Multiple R-squared: 0.7961, Adjusted R-squared: 0.7958
## F-statistic: 2711 on 9 and 6249 DF, p-value: < 2.2e-16
# Predict prices for a new data set with the fitted model and write the
# result to a CSV file.
Datatobepredicted <- read.csv(file.choose()) # pick the file with the rows to score
dummy_data <- dummy_cols(Datatobepredicted)
# dummy_cols() derives its columns from the values present in the file, so a
# file in which, for example, every row has multi == "no" yields no multi_yes
# column. Add any missing indicator as all zeros instead of letting the name
# resolve to a different (wrong-length) object on the search path.
indicator_cols <- c("cd_yes", "multi_yes", "premium_yes")
for (col in setdiff(indicator_cols, names(dummy_data))) {
  dummy_data[[col]] <- rep(0L, nrow(dummy_data))
}
# Replace the categorical columns with their "yes" indicators, selected by
# name from dummy_data rather than through attach().
Datatobepredicted <- subset(Datatobepredicted, select = -c(cd, multi, premium))
Datatobepredicted <- cbind(Datatobepredicted, dummy_data[indicator_cols])
# The predict() call is kept exactly as before so the header of the
# prediction column in the exported CSV is unaffected.
predictedprice <- as.data.frame(predict(multiplelinearmodel, Datatobepredicted))
# NOTE(review): the output location is a hard-coded Windows path; the
# directory must already exist on the machine running the script.
write.csv(
  cbind(Datatobepredicted, predictedprice),
  file = 'D:\\Data Science\\Datasets\\M_Computer_Data_Predicted Price.csv',
  row.names = FALSE
)