In this mini project focused on laptop prices, a structured approach was employed, encompassing data cleaning, exploratory data analysis (EDA), feature engineering, regression model building, and the development of an interactive Shiny application. The data cleaning phase ensured the dataset’s integrity and usability, while EDA provided valuable insights into the characteristics and relationships within the dataset. Feature engineering enhanced the predictive capabilities of the model, leading to the development of a regression model to understand the determinants of laptop prices. The culmination of the project was the creation of an interactive Shiny application, allowing for dynamic exploration and visualization of the model’s predictions. This project serves as a comprehensive example of the end-to-end process of data analysis and model deployment, highlighting the multifaceted nature of predictive analytics in the realm of pricing.
The mini project on laptop prices represents a comprehensive endeavor encompassing various stages of data analysis and model development. Beginning with data cleaning, the project involved meticulous preparation of the dataset to ensure its reliability and suitability for subsequent analysis. Following this, the exploratory data analysis (EDA) phase provided valuable insights into the characteristics and relationships within the dataset, laying the foundation for further exploration. Subsequently, feature engineering was carried out to enhance the predictive capabilities of the model, focusing on creating new features and transforming existing ones to improve its performance. The regression model building phase aimed to establish a predictive relationship between the features and the target variable, which in this case is laptop prices. The culmination of the project involved the development of an interactive Shiny application, enabling dynamic exploration and visualization of the model’s predictions. This interactive tool facilitated user engagement and provided a platform for gaining insights into the factors influencing laptop prices. Overall, this mini project serves as a demonstration of the end-to-end process of data analysis and model deployment, highlighting the multifaceted nature of predictive analytics in the context of pricing.
The dataset that is going to be used for various tasks of EDA is from Kaggle. The link to the dataset is attached below:
https://www.kaggle.com/datasets/muhammetvarl/laptop-price
READING DATASET
## company type_name inches screen_resolution
## 1 Apple Ultrabook 13.3 IPS Panel Retina Display 2560x1600
## 2 Apple Ultrabook 13.3 1440x900
## 3 HP Notebook 15.6 Full HD 1920x1080
## 4 Apple Ultrabook 15.4 IPS Panel Retina Display 2880x1800
## 5 Apple Ultrabook 13.3 IPS Panel Retina Display 2560x1600
## cpu ram memory
## 1 Intel Core i5 2.3GHz 8GB 128GB SSD
## 2 Intel Core i5 1.8GHz 8GB 128GB Flash Storage
## 3 Intel Core i5 7200U 2.5GHz 8GB 256GB SSD
## 4 Intel Core i7 2.7GHz 16GB 512GB SSD
## 5 Intel Core i5 3.1GHz 8GB 256GB SSD
## gpu op_sys weight price
## 1 Intel Iris Plus Graphics 640 macOS 1.37kg 71378.68
## 2 Intel HD Graphics 6000 macOS 1.34kg 47895.52
## 3 Intel HD Graphics 620 No OS 1.86kg 30636.00
## 4 AMD Radeon Pro 455 macOS 1.83kg 135195.34
## 5 Intel Iris Plus Graphics 650 macOS 1.37kg 96095.81
## [1] "company" "type_name" "inches"
## [4] "screen_resolution" "cpu" "ram"
## [7] "memory" "gpu" "op_sys"
## [10] "weight" "price"
## 'data.frame': 1303 obs. of 11 variables:
## $ company : chr "Apple" "Apple" "HP" "Apple" ...
## $ type_name : chr "Ultrabook" "Ultrabook" "Notebook" "Ultrabook" ...
## $ inches : num 13.3 13.3 15.6 15.4 13.3 15.6 15.4 13.3 14 14 ...
## $ screen_resolution: chr "IPS Panel Retina Display 2560x1600" "1440x900" "Full HD 1920x1080" "IPS Panel Retina Display 2880x1800" ...
## $ cpu : chr "Intel Core i5 2.3GHz" "Intel Core i5 1.8GHz" "Intel Core i5 7200U 2.5GHz" "Intel Core i7 2.7GHz" ...
## $ ram : chr "8GB" "8GB" "8GB" "16GB" ...
## $ memory : chr "128GB SSD" "128GB Flash Storage" "256GB SSD" "512GB SSD" ...
## $ gpu : chr "Intel Iris Plus Graphics 640" "Intel HD Graphics 6000" "Intel HD Graphics 620" "AMD Radeon Pro 455" ...
## $ op_sys : chr "macOS" "macOS" "No OS" "macOS" ...
## $ weight : chr "1.37kg" "1.34kg" "1.86kg" "1.83kg" ...
## $ price : num 71379 47896 30636 135195 96096 ...
Converting the variables memory, weight and ram to numeric
# variable conversion
library(dplyr)

# Strip the unit suffix so ram and weight can be stored as numbers.
laptop$ram <- as.numeric(sub("GB", "", laptop$ram))
laptop$weight <- as.numeric(sub("kg", "", laptop$weight))

# Removing every non-digit character leaves one digit string per row.
memory_code <- as.numeric(gsub("\\D", "", laptop$memory))

# Digit strings that do not already read as a capacity are recoded explicitly.
# NOTE(review): the codes look like terabyte or two-drive entries whose digits
# were concatenated (e.g. 1281 -> 1128) -- confirm against the raw column.
memory_recode <- c(
  `1` = 1000, `2` = 2000, `10` = 1000, `11` = 2000,
  `641` = 1064, `1281` = 1128, `1282` = 2128,
  `2561` = 1256, `2562` = 2256, `5121` = 1512, `5122` = 2512,
  `25610` = 1256, `51210` = 1512,
  `256256` = 512, `256500` = 756, `512256` = 768, `512512` = 1024
)

# A single numeric lookup instead of repeatedly comparing the numeric column
# with character literals; codes without an entry keep their numeric value.
recode_index <- match(memory_code, as.numeric(names(memory_recode)))
laptop$memory <- ifelse(
  is.na(recode_index),
  memory_code,
  unname(memory_recode[recode_index])
)

laptop %>%
  dplyr::select(ram, weight, memory) %>%
  str()
## 'data.frame': 1303 obs. of 3 variables:
## $ ram : num 8 8 8 16 8 4 16 8 16 8 ...
## $ weight: num 1.37 1.34 1.86 1.83 1.37 2.1 2.04 1.34 1.3 1.6 ...
## $ memory: num 128 128 256 512 256 500 256 256 512 256 ...
checking for any missing values in the dataset
## company type_name inches screen_resolution
## 0 0 0 0
## cpu ram memory gpu
## 0 0 0 0
## op_sys weight price
## 0 0 0
library(ggplot2)

# Number of laptops per company, split by laptop type.
laptop %>%
  ggplot(aes(company, fill = type_name)) +
  geom_bar(position = "dodge", width = 0.5) +
  theme_bw() +
  theme(axis.text.x = element_text(size = 10, hjust = 1, angle = 45)) +
  labs(
    title = "Distribution of Company vs Type of laptop ",
    fill = "Type of laptop",
    y = "frequency"
  ) # distribution

# The mapping is supplied as aes(x = price) at the plot level so that the
# layer inherits it (an `aes = (...)` argument is not a mapping).
hist1 <- ggplot(laptop, aes(x = price)) +
  geom_density(fill = "gold2", color = "black") +
  theme_bw() +
  labs(title = "Distribution of Price") # density plot

hist2 <- ggplot(laptop, aes(x = price)) +
  geom_histogram(color = "black", fill = "gold2") +
  theme_bw() +
  labs(title = "Distribution of Price", y = "frequency") # histogram plot

# patchwork's `+` places the two plots side by side.
library(patchwork)
plot(hist1 + hist2)
Observation
This shows that majority of laptops are concentrated on the lower end meaning that there are a very few laptops with high prices and a larger number of laptops with lower prices.
# Frequency table of laptops per brand, shown from most to least common.
brand_name <- setNames(
  as.data.frame(table(laptop$company)),
  c("Brand Name", "Frequency")
)
arrange(brand_name, desc(Frequency)) # brand name impact
## Brand Name Frequency
## 1 Dell 297
## 2 Lenovo 297
## 3 HP 274
## 4 Asus 158
## 5 Acer 103
## 6 MSI 54
## 7 Toshiba 48
## 8 Apple 21
## 9 Samsung 9
## 10 Mediacom 7
## 11 Razer 7
## 12 Microsoft 6
## 13 Vero 4
## 14 Xiaomi 4
## 15 Chuwi 3
## 16 Fujitsu 3
## 17 Google 3
## 18 LG 3
## 19 Huawei 2
Observation
The major brands in the market are Dell, Lenovo, HP, Asus and Acer.
# Price distribution per brand; the red point marks each brand's median.
ggplot(laptop, aes(x = company, y = price, fill = company)) +
  geom_boxplot(outlier.color = "blue") +
  theme(axis.text.x = element_text(size = 10, hjust = 1, angle = 45)) +
  # `fun` is the current name of the deprecated `fun.y` argument.
  stat_summary(
    fun = median, geom = "point", shape = 20, size = 3, color = "red"
  ) +
  labs(y = "average price", x = "brand name")
Observation
Razer is the most expensive brand, as it has the highest average price.
# Frequency table of laptop types, shown from most to least common.
mb <- setNames(
  as.data.frame(table(laptop$type_name)),
  c("type of laptop", "frequency")
)
arrange(mb, desc(frequency))
## type of laptop frequency
## 1 Notebook 727
## 2 Gaming 205
## 3 Ultrabook 196
## 4 2 in 1 Convertible 121
## 5 Workstation 29
## 6 Netbook 25
# Price distribution per laptop type.
ggplot(laptop, aes(x = type_name, y = price, fill = type_name)) +
  geom_boxplot() +
  theme_bw() +
  theme(
    legend.position = "right",
    axis.text.x = element_text(size = 10, hjust = 1, angle = 45)
  ) +
  labs(fill = "Laptop Type", y = "average price", x = "Laptop type")
Observation
Workstations are more expensive.
Relationship between the inches,memory,weight, ram and prices of laptops
# Four scatter plots of price against a numeric feature, each with a loess
# smoother, arranged in a 2 x 2 grid.
sp1 <- ggplot(laptop, aes(x = inches, y = price)) +
  geom_point(colour = "orange", shape = "circle") +
  geom_smooth(method = "loess") +
  theme_linedraw()

sp2 <- ggplot(laptop, aes(y = price, x = ram)) +
  geom_point(colour = "red2", shape = "triangle") +
  geom_smooth(method = "loess") +
  theme_linedraw()

sp3 <- ggplot(laptop, aes(y = price, x = weight)) +
  geom_point(colour = "green", shape = "square") +
  geom_smooth(method = "loess") +
  theme_linedraw()

sp4 <- ggplot(laptop, aes(y = price, x = memory)) +
  geom_point(colour = "blue4", shape = "k") +
  geom_smooth(method = "loess") +
  theme_linedraw()

library(gridExtra)
grid.arrange(sp1, sp2, sp3, sp4, ncol = 2, nrow = 2)
Observation
There is a relationship between the price of a laptop and its RAM, memory, weight and screen size (inches).
As inches, RAM, weight and memory increase, price also tends to increase.
Adding New Columns (touchscreen,ips display,x & y dimensions,hd display)
## Var1 Freq
## 1 Full HD 1920x1080 507
## 2 1366x768 281
## 3 IPS Panel Full HD 1920x1080 230
## 4 IPS Panel Full HD / Touchscreen 1920x1080 53
## 5 Full HD / Touchscreen 1920x1080 47
## 6 1600x900 23
## 7 Touchscreen 1366x768 16
## 8 Quad HD+ / Touchscreen 3200x1800 15
## 9 IPS Panel 4K Ultra HD 3840x2160 12
## 10 IPS Panel 4K Ultra HD / Touchscreen 3840x2160 11
library(stringr)

# str_match() yields the full match plus one column per capture group, so
# V2 and V3 hold the two numbers on either side of the "x".
result <- as.data.frame(str_match(laptop$screen_resolution, "(\\d+)x(\\d+)"))

# Resolution dimensions plus 0/1 flags derived from the resolution text.
laptop <- laptop %>%
  mutate(
    x_dim = as.numeric(result$V2),
    y_dim = as.numeric(result$V3),
    touchscreen = as.numeric(grepl("Touchscreen", screen_resolution)),
    ips_display = as.numeric(grepl("IPS Panel", screen_resolution)),
    hd_display = as.numeric(grepl("Full HD", screen_resolution))
  )

laptop %>%
  dplyr::select(x_dim, y_dim, touchscreen, ips_display, hd_display) %>%
  str()
## 'data.frame': 1303 obs. of 5 variables:
## $ x_dim : num 2560 1440 1920 2880 2560 ...
## $ y_dim : num 1600 900 1080 1800 1600 768 1800 900 1080 1080 ...
## $ touchscreen: num 0 0 0 0 0 0 0 0 0 0 ...
## $ ips_display: num 1 0 0 1 1 0 1 0 0 1 ...
## $ hd_display : num 0 0 1 0 0 0 0 0 1 1 ...
## touchscreen feature freq
## 1 0 1111
## 2 1 192
# Price distribution for laptops without (0) and with (1) a touchscreen.
ggplot(
  laptop,
  aes(x = touchscreen, y = price, fill = factor(touchscreen))
) +
  geom_boxplot() +
  theme_bw() +
  labs(fill = "Touchscreen Feature")
## ips display feature freq
## 1 0 938
## 2 1 365
# Price distribution for laptops without (0) and with (1) an IPS panel.
ggplot(
  laptop,
  aes(x = ips_display, y = price, fill = factor(ips_display))
) +
  geom_boxplot() +
  theme_bw() +
  labs(fill = "Ips Display Feature")
## hd display feature freq
## 1 0 460
## 2 1 843
# Price distribution for laptops without (0) and with (1) a Full HD display.
ggplot(
  laptop,
  aes(x = hd_display, y = price, fill = factor(hd_display))
) +
  geom_boxplot() +
  theme_bw() +
  labs(fill = "HD Display Feature")
Observation
Hd display=1, non hd display=0
A lot of laptops have hd display
Laptops with hd display are more costly
The new variables have to be in numeric format for correlation analysis
library(knitr)

# Pairwise correlations between price and the numeric features.
co_lp <- laptop %>%
  dplyr::select(
    price, ips_display, hd_display, x_dim, y_dim, touchscreen, inches,
    weight, ram, memory
  ) %>%
  cor()
co_lp
## price ips_display hd_display x_dim y_dim
## price 1.00000000 0.25220762 0.19861160 0.55652933 0.55280922
## ips_display 0.25220762 1.00000000 0.18544150 0.28145673 0.28902950
## hd_display 0.19861160 0.18544150 1.00000000 0.07087520 0.04865950
## x_dim 0.55652933 0.28145673 0.07087520 1.00000000 0.99421896
## y_dim 0.55280922 0.28902950 0.04865950 0.99421896 1.00000000
## touchscreen 0.19122646 0.15051233 -0.10518846 0.35106573 0.35792997
## inches 0.06819667 -0.11480420 0.16355060 -0.07124533 -0.09540391
## weight 0.21036980 0.01696711 0.14800292 -0.03287982 -0.05384565
## ram 0.74300714 0.20662250 0.21035934 0.43312053 0.42443665
## memory 0.16081889 -0.01468659 0.09030405 0.07153091 0.05695929
## touchscreen inches weight ram memory
## price 0.1912265 0.06819667 0.21036980 0.7430071 0.16081889
## ips_display 0.1505123 -0.11480420 0.01696711 0.2066225 -0.01468659
## hd_display -0.1051885 0.16355060 0.14800292 0.2103593 0.09030405
## x_dim 0.3510657 -0.07124533 -0.03287982 0.4331205 0.07153091
## y_dim 0.3579300 -0.09540391 -0.05384565 0.4244366 0.05695929
## touchscreen 1.0000000 -0.36173453 -0.29461978 0.1169841 -0.13848059
## inches -0.3617345 1.00000000 0.82763110 0.2379928 0.53835815
## weight -0.2946198 0.82763110 1.00000000 0.3838741 0.54975391
## ram 0.1169841 0.23799280 0.38387409 1.0000000 0.35136257
## memory -0.1384806 0.53835815 0.54975391 0.3513626 1.00000000
Observation
All the new variables created have a positive correlation with price. The x and y dimensions show the strongest relationship with price (about 0.55) and are almost perfectly correlated with each other (0.99).
## [1] 0.4734873
## . Freq
## 1 Intel Core i5 7200U 2.5GHz 190
## 2 Intel Core i7 7700HQ 2.8GHz 146
## 3 Intel Core i7 7500U 2.7GHz 134
## 4 Intel Core i7 8550U 1.8GHz 73
## 5 Intel Core i5 8250U 1.6GHz 72
## 6 Intel Core i5 6200U 2.3GHz 68
## 7 Intel Core i3 6006U 2GHz 64
## 8 Intel Core i7 6500U 2.5GHz 49
## 9 Intel Core i7 6700HQ 2.6GHz 43
## 10 Intel Core i3 7100U 2.4GHz 37
# 0/1 indicators for whether the cpu description contains each pattern.
laptop <- laptop %>%
  mutate(
    intel_core_i3 = as.numeric(grepl("Intel Core i3", cpu)),
    intel_core_i5 = as.numeric(grepl("Intel Core i5", cpu)),
    intel_core_i7 = as.numeric(grepl("Intel Core i7", cpu)),
    dual_core = as.numeric(grepl("Dual Core", cpu)),
    amd_processor = as.numeric(grepl("AMD ", cpu)),
    other_processor = as.numeric(grepl("Intel Xeon", cpu))
  )

laptop %>%
  dplyr::select(
    intel_core_i3, intel_core_i5, intel_core_i7, dual_core,
    amd_processor, other_processor
  ) %>%
  str()
## 'data.frame': 1303 obs. of 6 variables:
## $ intel_core_i3 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ intel_core_i5 : num 1 1 1 0 1 0 0 1 0 1 ...
## $ intel_core_i7 : num 0 0 0 1 0 0 1 0 1 0 ...
## $ dual_core : num 0 0 0 0 0 0 0 0 0 0 ...
## $ amd_processor : num 0 0 0 0 0 1 0 0 0 0 ...
## $ other_processor: num 0 0 0 0 0 0 0 0 0 0 ...
## . Freq
## 1 Intel HD Graphics 620 281
## 2 Intel HD Graphics 520 185
## 3 Intel UHD Graphics 620 68
## 4 Nvidia GeForce GTX 1050 66
## 5 Nvidia GeForce GTX 1060 48
## 6 Nvidia GeForce 940MX 43
## 7 AMD Radeon 530 41
## 8 Intel HD Graphics 500 39
## 9 Intel HD Graphics 400 37
## 10 Nvidia GeForce GTX 1070 30
# 0/1 indicators for the graphics vendor named in the gpu description.
laptop <- laptop %>%
  mutate(
    nvidia_graphics = as.numeric(grepl("Nvidia", gpu)),
    amd_graphics = as.numeric(grepl("AMD", gpu)),
    intel_graphics = as.numeric(grepl("Intel", gpu))
  )

laptop %>%
  dplyr::select(nvidia_graphics, amd_graphics, intel_graphics) %>%
  str()
## 'data.frame': 1303 obs. of 3 variables:
## $ nvidia_graphics: num 0 0 0 0 0 0 0 0 1 0 ...
## $ amd_graphics : num 0 0 0 1 0 1 0 0 0 0 ...
## $ intel_graphics : num 1 1 1 0 1 0 1 1 0 1 ...
## . Freq
## 1 Windows 10 1072
## 2 No OS 66
## 3 Linux 62
## 4 Windows 7 45
## 5 Chrome OS 27
## 6 macOS 13
## 7 Mac OS X 8
## 8 Windows 10 S 8
## 9 Android 2
# Returns 1 where the (whitespace-trimmed) operating system equals `name`,
# otherwise 0. Exact matching is needed because "Windows 10" is a prefix of
# "Windows 10 S", and a pattern such as "Chrome OS " (trailing space) never
# matches the value "Chrome OS".
os_flag <- function(os, name) {
  as.numeric(trimws(os) %in% name)
}

# One 0/1 indicator per operating system.
# NOTE(review): if these nine levels are exhaustive, lm() will report one
# indicator as aliased (NA) -- confirm, or drop a reference level.
laptop <- laptop %>%
  mutate(
    windows_10 = os_flag(op_sys, "Windows 10"),
    no_operating_system = os_flag(op_sys, "No OS"),
    linux = os_flag(op_sys, "Linux"),
    windows_7 = os_flag(op_sys, "Windows 7"),
    chrome_os = os_flag(op_sys, "Chrome OS"),
    mac_os = os_flag(op_sys, "macOS"),
    mac_os_x = os_flag(op_sys, "Mac OS X"),
    windows_10_s = os_flag(op_sys, "Windows 10 S"),
    android = os_flag(op_sys, "Android")
  )

# Show all nine indicators (windows_10_s included).
laptop %>%
  dplyr::select(
    windows_10, no_operating_system, linux, windows_7,
    chrome_os, mac_os, mac_os_x, windows_10_s, android
  ) %>%
  str()
## 'data.frame': 1303 obs. of 8 variables:
## $ windows_10 : num 0 0 0 0 0 1 0 0 1 1 ...
## $ no_operating_system: num 0 0 1 0 0 0 0 0 0 0 ...
## $ linux : num 0 0 0 0 0 0 0 0 0 0 ...
## $ windows_7 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ chrome_os : num 0 0 0 0 0 0 0 0 0 0 ...
## $ mac_os : num 1 1 0 1 1 0 0 1 0 0 ...
## $ mac_os_x : num 0 0 0 0 0 0 1 0 0 0 ...
## $ android : num 0 0 0 0 0 0 0 0 0 0 ...
# Modelling columns are selected by name rather than by position, so the
# subset does not silently change if columns are added or reordered.
model_columns <- c(
  "ram", "memory", "weight", "price",
  "touchscreen", "ips_display", "hd_display", "ppi",
  "intel_core_i3", "intel_core_i5", "intel_core_i7",
  "dual_core", "amd_processor", "other_processor",
  "nvidia_graphics", "amd_graphics", "intel_graphics",
  "windows_10", "no_operating_system", "linux", "windows_7",
  "chrome_os", "mac_os", "mac_os_x", "windows_10_s", "android"
)
laptop_subset <- laptop %>%
  dplyr::select(dplyr::all_of(model_columns))

# Each row is assigned to the training set with probability 0.7; the seed
# makes the split reproducible.
set.seed(1)
sample <- sample(
  c(TRUE, FALSE), nrow(laptop_subset),
  replace = TRUE, prob = c(0.7, 0.3)
)
train <- laptop_subset[sample, ]
test <- laptop_subset[!sample, ]

library(MASS)
full_model <- lm(price ~ ., data = train) # full model with all the variables

# Backward elimination driven by AIC (not by p-values); trace = FALSE
# suppresses the step-by-step log. stepAIC() has no `data` argument.
backward_regression <- stepAIC(
  full_model,
  direction = "backward",
  scope = list(lower = ~1),
  trace = FALSE
)
summary(backward_regression)
##
## Call:
## lm(formula = price ~ ram + memory + weight + hd_display + ppi +
## intel_core_i3 + intel_core_i5 + intel_core_i7 + amd_processor +
## other_processor + amd_graphics + no_operating_system + linux +
## windows_7 + mac_os, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -66745 -10935 -1733 8259 135827
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -27256.725 4312.931 -6.320 4.13e-10 ***
## ram 3630.263 180.835 20.075 < 2e-16 ***
## memory -5.714 1.723 -3.315 0.000953 ***
## weight 6531.871 1385.708 4.714 2.82e-06 ***
## hd_display -2834.155 1523.850 -1.860 0.063233 .
## ppi 211.398 19.225 10.996 < 2e-16 ***
## intel_core_i3 7918.765 2824.250 2.804 0.005159 **
## intel_core_i5 18141.763 2407.517 7.535 1.19e-13 ***
## intel_core_i7 29296.716 2697.793 10.860 < 2e-16 ***
## amd_processor 11233.685 4243.068 2.648 0.008251 **
## other_processor 97142.568 14179.695 6.851 1.37e-11 ***
## amd_graphics -11594.426 2336.324 -4.963 8.32e-07 ***
## no_operating_system -13535.440 3352.309 -4.038 5.86e-05 ***
## linux -9475.005 3244.385 -2.920 0.003583 **
## windows_7 27642.342 3370.634 8.201 8.25e-16 ***
## mac_os 10963.073 7525.384 1.457 0.145519
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19370 on 894 degrees of freedom
## Multiple R-squared: 0.7348, Adjusted R-squared: 0.7304
## F-statistic: 165.2 on 15 and 894 DF, p-value: < 2.2e-16
An adjusted R-squared of 0.7304 means that about 73 percent of the variance in the dependent variable (price) is explained by the independent variables, which indicates a reasonably good fit of the model to the training data.
The p-value of the F-statistic (< 2.2e-16) is below 0.05, which means that the model as a whole is statistically significant in predicting the price of a laptop.
# Residuals against fitted values of the selected model.
with(
  backward_regression,
  plot(fitted.values, residuals, xlab = "Fitted Values", ylab = "Residuals")
)
## lag Autocorrelation D-W Statistic p-value
## 1 0.0472467 1.90477 0.14
## Alternative hypothesis: rho != 0
## ram memory weight hd_display
## 1.981195 1.548162 1.986819 1.285926
## ppi intel_core_i3 intel_core_i5 intel_core_i7
## 1.580429 1.808834 3.079975 4.251826
## amd_processor other_processor amd_graphics no_operating_system
## 2.009287 1.069451 1.621121 1.035651
## linux windows_7 mac_os
## 1.098448 1.047004 1.048466
The Price of the laptop can be predicted using the final regression model.
Initial Model:
price ~ ram + memory + weight + touchscreen + ips_display + hd_display + ppi + intel_core_i3 + intel_core_i5 + intel_core_i7 + dual_core + amd_processor + other_processor + nvidia_graphics + amd_graphics + intel_graphics + windows_10 + no_operating_system + linux + windows_7 + chrome_os + mac_os + mac_os_x + windows_10_s + android
Final Model:
price ~ ram + memory + weight + hd_display + ppi + intel_core_i3 + intel_core_i5 + intel_core_i7 + amd_processor + other_processor + amd_graphics + no_operating_system + linux + windows_7 + mac_os
Regression model
Price= -27256.725416 + 3630.263099 (ram ) + memory (-5.713723) + weight (6531.871213) + hd_display ( -2834.155057) + ppi ( 211.397877) + intel_core_i3 ( 7918.764798) + intel_core_i5 (18141.763446) + intel_core_i7 (29296.716320) + amd_processor (11233.684745) + other_processor (97142.568150) + amd_graphics (-11594.426323) + no_operating_system ( -13535.440259) + linux (-9475.004938) + windows_7 ( 27642.341792) + mac_os (10963.073052)
The application was built using the Shiny package. Here is the link to the application: https://pythias.shinyapps.io/LPDA/
# Chunk defaults: show code, hide messages and warnings. TRUE/FALSE are
# spelled out because T and F are ordinary variables that can be reassigned.
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)

# The dataset CSV is picked through an interactive file dialog.
laptop <- read.csv(file.choose()) # reading dataset

library(janitor)
# Keep columns 2 to 12 and standardise their names with clean_names().
laptop <- clean_names(laptop[2:12]) # Cleaning & keeping important variables
head(laptop, 5) # first 5 rows
colnames(laptop) # column names
str(laptop) # dataset classes
# variable conversion
library(dplyr)

# Strip the unit suffix so ram and weight can be stored as numbers.
laptop$ram <- as.numeric(sub("GB", "", laptop$ram))
laptop$weight <- as.numeric(sub("kg", "", laptop$weight))

# Removing every non-digit character leaves one digit string per row.
memory_code <- as.numeric(gsub("\\D", "", laptop$memory))

# Digit strings that do not already read as a capacity are recoded explicitly.
# NOTE(review): the codes look like terabyte or two-drive entries whose digits
# were concatenated (e.g. 1281 -> 1128) -- confirm against the raw column.
memory_recode <- c(
  `1` = 1000, `2` = 2000, `10` = 1000, `11` = 2000,
  `641` = 1064, `1281` = 1128, `1282` = 2128,
  `2561` = 1256, `2562` = 2256, `5121` = 1512, `5122` = 2512,
  `25610` = 1256, `51210` = 1512,
  `256256` = 512, `256500` = 756, `512256` = 768, `512512` = 1024
)

# A single numeric lookup instead of repeatedly comparing the numeric column
# with character literals; codes without an entry keep their numeric value.
recode_index <- match(memory_code, as.numeric(names(memory_recode)))
laptop$memory <- ifelse(
  is.na(recode_index),
  memory_code,
  unname(memory_recode[recode_index])
)

laptop %>%
  dplyr::select(ram, weight, memory) %>%
  str()
# Number of missing values per column.
colSums(is.na(laptop)) # missing values
# The generic dispatches to the data-frame method, which looks for duplicated
# rows; calling the default method directly would treat the data frame as a
# list and compare its columns instead.
anyDuplicated(laptop)
library(ggplot2)

# Number of laptops per company, split by laptop type.
laptop %>%
  ggplot(aes(company, fill = type_name)) +
  geom_bar(position = "dodge", width = 0.5) +
  theme_bw() +
  theme(axis.text.x = element_text(size = 10, hjust = 1, angle = 45)) +
  labs(
    title = "Distribution of Company vs Type of laptop ",
    fill = "Type of laptop",
    y = "frequency"
  ) # distribution

# The mapping is supplied as aes(x = price) at the plot level so that the
# layer inherits it (an `aes = (...)` argument is not a mapping).
hist1 <- ggplot(laptop, aes(x = price)) +
  geom_density(fill = "gold2", color = "black") +
  theme_bw() +
  labs(title = "Distribution of Price") # density plot

hist2 <- ggplot(laptop, aes(x = price)) +
  geom_histogram(color = "black", fill = "gold2") +
  theme_bw() +
  labs(title = "Distribution of Price", y = "frequency") # histogram plot

# patchwork's `+` places the two plots side by side.
library(patchwork)
plot(hist1 + hist2)
# Frequency table of laptops per brand, shown from most to least common.
brand_name <- as.data.frame(table(laptop$company))
colnames(brand_name) <- c("Brand Name", "Frequency")
brand_name %>%
  arrange(desc(Frequency)) # brand name impact

# Price distribution per brand; the red point marks each brand's median.
ggplot(laptop, aes(x = company, y = price, fill = company)) +
  geom_boxplot(outlier.color = "blue") +
  theme(axis.text.x = element_text(size = 10, hjust = 1, angle = 45)) +
  # `fun` is the current name of the deprecated `fun.y` argument.
  stat_summary(
    fun = median, geom = "point", shape = 20, size = 3, color = "red"
  ) +
  labs(y = "average price", x = "brand name")

# Frequency table of laptop types, shown from most to least common.
mb <- as.data.frame(table(laptop$type_name))
colnames(mb) <- c("type of laptop", "frequency")
mb %>%
  arrange(desc(frequency))

# Price distribution per laptop type.
ggplot(laptop, aes(x = type_name, y = price, fill = type_name)) +
  geom_boxplot() +
  theme_bw() +
  theme(
    legend.position = "right",
    axis.text.x = element_text(size = 10, hjust = 1, angle = 45)
  ) +
  labs(fill = "Laptop Type", y = "average price", x = "Laptop type")
# Four scatter plots of price against a numeric feature, each with a loess
# smoother, arranged in a 2 x 2 grid.
sp1 <- ggplot(laptop, aes(x = inches, y = price)) +
  geom_point(colour = "orange", shape = "circle") +
  geom_smooth(method = "loess") +
  theme_linedraw()

sp2 <- ggplot(laptop, aes(y = price, x = ram)) +
  geom_point(colour = "red2", shape = "triangle") +
  geom_smooth(method = "loess") +
  theme_linedraw()

sp3 <- ggplot(laptop, aes(y = price, x = weight)) +
  geom_point(colour = "green", shape = "square") +
  geom_smooth(method = "loess") +
  theme_linedraw()

sp4 <- ggplot(laptop, aes(y = price, x = memory)) +
  geom_point(colour = "blue4", shape = "k") +
  geom_smooth(method = "loess") +
  theme_linedraw()

library(gridExtra)
grid.arrange(sp1, sp2, sp3, sp4, ncol = 2, nrow = 2)
# Ten most frequent screen_resolution values.
fe <- as.data.frame(table(laptop$screen_resolution))
fe %>%
  arrange(desc(Freq)) %>%
  head(10)

library(stringr)

# str_match() yields the full match plus one column per capture group, so
# V2 and V3 hold the two numbers on either side of the "x".
result <- as.data.frame(str_match(laptop$screen_resolution, "(\\d+)x(\\d+)"))

# Resolution dimensions plus 0/1 flags derived from the resolution text.
laptop <- laptop %>%
  mutate(
    x_dim = as.numeric(result$V2),
    y_dim = as.numeric(result$V3),
    touchscreen = as.numeric(grepl("Touchscreen", screen_resolution)),
    ips_display = as.numeric(grepl("IPS Panel", screen_resolution)),
    hd_display = as.numeric(grepl("Full HD", screen_resolution))
  )

laptop %>%
  dplyr::select(x_dim, y_dim, touchscreen, ips_display, hd_display) %>%
  str()
# Touchscreen: frequency table, then price distribution per flag value.
tsf <- setNames(
  as.data.frame(table(laptop$touchscreen)),
  c("touchscreen feature", "freq")
)
tsf
ggplot(
  laptop,
  aes(x = touchscreen, y = price, fill = factor(touchscreen))
) +
  geom_boxplot() +
  theme_bw() +
  labs(fill = "Touchscreen Feature")

# IPS panel: frequency table, then price distribution per flag value.
ips <- setNames(
  as.data.frame(table(laptop$ips_display)),
  c("ips display feature", "freq")
)
ips
ggplot(
  laptop,
  aes(x = ips_display, y = price, fill = factor(ips_display))
) +
  geom_boxplot() +
  theme_bw() +
  labs(fill = "Ips Display Feature")

# Full HD: frequency table, then price distribution per flag value.
hd <- setNames(
  as.data.frame(table(laptop$hd_display)),
  c("hd display feature", "freq")
)
hd
ggplot(
  laptop,
  aes(x = hd_display, y = price, fill = factor(hd_display))
) +
  geom_boxplot() +
  theme_bw() +
  labs(fill = "HD Display Feature")
library(knitr)

# Pairwise correlations between price and the numeric features.
co_lp <- laptop %>%
  dplyr::select(
    price, ips_display, hd_display, x_dim, y_dim, touchscreen, inches,
    weight, ram, memory
  ) %>%
  cor()
co_lp

# ppi: length of the pixel diagonal divided by the screen size in inches.
laptop$ppi <- (laptop$y_dim^2 + laptop$x_dim^2)^0.5 / laptop$inches
cor(laptop$ppi, laptop$price)
# Ten most frequent cpu descriptions.
cp <- laptop$cpu %>%
  table() %>%
  as.data.frame() %>%
  arrange(desc(Freq))
head(cp, 10)

# 0/1 indicators for whether the cpu description contains each pattern.
laptop <- laptop %>%
  mutate(
    intel_core_i3 = as.numeric(grepl("Intel Core i3", cpu)),
    intel_core_i5 = as.numeric(grepl("Intel Core i5", cpu)),
    intel_core_i7 = as.numeric(grepl("Intel Core i7", cpu)),
    dual_core = as.numeric(grepl("Dual Core", cpu)),
    amd_processor = as.numeric(grepl("AMD ", cpu)),
    other_processor = as.numeric(grepl("Intel Xeon", cpu))
  )

laptop %>%
  dplyr::select(
    intel_core_i3, intel_core_i5, intel_core_i7, dual_core,
    amd_processor, other_processor
  ) %>%
  str()

# Ten most frequent gpu descriptions.
gp <- laptop$gpu %>%
  table() %>%
  as.data.frame() %>%
  arrange(desc(Freq))
head(gp, 10)

# 0/1 indicators for the graphics vendor named in the gpu description.
laptop <- laptop %>%
  mutate(
    nvidia_graphics = as.numeric(grepl("Nvidia", gpu)),
    amd_graphics = as.numeric(grepl("AMD", gpu)),
    intel_graphics = as.numeric(grepl("Intel", gpu))
  )

laptop %>%
  dplyr::select(nvidia_graphics, amd_graphics, intel_graphics) %>%
  str()
# Most frequent operating systems.
op <- laptop$op_sys %>%
  table() %>%
  as.data.frame() %>%
  arrange(desc(Freq))
op %>% head(10)

# Returns 1 where the (whitespace-trimmed) operating system equals `name`,
# otherwise 0. Exact matching is needed because "Windows 10" is a prefix of
# "Windows 10 S", and a pattern such as "Chrome OS " (trailing space) never
# matches the value "Chrome OS".
os_flag <- function(os, name) {
  as.numeric(trimws(os) %in% name)
}

# One 0/1 indicator per operating system.
# NOTE(review): if these nine levels are exhaustive, lm() will report one
# indicator as aliased (NA) -- confirm, or drop a reference level.
laptop <- laptop %>%
  mutate(
    windows_10 = os_flag(op_sys, "Windows 10"),
    no_operating_system = os_flag(op_sys, "No OS"),
    linux = os_flag(op_sys, "Linux"),
    windows_7 = os_flag(op_sys, "Windows 7"),
    chrome_os = os_flag(op_sys, "Chrome OS"),
    mac_os = os_flag(op_sys, "macOS"),
    mac_os_x = os_flag(op_sys, "Mac OS X"),
    windows_10_s = os_flag(op_sys, "Windows 10 S"),
    android = os_flag(op_sys, "Android")
  )

# Show all nine indicators (windows_10_s included).
laptop %>%
  dplyr::select(
    windows_10, no_operating_system, linux, windows_7,
    chrome_os, mac_os, mac_os_x, windows_10_s, android
  ) %>%
  str()
# Modelling columns are selected by name rather than by position, so the
# subset does not silently change if columns are added or reordered.
model_columns <- c(
  "ram", "memory", "weight", "price",
  "touchscreen", "ips_display", "hd_display", "ppi",
  "intel_core_i3", "intel_core_i5", "intel_core_i7",
  "dual_core", "amd_processor", "other_processor",
  "nvidia_graphics", "amd_graphics", "intel_graphics",
  "windows_10", "no_operating_system", "linux", "windows_7",
  "chrome_os", "mac_os", "mac_os_x", "windows_10_s", "android"
)
laptop_subset <- laptop %>%
  dplyr::select(dplyr::all_of(model_columns))

# Each row is assigned to the training set with probability 0.7; the seed
# makes the split reproducible.
set.seed(1)
sample <- sample(
  c(TRUE, FALSE), nrow(laptop_subset),
  replace = TRUE, prob = c(0.7, 0.3)
)
train <- laptop_subset[sample, ]
test <- laptop_subset[!sample, ]

library(MASS)
full_model <- lm(price ~ ., data = train) # full model with all the variables

# Backward elimination driven by AIC (not by p-values); trace = FALSE
# suppresses the step-by-step log. stepAIC() has no `data` argument.
backward_regression <- stepAIC(
  full_model,
  direction = "backward",
  scope = list(lower = ~1),
  trace = FALSE
)
summary(backward_regression)
# Residuals against fitted values of the selected model.
with(
  backward_regression,
  plot(fitted.values, residuals, xlab = "Fitted Values", ylab = "Residuals")
)

library(car)
# Durbin-Watson test for autocorrelation in the residuals.
durbinWatsonTest(backward_regression)
# Scale-location plot (third of lm's built-in diagnostic plots).
plot(backward_regression, which = 3)
# Normal Q-Q plot of the residuals.
qqnorm(backward_regression$residuals)
# Variance inflation factors of the retained predictors.
vif(backward_regression)