Here data is being scrpaed from Google chrome Extension and then after taking out data here are the steps to clean the data and do statistical analysis and data visualisation.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#Import Raw data
df_scraped <- read.csv("/Users/hiteshyadav/Desktop/flipkart_ac.csv")
raw_df <- read.csv("/Users/hiteshyadav/Desktop/flipkart_ac.csv")
# As we can see that web scraper order id uniqe
# SO we will convert it to numeric type
raw_df$web.scraper.order <- as.numeric(raw_df$web.scraper.order)
# Then sort the data according to order id
raw_df <- raw_df %>% arrange(web.scraper.order)
# Now we will remove variables like scraper_start_url, ac_links since they are of no use
# as link to the each AC is there in ac_links.href
raw_df <- raw_df[, -c(2, 3)]
# Take out scrape order and link to another df
raw_df_link <- raw_df[, 1:2]
raw_df <- raw_df[,-2]
raw_df <- data.frame(apply(raw_df, 2, as.character), stringsAsFactors = F)
# Now we will remove the observation which have null observation in them
# as these details are not available with us
# Replacing null by NA
raw_df$price[raw_df$price == "null"] = NA
# Since here null means that discount is 0
raw_df$percentoff[as.character(raw_df$percentoff) == "null"] = 0
raw_df$brand[raw_df$brand == "null"] = NA
raw_df$type[raw_df$type == "null"] = NA
raw_df$capacity.in.tons[raw_df$capacity.in.tons == "null"] = NA
raw_df$star.rating[raw_df$star.rating == "null"] = NA
raw_df$cooling.capacity[raw_df$cooling.capacity == "null"] = NA
raw_df$condensor.coil[raw_df$condensor.coil == "null"] = NA
# Then remove observation with NA
index_complete_data <- complete.cases(raw_df)
raw_df <- raw_df[index_complete_data, ]
#Converting prices in numeric type and removing signs of rs
raw_df$price <- as.numeric(gsub(",","",substring(raw_df$price, 2)))
raw_df$percentoff <- as.numeric(gsub("% off","",raw_df$percentoff))
raw_df$brand <- as.factor((raw_df$brand))
raw_df$type <- as.factor((raw_df$type))
raw_df$capacity.in.tons <- as.numeric(gsub(" Ton","",raw_df$capacity.in.tons))
## Warning: NAs introduced by coercion
raw_df$star.rating <- as.factor(gsub(" Star BEE Rating","",raw_df$star.rating))
raw_df$cooling.capacity <- as.numeric(gsub(" W","",raw_df$cooling.capacity))
## Warning: NAs introduced by coercion
raw_df$condensor.coil <- as.factor(as.character(raw_df$condensor.coil))
index_complete_data <- complete.cases(raw_df)
raw_df <- raw_df[index_complete_data, ]
raw_df$web.scraper.order <- as.numeric(raw_df$web.scraper.order)
# Now we have data in a well cleaned way
# Now we will megre two data frames one raw_df and raw_df_linl
neat_data <- merge.data.frame(raw_df, raw_df_link, by.x = "web.scraper.order", by.y = "web.scraper.order")
##########Analysis of the data###############
#Outlier analysis
par(mfrow = c(2, 2))
boxplot(neat_data$price, ylab = "Price")
boxplot(neat_data$percentoff, ylab = "% off")
boxplot(neat_data$capacity.in.tons, ylab = "Capacity in tons")
boxplot(neat_data$cooling.capacity, ylab = "Cooling capacity")
neat_data$capacity.in.tons[which.max(neat_data$capacity.in.tons)]
## [1] 2.3
neat_data[which.max(neat_data$capacity.in.tons), -ncol(neat_data)]
## web.scraper.order price percentoff brand type capacity.in.tons star.rating
## 10 10 53500 20 Godrej Split 2.3 3
## cooling.capacity condensor.coil
## 10 7250 Copper
table(neat_data$brand)
##
## Akai Blue Star Carrier Daikin
## 1 24 13 27
## Godrej Haier Hitachi Hyundai
## 14 2 20 1
## IFB Impex LG Livpure
## 6 3 36 1
## Lloyd MarQ by Flipkart Midea Mitashi
## 23 9 6 1
## O-General Onida Panasonic Samsung
## 1 15 14 21
## Sansui Toshiba Voltas Whirlpool
## 2 6 16 15
table(neat_data$type)
##
## Split
## 277
table(neat_data$star.rating)
##
## 2 3 4 5
## 8 195 14 60
hist((neat_data$price))
boxplot(neat_data$price)
boxplot(neat_data$cooling.capacity)
# As we can see that there is one observatioin which is oulier
# So we will remove that observation from data
neat_data <- neat_data[neat_data$cooling.capacity < 10000, ]
library(ggplot2)
ggplot(neat_data, aes(x = cooling.capacity, y = price, color = star.rating)) + geom_point()
ggplot(neat_data, aes(x = cooling.capacity, y = price, color = condensor.coil)) + geom_point()
# Corrs tabulation of categorical varaibles
corsstab <- neat_data %>% group_by(condensor.coil, brand, star.rating) %>% summarise(number = n())
# Correlation between cooling capacity and price
cor.test(neat_data$price, neat_data$cooling.capacity)
##
## Pearson's product-moment correlation
##
## data: neat_data$price and neat_data$cooling.capacity
## t = 11.031, df = 274, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4670787 0.6312985
## sample estimates:
## cor
## 0.5545649
# which implies that there is positive correaltion between price of air conditioner and cooling capacity
# Fitting Linear regression model to predict log of prices of Air conditoner
m <- lm(log(price)~percentoff+brand+capacity.in.tons+star.rating+cooling.capacity+ condensor.coil, data = neat_data)
summary(m)
##
## Call:
## lm(formula = log(price) ~ percentoff + brand + capacity.in.tons +
## star.rating + cooling.capacity + condensor.coil, data = neat_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.312316 -0.038857 -0.001398 0.049691 0.253116
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.896e+00 1.022e-01 96.817 < 2e-16 ***
## percentoff -6.836e-03 6.331e-04 -10.798 < 2e-16 ***
## brandBlue Star 1.510e-01 8.882e-02 1.700 0.09037 .
## brandCarrier 1.232e-01 8.946e-02 1.377 0.16971
## brandDaikin 7.495e-02 8.872e-02 0.845 0.39903
## brandGodrej 4.282e-02 8.958e-02 0.478 0.63305
## brandHaier -3.578e-02 1.091e-01 -0.328 0.74322
## brandHitachi 1.061e-01 8.977e-02 1.182 0.23853
## brandHyundai -1.952e-01 1.194e-01 -1.634 0.10351
## brandIFB 2.525e-02 9.362e-02 0.270 0.78760
## brandImpex 2.633e-01 1.011e-01 2.605 0.00976 **
## brandLG 1.718e-01 8.787e-02 1.955 0.05167 .
## brandLivpure 2.999e-02 1.192e-01 0.252 0.80160
## brandLloyd 1.011e-01 8.850e-02 1.143 0.25428
## brandMarQ by Flipkart -9.783e-02 9.162e-02 -1.068 0.28670
## brandMidea -2.890e-02 9.272e-02 -0.312 0.75555
## brandMitashi 6.537e-02 1.200e-01 0.545 0.58652
## brandO-General 3.996e-01 1.228e-01 3.256 0.00129 **
## brandOnida 2.095e-02 9.032e-02 0.232 0.81674
## brandPanasonic 6.747e-02 9.082e-02 0.743 0.45826
## brandSamsung 1.838e-01 8.943e-02 2.056 0.04089 *
## brandSansui -9.133e-02 1.044e-01 -0.875 0.38258
## brandToshiba 2.768e-01 9.582e-02 2.889 0.00421 **
## brandVoltas 8.581e-02 8.880e-02 0.966 0.33482
## brandWhirlpool 2.722e-02 8.884e-02 0.306 0.75961
## capacity.in.tons 4.813e-01 6.383e-02 7.540 9.20e-13 ***
## star.rating3 7.334e-02 3.567e-02 2.056 0.04085 *
## star.rating4 1.768e-01 4.137e-02 4.274 2.75e-05 ***
## star.rating5 2.566e-01 3.705e-02 6.924 3.85e-11 ***
## cooling.capacity -2.876e-05 2.025e-05 -1.420 0.15683
## condensor.coilAluminium -3.012e-02 3.760e-02 -0.801 0.42380
## condensor.coilCopper 5.676e-02 2.866e-02 1.980 0.04881 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08122 on 244 degrees of freedom
## Multiple R-squared: 0.8674, Adjusted R-squared: 0.8505
## F-statistic: 51.49 on 31 and 244 DF, p-value: < 2.2e-16
plot(m)
## Warning: not plotting observations with leverage one:
## 110, 150, 182, 189, 240
## Warning: not plotting observations with leverage one:
## 110, 150, 182, 189, 240
# As we can see that we can easily predict the price of the given it has all its specification
temp <- data.frame(fitted = exp(m$fitted.values), observed = neat_data$price)
head(temp)
## fitted observed
## 1 47663.52 47188
## 2 38045.14 36990
## 3 35280.00 31990
## 4 53407.71 60090
## 5 48559.44 49790
## 6 42689.77 43999
#Export data as mentioned in the assignment
library(readxl)
library(openxlsx)
#
#write.xlsx(df_scraped, file = "Web scarping raw.xlsx")
#write.xlsx(neat_data, file = "Web scarping processed.xlsx")