Process data after scraping it from Google chrome

Here data is being scrpaed from Google chrome Extension and then after taking out data here are the steps to clean the data and do statistical analysis and data visualisation.

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

#Import Raw data 
df_scraped <- read.csv("/Users/hiteshyadav/Desktop/flipkart_ac.csv")

raw_df <- read.csv("/Users/hiteshyadav/Desktop/flipkart_ac.csv")



# As we can see that web scraper order id uniqe 


# SO we will convert it to numeric type 
raw_df$web.scraper.order <- as.numeric(raw_df$web.scraper.order)

# Then sort the data according to order id

raw_df <- raw_df %>% arrange(web.scraper.order)

# Now we will remove variables like scraper_start_url, ac_links since they are of no use 

# as link to the each AC is there in ac_links.href

raw_df <- raw_df[, -c(2, 3)]


# Take out scrape order and link to another df 


raw_df_link <- raw_df[, 1:2]

raw_df <- raw_df[,-2]


raw_df <- data.frame(apply(raw_df, 2, as.character), stringsAsFactors = F)

# Now we will remove the observation which have null observation in them 
# as these details are not available with us

# Replacing null by NA

raw_df$price[raw_df$price == "null"] = NA


# Since here null means that discount is 0
raw_df$percentoff[as.character(raw_df$percentoff) == "null"] = 0

raw_df$brand[raw_df$brand == "null"] = NA
raw_df$type[raw_df$type == "null"] = NA
raw_df$capacity.in.tons[raw_df$capacity.in.tons == "null"] = NA
raw_df$star.rating[raw_df$star.rating == "null"] = NA
raw_df$cooling.capacity[raw_df$cooling.capacity == "null"] = NA
raw_df$condensor.coil[raw_df$condensor.coil == "null"] = NA


# Then remove observation with NA

index_complete_data <- complete.cases(raw_df)


raw_df <- raw_df[index_complete_data, ]

#Converting prices in numeric type and removing signs of rs

raw_df$price <- as.numeric(gsub(",","",substring(raw_df$price, 2)))

raw_df$percentoff <- as.numeric(gsub("% off","",raw_df$percentoff))

raw_df$brand <- as.factor((raw_df$brand))

raw_df$type <- as.factor((raw_df$type))

raw_df$capacity.in.tons <- as.numeric(gsub(" Ton","",raw_df$capacity.in.tons))

## Warning: NAs introduced by coercion

raw_df$star.rating <- as.factor(gsub(" Star BEE Rating","",raw_df$star.rating))

raw_df$cooling.capacity <- as.numeric(gsub(" W","",raw_df$cooling.capacity))

## Warning: NAs introduced by coercion

raw_df$condensor.coil <- as.factor(as.character(raw_df$condensor.coil))

index_complete_data <- complete.cases(raw_df)


raw_df <- raw_df[index_complete_data, ]

raw_df$web.scraper.order <- as.numeric(raw_df$web.scraper.order)

# Now we have data in a well cleaned way

# Now we will megre two data frames one raw_df and raw_df_linl

neat_data <- merge.data.frame(raw_df, raw_df_link, by.x = "web.scraper.order", by.y = "web.scraper.order")





##########Analysis of the data###############

#Outlier analysis

par(mfrow = c(2, 2)) 

boxplot(neat_data$price, ylab = "Price")
boxplot(neat_data$percentoff, ylab = "% off")
boxplot(neat_data$capacity.in.tons, ylab = "Capacity in tons")
boxplot(neat_data$cooling.capacity, ylab = "Cooling capacity")

neat_data$capacity.in.tons[which.max(neat_data$capacity.in.tons)]

## [1] 2.3

neat_data[which.max(neat_data$capacity.in.tons), -ncol(neat_data)]

##    web.scraper.order price percentoff  brand  type capacity.in.tons star.rating
## 10                10 53500         20 Godrej Split              2.3           3
##    cooling.capacity condensor.coil
## 10             7250         Copper

table(neat_data$brand)

## 
##             Akai        Blue Star          Carrier           Daikin 
##                1               24               13               27 
##           Godrej            Haier          Hitachi          Hyundai 
##               14                2               20                1 
##              IFB            Impex               LG          Livpure 
##                6                3               36                1 
##            Lloyd MarQ by Flipkart            Midea          Mitashi 
##               23                9                6                1 
##        O-General            Onida        Panasonic          Samsung 
##                1               15               14               21 
##           Sansui          Toshiba           Voltas        Whirlpool 
##                2                6               16               15

table(neat_data$type)

## 
## Split 
##   277

table(neat_data$star.rating)

## 
##   2   3   4   5 
##   8 195  14  60

hist((neat_data$price))

boxplot(neat_data$price)

boxplot(neat_data$cooling.capacity)









# As we can see that there is one observatioin which is oulier
# So we will remove that observation from data 
neat_data <- neat_data[neat_data$cooling.capacity < 10000, ]

library(ggplot2)
ggplot(neat_data, aes(x = cooling.capacity, y = price, color = star.rating)) + geom_point()



ggplot(neat_data, aes(x = cooling.capacity, y = price, color = condensor.coil)) + geom_point()


# Corrs tabulation of categorical varaibles
corsstab <- neat_data %>% group_by(condensor.coil, brand, star.rating) %>% summarise(number = n())


# Correlation between cooling capacity and price 

cor.test(neat_data$price, neat_data$cooling.capacity)

## 
##  Pearson's product-moment correlation
## 
## data:  neat_data$price and neat_data$cooling.capacity
## t = 11.031, df = 274, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4670787 0.6312985
## sample estimates:
##       cor 
## 0.5545649

# which implies that there is positive correaltion between price of air conditioner and cooling capacity



# Fitting Linear regression model to predict log of prices of Air conditoner
m <- lm(log(price)~percentoff+brand+capacity.in.tons+star.rating+cooling.capacity+ condensor.coil, data = neat_data)
summary(m)

## 
## Call:
## lm(formula = log(price) ~ percentoff + brand + capacity.in.tons + 
##     star.rating + cooling.capacity + condensor.coil, data = neat_data)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.312316 -0.038857 -0.001398  0.049691  0.253116 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              9.896e+00  1.022e-01  96.817  < 2e-16 ***
## percentoff              -6.836e-03  6.331e-04 -10.798  < 2e-16 ***
## brandBlue Star           1.510e-01  8.882e-02   1.700  0.09037 .  
## brandCarrier             1.232e-01  8.946e-02   1.377  0.16971    
## brandDaikin              7.495e-02  8.872e-02   0.845  0.39903    
## brandGodrej              4.282e-02  8.958e-02   0.478  0.63305    
## brandHaier              -3.578e-02  1.091e-01  -0.328  0.74322    
## brandHitachi             1.061e-01  8.977e-02   1.182  0.23853    
## brandHyundai            -1.952e-01  1.194e-01  -1.634  0.10351    
## brandIFB                 2.525e-02  9.362e-02   0.270  0.78760    
## brandImpex               2.633e-01  1.011e-01   2.605  0.00976 ** 
## brandLG                  1.718e-01  8.787e-02   1.955  0.05167 .  
## brandLivpure             2.999e-02  1.192e-01   0.252  0.80160    
## brandLloyd               1.011e-01  8.850e-02   1.143  0.25428    
## brandMarQ by Flipkart   -9.783e-02  9.162e-02  -1.068  0.28670    
## brandMidea              -2.890e-02  9.272e-02  -0.312  0.75555    
## brandMitashi             6.537e-02  1.200e-01   0.545  0.58652    
## brandO-General           3.996e-01  1.228e-01   3.256  0.00129 ** 
## brandOnida               2.095e-02  9.032e-02   0.232  0.81674    
## brandPanasonic           6.747e-02  9.082e-02   0.743  0.45826    
## brandSamsung             1.838e-01  8.943e-02   2.056  0.04089 *  
## brandSansui             -9.133e-02  1.044e-01  -0.875  0.38258    
## brandToshiba             2.768e-01  9.582e-02   2.889  0.00421 ** 
## brandVoltas              8.581e-02  8.880e-02   0.966  0.33482    
## brandWhirlpool           2.722e-02  8.884e-02   0.306  0.75961    
## capacity.in.tons         4.813e-01  6.383e-02   7.540 9.20e-13 ***
## star.rating3             7.334e-02  3.567e-02   2.056  0.04085 *  
## star.rating4             1.768e-01  4.137e-02   4.274 2.75e-05 ***
## star.rating5             2.566e-01  3.705e-02   6.924 3.85e-11 ***
## cooling.capacity        -2.876e-05  2.025e-05  -1.420  0.15683    
## condensor.coilAluminium -3.012e-02  3.760e-02  -0.801  0.42380    
## condensor.coilCopper     5.676e-02  2.866e-02   1.980  0.04881 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08122 on 244 degrees of freedom
## Multiple R-squared:  0.8674, Adjusted R-squared:  0.8505 
## F-statistic: 51.49 on 31 and 244 DF,  p-value: < 2.2e-16

plot(m)

## Warning: not plotting observations with leverage one:
##   110, 150, 182, 189, 240

## Warning: not plotting observations with leverage one:
##   110, 150, 182, 189, 240

# As we can see that we can easily predict the price of the given it has all its specification

temp <- data.frame(fitted = exp(m$fitted.values), observed = neat_data$price)

head(temp)

##     fitted observed
## 1 47663.52    47188
## 2 38045.14    36990
## 3 35280.00    31990
## 4 53407.71    60090
## 5 48559.44    49790
## 6 42689.77    43999

#Export data as mentioned in the assignment  


library(readxl)
library(openxlsx)
# 
#write.xlsx(df_scraped, file = "Web scarping raw.xlsx")
#write.xlsx(neat_data, file = "Web scarping processed.xlsx")