# Summary of how to scrape and manipulate data from a website in R, with an example (Kawasaki motorcycles)

library(rvest)
library(data.table)
library(xml2)
library(dplyr)
library(ggplot2)
library(ggpubr)

# 1 - Make a function to get info from one link

#' Scrape the spec labels and values from one motorcycle detail page.
#'
#' Reads the page at `url`, collects the text of the `.bold` nodes (labels)
#' and `.spec-value` nodes (values) inside the `.sl-spec-section-0` and
#' `.sl-spec-section-2` sections, and pairs them up by position.
#'
#' @param url Address of the detail page to read.
#' @return A data frame with a `url` column followed by one column per label.
#'   Column names go through `data.frame()`'s default name checking, so
#'   characters such as spaces and parentheses become dots. Values are
#'   whitespace-trimmed strings; a label with no matching value gets `NA`.
get_kavasaki_detail <- function(url) {
  motorcycle <- read_html(url)

  key <- motorcycle %>%
    html_nodes(".sl-spec-section-2 .bold , .sl-spec-section-0 .bold") %>%
    html_text()

  value <- motorcycle %>%
    html_nodes(".sl-spec-section-2 .spec-value , .sl-spec-section-0 .spec-value") %>%
    html_text()

  data_list <- list()
  data_list[["url"]] <- url

  # seq_along() gives an empty sequence when the page has no spec labels;
  # 1:length(key) would instead iterate over c(1, 0) and index with NA/empty.
  for (i in seq_along(key)) {
    data_list[[key[i]]] <- trimws(value[i])
  }

  data.frame(data_list)
}

# 2 - Collect all the links

### one page

# Read the first listing page.
first <- read_html(
  "https://www.motorcycle.com/specs/kawasaki.html?page_num=1"
)

# Pull the href of every ".card-link" node on that page.
inner_pages <- html_attr(html_nodes(first, ".card-link"), "href")

# Prefix each href with the site root to get a full URL.
my_links <- paste0("https://www.motorcycle.com", inner_pages)

### all pages

# Visit listing pages 1..42 and collect the href of every ".card-link" node.
# lapply() + unlist() avoids growing a vector with c() on every iteration,
# and keeps the per-page temporaries local so they neither overwrite
# `my_links` from the one-page example nor shadow base::t().
# NOTE(review): 42 is the hard-coded page count from the original script —
# confirm it still matches the site.
all_links <- unlist(lapply(seq_len(42), function(page_num) {
  listing <- read_html(
    paste0("https://www.motorcycle.com/specs/kawasaki.html?page_num=", page_num)
  )
  listing %>% html_nodes(".card-link") %>% html_attr("href")
}))

# Build the full URLs once, after all pages have been collected.
final_links <- paste0("https://www.motorcycle.com", all_links)

# 3 - Create a data frame

# Scrape every detail page; each call returns one data frame.
my_list <- lapply(final_links, get_kavasaki_detail)

# Stack the per-page results. fill = TRUE pads columns that are absent from
# a given page with NA (presumably pages do not all list the same specs).
# TRUE is spelled out because T is an ordinary, reassignable variable.
kawasaki_df <- rbindlist(my_list, fill = TRUE)

### Rename, subset, and convert data as you prefer

# Rename columns 2, 3 and 11 in one call; setnames() is data.table's
# by-reference way to rename and accepts column positions.
# NOTE(review): positional renaming depends on the column order produced by
# the scrape — confirm if the site layout changes.
setnames(kawasaki_df, c(2L, 3L, 11L), c("Type", "Price", "Name"))

# Strip dollar signs and commas, then parse the price as a number.
kawasaki_df$Price <- as.numeric(gsub("[$,]", "", kawasaki_df$Price))

# as.character() first so a factor column would convert by label, not by code.
kawasaki_df$Warranty <- as.numeric(as.character(kawasaki_df$Warranty))

# Print the class of every column. Iterating over the table itself makes this
# explicit: on a data.table, `kawasaki_df[3]` selects row 3 rather than
# column 3, so the previous form only listed all columns by accident.
print(vapply(kawasaki_df, function(col) class(col)[1L], character(1)))
##                                url                               Type 
##                        "character"                        "character" 
##                              Price                            Dealers 
##                          "numeric"                        "character" 
##                           Warranty                          Insurance 
##                          "numeric"                        "character" 
##                            Finance             Generic.Type..Primary. 
##                        "character"                        "character" 
##               Manufacturer.Country                     Parent.Company 
##                        "character"                        "character" 
##                               Name                               Year 
##                        "character"                        "character" 
##                               Make                  Transmission.Type 
##                        "character"                        "character" 
##                   Number.Of.Speeds         Primary.Drive..Rear.Wheel. 
##                        "character"                        "character" 
##                            Reverse                          Overdrive 
##                        "character"                        "character" 
##                  Introduction.Year           Gear.Ratio..1.2.3.4.5.6. 
##                        "character"                        "character" 
##                  Final.Drive.Ratio Manufacturer.Recommend.Minimum.Age 
##                        "character"                        "character" 
##             Gear.Ratio..1.2.3.4.5.                Adjustable.Throttle 
##                        "character"                        "character" 
##                   Heel.Toe.Shifter                     Gear.Ratio..1. 
##                        "character"                        "character" 
##               Representative.Image                 Gear.Ratio..1.2.3. 
##                        "character"                        "character" 
##                      Photo.Gallery 
##                        "character"

# 4 - Now you can manipulate and visualize your data frame

# Here are some examples:

# The 50 rows with the highest Price (descending order).
Top_fifty <- head(arrange(kawasaki_df, desc(Price)), 50)


# Price of each of the fifty most expensive models, coloured by price.
ggplot(Top_fifty, aes(Name, Price, color = Price)) +
  geom_jitter() +
  theme_classic() +
  theme(
    # Tilt the model names so they do not overlap along the x axis.
    axis.text.x = element_text(angle = 35, hjust = 1),
    axis.text.y = element_text(angle = 0, hjust = 1),
    axis.title.x = element_text(colour = "navy"),
    axis.title.y = element_text(colour = "navy")
  ) +
  scale_color_gradient(low = "pink2", high = "purple4") +
  labs(
    title = "Top Fifty The Most Expensive",
    # Subtitle now names the theme and geom actually used above.
    subtitle = "Theme=theme_classic, plot=geom_jitter, price= in Dollar",
    x = "Names",
    y = "Prices"
  )

# All rows, ordered by Type.
types <- arrange(kawasaki_df, Type)

# Box plot of Price for each motorcycle Type.
ggplot(types, aes(Type, Price)) +
  geom_boxplot() +
  # theme_get() returns the currently active theme.
  theme_get() +
  theme(
    axis.text.x = element_text(angle = 35, hjust = 1, color = "orange3"),
    axis.text.y = element_text(angle = 0, hjust = 1, color = "black"),
    axis.title.x = element_text(colour = "navy"),
    axis.title.y = element_text(colour = "navy")
  ) +
  labs(
    title = "Average Prices For Different Categories",
    subtitle = "Theme=theme_get, plot=geom_boxplot, price= in Dollar",
    x = "Types",
    y = "Price"
  )

# Warranty of each of the fifty most expensive models, coloured by price.
ggplot(Top_fifty, aes(Warranty, Name, color = Price)) +
  geom_count() +
  # NOTE(review): theme_update() is meant for modifying the global theme; with
  # no arguments it appears to behave like theme_get() here — confirm.
  theme_update() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 1, color = "black"),
    axis.text.y = element_text(angle = 35, hjust = 1, color = "pink4"),
    axis.title.x = element_text(colour = "navy"),
    axis.title.y = element_text(colour = "navy")
  ) +
  scale_color_gradient(low = "skyblue", high = "purple4") +
  labs(
    title = "Fifty Motorcycles with Highest Price by Warranty",
    # Explicit "\n" keeps the two-line subtitle without leaking the source
    # indentation into the rendered text.
    subtitle = paste0(
      "Theme=theme_update, plot=geom_count,\n",
      "price= in Dollar, Warranty= in Month"
    ),
    x = "Warranty",
    y = "Name"
  )

# The 50 rows with the highest Warranty (descending order).
Warranty <- head(arrange(kawasaki_df, desc(Warranty)), 50)
  
# Price of each of the fifty models with the highest warranty.
ggplot(Warranty, aes(Price, Name, color = Price)) +
  geom_count() +
  # NOTE(review): theme_update() is meant for modifying the global theme; with
  # no arguments it appears to behave like theme_get() here — confirm.
  theme_update() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 1, color = "purple"),
    axis.text.y = element_text(angle = 35, hjust = 1, color = "cyan4"),
    axis.title.x = element_text(colour = "brown"),
    axis.title.y = element_text(colour = "brown")
  ) +
  scale_color_gradient(low = "orange", high = "red3") +
  labs(
    title = "Fifty Motorcycles with Highest Warranty(36) by Price",
    # Explicit "\n" keeps the two-line subtitle without leaking the source
    # indentation into the rendered text.
    subtitle = paste0(
      "Theme=theme_update, plot=geom_count,\n",
      "price= in Dollar, Warranty= in Month"
    ),
    x = "Price",
    y = "Name"
  )

# Cheers!