Summary of how to scrape and manipulate data from a website in R, with an example (Kawasaki motorcycles)
library(rvest)
library(data.table)
library(xml2)
library(dplyr)
library(ggplot2)
library(ggpubr)
1 - Make a function to get info from one link
#' Scrape the spec labels and values of one motorcycle detail page.
#'
#' Downloads `url`, collects the text of the elements matched by the
#' `.sl-spec-section-0` / `.sl-spec-section-2` label (`.bold`) and value
#' (`.spec-value`) selectors, and pairs them up by position.
#'
#' @param url Address of the detail page to read.
#' @return A one-row data.frame: a `url` column followed by one column per
#'   spec label (names are made syntactic by `data.frame()`), holding the
#'   whitespace-trimmed value text. A page with no matching labels yields a
#'   data.frame containing only `url`.
get_kavasaki_detail <- function(url) {
  label_css <- ".sl-spec-section-2 .bold , .sl-spec-section-0 .bold"
  value_css <- ".sl-spec-section-2 .spec-value , .sl-spec-section-0 .spec-value"

  motorcycle <- read_html(url)
  key <- html_text(html_nodes(motorcycle, label_css))
  value <- trimws(html_text(html_nodes(motorcycle, value_css)))

  data_list <- list()
  data_list[["url"]] <- url
  # seq_along() performs zero iterations when no labels were found, whereas
  # 1:length(key) would iterate over c(1, 0) and fail on the bad subscripts.
  for (i in seq_along(key)) {
    # A blank or missing label cannot be used as a column name; skip it.
    if (is.na(key[i]) || !nzchar(key[i])) {
      next
    }
    # A label without a matching value gets NA (value[i] past the end is NA);
    # a repeated label keeps its last value.
    data_list[[key[i]]] <- value[i]
  }

  # Keep text columns as character regardless of the R version's default.
  data.frame(data_list, stringsAsFactors = FALSE)
}
2 - Collect all the links
### one page
# Read the first listing page, pull the href of every ".card-link" element,
# and prefix each one with the site root.
first <- read_html("https://www.motorcycle.com/specs/kawasaki.html?page_num=1")
inner_pages <- html_attr(html_nodes(first, ".card-link"), "href")
my_links <- paste0("https://www.motorcycle.com", inner_pages)
### all pages
# Visit listing pages 1 to 42 and gather the href of every ".card-link"
# element. The per-page results are collected in a list and flattened once,
# instead of growing a vector (and rebuilding the full URLs) on every pass.
listing_pages <- 1:42
page_hrefs <- lapply(listing_pages, function(page_num) {
  listing <- read_html(
    paste0("https://www.motorcycle.com/specs/kawasaki.html?page_num=", page_num)
  )
  html_attr(html_nodes(listing, ".card-link"), "href")
})

# all_links holds the hrefs as found on the pages; cards without an href
# (NA) are dropped so they do not turn into bogus "...comNA" addresses.
all_links <- unlist(page_hrefs)
all_links <- all_links[!is.na(all_links)]

# final_links holds the same hrefs prefixed with the site root.
final_links <- paste0("https://www.motorcycle.com", all_links)
3 - Create a data frame
# Scrape every detail page, then stack the one-row results. fill = TRUE pads
# columns that are missing on some pages with NA.
my_list <- lapply(final_links, get_kavasaki_detail)
kawasaki_df <- rbindlist(my_list, fill = TRUE)

### Rename, subset, and convert data as you prefer
# NOTE(review): the columns are renamed by position, which depends on the
# order in which the scraped pages introduce them; confirm the positions
# whenever the site layout or the scraped range changes.
names(kawasaki_df)[c(2, 3, 11)] <- c("Type", "Price", "Name")

# Strip the dollar sign and thousands separators before converting to numbers.
kawasaki_df$Price <- as.numeric(
  gsub("[$,]", "", as.character(kawasaki_df$Price))
)
kawasaki_df$Warranty <- as.numeric(as.character(kawasaki_df$Warranty))

# Show the class of every column. (On a data.table, kawasaki_df[3] selects
# row 3 rather than column 3, so iterating over the table itself is the same
# listing without the misleading index.)
print(vapply(kawasaki_df, function(column) class(column)[1L], character(1)))
## url Type
## "character" "character"
## Price Dealers
## "numeric" "character"
## Warranty Insurance
## "numeric" "character"
## Finance Generic.Type..Primary.
## "character" "character"
## Manufacturer.Country Parent.Company
## "character" "character"
## Name Year
## "character" "character"
## Make Transmission.Type
## "character" "character"
## Number.Of.Speeds Primary.Drive..Rear.Wheel.
## "character" "character"
## Reverse Overdrive
## "character" "character"
## Introduction.Year Gear.Ratio..1.2.3.4.5.6.
## "character" "character"
## Final.Drive.Ratio Manufacturer.Recommend.Minimum.Age
## "character" "character"
## Gear.Ratio..1.2.3.4.5. Adjustable.Throttle
## "character" "character"
## Heel.Toe.Shifter Gear.Ratio..1.
## "character" "character"
## Representative.Image Gear.Ratio..1.2.3.
## "character" "character"
## Photo.Gallery
## "character"
4 - Now you can Manipulate and Visualize your Data Frame
Here are some examples:
# The fifty rows with the highest Price (rows with a missing Price sort last).
Top_fifty <-
  kawasaki_df %>%
  arrange(desc(Price)) %>%
  head(50)

# Price per model name, coloured by price. The subtitle names the theme and
# geom that are actually used below.
ggplot(Top_fifty, aes(Name, Price, color = Price)) +
  geom_jitter() +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 35, hjust = 1),
    axis.text.y = element_text(angle = 0, hjust = 1),
    axis.title.x = element_text(colour = "navy"),
    axis.title.y = element_text(colour = "navy")
  ) +
  scale_color_gradient(low = "pink2", high = "purple4") +
  labs(
    title = "Top Fifty The Most Expensive",
    subtitle = "Theme=theme_classic, plot=geom_jitter, price= in Dollar",
    x = "Names",
    y = "Prices"
  )
# All rows, ordered by Type.
types <-
  kawasaki_df %>%
  arrange(Type)

# One box plot of Price per Type, drawn with the currently active theme
# (theme_get() returns it) plus the axis styling below.
ggplot(types, aes(Type, Price)) +
  geom_boxplot() +
  theme_get() +
  theme(
    axis.text.x = element_text(angle = 35, hjust = 1, color = "orange3"),
    axis.text.y = element_text(angle = 0, hjust = 1, color = "black"),
    axis.title.x = element_text(colour = "navy"),
    axis.title.y = element_text(colour = "navy")
  ) +
  labs(
    title = "Average Prices For Different Categories",
    subtitle = "Theme=theme_get, plot=geom_boxplot, price= in Dollar",
    x = "Types",
    y = "Price"
  )
# Warranty against model name for the rows in Top_fifty, coloured by price.
# theme_get() adds the currently active theme to the plot; theme_update() is
# meant for changing the global theme and is not a plot layer, although with
# no arguments it hands back the same theme.
ggplot(Top_fifty, aes(Warranty, Name, color = Price)) +
  geom_count() +
  theme_get() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 1, color = "black"),
    axis.text.y = element_text(angle = 35, hjust = 1, color = "pink4"),
    axis.title.x = element_text(colour = "navy"),
    axis.title.y = element_text(colour = "navy")
  ) +
  scale_color_gradient(low = "skyblue", high = "purple4") +
  labs(
    title = "Fifty Motorcycles with Highest Price by Warranty",
    subtitle = paste0(
      "Theme=theme_get, plot=geom_count,\n",
      "price= in Dollar, Warranty= in Month"
    ),
    x = "Warranty",
    y = "Name"
  )
# The fifty rows with the highest Warranty value. Inside arrange(), Warranty
# refers to the data frame column, not to the object being assigned here.
Warranty <-
  kawasaki_df %>%
  arrange(desc(Warranty)) %>%
  head(50)

# Price against model name for those rows, coloured by price. theme_get()
# adds the currently active theme; theme_update() is meant for changing the
# global theme and is not a plot layer.
ggplot(Warranty, aes(Price, Name, color = Price)) +
  geom_count() +
  theme_get() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 1, color = "purple"),
    axis.text.y = element_text(angle = 35, hjust = 1, color = "cyan4"),
    axis.title.x = element_text(colour = "brown"),
    axis.title.y = element_text(colour = "brown")
  ) +
  scale_color_gradient(low = "orange", high = "red3") +
  labs(
    title = "Fifty Motorcycles with Highest Warranty(36) by Price",
    subtitle = paste0(
      "Theme=theme_get, plot=geom_count,\n",
      "price= in Dollar, Warranty= in Month"
    ),
    x = "Price",
    y = "Name"
  )
Cheers!