GROUP NAME: EAGLES VIEW
GROUP NAMES
To meet the ethical and legal requirements of web scraping and to avoid unauthorised intrusion, we checked each site's robots.txt file before scraping.
# Packages this script needs. ggrepel and robotstxt are attached with
# library() further down and scales is called as scales::comma, so they are
# listed here as well; otherwise a fresh machine fails at those calls.
required_packages <- c(
  "rvest", "httr", "dplyr", "readr", "stringr",
  "summarytools", "tidyr", "ggplot2",
  "ggrepel", "robotstxt", "scales"
)
# Install missing packages (one install.packages() call for all of them)
is_installed <- vapply(
  required_packages, requireNamespace, logical(1),
  quietly = TRUE
)
missing_packages <- required_packages[!is_installed]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
Load libraries
library(httr)
library(rvest)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
##
## Attaching package: 'readr'
## The following object is masked from 'package:rvest':
##
## guess_encoding
library(stringr)
library(tidyr)
library(ggplot2)
library(ggrepel)
library(summarytools)
library(robotstxt)
# Check that robots.txt allows the Cinch path this script actually requests
# (the used-cars listing), not only the site root: a site can allow "/"
# while disallowing specific sub-paths.
cat("Cinch: ", all(paths_allowed("/used-cars", "cinch.co.uk")), "\n")
## cinch.co.uk
## Cinch: TRUE
# Check the TheAA paths this script actually requests (the landing page for
# page 1 and the displaycars endpoint for later pages), not only the root.
theaa_paths <- c(
  "/used-cars/local/greater-manchester",
  "/used-cars/displaycars"
)
cat("TheAA: ", all(paths_allowed(theaa_paths, "theaa.com")), "\n")
## theaa.com
## TheAA: TRUE
# Initialize empty lists to store data for all pages of both websites
all_data_eagle <- list()
all_data_view <- list()
# Loop through pages 1 to 12 for TheAA.
# Each iteration downloads one results page, extracts the name, price and
# spec text of every listing, and stores a per-page data frame in
# all_data_eagle[[page_num]].
for (page_num in 1:12) {
  # Pause between requests so the site is not hit in a tight loop
  if (page_num > 1) {
    Sys.sleep(1)
  }
  # Construct the URL for the current page (page 1 uses the landing URL)
  if (page_num == 1) {
    url_eagle <- "https://www.theaa.com/used-cars/local/greater-manchester"
  } else {
    url_eagle <- paste0("https://www.theaa.com/used-cars/displaycars?fullpostcode=&page=", page_num, "&county=greater-manchester")
  }
  # Fetch the webpage content for TheAA
  response_eagle <- GET(url_eagle)
  # Stop on HTTP errors (4xx/5xx) instead of parsing an error page as if it
  # were a page of listings
  stop_for_status(response_eagle)
  content_eagle <- content(response_eagle, as = "text", encoding = "UTF-8")
  # Parse the HTML content for TheAA
  group_html_eagle <- read_html(content_eagle)
  # Extract car details for TheAA
  car_name_eagle <- group_html_eagle %>%
    html_nodes(".make-model-text") %>%
    html_text(trim = TRUE)
  car_price_eagle <- group_html_eagle %>%
    html_nodes(".total-price") %>%
    html_text(trim = TRUE)
  car_details_eagle <- group_html_eagle %>%
    html_nodes(".vl-specs") %>%
    html_text(trim = TRUE)
  # A page without listings has nothing to split or index; skip it with a
  # warning rather than failing with "subscript out of bounds"
  if (length(car_name_eagle) == 0) {
    warning("TheAA page ", page_num, ": no listings found, page skipped",
            call. = FALSE)
    next
  }
  # The three vectors become columns of one data frame, so they must line up
  if (length(car_price_eagle) != length(car_name_eagle) ||
      length(car_details_eagle) != length(car_name_eagle)) {
    stop("TheAA page ", page_num, ": found ", length(car_name_eagle),
         " names, ", length(car_price_eagle), " prices and ",
         length(car_details_eagle), " spec blocks; the selectors no longer ",
         "line up", call. = FALSE)
  }
  # Split Car Details into components (fields are separated by a bullet on
  # its own line)
  car_details_split_eagle <- str_split(car_details_eagle, "\\n\\s*•\\s*\\n", simplify = TRUE)
  if (ncol(car_details_split_eagle) < 4) {
    stop("TheAA page ", page_num, ": expected 4 spec fields per listing ",
         "but found at most ", ncol(car_details_split_eagle), call. = FALSE)
  }
  car_year <- car_details_split_eagle[, 1] %>% str_trim() # Year
  car_mileage <- car_details_split_eagle[, 2] %>% str_trim() # Mileage
  car_fuel <- car_details_split_eagle[, 3] %>% str_trim() # Fuel
  car_transmission <- car_details_split_eagle[, 4] %>% str_trim() # Transmission
  # Combine into a data frame for the current page
  cars_data_eagle <- data.frame(
    Name = car_name_eagle,
    Price = car_price_eagle,
    Year = car_year,
    Mileage = car_mileage,
    Fuel = car_fuel,
    Transmission = car_transmission,
    stringsAsFactors = FALSE
  )
  # Store the data for the current page in the list
  all_data_eagle[[page_num]] <- cars_data_eagle
  # Optional: Print progress
  cat("Scraped TheAA page", page_num, "\n")
}
## Scraped TheAA page 1
## Scraped TheAA page 2
## Scraped TheAA page 3
## Scraped TheAA page 4
## Scraped TheAA page 5
## Scraped TheAA page 6
## Scraped TheAA page 7
## Scraped TheAA page 8
## Scraped TheAA page 9
## Scraped TheAA page 10
## Scraped TheAA page 11
## Scraped TheAA page 12
# Stack the per-page TheAA data frames into one data frame and save it as CSV
TheAA_Car_Dataset <- do.call(what = rbind, args = all_data_eagle)
write.csv(x = TheAA_Car_Dataset, file = "Theaa_Car_Dataset.csv")
all_data_view <- list()
# The spec text of a Cinch card carries four labels ("Vehicle Year,",
# "Mileage,", "Fuel Type,", "Transmission Type,"). Using the labels as
# delimiters keeps multi-word values together. Splitting on whitespace and
# picking fixed token positions cut values such as "Petrol plug-in hybrid"
# in half and pushed the remainder into the Transmission column.
spec_pattern_view <- regex(
  paste0(
    "Vehicle Year,\\s*(.*?)\\s*Mileage,\\s*(.*?)\\s*",
    "Fuel Type,\\s*(.*?)\\s*Transmission Type,\\s*(.*)"
  ),
  dotall = TRUE
)
# Loop through pages 1 to 12 for Cinch.
# Each iteration downloads one results page and stores a per-page data frame
# in all_data_view[[page_num]].
for (page_num in 1:12) {
  # Pause between requests so the site is not hit in a tight loop
  if (page_num > 1) {
    Sys.sleep(1)
  }
  # Construct the URL for the current page
  if (page_num == 1) {
    url_view <- "https://www.cinch.co.uk/used-cars?financeType=any"
  } else {
    url_view <- paste0("https://www.cinch.co.uk/used-cars?financeType=any&pageNumber=", page_num)
  }
  # Fetch the webpage content for Cinch
  response_view <- GET(url_view)
  # Stop on HTTP errors (4xx/5xx) instead of parsing an error page
  stop_for_status(response_view)
  content_view <- content(response_view, as = "text", encoding = "UTF-8")
  # Parse the HTML content for Cinch
  group_html_view <- read_html(content_view)
  # Extract car names for Cinch
  car_name_view <- group_html_view %>%
    html_nodes(".vehicle-card_link__AvRBT") %>%
    html_text(trim = TRUE)
  # Extract car price for Cinch
  car_price_view <- group_html_view %>%
    html_nodes(".price_cashPrice__fSwOY") %>%
    html_text(trim = TRUE)
  # NOTE(review): the "." is a regex wildcard here, so this removes
  # "Full price" plus the one character that follows it - confirm intended.
  car_price_view <- str_replace(car_price_view, "Full price.", "") # Remove "Full price."
  # Extract the raw spec text for Cinch
  car_details_view <- group_html_view %>%
    html_nodes(".specs-list_upperCase__62SjC") %>%
    html_text(trim = TRUE)
  # The three vectors become columns of one data frame, so they must line up
  if (length(car_price_view) != length(car_name_view) ||
      length(car_details_view) != length(car_name_view)) {
    stop("Cinch page ", page_num, ": found ", length(car_name_view),
         " names, ", length(car_price_view), " prices and ",
         length(car_details_view), " spec blocks; the selectors no longer ",
         "line up", call. = FALSE)
  }
  # Capture Year, Mileage, Fuel and Transmission. Column 1 of the result is
  # the whole match; columns 2-5 are the four captured fields.
  details_split_view <- str_match(car_details_view, spec_pattern_view)
  if (anyNA(details_split_view[, 1])) {
    warning("Cinch page ", page_num, ": ",
            sum(is.na(details_split_view[, 1])),
            " listing(s) did not match the expected spec layout",
            call. = FALSE)
  }
  car_year_view <- str_squish(details_split_view[, 2]) # Year
  car_mileage_view <- str_squish(details_split_view[, 3]) # Mileage
  car_fuel_view <- str_squish(details_split_view[, 4]) # Fuel
  car_transmission_view <- str_squish(details_split_view[, 5]) # Transmission
  # Combine into a data frame for the current page
  cars_data_view <- data.frame(
    Name = car_name_view,
    Price = car_price_view,
    Year = car_year_view,
    Mileage = car_mileage_view,
    Fuel = car_fuel_view,
    Transmission = car_transmission_view,
    stringsAsFactors = FALSE
  )
  # Store the data for the current page in the list
  all_data_view[[page_num]] <- cars_data_view
  # Optional: Print progress
  cat("Scraped Cinch page", page_num, "\n")
}
## Scraped Cinch page 1
## Scraped Cinch page 2
## Scraped Cinch page 3
## Scraped Cinch page 4
## Scraped Cinch page 5
## Scraped Cinch page 6
## Scraped Cinch page 7
## Scraped Cinch page 8
## Scraped Cinch page 9
## Scraped Cinch page 10
## Scraped Cinch page 11
## Scraped Cinch page 12
# Stack the per-page Cinch data frames into one data frame and save it as CSV
cinch_car_Dataset <- do.call(what = rbind, args = all_data_view)
write.csv(x = cinch_car_Dataset, file = "cinch_car_Dataset.csv")
load datasets
# Read the two scraped datasets back in. The files are read from the same
# relative paths they were written to with write.csv(), so the script does
# not depend on one particular user's Documents folder.
cinch_df <- read.csv("cinch_car_Dataset.csv", stringsAsFactors = FALSE)
theaa_df <- read.csv("Theaa_Car_Dataset.csv", stringsAsFactors = FALSE)
merge the files
# Stack the two datasets row-wise. bind_rows() matches columns by name and
# does not join on a key.
merged_df <- bind_rows(list(cinch_df, theaa_df))
Checking for duplicates in the data frame
# Flag rows that repeat an earlier listing. The "X" column holds the row
# numbers that write.csv() stored alongside the data, so it is excluded from
# the comparison; with it included, two otherwise identical listings from the
# same site would never compare equal.
listing_cols <- setdiff(names(merged_df), "X")
duplicated_rows <- merged_df[duplicated(merged_df[listing_cols]), ]
print(duplicated_rows)
## [1] X Name Price Year Mileage
## [6] Fuel Transmission
## <0 rows> (or 0-length row.names)
Removes duplicates if any
# Removes duplicate rows. The "X" column (row numbers stored by write.csv())
# is ignored when comparing, because it differs between otherwise identical
# listings and would make unique() keep every row.
is_repeat <- duplicated(merged_df[setdiff(names(merged_df), "X")])
merged_df <- merged_df[!is_repeat, ]
# Mileage: trim, drop thousands separators and the " miles" suffix, then
# convert to numeric
mileage_text <- trimws(merged_df$Mileage)
mileage_text <- gsub(",", "", mileage_text)
mileage_text <- gsub(" miles", "", mileage_text)
merged_df$Mileage <- as.numeric(mileage_text)
# Price: work on a character copy of the column
price_text <- as.character(merged_df$Price)
# Placeholder values that carry no price become NA
price_text[price_text %in% c("POA", "N/A", "Call for Price")] <- NA
# Keep only digits and the decimal point (drops currency symbols, commas)
price_text <- gsub("[^0-9.]", "", price_text)
# Store the numeric price back on the data frame
merged_df$Price <- as.numeric(price_text)
# Verify conversion
summary(merged_df$Price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1495 14641 19078 21272 25482 105199
sum(is.na(merged_df[["Price"]])) # How many prices are NA after conversion
## [1] 0
missing values
# Count the NA entries in every column
missing_values <- vapply(
  merged_df,
  function(column) sum(is.na(column)),
  numeric(1)
)
print("Missing values in each column: ")
## [1] "Missing values in each column: "
print(missing_values)
## X Name Price Year Mileage Fuel
## 0 0 0 0 0 0
## Transmission
## 0
handle missing values
# Median-impute the numeric columns, but only when the data contain any NA
if (anyNA(merged_df)) {
  # Replace each NA with the median of the non-missing values in its column
  fill_with_median <- function(column) {
    ifelse(is.na(column), median(column, na.rm = TRUE), column)
  }
  merged_df <- mutate(merged_df, across(where(is.numeric), fill_with_median))
}
Detect and remove outliers
# Step 1: Quartiles and inter-quartile range of Price
price_quartiles <- quantile(
  merged_df$Price,
  probs = c(0.25, 0.75),
  na.rm = TRUE
)
Q1 <- price_quartiles[1]
Q3 <- price_quartiles[2]
IQR_value <- Q3 - Q1
# Step 2: Fences at 1.5 * IQR below Q1 and above Q3
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
# Step 3: Keep only the rows whose Price lies within the fences
merged_df <- filter(merged_df, Price >= lower_bound, Price <= upper_bound)
# Boxplot of Price per Fuel type on the filtered data
merged_df %>%
  ggplot(aes(x = Fuel, y = Price)) +
  geom_boxplot(fill = "palegreen", alpha = 0.5) +
  labs(title = "After Removing Outliers")
get the highest price
# Rows whose Price equals the largest Price in the data
top_price <- max(merged_df$Price, na.rm = TRUE)
highest_price_car <- filter(merged_df, Price == top_price)
print("Car with the highest price:")
## [1] "Car with the highest price:"
print(highest_price_car)
## X Name Price
## 1 272 Land Rover Discovery Sport 1.5 P300e Dynamic SE 5dr Auto [5 Seat] 41579
## Year Mileage Fuel Transmission
## 1 2024 15640 Petrol plug-in hybrid hybrid
highest mileage
# Rows whose Mileage equals the largest Mileage in the data
top_mileage <- max(merged_df$Mileage, na.rm = TRUE)
highest_mileage_car <- filter(merged_df, Mileage == top_mileage)
print("Car with the highest mileage:")
## [1] "Car with the highest mileage:"
print(highest_mileage_car)
## X Name Price Year Mileage Fuel Transmission
## 1 3 BMW 1 SERIES 11250 2019 99000 Diesel Manual
Find the maximum, minimum and average car price
# Average, highest and lowest Price in the data, ignoring NA
avg_price <- mean(merged_df$Price, na.rm = TRUE)
price_range <- range(merged_df$Price, na.rm = TRUE)
min_price <- price_range[1]
max_price <- price_range[2]
show the result
# Print each price statistic with its label
avg_price_line <- paste("Average Car Price:", avg_price)
print(avg_price_line)
## [1] "Average Car Price: 20103.4514003295"
max_price_line <- paste("Maximum Car Price:", max_price)
print(max_price_line)
## [1] "Maximum Car Price: 41579"
min_price_line <- paste("Minimum Car Price:", min_price)
print(min_price_line)
## [1] "Minimum Car Price: 1495"
get min and max mileage
# Lowest and highest Mileage in the data, ignoring NA
mileage_range <- range(merged_df$Mileage, na.rm = TRUE)
min_mileage <- mileage_range[1]
max_mileage <- mileage_range[2]
print(paste("Minimum Mileage:", min_mileage))
## [1] "Minimum Mileage: 10"
print(paste("Maximum Mileage:", max_mileage))
## [1] "Maximum Mileage: 99000"
# Per-column summary of the cleaned data frame
summary(merged_df)
car price histogram
# Histogram of Price in bins 1000 wide
merged_df %>%
  ggplot(aes(x = Price)) +
  geom_histogram(binwidth = 1000, fill = "blue", color = "black") +
  labs(title = "Distribution of Car Prices", x = "Price", y = "Count")
# Scatterplot of Price against Mileage
merged_df %>%
  ggplot(aes(x = Mileage, y = Price)) +
  geom_point(color = "red", alpha = 0.6) +
  labs(title = "Price vs Mileage", x = "Mileage", y = "Price")
# Number of listings per Year; Year is mapped as a factor so every year gets
# its own bar
merged_df %>%
  ggplot(aes(x = as.factor(Year))) +
  geom_bar(fill = "purple", color = "black") +
  ggtitle("Number of Cars per Year") +
  xlab("Year") +
  ylab("Count") +
  theme_minimal()
# Number of listings per Fuel value
merged_df %>%
  ggplot(aes(x = Fuel)) +
  geom_bar(fill = "steelblue", color = "black") +
  ggtitle("Distribution of Cars by Fuel Type") +
  xlab("Fuel Type") +
  ylab("Count") +
  theme_minimal()
# Number of listings per Transmission value
merged_df %>%
  ggplot(aes(x = Transmission)) +
  geom_bar(fill = "orange", color = "black") +
  ggtitle("Distribution of Cars by Transmission Type") +
  xlab("Transmission") +
  ylab("Count") +
  theme_minimal()
# The ten most frequently listed car names.
# slice_max() names the ranking column explicitly and, with
# with_ties = FALSE, returns exactly ten rows; top_n(10) keeps every row tied
# with the tenth, so a "Top 10" chart could show more than ten models.
top_cars <- merged_df %>%
  count(Name, sort = TRUE) %>%
  slice_max(order_by = n, n = 10, with_ties = FALSE)
## Selecting by n
# Horizontal bar chart of the counts in top_cars, ordered by count
top_cars %>%
  ggplot(aes(x = reorder(Name, n), y = n)) +
  geom_col(fill = "darkgreen") +
  labs(title = "Top 10 Most Listed Car Models", x = "Car Model", y = "Count") +
  coord_flip()
# Price distribution per year. Year is mapped as a factor so that one box is
# drawn for each year; with a continuous x, geom_boxplot() pools all rows
# into a single box and ggplot2 warns "Continuous x aesthetic".
ggplot(merged_df, aes(x = as.factor(Year), y = Price)) +
  geom_boxplot(fill = "skyblue") +
  labs(title = "Car Price Trends by Year",
       x = "Year",
       y = "Price")
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?
# Mileage distribution per year. Year is mapped as a factor so that one box
# is drawn for each year; with a continuous x, geom_boxplot() pools all rows
# into a single box and ggplot2 warns "Continuous x aesthetic".
ggplot(merged_df, aes(x = as.factor(Year), y = Mileage)) +
  geom_boxplot(fill = "coral") +
  labs(title = "Mileage Distribution by Year",
       x = "Year",
       y = "Mileage")
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?
# Price distribution for each Fuel value
merged_df %>%
  ggplot(aes(x = Fuel, y = Price)) +
  geom_boxplot(fill = "lightgreen") +
  ggtitle("Car Prices Across Fuel Types") +
  xlab("Fuel") +
  ylab("Price")
# Price against Mileage, coloured by Fuel.
# Colours are assigned by Fuel value through a named palette.
fuel_palette <- c(
  "Diesel" = "#1f77b4",
  "Electric" = "#ff7f0e",
  "Hybrid" = "#2ca02c",
  "Petrol" = "#d62728",
  "Plug-in Hybrid" = "#9467bd"
)
# Horizontal-only jitter for the points
point_jitter <- position_jitter(width = 0.05, height = 0)
merged_df %>%
  ggplot(aes(x = Mileage, y = Price, color = Fuel)) +
  geom_point(alpha = 0.6, size = 2, position = point_jitter) +
  scale_color_manual(values = fuel_palette) +
  scale_y_continuous(labels = scales::comma) + # y-axis with comma separators
  labs(
    title = "Price vs Mileage by Fuel Type",
    x = "Mileage (units)",
    y = "Price (units)"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    legend.title = element_blank(),
    plot.title = element_text(hjust = 0.5)
  )