Web Scraping Project

GROUP NAME: EAGLES VIEW

GROUP MEMBERS

CHINONSO JOB - 2424936
CHINEDU UZIM - 2424386
IBE MOSES - 2424389
OGBU JOY ONYINYEOMA - 2424465

To scrape ethically and avoid breaching the law or intruding on the websites, we checked each site's robots.txt file before scraping.

# Packages this script depends on. Every package attached with library()
# further down must be listed here, otherwise a fresh machine fails at the
# library() call (ggrepel and robotstxt were previously missing).
required_packages <- c("rvest", "httr", "dplyr", "readr", "stringr",
                       "summarytools", "tidyr", "ggplot2",
                       "ggrepel", "robotstxt")

# Install only the packages that are not already available
missing_packages <- required_packages[
  !vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

load library

library(httr)
library(rvest)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(readr)

## 
## Attaching package: 'readr'

## The following object is masked from 'package:rvest':
## 
##     guess_encoding

library(stringr)
library(tidyr)
library(ggplot2)
library(ggrepel)
library(summarytools)
library(robotstxt)

# Check that robots.txt allows the Cinch listing paths this script actually
# requests (first page and a paginated page), not just the site root.
cat("Cinch: ", all(paths_allowed(c("/used-cars?financeType=any",
                                   "/used-cars?financeType=any&pageNumber=2"),
                                 "cinch.co.uk")), "\n")

##  cinch.co.uk

## Cinch:  TRUE

# Check that robots.txt allows the two TheAA paths this script actually
# requests (landing page and paginated endpoint), not just the site root.
cat("TheAA: ", all(paths_allowed(c("/used-cars/local/greater-manchester",
                                   "/used-cars/displaycars"),
                                 "theaa.com")), "\n")

##  theaa.com

## TheAA:  TRUE

# Per-page results for both websites; element i holds the data frame scraped
# from page i
all_data_eagle <- list()
all_data_view <- list()

# Loop through pages 1 to 12 for TheAA
for (page_num in 1:12) {
  # Page 1 has its own landing URL; later pages use the paginated endpoint
  if (page_num == 1) {
    url_eagle <- "https://www.theaa.com/used-cars/local/greater-manchester"
  } else {
    url_eagle <- paste0("https://www.theaa.com/used-cars/displaycars?fullpostcode=&page=", page_num, "&county=greater-manchester")
  }

  # Fetch the webpage for TheAA. A failed request is reported and the page
  # skipped, instead of parsing an error page as if it were a listing page.
  response_eagle <- GET(url_eagle)
  if (http_error(response_eagle)) {
    warning("TheAA page ", page_num, " returned HTTP status ",
            status_code(response_eagle), "; page skipped", call. = FALSE)
    next
  }
  content_eagle <- content(response_eagle, as = "text", encoding = "UTF-8")

  # Parse the HTML content for TheAA
  group_html_eagle <- read_html(content_eagle)

  # Extract car name, price and the combined spec line of every listing
  car_name_eagle <- group_html_eagle %>%
    html_nodes(".make-model-text") %>%
    html_text(trim = TRUE)

  car_price_eagle <- group_html_eagle %>%
    html_nodes(".total-price") %>%
    html_text(trim = TRUE)

  car_details_eagle <- group_html_eagle %>%
    html_nodes(".vl-specs") %>%
    html_text(trim = TRUE)

  # Split the spec line on its bullet separators. A fixed width of 5 means
  # columns 1-4 always exist (padded with "" when a listing has fewer
  # parts), so the indexing below cannot go out of bounds; any extra parts
  # collect in column 5 and leave columns 1-4 as a plain split would.
  car_details_split_eagle <- str_split_fixed(car_details_eagle, "\\n\\s*•\\s*\\n", n = 5)

  car_year <- car_details_split_eagle[, 1] %>% str_trim() # Year
  car_mileage <- car_details_split_eagle[, 2] %>% str_trim() # Mileage
  car_fuel <- car_details_split_eagle[, 3] %>% str_trim() # Fuel
  car_transmission <- car_details_split_eagle[, 4] %>% str_trim() # Transmission

  # Combine into a data frame for the current page
  cars_data_eagle <- data.frame(
    Name = car_name_eagle,
    Price = car_price_eagle,
    Year = car_year,
    Mileage = car_mileage,
    Fuel = car_fuel,
    Transmission = car_transmission,
    stringsAsFactors = FALSE
  )

  # Store the data for the current page in the list
  all_data_eagle[[page_num]] <- cars_data_eagle

  # Optional: Print progress
  cat("Scraped TheAA page", page_num, "\n")
}

## Scraped TheAA page 1 
## Scraped TheAA page 2 
## Scraped TheAA page 3 
## Scraped TheAA page 4 
## Scraped TheAA page 5 
## Scraped TheAA page 6 
## Scraped TheAA page 7 
## Scraped TheAA page 8 
## Scraped TheAA page 9 
## Scraped TheAA page 10 
## Scraped TheAA page 11 
## Scraped TheAA page 12

# Stack the per-page TheAA data frames into one table and save it as a CSV
# file in the working directory
TheAA_Car_Dataset <- do.call(what = rbind, args = all_data_eagle)
write.csv(x = TheAA_Car_Dataset, file = "Theaa_Car_Dataset.csv")

all_data_view <- list()

# The spec text of a Cinch card is a run of "<label>, <value>" pairs. Capturing
# the text between consecutive labels keeps multi-word values (for example
# "Petrol plug-in hybrid") in one field, which splitting on whitespace and
# picking fixed token positions could not do.
# NOTE(review): assumes every card carries the four labels once, in this
# order, as the earlier positional indexing implied - confirm on live markup.
cinch_spec_pattern <- paste0(
  "(?s)Vehicle Year,(.*?)",
  "Mileage,(.*?)",
  "Fuel Type,(.*?)",
  "Transmission Type,(.*)"
)

# Loop through pages 1 to 12 for Cinch
for (page_num in 1:12) {
  # Page 1 has no pageNumber parameter; later pages add it
  if (page_num == 1) {
    url_view <- "https://www.cinch.co.uk/used-cars?financeType=any"
  } else {
    url_view <- paste0("https://www.cinch.co.uk/used-cars?financeType=any&pageNumber=", page_num)
  }

  # Fetch the webpage for Cinch. A failed request is reported and the page
  # skipped, instead of parsing an error page as if it were a listing page.
  response_view <- GET(url_view)
  if (http_error(response_view)) {
    warning("Cinch page ", page_num, " returned HTTP status ",
            status_code(response_view), "; page skipped", call. = FALSE)
    next
  }
  content_view <- content(response_view, as = "text", encoding = "UTF-8")

  # Parse the HTML content for Cinch
  group_html_view <- read_html(content_view)

  # Extract car names for Cinch
  car_name_view <- group_html_view %>%
    html_nodes(".vehicle-card_link__AvRBT") %>%
    html_text(trim = TRUE)

  # Extract car price for Cinch
  car_price_view <- group_html_view %>%
    html_nodes(".price_cashPrice__fSwOY") %>%
    html_text(trim = TRUE)
  car_price_view <- str_replace(car_price_view, "Full price.", "") # Remove "Full price."

  # Extract the combined spec text of every card
  car_details_view <- group_html_view %>%
    html_nodes(".specs-list_upperCase__62SjC") %>%
    html_text(trim = TRUE)

  # Column 1 of the match matrix is the whole match; columns 2-5 are the
  # year, mileage, fuel and transmission captures
  spec_fields_view <- str_match(car_details_view, cinch_spec_pattern)
  if (anyNA(spec_fields_view[, 1])) {
    warning("Cinch page ", page_num,
            ": some spec lines did not match the expected label layout",
            call. = FALSE)
  }

  car_year_view <- str_squish(spec_fields_view[, 2]) # Year
  car_mileage_view <- str_squish(spec_fields_view[, 3]) # Mileage
  car_fuel_view <- str_squish(spec_fields_view[, 4]) # Fuel
  car_transmission_view <- str_squish(spec_fields_view[, 5]) # Transmission

  # Combine into a data frame for the current page
  cars_data_view <- data.frame(
    Name = car_name_view,
    Price = car_price_view,
    Year = car_year_view,
    Mileage = car_mileage_view,
    Fuel = car_fuel_view,
    Transmission = car_transmission_view,
    stringsAsFactors = FALSE
  )

  # Store the data for the current page in the list
  all_data_view[[page_num]] <- cars_data_view

  # Optional: Print progress
  cat("Scraped Cinch page", page_num, "\n")
}

## Scraped Cinch page 1 
## Scraped Cinch page 2 
## Scraped Cinch page 3 
## Scraped Cinch page 4 
## Scraped Cinch page 5 
## Scraped Cinch page 6 
## Scraped Cinch page 7 
## Scraped Cinch page 8 
## Scraped Cinch page 9 
## Scraped Cinch page 10 
## Scraped Cinch page 11 
## Scraped Cinch page 12

# Stack the per-page Cinch data frames into one table
cinch_car_Dataset <- do.call(what = rbind, args = all_data_view)

# Save the combined Cinch table as a CSV file in the working directory
write.csv(x = cinch_car_Dataset, file = "cinch_car_Dataset.csv")

load datasets

# Read back the two CSV files written by the scraping steps. The same
# relative file names are used as in the write.csv() calls, so the script
# reads the files it has just produced instead of depending on a
# machine-specific absolute folder.
cinch_df <- read.csv("cinch_car_Dataset.csv", stringsAsFactors = FALSE)
theaa_df <- read.csv("Theaa_Car_Dataset.csv", stringsAsFactors = FALSE)

merge the files

# Stack the Cinch rows on top of the TheAA rows (a row-wise append; no join
# key is involved)
merged_df <- cinch_df %>% bind_rows(theaa_df)


Checking for duplicates in the data frame

# "X" is the row-number column created when the CSV files were written with
# row names and read back. It is different on every row of a site, so it is
# left out of the comparison; otherwise a listing scraped twice could never
# be flagged as a duplicate.
listing_cols <- setdiff(names(merged_df), "X")
duplicated_rows <- merged_df[duplicated(merged_df[listing_cols]), ]
print(duplicated_rows)

## [1] X            Name         Price        Year         Mileage     
## [6] Fuel         Transmission
## <0 rows> (or 0-length row.names)

Removes duplicates if any

# Remove duplicate listings, keeping the first occurrence. The comparison
# ignores "X" (the row-number column from the CSV round trip): it differs on
# every row of a site, so including it would make every row look unique.
merged_df <- merged_df[!duplicated(merged_df[setdiff(names(merged_df), "X")]), ]

# Mileage: trim, drop thousands separators and the " miles" suffix, then
# convert to numeric
mileage_text <- trimws(merged_df$Mileage)
mileage_text <- gsub(",", "", mileage_text)
mileage_text <- gsub(" miles", "", mileage_text)
merged_df$Mileage <- as.numeric(mileage_text)

# Price: work on a character copy of the column
price_text <- as.character(merged_df$Price)

# Known non-numeric placeholders become NA
price_text[price_text %in% c("POA", "N/A", "Call for Price")] <- NA

# Strip everything except digits and the decimal point (currency symbols,
# commas, ...) and convert to numeric
merged_df$Price <- as.numeric(gsub("[^0-9.]", "", price_text))

# Verify conversion
summary(merged_df$Price)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1495   14641   19078   21272   25482  105199

sum(is.na(merged_df[["Price"]]))  # Count NA values

## [1] 0

missing values

# Count the NA entries in each column
missing_values <- vapply(
  merged_df,
  function(column) sum(is.na(column)),
  numeric(1)
)

print("Missing values in each column: ")

## [1] "Missing values in each column: "

print(missing_values)

##            X         Name        Price         Year      Mileage         Fuel 
##            0            0            0            0            0            0 
## Transmission 
##            0

handle missing values

# Impute only when the table actually contains an NA: in every numeric
# column, NA entries are replaced by that column's median
if (sum(is.na(merged_df)) > 0) {
  fill_with_median <- function(column) {
    ifelse(is.na(column), median(column, na.rm = TRUE), column)
  }
  merged_df <- mutate(merged_df, across(where(is.numeric), fill_with_median))
}

Detect and remove price outliers (IQR rule)

# Step 1: Quartiles and interquartile range of Price
price_quartiles <- quantile(merged_df$Price, c(0.25, 0.75), na.rm = TRUE)
Q1 <- price_quartiles[1]
Q3 <- price_quartiles[2]
IQR_value <- Q3 - Q1

# Step 2: Fences at 1.5 * IQR below Q1 and above Q3
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value

# Step 3: Keep only the rows whose Price lies within the fences
merged_df <- filter(merged_df, Price >= lower_bound, Price <= upper_bound)

# Price by fuel type, drawn from the outlier-filtered data
ggplot(data = merged_df, mapping = aes(x = Fuel, y = Price)) +
  geom_boxplot(alpha = 0.5, fill = "palegreen") +
  labs(title = "After Removing Outliers")

get the highest price

# Row(s) whose Price equals the largest non-missing Price
top_price <- max(merged_df$Price, na.rm = TRUE)
highest_price_car <- filter(merged_df, Price == top_price)
print("Car with the highest price:")

## [1] "Car with the highest price:"

print(highest_price_car)

##     X                                                              Name Price
## 1 272 Land Rover Discovery Sport 1.5 P300e Dynamic SE 5dr Auto [5 Seat] 41579
##   Year Mileage   Fuel          Transmission
## 1 2024   15640 Petrol plug-in hybrid hybrid

highest mileage

# Row(s) whose Mileage equals the largest non-missing Mileage
top_mileage <- max(merged_df$Mileage, na.rm = TRUE)
highest_mileage_car <- filter(merged_df, Mileage == top_mileage)
print("Car with the highest mileage:")

## [1] "Car with the highest mileage:"

print(highest_mileage_car)

##   X         Name Price Year Mileage   Fuel Transmission
## 1 3 BMW 1 SERIES 11250 2019   99000 Diesel       Manual

Find the average, maximum, and minimum car price

# Average, maximum and minimum of Price, ignoring missing values
price_values <- merged_df[["Price"]]
avg_price <- mean(price_values, na.rm = TRUE)
max_price <- max(price_values, na.rm = TRUE)
min_price <- min(price_values, na.rm = TRUE)

show the result

# Report the three price statistics, one labelled line each
print(paste0("Average Car Price: ", avg_price))

## [1] "Average Car Price: 20103.4514003295"

print(paste0("Maximum Car Price: ", max_price))

## [1] "Maximum Car Price: 41579"

print(paste0("Minimum Car Price: ", min_price))

## [1] "Minimum Car Price: 1495"

get min and max mileage

# Smallest and largest Mileage, ignoring missing values
mileage_values <- merged_df[["Mileage"]]
min_mileage <- min(mileage_values, na.rm = TRUE)
max_mileage <- max(mileage_values, na.rm = TRUE)
print(paste0("Minimum Mileage: ", min_mileage))

## [1] "Minimum Mileage: 10"

print(paste0("Maximum Mileage: ", max_mileage))

## [1] "Maximum Mileage: 99000"

# Per-column summary of the cleaned table
summary(object = merged_df)

car price histogram

# Histogram of car prices in bins of width 1000
ggplot(data = merged_df, mapping = aes(x = Price)) +
  geom_histogram(binwidth = 1000, color = "black", fill = "blue") +
  labs(title = "Distribution of Car Prices", x = "Price", y = "Count")

# Scatterplot of price against mileage
ggplot(data = merged_df, mapping = aes(x = Mileage, y = Price)) +
  geom_point(alpha = 0.6, color = "red") +
  labs(title = "Price vs Mileage", x = "Mileage", y = "Price")

# Number of cars per year; Year is treated as a factor so each year gets
# its own bar
ggplot(data = merged_df, mapping = aes(x = as.factor(Year))) +
  geom_bar(color = "black", fill = "purple") +
  ggtitle("Number of Cars per Year") +
  xlab("Year") +
  ylab("Count") +
  theme_minimal()

# Number of cars per fuel type
ggplot(data = merged_df, mapping = aes(x = Fuel)) +
  geom_bar(color = "black", fill = "steelblue") +
  ggtitle("Distribution of Cars by Fuel Type") +
  xlab("Fuel Type") +
  ylab("Count") +
  theme_minimal()

# Number of cars per transmission type
ggplot(data = merged_df, mapping = aes(x = Transmission)) +
  geom_bar(color = "black", fill = "orange") +
  ggtitle("Distribution of Cars by Transmission Type") +
  xlab("Transmission") +
  ylab("Count") +
  theme_minimal()

# Count listings per car name and keep the 10 most frequent names (ties
# included). slice_max() names the ordering column explicitly, replacing
# the superseded top_n(), which guessed it and printed "Selecting by n".
top_cars <- merged_df %>%
  count(Name, sort = TRUE) %>%
  slice_max(n, n = 10)

## Selecting by n

# Horizontal bar chart of the most listed car names, ordered by count
ggplot(data = top_cars, mapping = aes(x = reorder(Name, n), y = n)) +
  geom_col(fill = "darkgreen") +
  coord_flip() +
  ggtitle("Top 10 Most Listed Car Models") +
  xlab("Car Model") +
  ylab("Count")

# One box per year: Year must be discrete for geom_boxplot() to group by
# it. A continuous x puts every car in a single box and triggers the
# "Continuous x aesthetic" warning.
ggplot(merged_df, aes(x = as.factor(Year), y = Price)) +
  geom_boxplot(fill = "skyblue") +
  labs(title = "Car Price Trends by Year",
       x = "Year",
       y = "Price")

## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?

# One box per year: Year must be discrete for geom_boxplot() to group by
# it. A continuous x puts every car in a single box and triggers the
# "Continuous x aesthetic" warning.
ggplot(merged_df, aes(x = as.factor(Year), y = Mileage)) +
  geom_boxplot(fill = "coral") +
  labs(title = "Mileage Distribution by Year",
       x = "Year",
       y = "Mileage")

## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?

# Price distribution for each fuel type
ggplot(data = merged_df, mapping = aes(x = Fuel, y = Price)) +
  geom_boxplot(fill = "lightgreen") +
  ggtitle("Car Prices Across Fuel Types") +
  xlab("Fuel") +
  ylab("Price")

# Fixed colour for each fuel label listed here
fuel_palette <- c("Diesel" = "#1f77b4",
                  "Electric" = "#ff7f0e",
                  "Hybrid" = "#2ca02c",
                  "Petrol" = "#d62728",
                  "Plug-in Hybrid" = "#9467bd")

# Price against mileage, coloured by fuel type, with horizontal jitter only
ggplot(data = merged_df, mapping = aes(x = Mileage, y = Price, color = Fuel)) +
  geom_point(
    position = position_jitter(width = 0.05, height = 0),
    size = 2,
    alpha = 0.6
  ) +
  scale_y_continuous(labels = scales::comma) +  # comma separators on the y-axis
  labs(title = "Price vs Mileage by Fuel Type",
       x = "Mileage (units)",
       y = "Price (units)") +
  scale_color_manual(values = fuel_palette) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        legend.position = "bottom")

Web Scraping Project

EAGLES VIEW GROUP

2025-03-28