Analysis of the CarDekho Dataset

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

# Q1. From the cardekho dataset, how does the number of cars sold vary across
# different years?
cardekho <- read.csv(file = "C:/Users/Lenovo/Downloads/file dataset/cardekho.csv")
library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.3.3

# geom_bar() counts the rows for each value of `year`; wrapping the column in
# factor() makes the x axis discrete, so each year gets its own bar.
ggplot(data = cardekho, mapping = aes(x = factor(year))) +
  geom_bar() +
  ggtitle("Number of Cars Sold by Year") +
  xlab("Year") +
  ylab("Number of Cars Sold")

# Q2. From the cardekho dataset, how many cars have been driven less than
# 100,000 km?
cardekho <- read.csv("C:/Users/Lenovo/Downloads/file dataset/cardekho.csv")
# Summing the logical vector counts the TRUE entries. na.rm = TRUE leaves rows
# with a missing km_driven out of the count instead of making the result NA.
num_cars_less_than_100k <- sum(cardekho$km_driven < 100000, na.rm = TRUE)

# Q3. What is the distribution of transmission types (manual and automatic)
# among the cars in the cardekho dataset?
cardekho <- read.csv("C:/Users/Lenovo/Downloads/file dataset/cardekho.csv")
# Count the cars with manual and with automatic transmission. The comparison is
# an exact, case-sensitive string match; na.rm = TRUE keeps a missing
# transmission value from turning either count into NA.
manual_count <- sum(cardekho$transmission == "Manual", na.rm = TRUE)
automatic_count <- sum(cardekho$transmission == "Automatic", na.rm = TRUE)

# Q4. Count the number of cars listed under each of three specific names.
cardekho <- read.csv("C:/Users/Lenovo/Downloads/file dataset/cardekho.csv")
# Each count is an exact, case-sensitive match on the `name` column;
# na.rm = TRUE keeps a missing name from turning a count into NA.
# NOTE(review): the variable names do not all describe what they count (the
# third one counts an Alto 800 variant); they are kept unchanged so any later
# code that refers to them keeps working.
# NOTE(review): confirm the spelling/case of "LDi option" against the data.
num_cars_maruti_vita <- sum(
  cardekho$name == "Maruti Vitara Brezza LDi",
  na.rm = TRUE
)
num_cars_maruti_vitar <- sum(
  cardekho$name == "Maruti Vitara Brezza LDi option",
  na.rm = TRUE
)
num_cars_maruti_vitara <- sum(
  cardekho$name == "Maruti Alto 800 CNG LXI Optional",
  na.rm = TRUE
)

# The three counts, in the order they were computed above.
maruticars <- c(
  num_cars_maruti_vita,
  num_cars_maruti_vitar,
  num_cars_maruti_vitara
)

# Q5. What is the distribution of car fuel types (e.g., petrol, diesel, CNG) in
# the CarDekho dataset, and which fuel type makes up the largest share of cars?
cardekho <- read.csv(file = "C:/Users/Lenovo/Downloads/file dataset/cardekho.csv")
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

# Tally the rows for each fuel type; count() puts the tally in column `n`.
fuel_type_counts <- count(cardekho, fuel)
# Pie chart: a single stacked bar (x is the constant "") whose heights are the
# tallies, bent into a circle by mapping y onto the polar angle.
ggplot(data = fuel_type_counts, mapping = aes(x = "", y = n, fill = fuel)) +
  geom_col() +
  coord_polar(theta = "y") +
  labs(
    title = "Distribution of Car Fuel Types",
    fill = "Fuel Type",
    x = NULL,
    y = NULL
  )

# Q6. Bar plot of car owners from the cardekho dataset using ggplot2, with each
# bar filled in a different colour for each unique owner category.
cardekho <- read.csv(file = "C:/Users/Lenovo/Downloads/file dataset/cardekho.csv")
library(ggplot2)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ✔ readr     2.1.5     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# geom_bar() counts the rows per owner category. The manual fill scale is given
# exactly as many rainbow() colours as there are distinct owner values.
ggplot(data = cardekho, mapping = aes(x = owner, fill = owner)) +
  geom_bar() +
  ggtitle("Count of Cardekho Owners") +
  xlab("Owner") +
  ylab("Count") +
  scale_fill_manual(
    values = rainbow(n = length(unique(cardekho[["owner"]])))
  )

# Q7. Is there a correlation between the selling price and the year of
# manufacture? Computes the correlation coefficient with cor(), whose default
# method is Pearson.
cardekho <- read.csv("C:/Users/Lenovo/Downloads/file dataset/cardekho.csv")
# use = "complete.obs" drops rows where either value is missing before the
# coefficient is computed; with cor()'s default a single NA in either column
# makes the whole result NA.
correlation <- cor(
  cardekho$selling_price,
  cardekho$year,
  use = "complete.obs"
)

# Q8. What is the average selling price of cars for each fuel type?
cardekho <- read.csv(file = "C:/Users/Lenovo/Downloads/file dataset/cardekho.csv")
library(dplyr)
library(ggplot2)
# Group the rows by fuel, then collapse each group to its mean selling price.
# Missing prices are ignored when averaging (na.rm = TRUE).
average_selling_price_fuel <- summarise(
  group_by(cardekho, fuel),
  avg_selling_price = mean(selling_price, na.rm = TRUE)
)

# Q9. Calculate the average selling price for each brand, highest first.
cardekho <- read.csv("C:/Users/Lenovo/Downloads/file dataset/cardekho.csv")
# The `name` column holds the full listing name (e.g. "Maruti Vitara Brezza
# LDi"), so grouping on it directly would give one row per model variant rather
# than per brand. The brand is taken to be the first word of the name:
# everything from the first space onwards is stripped before grouping.
# The grouping column keeps the name `name` so that code reading
# `average_price$name` continues to work; it now contains the brand.
# NOTE(review): brands written as two words would be cut to their first word
# (e.g. a "Land Rover" listing would group under "Land") - confirm whether the
# data contains any and special-case them if so.
average_price <- cardekho %>%
  group_by(name = sub(" .*$", "", name)) %>%
  summarise(avg_price = mean(selling_price, na.rm = TRUE)) %>%
  arrange(desc(avg_price))

# Q10. Count the number of cars for each seat count and plot the distribution.
cardekho <- read.csv("C:/Users/Lenovo/Downloads/file dataset/cardekho.csv")
library(dplyr)
# One row per distinct `seats` value with its tally in column `n`. A missing
# `seats` value gets its own NA row, which is kept in this table.
seat_type_counts <- cardekho %>%
  count(seats)
library(ggplot2)
# The NA row is filtered out explicitly for the chart; otherwise ggplot drops
# it silently with a "Removed 1 row containing missing values" warning.
# factor(seats) makes the x axis discrete, so each seat count gets its own
# labelled bar instead of sitting on a continuous numeric scale.
seat_type_counts %>%
  filter(!is.na(seats)) %>%
  ggplot(aes(x = factor(seats), y = n)) +
  geom_col(fill = "skyblue") +
  labs(
    title = "Distribution of Car Seat Types",
    x = "Seat Type",
    y = "Number of Cars"
  )

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_bar()`).

# Q11. How does the selling price of cars vary with the owner category?
cardekho <- read.csv(file = "C:/Users/Lenovo/Downloads/file dataset/cardekho.csv")
library(ggplot2)
# One box per value of `owner`, summarising the selling prices in that group.
ggplot(data = cardekho, mapping = aes(x = owner, y = selling_price)) +
  geom_boxplot(fill = "skyblue") +
  ggtitle("Selling Price vs. Owner") +
  xlab("Previous Owners") +
  ylab("Selling Price")

# Q12. Are there any missing values in the dataset?
# is.na() marks every missing cell of the data frame; a non-zero total means at
# least one value is missing somewhere.
any_missing_values <- sum(is.na(cardekho)) > 0
# Scatter plot of kilometres driven against year, with a straight-line
# (linear model) fit drawn on top and no confidence band.
ggplot(data = cardekho, mapping = aes(x = year, y = km_driven)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue", se = FALSE) +
  labs(
    title = "Kilometers Driven vs. Year of Car",
    x = "Year of Car",
    y = "Kilometers Driven"
  )

## `geom_smooth()` using formula = 'y ~ x'

# Q13. Are there any outliers in the selling price column?
# Box plot method: a single box over all rows (x is the constant ""), so
# points drawn beyond the whiskers are the candidate outliers.
ggplot(data = cardekho, mapping = aes(x = "", y = selling_price)) +
  geom_boxplot() +
  ggtitle("Boxplot of Selling Prices") +
  xlab("") +
  ylab("Selling Price")

# Q14. Identify the entry with the highest average selling price.
# average_price is sorted by descending avg_price when it is built (Q9), so its
# first row is the top-ranked one.
highest_avg_price_brand <- average_price[["name"]][1]
# Average price belonging to that same first row.
highest_avg_price <- average_price[["avg_price"]][1]

# Q15. Can we predict the selling price of a car based on its year using a simple linear regression model?

# Load required libraries
library(ggplot2) 
library(car)

## Warning: package 'car' was built under R version 4.3.3

## Loading required package: carData

## Warning: package 'carData' was built under R version 4.3.3

## 
## Attaching package: 'car'

## The following object is masked from 'package:purrr':
## 
##     some

## The following object is masked from 'package:dplyr':
## 
##     recode

# Explore the relationship between selling price and year with a scatter plot.
# The x axis is the `year` column itself (not an age derived from it), and the
# prices are plotted as stored in the data, so the labels say exactly that.
ggplot(cardekho, aes(x = year, y = selling_price)) +
  geom_point() +
  labs(title = "Scatter Plot of Selling Price vs. Year",
       x = "Year of Manufacture",
       y = "Selling Price")

# Perform simple linear regression: selling_price as a linear function of year
model <- lm(selling_price ~ year, data = cardekho)

# Summarize the regression model (coefficients, residuals, R-squared)
summary(model)

## 
## Call:
## lm(formula = selling_price ~ year, data = cardekho)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -899766 -314346 -161900   28101 9097891 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -165606347    4053843  -40.85   <2e-16 ***
## year             82553       2013   41.01   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 733900 on 8126 degrees of freedom
## Multiple R-squared:  0.1715, Adjusted R-squared:  0.1714 
## F-statistic:  1682 on 1 and 8126 DF,  p-value: < 2.2e-16

# Plot the observations with the fitted straight line on top.
# geom_smooth(method = "lm") fits selling_price on year, the same formula as
# the regression model in this section; se = FALSE hides the confidence band.
# The x axis is the `year` column itself (not an age), and prices are plotted
# as stored in the data, so the labels say exactly that.
ggplot(cardekho, aes(x = year, y = selling_price)) +
  geom_point(color = "green") +
  geom_smooth(method = "lm", se = FALSE, color = "pink") +
  labs(title = "Linear Regression: Selling Price vs. Year",
       x = "Year of Manufacture",
       y = "Selling Price")

## `geom_smooth()` using formula = 'y ~ x'

Analysis of the CarDekho Dataset

Shivanand 12218706

2024-04-24

R Markdown