This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
# Q1. How does the number of cars sold vary across years in the cardekho
# dataset?
cardekho <- read.csv("C:/Users/Lenovo/Downloads/file dataset/cardekho.csv")
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
# factor(year) makes the x axis discrete, so each distinct year gets its own
# bar; geom_bar() counts the rows that fall in each year.
ggplot(data = cardekho, mapping = aes(x = factor(year))) +
  geom_bar() +
  ggtitle("Number of Cars Sold by Year") +
  xlab("Year") +
  ylab("Number of Cars Sold")
# Q2. How many cars in the cardekho dataset have been driven less than
# 100,000 km?
cardekho <- read.csv("C:/Users/Lenovo/Downloads/file dataset/cardekho.csv")
# na.rm = TRUE keeps a single missing km_driven value from turning the whole
# count into NA; rows with an unknown reading are simply not counted.
num_cars_less_than_100k <- sum(cardekho$km_driven < 100000, na.rm = TRUE)
# Display the answer; previously it was stored but never shown.
num_cars_less_than_100k
# Q3. How are the transmission types (manual and automatic) distributed among
# the cars in the cardekho dataset?
cardekho <- read.csv("C:/Users/Lenovo/Downloads/file dataset/cardekho.csv")
# Build one logical match vector per transmission type, then sum it
# (each TRUE contributes 1) to get the number of matching cars.
is_manual <- cardekho$transmission == "Manual"
is_automatic <- cardekho$transmission == "Automatic"
manual_count <- sum(is_manual)
automatic_count <- sum(is_automatic)
# Q4. Count the cars whose listing name exactly matches each of three
# specific Maruti models.
cardekho <- read.csv("C:/Users/Lenovo/Downloads/file dataset/cardekho.csv")
listing_names <- cardekho$name
# The matches are exact and case-sensitive.
# NOTE(review): confirm the spelling/case of "LDi option" against the data.
# NOTE(review): num_cars_maruti_vitara actually counts the Alto 800 listing;
# the variable names are kept as-is because they are defined at top level.
num_cars_maruti_vita <- sum(listing_names == "Maruti Vitara Brezza LDi")
num_cars_maruti_vitar <- sum(listing_names == "Maruti Vitara Brezza LDi option")
num_cars_maruti_vitara <- sum(listing_names == "Maruti Alto 800 CNG LXI Optional")
# Collect the three counts, in the order above, into a single vector.
maruticars <- c(
  num_cars_maruti_vita,
  num_cars_maruti_vitar,
  num_cars_maruti_vitara
)
# Q5. What is the distribution of car fuel types (e.g., petrol, diesel, CNG)
# in the cardekho dataset, and which fuel type makes up the largest share?
cardekho <- read.csv("C:/Users/Lenovo/Downloads/file dataset/cardekho.csv")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# One row per fuel type, with the number of cars in column `n`.
fuel_type_counts <- count(cardekho, fuel)
# Pie chart: a single stacked bar (constant x) whose segment heights are the
# counts, bent into a circle by mapping y onto the polar angle.
ggplot(data = fuel_type_counts, mapping = aes(x = "", y = n, fill = fuel)) +
  geom_col() +
  coord_polar(theta = "y") +
  labs(
    title = "Distribution of Car Fuel Types",
    fill = "Fuel Type",
    x = NULL,
    y = NULL
  )
# Q6. Bar plot of the owner categories in the cardekho dataset, with each bar
# filled in a different colour (one colour per unique owner category).
cardekho <- read.csv("C:/Users/Lenovo/Downloads/file dataset/cardekho.csv")
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# One rainbow colour per distinct owner value.
owner_palette <- rainbow(length(unique(cardekho$owner)))
# geom_bar() counts the rows per owner category; fill follows the same
# variable so every bar gets its own palette entry.
ggplot(data = cardekho, mapping = aes(x = owner, fill = owner)) +
  geom_bar() +
  scale_fill_manual(values = owner_palette) +
  ggtitle("Count of Cardekho Owners") +
  xlab("Owner") +
  ylab("Count")
# Q7. Is selling price correlated with the year of manufacture?
cardekho <- read.csv("C:/Users/Lenovo/Downloads/file dataset/cardekho.csv")
# Pearson correlation coefficient (cor()'s default method), spelled out
# explicitly here.
correlation <- cor(
  x = cardekho$selling_price,
  y = cardekho$year,
  method = "pearson"
)
# Q8. What is the average selling price of cars for each fuel type?
cardekho <- read.csv("C:/Users/Lenovo/Downloads/file dataset/cardekho.csv")
library(dplyr)
library(ggplot2)
# Split the rows by fuel type first, then collapse each group to its mean
# selling price (missing prices are ignored).
cars_by_fuel <- group_by(cardekho, fuel)
average_selling_price_fuel <- summarise(
  cars_by_fuel,
  avg_selling_price = mean(selling_price, na.rm = TRUE)
)
# Q9. Average selling price for each brand, highest average first.
cardekho <- read.csv("C:/Users/Lenovo/Downloads/file dataset/cardekho.csv")
# Loaded here so this section runs on its own; the pipeline below needs dplyr.
library(dplyr)
# The `name` column holds the full listing name (e.g. "Maruti Alto 800 CNG LXI
# Optional"), so grouping on it directly gives per-model averages, not
# per-brand ones. The brand is taken as the first word of the listing name.
# NOTE(review): a brand made of several words would be cut to its first word;
# confirm against the data and special-case if needed.
# The grouping column keeps the name `name` so code that reads
# average_price$name keeps working and now receives a brand.
average_price <- cardekho %>%
  group_by(name = sub(" .*$", "", trimws(name))) %>%
  summarise(avg_price = mean(selling_price, na.rm = TRUE)) %>%
  arrange(desc(avg_price))
# Q10. Number of cars for each seat count.
cardekho <- read.csv("C:/Users/Lenovo/Downloads/file dataset/cardekho.csv")
# Both packages are loaded here so this section runs on its own:
# dplyr provides %>%, count() and filter(); ggplot2 provides the plot.
library(dplyr)
library(ggplot2)
# One row per distinct `seats` value (including a row for missing values, if
# any), with the number of cars in column `n`.
seat_type_counts <- cardekho %>%
  count(seats)
# Rows with an unknown seat count are dropped explicitly before plotting.
# Previously ggplot removed a row on its own and warned about it ("Removed 1
# row containing missing values"), presumably the NA seats row.
# factor(seats) gives one discrete axis position per seat count instead of a
# continuous numeric axis with fractional breaks.
seat_type_counts %>%
  filter(!is.na(seats)) %>%
  ggplot(aes(x = factor(seats), y = n)) +
  geom_col(fill = "skyblue") +
  labs(
    title = "Distribution of Car Seat Types",
    x = "Seat Type",
    y = "Number of Cars"
  )
# Q11. How does the selling price of cars vary with the owner category?
cardekho <- read.csv("C:/Users/Lenovo/Downloads/file dataset/cardekho.csv")
library(ggplot2)
# One box per owner category, showing the spread of selling prices within it.
ggplot(data = cardekho, mapping = aes(x = owner, y = selling_price)) +
  geom_boxplot(fill = "skyblue") +
  ggtitle("Selling Price vs. Owner") +
  xlab("Previous Owners") +
  ylab("Selling Price")
# Q12. Does the dataset contain any missing (NA) values?
# is.na() on a data frame yields a logical matrix with one cell per value;
# any() is TRUE as soon as a single cell is missing.
missing_mask <- is.na(cardekho)
any_missing_values <- any(missing_mask)
# Scatter plot of kilometres driven against year, with a straight-line
# (linear model) trend drawn without its confidence band.
ggplot(data = cardekho, mapping = aes(x = year, y = km_driven)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  ggtitle("Kilometers Driven vs. Year of Car") +
  xlab("Year of Car") +
  ylab("Kilometers Driven")
## `geom_smooth()` using formula = 'y ~ x'
# Q13. Are there outliers in the selling price column?
# Box plot method: a single box over all cars (constant x); points drawn
# beyond the whiskers are the candidate outliers.
ggplot(data = cardekho, mapping = aes(x = "", y = selling_price)) +
  geom_boxplot() +
  ggtitle("Boxplot of Selling Prices") +
  xlab("") +
  ylab("Selling Price")
# Q14. Identify the entry with the highest average selling price.
# average_price is sorted by descending avg_price when it is built, so the
# first element of each column belongs to the top entry.
highest_avg_price_brand <- average_price[["name"]][1]
# Average selling price of that same top entry.
highest_avg_price <- average_price[["avg_price"]][1]
# Q15. Can we predict the selling price of a car from its year using a simple linear regression model?
# Load required libraries
library(ggplot2)
library(car)
## Warning: package 'car' was built under R version 4.3.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.3.3
##
## Attaching package: 'car'
## The following object is masked from 'package:purrr':
##
## some
## The following object is masked from 'package:dplyr':
##
## recode
# Explore the relationship between selling price and year with a scatter plot.
# The x variable is the `year` column, not an age, so the title and axis label
# say "Year". The "(in Lakh)" unit was dropped from the y label because the
# values are plotted in the dataset's raw units (the fitted model's residuals
# run into the millions), not in lakhs.
# NOTE(review): add the currency to the y label once confirmed.
ggplot(cardekho, aes(x = year, y = selling_price)) +
  geom_point() +
  labs(
    title = "Scatter Plot of Selling Price vs. Year",
    x = "Year",
    y = "Selling Price"
  )
# Fit a simple linear regression with selling price as the response and year
# as the only predictor.
model <- lm(formula = selling_price ~ year, data = cardekho)
# Coefficient table, residual quantiles and fit statistics for the model;
# the recorded console output follows.
summary(object = model)
##
## Call:
## lm(formula = selling_price ~ year, data = cardekho)
##
## Residuals:
## Min 1Q Median 3Q Max
## -899766 -314346 -161900 28101 9097891
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -165606347 4053843 -40.85 <2e-16 ***
## year 82553 2013 41.01 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 733900 on 8126 degrees of freedom
## Multiple R-squared: 0.1715, Adjusted R-squared: 0.1714
## F-statistic: 1682 on 1 and 8126 DF, p-value: < 2.2e-16
# Plot the data together with the fitted straight line (linear model, drawn
# without its confidence band).
# The x variable is the `year` column, not an age, so the title and axis label
# say "Year". The "(in Lakh)" unit was dropped from the y label because the
# values are plotted in the dataset's raw units, not in lakhs.
# NOTE(review): add the currency to the y label once confirmed.
ggplot(cardekho, aes(x = year, y = selling_price)) +
  geom_point(color = "green") +
  geom_smooth(method = "lm", se = FALSE, color = "pink") +
  labs(
    title = "Linear Regression: Selling Price vs. Year",
    x = "Year",
    y = "Selling Price"
  )
## `geom_smooth()` using formula = 'y ~ x'