library(ggplot2)
library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ stringr 1.5.1
## ✔ forcats 1.0.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the raw car-price data set from the working directory.
df <- readr::read_csv(file = "CarPrice_Assignment.csv")
## Rows: 205 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): CarName, fueltype, aspiration, doornumber, carbody, drivewheel, en...
## dbl (16): car_ID, symboling, wheelbase, carlength, carwidth, carheight, curb...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Drop the columns that are not used in this analysis. The data frame is
# deliberately not attach()ed: the plotting and dplyr steps below pass df_new
# explicitly, and an attached copy would go stale once df_new is reassigned.
df_new <- df %>%
  select(
    -c(
      symboling, aspiration, doornumber, drivewheel, wheelbase,
      curbweight, carwidth, carheight, carlength
    )
  )
head(df_new)
## # A tibble: 6 × 17
## car_ID CarName fueltype carbody enginelocation enginetype cylindernumber
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 1 alfa-romero … gas conver… front dohc four
## 2 2 alfa-romero … gas conver… front dohc four
## 3 3 alfa-romero … gas hatchb… front ohcv six
## 4 4 audi 100 ls gas sedan front ohc four
## 5 5 audi 100ls gas sedan front ohc five
## 6 6 audi fox gas sedan front ohc five
## # ℹ 10 more variables: enginesize <dbl>, fuelsystem <chr>, boreratio <dbl>,
## # stroke <dbl>, compressionratio <dbl>, horsepower <dbl>, peakrpm <dbl>,
## # citympg <dbl>, highwaympg <dbl>, price <dbl>
# Count missing cells. is.na() is evaluated per cell, whereas is.null() on a
# data frame is always a single FALSE and so can never detect missing values.
sum(is.na(df_new))
## [1] 0
summary(df_new)
## car_ID CarName fueltype carbody
## Min. : 1 Length:205 Length:205 Length:205
## 1st Qu.: 52 Class :character Class :character Class :character
## Median :103 Mode :character Mode :character Mode :character
## Mean :103
## 3rd Qu.:154
## Max. :205
## enginelocation enginetype cylindernumber enginesize
## Length:205 Length:205 Length:205 Min. : 61.0
## Class :character Class :character Class :character 1st Qu.: 97.0
## Mode :character Mode :character Mode :character Median :120.0
## Mean :126.9
## 3rd Qu.:141.0
## Max. :326.0
## fuelsystem boreratio stroke compressionratio
## Length:205 Min. :2.54 Min. :2.070 Min. : 7.00
## Class :character 1st Qu.:3.15 1st Qu.:3.110 1st Qu.: 8.60
## Mode :character Median :3.31 Median :3.290 Median : 9.00
## Mean :3.33 Mean :3.255 Mean :10.14
## 3rd Qu.:3.58 3rd Qu.:3.410 3rd Qu.: 9.40
## Max. :3.94 Max. :4.170 Max. :23.00
## horsepower peakrpm citympg highwaympg price
## Min. : 48.0 Min. :4150 Min. :13.00 Min. :16.00 Min. : 5118
## 1st Qu.: 70.0 1st Qu.:4800 1st Qu.:19.00 1st Qu.:25.00 1st Qu.: 7788
## Median : 95.0 Median :5200 Median :24.00 Median :30.00 Median :10295
## Mean :104.1 Mean :5125 Mean :25.22 Mean :30.75 Mean :13277
## 3rd Qu.:116.0 3rd Qu.:5500 3rd Qu.:30.00 3rd Qu.:34.00 3rd Qu.:16503
## Max. :288.0 Max. :6600 Max. :49.00 Max. :54.00 Max. :45400
#EDA
#df_new$Marka<- as.factor(df_new$Marka)
#df_new <- separate(df_new, CarName, into = c("Marka", "Model"), sep = " ")
# Split CarName once into space-separated tokens: the first token is the make
# ("Marka"), the second is the model. With simplify = TRUE, names that have no
# second token are padded with "" (not NA), so the NA filter keeps those rows.
name_parts <- str_split(df_new$CarName, " ", simplify = TRUE)
df_new <- df_new %>%
  mutate(Marka = name_parts[, 1], Model = name_parts[, 2]) %>%
  drop_na()
# df_new is not attach()ed here: the code below always passes the data frame
# explicitly, and attaching would only mask the column names on the search path.
# Scatter plot of price against engine size with a fitted linear trend line
# (no confidence band).
ggplot(data = df_new, mapping = aes(x = enginesize, y = price)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "Engine Size",
    y = "Price",
    title = "Price of Car by Engine Size"
  )
## `geom_smooth()` using formula = 'y ~ x'
# Box plot of price for each body style; outliers are drawn in red and the
# legend is hidden because the fill repeats the x axis.
ggplot(data = df_new, mapping = aes(x = carbody, y = price, fill = carbody)) +
  geom_boxplot(outlier.color = "red") +
  labs(
    x = "Carbody",
    y = "Price",
    title = "Price Distribution by Carbody"
  ) +
  theme(legend.position = "none")
# Ten makes with the highest mean price (ties at the cutoff are kept).
# slice_max() replaces the superseded top_n(); the result is sorted by
# descending mean price, which is harmless because it is only used for
# membership tests below.
top_brands <- df_new %>%
  summarise(mean_price = mean(price), .by = Marka) %>%
  slice_max(mean_price, n = 10)

# Keep only the cars that belong to those top-priced makes.
df_filtered <- df_new %>%
  filter(Marka %in% top_brands$Marka)
# Create the plot: one box per make, ordered by median price; outliers are
# drawn in red and the legend is hidden.
ggplot(
  data = df_filtered,
  mapping = aes(
    x = reorder(Marka, price, FUN = median),
    y = price,
    fill = Marka
  )
) +
  geom_boxplot(outlier.color = "red") +
  labs(
    x = "Make",
    y = "Price",
    title = "Price Distribution by Make"
  ) +
  theme(legend.position = "none")
# Box plot of price by cylinder count, with boxes split (filled) by body style;
# outliers are drawn in red.
ggplot(
  data = df_new,
  mapping = aes(x = cylindernumber, y = price, fill = carbody)
) +
  geom_boxplot(outlier.color = "red") +
  labs(
    x = "Number of Cylinders",
    y = "Price",
    title = "Price Distribution of Engine Cylinders"
  )
# Price density for each fuel type (fill) and body style (outline colour).
# The subtitle describes the aesthetics actually mapped; the door-count column
# was removed from df_new earlier, so it cannot be what is highlighted here.
# Fuel-type/body-style combinations with fewer than two cars cannot produce a
# density estimate, so ggplot2 drops them with a warning.
ggplot(df_new, aes(x = price, fill = fueltype, color = carbody)) +
  geom_density(alpha = 0.3) +
  labs(
    x = "Price",
    y = "Density",
    title = "Price Distribution of Car Types",
    subtitle = "Filled by Fuel Type, Outlined by Body Style"
  )
## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf