library(ggplot2)
library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ stringr   1.5.1
## ✔ forcats   1.0.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the car price data set from the working directory into a tibble.
df <- read_csv(file = "CarPrice_Assignment.csv")
## Rows: 205 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): CarName, fueltype, aspiration, doornumber, carbody, drivewheel, en...
## dbl (16): car_ID, symboling, wheelbase, carlength, carwidth, carheight, curb...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Drop the columns that are not used in the rest of the analysis.
# attach() is deliberately not called here: it places a copy of the columns on
# the search path, and that copy silently goes stale whenever df_new is
# reassigned. Columns are always reached through df_new itself instead.
df_new <- subset(
  df,
  select = -c(
    symboling, aspiration, doornumber, drivewheel, wheelbase,
    curbweight, carwidth, carheight, carlength
  )
)
# Preview the first rows of the reduced data set.
head(df_new)
## # A tibble: 6 × 17
##   car_ID CarName       fueltype carbody enginelocation enginetype cylindernumber
##    <dbl> <chr>         <chr>    <chr>   <chr>          <chr>      <chr>         
## 1      1 alfa-romero … gas      conver… front          dohc       four          
## 2      2 alfa-romero … gas      conver… front          dohc       four          
## 3      3 alfa-romero … gas      hatchb… front          ohcv       six           
## 4      4 audi 100 ls   gas      sedan   front          ohc        four          
## 5      5 audi 100ls    gas      sedan   front          ohc        five          
## 6      6 audi fox      gas      sedan   front          ohc        five          
## # ℹ 10 more variables: enginesize <dbl>, fuelsystem <chr>, boreratio <dbl>,
## #   stroke <dbl>, compressionratio <dbl>, horsepower <dbl>, peakrpm <dbl>,
## #   citympg <dbl>, highwaympg <dbl>, price <dbl>
# Count the missing cells in df_new.
# is.na() tests every cell; is.null() only asks whether the object itself is
# NULL, which is never true for a data frame, so it always reported 0.
sum(is.na(df_new))
## [1] 0
# Print a per-column summary of df_new.
summary(df_new)
##      car_ID      CarName            fueltype           carbody         
##  Min.   :  1   Length:205         Length:205         Length:205        
##  1st Qu.: 52   Class :character   Class :character   Class :character  
##  Median :103   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :103                                                           
##  3rd Qu.:154                                                           
##  Max.   :205                                                           
##  enginelocation      enginetype        cylindernumber       enginesize   
##  Length:205         Length:205         Length:205         Min.   : 61.0  
##  Class :character   Class :character   Class :character   1st Qu.: 97.0  
##  Mode  :character   Mode  :character   Mode  :character   Median :120.0  
##                                                           Mean   :126.9  
##                                                           3rd Qu.:141.0  
##                                                           Max.   :326.0  
##   fuelsystem          boreratio        stroke      compressionratio
##  Length:205         Min.   :2.54   Min.   :2.070   Min.   : 7.00   
##  Class :character   1st Qu.:3.15   1st Qu.:3.110   1st Qu.: 8.60   
##  Mode  :character   Median :3.31   Median :3.290   Median : 9.00   
##                     Mean   :3.33   Mean   :3.255   Mean   :10.14   
##                     3rd Qu.:3.58   3rd Qu.:3.410   3rd Qu.: 9.40   
##                     Max.   :3.94   Max.   :4.170   Max.   :23.00   
##    horsepower       peakrpm        citympg        highwaympg        price      
##  Min.   : 48.0   Min.   :4150   Min.   :13.00   Min.   :16.00   Min.   : 5118  
##  1st Qu.: 70.0   1st Qu.:4800   1st Qu.:19.00   1st Qu.:25.00   1st Qu.: 7788  
##  Median : 95.0   Median :5200   Median :24.00   Median :30.00   Median :10295  
##  Mean   :104.1   Mean   :5125   Mean   :25.22   Mean   :30.75   Mean   :13277  
##  3rd Qu.:116.0   3rd Qu.:5500   3rd Qu.:30.00   3rd Qu.:34.00   3rd Qu.:16503  
##  Max.   :288.0   Max.   :6600   Max.   :49.00   Max.   :54.00   Max.   :45400

#EDA

#df_new$Marka<- as.factor(df_new$Marka)
#df_new <- separate(df_new, CarName, into = c("Marka", "Model"), sep = " ")
# Derive the make (Marka) and model (Model) from the first and second
# space-separated words of CarName.
# str_split_fixed(n = 3) always returns a three-column matrix, so column 2
# exists even when no name contains a second word; the third column absorbs
# any remaining words and is ignored. A one-word name yields Model == "".
df_new <- df_new %>%
  mutate(
    Marka = str_split_fixed(CarName, " ", n = 3)[, 1],
    Model = str_split_fixed(CarName, " ", n = 3)[, 2]
  )
# Drop rows containing NA. Rows whose Model is the empty string are kept,
# because "" is not NA.
df_new <- na.omit(df_new)
# attach() is deliberately not called: every later step passes df_new
# explicitly, and an attached copy would mask same-named objects and go stale
# on the next reassignment of df_new.
# Scatter plot of price against engine size, overlaid with a fitted
# straight line (confidence band switched off).
ggplot(data = df_new, mapping = aes(x = enginesize, y = price)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Price of Car by Engine Size",
    x = "Engine Size",
    y = "Price"
  )
## `geom_smooth()` using formula = 'y ~ x'

# Box plot of price for each car body style; outliers are drawn in red.
# The legend is hidden because the fill colour repeats the x axis.
ggplot(
  data = df_new,
  mapping = aes(x = carbody, y = price, fill = carbody)
) +
  geom_boxplot(outlier.colour = "red") +
  labs(
    title = "Price Distribution by Carbody",
    x = "Carbody",
    y = "Price"
  ) +
  theme(legend.position = "none")

# Mean price per make (Marka), keeping the ten makes with the highest mean.
# min_rank(desc(x)) <= 10 is the same ranking rule the superseded
# top_n(10, x) applied: makes tied at the cut-off are all kept and the row
# order of the summary is left unchanged.
# NOTE(review): makes are grouped by their exact Marka string, so spelling or
# letter-case variants of one make would form separate groups -- confirm
# against the data.
top_brands <- df_new %>%
  group_by(Marka) %>%
  summarize(mean_price = mean(price)) %>%
  filter(min_rank(desc(mean_price)) <= 10)

# Keep only the rows whose make is one of the ten highest-priced makes.
df_filtered <- filter(df_new, Marka %in% top_brands[["Marka"]])

# Build the plot: price box plots per make, with makes ordered along the
# x axis by their median price. Outliers are red and the legend is hidden.
ggplot(
  data = df_filtered,
  mapping = aes(
    x = reorder(Marka, price, FUN = median),
    y = price,
    fill = Marka
  )
) +
  geom_boxplot(outlier.colour = "red") +
  labs(
    title = "Price Distribution by Make",
    x = "Make",
    y = "Price"
  ) +
  theme(legend.position = "none")

# Box plot of price by cylinder count, with boxes split and filled by car
# body style; outliers are drawn in red.
ggplot(
  data = df_new,
  mapping = aes(x = cylindernumber, y = price, fill = carbody)
) +
  geom_boxplot(outlier.colour = "red") +
  labs(
    title = "Price Distribution of Engine Cylinders",
    x = "Number of Cylinders",
    y = "Price"
  )

# Density of price for every fuel type / car body combination: the fill
# colour encodes fuel type and the outline colour encodes car body.
# The subtitle previously said "Highlighted by Number of Doors", but no door
# variable is mapped in this plot; it now describes the aesthetics in use.
ggplot(df_new, aes(x = price, fill = fueltype, color = carbody)) +
  geom_density(alpha = 0.3) +
  labs(
    title = "Price Distribution of Car Types",
    subtitle = "Filled by Fuel Type, Outlined by Car Body",
    x = "Price",
    y = "Density"
  )
## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf