This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
# Read the electric-car dataset directly from its GitHub URL into `df`.
# read_csv() guesses the column types and reports them when the file is read.
df <- read_csv(
  file = "https://raw.githubusercontent.com/gmoeser/AppliedStatisticalMethods/main/electric-cars.csv"
)
## Rows: 180 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Name, Drive
## dbl (8): BatterySize, Acceleration, TopSpeed, Range, Efficiency, ChargeSpeed...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows to sanity-check the import.
head(df, n = 6)
## # A tibble: 6 × 10
## Name BatterySize Acceleration TopSpeed Range Efficiency ChargeSpeed Drive
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 Tesla Ro… 200 2.1 410 970 206 920 All …
## 2 Tesla Mo… 90 2.6 262 455 198 680 All …
## 3 Porsche … 84 2.8 260 390 215 860 All …
## 4 Porsche … 84 2.9 250 380 220 790 All …
## 5 GMC Humm… 200 3 200 329 267 100 All …
## 6 Tesla Cy… 200 3 210 750 267 710 All …
## # ℹ 2 more variables: Seats <dbl>, Price <dbl>
# Size of the table as (rows, columns).
c(nrow(df), ncol(df))
## [1] 180 10
# Column names in file order.
names(df)
## [1] "Name" "BatterySize" "Acceleration" "TopSpeed" "Range"
## [6] "Efficiency" "ChargeSpeed" "Drive" "Seats" "Price"
# Compact structure: one line per column with its type and first values.
str(object = df)
## spc_tbl_ [180 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Name : chr [1:180] "Tesla Roadster" "Tesla Model X Plaid" "Porsche Taycan Turbo S" "Porsche Taycan Turbo S Cross Turismo" ...
## $ BatterySize : num [1:180] 200 90 84 84 200 200 110 90 84 76 ...
## $ Acceleration: num [1:180] 2.1 2.6 2.8 2.9 3 3 3.2 3.2 3.2 3.3 ...
## $ TopSpeed : num [1:180] 410 262 260 250 200 210 270 250 260 261 ...
## $ Range : num [1:180] 970 455 390 380 329 750 660 555 400 470 ...
## $ Efficiency : num [1:180] 206 198 215 220 267 267 167 162 209 162 ...
## $ ChargeSpeed : num [1:180] 920 680 860 790 100 710 1380 830 840 790 ...
## $ Drive : chr [1:180] "All Wheel Drive" "All Wheel Drive" "All Wheel Drive" "All Wheel Drive" ...
## $ Seats : num [1:180] 4 7 4 4 5 7 5 5 4 5 ...
## $ Price : num [1:180] 215000 116990 186336 187746 108700 ...
## - attr(*, "spec")=
## .. cols(
## .. Name = col_character(),
## .. BatterySize = col_double(),
## .. Acceleration = col_double(),
## .. TopSpeed = col_double(),
## .. Range = col_double(),
## .. Efficiency = col_double(),
## .. ChargeSpeed = col_double(),
## .. Drive = col_character(),
## .. Seats = col_double(),
## .. Price = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Per-column summary: quantiles and mean for numeric columns,
# length/class/mode for character columns.
summary(object = df)
## Name BatterySize Acceleration TopSpeed
## Length:180 Min. : 17.00 Min. : 2.100 Min. :123.0
## Class :character 1st Qu.: 45.00 1st Qu.: 5.575 1st Qu.:150.0
## Mode :character Median : 67.00 Median : 7.500 Median :160.0
## Mean : 65.94 Mean : 7.859 Mean :175.7
## 3rd Qu.: 77.00 3rd Qu.: 9.625 3rd Qu.:200.0
## Max. :200.00 Max. :22.400 Max. :410.0
## Range Efficiency ChargeSpeed Drive
## Min. : 95.0 Min. :104.0 Min. : 100.0 Length:180
## 1st Qu.:258.8 1st Qu.:168.0 1st Qu.: 277.5 Class :character
## Median :340.0 Median :189.0 Median : 420.0 Mode :character
## Mean :336.2 Mean :194.8 Mean : 469.2
## 3rd Qu.:400.0 3rd Qu.:216.2 3rd Qu.: 575.0
## Max. :970.0 Max. :281.0 Max. :1410.0
## Seats Price
## Min. :2.000 Min. : 13270
## 1st Qu.:5.000 1st Qu.: 37430
## Median :5.000 Median : 50000
## Mean :5.167 Mean : 57361
## 3rd Qu.:5.000 3rd Qu.: 63175
## Max. :7.000 Max. :215000
# Count missing values per column.
# vapply() pins the result to exactly one integer per column; sapply() would
# silently change its return type on unexpected input.
vapply(df, function(col) sum(is.na(col)), integer(1))
## Name BatterySize Acceleration TopSpeed Range Efficiency
## 0 0 0 0 0 0
## ChargeSpeed Drive Seats Price
## 0 0 0 0
library(tidyverse)
library(stringr)
## Task 1: Describe the Dataset
# Build the analysis table from the raw import: shorter column names ...
df_clean <- rename(
  df,
  CarName = Name,
  Battery = BatterySize,
  Accel = Acceleration,
  Top_Speed = TopSpeed,
  Car_Range = Range,
  Charge_Speed = ChargeSpeed
)

# ... Drive as a factor, plus the price expressed in thousands.
df_clean <- mutate(
  df_clean,
  Drive = as.factor(Drive),
  Price_k = Price / 1000
)

# Drop rows that are exact duplicates of another row.
df_clean <- distinct(df_clean)

# The brand is taken to be the first word of the car name; word() returns a
# character vector, which is turned into a factor in the same step.
df_clean <- mutate(df_clean, Brand = as.factor(word(CarName, 1)))

# Verify the new column: it should print as "Factor w/ <n> levels".
str(df_clean$Brand)
## Factor w/ 38 levels "Aiways","Audi",..: 35 35 28 28 11 35 19 35 28 35 ...
# Number of car models per brand.
table(df_clean[["Brand"]])
##
## Aiways Audi BMW Byton Citroen CUPRA Dacia
## 1 17 7 3 6 4 1
## DS Fiat Ford GMC Honda Hyundai JAC
## 1 4 5 1 2 9 1
## Jaguar Kia Lexus Lightyear Lucid Mazda Mercedes
## 1 9 1 1 3 1 9
## MG Mini Nissan Opel Peugeot Polestar Porsche
## 6 1 8 8 9 3 10
## Renault SEAT Seres Skoda Smart Sono Tesla
## 6 1 1 5 3 1 14
## Toyota Volkswagen Volvo
## 4 10 2
# Keep only rows where the response and every regression predictor are present.
# Comma-separated conditions in filter() are combined with a logical AND.
df_clean <- filter(
  df_clean,
  !is.na(Price),
  !is.na(Battery),
  !is.na(Car_Range),
  !is.na(Accel),
  !is.na(Brand)
)

# Inspect the structure of the final analysis table.
str(df_clean)
## tibble [179 × 12] (S3: tbl_df/tbl/data.frame)
## $ CarName : chr [1:179] "Tesla Roadster" "Tesla Model X Plaid" "Porsche Taycan Turbo S" "Porsche Taycan Turbo S Cross Turismo" ...
## $ Battery : num [1:179] 200 90 84 84 200 200 110 90 84 76 ...
## $ Accel : num [1:179] 2.1 2.6 2.8 2.9 3 3 3.2 3.2 3.2 3.3 ...
## $ Top_Speed : num [1:179] 410 262 260 250 200 210 270 250 260 261 ...
## $ Car_Range : num [1:179] 970 455 390 380 329 750 660 555 400 470 ...
## $ Efficiency : num [1:179] 206 198 215 220 267 267 167 162 209 162 ...
## $ Charge_Speed: num [1:179] 920 680 860 790 100 710 1380 830 840 790 ...
## $ Drive : Factor w/ 3 levels "All Wheel Drive",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Seats : num [1:179] 4 7 4 4 5 7 5 5 4 5 ...
## $ Price : num [1:179] 215000 116990 186336 187746 108700 ...
## $ Price_k : num [1:179] 215 117 186 188 109 ...
## $ Brand : Factor w/ 38 levels "Aiways","Audi",..: 35 35 28 28 11 35 19 35 28 35 ...
# Summary of the cleaned table; factor columns are shown as level counts.
summary(object = df_clean)
## CarName Battery Accel Top_Speed
## Length:179 Min. : 17.00 Min. : 2.100 Min. :123.0
## Class :character 1st Qu.: 45.00 1st Qu.: 5.550 1st Qu.:150.0
## Mode :character Median : 67.00 Median : 7.500 Median :160.0
## Mean : 65.96 Mean : 7.859 Mean :175.8
## 3rd Qu.: 77.00 3rd Qu.: 9.650 3rd Qu.:200.0
## Max. :200.00 Max. :22.400 Max. :410.0
##
## Car_Range Efficiency Charge_Speed Drive
## Min. : 95.0 Min. :104.0 Min. : 100.0 All Wheel Drive :64
## 1st Qu.:257.5 1st Qu.:168.5 1st Qu.: 275.0 Front Wheel Drive:70
## Median :340.0 Median :189.0 Median : 420.0 Rear Wheel Drive :45
## Mean :335.9 Mean :195.0 Mean : 469.8
## 3rd Qu.:400.0 3rd Qu.:216.5 3rd Qu.: 580.0
## Max. :970.0 Max. :281.0 Max. :1410.0
##
## Seats Price Price_k Brand
## Min. :2.000 Min. : 13270 Min. : 13.27 Audi : 17
## 1st Qu.:5.000 1st Qu.: 37270 1st Qu.: 37.27 Tesla : 14
## Median :5.000 Median : 50000 Median : 50.00 Porsche : 10
## Mean :5.168 Mean : 57448 Mean : 57.45 Volkswagen: 10
## 3rd Qu.:5.000 3rd Qu.: 63450 3rd Qu.: 63.45 Hyundai : 9
## Max. :7.000 Max. :215000 Max. :215.00 Kia : 9
## (Other) :110
## Battery Accel Top_Speed Car_Range Efficiency Charge_Speed
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## Seats Price
## "numeric" "numeric"
# Confirm the regression variables are numeric before modelling.
# vapply() guarantees one class string per column and fails loudly otherwise,
# whereas sapply() would quietly return a list or matrix.
vapply(
  df_clean[c("Battery", "Car_Range", "Accel", "Price")],
  class,
  character(1)
)
## Battery Car_Range Accel Price
## "numeric" "numeric" "numeric" "numeric"
# Baseline linear model: price explained by battery size, range, acceleration,
# and one dummy per brand (the first factor level is the reference).
# The formula is written inline on purpose: summary() echoes the call.
model_brand <- lm(
  formula = Price ~ Battery + Car_Range + Accel + factor(Brand),
  data = df_clean
)

# Coefficient table, residual quantiles and goodness-of-fit statistics.
summary(model_brand)
##
## Call:
## lm(formula = Price ~ Battery + Car_Range + Accel + factor(Brand),
## data = df_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -66314 -6689 0 5162 72270
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.038e+04 2.114e+04 0.491 0.624144
## Battery 6.192e+02 1.571e+02 3.942 0.000128 ***
## Car_Range -7.357e-01 3.974e+01 -0.019 0.985258
## Accel -1.753e+03 8.714e+02 -2.012 0.046212 *
## factor(Brand)Audi 2.427e+04 1.812e+04 1.339 0.182830
## factor(Brand)BMW 2.156e+04 1.878e+04 1.148 0.252823
## factor(Brand)Byton 7.633e+03 2.043e+04 0.374 0.709217
## factor(Brand)Citroen 2.920e+04 1.930e+04 1.513 0.132673
## factor(Brand)CUPRA 3.969e+03 1.964e+04 0.202 0.840141
## factor(Brand)Dacia 1.981e+04 2.544e+04 0.779 0.437427
## factor(Brand)DS 7.233e+03 2.483e+04 0.291 0.771300
## factor(Brand)Fiat 1.270e+04 1.973e+04 0.644 0.520915
## factor(Brand)Ford 6.105e+03 1.926e+04 0.317 0.751792
## factor(Brand)GMC -2.001e+04 3.363e+04 -0.595 0.552768
## factor(Brand)Honda 2.288e+04 2.168e+04 1.055 0.293140
## factor(Brand)Hyundai 1.123e+04 1.848e+04 0.607 0.544537
## factor(Brand)JAC -5.588e+01 2.503e+04 -0.002 0.998222
## factor(Brand)Jaguar 2.297e+04 2.503e+04 0.918 0.360370
## factor(Brand)Kia 9.707e+03 1.848e+04 0.525 0.600334
## factor(Brand)Lexus 1.955e+04 2.484e+04 0.787 0.432584
## factor(Brand)Lightyear 1.194e+05 2.711e+04 4.406 2.1e-05 ***
## factor(Brand)Lucid 4.357e+04 2.091e+04 2.083 0.039090 *
## factor(Brand)Mazda 2.266e+04 2.496e+04 0.908 0.365511
## factor(Brand)Mercedes 2.648e+04 1.855e+04 1.428 0.155667
## factor(Brand)MG 3.232e+03 1.894e+04 0.171 0.864740
## factor(Brand)Mini 1.710e+04 2.499e+04 0.684 0.495016
## factor(Brand)Nissan 1.078e+04 1.860e+04 0.580 0.563122
## factor(Brand)Opel 2.535e+04 1.879e+04 1.349 0.179452
## factor(Brand)Peugeot 2.644e+04 1.878e+04 1.408 0.161391
## factor(Brand)Polestar 7.254e+03 2.027e+04 0.358 0.721018
## factor(Brand)Porsche 7.449e+04 1.860e+04 4.006 0.000101 ***
## factor(Brand)Renault 1.644e+04 1.934e+04 0.850 0.396569
## factor(Brand)SEAT 1.617e+04 2.508e+04 0.645 0.520178
## factor(Brand)Seres 1.991e+04 2.482e+04 0.802 0.423761
## factor(Brand)Skoda 5.005e+03 1.924e+04 0.260 0.795197
## factor(Brand)Smart 2.008e+04 2.074e+04 0.968 0.334523
## factor(Brand)Sono 1.988e+03 2.483e+04 0.080 0.936307
## factor(Brand)Tesla 1.291e+04 1.842e+04 0.701 0.484538
## factor(Brand)Toyota 3.885e+04 2.006e+04 1.937 0.054786 .
## factor(Brand)Volkswagen 5.888e+03 1.844e+04 0.319 0.750042
## factor(Brand)Volvo 1.250e+04 2.169e+04 0.576 0.565388
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17530 on 138 degrees of freedom
## Multiple R-squared: 0.7691, Adjusted R-squared: 0.7022
## F-statistic: 11.49 on 40 and 138 DF, p-value: < 2.2e-16
# Add the natural log of the price in thousands as the response for the
# log-linear models.
df_clean <- mutate(df_clean, log_Price = log(Price_k))

# Confirm the new column is present.
print(names(df_clean))
## [1] "CarName" "Battery" "Accel" "Top_Speed" "Car_Range"
## [6] "Efficiency" "Charge_Speed" "Drive" "Seats" "Price"
## [11] "Price_k" "Brand" "log_Price"
# Same predictors as model_brand, but with log(price) as the response.
model_log <- lm(
  formula = log_Price ~ Battery + Car_Range + Accel + factor(Brand),
  data = df_clean
)
summary(model_log)
##
## Call:
## lm(formula = log_Price ~ Battery + Car_Range + Accel + factor(Brand),
## data = df_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.84361 -0.07310 0.00000 0.08136 0.58298
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.3324014 0.2429955 13.714 < 2e-16 ***
## Battery 0.0102826 0.0018058 5.694 7.16e-08 ***
## Car_Range -0.0005750 0.0004569 -1.258 0.21035
## Accel -0.0272332 0.0100186 -2.718 0.00740 **
## factor(Brand)Audi 0.4620486 0.2083732 2.217 0.02823 *
## factor(Brand)BMW 0.4496072 0.2158603 2.083 0.03911 *
## factor(Brand)Byton 0.2579602 0.2348241 1.099 0.27389
## factor(Brand)Citroen 0.4982734 0.2219133 2.245 0.02634 *
## factor(Brand)CUPRA 0.0920083 0.2257652 0.408 0.68424
## factor(Brand)Dacia -0.0838429 0.2924688 -0.287 0.77479
## factor(Brand)DS -0.0119046 0.2855088 -0.042 0.96680
## factor(Brand)Fiat 0.0196698 0.2267937 0.087 0.93101
## factor(Brand)Ford 0.2389043 0.2214701 1.079 0.28259
## factor(Brand)GMC -0.4294515 0.3865911 -1.111 0.26856
## factor(Brand)Honda 0.2824865 0.2492404 1.133 0.25902
## factor(Brand)Hyundai 0.2269541 0.2124889 1.068 0.28735
## factor(Brand)JAC -0.6917388 0.2877137 -2.404 0.01753 *
## factor(Brand)Jaguar 0.4818728 0.2878152 1.674 0.09635 .
## factor(Brand)Kia 0.2151521 0.2125157 1.012 0.31312
## factor(Brand)Lexus 0.3690046 0.2855740 1.292 0.19846
## factor(Brand)Lightyear 1.6575584 0.3116385 5.319 4.11e-07 ***
## factor(Brand)Lucid 0.7633725 0.2404511 3.175 0.00185 **
## factor(Brand)Mazda 0.2617056 0.2869945 0.912 0.36342
## factor(Brand)Mercedes 0.5130294 0.2132596 2.406 0.01747 *
## factor(Brand)MG 0.0026601 0.2177476 0.012 0.99027
## factor(Brand)Mini 0.1558246 0.2873020 0.542 0.58844
## factor(Brand)Nissan 0.2360932 0.2137908 1.104 0.27138
## factor(Brand)Opel 0.4246864 0.2159740 1.966 0.05126 .
## factor(Brand)Peugeot 0.4283003 0.2158676 1.984 0.04923 *
## factor(Brand)Polestar 0.2464088 0.2330645 1.057 0.29224
## factor(Brand)Porsche 0.9819502 0.2138073 4.593 9.76e-06 ***
## factor(Brand)Renault 0.1431220 0.2223031 0.644 0.52076
## factor(Brand)SEAT -0.0038199 0.2883329 -0.013 0.98945
## factor(Brand)Seres 0.3740643 0.2853625 1.311 0.19209
## factor(Brand)Skoda 0.1507110 0.2212455 0.681 0.49689
## factor(Brand)Smart -0.1414813 0.2384340 -0.593 0.55390
## factor(Brand)Sono -0.1824010 0.2854527 -0.639 0.52389
## factor(Brand)Tesla 0.2851355 0.2118091 1.346 0.18045
## factor(Brand)Toyota 0.6851192 0.2305787 2.971 0.00350 **
## factor(Brand)Volkswagen 0.1013254 0.2120403 0.478 0.63351
## factor(Brand)Volvo 0.3274826 0.2493440 1.313 0.19124
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2015 on 138 degrees of freedom
## Multiple R-squared: 0.8561, Adjusted R-squared: 0.8145
## F-statistic: 20.53 on 40 and 138 DF, p-value: < 2.2e-16
# Refit the linear model without Car_Range so that the fit can be compared
# with and without that predictor (the log model is refitted the same way
# further down).
model_price <- lm(
  formula = Price ~ Battery + Accel + factor(Brand),
  data = df_clean
)
summary(model_price)
##
## Call:
## lm(formula = Price ~ Battery + Accel + factor(Brand), data = df_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -66233 -6687 0 5179 72195
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10242.14 19714.97 0.520 0.6042
## Battery 616.67 81.06 7.607 3.82e-12 ***
## Accel -1746.58 797.61 -2.190 0.0302 *
## factor(Brand)Audi 24290.48 18009.93 1.349 0.1796
## factor(Brand)BMW 21557.69 18707.53 1.152 0.2512
## factor(Brand)Byton 7670.32 20249.87 0.379 0.7054
## factor(Brand)Citroen 29223.51 19177.35 1.524 0.1298
## factor(Brand)CUPRA 3945.51 19527.14 0.202 0.8402
## factor(Brand)Dacia 19796.44 25333.21 0.781 0.4359
## factor(Brand)DS 7242.95 24738.58 0.293 0.7701
## factor(Brand)Fiat 12692.77 19655.12 0.646 0.5195
## factor(Brand)Ford 6116.48 19184.27 0.319 0.7503
## factor(Brand)GMC -19636.34 26789.29 -0.733 0.4648
## factor(Brand)Honda 22907.33 21542.82 1.063 0.2895
## factor(Brand)Hyundai 11221.88 18413.42 0.609 0.5432
## factor(Brand)JAC -63.33 24932.43 -0.003 0.9980
## factor(Brand)Jaguar 23024.51 24799.66 0.928 0.3548
## factor(Brand)Kia 9695.77 18408.35 0.527 0.5992
## factor(Brand)Lexus 19573.71 24719.65 0.792 0.4298
## factor(Brand)Lightyear 119223.45 24771.74 4.813 3.83e-06 ***
## factor(Brand)Lucid 43489.70 20417.83 2.130 0.0349 *
## factor(Brand)Mazda 22689.57 24836.06 0.914 0.3625
## factor(Brand)Mercedes 26477.08 18481.34 1.433 0.1542
## factor(Brand)MG 3233.03 18871.79 0.171 0.8642
## factor(Brand)Mini 17124.45 24857.32 0.689 0.4920
## factor(Brand)Nissan 10786.53 18523.70 0.582 0.5613
## factor(Brand)Opel 25366.40 18690.46 1.357 0.1769
## factor(Brand)Peugeot 26459.85 18667.17 1.417 0.1586
## factor(Brand)Polestar 7238.18 20181.29 0.359 0.7204
## factor(Brand)Porsche 74509.99 18505.19 4.026 9.27e-05 ***
## factor(Brand)Renault 16423.39 19235.22 0.854 0.3947
## factor(Brand)SEAT 16157.33 24980.89 0.647 0.5188
## factor(Brand)Seres 19925.57 24724.09 0.806 0.4217
## factor(Brand)Skoda 4987.44 19151.99 0.260 0.7949
## factor(Brand)Smart 20116.51 20593.57 0.977 0.3303
## factor(Brand)Sono 1993.58 24737.72 0.081 0.9359
## factor(Brand)Tesla 12897.10 18336.50 0.703 0.4830
## factor(Brand)Toyota 38880.40 19912.60 1.953 0.0529 .
## factor(Brand)Volkswagen 5867.49 18345.29 0.320 0.7496
## factor(Brand)Volvo 12541.21 21483.70 0.584 0.5603
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17460 on 139 degrees of freedom
## Multiple R-squared: 0.7691, Adjusted R-squared: 0.7043
## F-statistic: 11.87 on 39 and 139 DF, p-value: < 2.2e-16
# R-squared of the linear model that includes Car_Range ...
cat(
  "R² of linear Model (with Car_Range):",
  summary(model_brand)[["r.squared"]],
  "\n"
)
## R² of linear Model (with Car_Range): 0.7691144
# ... and of the linear model that omits it.
cat(
  "R² of Updated linear Model (without Car_Range):",
  summary(model_price)[["r.squared"]],
  "\n"
)
## R² of Updated linear Model (without Car_Range): 0.7691138
# Log-price model without Car_Range, the counterpart of model_price.
model_log_price <- lm(
  formula = log_Price ~ Battery + Accel + factor(Brand),
  data = df_clean
)
summary(model_log_price)
##
## Call:
## lm(formula = log_Price ~ Battery + Accel + factor(Brand), data = df_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.78086 -0.07121 0.00000 0.09153 0.58704
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.2248669 0.2279532 14.147 < 2e-16 ***
## Battery 0.0083387 0.0009373 8.897 2.75e-15 ***
## Accel -0.0222504 0.0092223 -2.413 0.01714 *
## factor(Brand)Audi 0.4814348 0.2082389 2.312 0.02225 *
## factor(Brand)BMW 0.4472203 0.2163047 2.068 0.04054 *
## factor(Brand)Byton 0.2875010 0.2341380 1.228 0.22156
## factor(Brand)Citroen 0.5194757 0.2217370 2.343 0.02056 *
## factor(Brand)CUPRA 0.0739531 0.2257815 0.328 0.74375
## factor(Brand)Dacia -0.0963179 0.2929139 -0.329 0.74278
## factor(Brand)DS -0.0039983 0.2860385 -0.014 0.98887
## factor(Brand)Fiat 0.0172456 0.2272613 0.076 0.93962
## factor(Brand)Ford 0.2479760 0.2218171 1.118 0.26553
## factor(Brand)GMC -0.1372555 0.3097496 -0.443 0.65837
## factor(Brand)Honda 0.3055404 0.2490876 1.227 0.22203
## factor(Brand)Hyundai 0.2224354 0.2129042 1.045 0.29794
## factor(Brand)JAC -0.6975641 0.2882799 -2.420 0.01682 *
## factor(Brand)Jaguar 0.5208430 0.2867447 1.816 0.07146 .
## factor(Brand)Kia 0.2063280 0.2128455 0.969 0.33404
## factor(Brand)Lexus 0.3868599 0.2858196 1.354 0.17809
## factor(Brand)Lightyear 1.5012638 0.2864219 5.241 5.79e-07 ***
## factor(Brand)Lucid 0.7028084 0.2360801 2.977 0.00343 **
## factor(Brand)Mazda 0.2814715 0.2871655 0.980 0.32870
## factor(Brand)Mercedes 0.5095951 0.2136894 2.385 0.01844 *
## factor(Brand)MG 0.0031979 0.2182040 0.015 0.98833
## factor(Brand)Mini 0.1769799 0.2874114 0.616 0.53905
## factor(Brand)Nissan 0.2424560 0.2141793 1.132 0.25957
## factor(Brand)Opel 0.4394508 0.2161074 2.033 0.04391 *
## factor(Brand)Peugeot 0.4464299 0.2158382 2.068 0.04046 *
## factor(Brand)Polestar 0.2340210 0.2333450 1.003 0.31765
## factor(Brand)Porsche 0.9959578 0.2139653 4.655 7.49e-06 ***
## factor(Brand)Renault 0.1271533 0.2224062 0.572 0.56844
## factor(Brand)SEAT -0.0132473 0.2888402 -0.046 0.96348
## factor(Brand)Seres 0.3830813 0.2858710 1.340 0.18242
## factor(Brand)Skoda 0.1370833 0.2214439 0.619 0.53690
## factor(Brand)Smart -0.1166113 0.2381120 -0.490 0.62510
## factor(Brand)Sono -0.1778519 0.2860285 -0.622 0.53509
## factor(Brand)Tesla 0.2724989 0.2120148 1.285 0.20083
## factor(Brand)Toyota 0.7096048 0.2302383 3.082 0.00248 **
## factor(Brand)Volkswagen 0.0856139 0.2121164 0.404 0.68711
## factor(Brand)Volvo 0.3613887 0.2484041 1.455 0.14797
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2019 on 139 degrees of freedom
## Multiple R-squared: 0.8545, Adjusted R-squared: 0.8137
## F-statistic: 20.93 on 39 and 139 DF, p-value: < 2.2e-16
# R-squared of the log model that includes Car_Range ...
cat(
  "R² of log Model (with Car_Range):",
  summary(model_log)[["r.squared"]],
  "\n"
)
## R² of log Model (with Car_Range): 0.8561485
# ... and of the log model that omits it.
cat(
  "R² of Updated Log Model (without Car_Range):",
  summary(model_log_price)[["r.squared"]],
  "\n"
)
## R² of Updated Log Model (without Car_Range): 0.8544976
library(ggplot2)
library(dplyr)
library(tidyr)
library(gridExtra)
# Coefficient tables (estimate, std. error, t value, p value) of the two
# models fitted without Car_Range.
coef_price <- coef(summary(model_price)) # response: Price
coef_log_price <- coef(summary(model_log_price)) # response: log(Price_k)

# Pair the two sets of estimates by term name rather than by row position.
# coef(summary()) drops terms that could not be estimated, so positional
# pairing would silently misalign rows if the two tables ever differed.
stopifnot(setequal(rownames(coef_price), rownames(coef_log_price)))
shared_terms <- rownames(coef_price)

coef_df <- data.frame(
  Variable = shared_terms,
  Estimate_Price = coef_price[shared_terms, "Estimate"],
  # For a log response, exp(b) - 1 is the relative change in price, shown in %.
  Estimate_Log_Price = (exp(coef_log_price[shared_terms, "Estimate"]) - 1) * 100
)

# Keep only the brand dummies; each effect is relative to the reference brand
# (the first factor level, which has no coefficient of its own).
coef_df <- coef_df[grepl("factor\\(Brand\\)", coef_df$Variable), ]

# One bar chart per model. The constant `fill` string together with
# scale_fill_manual() gives each chart a single-entry legend.
p1 <- ggplot(coef_df, aes(x = Variable, y = Estimate_Price, fill = "Linear Model (€)")) +
  geom_col() +
  theme_minimal() +
  labs(
    title = "Brand Influence on Price (Linear Model)",
    x = "Brand", y = "Estimated Effect (€)"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_manual(values = c("Linear Model (€)" = "red"), name = "Model")

p2 <- ggplot(coef_df, aes(x = Variable, y = Estimate_Log_Price, fill = "Log Model (% Change)")) +
  geom_col() +
  theme_minimal() +
  labs(
    title = "Brand Influence on Price (Log Model)",
    x = "Brand", y = "Estimated Effect (% Change)"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_manual(values = c("Log Model (% Change)" = "blue"), name = "Model")

# Stack the two charts vertically.
library(gridExtra)
grid.arrange(p1, p2, ncol = 1)
# Residuals against fitted values for the linear price model.
p3 <- ggplot(
  data = data.frame(
    Fitted = fitted(model_price),
    Residuals = resid(model_price)
  ),
  mapping = aes(x = Fitted, y = Residuals)
) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_point(color = "red") +
  labs(
    title = "Residuals vs. Fitted (Linear Model)",
    x = "Fitted Values (€)",
    y = "Residuals"
  ) +
  theme_minimal()

# The same diagnostic for the log-price model.
p4 <- ggplot(
  data = data.frame(
    Fitted = fitted(model_log_price),
    Residuals = resid(model_log_price)
  ),
  mapping = aes(x = Fitted, y = Residuals)
) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_point(color = "blue") +
  labs(
    title = "Residuals vs. Fitted (Log Model)",
    x = "Fitted Values (log)",
    y = "Residuals"
  ) +
  theme_minimal()

# Show the two diagnostics side by side.
grid.arrange(p3, p4, ncol = 2)
# Let the Battery slope differ by brand: `Battery * factor(Brand)` expands to
# both main effects plus their interaction.
# NOTE(review): the summary reports interaction coefficients that are "not
# defined because of singularities", so not every brand-specific slope is
# estimable from this data.
model_interaction_bb <- lm(
  formula = log_Price ~ Battery * factor(Brand) + Accel,
  data = df_clean
)
summary(model_interaction_bb)
##
## Call:
## lm(formula = log_Price ~ Battery * factor(Brand) + Accel, data = df_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.52608 -0.06515 0.00000 0.06395 0.55683
##
## Coefficients: (16 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.1062155 0.3609347 8.606 3.84e-14 ***
## Battery 0.0099851 0.0043091 2.317 0.0222 *
## factor(Brand)Audi 0.0070847 0.4721091 0.015 0.9881
## factor(Brand)BMW 0.3241437 0.4194033 0.773 0.4411
## factor(Brand)Byton 0.5775563 1.0069895 0.574 0.5674
## factor(Brand)Citroen 0.4775432 0.5315553 0.898 0.3708
## factor(Brand)CUPRA 0.2403467 0.6337260 0.379 0.7052
## factor(Brand)Dacia -0.0519727 0.3286443 -0.158 0.8746
## factor(Brand)DS 0.0232492 0.2965041 0.078 0.9376
## factor(Brand)Fiat -0.1316935 0.7011893 -0.188 0.8513
## factor(Brand)Ford 0.5175062 0.8176705 0.633 0.5280
## factor(Brand)GMC -0.3538608 0.6493870 -0.545 0.5868
## factor(Brand)Honda 0.3592304 0.2872204 1.251 0.2135
## factor(Brand)Hyundai 0.3428502 0.4364295 0.786 0.4337
## factor(Brand)JAC -0.6670055 0.3054082 -2.184 0.0309 *
## factor(Brand)Jaguar 0.4899949 0.3014889 1.625 0.1068
## factor(Brand)Kia 0.1579080 0.4551594 0.347 0.7293
## factor(Brand)Lexus 0.4082635 0.2919948 1.398 0.1647
## factor(Brand)Lightyear 1.5012276 0.2879363 5.214 7.99e-07 ***
## factor(Brand)Lucid -0.1304923 0.9945684 -0.131 0.8958
## factor(Brand)Mazda 0.3314254 0.3187312 1.040 0.3005
## factor(Brand)Mercedes -0.2070857 0.4859280 -0.426 0.6708
## factor(Brand)MG -0.4019967 0.6752321 -0.595 0.5528
## factor(Brand)Mini 0.2333566 0.3220527 0.725 0.4701
## factor(Brand)Nissan 0.3989734 0.4123953 0.967 0.3353
## factor(Brand)Opel 0.2441375 0.5133109 0.476 0.6352
## factor(Brand)Peugeot 0.2284183 0.4951486 0.461 0.6454
## factor(Brand)Polestar 0.6017375 1.2962536 0.464 0.6434
## factor(Brand)Porsche -0.1698949 1.0598662 -0.160 0.8729
## factor(Brand)Renault 0.3412286 0.4462340 0.765 0.4460
## factor(Brand)SEAT 0.0282392 0.3162397 0.089 0.9290
## factor(Brand)Seres 0.3984058 0.2903835 1.372 0.1727
## factor(Brand)Skoda 0.1700651 0.6651676 0.256 0.7986
## factor(Brand)Smart -0.0499639 0.3044422 -0.164 0.8699
## factor(Brand)Sono -0.1544943 0.2944583 -0.525 0.6008
## factor(Brand)Tesla 0.6228468 0.3658724 1.702 0.0913 .
## factor(Brand)Toyota 0.9864561 0.6087544 1.620 0.1078
## factor(Brand)Volkswagen 0.0870034 0.2131840 0.408 0.6839
## factor(Brand)Volvo 0.3470050 0.2540663 1.366 0.1746
## Accel -0.0202602 0.0109951 -1.843 0.0679 .
## Battery:factor(Brand)Audi 0.0058806 0.0060102 0.978 0.3299
## Battery:factor(Brand)BMW 0.0016525 0.0054450 0.304 0.7620
## Battery:factor(Brand)Byton -0.0037648 0.0115942 -0.325 0.7460
## Battery:factor(Brand)Citroen 0.0009454 0.0088864 0.106 0.9155
## Battery:factor(Brand)CUPRA -0.0026980 0.0098359 -0.274 0.7843
## Battery:factor(Brand)Dacia NA NA NA NA
## Battery:factor(Brand)DS NA NA NA NA
## Battery:factor(Brand)Fiat 0.0057441 0.0184931 0.311 0.7566
## Battery:factor(Brand)Ford -0.0036837 0.0101755 -0.362 0.7180
## Battery:factor(Brand)GMC NA NA NA NA
## Battery:factor(Brand)Honda NA NA NA NA
## Battery:factor(Brand)Hyundai -0.0019470 0.0063618 -0.306 0.7601
## Battery:factor(Brand)JAC NA NA NA NA
## Battery:factor(Brand)Jaguar NA NA NA NA
## Battery:factor(Brand)Kia 0.0008006 0.0064311 0.124 0.9011
## Battery:factor(Brand)Lexus NA NA NA NA
## Battery:factor(Brand)Lightyear NA NA NA NA
## Battery:factor(Brand)Lucid 0.0084763 0.0108047 0.784 0.4343
## Battery:factor(Brand)Mazda NA NA NA NA
## Battery:factor(Brand)Mercedes 0.0082741 0.0059694 1.386 0.1683
## Battery:factor(Brand)MG 0.0074432 0.0111962 0.665 0.5075
## Battery:factor(Brand)Mini NA NA NA NA
## Battery:factor(Brand)Nissan -0.0024704 0.0055349 -0.446 0.6562
## Battery:factor(Brand)Opel 0.0039358 0.0085338 0.461 0.6455
## Battery:factor(Brand)Peugeot 0.0046090 0.0084371 0.546 0.5859
## Battery:factor(Brand)Polestar -0.0053716 0.0182252 -0.295 0.7687
## Battery:factor(Brand)Porsche 0.0140354 0.0130282 1.077 0.2835
## Battery:factor(Brand)Renault -0.0044573 0.0076217 -0.585 0.5598
## Battery:factor(Brand)SEAT NA NA NA NA
## Battery:factor(Brand)Seres NA NA NA NA
## Battery:factor(Brand)Skoda -0.0006337 0.0093196 -0.068 0.9459
## Battery:factor(Brand)Smart NA NA NA NA
## Battery:factor(Brand)Sono NA NA NA NA
## Battery:factor(Brand)Tesla -0.0041087 0.0044444 -0.924 0.3571
## Battery:factor(Brand)Toyota -0.0048903 0.0098600 -0.496 0.6208
## Battery:factor(Brand)Volkswagen NA NA NA NA
## Battery:factor(Brand)Volvo NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2026 on 118 degrees of freedom
## Multiple R-squared: 0.8756, Adjusted R-squared: 0.8124
## F-statistic: 13.84 on 60 and 118 DF, p-value: < 2.2e-16
library(ggplot2)
library(dplyr)
library(cluster)
library(factoextra)
library(scales)
# Technical specifications used as the clustering basis.
basis_vars <- c(
  "Battery", "Accel", "Top_Speed",
  "Car_Range", "Efficiency", "Charge_Speed"
)

# Keep only those columns and drop any row with a missing value.
df_cluster <- na.omit(select(df_clean, all_of(basis_vars)))

# Center and scale every column so that no variable dominates the distances.
df_scaled <- scale(df_cluster)
# Mean silhouette width of a k-means partition with `k` clusters.
#
# Arguments:
#   k      Number of clusters; must satisfy 2 <= k <= nrow(data) - 1.
#   data   Numeric matrix or data frame to cluster. Defaults to the global
#          `df_scaled`, so existing one-argument calls behave as before.
#   nstart Number of random starts passed on to kmeans().
#
# Returns a single number: the average silhouette width over all rows.
# kmeans() uses random starting centers, so call set.seed() beforehand if the
# result has to be reproducible.
silhouette_scores <- function(k, data = df_scaled, nstart = 10) {
  stopifnot(
    is.numeric(k), length(k) == 1,
    k >= 2, k <= nrow(data) - 1
  )
  km <- kmeans(data, centers = k, nstart = nstart)
  ss <- silhouette(km$cluster, dist(data))
  # "sil_width" is the per-observation silhouette column of the result.
  mean(ss[, "sil_width"])
}
# Candidate numbers of clusters.
k_values <- 2:10

# Seed the RNG before the sweep: kmeans() draws random starting centers, so
# without a seed the silhouette curve changes from run to run. The final
# k-means fit further down re-seeds, so its result is unaffected.
set.seed(42)

# One mean silhouette width per candidate k; vapply() enforces a numeric scalar.
sil_scores <- vapply(k_values, silhouette_scores, numeric(1))

# Plot Silhouette Scores
plot(k_values, sil_scores,
  type = "b", pch = 19, col = "blue",
  main = "Silhouette Scores for Different k",
  xlab = "Number of Clusters (k)", ylab = "Silhouette Score"
)
# Fix the RNG state so the random k-means starts, and hence the labels, repeat.
set.seed(42)

# Number of clusters, hard-coded by the author ("based on previous analysis").
optimal_k <- 5

# Fit k-means on the standardized features with 10 random starts.
kmeans_model <- kmeans(x = df_scaled, centers = optimal_k, nstart = 10)

# Store each car's cluster label as a factor column.
# NOTE(review): assumes df_scaled has exactly one row per row of df_clean, in
# the same order — confirm that na.omit() dropped nothing when it was built.
df_clean[["Cluster"]] <- as.factor(kmeans_model[["cluster"]])

# Visualise the clusters with factoextra.
fviz_cluster(
  kmeans_model,
  data = df_scaled,
  stand = FALSE,
  geom = "point",
  ellipse.type = "norm",
  palette = "jco",
  ggtheme = theme_minimal(),
  main = "K-Means Clustering of Electric Vehicles"
)
# Box plot of list price per cluster, with thousands separators on the y axis.
ggplot(
  data = df_clean,
  mapping = aes(x = Cluster, y = Price, fill = Cluster)
) +
  geom_boxplot() +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Price Distribution Across Clusters",
    x = "Cluster",
    y = "Price (€)"
  ) +
  theme_minimal()
# Cluster centers as returned by kmeans(). The model was fitted on df_scaled,
# so these values are in standardized (z-score) units, NOT the original
# measurement units: 0 is the overall mean of a variable, and +/-1 is one
# standard deviation above/below it. Per-cluster means in original units are
# computed separately in cluster_summary.
cluster_centers <- setNames(
  as.data.frame(kmeans_model$centers),
  basis_vars
)

# Print the standardized centers.
print(cluster_centers)
## Battery Accel Top_Speed Car_Range Efficiency Charge_Speed
## 1 0.6565797 -1.0903938 1.29966234 0.9664875 -0.23703305 1.54049321
## 2 -0.9621234 0.5491598 -0.67429612 -0.8378114 -0.80308363 -0.78525809
## 3 -0.4052781 1.6230104 -1.05990474 -0.9684564 1.52987254 -0.86859552
## 4 4.9769907 -1.6275644 2.39085694 2.9404014 1.56212638 0.43593670
## 5 0.2468222 -0.3271897 0.06219296 0.2937322 0.04919012 0.02730077
# Per-cluster means of the technical variables and of price, in original units.
# na.rm = TRUE is supplied through an explicit function: passing extra
# arguments via across(...) is deprecated as of dplyr 1.1.0.
cluster_summary <- df_clean %>%
  group_by(Cluster) %>%
  summarise(
    across(
      c(Battery, Accel, Top_Speed, Car_Range, Efficiency, Charge_Speed, Price),
      function(x) mean(x, na.rm = TRUE)
    )
  )

# Inspect the profile of each cluster in order to characterise it.
print(cluster_summary)
## # A tibble: 5 × 8
## Cluster Battery Accel Top_Speed Car_Range Efficiency Charge_Speed Price
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 83.6 4.40 229. 450 187. 848. 92088.
## 2 2 40.0 9.6 148. 237. 168. 277. 32093.
## 3 3 55.0 13.0 133. 222. 246. 257. 55129.
## 4 4 200 2.7 273. 683 247. 577. 132900
## 5 5 72.6 6.82 178. 371. 197. 476. 54156.
# Which brands dominate each cluster? Count the models per (cluster, brand)
# pair, then list the most frequent brands first within each cluster.
df_clean %>%
  count(Cluster, Brand, name = "count") %>%
  arrange(Cluster, desc(count))
## # A tibble: 61 × 3
## Cluster Brand count
## <fct> <fct> <int>
## 1 1 Porsche 10
## 2 1 Tesla 10
## 3 1 BMW 3
## 4 1 Hyundai 3
## 5 1 Kia 3
## 6 1 Lucid 3
## 7 1 Audi 2
## 8 1 Mercedes 2
## 9 2 Volkswagen 5
## 10 2 Fiat 4
## # ℹ 51 more rows
library(randomForest)
# Random forest that predicts the k-means cluster label from the six
# clustering variables. It is used to rank the variables by importance and to
# classify new vehicles. The seed makes the fitted forest reproducible.
set.seed(42)
rf_model <- randomForest(
  as.factor(Cluster) ~ Battery + Accel + Top_Speed + Car_Range + Efficiency + Charge_Speed,
  data = df_clean,
  ntree = 500
)

# Variable-importance plot for the fitted forest.
varImpPlot(rf_model, main = "Feature Importance in Clustering")
# Distribution of charge speed within each cluster.
ggplot(
  data = df_clean,
  mapping = aes(x = factor(Cluster), y = Charge_Speed, fill = factor(Cluster))
) +
  geom_boxplot() +
  theme_minimal() +
  labs(
    title = "Charge Speed Across Clusters",
    x = "Cluster",
    y = "Charge Speed (kW)"
  )
# Range against charge speed, coloured by cluster, to look for vehicles that
# do well on both.
ggplot(
  data = df_clean,
  mapping = aes(x = Car_Range, y = Charge_Speed, color = factor(Cluster))
) +
  geom_point() +
  theme_minimal() +
  labs(
    title = "Range vs. Charge Speed",
    x = "Car Range (km)",
    y = "Charge Speed (kW)"
  )
# Classify a hypothetical new vehicle: give its six specifications using the
# same column names the forest was trained on, then ask for the predicted
# cluster.
new_ev <- data.frame(
  Battery = 100,
  Accel = 5,
  Top_Speed = 200,
  Car_Range = 500,
  Efficiency = 180,
  Charge_Speed = 600
)
predict(rf_model, newdata = new_ev)
## 1
## 5
## Levels: 1 2 3 4 5