R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document is generated that includes both the content and the output of any embedded R code chunks. You can embed an R code chunk like this:

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
# Read the electric-cars CSV directly from its GitHub location; readr guesses
# the column types and reports them in the message printed below.
df <- read_csv(
  file = "https://raw.githubusercontent.com/gmoeser/AppliedStatisticalMethods/main/electric-cars.csv"
)
## Rows: 180 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Name, Drive
## dbl (8): BatterySize, Acceleration, TopSpeed, Range, Efficiency, ChargeSpeed...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df)
## # A tibble: 6 × 10
##   Name      BatterySize Acceleration TopSpeed Range Efficiency ChargeSpeed Drive
##   <chr>           <dbl>        <dbl>    <dbl> <dbl>      <dbl>       <dbl> <chr>
## 1 Tesla Ro…         200          2.1      410   970        206         920 All …
## 2 Tesla Mo…          90          2.6      262   455        198         680 All …
## 3 Porsche …          84          2.8      260   390        215         860 All …
## 4 Porsche …          84          2.9      250   380        220         790 All …
## 5 GMC Humm…         200          3        200   329        267         100 All …
## 6 Tesla Cy…         200          3        210   750        267         710 All …
## # ℹ 2 more variables: Seats <dbl>, Price <dbl>
dim(df)
## [1] 180  10
colnames(df)
##  [1] "Name"         "BatterySize"  "Acceleration" "TopSpeed"     "Range"       
##  [6] "Efficiency"   "ChargeSpeed"  "Drive"        "Seats"        "Price"
str(df)
## spc_tbl_ [180 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Name        : chr [1:180] "Tesla Roadster" "Tesla Model X Plaid" "Porsche Taycan Turbo S" "Porsche Taycan Turbo S Cross Turismo" ...
##  $ BatterySize : num [1:180] 200 90 84 84 200 200 110 90 84 76 ...
##  $ Acceleration: num [1:180] 2.1 2.6 2.8 2.9 3 3 3.2 3.2 3.2 3.3 ...
##  $ TopSpeed    : num [1:180] 410 262 260 250 200 210 270 250 260 261 ...
##  $ Range       : num [1:180] 970 455 390 380 329 750 660 555 400 470 ...
##  $ Efficiency  : num [1:180] 206 198 215 220 267 267 167 162 209 162 ...
##  $ ChargeSpeed : num [1:180] 920 680 860 790 100 710 1380 830 840 790 ...
##  $ Drive       : chr [1:180] "All Wheel Drive" "All Wheel Drive" "All Wheel Drive" "All Wheel Drive" ...
##  $ Seats       : num [1:180] 4 7 4 4 5 7 5 5 4 5 ...
##  $ Price       : num [1:180] 215000 116990 186336 187746 108700 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Name = col_character(),
##   ..   BatterySize = col_double(),
##   ..   Acceleration = col_double(),
##   ..   TopSpeed = col_double(),
##   ..   Range = col_double(),
##   ..   Efficiency = col_double(),
##   ..   ChargeSpeed = col_double(),
##   ..   Drive = col_character(),
##   ..   Seats = col_double(),
##   ..   Price = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
summary(df)
##      Name            BatterySize      Acceleration       TopSpeed    
##  Length:180         Min.   : 17.00   Min.   : 2.100   Min.   :123.0  
##  Class :character   1st Qu.: 45.00   1st Qu.: 5.575   1st Qu.:150.0  
##  Mode  :character   Median : 67.00   Median : 7.500   Median :160.0  
##                     Mean   : 65.94   Mean   : 7.859   Mean   :175.7  
##                     3rd Qu.: 77.00   3rd Qu.: 9.625   3rd Qu.:200.0  
##                     Max.   :200.00   Max.   :22.400   Max.   :410.0  
##      Range         Efficiency     ChargeSpeed        Drive          
##  Min.   : 95.0   Min.   :104.0   Min.   : 100.0   Length:180        
##  1st Qu.:258.8   1st Qu.:168.0   1st Qu.: 277.5   Class :character  
##  Median :340.0   Median :189.0   Median : 420.0   Mode  :character  
##  Mean   :336.2   Mean   :194.8   Mean   : 469.2                     
##  3rd Qu.:400.0   3rd Qu.:216.2   3rd Qu.: 575.0                     
##  Max.   :970.0   Max.   :281.0   Max.   :1410.0                     
##      Seats           Price       
##  Min.   :2.000   Min.   : 13270  
##  1st Qu.:5.000   1st Qu.: 37430  
##  Median :5.000   Median : 50000  
##  Mean   :5.167   Mean   : 57361  
##  3rd Qu.:5.000   3rd Qu.: 63175  
##  Max.   :7.000   Max.   :215000
# Number of missing values in each column. vapply() pins the result to one
# integer per column, so an unexpected input fails loudly instead of silently
# changing the shape of the result as sapply() can.
vapply(df, function(x) sum(is.na(x)), integer(1))
##         Name  BatterySize Acceleration     TopSpeed        Range   Efficiency 
##            0            0            0            0            0            0 
##  ChargeSpeed        Drive        Seats        Price 
##            0            0            0            0
library(tidyverse)
library(stringr)

## Task 1: Describe the Dataset

# Build the working table: shorten the column names, store the drivetrain as
# a factor, and add the price expressed in thousands.
df_clean <- df %>%
  rename(CarName = Name, Battery = BatterySize, Accel = Acceleration) %>%
  rename(Top_Speed = TopSpeed, Car_Range = Range, Charge_Speed = ChargeSpeed) %>%
  mutate(Drive = factor(Drive)) %>%
  mutate(Price_k = Price / 1000)

# Keep a single copy of any rows that are identical in every column
df_clean <- distinct(df_clean)

# Derive the brand from the first word of the car name and store it as a
# factor in a single step. word() already returns a plain character vector,
# so no intermediate as.character()/as.factor() round-trip is needed.
# NOTE: a brand name made of several words would be cut to its first word.
df_clean <- df_clean %>%
  mutate(Brand = factor(word(CarName, 1)))

# ✅ Check if Brand was correctly created
str(df_clean$Brand)  # Should say "Factor w/ X levels"
##  Factor w/ 38 levels "Aiways","Audi",..: 35 35 28 28 11 35 19 35 28 35 ...
table(df_clean$Brand)  # Should list different car brands
## 
##     Aiways       Audi        BMW      Byton    Citroen      CUPRA      Dacia 
##          1         17          7          3          6          4          1 
##         DS       Fiat       Ford        GMC      Honda    Hyundai        JAC 
##          1          4          5          1          2          9          1 
##     Jaguar        Kia      Lexus  Lightyear      Lucid      Mazda   Mercedes 
##          1          9          1          1          3          1          9 
##         MG       Mini     Nissan       Opel    Peugeot   Polestar    Porsche 
##          6          1          8          8          9          3         10 
##    Renault       SEAT      Seres      Skoda      Smart       Sono      Tesla 
##          6          1          1          5          3          1         14 
##     Toyota Volkswagen      Volvo 
##          4         10          2
# Keep only cars with complete values in every variable used by the
# regressions below
df_clean <- df_clean %>%
  drop_na(Price, Battery, Car_Range, Accel, Brand)

# Check final structure
str(df_clean)
## tibble [179 × 12] (S3: tbl_df/tbl/data.frame)
##  $ CarName     : chr [1:179] "Tesla Roadster" "Tesla Model X Plaid" "Porsche Taycan Turbo S" "Porsche Taycan Turbo S Cross Turismo" ...
##  $ Battery     : num [1:179] 200 90 84 84 200 200 110 90 84 76 ...
##  $ Accel       : num [1:179] 2.1 2.6 2.8 2.9 3 3 3.2 3.2 3.2 3.3 ...
##  $ Top_Speed   : num [1:179] 410 262 260 250 200 210 270 250 260 261 ...
##  $ Car_Range   : num [1:179] 970 455 390 380 329 750 660 555 400 470 ...
##  $ Efficiency  : num [1:179] 206 198 215 220 267 267 167 162 209 162 ...
##  $ Charge_Speed: num [1:179] 920 680 860 790 100 710 1380 830 840 790 ...
##  $ Drive       : Factor w/ 3 levels "All Wheel Drive",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Seats       : num [1:179] 4 7 4 4 5 7 5 5 4 5 ...
##  $ Price       : num [1:179] 215000 116990 186336 187746 108700 ...
##  $ Price_k     : num [1:179] 215 117 186 188 109 ...
##  $ Brand       : Factor w/ 38 levels "Aiways","Audi",..: 35 35 28 28 11 35 19 35 28 35 ...
summary(df_clean)
##    CarName             Battery           Accel          Top_Speed    
##  Length:179         Min.   : 17.00   Min.   : 2.100   Min.   :123.0  
##  Class :character   1st Qu.: 45.00   1st Qu.: 5.550   1st Qu.:150.0  
##  Mode  :character   Median : 67.00   Median : 7.500   Median :160.0  
##                     Mean   : 65.96   Mean   : 7.859   Mean   :175.8  
##                     3rd Qu.: 77.00   3rd Qu.: 9.650   3rd Qu.:200.0  
##                     Max.   :200.00   Max.   :22.400   Max.   :410.0  
##                                                                      
##    Car_Range       Efficiency     Charge_Speed                  Drive   
##  Min.   : 95.0   Min.   :104.0   Min.   : 100.0   All Wheel Drive  :64  
##  1st Qu.:257.5   1st Qu.:168.5   1st Qu.: 275.0   Front Wheel Drive:70  
##  Median :340.0   Median :189.0   Median : 420.0   Rear Wheel Drive :45  
##  Mean   :335.9   Mean   :195.0   Mean   : 469.8                         
##  3rd Qu.:400.0   3rd Qu.:216.5   3rd Qu.: 580.0                         
##  Max.   :970.0   Max.   :281.0   Max.   :1410.0                         
##                                                                         
##      Seats           Price           Price_k              Brand    
##  Min.   :2.000   Min.   : 13270   Min.   : 13.27   Audi      : 17  
##  1st Qu.:5.000   1st Qu.: 37270   1st Qu.: 37.27   Tesla     : 14  
##  Median :5.000   Median : 50000   Median : 50.00   Porsche   : 10  
##  Mean   :5.168   Mean   : 57448   Mean   : 57.45   Volkswagen: 10  
##  3rd Qu.:5.000   3rd Qu.: 63450   3rd Qu.: 63.45   Hyundai   :  9  
##  Max.   :7.000   Max.   :215000   Max.   :215.00   Kia       :  9  
##                                                    (Other)   :110
##      Battery        Accel    Top_Speed    Car_Range   Efficiency Charge_Speed 
##    "numeric"    "numeric"    "numeric"    "numeric"    "numeric"    "numeric" 
##        Seats        Price 
##    "numeric"    "numeric"

# Confirm the regression variables are numeric. vapply() guarantees exactly
# one class string per column and errors otherwise, unlike sapply(), whose
# return type depends on its input.
vapply(
  df_clean[, c("Battery", "Car_Range", "Accel", "Price")],
  class,
  character(1)
)
##   Battery Car_Range     Accel     Price 
## "numeric" "numeric" "numeric" "numeric"
# Baseline specification: price explained by battery size, range and
# acceleration, plus one dummy per brand (the first factor level serves as
# the reference category)
model_brand <- lm(
  formula = Price ~ Battery + Car_Range + Accel + factor(Brand),
  data = df_clean
)

# Coefficient table and goodness-of-fit statistics
summary(model_brand)
## 
## Call:
## lm(formula = Price ~ Battery + Car_Range + Accel + factor(Brand), 
##     data = df_clean)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -66314  -6689      0   5162  72270 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              1.038e+04  2.114e+04   0.491 0.624144    
## Battery                  6.192e+02  1.571e+02   3.942 0.000128 ***
## Car_Range               -7.357e-01  3.974e+01  -0.019 0.985258    
## Accel                   -1.753e+03  8.714e+02  -2.012 0.046212 *  
## factor(Brand)Audi        2.427e+04  1.812e+04   1.339 0.182830    
## factor(Brand)BMW         2.156e+04  1.878e+04   1.148 0.252823    
## factor(Brand)Byton       7.633e+03  2.043e+04   0.374 0.709217    
## factor(Brand)Citroen     2.920e+04  1.930e+04   1.513 0.132673    
## factor(Brand)CUPRA       3.969e+03  1.964e+04   0.202 0.840141    
## factor(Brand)Dacia       1.981e+04  2.544e+04   0.779 0.437427    
## factor(Brand)DS          7.233e+03  2.483e+04   0.291 0.771300    
## factor(Brand)Fiat        1.270e+04  1.973e+04   0.644 0.520915    
## factor(Brand)Ford        6.105e+03  1.926e+04   0.317 0.751792    
## factor(Brand)GMC        -2.001e+04  3.363e+04  -0.595 0.552768    
## factor(Brand)Honda       2.288e+04  2.168e+04   1.055 0.293140    
## factor(Brand)Hyundai     1.123e+04  1.848e+04   0.607 0.544537    
## factor(Brand)JAC        -5.588e+01  2.503e+04  -0.002 0.998222    
## factor(Brand)Jaguar      2.297e+04  2.503e+04   0.918 0.360370    
## factor(Brand)Kia         9.707e+03  1.848e+04   0.525 0.600334    
## factor(Brand)Lexus       1.955e+04  2.484e+04   0.787 0.432584    
## factor(Brand)Lightyear   1.194e+05  2.711e+04   4.406  2.1e-05 ***
## factor(Brand)Lucid       4.357e+04  2.091e+04   2.083 0.039090 *  
## factor(Brand)Mazda       2.266e+04  2.496e+04   0.908 0.365511    
## factor(Brand)Mercedes    2.648e+04  1.855e+04   1.428 0.155667    
## factor(Brand)MG          3.232e+03  1.894e+04   0.171 0.864740    
## factor(Brand)Mini        1.710e+04  2.499e+04   0.684 0.495016    
## factor(Brand)Nissan      1.078e+04  1.860e+04   0.580 0.563122    
## factor(Brand)Opel        2.535e+04  1.879e+04   1.349 0.179452    
## factor(Brand)Peugeot     2.644e+04  1.878e+04   1.408 0.161391    
## factor(Brand)Polestar    7.254e+03  2.027e+04   0.358 0.721018    
## factor(Brand)Porsche     7.449e+04  1.860e+04   4.006 0.000101 ***
## factor(Brand)Renault     1.644e+04  1.934e+04   0.850 0.396569    
## factor(Brand)SEAT        1.617e+04  2.508e+04   0.645 0.520178    
## factor(Brand)Seres       1.991e+04  2.482e+04   0.802 0.423761    
## factor(Brand)Skoda       5.005e+03  1.924e+04   0.260 0.795197    
## factor(Brand)Smart       2.008e+04  2.074e+04   0.968 0.334523    
## factor(Brand)Sono        1.988e+03  2.483e+04   0.080 0.936307    
## factor(Brand)Tesla       1.291e+04  1.842e+04   0.701 0.484538    
## factor(Brand)Toyota      3.885e+04  2.006e+04   1.937 0.054786 .  
## factor(Brand)Volkswagen  5.888e+03  1.844e+04   0.319 0.750042    
## factor(Brand)Volvo       1.250e+04  2.169e+04   0.576 0.565388    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17530 on 138 degrees of freedom
## Multiple R-squared:  0.7691, Adjusted R-squared:  0.7022 
## F-statistic: 11.49 on 40 and 138 DF,  p-value: < 2.2e-16
# Natural log of the price in thousands; response of the log-linear models
df_clean$log_Price <- log(df_clean$Price_k)

# Confirm the new column was appended
print(names(df_clean))
##  [1] "CarName"      "Battery"      "Accel"        "Top_Speed"    "Car_Range"   
##  [6] "Efficiency"   "Charge_Speed" "Drive"        "Seats"        "Price"       
## [11] "Price_k"      "Brand"        "log_Price"
# Same predictors as the baseline model, but with log price as the response
model_log <- lm(
  formula = log_Price ~ Battery + Car_Range + Accel + factor(Brand),
  data = df_clean
)
summary(model_log)
## 
## Call:
## lm(formula = log_Price ~ Battery + Car_Range + Accel + factor(Brand), 
##     data = df_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.84361 -0.07310  0.00000  0.08136  0.58298 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              3.3324014  0.2429955  13.714  < 2e-16 ***
## Battery                  0.0102826  0.0018058   5.694 7.16e-08 ***
## Car_Range               -0.0005750  0.0004569  -1.258  0.21035    
## Accel                   -0.0272332  0.0100186  -2.718  0.00740 ** 
## factor(Brand)Audi        0.4620486  0.2083732   2.217  0.02823 *  
## factor(Brand)BMW         0.4496072  0.2158603   2.083  0.03911 *  
## factor(Brand)Byton       0.2579602  0.2348241   1.099  0.27389    
## factor(Brand)Citroen     0.4982734  0.2219133   2.245  0.02634 *  
## factor(Brand)CUPRA       0.0920083  0.2257652   0.408  0.68424    
## factor(Brand)Dacia      -0.0838429  0.2924688  -0.287  0.77479    
## factor(Brand)DS         -0.0119046  0.2855088  -0.042  0.96680    
## factor(Brand)Fiat        0.0196698  0.2267937   0.087  0.93101    
## factor(Brand)Ford        0.2389043  0.2214701   1.079  0.28259    
## factor(Brand)GMC        -0.4294515  0.3865911  -1.111  0.26856    
## factor(Brand)Honda       0.2824865  0.2492404   1.133  0.25902    
## factor(Brand)Hyundai     0.2269541  0.2124889   1.068  0.28735    
## factor(Brand)JAC        -0.6917388  0.2877137  -2.404  0.01753 *  
## factor(Brand)Jaguar      0.4818728  0.2878152   1.674  0.09635 .  
## factor(Brand)Kia         0.2151521  0.2125157   1.012  0.31312    
## factor(Brand)Lexus       0.3690046  0.2855740   1.292  0.19846    
## factor(Brand)Lightyear   1.6575584  0.3116385   5.319 4.11e-07 ***
## factor(Brand)Lucid       0.7633725  0.2404511   3.175  0.00185 ** 
## factor(Brand)Mazda       0.2617056  0.2869945   0.912  0.36342    
## factor(Brand)Mercedes    0.5130294  0.2132596   2.406  0.01747 *  
## factor(Brand)MG          0.0026601  0.2177476   0.012  0.99027    
## factor(Brand)Mini        0.1558246  0.2873020   0.542  0.58844    
## factor(Brand)Nissan      0.2360932  0.2137908   1.104  0.27138    
## factor(Brand)Opel        0.4246864  0.2159740   1.966  0.05126 .  
## factor(Brand)Peugeot     0.4283003  0.2158676   1.984  0.04923 *  
## factor(Brand)Polestar    0.2464088  0.2330645   1.057  0.29224    
## factor(Brand)Porsche     0.9819502  0.2138073   4.593 9.76e-06 ***
## factor(Brand)Renault     0.1431220  0.2223031   0.644  0.52076    
## factor(Brand)SEAT       -0.0038199  0.2883329  -0.013  0.98945    
## factor(Brand)Seres       0.3740643  0.2853625   1.311  0.19209    
## factor(Brand)Skoda       0.1507110  0.2212455   0.681  0.49689    
## factor(Brand)Smart      -0.1414813  0.2384340  -0.593  0.55390    
## factor(Brand)Sono       -0.1824010  0.2854527  -0.639  0.52389    
## factor(Brand)Tesla       0.2851355  0.2118091   1.346  0.18045    
## factor(Brand)Toyota      0.6851192  0.2305787   2.971  0.00350 ** 
## factor(Brand)Volkswagen  0.1013254  0.2120403   0.478  0.63351    
## factor(Brand)Volvo       0.3274826  0.2493440   1.313  0.19124    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2015 on 138 degrees of freedom
## Multiple R-squared:  0.8561, Adjusted R-squared:  0.8145 
## F-statistic: 20.53 on 40 and 138 DF,  p-value: < 2.2e-16
# Drop Car_Range (not significant in either full model) and refit the linear
# model, so the fit can be compared before and after the removal
model_price <- lm(
  formula = Price ~ Battery + Accel + factor(Brand),
  data = df_clean
)
summary(model_price)
## 
## Call:
## lm(formula = Price ~ Battery + Accel + factor(Brand), data = df_clean)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -66233  -6687      0   5179  72195 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              10242.14   19714.97   0.520   0.6042    
## Battery                    616.67      81.06   7.607 3.82e-12 ***
## Accel                    -1746.58     797.61  -2.190   0.0302 *  
## factor(Brand)Audi        24290.48   18009.93   1.349   0.1796    
## factor(Brand)BMW         21557.69   18707.53   1.152   0.2512    
## factor(Brand)Byton        7670.32   20249.87   0.379   0.7054    
## factor(Brand)Citroen     29223.51   19177.35   1.524   0.1298    
## factor(Brand)CUPRA        3945.51   19527.14   0.202   0.8402    
## factor(Brand)Dacia       19796.44   25333.21   0.781   0.4359    
## factor(Brand)DS           7242.95   24738.58   0.293   0.7701    
## factor(Brand)Fiat        12692.77   19655.12   0.646   0.5195    
## factor(Brand)Ford         6116.48   19184.27   0.319   0.7503    
## factor(Brand)GMC        -19636.34   26789.29  -0.733   0.4648    
## factor(Brand)Honda       22907.33   21542.82   1.063   0.2895    
## factor(Brand)Hyundai     11221.88   18413.42   0.609   0.5432    
## factor(Brand)JAC           -63.33   24932.43  -0.003   0.9980    
## factor(Brand)Jaguar      23024.51   24799.66   0.928   0.3548    
## factor(Brand)Kia          9695.77   18408.35   0.527   0.5992    
## factor(Brand)Lexus       19573.71   24719.65   0.792   0.4298    
## factor(Brand)Lightyear  119223.45   24771.74   4.813 3.83e-06 ***
## factor(Brand)Lucid       43489.70   20417.83   2.130   0.0349 *  
## factor(Brand)Mazda       22689.57   24836.06   0.914   0.3625    
## factor(Brand)Mercedes    26477.08   18481.34   1.433   0.1542    
## factor(Brand)MG           3233.03   18871.79   0.171   0.8642    
## factor(Brand)Mini        17124.45   24857.32   0.689   0.4920    
## factor(Brand)Nissan      10786.53   18523.70   0.582   0.5613    
## factor(Brand)Opel        25366.40   18690.46   1.357   0.1769    
## factor(Brand)Peugeot     26459.85   18667.17   1.417   0.1586    
## factor(Brand)Polestar     7238.18   20181.29   0.359   0.7204    
## factor(Brand)Porsche     74509.99   18505.19   4.026 9.27e-05 ***
## factor(Brand)Renault     16423.39   19235.22   0.854   0.3947    
## factor(Brand)SEAT        16157.33   24980.89   0.647   0.5188    
## factor(Brand)Seres       19925.57   24724.09   0.806   0.4217    
## factor(Brand)Skoda        4987.44   19151.99   0.260   0.7949    
## factor(Brand)Smart       20116.51   20593.57   0.977   0.3303    
## factor(Brand)Sono         1993.58   24737.72   0.081   0.9359    
## factor(Brand)Tesla       12897.10   18336.50   0.703   0.4830    
## factor(Brand)Toyota      38880.40   19912.60   1.953   0.0529 .  
## factor(Brand)Volkswagen   5867.49   18345.29   0.320   0.7496    
## factor(Brand)Volvo       12541.21   21483.70   0.584   0.5603    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17460 on 139 degrees of freedom
## Multiple R-squared:  0.7691, Adjusted R-squared:  0.7043 
## F-statistic: 11.87 on 39 and 139 DF,  p-value: < 2.2e-16
# [[ ]] extracts the element by its exact name (no partial matching)
cat("R² of linear Model (with Car_Range):", summary(model_brand)[["r.squared"]], "\n")
## R² of linear Model (with Car_Range): 0.7691144
# [[ ]] extracts the element by its exact name (no partial matching)
cat("R² of Updated linear Model (without Car_Range):", summary(model_price)[["r.squared"]], "\n")
## R² of Updated linear Model (without Car_Range): 0.7691138
# Log-price counterpart of the reduced model (Car_Range removed)
model_log_price <- lm(
  formula = log_Price ~ Battery + Accel + factor(Brand),
  data = df_clean
)
summary(model_log_price)
## 
## Call:
## lm(formula = log_Price ~ Battery + Accel + factor(Brand), data = df_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.78086 -0.07121  0.00000  0.09153  0.58704 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              3.2248669  0.2279532  14.147  < 2e-16 ***
## Battery                  0.0083387  0.0009373   8.897 2.75e-15 ***
## Accel                   -0.0222504  0.0092223  -2.413  0.01714 *  
## factor(Brand)Audi        0.4814348  0.2082389   2.312  0.02225 *  
## factor(Brand)BMW         0.4472203  0.2163047   2.068  0.04054 *  
## factor(Brand)Byton       0.2875010  0.2341380   1.228  0.22156    
## factor(Brand)Citroen     0.5194757  0.2217370   2.343  0.02056 *  
## factor(Brand)CUPRA       0.0739531  0.2257815   0.328  0.74375    
## factor(Brand)Dacia      -0.0963179  0.2929139  -0.329  0.74278    
## factor(Brand)DS         -0.0039983  0.2860385  -0.014  0.98887    
## factor(Brand)Fiat        0.0172456  0.2272613   0.076  0.93962    
## factor(Brand)Ford        0.2479760  0.2218171   1.118  0.26553    
## factor(Brand)GMC        -0.1372555  0.3097496  -0.443  0.65837    
## factor(Brand)Honda       0.3055404  0.2490876   1.227  0.22203    
## factor(Brand)Hyundai     0.2224354  0.2129042   1.045  0.29794    
## factor(Brand)JAC        -0.6975641  0.2882799  -2.420  0.01682 *  
## factor(Brand)Jaguar      0.5208430  0.2867447   1.816  0.07146 .  
## factor(Brand)Kia         0.2063280  0.2128455   0.969  0.33404    
## factor(Brand)Lexus       0.3868599  0.2858196   1.354  0.17809    
## factor(Brand)Lightyear   1.5012638  0.2864219   5.241 5.79e-07 ***
## factor(Brand)Lucid       0.7028084  0.2360801   2.977  0.00343 ** 
## factor(Brand)Mazda       0.2814715  0.2871655   0.980  0.32870    
## factor(Brand)Mercedes    0.5095951  0.2136894   2.385  0.01844 *  
## factor(Brand)MG          0.0031979  0.2182040   0.015  0.98833    
## factor(Brand)Mini        0.1769799  0.2874114   0.616  0.53905    
## factor(Brand)Nissan      0.2424560  0.2141793   1.132  0.25957    
## factor(Brand)Opel        0.4394508  0.2161074   2.033  0.04391 *  
## factor(Brand)Peugeot     0.4464299  0.2158382   2.068  0.04046 *  
## factor(Brand)Polestar    0.2340210  0.2333450   1.003  0.31765    
## factor(Brand)Porsche     0.9959578  0.2139653   4.655 7.49e-06 ***
## factor(Brand)Renault     0.1271533  0.2224062   0.572  0.56844    
## factor(Brand)SEAT       -0.0132473  0.2888402  -0.046  0.96348    
## factor(Brand)Seres       0.3830813  0.2858710   1.340  0.18242    
## factor(Brand)Skoda       0.1370833  0.2214439   0.619  0.53690    
## factor(Brand)Smart      -0.1166113  0.2381120  -0.490  0.62510    
## factor(Brand)Sono       -0.1778519  0.2860285  -0.622  0.53509    
## factor(Brand)Tesla       0.2724989  0.2120148   1.285  0.20083    
## factor(Brand)Toyota      0.7096048  0.2302383   3.082  0.00248 ** 
## factor(Brand)Volkswagen  0.0856139  0.2121164   0.404  0.68711    
## factor(Brand)Volvo       0.3613887  0.2484041   1.455  0.14797    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2019 on 139 degrees of freedom
## Multiple R-squared:  0.8545, Adjusted R-squared:  0.8137 
## F-statistic: 20.93 on 39 and 139 DF,  p-value: < 2.2e-16
# [[ ]] extracts the element by its exact name (no partial matching)
cat("R² of log Model (with Car_Range):", summary(model_log)[["r.squared"]], "\n")
## R² of log Model (with Car_Range): 0.8561485
# [[ ]] extracts the element by its exact name (no partial matching)
cat("R² of Updated Log Model (without Car_Range):", summary(model_log_price)[["r.squared"]], "\n")
## R² of Updated Log Model (without Car_Range): 0.8544976
library(ggplot2)
library(dplyr)
library(tidyr)
library(gridExtra)


# Coefficient tables (estimate, std. error, t value, p value) of the two
# reduced models
coef_price <- coef(summary(model_price))          # linear model
coef_log_price <- coef(summary(model_log_price))  # log-linear model

# Brand dummies are the terms whose name contains "factor(Brand)"
brand_terms <- grep("factor\\(Brand\\)", rownames(coef_price), value = TRUE)

# Pair the two models by term name rather than by row position, and stop
# if the log model lacks any of the brand terms, so estimates can never be
# attached to the wrong brand.
stopifnot(all(brand_terms %in% rownames(coef_log_price)))

coef_df <- data.frame(
  Variable = brand_terms,
  Estimate_Price = coef_price[brand_terms, "Estimate"],
  # 100 * (exp(b) - 1) converts a log-scale coefficient into a percentage
  # change relative to the reference brand
  Estimate_Log_Price = (exp(coef_log_price[brand_terms, "Estimate"]) - 1) * 100
)

# One bar chart per model showing the estimated brand effects.
# - geom_col() is the direct form of geom_bar(stat = "identity").
# - The x values drop the "factor(Brand)" prefix that lm() puts in front of
#   each level, so the axis titled "Brand" shows plain brand names.
# - The constant fill inside aes() exists only to produce a legend entry.
p1 <- ggplot(
  coef_df,
  aes(
    x = sub("factor(Brand)", "", Variable, fixed = TRUE),
    y = Estimate_Price,
    fill = "Linear Model (€)"
  )
) +
  geom_col() +
  theme_minimal() +
  labs(title = "Brand Influence on Price (Linear Model)",
       x = "Brand", y = "Estimated Effect (€)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_manual(values = c("Linear Model (€)" = "red"), name = "Model")

p2 <- ggplot(
  coef_df,
  aes(
    x = sub("factor(Brand)", "", Variable, fixed = TRUE),
    y = Estimate_Log_Price,
    fill = "Log Model (% Change)"
  )
) +
  geom_col() +
  theme_minimal() +
  labs(title = "Brand Influence on Price (Log Model)",
       x = "Brand", y = "Estimated Effect (% Change)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_manual(values = c("Log Model (% Change)" = "blue"), name = "Model")

# Stack the two charts vertically
library(gridExtra)
grid.arrange(p1, p2, ncol = 1)

# Residuals against fitted values for the reduced linear model
p3 <- ggplot(
  data = data.frame(
    Fitted = fitted(model_price),
    Residuals = residuals(model_price)
  ),
  mapping = aes(x = Fitted, y = Residuals)
) +
  geom_point(color = "red") +
  geom_hline(yintercept = 0, linetype = "dashed") +
  theme_minimal() +
  ggtitle("Residuals vs. Fitted (Linear Model)") +
  xlab("Fitted Values (€)") +
  ylab("Residuals")

# The same diagnostic for the reduced log-linear model
p4 <- ggplot(
  data = data.frame(
    Fitted = fitted(model_log_price),
    Residuals = residuals(model_log_price)
  ),
  mapping = aes(x = Fitted, y = Residuals)
) +
  geom_point(color = "blue") +
  geom_hline(yintercept = 0, linetype = "dashed") +
  theme_minimal() +
  ggtitle("Residuals vs. Fitted (Log Model)") +
  xlab("Fitted Values (log)") +
  ylab("Residuals")

# Show the two diagnostics side by side
grid.arrange(p3, p4, ncol = 2)

# Battery x Brand interaction model.
# Regresses log_Price (presumably the log of Price; defined earlier in the
# file) on Battery, Brand and their interaction, with Accel as an additional
# covariate. `Battery * factor(Brand)` expands to both main effects plus one
# Battery:Brand term per non-reference brand, i.e. a brand-specific adjustment
# to the Battery slope.
model_interaction_bb <- lm(log_Price ~ Battery * factor(Brand) + Accel, data = df_clean)
# Print the coefficient table and fit statistics.
summary(model_interaction_bb)
## 
## Call:
## lm(formula = log_Price ~ Battery * factor(Brand) + Accel, data = df_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.52608 -0.06515  0.00000  0.06395  0.55683 
## 
## Coefficients: (16 not defined because of singularities)
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      3.1062155  0.3609347   8.606 3.84e-14 ***
## Battery                          0.0099851  0.0043091   2.317   0.0222 *  
## factor(Brand)Audi                0.0070847  0.4721091   0.015   0.9881    
## factor(Brand)BMW                 0.3241437  0.4194033   0.773   0.4411    
## factor(Brand)Byton               0.5775563  1.0069895   0.574   0.5674    
## factor(Brand)Citroen             0.4775432  0.5315553   0.898   0.3708    
## factor(Brand)CUPRA               0.2403467  0.6337260   0.379   0.7052    
## factor(Brand)Dacia              -0.0519727  0.3286443  -0.158   0.8746    
## factor(Brand)DS                  0.0232492  0.2965041   0.078   0.9376    
## factor(Brand)Fiat               -0.1316935  0.7011893  -0.188   0.8513    
## factor(Brand)Ford                0.5175062  0.8176705   0.633   0.5280    
## factor(Brand)GMC                -0.3538608  0.6493870  -0.545   0.5868    
## factor(Brand)Honda               0.3592304  0.2872204   1.251   0.2135    
## factor(Brand)Hyundai             0.3428502  0.4364295   0.786   0.4337    
## factor(Brand)JAC                -0.6670055  0.3054082  -2.184   0.0309 *  
## factor(Brand)Jaguar              0.4899949  0.3014889   1.625   0.1068    
## factor(Brand)Kia                 0.1579080  0.4551594   0.347   0.7293    
## factor(Brand)Lexus               0.4082635  0.2919948   1.398   0.1647    
## factor(Brand)Lightyear           1.5012276  0.2879363   5.214 7.99e-07 ***
## factor(Brand)Lucid              -0.1304923  0.9945684  -0.131   0.8958    
## factor(Brand)Mazda               0.3314254  0.3187312   1.040   0.3005    
## factor(Brand)Mercedes           -0.2070857  0.4859280  -0.426   0.6708    
## factor(Brand)MG                 -0.4019967  0.6752321  -0.595   0.5528    
## factor(Brand)Mini                0.2333566  0.3220527   0.725   0.4701    
## factor(Brand)Nissan              0.3989734  0.4123953   0.967   0.3353    
## factor(Brand)Opel                0.2441375  0.5133109   0.476   0.6352    
## factor(Brand)Peugeot             0.2284183  0.4951486   0.461   0.6454    
## factor(Brand)Polestar            0.6017375  1.2962536   0.464   0.6434    
## factor(Brand)Porsche            -0.1698949  1.0598662  -0.160   0.8729    
## factor(Brand)Renault             0.3412286  0.4462340   0.765   0.4460    
## factor(Brand)SEAT                0.0282392  0.3162397   0.089   0.9290    
## factor(Brand)Seres               0.3984058  0.2903835   1.372   0.1727    
## factor(Brand)Skoda               0.1700651  0.6651676   0.256   0.7986    
## factor(Brand)Smart              -0.0499639  0.3044422  -0.164   0.8699    
## factor(Brand)Sono               -0.1544943  0.2944583  -0.525   0.6008    
## factor(Brand)Tesla               0.6228468  0.3658724   1.702   0.0913 .  
## factor(Brand)Toyota              0.9864561  0.6087544   1.620   0.1078    
## factor(Brand)Volkswagen          0.0870034  0.2131840   0.408   0.6839    
## factor(Brand)Volvo               0.3470050  0.2540663   1.366   0.1746    
## Accel                           -0.0202602  0.0109951  -1.843   0.0679 .  
## Battery:factor(Brand)Audi        0.0058806  0.0060102   0.978   0.3299    
## Battery:factor(Brand)BMW         0.0016525  0.0054450   0.304   0.7620    
## Battery:factor(Brand)Byton      -0.0037648  0.0115942  -0.325   0.7460    
## Battery:factor(Brand)Citroen     0.0009454  0.0088864   0.106   0.9155    
## Battery:factor(Brand)CUPRA      -0.0026980  0.0098359  -0.274   0.7843    
## Battery:factor(Brand)Dacia              NA         NA      NA       NA    
## Battery:factor(Brand)DS                 NA         NA      NA       NA    
## Battery:factor(Brand)Fiat        0.0057441  0.0184931   0.311   0.7566    
## Battery:factor(Brand)Ford       -0.0036837  0.0101755  -0.362   0.7180    
## Battery:factor(Brand)GMC                NA         NA      NA       NA    
## Battery:factor(Brand)Honda              NA         NA      NA       NA    
## Battery:factor(Brand)Hyundai    -0.0019470  0.0063618  -0.306   0.7601    
## Battery:factor(Brand)JAC                NA         NA      NA       NA    
## Battery:factor(Brand)Jaguar             NA         NA      NA       NA    
## Battery:factor(Brand)Kia         0.0008006  0.0064311   0.124   0.9011    
## Battery:factor(Brand)Lexus              NA         NA      NA       NA    
## Battery:factor(Brand)Lightyear          NA         NA      NA       NA    
## Battery:factor(Brand)Lucid       0.0084763  0.0108047   0.784   0.4343    
## Battery:factor(Brand)Mazda              NA         NA      NA       NA    
## Battery:factor(Brand)Mercedes    0.0082741  0.0059694   1.386   0.1683    
## Battery:factor(Brand)MG          0.0074432  0.0111962   0.665   0.5075    
## Battery:factor(Brand)Mini               NA         NA      NA       NA    
## Battery:factor(Brand)Nissan     -0.0024704  0.0055349  -0.446   0.6562    
## Battery:factor(Brand)Opel        0.0039358  0.0085338   0.461   0.6455    
## Battery:factor(Brand)Peugeot     0.0046090  0.0084371   0.546   0.5859    
## Battery:factor(Brand)Polestar   -0.0053716  0.0182252  -0.295   0.7687    
## Battery:factor(Brand)Porsche     0.0140354  0.0130282   1.077   0.2835    
## Battery:factor(Brand)Renault    -0.0044573  0.0076217  -0.585   0.5598    
## Battery:factor(Brand)SEAT               NA         NA      NA       NA    
## Battery:factor(Brand)Seres              NA         NA      NA       NA    
## Battery:factor(Brand)Skoda      -0.0006337  0.0093196  -0.068   0.9459    
## Battery:factor(Brand)Smart              NA         NA      NA       NA    
## Battery:factor(Brand)Sono               NA         NA      NA       NA    
## Battery:factor(Brand)Tesla      -0.0041087  0.0044444  -0.924   0.3571    
## Battery:factor(Brand)Toyota     -0.0048903  0.0098600  -0.496   0.6208    
## Battery:factor(Brand)Volkswagen         NA         NA      NA       NA    
## Battery:factor(Brand)Volvo              NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2026 on 118 degrees of freedom
## Multiple R-squared:  0.8756, Adjusted R-squared:  0.8124 
## F-statistic: 13.84 on 60 and 118 DF,  p-value: < 2.2e-16
library(ggplot2)
library(dplyr)
library(cluster)
library(factoextra)
library(scales)
# Features the clustering is based on
basis_vars <- c(
  "Battery", "Accel", "Top_Speed",
  "Car_Range", "Efficiency", "Charge_Speed"
)

# Keep only those features and drop any row with a missing value ...
df_cluster <- na.omit(select(df_clean, all_of(basis_vars)))
# ... then centre and scale every column (scale() defaults)
df_scaled <- scale(df_cluster)
# Mean silhouette width of a k-means partition into k clusters.
#
# Args:
#   k:        Number of clusters passed to kmeans().
#   data:     Numeric matrix or data frame to cluster. Defaults to the
#             script-level `df_scaled`, so existing one-argument calls such as
#             silhouette_scores(3) keep working unchanged.
#   nstart:   Number of random starts for kmeans().
#   dist_mat: Pairwise distances between the rows of `data`. Defaults to
#             dist(data); pass a precomputed object to avoid recomputing it
#             when the function is called for many values of k.
#
# Returns:
#   A single numeric value: the average silhouette width, or NA_real_ when the
#   silhouette is undefined (silhouette() returns NA instead of a matrix when
#   there are fewer than 2 clusters or as many clusters as observations).
silhouette_scores <- function(k, data = df_scaled, nstart = 10,
                              dist_mat = dist(data)) {
  km <- kmeans(data, centers = k, nstart = nstart)
  ss <- silhouette(km$cluster, dist_mat)

  # Guard the degenerate case: indexing the NA result by column would error.
  if (!is.matrix(ss)) {
    return(NA_real_)
  }

  # Select the width column by name rather than by position.
  mean(ss[, "sil_width"])
}

# Average silhouette width for k = 2..10.
# kmeans() starts from random centres, so the RNG is seeded first; without a
# seed the curve (and possibly the k it suggests) changes from knit to knit.
set.seed(42)
k_values <- 2:10
# vapply() guarantees one numeric value per k (sapply() would silently change
# its return type on unexpected output).
sil_scores <- vapply(k_values, silhouette_scores, numeric(1))

# Plot Silhouette Scores
plot(k_values, sil_scores, type = "b", pch = 19, col = "blue",
     main = "Silhouette Scores for Different k",
     xlab = "Number of Clusters (k)", ylab = "Silhouette Score")

set.seed(42)  # Ensure reproducibility
optimal_k <- 5  # Based on previous analysis

# Run K-Means on the standardized features
kmeans_model <- kmeans(df_scaled, centers = optimal_k, nstart = 10)

# Add cluster labels to the dataset.
# df_scaled was built from the na.omit()-filtered feature table, so it can have
# fewer rows than df_clean. The labels are therefore written only to the rows
# that are complete in the clustering features; any other row gets NA. When no
# row was dropped this is identical to as.factor(kmeans_model$cluster).
complete_rows <- complete.cases(df_clean[basis_vars])
stopifnot(sum(complete_rows) == length(kmeans_model$cluster))
cluster_id <- rep(NA_integer_, nrow(df_clean))
cluster_id[complete_rows] <- kmeans_model$cluster
df_clean$Cluster <- factor(cluster_id, levels = seq_len(optimal_k))

# Visualise the partition (stand = FALSE: df_scaled is already standardized)
fviz_cluster(kmeans_model, data = df_scaled, ellipse.type = "norm",
             geom = "point", stand = FALSE,
             palette = "jco", ggtheme = theme_minimal(),
             main = "K-Means Clustering of Electric Vehicles")

# Price distribution per cluster, one box per cluster; y axis labels are
# formatted with thousands separators.
ggplot(data = df_clean, mapping = aes(x = Cluster, y = Price, fill = Cluster)) +
  geom_boxplot() +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Price Distribution Across Clusters",
    x = "Cluster",
    y = "Price (€)"
  ) +
  theme_minimal()

# Cluster centres exactly as returned by kmeans(). The model was fitted on
# df_scaled, so these values are in standardized units (relative to each
# feature's overall mean and standard deviation), NOT in the original
# measurement units. Per-cluster means in original units are computed in
# cluster_summary.
cluster_centers <- setNames(as.data.frame(kmeans_model$centers), basis_vars)

# Show the standardized centres
print(cluster_centers)
##      Battery      Accel   Top_Speed  Car_Range  Efficiency Charge_Speed
## 1  0.6565797 -1.0903938  1.29966234  0.9664875 -0.23703305   1.54049321
## 2 -0.9621234  0.5491598 -0.67429612 -0.8378114 -0.80308363  -0.78525809
## 3 -0.4052781  1.6230104 -1.05990474 -0.9684564  1.52987254  -0.86859552
## 4  4.9769907 -1.6275644  2.39085694  2.9404014  1.56212638   0.43593670
## 5  0.2468222 -0.3271897  0.06219296  0.2937322  0.04919012   0.02730077
# Per-cluster means of the clustering features and Price, in original units.
# The averaging function is spelled out explicitly: passing extra arguments
# through across(..., mean, na.rm = TRUE) is deprecated since dplyr 1.1.0.
cluster_summary <- df_clean %>%
  group_by(Cluster) %>%
  summarise(across(
    c(Battery, Accel, Top_Speed, Car_Range, Efficiency, Charge_Speed, Price),
    function(x) mean(x, na.rm = TRUE)
  ))
# Inspect the feature profile of each cluster to help categorize it
print(cluster_summary)
## # A tibble: 5 × 8
##   Cluster Battery Accel Top_Speed Car_Range Efficiency Charge_Speed   Price
##   <fct>     <dbl> <dbl>     <dbl>     <dbl>      <dbl>        <dbl>   <dbl>
## 1 1          83.6  4.40      229.      450        187.         848.  92088.
## 2 2          40.0  9.6       148.      237.       168.         277.  32093.
## 3 3          55.0 13.0       133.      222.       246.         257.  55129.
## 4 4         200    2.7       273.      683        247.         577. 132900 
## 5 5          72.6  6.82      178.      371.       197.         476.  54156.
# Identify the top brands in each cluster: count cars per (Cluster, Brand)
# pair and list the most frequent brands first within each cluster.
df_clean %>%
  ungroup() %>%
  count(Cluster, Brand, name = "count") %>%
  arrange(Cluster, desc(count))
## # A tibble: 61 × 3
##    Cluster Brand      count
##    <fct>   <fct>      <int>
##  1 1       Porsche       10
##  2 1       Tesla         10
##  3 1       BMW            3
##  4 1       Hyundai        3
##  5 1       Kia            3
##  6 1       Lucid          3
##  7 1       Audi           2
##  8 1       Mercedes       2
##  9 2       Volkswagen     5
## 10 2       Fiat           4
## # ℹ 51 more rows
library(randomForest)

# Fit a random forest that predicts the cluster label from the six clustering
# features; seeded so the forest is reproducible.
set.seed(42)
rf_model <- randomForest(
  formula = as.factor(Cluster) ~ Battery + Accel + Top_Speed + Car_Range + Efficiency + Charge_Speed,
  data = df_clean,
  ntree = 500
)

# Variable importance plot for the fitted forest
varImpPlot(rf_model, main = "Feature Importance in Clustering")

# Charge speed distribution per cluster
ggplot(
  data = df_clean,
  mapping = aes(x = factor(Cluster), y = Charge_Speed, fill = factor(Cluster))
) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Charge Speed Across Clusters", x = "Cluster", y = "Charge Speed (kW)")

# Range against charge speed, coloured by cluster, to look for sweet spots
ggplot(
  data = df_clean,
  mapping = aes(x = Car_Range, y = Charge_Speed, color = factor(Cluster))
) +
  geom_point() +
  theme_minimal() +
  labs(title = "Range vs. Charge Speed", x = "Car Range (km)", y = "Charge Speed (kW)")

# Predict which cluster a hypothetical new EV would be assigned to.
# The single-row data frame carries the six features the forest was trained on.
new_ev <- data.frame(
  Battery = 100,
  Accel = 5,
  Top_Speed = 200,
  Car_Range = 500,
  Efficiency = 180,
  Charge_Speed = 600
)

predict(rf_model, newdata = new_ev)
## 1 
## 5 
## Levels: 1 2 3 4 5