library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the Auto data set and keep only complete rows.
# "?" is the file's missing-value marker; character columns become factors.
auto <- na.omit(
  read.table(
    file = "Auto.data",
    header = TRUE,
    na.strings = "?",
    stringsAsFactors = TRUE
  )
)
# Manual classification of the Auto columns, used by the summaries below.
# Quantitative Variables: mpg, cylinders, displacement, horsepower, weight, acceleration, year
# Qualitative Variables: origin, name
quantitative_vars <- c(
  "mpg", "cylinders", "displacement", "horsepower",
  "weight", "acceleration", "year"
)
qualitative_vars <- c("origin", "name")
cat("Quantitative Variables:", paste(quantitative_vars, collapse = ", "), "\n")
## Quantitative Variables: mpg, cylinders, displacement, horsepower, weight, acceleration, year
# Terminate this line with a newline too, so that whatever is printed next
# starts on a fresh line (matches the cat() call above).
cat("Qualitative Variables:", paste(qualitative_vars, collapse = ", "), "\n")
## Qualitative Variables: origin, name
# Minimum (row 1) and maximum (row 2) of each declared quantitative column.
sapply(
  auto[quantitative_vars],
  function(column) range(column, na.rm = TRUE)
)
## mpg cylinders displacement horsepower weight acceleration year
## [1,] 9.0 3 68 46 1613 8.0 70
## [2,] 46.6 8 455 230 5140 24.8 82
# Mean and standard deviation of every quantitative column.
# Being stored as a number does not make a column quantitative: origin is
# numeric in the data but is listed in qualitative_vars, so it is excluded
# here rather than summarised as if its values were measurements.
quantitative_vars <- setdiff(
  names(auto)[vapply(auto, is.numeric, logical(1))],
  qualitative_vars
)
# vapply pins the result shape: one (Mean, SD) pair per column.
vapply(
  auto[quantitative_vars],
  function(x) c(Mean = mean(x, na.rm = TRUE), SD = sd(x, na.rm = TRUE)),
  FUN.VALUE = c(Mean = 0, SD = 0)
)
## mpg cylinders displacement horsepower weight acceleration
## Mean 23.445918 5.471939 194.412 104.46939 2977.5842 15.541327
## SD 7.805007 1.705783 104.644 38.49116 849.4026 2.758864
## year
## Mean 75.979592
## SD 3.683737
# Range of each quantitative column after removing rows 10 through 85
# (selected by position) from the cleaned data.
# origin is numeric in the data but is listed in qualitative_vars, so it is
# excluded from the quantitative set.
quantitative_vars <- setdiff(
  names(auto)[vapply(auto, is.numeric, logical(1))],
  qualitative_vars
)
auto_new <- auto[-(10:85), ]
# Row 1 is the minimum, row 2 the maximum.
vapply(
  auto_new[quantitative_vars],
  range,
  FUN.VALUE = numeric(2),
  na.rm = TRUE
)
## mpg cylinders displacement horsepower weight acceleration year
## [1,] 11.0 3 68 46 1649 8.5 70
## [2,] 46.6 8 455 230 4997 24.8 82
# Mean and standard deviation of each quantitative column in auto_new
# (the data with rows 10 through 85 removed).
# origin is numeric in the data but is listed in qualitative_vars, so it is
# excluded from the quantitative set.
quantitative_vars <- setdiff(
  names(auto)[vapply(auto, is.numeric, logical(1))],
  qualitative_vars
)
vapply(
  auto_new[quantitative_vars],
  function(x) c(Mean = mean(x, na.rm = TRUE), SD = sd(x, na.rm = TRUE)),
  FUN.VALUE = c(Mean = 0, SD = 0)
)
## mpg cylinders displacement horsepower weight acceleration
## Mean 24.404430 5.373418 187.24051 100.72152 2935.9715 15.726899
## SD 7.867283 1.654179 99.67837 35.70885 811.3002 2.693721
## year
## Mean 77.145570
## SD 3.106217
# Re-derive the quantitative column names from auto_new. origin is numeric in
# the data but is listed in qualitative_vars, so it is left out to stay
# consistent with the classification declared earlier in this script.
quantitative_vars <- setdiff(
  names(auto_new)[vapply(auto_new, is.numeric, logical(1))],
  qualitative_vars
)
str(auto) # Check structure of the dataset
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : num 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : num 3504 3693 3436 3433 3449 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
## - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
## ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...
# Numeric columns of the cleaned data, and the min (row 1) / max (row 2) of
# each one.
# NOTE(review): num_vars still contains origin, which this script lists in
# qualitative_vars; its range is the span of its stored values, not a
# measurement range. It is kept because later steps reuse num_vars.
num_vars <- auto %>% select(where(is.numeric))
# Work column by column on the data frame instead of apply(), which would
# first coerce the whole data frame to a matrix.
range_values <- vapply(num_vars, range, FUN.VALUE = numeric(2))
range_values
## mpg cylinders displacement horsepower weight acceleration year origin
## [1,] 9.0 3 68 46 1613 8.0 70 1
## [2,] 46.6 8 455 230 5140 24.8 82 3
# One-row summary holding the mean and standard deviation of every column of
# num_vars, named <column>_mean and <column>_sd.
# across() replaces the superseded summarise_all(); the select() afterwards
# restores the "all means, then all SDs" column order, because across()
# alone interleaves mean and sd per column.
summary_stats <- num_vars %>%
  summarise(across(everything(), list(mean = mean, sd = sd))) %>%
  select(ends_with("_mean"), ends_with("_sd"))
summary_stats
## mpg_mean cylinders_mean displacement_mean horsepower_mean weight_mean
## 1 23.44592 5.471939 194.412 104.4694 2977.584
## acceleration_mean year_mean origin_mean mpg_sd cylinders_sd displacement_sd
## 1 15.54133 75.97959 1.576531 7.805007 1.705783 104.644
## horsepower_sd weight_sd acceleration_sd year_sd origin_sd
## 1 38.49116 849.4026 2.758864 3.683737 0.8055182
# Repeat the range and mean/SD summaries after removing rows 10 through 85
# (selected by position) from the cleaned data.
auto_subset <- auto[-(10:85), ]
num_vars_subset <- auto_subset %>% select(where(is.numeric))
# Column-wise vapply avoids apply()'s coercion of the data frame to a matrix.
range_values_subset <- vapply(num_vars_subset, range, FUN.VALUE = numeric(2))
# across() replaces the superseded summarise_all(); select() restores the
# "all means, then all SDs" column order that across() would interleave.
summary_stats_subset <- num_vars_subset %>%
  summarise(across(everything(), list(mean = mean, sd = sd))) %>%
  select(ends_with("_mean"), ends_with("_sd"))
range_values_subset
## mpg cylinders displacement horsepower weight acceleration year origin
## [1,] 11.0 3 68 46 1649 8.5 70 1
## [2,] 46.6 8 455 230 4997 24.8 82 3
summary_stats_subset
## mpg_mean cylinders_mean displacement_mean horsepower_mean weight_mean
## 1 24.40443 5.373418 187.2405 100.7215 2935.972
## acceleration_mean year_mean origin_mean mpg_sd cylinders_sd displacement_sd
## 1 15.7269 77.14557 1.601266 7.867283 1.654179 99.67837
## horsepower_sd weight_sd acceleration_sd year_sd origin_sd
## 1 35.70885 811.3002 2.693721 3.106217 0.81991
# Draw every pairwise scatterplot of the numeric columns.
num_vars %>% pairs()

# Correlation matrix of the numeric columns; show only the mpg row, i.e. the
# correlation of mpg with each column.
correlations <- num_vars %>% cor()
correlations["mpg", ]
## mpg cylinders displacement horsepower weight acceleration
## 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442 0.4233285
## year origin
## 0.5805410 0.5652088
# Four scatterplots of mpg against individual predictors, on a 2 x 2 grid.
# par() returns the previous settings; they are saved and put back at the
# end so the grid layout does not leak into plots drawn later.
old_par <- par(mfrow = c(2, 2))
plot(
  auto$horsepower, auto$mpg,
  main = "MPG vs Horsepower", xlab = "Horsepower", ylab = "MPG", col = "blue"
)
plot(
  auto$weight, auto$mpg,
  main = "MPG vs Weight", xlab = "Weight", ylab = "MPG", col = "red"
)
plot(
  auto$displacement, auto$mpg,
  main = "MPG vs Displacement", xlab = "Displacement", ylab = "MPG",
  col = "green"
)
plot(
  auto$acceleration, auto$mpg,
  main = "MPG vs Acceleration", xlab = "Acceleration", ylab = "MPG",
  col = "purple"
)
par(old_par)

# `correlations` already holds cor(num_vars) from earlier in this script and
# num_vars has not changed since, so the matrix is reused rather than
# recomputed. Show the correlation of mpg with each numeric column.
correlations["mpg", ]
## mpg cylinders displacement horsepower weight acceleration
## 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442 0.4233285
## year origin
## 0.5805410 0.5652088