library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the Auto data set. "?" in the raw file marks a missing value and
# character columns are read in as factors. Rows containing any NA are
# dropped right away so every later summary works on complete cases.
auto <- na.omit(
  read.table(
    file = "Auto.data",
    header = TRUE,
    na.strings = "?",
    stringsAsFactors = TRUE
  )
)
# Manual classification of the Auto columns.
# Quantitative: mpg, cylinders, displacement, horsepower, weight,
#               acceleration, year
# Qualitative:  origin, name
quantitative_vars <- c(
  "mpg", "cylinders", "displacement", "horsepower",
  "weight", "acceleration", "year"
)
qualitative_vars <- c("origin", "name")

# Both lines end with a newline so whatever is printed next starts on a
# fresh line when the script runs outside knitr.
cat("Quantitative Variables:", paste(quantitative_vars, collapse = ", "), "\n")
## Quantitative Variables: mpg, cylinders, displacement, horsepower, weight, acceleration, year
cat("Qualitative Variables:", paste(qualitative_vars, collapse = ", "), "\n")
## Qualitative Variables: origin, name
# Minimum (row 1) and maximum (row 2) of each quantitative column in the
# full data set. FUN.VALUE pins every per-column result to two numbers.
vapply(auto[quantitative_vars], range, FUN.VALUE = numeric(2), na.rm = TRUE)
##       mpg cylinders displacement horsepower weight acceleration year
## [1,]  9.0         3           68         46   1613          8.0   70
## [2,] 46.6         8          455        230   5140         24.8   82
# Mean and standard deviation of each quantitative column (full data set).
# The numeric columns are detected automatically, but anything declared
# qualitative above is removed again: `origin` is stored as an integer code,
# so is.numeric() alone would wrongly treat it as a measurement.
quantitative_vars <- setdiff(names(Filter(is.numeric, auto)), qualitative_vars)
vapply(
  auto[quantitative_vars],
  function(x) c(Mean = mean(x, na.rm = TRUE), SD = sd(x, na.rm = TRUE)),
  FUN.VALUE = c(Mean = 0, SD = 0)
)
##            mpg cylinders displacement horsepower    weight acceleration
## Mean 23.445918  5.471939      194.412  104.46939 2977.5842    15.541327
## SD    7.805007  1.705783      104.644   38.49116  849.4026     2.758864
##           year
## Mean 75.979592
## SD    3.683737
# Drop the 10th through 85th rows and look at the ranges of what remains.
auto_new <- auto[-(10:85), ]

# Quantitative columns are taken from the reduced data itself; columns
# declared qualitative (the integer-coded `origin` in particular) are
# excluded so the table matches the full-data range table.
quantitative_vars <- setdiff(
  names(Filter(is.numeric, auto_new)),
  qualitative_vars
)

# Row 1 is the minimum, row 2 the maximum of each column.
vapply(auto_new[quantitative_vars], range, FUN.VALUE = numeric(2), na.rm = TRUE)
##       mpg cylinders displacement horsepower weight acceleration year
## [1,] 11.0         3           68         46   1649          8.5   70
## [2,] 46.6         8          455        230   4997         24.8   82
# Mean and standard deviation of each quantitative column after the row
# removal. The column list is derived from auto_new (the object actually
# summarised) and excludes declared qualitative columns such as the
# integer-coded `origin`.
quantitative_vars <- setdiff(
  names(Filter(is.numeric, auto_new)),
  qualitative_vars
)
vapply(
  auto_new[quantitative_vars],
  function(x) c(Mean = mean(x, na.rm = TRUE), SD = sd(x, na.rm = TRUE)),
  FUN.VALUE = c(Mean = 0, SD = 0)
)
##            mpg cylinders displacement horsepower    weight acceleration
## Mean 24.404430  5.373418    187.24051  100.72152 2935.9715    15.726899
## SD    7.867283  1.654179     99.67837   35.70885  811.3002     2.693721
##           year
## Mean 77.145570
## SD    3.106217
# Names of all numeric columns of the reduced data set. This is a pure type
# test, so the integer-coded `origin` column is part of the result.
quantitative_vars <- names(Filter(is.numeric, auto_new))

# Compact overview of the full data set: column types and first values.
str(object = auto)
## 'data.frame':    392 obs. of  9 variables:
##  $ mpg         : num  18 15 18 16 17 15 14 14 14 15 ...
##  $ cylinders   : int  8 8 8 8 8 8 8 8 8 8 ...
##  $ displacement: num  307 350 318 304 302 429 454 440 455 390 ...
##  $ horsepower  : num  130 165 150 150 140 198 220 215 225 190 ...
##  $ weight      : num  3504 3693 3436 3433 3449 ...
##  $ acceleration: num  12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ year        : int  70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ name        : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
##  - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
##   ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...
# Numeric-column summary of the full data set (dplyr version).
# where(is.numeric) keeps every numeric column, which includes the
# integer-coded `origin`.
num_vars <- auto %>% select(where(is.numeric))

# Column-wise minimum (row 1) and maximum (row 2). vapply() walks the data
# frame columns directly; apply() would first coerce the frame to a matrix.
range_values <- vapply(num_vars, range, FUN.VALUE = numeric(2))
range_values
##       mpg cylinders displacement horsepower weight acceleration year origin
## [1,]  9.0         3           68         46   1613          8.0   70      1
## [2,] 46.6         8          455        230   5140         24.8   82      3
# Mean and SD of every column in a single row. across() replaces the
# superseded summarise_all(); the trailing select() keeps the layout used so
# far (all *_mean columns first, then all *_sd columns).
summary_stats <- num_vars %>%
  summarise(across(everything(), list(mean = mean, sd = sd))) %>%
  select(ends_with("_mean"), ends_with("_sd"))
summary_stats
##   mpg_mean cylinders_mean displacement_mean horsepower_mean weight_mean
## 1 23.44592       5.471939           194.412        104.4694    2977.584
##   acceleration_mean year_mean origin_mean   mpg_sd cylinders_sd displacement_sd
## 1          15.54133  75.97959    1.576531 7.805007     1.705783         104.644
##   horsepower_sd weight_sd acceleration_sd  year_sd origin_sd
## 1      38.49116  849.4026        2.758864 3.683737 0.8055182
# Same numeric summary after dropping the 10th through 85th rows.
auto_subset <- auto[-(10:85), ]
num_vars_subset <- auto_subset %>% select(where(is.numeric))

# Column-wise minimum (row 1) and maximum (row 2), computed per column
# rather than through apply()'s matrix coercion.
range_values_subset <- vapply(num_vars_subset, range, FUN.VALUE = numeric(2))

# Mean and SD per column via across() (summarise_all() is superseded); the
# select() keeps all *_mean columns ahead of all *_sd columns.
summary_stats_subset <- num_vars_subset %>%
  summarise(across(everything(), list(mean = mean, sd = sd))) %>%
  select(ends_with("_mean"), ends_with("_sd"))

range_values_subset
##       mpg cylinders displacement horsepower weight acceleration year origin
## [1,] 11.0         3           68         46   1649          8.5   70      1
## [2,] 46.6         8          455        230   4997         24.8   82      3
summary_stats_subset
##   mpg_mean cylinders_mean displacement_mean horsepower_mean weight_mean
## 1 24.40443       5.373418          187.2405        100.7215    2935.972
##   acceleration_mean year_mean origin_mean   mpg_sd cylinders_sd displacement_sd
## 1           15.7269  77.14557    1.601266 7.867283     1.654179        99.67837
##   horsepower_sd weight_sd acceleration_sd  year_sd origin_sd
## 1      35.70885  811.3002        2.693721 3.106217   0.81991
# Scatterplot matrix: every numeric column plotted against every other one.
pairs(x = num_vars)

# Pairwise correlations between the numeric columns (cor() with its default
# method); only the row relating mpg to each column is printed.
correlations <- cor(x = num_vars)
correlations["mpg", , drop = TRUE]
##          mpg    cylinders displacement   horsepower       weight acceleration 
##    1.0000000   -0.7776175   -0.8051269   -0.7784268   -0.8322442    0.4233285 
##         year       origin 
##    0.5805410    0.5652088
# 2 x 2 panel of mpg against four predictors. par() returns the previous
# settings; they are restored after the loop so later plots are not drawn
# into the leftover 2 x 2 grid.
old_par <- par(mfrow = c(2, 2))

# Axis label -> column name, with one point colour per panel (same order).
predictors <- c(
  Horsepower = "horsepower",
  Weight = "weight",
  Displacement = "displacement",
  Acceleration = "acceleration"
)
point_cols <- c("blue", "red", "green", "purple")

for (i in seq_along(predictors)) {
  axis_label <- names(predictors)[i]
  plot(
    auto[[predictors[[i]]]], auto$mpg,
    main = paste("MPG vs", axis_label),
    xlab = axis_label, ylab = "MPG", col = point_cols[i]
  )
}
par(old_par)

# Correlation of mpg with other variables. The matrix `correlations` was
# already computed from num_vars earlier in the script, so it is reused here
# instead of being recomputed.
correlations["mpg", ]
##          mpg    cylinders displacement   horsepower       weight acceleration 
##    1.0000000   -0.7776175   -0.8051269   -0.7784268   -0.8322442    0.4233285 
##         year       origin 
##    0.5805410    0.5652088