Exercise 1

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Load the Auto data set, keeping text columns as plain character vectors,
# then print a compact summary of each column's type and leading values.
auto_data <- utils::read.csv(file = "Auto.csv", stringsAsFactors = FALSE)
utils::str(auto_data)

## 'data.frame':    397 obs. of  9 variables:
##  $ mpg         : num  18 15 18 16 17 15 14 14 14 15 ...
##  $ cylinders   : int  8 8 8 8 8 8 8 8 8 8 ...
##  $ displacement: num  307 350 318 304 302 429 454 440 455 390 ...
##  $ horsepower  : chr  "130" "165" "150" "150" ...
##  $ weight      : int  3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
##  $ acceleration: num  12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ year        : int  70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ name        : chr  "chevrolet chevelle malibu" "buick skylark 320" "plymouth satellite" "amc rebel sst" ...

# Split the columns by storage type: character columns are reported as
# qualitative, numeric (integer or double) columns as quantitative.
# vapply() pins the result to exactly one logical per column, whereas
# sapply() may silently change its return type.
# NOTE(review): horsepower is read as character (see the str() output), so it
# is listed as qualitative here even though it holds numbers. origin is stored
# as an integer and is therefore listed as quantitative -- presumably it is a
# category code; confirm before interpreting its summary statistics.
is_character_col <- vapply(auto_data, is.character, logical(1))
is_numeric_col <- vapply(auto_data, is.numeric, logical(1))
categorical_vars <- names(auto_data)[is_character_col]
numerical_vars <- names(auto_data)[is_numeric_col]
cat("Quantitative Variables:\n")

## Quantitative Variables:

print(numerical_vars)

## [1] "mpg"          "cylinders"    "displacement" "weight"       "acceleration"
## [6] "year"         "origin"

cat("\nQualitative Variables:\n")

## 
## Qualitative Variables:

print(categorical_vars)

## [1] "horsepower" "name"

# Reload the raw table and coerce horsepower from character to numeric.
# Entries that cannot be parsed as numbers become NA, which is what triggers
# the coercion warning shown below.
auto_data <- read.csv(file = "Auto.csv", stringsAsFactors = FALSE)
auto_data[["horsepower"]] <- as.numeric(auto_data[["horsepower"]])

## Warning: NAs introduced by coercion

# Print the minimum and maximum of every quantitative column, ignoring NAs.
quantitative_vars <- c(
  "mpg", "cylinders", "displacement", "horsepower",
  "weight", "acceleration", "year", "origin"
)
for (col_name in quantitative_vars) {
  col_range <- range(auto_data[[col_name]], na.rm = TRUE)
  cat("Range of", col_name, ":", col_range, "\n")
}

## Range of mpg : 9 46.6 
## Range of cylinders : 3 8 
## Range of displacement : 68 455 
## Range of horsepower : 46 230 
## Range of weight : 1613 5140 
## Range of acceleration : 8 24.8 
## Range of year : 70 82 
## Range of origin : 1 3

# Print the mean and standard deviation of every quantitative column.
# horsepower has already been coerced to numeric earlier in the script (its
# unparseable entries are NA by now), so comparing it to "?" again could never
# match and a second as.numeric() would change nothing; both steps are
# omitted. na.rm = TRUE skips the missing horsepower values.
quantitative_vars <- c(
  "mpg", "cylinders", "displacement", "horsepower",
  "weight", "acceleration", "year", "origin"
)
# The loop variable is not called `var`, to avoid shadowing stats::var().
for (col_name in quantitative_vars) {
  col_values <- auto_data[[col_name]]
  cat("Mean of", col_name, ":", mean(col_values, na.rm = TRUE), "\n")
  cat(
    "Standard Deviation of", col_name, ":",
    sd(col_values, na.rm = TRUE), "\n\n"
  )
}

## Mean of mpg : 23.51587 
## Standard Deviation of mpg : 7.825804 
## 
## Mean of cylinders : 5.458438 
## Standard Deviation of cylinders : 1.701577 
## 
## Mean of displacement : 193.5327 
## Standard Deviation of displacement : 104.3796 
## 
## Mean of horsepower : 104.4694 
## Standard Deviation of horsepower : 38.49116 
## 
## Mean of weight : 2970.262 
## Standard Deviation of weight : 847.9041 
## 
## Mean of acceleration : 15.55567 
## Standard Deviation of acceleration : 2.749995 
## 
## Mean of year : 75.99496 
## Standard Deviation of year : 3.690005 
## 
## Mean of origin : 1.574307 
## Standard Deviation of origin : 0.8025495

# Drop the 10th through 85th observations (by position), then report the
# range, mean and standard deviation of each quantitative column of the
# remaining rows, ignoring NAs.
auto_subset <- auto_data[-seq(10, 85), ]
quantitative_vars <- c(
  "mpg", "cylinders", "displacement", "horsepower",
  "weight", "acceleration", "year", "origin"
)
for (col_name in quantitative_vars) {
  col_values <- auto_subset[[col_name]]
  col_range <- range(col_values, na.rm = TRUE)
  col_mean <- mean(col_values, na.rm = TRUE)
  col_sd <- sd(col_values, na.rm = TRUE)
  cat("Variable:", col_name, "\n")
  cat("Range:", col_range, "\n")
  cat("Mean:", col_mean, "\n")
  cat("Standard Deviation:", col_sd, "\n\n")
}

## Variable: mpg 
## Range: 11 46.6 
## Mean: 24.43863 
## Standard Deviation: 7.908184 
## 
## Variable: cylinders 
## Range: 3 8 
## Mean: 5.370717 
## Standard Deviation: 1.653486 
## 
## Variable: displacement 
## Range: 68 455 
## Mean: 187.0498 
## Standard Deviation: 99.63539 
## 
## Variable: horsepower 
## Range: 46 230 
## Mean: 100.9558 
## Standard Deviation: 35.89557 
## 
## Variable: weight 
## Range: 1649 4997 
## Mean: 2933.963 
## Standard Deviation: 810.6429 
## 
## Variable: acceleration 
## Range: 8.5 24.8 
## Mean: 15.72305 
## Standard Deviation: 2.680514 
## 
## Variable: year 
## Range: 70 82 
## Mean: 77.15265 
## Standard Deviation: 3.11123 
## 
## Variable: origin 
## Range: 1 3 
## Mean: 1.598131 
## Standard Deviation: 0.8161627

library(ggplot2)
library(GGally)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

# Draw a scatterplot matrix of all quantitative columns.
# horsepower is already numeric at this point (it was coerced earlier in the
# script), so re-checking it for "?" and re-coercing it would be no-ops and
# are omitted. Its NA entries are dropped by the plotting layers, which is
# what produces the "Removed 5 rows" warnings below.
ggpairs(
  auto_data[, c(
    "mpg", "cylinders", "displacement", "horsepower",
    "weight", "acceleration", "year", "origin"
  )]
)

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 5 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 5 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 5 rows containing missing values

## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_density()`).

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 5 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 5 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 5 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 5 rows containing missing values

## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Many of the variables appear to be strongly (positively or negatively) correlated, and some of the relationships are non-linear.

#f. Suppose that we wish to predict gas mileage (`mpg`) on the basis of the other variables. Do your plots suggest that any of the other variables might be useful in predicting `mpg`? Justify your answer.

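# Yes. The plots show that mpg is correlated with several of the other variables, so those variables could be useful for predicting it. However, horsepower, weight, and displacement are also highly correlated with one another, so they carry overlapping information.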

Exercise 1

113035131 Baljingarav

2025-03-04