library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the Auto data set from the working directory; text columns are kept as
# plain character vectors rather than being converted to factors.
auto_data <- read.csv(file = "Auto.csv", stringsAsFactors = FALSE)
# Print a compact overview of every column: its type and first few values.
str(object = auto_data)
## 'data.frame': 397 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : chr "130" "165" "150" "150" ...
## $ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : chr "chevrolet chevelle malibu" "buick skylark 320" "plymouth satellite" "amc rebel sst" ...
# Split the column names by storage type. vapply() is used instead of sapply()
# so each type test is guaranteed to yield a logical vector, even when the data
# frame has no columns (sapply() would return an empty list there).
is_text_column <- vapply(auto_data, is.character, logical(1))
is_number_column <- vapply(auto_data, is.numeric, logical(1))
categorical_vars <- names(auto_data)[is_text_column]
numerical_vars <- names(auto_data)[is_number_column]
# horsepower is listed as qualitative only because it was read in as character
# (see the str() output above); it is converted to numeric further down.
cat("Quantitative Variables:\n")
## Quantitative Variables:
print(numerical_vars)
## [1] "mpg" "cylinders" "displacement" "weight" "acceleration"
## [6] "year" "origin"
cat("\nQualitative Variables:\n")
##
## Qualitative Variables:
print(categorical_vars)
## [1] "horsepower" "name"
# Re-read the raw data, declaring "?" as a missing-value marker so that
# horsepower is parsed as a number directly. Previously the "?" entries were
# turned into NA only as a side effect of an as.numeric() coercion warning,
# which would also have hidden any other malformed value.
auto_data <- read.csv(
  "Auto.csv",
  stringsAsFactors = FALSE,
  na.strings = c("NA", "?")
)
# Keep horsepower as a double, as the rest of the script has always seen it.
auto_data$horsepower <- as.numeric(auto_data$horsepower)

# Columns treated as quantitative for the summaries in this script.
# NOTE(review): origin only takes the values 1 to 3 and looks like a coded
# category, so its numeric summaries may not be meaningful; confirm whether it
# belongs in this list.
quantitative_vars <- c("mpg", "cylinders", "displacement", "horsepower",
                       "weight", "acceleration", "year", "origin")

# Print the minimum and maximum of each quantitative column, ignoring NAs.
for (var in quantitative_vars) {
  cat("Range of", var, ":", range(auto_data[[var]], na.rm = TRUE), "\n")
}
## Range of mpg : 9 46.6
## Range of cylinders : 3 8
## Range of displacement : 68 455
## Range of horsepower : 46 230
## Range of weight : 1613 5140
## Range of acceleration : 8 24.8
## Range of year : 70 82
## Range of origin : 1 3
# Report the mean and standard deviation of every quantitative column.
# horsepower is already numeric at this point (missing entries are NA), so the
# former `== "?"` replacement and second as.numeric() call could never change
# anything and have been removed; the NAs are skipped via na.rm = TRUE.
quantitative_vars <- c("mpg", "cylinders", "displacement", "horsepower",
                       "weight", "acceleration", "year", "origin")
for (var in quantitative_vars) {
  cat("Mean of", var, ":", mean(auto_data[[var]], na.rm = TRUE), "\n")
  cat("Standard Deviation of", var, ":", sd(auto_data[[var]], na.rm = TRUE), "\n\n")
}
## Mean of mpg : 23.51587
## Standard Deviation of mpg : 7.825804
##
## Mean of cylinders : 5.458438
## Standard Deviation of cylinders : 1.701577
##
## Mean of displacement : 193.5327
## Standard Deviation of displacement : 104.3796
##
## Mean of horsepower : 104.4694
## Standard Deviation of horsepower : 38.49116
##
## Mean of weight : 2970.262
## Standard Deviation of weight : 847.9041
##
## Mean of acceleration : 15.55567
## Standard Deviation of acceleration : 2.749995
##
## Mean of year : 75.99496
## Standard Deviation of year : 3.690005
##
## Mean of origin : 1.574307
## Standard Deviation of origin : 0.8025495
# Drop observations 10 through 85 and summarise what remains.
rows_to_drop <- seq(from = 10, to = 85)
auto_subset <- auto_data[-rows_to_drop, ]

quantitative_vars <- c(
  "mpg", "cylinders", "displacement", "horsepower",
  "weight", "acceleration", "year", "origin"
)

# For each quantitative column print its range, mean and standard deviation,
# ignoring missing values.
for (column_name in quantitative_vars) {
  column_values <- auto_subset[[column_name]]
  value_range <- range(column_values, na.rm = TRUE)
  value_mean <- mean(column_values, na.rm = TRUE)
  value_sd <- sd(column_values, na.rm = TRUE)

  cat("Variable:", column_name, "\n")
  cat("Range:", value_range, "\n")
  cat("Mean:", value_mean, "\n")
  cat("Standard Deviation:", value_sd, "\n\n")
}
## Variable: mpg
## Range: 11 46.6
## Mean: 24.43863
## Standard Deviation: 7.908184
##
## Variable: cylinders
## Range: 3 8
## Mean: 5.370717
## Standard Deviation: 1.653486
##
## Variable: displacement
## Range: 68 455
## Mean: 187.0498
## Standard Deviation: 99.63539
##
## Variable: horsepower
## Range: 46 230
## Mean: 100.9558
## Standard Deviation: 35.89557
##
## Variable: weight
## Range: 1649 4997
## Mean: 2933.963
## Standard Deviation: 810.6429
##
## Variable: acceleration
## Range: 8.5 24.8
## Mean: 15.72305
## Standard Deviation: 2.680514
##
## Variable: year
## Range: 70 82
## Mean: 77.15265
## Standard Deviation: 3.11123
##
## Variable: origin
## Range: 1 3
## Mean: 1.598131
## Standard Deviation: 0.8161627
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Scatterplot matrix of the quantitative columns.
# horsepower is already numeric here (missing entries are NA), so the former
# `== "?"` replacement and as.numeric() call were no-ops and have been removed.
# ggpairs() leaves out the rows with missing horsepower in the affected panels
# and reports them as warnings.
ggpairs(
  auto_data[, c("mpg", "cylinders", "displacement", "horsepower",
                "weight", "acceleration", "year", "origin")]
)
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 5 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 5 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 5 rows containing missing values
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 5 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 5 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 5 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 5 rows containing missing values
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).

#Many of the variables appear to be highly correlated with one another (positively or negatively), and some of these relationships are non-linear.
#f. Suppose that we wish to predict gas mileage (`mpg`) on the basis of the other variables. Do your plots suggest that any of the other variables might be useful in predicting `mpg`? Justify your answer.
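#Yes. Several of the other variables are clearly correlated with `mpg`, so they should be useful for predicting it. However, horsepower, weight and displacement are also highly correlated with one another, so they carry largely overlapping information.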