# Setup: make the ISLR package and its Auto data set available.
# Install ISLR only when it is missing, so re-running the script does not
# reinstall the package (and does not need network access) every time.
if (!requireNamespace("ISLR", quietly = TRUE)) {
  install.packages("ISLR")
}
library(ISLR)
data("Auto")  # Example dataset from ISLR package

# Also read a CSV copy of the data set from the book's website.
# NOTE(review): the raw CSV may encode missing values as "?"; if so, pass
# na.strings = "?" so the affected column is read as numeric -- confirm.
url <- "https://www.statlearning.com/s/Auto.csv"
auto_data <- read.csv(url, header = TRUE)

# (a) Which of the predictors are quantitative, and which are qualitative?
# Quantitative Variables:
# mpg - Miles per gallon
# cylinders - Number of cylinders (4 to 8)
# displacement - Engine displacement (cu. inches)
# horsepower - Engine horsepower
# weight - Vehicle weight (lbs.)
# acceleration - Time to accelerate from 0 to 60 mph (sec.)
# year - Model year (modulo 100)

# Qualitative Variables:
# origin - Origin of car (1. American, 2. European, 3. Japanese)
# name - Vehicle name

# (b) What is the range of each quantitative predictor?
# sapply() applies range() to columns 1-7, giving a 2 x 7 matrix
# (row 1 = minimum, row 2 = maximum) that is then labelled and printed.
range_Auto <- data.frame(sapply(Auto[, 1:7], range))
rownames(range_Auto) <- c("min:", "max:")
range_Auto

# (c) What is the mean and standard deviation of each quantitative predictor?
sapply(Auto[, 1:7], mean)  # column means of columns 1-7
sapply(Auto[, 1:7], sd)    # column standard deviations of columns 1-7

# (d) Now remove the 10th through 85th observations. What is the range, mean,
# and standard deviation of each predictor in the subset of the data that
# remains?

# Drop rows 10 through 85, then summarise columns 1-7 of what is left.
Auto_2 <- Auto[-(10:85), ]

# Per-column minimum and maximum, arranged as a two-row data frame.
range_Auto_2 <- data.frame(vapply(Auto_2[, 1:7], range, numeric(2)))
rownames(range_Auto_2) <- c("min:", "max:")
range_Auto_2
##       mpg cylinders displacement horsepower weight acceleration year
## min: 11.0         3           68         46   1649          8.5   70
## max: 46.6         8          455        230   4997         24.8   82

# Per-column mean.
vapply(Auto_2[, 1:7], mean, numeric(1))
##          mpg    cylinders displacement   horsepower       weight acceleration 
##    24.404430     5.373418   187.240506   100.721519  2935.971519    15.726899 
##         year 
##    77.145570

# Per-column standard deviation.
vapply(Auto_2[, 1:7], sd, numeric(1))
##          mpg    cylinders displacement   horsepower       weight acceleration 
##     7.867283     1.654179    99.678367    35.708853   811.300208     2.693721 
##         year 
##     3.106217

# (e) Using the full data set, investigate the predictors graphically, using
# scatterplots or other tools of your choice. Create some plots highlighting
# the relationships among the predictors. Comment on your findings.

# Scatterplot matrix of columns 1-7 of the full data set.
pairs(Auto[, 1:7])

# Linear model of displacement on year alone.
lm(displacement ~ year, data = Auto) |> summary()
## 
## Call:
## lm(formula = displacement ~ year, data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -175.73  -75.58  -13.45   69.69  229.28 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  992.691    101.661   9.765  < 2e-16 ***
## year         -10.506      1.336  -7.862 3.75e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 97.35 on 390 degrees of freedom
## Multiple R-squared:  0.1368, Adjusted R-squared:  0.1346 
## F-statistic:  61.8 on 1 and 390 DF,  p-value: 3.748e-14

# Same response, now with year, origin and their interaction.
# NOTE(review): the coefficient table below has a single `origin` row, i.e.
# origin enters as a numeric code rather than as a categorical factor;
# consider factor(origin) and re-running if per-origin effects are intended.
lm(displacement ~ year * origin, data = Auto) |> summary()
## 
## Call:
## lm(formula = displacement ~ year * origin, data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -208.809  -49.130   -1.753   47.496  180.801 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1980.870    169.234  11.705  < 2e-16 ***
## year         -21.902      2.222  -9.855  < 2e-16 ***
## origin      -771.920     96.358  -8.011 1.34e-14 ***
## year:origin    9.098      1.254   7.256 2.18e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 73.33 on 388 degrees of freedom
## Multiple R-squared:  0.5128, Adjusted R-squared:  0.509 
## F-statistic: 136.1 on 3 and 388 DF,  p-value: < 2.2e-16

# (f) Suppose that we wish to predict gas mileage (mpg) on the basis of the
# other variables. Do your plots suggest that any of the other variables
# might be useful in predicting mpg? Justify your answer.

# Derive a `brand` column from the vehicle name.
# Split each name on spaces and keep the first word.
Auto$brand <- sapply(strsplit(as.character(Auto$name), split = " "),
                     function(x) x[1]) # extract the first item from each list element

# Map misspelled / abbreviated brand names onto their canonical spelling,
# leaving every other brand unchanged, and store the result as a factor.
Auto$brand <- factor(ifelse(Auto$brand %in% c("vokswagen", "vw"), "volkswagen",
                     ifelse(Auto$brand == "toyouta", "toyota",
                            ifelse(Auto$brand %in% c("chevroelt", "chevy"), "chevrolet",
                                   ifelse(Auto$brand == "maxda", "mazda",
                                          Auto$brand))))) # fixing typo's

# Count of cars per brand.
table(Auto$brand)

# Engineer a `brand` feature from the vehicle name.
# Brand = first space-separated word of the name. vapply() is used instead of
# sapply() so the result is guaranteed to be a character vector.
Auto$brand <- vapply(
  strsplit(as.character(Auto$name), split = " "),
  function(x) x[1],
  character(1)
)

# Misspelled / abbreviated brand -> canonical brand (fixing typos).
brand_fixes <- c(
  vokswagen = "volkswagen",
  vw        = "volkswagen",
  toyouta   = "toyota",
  chevroelt = "chevrolet",
  chevy     = "chevrolet",
  maxda     = "mazda"
)
needs_fix <- Auto$brand %in% names(brand_fixes)
Auto$brand[needs_fix] <- unname(brand_fixes[Auto$brand[needs_fix]])
Auto$brand <- factor(Auto$brand)
library(forcats)

# Keep the 9 most frequent brands and pool the rest into "uncommon",
# giving 10 categories in total.
Auto$brand <- fct_lump(Auto$brand,
                       n = 9,
                       other_level = "uncommon")

table(Auto$brand)
## 
##        amc      buick  chevrolet     datsun      dodge       ford   plymouth 
##         27         17         47         23         28         48         31 
##     toyota volkswagen   uncommon 
##         26         22        123
library(ggplot2)  # Load ggplot2 package

# Inspect the data set once after adding `brand`: column names, structure
# (column types) and the first rows.
colnames(Auto)  # Check column names of the Auto dataset
##  [1] "mpg"          "cylinders"    "displacement" "horsepower"   "weight"      
##  [6] "acceleration" "year"         "origin"       "name"         "brand"
str(Auto)  # Inspect structure of the Auto dataset
## 'data.frame':    392 obs. of  10 variables:
##  $ mpg         : num  18 15 18 16 17 15 14 14 14 15 ...
##  $ cylinders   : num  8 8 8 8 8 8 8 8 8 8 ...
##  $ displacement: num  307 350 318 304 302 429 454 440 455 390 ...
##  $ horsepower  : num  130 165 150 150 140 198 220 215 225 190 ...
##  $ weight      : num  3504 3693 3436 3433 3449 ...
##  $ acceleration: num  12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ year        : num  70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ name        : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
##  $ brand       : Factor w/ 10 levels "amc","buick",..: 3 2 7 1 6 6 3 7 10 1 ...
head(Auto)  # View the first few rows of the Auto dataset
##   mpg cylinders displacement horsepower weight acceleration year origin
## 1  18         8          307        130   3504         12.0   70      1
## 2  15         8          350        165   3693         11.5   70      1
## 3  18         8          318        150   3436         11.0   70      1
## 4  16         8          304        150   3433         12.0   70      1
## 5  17         8          302        140   3449         10.5   70      1
## 6  15         8          429        198   4341         10.0   70      1
##                        name     brand
## 1 chevrolet chevelle malibu chevrolet
## 2         buick skylark 320     buick
## 3        plymouth satellite  plymouth
## 4             amc rebel sst       amc
## 5               ford torino      ford
## 6          ford galaxie 500      ford
# Boxplot of mpg by origin.
# `origin` is stored as a numeric code (1, 2, 3). With a continuous x,
# geom_boxplot() draws a single box and drops the fill aesthetic, so the
# code is converted to a labelled factor for this plot only (transform()
# returns a modified copy; Auto itself is left unchanged).
ggplot(
  transform(
    Auto,
    origin = factor(origin,
                    levels = 1:3,
                    labels = c("American", "European", "Japanese"))
  ),
  aes(x = origin, y = mpg, fill = origin)
) +
  geom_boxplot() +
  theme(legend.position = "none") +
  labs(title = "Origin vs Mpg - Boxplot",
       x = "Origin",
       y = "MPG")

# Auto$brand was already derived from Auto$name, typo-corrected and lumped
# into 10 levels earlier in this script (forcats is loaded there as well),
# so it is not recomputed here; only the per-brand counts are shown.
table(Auto$brand)
## 
##        amc      buick  chevrolet     datsun      dodge       ford   plymouth 
##         27         17         47         23         28         48         31 
##     toyota volkswagen   uncommon 
##         26         22        123
# Boxplot of mpg for each level of the engineered `brand` factor, one fill
# colour per brand. The legend is hidden because the x axis already names
# the brands. (Drawn once; the identical second call was redundant.)
ggplot(Auto, aes(x = brand, y = mpg, fill = brand)) +
  geom_boxplot() +
  theme(legend.position = "none") +
  labs(
    title = "Brand vs Mpg - Boxplot",
    subtitle = "Engineered feature",
    x = "Brand",
    y = "MPG"
  )

# Boxplot of mpg by origin.
# `origin` is stored as a numeric code (1, 2, 3). With a continuous x,
# geom_boxplot() draws a single box and drops the fill aesthetic, so the
# code is converted to a labelled factor for this plot only (transform()
# returns a modified copy; Auto itself is left unchanged).
ggplot(
  transform(
    Auto,
    origin = factor(origin,
                    levels = 1:3,
                    labels = c("American", "European", "Japanese"))
  ),
  aes(x = origin, y = mpg, fill = origin)
) +
  geom_boxplot() +
  theme(legend.position = "none") +
  labs(title = "Origin vs Mpg - Boxplot",
       x = "Origin",
       y = "MPG")