For data manipulation

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

For visualization

library(ggplot2) 

# Load the built-in mtcars dataset and print its structure.

# Work on a copy of the built-in mtcars data set, then show the type and the
# first few values of every column.
data <- datasets::mtcars
utils::str(data)
## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...

# List the variables in the dataset.

# For a data frame, names() returns the column names.
names(data)
##  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear"
## [11] "carb"

Create a User-Defined Function

Convert miles per gallon to kilometers per liter

#' Convert fuel economy from miles per gallon to kilometres per litre.
#'
#' @param mpg Numeric vector of fuel-economy values in miles per gallon.
#' @return Numeric vector of the same length as `mpg`, with every value
#'   multiplied by 0.425144.
convert_mpg_to_kmpl <- function(mpg) {
  # NOTE(review): 0.425144 matches the US-gallon conversion factor; confirm
  # that US (not imperial) gallons are intended.
  km_per_litre_per_mpg <- 0.425144
  mpg * km_per_litre_per_mpg
}

Apply the function

# convert_mpg_to_kmpl() is plain arithmetic, so it already works on a whole
# column at once. Calling it directly avoids sapply(), whose return type
# depends on its input (it yields an empty list for a zero-length vector).
data$kmpl <- convert_mpg_to_kmpl(data$mpg)

Filter Data Based on a Condition

# Keep only the rows whose mpg is strictly greater than 20.
filtered_data <- data %>%
  filter(mpg > 20)

Select cars with MPG greater than 20

# Show the filtered rows, including the derived kmpl column.
print(x = filtered_data)
##                 mpg cyl  disp  hp drat    wt  qsec vs am gear carb      kmpl
## Mazda RX4      21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4  8.928024
## Mazda RX4 Wag  21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4  8.928024
## Datsun 710     22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1  9.693283
## Hornet 4 Drive 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1  9.098082
## Merc 240D      24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2 10.373514
## Merc 230       22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2  9.693283
## Fiat 128       32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1 13.774666
## Honda Civic    30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2 12.924378
## Toyota Corolla 33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1 14.412382
## Toyota Corona  21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1  9.140596
## Fiat X1-9      27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1 11.606431
## Porsche 914-2  26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2 11.053744
## Lotus Europa   30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2 12.924378
## Volvo 142E     21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2  9.098082

Identify Dependent & Independent Variables and Reshape Data

Dependent variable (MPG)

# The response is the mpg column, extracted as a plain numeric vector.
dependent_var <- data[["mpg"]]

All other columns as independent variables

# Every column of `data` except mpg, kept as a data frame.
independent_vars <- select(data, -mpg)

Combine

# Put the response first and the predictors after it. Because the predictors
# are passed as one named data-frame argument, data.frame() prefixes each of
# their column names with "Independent.".
reshaped_data <- data.frame(
  MPG = dependent_var,
  Independent = independent_vars
)

Remove Missing Values.

# Drop every row that contains at least one NA.
cleaned_data <- stats::na.omit(object = data)

Identify and Remove Duplicated Data.

# Keep one copy of each fully identical row.
data_no_duplicates <- cleaned_data %>%
  distinct()

Reorder Rows in Descending Order.

Sort dataset by MPG in descending order

# Ordering on the negated column puts the highest mpg first.
sorted_data <- data[order(-data[["mpg"]]), ]

Rename Some Column Names.

# Rename by matching on the current name, so a column that is not present is
# simply left alone.
names(data)[names(data) == "mpg"] <- "Miles_Per_Gallon"
names(data)[names(data) == "hp"] <- "Horse_Power"

Add a New Variable.

New column: double the weight values

# Derived column: twice the value of wt for every row.
data[["Double_Weight"]] <- 2 * data[["wt"]]

Create a Training Set (80%).

Set seed for reproducibility

# Fix the RNG state so the same rows are drawn on every run.
set.seed(123)
# Draw 80% of the row positions without replacement. 0.8 * nrow(data) is not
# generally a whole number, and sample() expects an integer size, so round
# down explicitly instead of relying on implicit truncation.
train_index <- sample(seq_len(nrow(data)), size = floor(0.8 * nrow(data)))

Training set (80%)

# Rows whose positions were drawn into train_index.
train_data <- data[train_index, , drop = FALSE]

Test set (20%)

# Rows whose positions are not in train_index. A logical mask is used rather
# than negative indexing because data[-integer(0), ] selects zero rows, which
# would empty the test set if train_index were ever empty.
test_data <- data[!(seq_len(nrow(data)) %in% train_index), ]

Calculate Mean, Median, Mode & Range.

Mean

# Arithmetic mean of the renamed mpg column.
mean_mpg <- mean(data[["Miles_Per_Gallon"]])

Median

# Middle value of the renamed mpg column.
median_mpg <- median(data[["Miles_Per_Gallon"]])

Mode

# table() counts each distinct value and names the counts by value;
# which.max() keeps the name of the first highest count. Only one value is
# reported, even when several values share the top frequency.
mode_mpg <- as.numeric(names(which.max(table(data$Miles_Per_Gallon))))

Range

# Length-two vector holding the minimum and the maximum.
range_mpg <- range(data[["Miles_Per_Gallon"]])

Create a Scatter Plot (MPG vs HP).

# One blue point per car: horsepower on x, fuel economy on y.
ggplot(data, aes(x = Horse_Power, y = Miles_Per_Gallon)) +
  geom_point(color = "blue") +
  labs(
    title = "Scatter Plot: Horse Power vs Miles Per Gallon",
    x = "Horse Power",
    y = "Miles Per Gallon"
  )

Create a Bar Plot (MPG by Cylinders).

  # Plot the mean MPG of each cylinder group. Drawing one identity bar per car
  # stacks them, so each bar's height becomes the sum of MPG over the cars in
  # that group, which mostly reflects how many cars the group contains.
  ggplot(
    aggregate(Miles_Per_Gallon ~ cyl, data = data, FUN = mean),
    aes(x = as.factor(cyl), y = Miles_Per_Gallon, fill = as.factor(cyl))
  ) +
    geom_col() +
    ggtitle("Bar Plot: MPG by Cylinders") +
    xlab("Number of Cylinders") +
    ylab("Mean Miles Per Gallon") +
    scale_fill_discrete(name = "Cylinders")

Find Correlation & Apply Linear Regression.

# cor() with its default method (Pearson) between fuel economy and horsepower.
correlation <- with(data, cor(Miles_Per_Gallon, Horse_Power))
# paste() joins the label and the number with a single space.
print(paste("Correlation between MPG and HP:", correlation))
## [1] "Correlation between MPG and HP: -0.776168371826586"

Apply Linear Regression Model

# Fit a simple linear regression of Miles_Per_Gallon on Horse_Power.
# NOTE(review): the model is fitted on every row of `data`, not on the
# `train_data` split created earlier; confirm this is intended.
linear_model <- lm(Miles_Per_Gallon ~ Horse_Power, data = data)
# Print the residual summary, coefficient table and fit statistics. The
# printed "Call:" line echoes the lm() call above.
summary(linear_model)
## 
## Call:
## lm(formula = Miles_Per_Gallon ~ Horse_Power, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.7121 -2.1122 -0.8854  1.5819  8.2360 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 30.09886    1.63392  18.421  < 2e-16 ***
## Horse_Power -0.06823    0.01012  -6.742 1.79e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.863 on 30 degrees of freedom
## Multiple R-squared:  0.6024, Adjusted R-squared:  0.5892 
## F-statistic: 45.46 on 1 and 30 DF,  p-value: 1.788e-07