This is a tutorial on data manipulation in R using the mtcars dataset. This example covers common data manipulation tasks such as filtering, sorting, grouping, summarizing, and creating new variables.
The mtcars dataset is included in R by default. You can load it using the data() function.
View the first few rows of the dataset with head().
# Load the built-in mtcars data set into the workspace, then preview its
# first six rows.
data(mtcars)
head(mtcars, n = 6L)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
Filter the dataset to include only cars with more than 6 cylinders.
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Keep only the rows whose cylinder count is greater than 6.
filtered_data <- mtcars |>
  filter(cyl > 6)

# Preview the first rows of the filtered result.
head(filtered_data)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.44 17.02 0 0 3 2
## Duster 360 14.3 8 360.0 245 3.21 3.57 15.84 0 0 3 4
## Merc 450SE 16.4 8 275.8 180 3.07 4.07 17.40 0 0 3 3
## Merc 450SL 17.3 8 275.8 180 3.07 3.73 17.60 0 0 3 3
## Merc 450SLC 15.2 8 275.8 180 3.07 3.78 18.00 0 0 3 3
## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.25 17.98 0 0 3 4
Select specific columns (e.g., mpg, cyl, hp).
# Reduce the data set to the three columns of interest: mpg, cyl and hp.
selected_data <- mtcars |>
  select(mpg, cyl, hp)

# Preview the first rows of the reduced data set.
head(selected_data)
## mpg cyl hp
## Mazda RX4 21.0 6 110
## Mazda RX4 Wag 21.0 6 110
## Datsun 710 22.8 4 93
## Hornet 4 Drive 21.4 6 110
## Hornet Sportabout 18.7 8 175
## Valiant 18.1 6 105
Sort the dataset by mpg (miles per gallon) in descending order.
# Order the rows from highest to lowest mpg.
sorted_data <- mtcars |>
  arrange(desc(mpg))

# Preview the rows with the highest mpg.
head(sorted_data)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
Add a new column kmpl (kilometers per liter) by converting mpg (miles per US gallon) with the factor 1 mpg ≈ 0.425 km/L.
# Append kmpl, the fuel economy in kilometers per liter, computed as
# 0.425 times mpg.
mutated_data <- mtcars |>
  mutate(kmpl = 0.425 * mpg)

# Preview the result; kmpl is added as the last column.
head(mutated_data)
## mpg cyl disp hp drat wt qsec vs am gear carb kmpl
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 8.9250
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 8.9250
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 9.6900
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 9.0950
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 7.9475
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 7.6925
Group the data by cyl (number of cylinders) and calculate the mean mpg and the maximum hp for each group.
# One row per cylinder count, holding the average mpg and the largest hp
# observed in that group.
summary_data <- mtcars |>
  group_by(cyl) |>
  summarise(
    mean_mpg = mean(mpg),
    max_hp = max(hp)
  )

# Show the full summary table.
print(summary_data)
## # A tibble: 3 × 3
## cyl mean_mpg max_hp
## <dbl> <dbl> <dbl>
## 1 4 26.7 113
## 2 6 19.7 175
## 3 8 15.1 335
Rename the mpg column to miles_per_gallon.
# Give the mpg column the more descriptive name miles_per_gallon; all other
# columns are left untouched.
renamed_data <- mtcars |>
  rename(miles_per_gallon = mpg)

# Preview the data set with the renamed column.
head(renamed_data)
## miles_per_gallon cyl disp hp drat wt qsec vs am gear
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3
## carb
## Mazda RX4 4
## Mazda RX4 Wag 4
## Datsun 710 1
## Hornet 4 Drive 1
## Hornet Sportabout 2
## Valiant 1
Add a new column with row numbers.
# Append row_num, a running index 1, 2, ..., n over the rows of mtcars.
row_numbered_data <- mtcars |>
  mutate(row_num = seq_len(n()))

# Preview the result; row_num is added as the last column.
head(row_numbered_data)
## mpg cyl disp hp drat wt qsec vs am gear carb row_num
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 1
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 2
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 3
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 4
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 5
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 6
Create a small dataset and merge it with mtcars.
Create a small dataset
# Build a lookup table that maps every car (the mtcars row name) to its make.
# The make is derived from the car name itself so that each brand is
# guaranteed to line up with the right row.
car_label <- rownames(mtcars)

# First word of each name, e.g. "Mazda RX4" -> "Mazda".
name_prefix <- sub(" .*", "", car_label)

# A few names start with a model name or an abbreviation instead of the make;
# translate those explicitly.
prefix_to_make <- c(
  Hornet = "AMC",
  Valiant = "Plymouth",
  Duster = "Plymouth",
  Camaro = "Chevrolet",
  Merc = "Mercedes-Benz"
)
needs_recode <- name_prefix %in% names(prefix_to_make)
brand_label <- name_prefix
brand_label[needs_recode] <- unname(prefix_to_make[name_prefix[needs_recode]])

car_names <- data.frame(
  car = car_label,
  brand = brand_label
)

# Merge with mtcars
# The row names are copied into a regular column first because a join needs
# the key to be a column.
merged_data <- mtcars %>%
  mutate(car = rownames(mtcars)) %>%
  left_join(car_names, by = "car")
head(merged_data)
## mpg cyl disp hp drat wt qsec vs am gear carb car brand
## 1 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 Mazda RX4 Mazda
## 2 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 Mazda RX4 Wag Mazda
## 3 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 Datsun 710 Datsun
## 4 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 Hornet 4 Drive AMC
## 5 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 Hornet Sportabout AMC
## 6 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 Valiant Plymouth
Reshape the data from wide to long format using pivot_longer.
library(tidyr)
# Stack the mpg, hp and wt columns into name/value pairs: "metric" holds the
# former column name and "value" holds the corresponding measurement.
long_data <- mtcars |>
  pivot_longer(
    cols = c(mpg, hp, wt),
    names_to = "metric",
    values_to = "value"
  )

# Preview the reshaped data.
head(long_data)
## # A tibble: 6 × 10
## cyl disp drat qsec vs am gear carb metric value
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 6 160 3.9 16.5 0 1 4 4 mpg 21
## 2 6 160 3.9 16.5 0 1 4 4 hp 110
## 3 6 160 3.9 16.5 0 1 4 4 wt 2.62
## 4 6 160 3.9 17.0 0 1 4 4 mpg 21
## 5 6 160 3.9 17.0 0 1 4 4 hp 110
## 6 6 160 3.9 17.0 0 1 4 4 wt 2.88
Add some missing values and handle them.
Introduce missing values
# Work on a copy so the original mtcars stays intact, then blank out the mpg
# value of the first five rows.
mtcars_with_na <- mtcars
mtcars_with_na$mpg[seq_len(5)] <- NA
Remove rows with missing values
# Discard every row whose mpg is missing; missing values in other columns
# are not considered.
cleaned_data <- mtcars_with_na |>
  drop_na(mpg)

# Preview the rows that remain.
head(cleaned_data)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Valiant 18.1 6 225.0 105 2.76 3.46 20.22 1 0 3 1
## Duster 360 14.3 8 360.0 245 3.21 3.57 15.84 0 0 3 4
## Merc 240D 24.4 4 146.7 62 3.69 3.19 20.00 1 0 4 2
## Merc 230 22.8 4 140.8 95 3.92 3.15 22.90 1 0 4 2
## Merc 280 19.2 6 167.6 123 3.92 3.44 18.30 1 0 4 4
## Merc 280C 17.8 6 167.6 123 3.92 3.44 18.90 1 0 4 4
Data visualization is a crucial part of data analysis, and R provides powerful tools like ggplot2 and base R plotting functions to create insightful visualizations. We will demonstrate various types of visualizations using the mtcars dataset.
Load necessary libraries
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.2
Visualize the relationship between two numeric variables, such as mpg (miles per gallon) and wt (weight), with a scatter plot made using ggplot2.
# Scatter plot: one blue point per car, weight on the x axis and mpg on the
# y axis.
ggplot(data = mtcars, mapping = aes(wt, mpg)) +
  geom_point(size = 3, colour = "blue") +
  theme_minimal() +
  labs(
    x = "Weight (1000 lbs)",
    y = "Miles per Gallon",
    title = "Scatter Plot of MPG vs Weight"
  )
Figure 1: The relationship between the weight of the motor vehicle and the miles per gallon
Compare the distribution of mpg across different numbers of cylinders (cyl) with a box plot made using ggplot2.
# Box plot of mpg for each cylinder count. cyl is converted to a factor so
# that it is treated as a discrete grouping variable for the x axis and fill.
ggplot(mtcars, aes(x = factor(cyl), y = mpg, fill = factor(cyl))) +
  geom_boxplot() +
  labs(
    title = "Box Plot of MPG by Cylinders",
    x = "Number of Cylinders",
    y = "Miles per Gallon",
    # Without an explicit label the legend title would be the raw expression
    # "factor(cyl)".
    fill = "Cylinders"
  ) +
  theme_minimal()
Visualize the average mpg for each number of cylinders with a bar plot made using ggplot2.
# Average mpg per cylinder count; this table is the input of the bar plot.
avg_mpg <- mtcars %>%
  group_by(cyl) %>%
  summarize(mean_mpg = mean(mpg))

# Bar plot of the pre-computed averages. geom_col() draws bars whose heights
# are the y values themselves, which is what geom_bar(stat = "identity") does.
ggplot(avg_mpg, aes(x = factor(cyl), y = mean_mpg, fill = factor(cyl))) +
  geom_col() +
  labs(
    title = "Average MPG by Cylinders",
    x = "Number of Cylinders",
    y = "Average Miles per Gallon",
    # Without an explicit label the legend title would be the raw expression
    # "factor(cyl)".
    fill = "Cylinders"
  ) +
  theme_minimal()
Visualize the distribution of mpg with a histogram made using ggplot2.
# Histogram of mpg using bins that are 2 mpg wide, drawn as orange bars with
# a black outline.
ggplot(data = mtcars, mapping = aes(mpg)) +
  geom_histogram(colour = "black", fill = "orange", binwidth = 2) +
  theme_minimal() +
  labs(
    x = "Miles per Gallon",
    y = "Frequency",
    title = "Histogram of MPG"
  )
Visualize the estimated density of mpg with a density plot made using ggplot2.
# Density curve of mpg with a half-transparent light-blue fill.
ggplot(data = mtcars, mapping = aes(mpg)) +
  geom_density(alpha = 0.5, fill = "lightblue") +
  theme_minimal() +
  labs(
    x = "Miles per Gallon",
    y = "Density",
    title = "Density Plot of MPG"
  )
Visualize the relationship between mpg and hp (horsepower) with a line plot made using ggplot2.
# Red line through the cars' (hp, mpg) values; geom_line() connects the
# observations in order of the x variable (hp).
ggplot(data = mtcars, mapping = aes(hp, mpg)) +
  geom_line(colour = "red") +
  theme_minimal() +
  labs(
    x = "Horsepower",
    y = "Miles per Gallon",
    title = "Line Plot of MPG vs Horsepower"
  )
Visualize correlations between the numeric variables in the dataset with a heatmap made using base R.
# Pairwise correlations between all columns of mtcars.
correlation_matrix <- cor(mtcars)

# Draw the matrix as a heatmap.
# - symm = TRUE: a correlation matrix is symmetric, so rows and columns are
#   given the same ordering.
# - scale = "none": heatmap() otherwise centres and rescales every row
#   (default scale = "row"), so the colours would no longer represent the
#   correlation values themselves.
# - zlim = c(-1, 1): passed through to image() so the blue-white-red palette
#   is centred on zero (white = no correlation).
heatmap(
  correlation_matrix,
  symm = TRUE,
  scale = "none",
  zlim = c(-1, 1),
  col = colorRampPalette(c("blue", "white", "red"))(100),
  main = "Correlation Heatmap of mtcars Dataset"
)
Compare mpg against wt for each number of cylinders with a faceted scatter plot made using ggplot2.
# Scatter plot of mpg against weight, split into one panel per cylinder
# count.
ggplot(data = mtcars, mapping = aes(wt, mpg)) +
  geom_point(colour = "purple") +
  facet_wrap(~ cyl) +
  theme_minimal() +
  labs(
    x = "Weight (1000 lbs)",
    y = "Miles per Gallon",
    title = "Faceted Scatter Plot of MPG vs Weight by Cylinders"
  )
Visualize the distribution of mpg across different numbers of cylinders with a violin plot made using ggplot2.
# Violin plot of mpg for each cylinder count. cyl is converted to a factor so
# that it is treated as a discrete grouping variable for the x axis and fill.
ggplot(mtcars, aes(x = factor(cyl), y = mpg, fill = factor(cyl))) +
  geom_violin() +
  labs(
    title = "Violin Plot of MPG by Cylinders",
    x = "Number of Cylinders",
    y = "Miles per Gallon",
    # Without an explicit label the legend title would be the raw expression
    # "factor(cyl)".
    fill = "Cylinders"
  ) +
  theme_minimal()
Visualize pairwise relationships between the first five variables of the dataset with a pair plot made using base R.
# Scatter-plot matrix of the first five columns of mtcars (mpg, cyl, disp,
# hp and drat), drawn with solid dark-blue points.
pairs(
  mtcars[, c("mpg", "cyl", "disp", "hp", "drat")],
  col = "darkblue",
  pch = 19,
  main = "Pair Plot of mtcars Dataset"
)
Customize the theme of your plots for better aesthetics.
Custom theme
# Reusable theme: theme_minimal() with a centred bold 16 pt title, 12 pt axis
# titles, 10 pt axis text and the legend placed below the plot.
custom_theme <- theme_minimal() +
  theme(
    legend.position = "bottom",
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 12),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 16)
  )
Example plot with custom theme
# Scatter plot of mpg against weight with green points, styled with the
# custom_theme object instead of a built-in theme.
ggplot(data = mtcars, mapping = aes(wt, mpg)) +
  geom_point(size = 3, colour = "green") +
  labs(
    x = "Weight (1000 lbs)",
    y = "Miles per Gallon",
    title = "Custom Themed Scatter Plot"
  ) +
  custom_theme