library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the Motor Trend Car Road Tests (mtcars) dataset.
# The CSV location is held in one variable so it can be repointed on another
# machine; the resulting path string is the same as before.
car_data_path <- file.path(
  "C:/Users/User/Downloads",
  "MAHE FILES",
  "MAHE SEMESTER 1",
  "R and Python",
  "MAHE R WK3&4",
  "mtcars3.csv"
)
# Fail early with a readable message instead of a low-level connection error.
if (!file.exists(car_data_path)) {
  stop("Cannot find the mtcars CSV at: ", car_data_path, call. = FALSE)
}
carData <- read.csv(car_data_path, header = TRUE)
head(carData, 5)

# Columns that hold categorical codes rather than measurements
categorical_cols <- c("vs", "am")

# Convert the categorical columns to factor type, and add a new column
# called cyltype with value "High" if cyl is greater than 4 and "Low"
# otherwise.
carData <- carData %>%
  mutate(
    across(all_of(categorical_cols), as.factor),
    cyltype = if_else(cyl > 4, "High", "Low")
  )
head(carData)

# Summarize the features
summary(carData)
## X mpg cyl disp
## Length:32 Min. :10.40 Min. :4.000 Min. : 71.1
## Class :character 1st Qu.:15.43 1st Qu.:4.000 1st Qu.:120.8
## Mode :character Median :19.20 Median :6.000 Median :196.3
## Mean :20.09 Mean :6.188 Mean :230.7
## 3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:326.0
## Max. :33.90 Max. :8.000 Max. :472.0
## hp drat wt qsec vs am
## Min. : 52.0 Min. :2.760 Min. :1.513 Min. :14.50 0:18 0:19
## 1st Qu.: 96.5 1st Qu.:3.080 1st Qu.:2.581 1st Qu.:16.89 1:14 1:13
## Median :123.0 Median :3.695 Median :3.325 Median :17.71
## Mean :146.7 Mean :3.597 Mean :3.217 Mean :17.85
## 3rd Qu.:180.0 3rd Qu.:3.920 3rd Qu.:3.610 3rd Qu.:18.90
## Max. :335.0 Max. :4.930 Max. :5.424 Max. :22.90
## gear carb cyltype
## Min. :3.000 Min. :1.000 Length:32
## 1st Qu.:3.000 1st Qu.:2.000 Class :character
## Median :4.000 Median :2.000 Mode :character
## Mean :3.688 Mean :2.812
## 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :8.000
# Bar chart: distribution of the categorical variable cyltype
ggplot(carData, aes(x = cyltype)) +
  geom_bar()

# Number of observations in each cyltype category
count(carData, cyltype)
# Histogram: distribution of the continuous variable mpg, in bins 2.5 wide
ggplot(carData, aes(x = mpg)) +
  geom_histogram(binwidth = 2.5)

# The same histogram expressed as a table of counts.
# cut_width() is a ggplot2 helper that splits the range of mpg into
# intervals 2.5 units wide (it plays the role of a class interval);
# count() then tallies how many observations fall into each interval.
count(carData, cut_width(mpg, 2.5))
# Comparing several distributions on one chart.
# Overlaid histogram bars for each group can hide one another, so a
# frequency polygon (geom_freqpoly) draws one line per cyltype instead.
ggplot(carData, aes(x = mpg, colour = cyltype)) +
  geom_freqpoly(binwidth = 2.5)
# Interpretation: the red line ("High") peaks in the lower mpg range
# (approximately 15-20 mpg), indicating that vehicles in this category
# are less fuel-efficient. The blue line ("Low") peaks in a higher mpg
# range (approximately 25-30 mpg), suggesting that these vehicles are
# more fuel-efficient. The two groups show little overlap, with each
# concentrated in a distinct range of mpg.
# Boxplot: how a continuous feature (mpg) is distributed within each
# level of a categorical feature (cyltype)
ggplot(carData) +
  geom_boxplot(aes(x = cyltype, y = mpg))
# Load the mpg dataset and keep a working copy
data("mpg")
mpgData <- mpg
head(mpgData)
# Boxplot of highway mpg (hwy) for each car type (class)
ggplot(mpgData) +
  geom_boxplot(aes(x = class, y = hwy))

# Same boxplot with the classes ordered by their median hwy, which makes
# the trend across classes easier to see. reorder() re-levels class using
# the median of hwy within each class.
ggplot(mpgData) +
  geom_boxplot(aes(x = reorder(class, hwy, FUN = median), y = hwy))

# Flip the ordered boxplot for better readability;
# coord_flip() swaps the positions of the two axes.
ggplot(mpgData, aes(x = reorder(class, hwy, FUN = median), y = hwy)) +
  geom_boxplot() +
  coord_flip()
# Relationship between two categorical features (cyltype and am):
# first as a table of counts, then as a plot where point size is the count
count(carData, cyltype, am)
ggplot(carData, aes(x = cyltype, y = am)) +
  geom_count()
# Relationship between two continuous features:
# scatter plot of mpg against hp, coloured by number of cylinders
ggplot(carData) +
  geom_point(aes(x = hp, y = mpg, colour = factor(cyl)), size = 1)
# Load the diamonds dataset
data("diamonds")
head(diamonds)

# Relationship between two continuous features:
# scatter plot of price against carat
ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point()
# Load the hexbin package, then draw price against carat as hexagonal bins
library(hexbin)
## Warning: package 'hexbin' was built under R version 4.3.3
ggplot(diamonds, aes(x = carat, y = price)) +
  geom_hex()