#install.packages('tidyverse') #comes with dplyr, readr, ggplot2, tidyr etc
#install.packages('readxl') #reads excel files
#install.packages('writexl') # writes excel files
#install.packages('lubridate') # to help parse and manipulate dates
#load the packages
library (tidyverse)
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library (readxl)
## Warning: package 'readxl' was built under R version 4.4.3
library (writexl)
## Warning: package 'writexl' was built under R version 4.4.3
library (lubridate)
#load the data
data(mtcars)
attach(mtcars)
## The following object is masked from package:ggplot2:
##
## mpg
View(mtcars)
#Take a quick look of the data
glimpse(mtcars)
## Rows: 32
## Columns: 11
## $ mpg <dbl> 21.0, 21.0, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19.2, 17.8,…
## $ cyl <dbl> 6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 8,…
## $ disp <dbl> 160.0, 160.0, 108.0, 258.0, 360.0, 225.0, 360.0, 146.7, 140.8, 16…
## $ hp <dbl> 110, 110, 93, 110, 175, 105, 245, 62, 95, 123, 123, 180, 180, 180…
## $ drat <dbl> 3.90, 3.90, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92,…
## $ wt <dbl> 2.620, 2.875, 2.320, 3.215, 3.440, 3.460, 3.570, 3.190, 3.150, 3.…
## $ qsec <dbl> 16.46, 17.02, 18.61, 19.44, 17.02, 20.22, 15.84, 20.00, 22.90, 18…
## $ vs <dbl> 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,…
## $ am <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,…
## $ gear <dbl> 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3,…
## $ carb <dbl> 4, 4, 1, 1, 2, 1, 4, 2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1, 2,…
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
tail(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.7 0 1 5 2
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.9 1 1 5 2
## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.5 0 1 5 4
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.5 0 1 5 6
## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.6 0 1 5 8
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.6 1 1 4 2
dim(mtcars) #the data has 32 rows and 11 columns
## [1] 32 11
names(mtcars)
## [1] "mpg" "cyl" "disp" "hp" "drat" "wt" "qsec" "vs" "am" "gear"
## [11] "carb"
cars_clean<-mtcars%>%
rownames_to_column(var="Model") %>% #make car names a column
mutate(
cyl=as.factor(cyl), #convert to a categorical variable
am=factor(am,labels=c("Automatic",'Manual')),
vs=factor(vs,labels=c("V-shaped","Straight")),
wt_kg=453.6*wt #convert weight from lbs to kgs
)
head(cars_clean)
## Model mpg cyl disp hp drat wt qsec vs am gear
## 1 Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 V-shaped Manual 4
## 2 Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 V-shaped Manual 4
## 3 Datsun 710 22.8 4 108 93 3.85 2.320 18.61 Straight Manual 4
## 4 Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 Straight Automatic 3
## 5 Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 V-shaped Automatic 3
## 6 Valiant 18.1 6 225 105 2.76 3.460 20.22 Straight Automatic 3
## carb wt_kg
## 1 4 1188.432
## 2 4 1304.100
## 3 1 1052.352
## 4 1 1458.324
## 5 2 1560.384
## 6 1 1569.456
summary(cars_clean)
## Model mpg cyl disp hp
## Length:32 Min. :10.40 4:11 Min. : 71.1 Min. : 52.0
## Class :character 1st Qu.:15.43 6: 7 1st Qu.:120.8 1st Qu.: 96.5
## Mode :character Median :19.20 8:14 Median :196.3 Median :123.0
## Mean :20.09 Mean :230.7 Mean :146.7
## 3rd Qu.:22.80 3rd Qu.:326.0 3rd Qu.:180.0
## Max. :33.90 Max. :472.0 Max. :335.0
## drat wt qsec vs am
## Min. :2.760 Min. :1.513 Min. :14.50 V-shaped:18 Automatic:19
## 1st Qu.:3.080 1st Qu.:2.581 1st Qu.:16.89 Straight:14 Manual :13
## Median :3.695 Median :3.325 Median :17.71
## Mean :3.597 Mean :3.217 Mean :17.85
## 3rd Qu.:3.920 3rd Qu.:3.610 3rd Qu.:18.90
## Max. :4.930 Max. :5.424 Max. :22.90
## gear carb wt_kg
## Min. :3.000 Min. :1.000 Min. : 686.3
## 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:1170.9
## Median :4.000 Median :2.000 Median :1508.2
## Mean :3.688 Mean :2.812 Mean :1459.3
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:1637.5
## Max. :5.000 Max. :8.000 Max. :2460.3
cars_clean %>%
summarise(
avg_mpg=mean(mpg),
avg_hp=mean(hp),
avg_wt=mean(wt_kg),
) #run summaries for mpg, hp and wt in kg
## avg_mpg avg_hp avg_wt
## 1 20.09062 146.6875 1459.345
cars_clean%>%
group_by(am)%>%
summarise(
mean_mpg=mean(mpg),
mean_hp=mean(hp),
n=n()
) #run summaries and group by transmission
## # A tibble: 2 × 4
## am mean_mpg mean_hp n
## <fct> <dbl> <dbl> <int>
## 1 Automatic 17.1 160. 19
## 2 Manual 24.4 127. 13
cars_clean %>%
group_by(vs)%>%
summarise(
mean_hp1=mean(hp),
mean_wt=mean(wt_kg),
mean_mpg=mean(mpg),
n=n()
) #summarise and group by Engine type
## # A tibble: 2 × 5
## vs mean_hp1 mean_wt mean_mpg n
## <fct> <dbl> <dbl> <dbl> <int>
## 1 V-shaped 190. 1673. 16.6 18
## 2 Straight 91.4 1184. 24.6 14
#barchart
cars_clean %>%
group_by(cyl)%>%
summarise(mean_mpg=mean(mpg)) %>%
ggplot(aes(x=cyl,y=mean_mpg,fill=cyl))+
geom_col()+
labs(title="Average fuel Efficiency by Cylinders",y="Mean MPG",x="Number of Cylinders")+ theme_minimal()
#box plot
ggplot(cars_clean, aes(x=am, y=mpg, fill=am)) + geom_boxplot() +
labs(title="Fuel Efficiency by Transmission type", x= "Transmission type", y="Miles per Galon") +
theme_minimal()
#scatter plot
ggplot(cars_clean, aes(x=hp, y=mpg, color=am, size =wt_kg)) +
geom_point(alpha=0.7) +
labs(title= "Horsepower vs. Fuel Efficiency",
x="Horsepower (hp)", y= "Miles per gallon (mpg)",color="Transmission", size="Weight (kg)") +
theme_minimal()
#trend plot
ggplot(cars_clean, aes(x = wt_kg, y = mpg)) +
geom_point() +
geom_smooth(method = "lm", se = TRUE, color = "red") +
labs(title = "Fuel Efficiency vs Weight",
x = "Weight (kg)", y = "MPG") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'